/*
 * Kernel-based Virtual Machine - device assignment support
 *
 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/fs.h>
#include "irq.h"
#include "assigned-dev.h"
#include "trace/events/kvm.h"

struct kvm_assigned_dev_kernel {
	struct kvm_irq_ack_notifier ack_notifier;
	struct list_head list;
	int assigned_dev_id;
	int host_segnr;
	int host_busnr;
	int host_devfn;
	unsigned int entries_nr;
	int host_irq;
	bool host_irq_disabled;
	bool pci_2_3;
	struct msix_entry *host_msix_entries;
	int guest_irq;
	struct msix_entry *guest_msix_entries;
	unsigned long irq_requested_type;
	int irq_source_id;
	int flags;
	struct pci_dev *dev;
	struct kvm *kvm;
	spinlock_t intx_lock;
	spinlock_t intx_mask_lock;
	char irq_name[32];
	struct pci_saved_state *pci_saved_state;
};

static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
						      int assigned_dev_id)
{
	struct kvm_assigned_dev_kernel *match;

	list_for_each_entry(match, head, list) {
		if (match->assigned_dev_id == assigned_dev_id)
			return match;
	}
	return NULL;
}

static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
				    *assigned_dev, int irq)
{
	int i, index;
	struct msix_entry *host_msix_entries;

	host_msix_entries = assigned_dev->host_msix_entries;

	index = -1;
	for (i = 0; i < assigned_dev->entries_nr; i++)
		if (irq == host_msix_entries[i].vector) {
			index = i;
			break;
		}
	if (index < 0)
		printk(KERN_WARNING "Failed to find correlated MSI-X entry!\n");

	return index;
}

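/*
 * Hard IRQ handler for INTx when the device supports PCI 2.3 style INTx
 * masking: mask the interrupt at the device and wake the threaded handler,
 * or report IRQ_NONE if this device did not raise the line.
 */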
static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int ret;

	spin_lock(&assigned_dev->intx_lock);
	if (pci_check_and_mask_intx(assigned_dev->dev)) {
		assigned_dev->host_irq_disabled = true;
		ret = IRQ_WAKE_THREAD;
	} else
		ret = IRQ_NONE;
	spin_unlock(&assigned_dev->intx_lock);

	return ret;
}

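/*
 * Inject the interrupt into the guest, honouring a user-space INTx mask
 * (KVM_DEV_ASSIGN_MASK_INTX) when the guest IRQ is INTx.
 */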
static void
kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
				 int vector)
{
	if (unlikely(assigned_dev->irq_requested_type &
		     KVM_DEV_IRQ_GUEST_INTX)) {
		spin_lock(&assigned_dev->intx_mask_lock);
		if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
			kvm_set_irq(assigned_dev->kvm,
				    assigned_dev->irq_source_id, vector, 1,
				    false);
		spin_unlock(&assigned_dev->intx_mask_lock);
	} else
		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
			    vector, 1, false);
}

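/*
 * Threaded INTx handler: without PCI 2.3 masking the host line is disabled
 * here and re-enabled from the ack notifier once the guest has serviced it.
 */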
static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;

	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
		spin_lock_irq(&assigned_dev->intx_lock);
		disable_irq_nosync(irq);
		assigned_dev->host_irq_disabled = true;
		spin_unlock_irq(&assigned_dev->intx_lock);
	}

	kvm_assigned_dev_raise_guest_irq(assigned_dev,
					 assigned_dev->guest_irq);

	return IRQ_HANDLED;
}

/*
 * Deliver an IRQ in atomic context if we can, or return a failure so the
 * caller can retry from process context.
 * Return value:
 *  -EWOULDBLOCK - Can't deliver in atomic context: retry in process context.
 *  Other values - No need to retry.
 */
static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
				int level)
{
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	struct kvm_kernel_irq_routing_entry *e;
	int ret = -EINVAL;
	int idx;

	trace_kvm_set_irq(irq, level, irq_source_id);

	/*
	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
	 * which would need to be retried from thread context; when the same
	 * GSI is connected to both PIC and IOAPIC, we'd have to report a
	 * partial failure here.
	 * Since there's no easy way to do this, we only support injecting MSI,
	 * which is limited to a 1:1 GSI mapping.
	 */
	idx = srcu_read_lock(&kvm->irq_srcu);
	if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
		e = &entries[0];
		ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
						irq, level);
	}
	srcu_read_unlock(&kvm->irq_srcu, idx);
	return ret;
}


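/*
 * Hard IRQ handler for MSI: try the atomic injection path first and wake
 * the threaded handler only if it would block.
 */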
static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
				       assigned_dev->irq_source_id,
				       assigned_dev->guest_irq, 1);
	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
}

static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;

	kvm_assigned_dev_raise_guest_irq(assigned_dev,
					 assigned_dev->guest_irq);

	return IRQ_HANDLED;
}

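/*
 * Hard IRQ handler for MSI-X: map the host vector back to its MSI-X entry
 * and inject the corresponding guest vector, deferring to the thread if
 * atomic injection would block.
 */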
static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int index = find_index_from_host_irq(assigned_dev, irq);
	u32 vector;
	int ret = 0;

	if (index >= 0) {
		vector = assigned_dev->guest_msix_entries[index].vector;
		ret = kvm_set_irq_inatomic(assigned_dev->kvm,
					   assigned_dev->irq_source_id,
					   vector, 1);
	}

	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
}

static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int index = find_index_from_host_irq(assigned_dev, irq);
	u32 vector;

	if (index >= 0) {
		vector = assigned_dev->guest_msix_entries[index].vector;
		kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
	}

	return IRQ_HANDLED;
}

/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_assigned_dev_kernel *dev =
		container_of(kian, struct kvm_assigned_dev_kernel,
			     ack_notifier);

	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);

	spin_lock(&dev->intx_mask_lock);

	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
		bool reassert = false;

		spin_lock_irq(&dev->intx_lock);
		/*
		 * The guest IRQ may be shared so this ack can come from an
		 * IRQ for another guest device.
		 */
		if (dev->host_irq_disabled) {
			if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
				enable_irq(dev->host_irq);
			else if (!pci_check_and_unmask_intx(dev->dev))
				reassert = true;
			dev->host_irq_disabled = reassert;
		}
		spin_unlock_irq(&dev->intx_lock);

		if (reassert)
			kvm_set_irq(dev->kvm, dev->irq_source_id,
				    dev->guest_irq, 1, false);
	}

	spin_unlock(&dev->intx_mask_lock);
}

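/*
 * Tear down guest-side delivery: unregister the ack notifier, lower the
 * guest IRQ line and release the IRQ source id.
 */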
static void deassign_guest_irq(struct kvm *kvm,
			       struct kvm_assigned_dev_kernel *assigned_dev)
{
	if (assigned_dev->ack_notifier.gsi != -1)
		kvm_unregister_irq_ack_notifier(kvm,
						&assigned_dev->ack_notifier);

	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
		    assigned_dev->guest_irq, 0, false);

	if (assigned_dev->irq_source_id != -1)
		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
	assigned_dev->irq_source_id = -1;
	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
}

/* The function implicitly holds the kvm->lock mutex due to cancel_work_sync() */
static void deassign_host_irq(struct kvm *kvm,
			      struct kvm_assigned_dev_kernel *assigned_dev)
{
	/*
	 * We disable the irq here to prevent further events.
	 *
	 * Notice this may result in a nested disable if the interrupt type is
	 * INTx, but that's OK since we are going to free it anyway.
	 *
	 * If this function is called as part of VM destruction, make sure the
	 * kvm state is still valid at this point, since we may also have to
	 * wait on a currently running IRQ handler.
	 */
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		int i;
		for (i = 0; i < assigned_dev->entries_nr; i++)
			disable_irq(assigned_dev->host_msix_entries[i].vector);

		for (i = 0; i < assigned_dev->entries_nr; i++)
			free_irq(assigned_dev->host_msix_entries[i].vector,
				 assigned_dev);

		assigned_dev->entries_nr = 0;
		kfree(assigned_dev->host_msix_entries);
		kfree(assigned_dev->guest_msix_entries);
		pci_disable_msix(assigned_dev->dev);
	} else {
		/* Deal with MSI and INTx */
		if ((assigned_dev->irq_requested_type &
		     KVM_DEV_IRQ_HOST_INTX) &&
		    (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
			spin_lock_irq(&assigned_dev->intx_lock);
			pci_intx(assigned_dev->dev, false);
			spin_unlock_irq(&assigned_dev->intx_lock);
			synchronize_irq(assigned_dev->host_irq);
		} else
			disable_irq(assigned_dev->host_irq);

		free_irq(assigned_dev->host_irq, assigned_dev);

		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
			pci_disable_msi(assigned_dev->dev);
	}

	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}

static int kvm_deassign_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *assigned_dev,
			    unsigned long irq_requested_type)
{
	unsigned long guest_irq_type, host_irq_type;

	if (!irqchip_in_kernel(kvm))
		return -EINVAL;
	/* no irq assignment to deassign */
	if (!assigned_dev->irq_requested_type)
		return -ENXIO;

	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;

	if (host_irq_type)
		deassign_host_irq(kvm, assigned_dev);
	if (guest_irq_type)
		deassign_guest_irq(kvm, assigned_dev);

	return 0;
}

static void kvm_free_assigned_irq(struct kvm *kvm,
				  struct kvm_assigned_dev_kernel *assigned_dev)
{
	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
}

static void kvm_free_assigned_device(struct kvm *kvm,
				     struct kvm_assigned_dev_kernel
				     *assigned_dev)
{
	kvm_free_assigned_irq(kvm, assigned_dev);

	pci_reset_function(assigned_dev->dev);
	if (pci_load_and_free_saved_state(assigned_dev->dev,
					  &assigned_dev->pci_saved_state))
		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
		       __func__, dev_name(&assigned_dev->dev->dev));
	else
		pci_restore_state(assigned_dev->dev);

	pci_clear_dev_assigned(assigned_dev->dev);

	pci_release_regions(assigned_dev->dev);
	pci_disable_device(assigned_dev->dev);
	pci_dev_put(assigned_dev->dev);

	list_del(&assigned_dev->list);
	kfree(assigned_dev);
}

void kvm_free_all_assigned_devices(struct kvm *kvm)
{
	struct kvm_assigned_dev_kernel *assigned_dev, *tmp;

	list_for_each_entry_safe(assigned_dev, tmp,
				 &kvm->arch.assigned_dev_head, list) {
		kvm_free_assigned_device(kvm, assigned_dev);
	}
}

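/*
 * Request the host INTx line; it is only registered as shared when PCI 2.3
 * style INTx masking is available, so the line can be throttled at the device.
 */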
static int assigned_device_enable_host_intx(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	irq_handler_t irq_handler;
	unsigned long flags;

	dev->host_irq = dev->dev->irq;

	/*
	 * We can only share the IRQ line with other host devices if we are
	 * able to disable the IRQ source at device-level - independently of
	 * the guest driver. Otherwise host devices may suffer from unbounded
	 * IRQ latencies when the guest keeps the line asserted.
	 */
	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
		irq_handler = kvm_assigned_dev_intx;
		flags = IRQF_SHARED;
	} else {
		irq_handler = NULL;
		flags = IRQF_ONESHOT;
	}
	if (request_threaded_irq(dev->host_irq, irq_handler,
				 kvm_assigned_dev_thread_intx, flags,
				 dev->irq_name, dev))
		return -EIO;

	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
		spin_lock_irq(&dev->intx_lock);
		pci_intx(dev->dev, true);
		spin_unlock_irq(&dev->intx_lock);
	}
	return 0;
}

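/* Switch the device to MSI (if needed) and request the resulting host IRQ. */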
static int assigned_device_enable_host_msi(struct kvm *kvm,
					   struct kvm_assigned_dev_kernel *dev)
{
	int r;

	if (!dev->dev->msi_enabled) {
		r = pci_enable_msi(dev->dev);
		if (r)
			return r;
	}

	dev->host_irq = dev->dev->irq;
	if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
				 kvm_assigned_dev_thread_msi, 0,
				 dev->irq_name, dev)) {
		pci_disable_msi(dev->dev);
		return -EIO;
	}

	return 0;
}

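/*
 * Enable MSI-X with the entries configured via KVM_ASSIGN_SET_MSIX_NR/ENTRY
 * and request a threaded handler for every vector.
 */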
static int assigned_device_enable_host_msix(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	int i, r = -EINVAL;

	/* host_msix_entries and guest_msix_entries should have been
	 * initialized */
	if (dev->entries_nr == 0)
		return r;

	r = pci_enable_msix_exact(dev->dev,
				  dev->host_msix_entries, dev->entries_nr);
	if (r)
		return r;

	for (i = 0; i < dev->entries_nr; i++) {
		r = request_threaded_irq(dev->host_msix_entries[i].vector,
					 kvm_assigned_dev_msix,
					 kvm_assigned_dev_thread_msix,
					 0, dev->irq_name, dev);
		if (r)
			goto err;
	}

	return 0;
err:
	for (i -= 1; i >= 0; i--)
		free_irq(dev->host_msix_entries[i].vector, dev);
	pci_disable_msix(dev->dev);
	return r;
}

static int assigned_device_enable_guest_intx(struct kvm *kvm,
				struct kvm_assigned_dev_kernel *dev,
				struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = irq->guest_irq;
	return 0;
}

static int assigned_device_enable_guest_msi(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	return 0;
}

static int assigned_device_enable_guest_msix(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	return 0;
}

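/* Wire up the host side of the interrupt (INTx, MSI or MSI-X). */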
static int assign_host_irq(struct kvm *kvm,
			   struct kvm_assigned_dev_kernel *dev,
			   __u32 host_irq_type)
{
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
		return r;

	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
		 pci_name(dev->dev));

	switch (host_irq_type) {
	case KVM_DEV_IRQ_HOST_INTX:
		r = assigned_device_enable_host_intx(kvm, dev);
		break;
	case KVM_DEV_IRQ_HOST_MSI:
		r = assigned_device_enable_host_msi(kvm, dev);
		break;
	case KVM_DEV_IRQ_HOST_MSIX:
		r = assigned_device_enable_host_msix(kvm, dev);
		break;
	default:
		r = -EINVAL;
	}
	dev->host_irq_disabled = false;

	if (!r)
		dev->irq_requested_type |= host_irq_type;

	return r;
}

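/*
 * Wire up the guest side: allocate an IRQ source id, record the guest GSI
 * and register the ack notifier for INTx.
 */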
static int assign_guest_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *dev,
			    struct kvm_assigned_irq *irq,
			    unsigned long guest_irq_type)
{
	int id;
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
		return r;

	id = kvm_request_irq_source_id(kvm);
	if (id < 0)
		return id;

	dev->irq_source_id = id;

	switch (guest_irq_type) {
	case KVM_DEV_IRQ_GUEST_INTX:
		r = assigned_device_enable_guest_intx(kvm, dev, irq);
		break;
	case KVM_DEV_IRQ_GUEST_MSI:
		r = assigned_device_enable_guest_msi(kvm, dev, irq);
		break;
	case KVM_DEV_IRQ_GUEST_MSIX:
		r = assigned_device_enable_guest_msix(kvm, dev, irq);
		break;
	default:
		r = -EINVAL;
	}

	if (!r) {
		dev->irq_requested_type |= guest_irq_type;
		if (dev->ack_notifier.gsi != -1)
			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
	} else {
		kvm_free_irq_source_id(kvm, dev->irq_source_id);
		dev->irq_source_id = -1;
	}

	return r;
}

/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
				   struct kvm_assigned_irq *assigned_irq)
{
	int r = -EINVAL;
	struct kvm_assigned_dev_kernel *match;
	unsigned long host_irq_type, guest_irq_type;

	if (!irqchip_in_kernel(kvm))
		return r;

	mutex_lock(&kvm->lock);
	r = -ENODEV;
	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);

	r = -EINVAL;
	/* can only assign one type at a time */
	if (hweight_long(host_irq_type) > 1)
		goto out;
	if (hweight_long(guest_irq_type) > 1)
		goto out;
	if (host_irq_type == 0 && guest_irq_type == 0)
		goto out;

	r = 0;
	if (host_irq_type)
		r = assign_host_irq(kvm, match, host_irq_type);
	if (r)
		goto out;

	if (guest_irq_type)
		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
out:
	mutex_unlock(&kvm->lock);
	return r;
}

static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
					 struct kvm_assigned_irq
					 *assigned_irq)
{
	int r = -ENODEV;
	struct kvm_assigned_dev_kernel *match;
	unsigned long irq_type;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
					  KVM_DEV_IRQ_GUEST_MASK);
	r = kvm_deassign_irq(kvm, match, irq_type);
out:
	mutex_unlock(&kvm->lock);
	return r;
}

/*
 * We want to test whether the caller has been granted permissions to
 * use this device.  To be able to configure and control the device,
 * the user needs access to PCI configuration space and BAR resources.
 * These are accessed through PCI sysfs.  PCI config space is often
 * passed to the process calling this ioctl via file descriptor, so we
 * can't rely on access to that file.  We can check for permissions
 * on each of the BAR resource files, which is a pretty clear
 * indicator that the user has been granted access to the device.
 */
static int probe_sysfs_permissions(struct pci_dev *dev)
{
#ifdef CONFIG_SYSFS
	int i;
	bool bar_found = false;

	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
		char *kpath, *syspath;
		struct path path;
		struct inode *inode;
		int r;

		if (!pci_resource_len(dev, i))
			continue;

		kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
		if (!kpath)
			return -ENOMEM;

		/* Per sysfs-rules, sysfs is always at /sys */
		syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
		kfree(kpath);
		if (!syspath)
			return -ENOMEM;

		r = kern_path(syspath, LOOKUP_FOLLOW, &path);
		kfree(syspath);
		if (r)
			return r;

		inode = d_backing_inode(path.dentry);

		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
		path_put(&path);
		if (r)
			return r;

		bar_found = true;
	}

	/* If no resources, probably something special */
	if (!bar_found)
		return -EPERM;

	return 0;
#else
	return -EINVAL; /* No way to control the device without sysfs */
#endif
}

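/*
 * KVM_ASSIGN_PCI_DEVICE: claim the host PCI device (enable it, reserve its
 * regions, save its config space) and attach it to the VM's IOMMU domain.
 */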
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
				      struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0, idx;
	struct kvm_assigned_dev_kernel *match;
	struct pci_dev *dev;

	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
		return -EINVAL;

	mutex_lock(&kvm->lock);
	idx = srcu_read_lock(&kvm->srcu);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (match) {
		/* device already assigned */
		r = -EEXIST;
		goto out;
	}

	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
	if (match == NULL) {
		printk(KERN_INFO "%s: Couldn't allocate memory\n",
		       __func__);
		r = -ENOMEM;
		goto out;
	}
	dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
				   assigned_dev->busnr,
				   assigned_dev->devfn);
	if (!dev) {
		printk(KERN_INFO "%s: host device not found\n", __func__);
		r = -EINVAL;
		goto out_free;
	}

	/* Don't allow bridges to be assigned */
	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
		r = -EPERM;
		goto out_put;
	}

	r = probe_sysfs_permissions(dev);
	if (r)
		goto out_put;

	if (pci_enable_device(dev)) {
		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
		r = -EBUSY;
		goto out_put;
	}
	r = pci_request_regions(dev, "kvm_assigned_device");
	if (r) {
		printk(KERN_INFO "%s: Could not get access to device regions\n",
		       __func__);
		goto out_disable;
	}

	pci_reset_function(dev);
	pci_save_state(dev);
	match->pci_saved_state = pci_store_saved_state(dev);
	if (!match->pci_saved_state)
		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
		       __func__, dev_name(&dev->dev));

	if (!pci_intx_mask_supported(dev))
		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;

	match->assigned_dev_id = assigned_dev->assigned_dev_id;
	match->host_segnr = assigned_dev->segnr;
	match->host_busnr = assigned_dev->busnr;
	match->host_devfn = assigned_dev->devfn;
	match->flags = assigned_dev->flags;
	match->dev = dev;
	spin_lock_init(&match->intx_lock);
	spin_lock_init(&match->intx_mask_lock);
	match->irq_source_id = -1;
	match->kvm = kvm;
	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;

	list_add(&match->list, &kvm->arch.assigned_dev_head);

	if (!kvm->arch.iommu_domain) {
		r = kvm_iommu_map_guest(kvm);
		if (r)
			goto out_list_del;
	}
	r = kvm_assign_device(kvm, match->dev);
	if (r)
		goto out_list_del;

out:
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
out_list_del:
	if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
		       __func__, dev_name(&dev->dev));
	list_del(&match->list);
	pci_release_regions(dev);
out_disable:
	pci_disable_device(dev);
out_put:
	pci_dev_put(dev);
out_free:
	kfree(match);
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
}

static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
		struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (!match) {
		printk(KERN_INFO "%s: device hasn't been assigned before, "
		  "so cannot be deassigned\n", __func__);
		r = -EINVAL;
		goto out;
	}

	kvm_deassign_device(kvm, match->dev);

	kvm_free_assigned_device(kvm, match);

out:
	mutex_unlock(&kvm->lock);
	return r;
}


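/*
 * KVM_ASSIGN_SET_MSIX_NR: size the host/guest MSI-X entry arrays; this may
 * only be done once per assigned device.
 */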
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
				    struct kvm_assigned_msix_nr *entry_nr)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      entry_nr->assigned_dev_id);
	if (!adev) {
		r = -EINVAL;
		goto msix_nr_out;
	}

	if (adev->entries_nr == 0) {
		adev->entries_nr = entry_nr->entry_nr;
		if (adev->entries_nr == 0 ||
		    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
			r = -EINVAL;
			goto msix_nr_out;
		}

		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
						entry_nr->entry_nr,
						GFP_KERNEL);
		if (!adev->host_msix_entries) {
			r = -ENOMEM;
			goto msix_nr_out;
		}
		adev->guest_msix_entries =
			kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
				GFP_KERNEL);
		if (!adev->guest_msix_entries) {
			kfree(adev->host_msix_entries);
			r = -ENOMEM;
			goto msix_nr_out;
		}
	} else /* Not allowed to set the MSI-X number twice */
		r = -EINVAL;
msix_nr_out:
	mutex_unlock(&kvm->lock);
	return r;
}

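/* KVM_ASSIGN_SET_MSIX_ENTRY: bind one MSI-X table entry to a guest GSI. */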
static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
				       struct kvm_assigned_msix_entry *entry)
{
	int r = 0, i;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      entry->assigned_dev_id);

	if (!adev) {
		r = -EINVAL;
		goto msix_entry_out;
	}

	for (i = 0; i < adev->entries_nr; i++)
		if (adev->guest_msix_entries[i].vector == 0 ||
		    adev->guest_msix_entries[i].entry == entry->entry) {
			adev->guest_msix_entries[i].entry = entry->entry;
			adev->guest_msix_entries[i].vector = entry->gsi;
			adev->host_msix_entries[i].entry = entry->entry;
			break;
		}
	if (i == adev->entries_nr) {
		r = -ENOSPC;
		goto msix_entry_out;
	}

msix_entry_out:
	mutex_unlock(&kvm->lock);

	return r;
}

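/*
 * KVM_ASSIGN_SET_INTX_MASK: let user space mask or unmask INTx delivery to
 * the guest; when unmasking without PCI 2.3 support the host line is
 * re-enabled here if it was left disabled.
 */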
static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
		struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (!match) {
		r = -ENODEV;
		goto out;
	}

	spin_lock(&match->intx_mask_lock);

	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;

	if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
		if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
			kvm_set_irq(match->kvm, match->irq_source_id,
				    match->guest_irq, 0, false);
			/*
			 * Masking at hardware-level is performed on demand,
			 * i.e. when an IRQ actually arrives at the host.
			 */
		} else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
			/*
			 * Unmask the IRQ line if required. Unmasking at
			 * device level will be performed by user space.
			 */
			spin_lock_irq(&match->intx_lock);
			if (match->host_irq_disabled) {
				enable_irq(match->host_irq);
				match->host_irq_disabled = false;
			}
			spin_unlock_irq(&match->intx_lock);
		}
	}

	spin_unlock(&match->intx_mask_lock);

out:
	mutex_unlock(&kvm->lock);
	return r;
}

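/* Dispatcher for the legacy PCI device assignment ioctls on the VM fd. */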
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
				  unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	int r;

	switch (ioctl) {
	case KVM_ASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_IRQ: {
		r = -EOPNOTSUPP;
		break;
	}
	case KVM_ASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_MSIX_NR: {
		struct kvm_assigned_msix_nr entry_nr;
		r = -EFAULT;
		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
			goto out;
		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_MSIX_ENTRY: {
		struct kvm_assigned_msix_entry entry;
		r = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof entry))
			goto out;
		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_INTX_MASK: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
		break;
	}
	default:
		r = -ENOTTY;
		break;
	}
out:
	return r;
}