Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.5.6.
   1/*
   2 * Copyright (c) Microsoft Corporation.
   3 *
   4 * Author:
   5 *   Jake Oshins <jakeo@microsoft.com>
   6 *
   7 * This driver acts as a paravirtual front-end for PCI Express root buses.
   8 * When a PCI Express function (either an entire device or an SR-IOV
   9 * Virtual Function) is being passed through to the VM, this driver exposes
  10 * a new bus to the guest VM.  This is modeled as a root PCI bus because
  11 * no bridges are being exposed to the VM.  In fact, with a "Generation 2"
  12 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
   13 * until a device has been exposed using this driver.
  14 *
  15 * Each root PCI bus has its own PCI domain, which is called "Segment" in
  16 * the PCI Firmware Specifications.  Thus while each device passed through
  17 * to the VM using this front-end will appear at "device 0", the domain will
  18 * be unique.  Typically, each bus will have one PCI function on it, though
  19 * this driver does support more than one.
  20 *
  21 * In order to map the interrupts from the device through to the guest VM,
  22 * this driver also implements an IRQ Domain, which handles interrupts (either
  23 * MSI or MSI-X) associated with the functions on the bus.  As interrupts are
  24 * set up, torn down, or reaffined, this driver communicates with the
  25 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
  26 * interrupt will be delivered to the correct virtual processor at the right
  27 * vector.  This driver does not support level-triggered (line-based)
  28 * interrupts, and will report that the Interrupt Line register in the
  29 * function's configuration space is zero.
  30 *
  31 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
  32 * facilities.  For instance, the configuration space of a function exposed
  33 * by Hyper-V is mapped into a single page of memory space, and the
  34 * read and write handlers for config space must be aware of this mechanism.
  35 * Similarly, device setup and teardown involves messages sent to and from
  36 * the PCI back-end driver in Hyper-V.
  37 *
  38 * This program is free software; you can redistribute it and/or modify it
  39 * under the terms of the GNU General Public License version 2 as published
  40 * by the Free Software Foundation.
  41 *
  42 * This program is distributed in the hope that it will be useful, but
  43 * WITHOUT ANY WARRANTY; without even the implied warranty of
  44 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  45 * NON INFRINGEMENT.  See the GNU General Public License for more
  46 * details.
  47 *
  48 */
  49
  50#include <linux/kernel.h>
  51#include <linux/module.h>
  52#include <linux/pci.h>
  53#include <linux/semaphore.h>
  54#include <linux/irqdomain.h>
  55#include <asm/irqdomain.h>
  56#include <asm/apic.h>
  57#include <linux/msi.h>
  58#include <linux/hyperv.h>
  59#include <asm/mshyperv.h>
  60
  61/*
  62 * Protocol versions. The low word is the minor version, the high word the
  63 * major version.
  64 */
  65
  66#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (major)))
  67#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
  68#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)
  69
  70enum {
  71	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),
  72	PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1
  73};
  74
  75#define PCI_CONFIG_MMIO_LENGTH	0x2000
  76#define CFG_PAGE_OFFSET 0x1000
  77#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
  78
  79#define MAX_SUPPORTED_MSI_MESSAGES 0x400
  80
  81/*
  82 * Message Types
  83 */
  84
  85enum pci_message_type {
  86	/*
  87	 * Version 1.1
  88	 */
  89	PCI_MESSAGE_BASE                = 0x42490000,
  90	PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
  91	PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
  92	PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
  93	PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
  94	PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
  95	PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
  96	PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
  97	PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
  98	PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
  99	PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
 100	PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
 101	PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
 102	PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
 103	PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
 104	PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
 105	PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
 106	PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
 107	PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
 108	PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
 109	PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
 110	PCI_MESSAGE_MAXIMUM
 111};
 112
 113/*
 114 * Structures defining the virtual PCI Express protocol.
 115 */
 116
 117union pci_version {
 118	struct {
 119		u16 minor_version;
 120		u16 major_version;
 121	} parts;
 122	u32 version;
 123} __packed;
 124
 125/*
 126 * Function numbers are 8-bits wide on Express, as interpreted through ARI,
 127 * which is all this driver does.  This representation is the one used in
 128 * Windows, which is what is expected when sending this back and forth with
 129 * the Hyper-V parent partition.
 130 */
 131union win_slot_encoding {
 132	struct {
 133		u32	func:8;
 134		u32	reserved:24;
 135	} bits;
 136	u32 slot;
 137} __packed;
 138
 139/*
 140 * Pretty much as defined in the PCI Specifications.
 141 */
 142struct pci_function_description {
 143	u16	v_id;	/* vendor ID */
 144	u16	d_id;	/* device ID */
 145	u8	rev;
 146	u8	prog_intf;
 147	u8	subclass;
 148	u8	base_class;
 149	u32	subsystem_id;
 150	union win_slot_encoding win_slot;
 151	u32	ser;	/* serial number */
 152} __packed;
 153
 154/**
 155 * struct hv_msi_desc
 156 * @vector:		IDT entry
 157 * @delivery_mode:	As defined in Intel's Programmer's
 158 *			Reference Manual, Volume 3, Chapter 8.
 159 * @vector_count:	Number of contiguous entries in the
 160 *			Interrupt Descriptor Table that are
 161 *			occupied by this Message-Signaled
 162 *			Interrupt. For "MSI", as first defined
 163 *			in PCI 2.2, this can be between 1 and
 164 *			32. For "MSI-X," as first defined in PCI
 165 *			3.0, this must be 1, as each MSI-X table
 166 *			entry would have its own descriptor.
 167 * @reserved:		Empty space
 168 * @cpu_mask:		All the target virtual processors.
 169 */
 170struct hv_msi_desc {
 171	u8	vector;
 172	u8	delivery_mode;
 173	u16	vector_count;
 174	u32	reserved;
 175	u64	cpu_mask;
 176} __packed;
 177
 178/**
 179 * struct tran_int_desc
 180 * @reserved:		unused, padding
 181 * @vector_count:	same as in hv_msi_desc
 182 * @data:		This is the "data payload" value that is
 183 *			written by the device when it generates
 184 *			a message-signaled interrupt, either MSI
 185 *			or MSI-X.
 186 * @address:		This is the address to which the data
 187 *			payload is written on interrupt
 188 *			generation.
 189 */
 190struct tran_int_desc {
 191	u16	reserved;
 192	u16	vector_count;
 193	u32	data;
 194	u64	address;
 195} __packed;
 196
 197/*
 198 * A generic message format for virtual PCI.
 199 * Specific message formats are defined later in the file.
 200 */
 201
 202struct pci_message {
 203	u32 message_type;
 204} __packed;
 205
 206struct pci_child_message {
 207	u32 message_type;
 208	union win_slot_encoding wslot;
 209} __packed;
 210
 211struct pci_incoming_message {
 212	struct vmpacket_descriptor hdr;
 213	struct pci_message message_type;
 214} __packed;
 215
 216struct pci_response {
 217	struct vmpacket_descriptor hdr;
 218	s32 status;			/* negative values are failures */
 219} __packed;
 220
 221struct pci_packet {
 222	void (*completion_func)(void *context, struct pci_response *resp,
 223				int resp_packet_size);
 224	void *compl_ctxt;
 225	struct pci_message message;
 226};
 227
 228/*
 229 * Specific message types supporting the PCI protocol.
 230 */
 231
 232/*
 233 * Version negotiation message. Sent from the guest to the host.
 234 * The guest is free to try different versions until the host
 235 * accepts the version.
 236 *
 237 * pci_version: The protocol version requested.
 238 * is_last_attempt: If TRUE, this is the last version guest will request.
 239 * reservedz: Reserved field, set to zero.
 240 */
 241
 242struct pci_version_request {
 243	struct pci_message message_type;
 244	enum pci_message_type protocol_version;
 245} __packed;
 246
 247/*
 248 * Bus D0 Entry.  This is sent from the guest to the host when the virtual
 249 * bus (PCI Express port) is ready for action.
 250 */
 251
 252struct pci_bus_d0_entry {
 253	struct pci_message message_type;
 254	u32 reserved;
 255	u64 mmio_base;
 256} __packed;
 257
 258struct pci_bus_relations {
 259	struct pci_incoming_message incoming;
 260	u32 device_count;
 261	struct pci_function_description func[1];
 262} __packed;
 263
 264struct pci_q_res_req_response {
 265	struct vmpacket_descriptor hdr;
 266	s32 status;			/* negative values are failures */
 267	u32 probed_bar[6];
 268} __packed;
 269
 270struct pci_set_power {
 271	struct pci_message message_type;
 272	union win_slot_encoding wslot;
 273	u32 power_state;		/* In Windows terms */
 274	u32 reserved;
 275} __packed;
 276
 277struct pci_set_power_response {
 278	struct vmpacket_descriptor hdr;
 279	s32 status;			/* negative values are failures */
 280	union win_slot_encoding wslot;
 281	u32 resultant_state;		/* In Windows terms */
 282	u32 reserved;
 283} __packed;
 284
 285struct pci_resources_assigned {
 286	struct pci_message message_type;
 287	union win_slot_encoding wslot;
 288	u8 memory_range[0x14][6];	/* not used here */
 289	u32 msi_descriptors;
 290	u32 reserved[4];
 291} __packed;
 292
 293struct pci_create_interrupt {
 294	struct pci_message message_type;
 295	union win_slot_encoding wslot;
 296	struct hv_msi_desc int_desc;
 297} __packed;
 298
 299struct pci_create_int_response {
 300	struct pci_response response;
 301	u32 reserved;
 302	struct tran_int_desc int_desc;
 303} __packed;
 304
 305struct pci_delete_interrupt {
 306	struct pci_message message_type;
 307	union win_slot_encoding wslot;
 308	struct tran_int_desc int_desc;
 309} __packed;
 310
 311struct pci_dev_incoming {
 312	struct pci_incoming_message incoming;
 313	union win_slot_encoding wslot;
 314} __packed;
 315
 316struct pci_eject_response {
 317	u32 message_type;
 318	union win_slot_encoding wslot;
 319	u32 status;
 320} __packed;
 321
 322static int pci_ring_size = (4 * PAGE_SIZE);
 323
 324/*
 325 * Definitions or interrupt steering hypercall.
 326 */
 327#define HV_PARTITION_ID_SELF		((u64)-1)
 328#define HVCALL_RETARGET_INTERRUPT	0x7e
 329
 330struct retarget_msi_interrupt {
 331	u64	partition_id;		/* use "self" */
 332	u64	device_id;
 333	u32	source;			/* 1 for MSI(-X) */
 334	u32	reserved1;
 335	u32	address;
 336	u32	data;
 337	u64	reserved2;
 338	u32	vector;
 339	u32	flags;
 340	u64	vp_mask;
 341} __packed;
 342
 343/*
 344 * Driver specific state.
 345 */
 346
 347enum hv_pcibus_state {
 348	hv_pcibus_init = 0,
 349	hv_pcibus_probed,
 350	hv_pcibus_installed,
 351	hv_pcibus_maximum
 352};
 353
 354struct hv_pcibus_device {
 355	struct pci_sysdata sysdata;
 356	enum hv_pcibus_state state;
 357	atomic_t remove_lock;
 358	struct hv_device *hdev;
 359	resource_size_t low_mmio_space;
 360	resource_size_t high_mmio_space;
 361	struct resource *mem_config;
 362	struct resource *low_mmio_res;
 363	struct resource *high_mmio_res;
 364	struct completion *survey_event;
 365	struct completion remove_event;
 366	struct pci_bus *pci_bus;
 367	spinlock_t config_lock;	/* Avoid two threads writing index page */
 368	spinlock_t device_list_lock;	/* Protect lists below */
 369	void __iomem *cfg_addr;
 370
 371	struct semaphore enum_sem;
 372	struct list_head resources_for_children;
 373
 374	struct list_head children;
 375	struct list_head dr_list;
 376	struct work_struct wrk;
 377
 378	struct msi_domain_info msi_info;
 379	struct msi_controller msi_chip;
 380	struct irq_domain *irq_domain;
 381};
 382
 383/*
 384 * Tracks "Device Relations" messages from the host, which must be both
 385 * processed in order and deferred so that they don't run in the context
 386 * of the incoming packet callback.
 387 */
 388struct hv_dr_work {
 389	struct work_struct wrk;
 390	struct hv_pcibus_device *bus;
 391};
 392
 393struct hv_dr_state {
 394	struct list_head list_entry;
 395	u32 device_count;
 396	struct pci_function_description func[1];
 397};
 398
 399enum hv_pcichild_state {
 400	hv_pcichild_init = 0,
 401	hv_pcichild_requirements,
 402	hv_pcichild_resourced,
 403	hv_pcichild_ejecting,
 404	hv_pcichild_maximum
 405};
 406
 407enum hv_pcidev_ref_reason {
 408	hv_pcidev_ref_invalid = 0,
 409	hv_pcidev_ref_initial,
 410	hv_pcidev_ref_by_slot,
 411	hv_pcidev_ref_packet,
 412	hv_pcidev_ref_pnp,
 413	hv_pcidev_ref_childlist,
 414	hv_pcidev_irqdata,
 415	hv_pcidev_ref_max
 416};
 417
 418struct hv_pci_dev {
 419	/* List protected by pci_rescan_remove_lock */
 420	struct list_head list_entry;
 421	atomic_t refs;
 422	enum hv_pcichild_state state;
 423	struct pci_function_description desc;
 424	bool reported_missing;
 425	struct hv_pcibus_device *hbus;
 426	struct work_struct wrk;
 427
 428	/*
 429	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
 430	 * read it back, for each of the BAR offsets within config space.
 431	 */
 432	u32 probed_bar[6];
 433};
 434
 435struct hv_pci_compl {
 436	struct completion host_event;
 437	s32 completion_status;
 438};
 439
 440/**
 441 * hv_pci_generic_compl() - Invoked for a completion packet
 442 * @context:		Set up by the sender of the packet.
 443 * @resp:		The response packet
 444 * @resp_packet_size:	Size in bytes of the packet
 445 *
 446 * This function is used to trigger an event and report status
 447 * for any message for which the completion packet contains a
 448 * status and nothing else.
 449 */
 450static
 451void
 452hv_pci_generic_compl(void *context, struct pci_response *resp,
 453		     int resp_packet_size)
 454{
 455	struct hv_pci_compl *comp_pkt = context;
 456
 457	if (resp_packet_size >= offsetofend(struct pci_response, status))
 458		comp_pkt->completion_status = resp->status;
 459	complete(&comp_pkt->host_event);
 460}
 461
 462static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
 463						u32 wslot);
 464static void get_pcichild(struct hv_pci_dev *hv_pcidev,
 465			 enum hv_pcidev_ref_reason reason);
 466static void put_pcichild(struct hv_pci_dev *hv_pcidev,
 467			 enum hv_pcidev_ref_reason reason);
 468
 469static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus);
 470static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus);
 471
 472/**
 473 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
 474 * @devfn:	The Linux representation of PCI slot
 475 *
 476 * Windows uses a slightly different representation of PCI slot.
 477 *
 478 * Return: The Windows representation
 479 */
 480static u32 devfn_to_wslot(int devfn)
 481{
 482	union win_slot_encoding wslot;
 483
 484	wslot.slot = 0;
 485	wslot.bits.func = PCI_SLOT(devfn) | (PCI_FUNC(devfn) << 5);
 486
 487	return wslot.slot;
 488}
 489
 490/**
 491 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
 492 * @wslot:	The Windows representation of PCI slot
 493 *
 494 * Windows uses a slightly different representation of PCI slot.
 495 *
 496 * Return: The Linux representation
 497 */
 498static int wslot_to_devfn(u32 wslot)
 499{
 500	union win_slot_encoding slot_no;
 501
 502	slot_no.slot = wslot;
 503	return PCI_DEVFN(0, slot_no.bits.func);
 504}
 505
 506/*
 507 * PCI Configuration Space for these root PCI buses is implemented as a pair
 508 * of pages in memory-mapped I/O space.  Writing to the first page chooses
 509 * the PCI function being written or read.  Once the first page has been
 510 * written to, the following page maps in the entire configuration space of
 511 * the function.
 512 */
 513
 514/**
 515 * _hv_pcifront_read_config() - Internal PCI config read
 516 * @hpdev:	The PCI driver's representation of the device
 517 * @where:	Offset within config space
 518 * @size:	Size of the transfer
 519 * @val:	Pointer to the buffer receiving the data
 520 */
 521static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
 522				     int size, u32 *val)
 523{
 524	unsigned long flags;
 525	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 526
 527	/*
 528	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
 529	 */
 530	if (where + size <= PCI_COMMAND) {
 531		memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
 532	} else if (where >= PCI_CLASS_REVISION && where + size <=
 533		   PCI_CACHE_LINE_SIZE) {
 534		memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
 535		       PCI_CLASS_REVISION, size);
 536	} else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
 537		   PCI_ROM_ADDRESS) {
 538		memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
 539		       PCI_SUBSYSTEM_VENDOR_ID, size);
 540	} else if (where >= PCI_ROM_ADDRESS && where + size <=
 541		   PCI_CAPABILITY_LIST) {
 542		/* ROM BARs are unimplemented */
 543		*val = 0;
 544	} else if (where >= PCI_INTERRUPT_LINE && where + size <=
 545		   PCI_INTERRUPT_PIN) {
 546		/*
 547		 * Interrupt Line and Interrupt PIN are hard-wired to zero
 548		 * because this front-end only supports message-signaled
 549		 * interrupts.
 550		 */
 551		*val = 0;
 552	} else if (where + size <= CFG_PAGE_SIZE) {
 553		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
 554		/* Choose the function to be read. (See comment above) */
 555		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
 556		/* Read from that function's config space. */
 557		switch (size) {
 558		case 1:
 559			*val = readb(addr);
 560			break;
 561		case 2:
 562			*val = readw(addr);
 563			break;
 564		default:
 565			*val = readl(addr);
 566			break;
 567		}
 568		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
 569	} else {
 570		dev_err(&hpdev->hbus->hdev->device,
 571			"Attempt to read beyond a function's config space.\n");
 572	}
 573}
 574
 575/**
 576 * _hv_pcifront_write_config() - Internal PCI config write
 577 * @hpdev:	The PCI driver's representation of the device
 578 * @where:	Offset within config space
 579 * @size:	Size of the transfer
 580 * @val:	The data being transferred
 581 */
 582static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
 583				      int size, u32 val)
 584{
 585	unsigned long flags;
 586	void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
 587
 588	if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
 589	    where + size <= PCI_CAPABILITY_LIST) {
 590		/* SSIDs and ROM BARs are read-only */
 591	} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
 592		spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
 593		/* Choose the function to be written. (See comment above) */
 594		writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
 595		/* Write to that function's config space. */
 596		switch (size) {
 597		case 1:
 598			writeb(val, addr);
 599			break;
 600		case 2:
 601			writew(val, addr);
 602			break;
 603		default:
 604			writel(val, addr);
 605			break;
 606		}
 607		spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
 608	} else {
 609		dev_err(&hpdev->hbus->hdev->device,
 610			"Attempt to write beyond a function's config space.\n");
 611	}
 612}
 613
 614/**
 615 * hv_pcifront_read_config() - Read configuration space
 616 * @bus: PCI Bus structure
 617 * @devfn: Device/function
 618 * @where: Offset from base
 619 * @size: Byte/word/dword
 620 * @val: Value to be read
 621 *
 622 * Return: PCIBIOS_SUCCESSFUL on success
 623 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 624 */
 625static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
 626				   int where, int size, u32 *val)
 627{
 628	struct hv_pcibus_device *hbus =
 629		container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
 630	struct hv_pci_dev *hpdev;
 631
 632	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
 633	if (!hpdev)
 634		return PCIBIOS_DEVICE_NOT_FOUND;
 635
 636	_hv_pcifront_read_config(hpdev, where, size, val);
 637
 638	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 639	return PCIBIOS_SUCCESSFUL;
 640}
 641
 642/**
 643 * hv_pcifront_write_config() - Write configuration space
 644 * @bus: PCI Bus structure
 645 * @devfn: Device/function
 646 * @where: Offset from base
 647 * @size: Byte/word/dword
 648 * @val: Value to be written to device
 649 *
 650 * Return: PCIBIOS_SUCCESSFUL on success
 651 *	   PCIBIOS_DEVICE_NOT_FOUND on failure
 652 */
 653static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
 654				    int where, int size, u32 val)
 655{
 656	struct hv_pcibus_device *hbus =
 657	    container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
 658	struct hv_pci_dev *hpdev;
 659
 660	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
 661	if (!hpdev)
 662		return PCIBIOS_DEVICE_NOT_FOUND;
 663
 664	_hv_pcifront_write_config(hpdev, where, size, val);
 665
 666	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 667	return PCIBIOS_SUCCESSFUL;
 668}
 669
 670/* PCIe operations */
 671static struct pci_ops hv_pcifront_ops = {
 672	.read  = hv_pcifront_read_config,
 673	.write = hv_pcifront_write_config,
 674};
 675
 676/* Interrupt management hooks */
 677static void hv_int_desc_free(struct hv_pci_dev *hpdev,
 678			     struct tran_int_desc *int_desc)
 679{
 680	struct pci_delete_interrupt *int_pkt;
 681	struct {
 682		struct pci_packet pkt;
 683		u8 buffer[sizeof(struct pci_delete_interrupt) -
 684			  sizeof(struct pci_message)];
 685	} ctxt;
 686
 687	memset(&ctxt, 0, sizeof(ctxt));
 688	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
 689	int_pkt->message_type.message_type =
 690		PCI_DELETE_INTERRUPT_MESSAGE;
 691	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
 692	int_pkt->int_desc = *int_desc;
 693	vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
 694			 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
 695	kfree(int_desc);
 696}
 697
 698/**
 699 * hv_msi_free() - Free the MSI.
 700 * @domain:	The interrupt domain pointer
 701 * @info:	Extra MSI-related context
 702 * @irq:	Identifies the IRQ.
 703 *
 704 * The Hyper-V parent partition and hypervisor are tracking the
 705 * messages that are in use, keeping the interrupt redirection
 706 * table up to date.  This callback sends a message that frees
 707 * the IRT entry and related tracking nonsense.
 708 */
 709static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
 710			unsigned int irq)
 711{
 712	struct hv_pcibus_device *hbus;
 713	struct hv_pci_dev *hpdev;
 714	struct pci_dev *pdev;
 715	struct tran_int_desc *int_desc;
 716	struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
 717	struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
 718
 719	pdev = msi_desc_to_pci_dev(msi);
 720	hbus = info->data;
 721	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
 722	if (!hpdev)
 723		return;
 724
 725	int_desc = irq_data_get_irq_chip_data(irq_data);
 726	if (int_desc) {
 727		irq_data->chip_data = NULL;
 728		hv_int_desc_free(hpdev, int_desc);
 729	}
 730
 731	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 732}
 733
 734static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
 735			   bool force)
 736{
 737	struct irq_data *parent = data->parent_data;
 738
 739	return parent->chip->irq_set_affinity(parent, dest, force);
 740}
 741
 742void hv_irq_mask(struct irq_data *data)
 743{
 744	pci_msi_mask_irq(data);
 745}
 746
 747/**
 748 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
 749 * affinity.
 750 * @data:	Describes the IRQ
 751 *
 752 * Build new a destination for the MSI and make a hypercall to
 753 * update the Interrupt Redirection Table. "Device Logical ID"
 754 * is built out of this PCI bus's instance GUID and the function
 755 * number of the device.
 756 */
 757void hv_irq_unmask(struct irq_data *data)
 758{
 759	struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
 760	struct irq_cfg *cfg = irqd_cfg(data);
 761	struct retarget_msi_interrupt params;
 762	struct hv_pcibus_device *hbus;
 763	struct cpumask *dest;
 764	struct pci_bus *pbus;
 765	struct pci_dev *pdev;
 766	int cpu;
 767
 768	dest = irq_data_get_affinity_mask(data);
 769	pdev = msi_desc_to_pci_dev(msi_desc);
 770	pbus = pdev->bus;
 771	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
 772
 773	memset(&params, 0, sizeof(params));
 774	params.partition_id = HV_PARTITION_ID_SELF;
 775	params.source = 1; /* MSI(-X) */
 776	params.address = msi_desc->msg.address_lo;
 777	params.data = msi_desc->msg.data;
 778	params.device_id = (hbus->hdev->dev_instance.b[5] << 24) |
 779			   (hbus->hdev->dev_instance.b[4] << 16) |
 780			   (hbus->hdev->dev_instance.b[7] << 8) |
 781			   (hbus->hdev->dev_instance.b[6] & 0xf8) |
 782			   PCI_FUNC(pdev->devfn);
 783	params.vector = cfg->vector;
 784
 785	for_each_cpu_and(cpu, dest, cpu_online_mask)
 786		params.vp_mask |= (1ULL << vmbus_cpu_number_to_vp_number(cpu));
 787
 788	hv_do_hypercall(HVCALL_RETARGET_INTERRUPT, &params, NULL);
 789
 790	pci_msi_unmask_irq(data);
 791}
 792
 793struct compose_comp_ctxt {
 794	struct hv_pci_compl comp_pkt;
 795	struct tran_int_desc int_desc;
 796};
 797
 798static void hv_pci_compose_compl(void *context, struct pci_response *resp,
 799				 int resp_packet_size)
 800{
 801	struct compose_comp_ctxt *comp_pkt = context;
 802	struct pci_create_int_response *int_resp =
 803		(struct pci_create_int_response *)resp;
 804
 805	comp_pkt->comp_pkt.completion_status = resp->status;
 806	comp_pkt->int_desc = int_resp->int_desc;
 807	complete(&comp_pkt->comp_pkt.host_event);
 808}
 809
 810/**
 811 * hv_compose_msi_msg() - Supplies a valid MSI address/data
 812 * @data:	Everything about this MSI
 813 * @msg:	Buffer that is filled in by this function
 814 *
 815 * This function unpacks the IRQ looking for target CPU set, IDT
 816 * vector and mode and sends a message to the parent partition
 817 * asking for a mapping for that tuple in this partition.  The
 818 * response supplies a data value and address to which that data
 819 * should be written to trigger that interrupt.
 820 */
 821static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 822{
 823	struct irq_cfg *cfg = irqd_cfg(data);
 824	struct hv_pcibus_device *hbus;
 825	struct hv_pci_dev *hpdev;
 826	struct pci_bus *pbus;
 827	struct pci_dev *pdev;
 828	struct pci_create_interrupt *int_pkt;
 829	struct compose_comp_ctxt comp;
 830	struct tran_int_desc *int_desc;
 831	struct cpumask *affinity;
 832	struct {
 833		struct pci_packet pkt;
 834		u8 buffer[sizeof(struct pci_create_interrupt) -
 835			  sizeof(struct pci_message)];
 836	} ctxt;
 837	int cpu;
 838	int ret;
 839
 840	pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
 841	pbus = pdev->bus;
 842	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
 843	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
 844	if (!hpdev)
 845		goto return_null_message;
 846
 847	/* Free any previous message that might have already been composed. */
 848	if (data->chip_data) {
 849		int_desc = data->chip_data;
 850		data->chip_data = NULL;
 851		hv_int_desc_free(hpdev, int_desc);
 852	}
 853
 854	int_desc = kzalloc(sizeof(*int_desc), GFP_KERNEL);
 855	if (!int_desc)
 856		goto drop_reference;
 857
 858	memset(&ctxt, 0, sizeof(ctxt));
 859	init_completion(&comp.comp_pkt.host_event);
 860	ctxt.pkt.completion_func = hv_pci_compose_compl;
 861	ctxt.pkt.compl_ctxt = &comp;
 862	int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
 863	int_pkt->message_type.message_type = PCI_CREATE_INTERRUPT_MESSAGE;
 864	int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
 865	int_pkt->int_desc.vector = cfg->vector;
 866	int_pkt->int_desc.vector_count = 1;
 867	int_pkt->int_desc.delivery_mode =
 868		(apic->irq_delivery_mode == dest_LowestPrio) ? 1 : 0;
 869
 870	/*
 871	 * This bit doesn't have to work on machines with more than 64
 872	 * processors because Hyper-V only supports 64 in a guest.
 873	 */
 874	affinity = irq_data_get_affinity_mask(data);
 875	for_each_cpu_and(cpu, affinity, cpu_online_mask) {
 876		int_pkt->int_desc.cpu_mask |=
 877			(1ULL << vmbus_cpu_number_to_vp_number(cpu));
 878	}
 879
 880	ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt,
 881			       sizeof(*int_pkt), (unsigned long)&ctxt.pkt,
 882			       VM_PKT_DATA_INBAND,
 883			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 884	if (!ret)
 885		wait_for_completion(&comp.comp_pkt.host_event);
 886
 887	if (comp.comp_pkt.completion_status < 0) {
 888		dev_err(&hbus->hdev->device,
 889			"Request for interrupt failed: 0x%x",
 890			comp.comp_pkt.completion_status);
 891		goto free_int_desc;
 892	}
 893
 894	/*
 895	 * Record the assignment so that this can be unwound later. Using
 896	 * irq_set_chip_data() here would be appropriate, but the lock it takes
 897	 * is already held.
 898	 */
 899	*int_desc = comp.int_desc;
 900	data->chip_data = int_desc;
 901
 902	/* Pass up the result. */
 903	msg->address_hi = comp.int_desc.address >> 32;
 904	msg->address_lo = comp.int_desc.address & 0xffffffff;
 905	msg->data = comp.int_desc.data;
 906
 907	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 908	return;
 909
 910free_int_desc:
 911	kfree(int_desc);
 912drop_reference:
 913	put_pcichild(hpdev, hv_pcidev_ref_by_slot);
 914return_null_message:
 915	msg->address_hi = 0;
 916	msg->address_lo = 0;
 917	msg->data = 0;
 918}
 919
 920/* HW Interrupt Chip Descriptor */
 921static struct irq_chip hv_msi_irq_chip = {
 922	.name			= "Hyper-V PCIe MSI",
 923	.irq_compose_msi_msg	= hv_compose_msi_msg,
 924	.irq_set_affinity	= hv_set_affinity,
 925	.irq_ack		= irq_chip_ack_parent,
 926	.irq_mask		= hv_irq_mask,
 927	.irq_unmask		= hv_irq_unmask,
 928};
 929
 930static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
 931						   msi_alloc_info_t *arg)
 932{
 933	return arg->msi_hwirq;
 934}
 935
 936static struct msi_domain_ops hv_msi_ops = {
 937	.get_hwirq	= hv_msi_domain_ops_get_hwirq,
 938	.msi_prepare	= pci_msi_prepare,
 939	.set_desc	= pci_msi_set_desc,
 940	.msi_free	= hv_msi_free,
 941};
 942
 943/**
 944 * hv_pcie_init_irq_domain() - Initialize IRQ domain
 945 * @hbus:	The root PCI bus
 946 *
 947 * This function creates an IRQ domain which will be used for
 948 * interrupts from devices that have been passed through.  These
 949 * devices only support MSI and MSI-X, not line-based interrupts
 950 * or simulations of line-based interrupts through PCIe's
 951 * fabric-layer messages.  Because interrupts are remapped, we
 952 * can support multi-message MSI here.
 953 *
 954 * Return: '0' on success and error value on failure
 955 */
 956static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
 957{
 958	hbus->msi_info.chip = &hv_msi_irq_chip;
 959	hbus->msi_info.ops = &hv_msi_ops;
 960	hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
 961		MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
 962		MSI_FLAG_PCI_MSIX);
 963	hbus->msi_info.handler = handle_edge_irq;
 964	hbus->msi_info.handler_name = "edge";
 965	hbus->msi_info.data = hbus;
 966	hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode,
 967						     &hbus->msi_info,
 968						     x86_vector_domain);
 969	if (!hbus->irq_domain) {
 970		dev_err(&hbus->hdev->device,
 971			"Failed to build an MSI IRQ domain\n");
 972		return -ENODEV;
 973	}
 974
 975	return 0;
 976}
 977
 978/**
 979 * get_bar_size() - Get the address space consumed by a BAR
 980 * @bar_val:	Value that a BAR returned after -1 was written
 981 *              to it.
 982 *
 983 * This function returns the size of the BAR, rounded up to 1
 984 * page.  It has to be rounded up because the hypervisor's page
 985 * table entry that maps the BAR into the VM can't specify an
 986 * offset within a page.  The invariant is that the hypervisor
 987 * must place any BARs of smaller than page length at the
 988 * beginning of a page.
 989 *
 990 * Return:	Size in bytes of the consumed MMIO space.
 991 */
 992static u64 get_bar_size(u64 bar_val)
 993{
 994	return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
 995			PAGE_SIZE);
 996}
 997
 998/**
 999 * survey_child_resources() - Total all MMIO requirements
1000 * @hbus:	Root PCI bus, as understood by this driver
1001 */
1002static void survey_child_resources(struct hv_pcibus_device *hbus)
1003{
1004	struct list_head *iter;
1005	struct hv_pci_dev *hpdev;
1006	resource_size_t bar_size = 0;
1007	unsigned long flags;
1008	struct completion *event;
1009	u64 bar_val;
1010	int i;
1011
1012	/* If nobody is waiting on the answer, don't compute it. */
1013	event = xchg(&hbus->survey_event, NULL);
1014	if (!event)
1015		return;
1016
1017	/* If the answer has already been computed, go with it. */
1018	if (hbus->low_mmio_space || hbus->high_mmio_space) {
1019		complete(event);
1020		return;
1021	}
1022
1023	spin_lock_irqsave(&hbus->device_list_lock, flags);
1024
1025	/*
1026	 * Due to an interesting quirk of the PCI spec, all memory regions
1027	 * for a child device are a power of 2 in size and aligned in memory,
1028	 * so it's sufficient to just add them up without tracking alignment.
1029	 */
1030	list_for_each(iter, &hbus->children) {
1031		hpdev = container_of(iter, struct hv_pci_dev, list_entry);
1032		for (i = 0; i < 6; i++) {
1033			if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
1034				dev_err(&hbus->hdev->device,
1035					"There's an I/O BAR in this list!\n");
1036
1037			if (hpdev->probed_bar[i] != 0) {
1038				/*
1039				 * A probed BAR has all the upper bits set that
1040				 * can be changed.
1041				 */
1042
1043				bar_val = hpdev->probed_bar[i];
1044				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1045					bar_val |=
1046					((u64)hpdev->probed_bar[++i] << 32);
1047				else
1048					bar_val |= 0xffffffff00000000ULL;
1049
1050				bar_size = get_bar_size(bar_val);
1051
1052				if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1053					hbus->high_mmio_space += bar_size;
1054				else
1055					hbus->low_mmio_space += bar_size;
1056			}
1057		}
1058	}
1059
1060	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1061	complete(event);
1062}
1063
1064/**
1065 * prepopulate_bars() - Fill in BARs with defaults
1066 * @hbus:	Root PCI bus, as understood by this driver
1067 *
1068 * The core PCI driver code seems much, much happier if the BARs
1069 * for a device have values upon first scan. So fill them in.
1070 * The algorithm below works down from large sizes to small,
1071 * attempting to pack the assignments optimally. The assumption,
1072 * enforced in other parts of the code, is that the beginning of
1073 * the memory-mapped I/O space will be aligned on the largest
1074 * BAR size.
1075 */
1076static void prepopulate_bars(struct hv_pcibus_device *hbus)
1077{
1078	resource_size_t high_size = 0;
1079	resource_size_t low_size = 0;
1080	resource_size_t high_base = 0;
1081	resource_size_t low_base = 0;
1082	resource_size_t bar_size;
1083	struct hv_pci_dev *hpdev;
1084	struct list_head *iter;
1085	unsigned long flags;
1086	u64 bar_val;
1087	u32 command;
1088	bool high;
1089	int i;
1090
1091	if (hbus->low_mmio_space) {
1092		low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
1093		low_base = hbus->low_mmio_res->start;
1094	}
1095
1096	if (hbus->high_mmio_space) {
1097		high_size = 1ULL <<
1098			(63 - __builtin_clzll(hbus->high_mmio_space));
1099		high_base = hbus->high_mmio_res->start;
1100	}
1101
1102	spin_lock_irqsave(&hbus->device_list_lock, flags);
1103
1104	/* Pick addresses for the BARs. */
1105	do {
1106		list_for_each(iter, &hbus->children) {
1107			hpdev = container_of(iter, struct hv_pci_dev,
1108					     list_entry);
1109			for (i = 0; i < 6; i++) {
1110				bar_val = hpdev->probed_bar[i];
1111				if (bar_val == 0)
1112					continue;
1113				high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
1114				if (high) {
1115					bar_val |=
1116						((u64)hpdev->probed_bar[i + 1]
1117						 << 32);
1118				} else {
1119					bar_val |= 0xffffffffULL << 32;
1120				}
1121				bar_size = get_bar_size(bar_val);
1122				if (high) {
1123					if (high_size != bar_size) {
1124						i++;
1125						continue;
1126					}
1127					_hv_pcifront_write_config(hpdev,
1128						PCI_BASE_ADDRESS_0 + (4 * i),
1129						4,
1130						(u32)(high_base & 0xffffff00));
1131					i++;
1132					_hv_pcifront_write_config(hpdev,
1133						PCI_BASE_ADDRESS_0 + (4 * i),
1134						4, (u32)(high_base >> 32));
1135					high_base += bar_size;
1136				} else {
1137					if (low_size != bar_size)
1138						continue;
1139					_hv_pcifront_write_config(hpdev,
1140						PCI_BASE_ADDRESS_0 + (4 * i),
1141						4,
1142						(u32)(low_base & 0xffffff00));
1143					low_base += bar_size;
1144				}
1145			}
1146			if (high_size <= 1 && low_size <= 1) {
1147				/* Set the memory enable bit. */
1148				_hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
1149							 &command);
1150				command |= PCI_COMMAND_MEMORY;
1151				_hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
1152							  command);
1153				break;
1154			}
1155		}
1156
1157		high_size >>= 1;
1158		low_size >>= 1;
1159	}  while (high_size || low_size);
1160
1161	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1162}
1163
1164/**
1165 * create_root_hv_pci_bus() - Expose a new root PCI bus
1166 * @hbus:	Root PCI bus, as understood by this driver
1167 *
1168 * Return: 0 on success, -errno on failure
1169 */
1170static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
1171{
1172	/* Register the device */
1173	hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device,
1174					    0, /* bus number is always zero */
1175					    &hv_pcifront_ops,
1176					    &hbus->sysdata,
1177					    &hbus->resources_for_children);
1178	if (!hbus->pci_bus)
1179		return -ENODEV;
1180
1181	hbus->pci_bus->msi = &hbus->msi_chip;
1182	hbus->pci_bus->msi->dev = &hbus->hdev->device;
1183
1184	pci_scan_child_bus(hbus->pci_bus);
1185	pci_bus_assign_resources(hbus->pci_bus);
1186	pci_bus_add_devices(hbus->pci_bus);
1187	hbus->state = hv_pcibus_installed;
1188	return 0;
1189}
1190
1191struct q_res_req_compl {
1192	struct completion host_event;
1193	struct hv_pci_dev *hpdev;
1194};
1195
1196/**
1197 * q_resource_requirements() - Query Resource Requirements
1198 * @context:		The completion context.
1199 * @resp:		The response that came from the host.
1200 * @resp_packet_size:	The size in bytes of resp.
1201 *
1202 * This function is invoked on completion of a Query Resource
1203 * Requirements packet.
1204 */
1205static void q_resource_requirements(void *context, struct pci_response *resp,
1206				    int resp_packet_size)
1207{
1208	struct q_res_req_compl *completion = context;
1209	struct pci_q_res_req_response *q_res_req =
1210		(struct pci_q_res_req_response *)resp;
1211	int i;
1212
1213	if (resp->status < 0) {
1214		dev_err(&completion->hpdev->hbus->hdev->device,
1215			"query resource requirements failed: %x\n",
1216			resp->status);
1217	} else {
1218		for (i = 0; i < 6; i++) {
1219			completion->hpdev->probed_bar[i] =
1220				q_res_req->probed_bar[i];
1221		}
1222	}
1223
1224	complete(&completion->host_event);
1225}
1226
1227static void get_pcichild(struct hv_pci_dev *hpdev,
1228			    enum hv_pcidev_ref_reason reason)
1229{
1230	atomic_inc(&hpdev->refs);
1231}
1232
1233static void put_pcichild(struct hv_pci_dev *hpdev,
1234			    enum hv_pcidev_ref_reason reason)
1235{
1236	if (atomic_dec_and_test(&hpdev->refs))
1237		kfree(hpdev);
1238}
1239
1240/**
1241 * new_pcichild_device() - Create a new child device
1242 * @hbus:	The internal struct tracking this root PCI bus.
1243 * @desc:	The information supplied so far from the host
1244 *              about the device.
1245 *
1246 * This function creates the tracking structure for a new child
1247 * device and kicks off the process of figuring out what it is.
1248 *
1249 * Return: Pointer to the new tracking struct
1250 */
1251static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
1252		struct pci_function_description *desc)
1253{
1254	struct hv_pci_dev *hpdev;
1255	struct pci_child_message *res_req;
1256	struct q_res_req_compl comp_pkt;
1257	union {
1258	struct pci_packet init_packet;
1259		u8 buffer[0x100];
1260	} pkt;
1261	unsigned long flags;
1262	int ret;
1263
1264	hpdev = kzalloc(sizeof(*hpdev), GFP_ATOMIC);
1265	if (!hpdev)
1266		return NULL;
1267
1268	hpdev->hbus = hbus;
1269
1270	memset(&pkt, 0, sizeof(pkt));
1271	init_completion(&comp_pkt.host_event);
1272	comp_pkt.hpdev = hpdev;
1273	pkt.init_packet.compl_ctxt = &comp_pkt;
1274	pkt.init_packet.completion_func = q_resource_requirements;
1275	res_req = (struct pci_child_message *)&pkt.init_packet.message;
1276	res_req->message_type = PCI_QUERY_RESOURCE_REQUIREMENTS;
1277	res_req->wslot.slot = desc->win_slot.slot;
1278
1279	ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
1280			       sizeof(struct pci_child_message),
1281			       (unsigned long)&pkt.init_packet,
1282			       VM_PKT_DATA_INBAND,
1283			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1284	if (ret)
1285		goto error;
1286
1287	wait_for_completion(&comp_pkt.host_event);
1288
1289	hpdev->desc = *desc;
1290	get_pcichild(hpdev, hv_pcidev_ref_initial);
1291	get_pcichild(hpdev, hv_pcidev_ref_childlist);
1292	spin_lock_irqsave(&hbus->device_list_lock, flags);
1293	list_add_tail(&hpdev->list_entry, &hbus->children);
1294	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1295	return hpdev;
1296
1297error:
1298	kfree(hpdev);
1299	return NULL;
1300}
1301
1302/**
1303 * get_pcichild_wslot() - Find device from slot
1304 * @hbus:	Root PCI bus, as understood by this driver
1305 * @wslot:	Location on the bus
1306 *
1307 * This function looks up a PCI device and returns the internal
1308 * representation of it.  It acquires a reference on it, so that
1309 * the device won't be deleted while somebody is using it.  The
1310 * caller is responsible for calling put_pcichild() to release
1311 * this reference.
1312 *
1313 * Return:	Internal representation of a PCI device
1314 */
1315static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
1316					     u32 wslot)
1317{
1318	unsigned long flags;
1319	struct hv_pci_dev *iter, *hpdev = NULL;
1320
1321	spin_lock_irqsave(&hbus->device_list_lock, flags);
1322	list_for_each_entry(iter, &hbus->children, list_entry) {
1323		if (iter->desc.win_slot.slot == wslot) {
1324			hpdev = iter;
1325			get_pcichild(hpdev, hv_pcidev_ref_by_slot);
1326			break;
1327		}
1328	}
1329	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1330
1331	return hpdev;
1332}
1333
1334/**
1335 * pci_devices_present_work() - Handle new list of child devices
1336 * @work:	Work struct embedded in struct hv_dr_work
1337 *
1338 * "Bus Relations" is the Windows term for "children of this
1339 * bus."  The terminology is preserved here for people trying to
1340 * debug the interaction between Hyper-V and Linux.  This
1341 * function is called when the parent partition reports a list
1342 * of functions that should be observed under this PCI Express
1343 * port (bus).
1344 *
1345 * This function updates the list, and must tolerate being
1346 * called multiple times with the same information.  The typical
1347 * number of child devices is one, with very atypical cases
1348 * involving three or four, so the algorithms used here can be
1349 * simple and inefficient.
1350 *
1351 * It must also treat the omission of a previously observed device as
1352 * notification that the device no longer exists.
1353 *
1354 * Note that this function is a work item, and it may not be
1355 * invoked in the order that it was queued.  Back to back
1356 * updates of the list of present devices may involve queuing
1357 * multiple work items, and this one may run before ones that
1358 * were sent later. As such, this function only does something
1359 * if is the last one in the queue.
1360 */
1361static void pci_devices_present_work(struct work_struct *work)
1362{
1363	u32 child_no;
1364	bool found;
1365	struct list_head *iter;
1366	struct pci_function_description *new_desc;
1367	struct hv_pci_dev *hpdev;
1368	struct hv_pcibus_device *hbus;
1369	struct list_head removed;
1370	struct hv_dr_work *dr_wrk;
1371	struct hv_dr_state *dr = NULL;
1372	unsigned long flags;
1373
1374	dr_wrk = container_of(work, struct hv_dr_work, wrk);
1375	hbus = dr_wrk->bus;
1376	kfree(dr_wrk);
1377
1378	INIT_LIST_HEAD(&removed);
1379
1380	if (down_interruptible(&hbus->enum_sem)) {
1381		put_hvpcibus(hbus);
1382		return;
1383	}
1384
1385	/* Pull this off the queue and process it if it was the last one. */
1386	spin_lock_irqsave(&hbus->device_list_lock, flags);
1387	while (!list_empty(&hbus->dr_list)) {
1388		dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
1389				      list_entry);
1390		list_del(&dr->list_entry);
1391
1392		/* Throw this away if the list still has stuff in it. */
1393		if (!list_empty(&hbus->dr_list)) {
1394			kfree(dr);
1395			continue;
1396		}
1397	}
1398	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1399
1400	if (!dr) {
1401		up(&hbus->enum_sem);
1402		put_hvpcibus(hbus);
1403		return;
1404	}
1405
1406	/* First, mark all existing children as reported missing. */
1407	spin_lock_irqsave(&hbus->device_list_lock, flags);
1408	list_for_each(iter, &hbus->children) {
1409			hpdev = container_of(iter, struct hv_pci_dev,
1410					     list_entry);
1411			hpdev->reported_missing = true;
1412	}
1413	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1414
1415	/* Next, add back any reported devices. */
1416	for (child_no = 0; child_no < dr->device_count; child_no++) {
1417		found = false;
1418		new_desc = &dr->func[child_no];
1419
1420		spin_lock_irqsave(&hbus->device_list_lock, flags);
1421		list_for_each(iter, &hbus->children) {
1422			hpdev = container_of(iter, struct hv_pci_dev,
1423					     list_entry);
1424			if ((hpdev->desc.win_slot.slot ==
1425			     new_desc->win_slot.slot) &&
1426			    (hpdev->desc.v_id == new_desc->v_id) &&
1427			    (hpdev->desc.d_id == new_desc->d_id) &&
1428			    (hpdev->desc.ser == new_desc->ser)) {
1429				hpdev->reported_missing = false;
1430				found = true;
1431			}
1432		}
1433		spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1434
1435		if (!found) {
1436			hpdev = new_pcichild_device(hbus, new_desc);
1437			if (!hpdev)
1438				dev_err(&hbus->hdev->device,
1439					"couldn't record a child device.\n");
1440		}
1441	}
1442
1443	/* Move missing children to a list on the stack. */
1444	spin_lock_irqsave(&hbus->device_list_lock, flags);
1445	do {
1446		found = false;
1447		list_for_each(iter, &hbus->children) {
1448			hpdev = container_of(iter, struct hv_pci_dev,
1449					     list_entry);
1450			if (hpdev->reported_missing) {
1451				found = true;
1452				put_pcichild(hpdev, hv_pcidev_ref_childlist);
1453				list_del(&hpdev->list_entry);
1454				list_add_tail(&hpdev->list_entry, &removed);
1455				break;
1456			}
1457		}
1458	} while (found);
1459	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1460
1461	/* Delete everything that should no longer exist. */
1462	while (!list_empty(&removed)) {
1463		hpdev = list_first_entry(&removed, struct hv_pci_dev,
1464					 list_entry);
1465		list_del(&hpdev->list_entry);
1466		put_pcichild(hpdev, hv_pcidev_ref_initial);
1467	}
1468
1469	/* Tell the core to rescan bus because there may have been changes. */
1470	if (hbus->state == hv_pcibus_installed) {
1471		pci_lock_rescan_remove();
1472		pci_scan_child_bus(hbus->pci_bus);
1473		pci_unlock_rescan_remove();
1474	} else {
1475		survey_child_resources(hbus);
1476	}
1477
1478	up(&hbus->enum_sem);
1479	put_hvpcibus(hbus);
1480	kfree(dr);
1481}
1482
1483/**
1484 * hv_pci_devices_present() - Handles list of new children
1485 * @hbus:	Root PCI bus, as understood by this driver
1486 * @relations:	Packet from host listing children
1487 *
1488 * This function is invoked whenever a new list of devices for
1489 * this bus appears.
1490 */
1491static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
1492				   struct pci_bus_relations *relations)
1493{
1494	struct hv_dr_state *dr;
1495	struct hv_dr_work *dr_wrk;
1496	unsigned long flags;
1497
1498	dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
1499	if (!dr_wrk)
1500		return;
1501
1502	dr = kzalloc(offsetof(struct hv_dr_state, func) +
1503		     (sizeof(struct pci_function_description) *
1504		      (relations->device_count)), GFP_NOWAIT);
1505	if (!dr)  {
1506		kfree(dr_wrk);
1507		return;
1508	}
1509
1510	INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
1511	dr_wrk->bus = hbus;
1512	dr->device_count = relations->device_count;
1513	if (dr->device_count != 0) {
1514		memcpy(dr->func, relations->func,
1515		       sizeof(struct pci_function_description) *
1516		       dr->device_count);
1517	}
1518
1519	spin_lock_irqsave(&hbus->device_list_lock, flags);
1520	list_add_tail(&dr->list_entry, &hbus->dr_list);
1521	spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1522
1523	get_hvpcibus(hbus);
1524	schedule_work(&dr_wrk->wrk);
1525}
1526
1527/**
1528 * hv_eject_device_work() - Asynchronously handles ejection
1529 * @work:	Work struct embedded in internal device struct
1530 *
1531 * This function handles ejecting a device.  Windows will
1532 * attempt to gracefully eject a device, waiting 60 seconds to
1533 * hear back from the guest OS that this completed successfully.
1534 * If this timer expires, the device will be forcibly removed.
1535 */
1536static void hv_eject_device_work(struct work_struct *work)
1537{
1538	struct pci_eject_response *ejct_pkt;
1539	struct hv_pci_dev *hpdev;
1540	struct pci_dev *pdev;
1541	unsigned long flags;
1542	int wslot;
1543	struct {
1544		struct pci_packet pkt;
1545		u8 buffer[sizeof(struct pci_eject_response) -
1546			  sizeof(struct pci_message)];
1547	} ctxt;
1548
1549	hpdev = container_of(work, struct hv_pci_dev, wrk);
1550
1551	if (hpdev->state != hv_pcichild_ejecting) {
1552		put_pcichild(hpdev, hv_pcidev_ref_pnp);
1553		return;
1554	}
1555
1556	/*
1557	 * Ejection can come before or after the PCI bus has been set up, so
1558	 * attempt to find it and tear down the bus state, if it exists.  This
1559	 * must be done without constructs like pci_domain_nr(hbus->pci_bus)
1560	 * because hbus->pci_bus may not exist yet.
1561	 */
1562	wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
1563	pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
1564					   wslot);
1565	if (pdev) {
1566		pci_stop_and_remove_bus_device(pdev);
1567		pci_dev_put(pdev);
1568	}
1569
1570	memset(&ctxt, 0, sizeof(ctxt));
1571	ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
1572	ejct_pkt->message_type = PCI_EJECTION_COMPLETE;
1573	ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
1574	vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
1575			 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
1576			 VM_PKT_DATA_INBAND, 0);
1577
1578	spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
1579	list_del(&hpdev->list_entry);
1580	spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);
1581
1582	put_pcichild(hpdev, hv_pcidev_ref_childlist);
1583	put_pcichild(hpdev, hv_pcidev_ref_pnp);
1584	put_hvpcibus(hpdev->hbus);
1585}
1586
1587/**
1588 * hv_pci_eject_device() - Handles device ejection
1589 * @hpdev:	Internal device tracking struct
1590 *
1591 * This function is invoked when an ejection packet arrives.  It
1592 * just schedules work so that we don't re-enter the packet
1593 * delivery code handling the ejection.
1594 */
1595static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
1596{
1597	hpdev->state = hv_pcichild_ejecting;
1598	get_pcichild(hpdev, hv_pcidev_ref_pnp);
1599	INIT_WORK(&hpdev->wrk, hv_eject_device_work);
1600	get_hvpcibus(hpdev->hbus);
1601	schedule_work(&hpdev->wrk);
1602}
1603
1604/**
1605 * hv_pci_onchannelcallback() - Handles incoming packets
1606 * @context:	Internal bus tracking struct
1607 *
1608 * This function is invoked whenever the host sends a packet to
1609 * this channel (which is private to this root PCI bus).
1610 */
1611static void hv_pci_onchannelcallback(void *context)
1612{
1613	const int packet_size = 0x100;
1614	int ret;
1615	struct hv_pcibus_device *hbus = context;
1616	u32 bytes_recvd;
1617	u64 req_id;
1618	struct vmpacket_descriptor *desc;
1619	unsigned char *buffer;
1620	int bufferlen = packet_size;
1621	struct pci_packet *comp_packet;
1622	struct pci_response *response;
1623	struct pci_incoming_message *new_message;
1624	struct pci_bus_relations *bus_rel;
1625	struct pci_dev_incoming *dev_message;
1626	struct hv_pci_dev *hpdev;
1627
1628	buffer = kmalloc(bufferlen, GFP_ATOMIC);
1629	if (!buffer)
1630		return;
1631
1632	while (1) {
1633		ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
1634					   bufferlen, &bytes_recvd, &req_id);
1635
1636		if (ret == -ENOBUFS) {
1637			kfree(buffer);
1638			/* Handle large packet */
1639			bufferlen = bytes_recvd;
1640			buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
1641			if (!buffer)
1642				return;
1643			continue;
1644		}
1645
1646		/*
1647		 * All incoming packets must be at least as large as a
1648		 * response.
1649		 */
1650		if (bytes_recvd <= sizeof(struct pci_response)) {
1651			kfree(buffer);
1652			return;
1653		}
1654		desc = (struct vmpacket_descriptor *)buffer;
1655
1656		switch (desc->type) {
1657		case VM_PKT_COMP:
1658
1659			/*
1660			 * The host is trusted, and thus it's safe to interpret
1661			 * this transaction ID as a pointer.
1662			 */
1663			comp_packet = (struct pci_packet *)req_id;
1664			response = (struct pci_response *)buffer;
1665			comp_packet->completion_func(comp_packet->compl_ctxt,
1666						     response,
1667						     bytes_recvd);
1668			kfree(buffer);
1669			return;
1670
1671		case VM_PKT_DATA_INBAND:
1672
1673			new_message = (struct pci_incoming_message *)buffer;
1674			switch (new_message->message_type.message_type) {
1675			case PCI_BUS_RELATIONS:
1676
1677				bus_rel = (struct pci_bus_relations *)buffer;
1678				if (bytes_recvd <
1679				    offsetof(struct pci_bus_relations, func) +
1680				    (sizeof(struct pci_function_description) *
1681				     (bus_rel->device_count))) {
1682					dev_err(&hbus->hdev->device,
1683						"bus relations too small\n");
1684					break;
1685				}
1686
1687				hv_pci_devices_present(hbus, bus_rel);
1688				break;
1689
1690			case PCI_EJECT:
1691
1692				dev_message = (struct pci_dev_incoming *)buffer;
1693				hpdev = get_pcichild_wslot(hbus,
1694						      dev_message->wslot.slot);
1695				if (hpdev) {
1696					hv_pci_eject_device(hpdev);
1697					put_pcichild(hpdev,
1698							hv_pcidev_ref_by_slot);
1699				}
1700				break;
1701
1702			default:
1703				dev_warn(&hbus->hdev->device,
1704					"Unimplemented protocol message %x\n",
1705					new_message->message_type.message_type);
1706				break;
1707			}
1708			break;
1709
1710		default:
1711			dev_err(&hbus->hdev->device,
1712				"unhandled packet type %d, tid %llx len %d\n",
1713				desc->type, req_id, bytes_recvd);
1714			break;
1715		}
1716		break;
1717	}
1718}
1719
1720/**
1721 * hv_pci_protocol_negotiation() - Set up protocol
1722 * @hdev:	VMBus's tracking struct for this root PCI bus
1723 *
1724 * This driver is intended to support running on Windows 10
1725 * (server) and later versions. It will not run on earlier
1726 * versions, as they assume that many of the operations which
1727 * Linux needs accomplished with a spinlock held were done via
1728 * asynchronous messaging via VMBus.  Windows 10 increases the
1729 * surface area of PCI emulation so that these actions can take
1730 * place by suspending a virtual processor for their duration.
1731 *
1732 * This function negotiates the channel protocol version,
1733 * failing if the host doesn't support the necessary protocol
1734 * level.
1735 */
1736static int hv_pci_protocol_negotiation(struct hv_device *hdev)
1737{
1738	struct pci_version_request *version_req;
1739	struct hv_pci_compl comp_pkt;
1740	struct pci_packet *pkt;
1741	int ret;
1742
1743	/*
1744	 * Initiate the handshake with the host and negotiate
1745	 * a version that the host can support. We start with the
1746	 * highest version number and go down if the host cannot
1747	 * support it.
1748	 */
1749	pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
1750	if (!pkt)
1751		return -ENOMEM;
1752
1753	init_completion(&comp_pkt.host_event);
1754	pkt->completion_func = hv_pci_generic_compl;
1755	pkt->compl_ctxt = &comp_pkt;
1756	version_req = (struct pci_version_request *)&pkt->message;
1757	version_req->message_type.message_type = PCI_QUERY_PROTOCOL_VERSION;
1758	version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;
1759
1760	ret = vmbus_sendpacket(hdev->channel, version_req,
1761			       sizeof(struct pci_version_request),
1762			       (unsigned long)pkt, VM_PKT_DATA_INBAND,
1763			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1764	if (ret)
1765		goto exit;
1766
1767	wait_for_completion(&comp_pkt.host_event);
1768
1769	if (comp_pkt.completion_status < 0) {
1770		dev_err(&hdev->device,
1771			"PCI Pass-through VSP failed version request %x\n",
1772			comp_pkt.completion_status);
1773		ret = -EPROTO;
1774		goto exit;
1775	}
1776
1777	ret = 0;
1778
1779exit:
1780	kfree(pkt);
1781	return ret;
1782}
1783
1784/**
1785 * hv_pci_free_bridge_windows() - Release memory regions for the
1786 * bus
1787 * @hbus:	Root PCI bus, as understood by this driver
1788 */
1789static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
1790{
1791	/*
1792	 * Set the resources back to the way they looked when they
1793	 * were allocated by setting IORESOURCE_BUSY again.
1794	 */
1795
1796	if (hbus->low_mmio_space && hbus->low_mmio_res) {
1797		hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
1798		release_mem_region(hbus->low_mmio_res->start,
1799				   resource_size(hbus->low_mmio_res));
1800	}
1801
1802	if (hbus->high_mmio_space && hbus->high_mmio_res) {
1803		hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
1804		release_mem_region(hbus->high_mmio_res->start,
1805				   resource_size(hbus->high_mmio_res));
1806	}
1807}
1808
1809/**
1810 * hv_pci_allocate_bridge_windows() - Allocate memory regions
1811 * for the bus
1812 * @hbus:	Root PCI bus, as understood by this driver
1813 *
1814 * This function calls vmbus_allocate_mmio(), which is itself a
1815 * bit of a compromise.  Ideally, we might change the pnp layer
1816 * in the kernel such that it comprehends either PCI devices
1817 * which are "grandchildren of ACPI," with some intermediate bus
1818 * node (in this case, VMBus) or change it such that it
1819 * understands VMBus.  The pnp layer, however, has been declared
1820 * deprecated, and not subject to change.
1821 *
1822 * The workaround, implemented here, is to ask VMBus to allocate
1823 * MMIO space for this bus.  VMBus itself knows which ranges are
1824 * appropriate by looking at its own ACPI objects.  Then, after
1825 * these ranges are claimed, they're modified to look like they
1826 * would have looked if the ACPI and pnp code had allocated
1827 * bridge windows.  These descriptors have to exist in this form
1828 * in order to satisfy the code which will get invoked when the
1829 * endpoint PCI function driver calls request_mem_region() or
1830 * request_mem_region_exclusive().
1831 *
1832 * Return: 0 on success, -errno on failure
1833 */
1834static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
1835{
1836	resource_size_t align;
1837	int ret;
1838
1839	if (hbus->low_mmio_space) {
1840		align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
1841		ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
1842					  (u64)(u32)0xffffffff,
1843					  hbus->low_mmio_space,
1844					  align, false);
1845		if (ret) {
1846			dev_err(&hbus->hdev->device,
1847				"Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
1848				hbus->low_mmio_space);
1849			return ret;
1850		}
1851
1852		/* Modify this resource to become a bridge window. */
1853		hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
1854		hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
1855		pci_add_resource(&hbus->resources_for_children,
1856				 hbus->low_mmio_res);
1857	}
1858
1859	if (hbus->high_mmio_space) {
1860		align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
1861		ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
1862					  0x100000000, -1,
1863					  hbus->high_mmio_space, align,
1864					  false);
1865		if (ret) {
1866			dev_err(&hbus->hdev->device,
1867				"Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
1868				hbus->high_mmio_space);
1869			goto release_low_mmio;
1870		}
1871
1872		/* Modify this resource to become a bridge window. */
1873		hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
1874		hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
1875		pci_add_resource(&hbus->resources_for_children,
1876				 hbus->high_mmio_res);
1877	}
1878
1879	return 0;
1880
1881release_low_mmio:
1882	if (hbus->low_mmio_res) {
1883		release_mem_region(hbus->low_mmio_res->start,
1884				   resource_size(hbus->low_mmio_res));
1885	}
1886
1887	return ret;
1888}
1889
1890/**
1891 * hv_allocate_config_window() - Find MMIO space for PCI Config
1892 * @hbus:	Root PCI bus, as understood by this driver
1893 *
1894 * This function claims memory-mapped I/O space for accessing
1895 * configuration space for the functions on this bus.
1896 *
1897 * Return: 0 on success, -errno on failure
1898 */
1899static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
1900{
1901	int ret;
1902
1903	/*
1904	 * Set up a region of MMIO space to use for accessing configuration
1905	 * space.
1906	 */
1907	ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
1908				  PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
1909	if (ret)
1910		return ret;
1911
1912	/*
1913	 * vmbus_allocate_mmio() gets used for allocating both device endpoint
1914	 * resource claims (those which cannot be overlapped) and the ranges
1915	 * which are valid for the children of this bus, which are intended
1916	 * to be overlapped by those children.  Set the flag on this claim
1917	 * meaning that this region can't be overlapped.
1918	 */
1919
1920	hbus->mem_config->flags |= IORESOURCE_BUSY;
1921
1922	return 0;
1923}
1924
1925static void hv_free_config_window(struct hv_pcibus_device *hbus)
1926{
1927	release_mem_region(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
1928}
1929
1930/**
1931 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
1932 * @hdev:	VMBus's tracking struct for this root PCI bus
1933 *
1934 * Return: 0 on success, -errno on failure
1935 */
1936static int hv_pci_enter_d0(struct hv_device *hdev)
1937{
1938	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
1939	struct pci_bus_d0_entry *d0_entry;
1940	struct hv_pci_compl comp_pkt;
1941	struct pci_packet *pkt;
1942	int ret;
1943
1944	/*
1945	 * Tell the host that the bus is ready to use, and moved into the
1946	 * powered-on state.  This includes telling the host which region
1947	 * of memory-mapped I/O space has been chosen for configuration space
1948	 * access.
1949	 */
1950	pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
1951	if (!pkt)
1952		return -ENOMEM;
1953
1954	init_completion(&comp_pkt.host_event);
1955	pkt->completion_func = hv_pci_generic_compl;
1956	pkt->compl_ctxt = &comp_pkt;
1957	d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
1958	d0_entry->message_type.message_type = PCI_BUS_D0ENTRY;
1959	d0_entry->mmio_base = hbus->mem_config->start;
1960
1961	ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
1962			       (unsigned long)pkt, VM_PKT_DATA_INBAND,
1963			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1964	if (ret)
1965		goto exit;
1966
1967	wait_for_completion(&comp_pkt.host_event);
1968
1969	if (comp_pkt.completion_status < 0) {
1970		dev_err(&hdev->device,
1971			"PCI Pass-through VSP failed D0 Entry with status %x\n",
1972			comp_pkt.completion_status);
1973		ret = -EPROTO;
1974		goto exit;
1975	}
1976
1977	ret = 0;
1978
1979exit:
1980	kfree(pkt);
1981	return ret;
1982}
1983
1984/**
1985 * hv_pci_query_relations() - Ask host to send list of child
1986 * devices
1987 * @hdev:	VMBus's tracking struct for this root PCI bus
1988 *
1989 * Return: 0 on success, -errno on failure
1990 */
1991static int hv_pci_query_relations(struct hv_device *hdev)
1992{
1993	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
1994	struct pci_message message;
1995	struct completion comp;
1996	int ret;
1997
1998	/* Ask the host to send along the list of child devices */
1999	init_completion(&comp);
2000	if (cmpxchg(&hbus->survey_event, NULL, &comp))
2001		return -ENOTEMPTY;
2002
2003	memset(&message, 0, sizeof(message));
2004	message.message_type = PCI_QUERY_BUS_RELATIONS;
2005
2006	ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
2007			       0, VM_PKT_DATA_INBAND, 0);
2008	if (ret)
2009		return ret;
2010
2011	wait_for_completion(&comp);
2012	return 0;
2013}
2014
2015/**
2016 * hv_send_resources_allocated() - Report local resource choices
2017 * @hdev:	VMBus's tracking struct for this root PCI bus
2018 *
2019 * The host OS is expecting to be sent a request as a message
2020 * which contains all the resources that the device will use.
2021 * The response contains those same resources, "translated"
2022 * which is to say, the values which should be used by the
2023 * hardware, when it delivers an interrupt.  (MMIO resources are
2024 * used in local terms.)  This is nice for Windows, and lines up
2025 * with the FDO/PDO split, which doesn't exist in Linux.  Linux
2026 * is deeply expecting to scan an emulated PCI configuration
2027 * space.  So this message is sent here only to drive the state
2028 * machine on the host forward.
2029 *
2030 * Return: 0 on success, -errno on failure
2031 */
2032static int hv_send_resources_allocated(struct hv_device *hdev)
2033{
2034	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2035	struct pci_resources_assigned *res_assigned;
2036	struct hv_pci_compl comp_pkt;
2037	struct hv_pci_dev *hpdev;
2038	struct pci_packet *pkt;
2039	u32 wslot;
2040	int ret;
2041
2042	pkt = kmalloc(sizeof(*pkt) + sizeof(*res_assigned), GFP_KERNEL);
2043	if (!pkt)
2044		return -ENOMEM;
2045
2046	ret = 0;
2047
2048	for (wslot = 0; wslot < 256; wslot++) {
2049		hpdev = get_pcichild_wslot(hbus, wslot);
2050		if (!hpdev)
2051			continue;
2052
2053		memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned));
2054		init_completion(&comp_pkt.host_event);
2055		pkt->completion_func = hv_pci_generic_compl;
2056		pkt->compl_ctxt = &comp_pkt;
2057		pkt->message.message_type = PCI_RESOURCES_ASSIGNED;
2058		res_assigned = (struct pci_resources_assigned *)&pkt->message;
2059		res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
2060
2061		put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2062
2063		ret = vmbus_sendpacket(
2064			hdev->channel, &pkt->message,
2065			sizeof(*res_assigned),
2066			(unsigned long)pkt,
2067			VM_PKT_DATA_INBAND,
2068			VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2069		if (ret)
2070			break;
2071
2072		wait_for_completion(&comp_pkt.host_event);
2073
2074		if (comp_pkt.completion_status < 0) {
2075			ret = -EPROTO;
2076			dev_err(&hdev->device,
2077				"resource allocated returned 0x%x",
2078				comp_pkt.completion_status);
2079			break;
2080		}
2081	}
2082
2083	kfree(pkt);
2084	return ret;
2085}
2086
2087/**
2088 * hv_send_resources_released() - Report local resources
2089 * released
2090 * @hdev:	VMBus's tracking struct for this root PCI bus
2091 *
2092 * Return: 0 on success, -errno on failure
2093 */
2094static int hv_send_resources_released(struct hv_device *hdev)
2095{
2096	struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2097	struct pci_child_message pkt;
2098	struct hv_pci_dev *hpdev;
2099	u32 wslot;
2100	int ret;
2101
2102	for (wslot = 0; wslot < 256; wslot++) {
2103		hpdev = get_pcichild_wslot(hbus, wslot);
2104		if (!hpdev)
2105			continue;
2106
2107		memset(&pkt, 0, sizeof(pkt));
2108		pkt.message_type = PCI_RESOURCES_RELEASED;
2109		pkt.wslot.slot = hpdev->desc.win_slot.slot;
2110
2111		put_pcichild(hpdev, hv_pcidev_ref_by_slot);
2112
2113		ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
2114				       VM_PKT_DATA_INBAND, 0);
2115		if (ret)
2116			return ret;
2117	}
2118
2119	return 0;
2120}
2121
2122static void get_hvpcibus(struct hv_pcibus_device *hbus)
2123{
2124	atomic_inc(&hbus->remove_lock);
2125}
2126
2127static void put_hvpcibus(struct hv_pcibus_device *hbus)
2128{
2129	if (atomic_dec_and_test(&hbus->remove_lock))
2130		complete(&hbus->remove_event);
2131}
2132
2133/**
2134 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
2135 * @hdev:	VMBus's tracking struct for this root PCI bus
2136 * @dev_id:	Identifies the device itself
2137 *
2138 * Return: 0 on success, -errno on failure
2139 */
2140static int hv_pci_probe(struct hv_device *hdev,
2141			const struct hv_vmbus_device_id *dev_id)
2142{
2143	struct hv_pcibus_device *hbus;
2144	int ret;
2145
2146	hbus = kzalloc(sizeof(*hbus), GFP_KERNEL);
2147	if (!hbus)
2148		return -ENOMEM;
2149
2150	/*
2151	 * The PCI bus "domain" is what is called "segment" in ACPI and
2152	 * other specs.  Pull it from the instance ID, to get something
2153	 * unique.  Bytes 8 and 9 are what is used in Windows guests, so
2154	 * do the same thing for consistency.  Note that, since this code
2155	 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
2156	 * that (1) the only domain in use for something that looks like
2157	 * a physical PCI bus (which is actually emulated by the
2158	 * hypervisor) is domain 0 and (2) there will be no overlap
2159	 * between domains derived from these instance IDs in the same
2160	 * VM.
2161	 */
2162	hbus->sysdata.domain = hdev->dev_instance.b[9] |
2163			       hdev->dev_instance.b[8] << 8;
2164
2165	hbus->hdev = hdev;
2166	atomic_inc(&hbus->remove_lock);
2167	INIT_LIST_HEAD(&hbus->children);
2168	INIT_LIST_HEAD(&hbus->dr_list);
2169	INIT_LIST_HEAD(&hbus->resources_for_children);
2170	spin_lock_init(&hbus->config_lock);
2171	spin_lock_init(&hbus->device_list_lock);
2172	sema_init(&hbus->enum_sem, 1);
2173	init_completion(&hbus->remove_event);
2174
2175	ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
2176			 hv_pci_onchannelcallback, hbus);
2177	if (ret)
2178		goto free_bus;
2179
2180	hv_set_drvdata(hdev, hbus);
2181
2182	ret = hv_pci_protocol_negotiation(hdev);
2183	if (ret)
2184		goto close;
2185
2186	ret = hv_allocate_config_window(hbus);
2187	if (ret)
2188		goto close;
2189
2190	hbus->cfg_addr = ioremap(hbus->mem_config->start,
2191				 PCI_CONFIG_MMIO_LENGTH);
2192	if (!hbus->cfg_addr) {
2193		dev_err(&hdev->device,
2194			"Unable to map a virtual address for config space\n");
2195		ret = -ENOMEM;
2196		goto free_config;
2197	}
2198
2199	hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
2200	if (!hbus->sysdata.fwnode) {
2201		ret = -ENOMEM;
2202		goto unmap;
2203	}
2204
2205	ret = hv_pcie_init_irq_domain(hbus);
2206	if (ret)
2207		goto free_fwnode;
2208
2209	ret = hv_pci_query_relations(hdev);
2210	if (ret)
2211		goto free_irq_domain;
2212
2213	ret = hv_pci_enter_d0(hdev);
2214	if (ret)
2215		goto free_irq_domain;
2216
2217	ret = hv_pci_allocate_bridge_windows(hbus);
2218	if (ret)
2219		goto free_irq_domain;
2220
2221	ret = hv_send_resources_allocated(hdev);
2222	if (ret)
2223		goto free_windows;
2224
2225	prepopulate_bars(hbus);
2226
2227	hbus->state = hv_pcibus_probed;
2228
2229	ret = create_root_hv_pci_bus(hbus);
2230	if (ret)
2231		goto free_windows;
2232
2233	return 0;
2234
2235free_windows:
2236	hv_pci_free_bridge_windows(hbus);
2237free_irq_domain:
2238	irq_domain_remove(hbus->irq_domain);
2239free_fwnode:
2240	irq_domain_free_fwnode(hbus->sysdata.fwnode);
2241unmap:
2242	iounmap(hbus->cfg_addr);
2243free_config:
2244	hv_free_config_window(hbus);
2245close:
2246	vmbus_close(hdev->channel);
2247free_bus:
2248	kfree(hbus);
2249	return ret;
2250}
2251
2252/**
2253 * hv_pci_remove() - Remove routine for this VMBus channel
2254 * @hdev:	VMBus's tracking struct for this root PCI bus
2255 *
2256 * Return: 0 on success, -errno on failure
2257 */
2258static int hv_pci_remove(struct hv_device *hdev)
2259{
2260	int ret;
2261	struct hv_pcibus_device *hbus;
2262	union {
2263		struct pci_packet teardown_packet;
2264		u8 buffer[0x100];
2265	} pkt;
2266	struct pci_bus_relations relations;
2267	struct hv_pci_compl comp_pkt;
2268
2269	hbus = hv_get_drvdata(hdev);
2270
2271	ret = hv_send_resources_released(hdev);
2272	if (ret)
2273		dev_err(&hdev->device,
2274			"Couldn't send resources released packet(s)\n");
2275
2276	memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
2277	init_completion(&comp_pkt.host_event);
2278	pkt.teardown_packet.completion_func = hv_pci_generic_compl;
2279	pkt.teardown_packet.compl_ctxt = &comp_pkt;
2280	pkt.teardown_packet.message.message_type = PCI_BUS_D0EXIT;
2281
2282	ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
2283			       sizeof(struct pci_message),
2284			       (unsigned long)&pkt.teardown_packet,
2285			       VM_PKT_DATA_INBAND,
2286			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2287	if (!ret)
2288		wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
2289
2290	if (hbus->state == hv_pcibus_installed) {
2291		/* Remove the bus from PCI's point of view. */
2292		pci_lock_rescan_remove();
2293		pci_stop_root_bus(hbus->pci_bus);
2294		pci_remove_root_bus(hbus->pci_bus);
2295		pci_unlock_rescan_remove();
2296	}
2297
2298	vmbus_close(hdev->channel);
2299
2300	/* Delete any children which might still exist. */
2301	memset(&relations, 0, sizeof(relations));
2302	hv_pci_devices_present(hbus, &relations);
2303
2304	iounmap(hbus->cfg_addr);
2305	hv_free_config_window(hbus);
2306	pci_free_resource_list(&hbus->resources_for_children);
2307	hv_pci_free_bridge_windows(hbus);
2308	irq_domain_remove(hbus->irq_domain);
2309	irq_domain_free_fwnode(hbus->sysdata.fwnode);
2310	put_hvpcibus(hbus);
2311	wait_for_completion(&hbus->remove_event);
2312	kfree(hbus);
2313	return 0;
2314}
2315
2316static const struct hv_vmbus_device_id hv_pci_id_table[] = {
2317	/* PCI Pass-through Class ID */
2318	/* 44C4F61D-4444-4400-9D52-802E27EDE19F */
2319	{ HV_PCIE_GUID, },
2320	{ },
2321};
2322
2323MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
2324
2325static struct hv_driver hv_pci_drv = {
2326	.name		= "hv_pci",
2327	.id_table	= hv_pci_id_table,
2328	.probe		= hv_pci_probe,
2329	.remove		= hv_pci_remove,
2330};
2331
2332static void __exit exit_hv_pci_drv(void)
2333{
2334	vmbus_driver_unregister(&hv_pci_drv);
2335}
2336
2337static int __init init_hv_pci_drv(void)
2338{
2339	return vmbus_driver_register(&hv_pci_drv);
2340}
2341
2342module_init(init_hv_pci_drv);
2343module_exit(exit_hv_pci_drv);
2344
2345MODULE_DESCRIPTION("Hyper-V PCI");
2346MODULE_LICENSE("GPL v2");