    1/*
    2 * Kernel-based Virtual Machine driver for Linux
    3 *
    4 * This module enables machines with Intel VT-x extensions to run virtual
    5 * machines without emulation or binary translation.
    6 *
    7 * Copyright (C) 2006 Qumranet, Inc.
    8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
    9 *
   10 * Authors:
   11 *   Avi Kivity   <avi@qumranet.com>
   12 *   Yaniv Kamay  <yaniv@qumranet.com>
   13 *
   14 * This work is licensed under the terms of the GNU GPL, version 2.  See
   15 * the COPYING file in the top-level directory.
   16 *
   17 */
   18
   19#include "irq.h"
   20#include "mmu.h"
   21#include "cpuid.h"
   22#include "lapic.h"
   23
   24#include <linux/kvm_host.h>
   25#include <linux/module.h>
   26#include <linux/kernel.h>
   27#include <linux/mm.h>
   28#include <linux/highmem.h>
   29#include <linux/sched.h>
   30#include <linux/moduleparam.h>
   31#include <linux/mod_devicetable.h>
   32#include <linux/trace_events.h>
   33#include <linux/slab.h>
   34#include <linux/tboot.h>
   35#include <linux/hrtimer.h>
   36#include <linux/frame.h>
   37#include <linux/nospec.h>
   38#include "kvm_cache_regs.h"
   39#include "x86.h"
   40
   41#include <asm/cpu.h>
   42#include <asm/io.h>
   43#include <asm/desc.h>
   44#include <asm/vmx.h>
   45#include <asm/virtext.h>
   46#include <asm/mce.h>
   47#include <asm/fpu/internal.h>
   48#include <asm/perf_event.h>
   49#include <asm/debugreg.h>
   50#include <asm/kexec.h>
   51#include <asm/apic.h>
   52#include <asm/irq_remapping.h>
   53#include <asm/mmu_context.h>
   54#include <asm/spec-ctrl.h>
   55#include <asm/mshyperv.h>
   56
   57#include "trace.h"
   58#include "pmu.h"
   59#include "vmx_evmcs.h"
   60
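/*
 * Note on the wrappers below: __ex() wraps a VMX instruction with the
 * fault-on-reboot fixup (see __kvm_handle_fault_on_reboot() in kvm_host.h).
 * If the instruction faults because VMX has already been disabled, e.g.
 * during an emergency reboot, the exception-table fixup recovers instead of
 * oopsing; __ex_clear() additionally zeroes the named register on that path
 * so callers read a sane value.
 */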
   61#define __ex(x) __kvm_handle_fault_on_reboot(x)
   62#define __ex_clear(x, reg) \
   63	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
   64
   65MODULE_AUTHOR("Qumranet");
   66MODULE_LICENSE("GPL");
   67
   68static const struct x86_cpu_id vmx_cpu_id[] = {
   69	X86_FEATURE_MATCH(X86_FEATURE_VMX),
   70	{}
   71};
   72MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
   73
   74static bool __read_mostly enable_vpid = 1;
   75module_param_named(vpid, enable_vpid, bool, 0444);
   76
   77static bool __read_mostly enable_vnmi = 1;
   78module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
   79
   80static bool __read_mostly flexpriority_enabled = 1;
   81module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
   82
   83static bool __read_mostly enable_ept = 1;
   84module_param_named(ept, enable_ept, bool, S_IRUGO);
   85
   86static bool __read_mostly enable_unrestricted_guest = 1;
   87module_param_named(unrestricted_guest,
   88			enable_unrestricted_guest, bool, S_IRUGO);
   89
   90static bool __read_mostly enable_ept_ad_bits = 1;
   91module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
   92
   93static bool __read_mostly emulate_invalid_guest_state = true;
   94module_param(emulate_invalid_guest_state, bool, S_IRUGO);
   95
   96static bool __read_mostly fasteoi = 1;
   97module_param(fasteoi, bool, S_IRUGO);
   98
   99static bool __read_mostly enable_apicv = 1;
  100module_param(enable_apicv, bool, S_IRUGO);
  101
  102static bool __read_mostly enable_shadow_vmcs = 1;
  103module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
  104/*
   105 * If nested=1, nested virtualization is supported, i.e., guests may use
   106 * VMX and act as hypervisors for their own guests. If nested=0, guests
   107 * may not use VMX instructions.
  108 */
  109static bool __read_mostly nested = 0;
  110module_param(nested, bool, S_IRUGO);
  111
  112static u64 __read_mostly host_xss;
  113
  114static bool __read_mostly enable_pml = 1;
  115module_param_named(pml, enable_pml, bool, S_IRUGO);
  116
  117#define MSR_TYPE_R	1
  118#define MSR_TYPE_W	2
  119#define MSR_TYPE_RW	3
  120
  121#define MSR_BITMAP_MODE_X2APIC		1
  122#define MSR_BITMAP_MODE_X2APIC_APICV	2
  123#define MSR_BITMAP_MODE_LM		4
  124
  125#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
  126
  127/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
  128static int __read_mostly cpu_preemption_timer_multi;
  129static bool __read_mostly enable_preemption_timer = 1;
  130#ifdef CONFIG_X86_64
  131module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
  132#endif
  133
  134#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
  135#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
  136#define KVM_VM_CR0_ALWAYS_ON				\
  137	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | 	\
  138	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
  139#define KVM_CR4_GUEST_OWNED_BITS				      \
  140	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
  141	 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
  142
  143#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
  144#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
  145#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
  146
  147#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
  148
  149#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
  150
  151/*
  152 * Hyper-V requires all of these, so mark them as supported even though
  153 * they are just treated the same as all-context.
  154 */
  155#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
  156	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
  157	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
  158	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
  159	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
  160
  161/*
   162 * These two parameters are used to configure the controls for Pause-Loop Exiting:
   163 * ple_gap:    upper bound on the amount of time between two successive
   164 *             executions of PAUSE in a loop. Also indicates whether PLE is
   165 *             enabled. In testing, this time is usually smaller than 128 cycles.
   166 * ple_window: upper bound on the amount of time a guest is allowed to execute
   167 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
   168 *             less than 2^12 cycles.
   169 * Time is measured on a counter that runs at the same rate as the TSC;
   170 * see SDM volume 3B, sections 21.6.13 and 22.1.3.
  171 */
  172static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
  173
  174static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
  175module_param(ple_window, uint, 0444);
  176
  177/* Default doubles per-vcpu window every exit. */
  178static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
  179module_param(ple_window_grow, uint, 0444);
  180
  181/* Default resets per-vcpu window every exit to ple_window. */
  182static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
  183module_param(ple_window_shrink, uint, 0444);
  184
  185/* Default is to compute the maximum so we can never overflow. */
  186static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
  187module_param(ple_window_max, uint, 0444);
  188
  189extern const ulong vmx_return;
  190
  191struct kvm_vmx {
  192	struct kvm kvm;
  193
  194	unsigned int tss_addr;
  195	bool ept_identity_pagetable_done;
  196	gpa_t ept_identity_map_addr;
  197};
  198
  199#define NR_AUTOLOAD_MSRS 8
  200
  201struct vmcs {
  202	u32 revision_id;
  203	u32 abort;
  204	char data[0];
  205};
  206
  207/*
  208 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
  209 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
  210 * loaded on this CPU (so we can clear them if the CPU goes down).
  211 */
  212struct loaded_vmcs {
  213	struct vmcs *vmcs;
  214	struct vmcs *shadow_vmcs;
  215	int cpu;
  216	bool launched;
  217	bool nmi_known_unmasked;
  218	unsigned long vmcs_host_cr3;	/* May not match real cr3 */
  219	unsigned long vmcs_host_cr4;	/* May not match real cr4 */
  220	/* Support for vnmi-less CPUs */
  221	int soft_vnmi_blocked;
  222	ktime_t entry_time;
  223	s64 vnmi_blocked_time;
  224	unsigned long *msr_bitmap;
  225	struct list_head loaded_vmcss_on_cpu_link;
  226};
  227
  228struct shared_msr_entry {
  229	unsigned index;
  230	u64 data;
  231	u64 mask;
  232};
  233
  234/*
  235 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
  236 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
  237 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
  238 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
  239 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
  240 * More than one of these structures may exist, if L1 runs multiple L2 guests.
  241 * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
  242 * underlying hardware which will be used to run L2.
  243 * This structure is packed to ensure that its layout is identical across
  244 * machines (necessary for live migration).
  245 * If there are changes in this struct, VMCS12_REVISION must be changed.
  246 */
  247typedef u64 natural_width;
  248struct __packed vmcs12 {
  249	/* According to the Intel spec, a VMCS region must start with the
  250	 * following two fields. Then follow implementation-specific data.
  251	 */
  252	u32 revision_id;
  253	u32 abort;
  254
  255	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
  256	u32 padding[7]; /* room for future expansion */
  257
  258	u64 io_bitmap_a;
  259	u64 io_bitmap_b;
  260	u64 msr_bitmap;
  261	u64 vm_exit_msr_store_addr;
  262	u64 vm_exit_msr_load_addr;
  263	u64 vm_entry_msr_load_addr;
  264	u64 tsc_offset;
  265	u64 virtual_apic_page_addr;
  266	u64 apic_access_addr;
  267	u64 posted_intr_desc_addr;
  268	u64 vm_function_control;
  269	u64 ept_pointer;
  270	u64 eoi_exit_bitmap0;
  271	u64 eoi_exit_bitmap1;
  272	u64 eoi_exit_bitmap2;
  273	u64 eoi_exit_bitmap3;
  274	u64 eptp_list_address;
  275	u64 xss_exit_bitmap;
  276	u64 guest_physical_address;
  277	u64 vmcs_link_pointer;
  278	u64 pml_address;
  279	u64 guest_ia32_debugctl;
  280	u64 guest_ia32_pat;
  281	u64 guest_ia32_efer;
  282	u64 guest_ia32_perf_global_ctrl;
  283	u64 guest_pdptr0;
  284	u64 guest_pdptr1;
  285	u64 guest_pdptr2;
  286	u64 guest_pdptr3;
  287	u64 guest_bndcfgs;
  288	u64 host_ia32_pat;
  289	u64 host_ia32_efer;
  290	u64 host_ia32_perf_global_ctrl;
  291	u64 padding64[8]; /* room for future expansion */
  292	/*
  293	 * To allow migration of L1 (complete with its L2 guests) between
  294	 * machines of different natural widths (32 or 64 bit), we cannot have
   295	 * unsigned long fields with no explicit size. We use u64 (aliased
  296	 * natural_width) instead. Luckily, x86 is little-endian.
  297	 */
  298	natural_width cr0_guest_host_mask;
  299	natural_width cr4_guest_host_mask;
  300	natural_width cr0_read_shadow;
  301	natural_width cr4_read_shadow;
  302	natural_width cr3_target_value0;
  303	natural_width cr3_target_value1;
  304	natural_width cr3_target_value2;
  305	natural_width cr3_target_value3;
  306	natural_width exit_qualification;
  307	natural_width guest_linear_address;
  308	natural_width guest_cr0;
  309	natural_width guest_cr3;
  310	natural_width guest_cr4;
  311	natural_width guest_es_base;
  312	natural_width guest_cs_base;
  313	natural_width guest_ss_base;
  314	natural_width guest_ds_base;
  315	natural_width guest_fs_base;
  316	natural_width guest_gs_base;
  317	natural_width guest_ldtr_base;
  318	natural_width guest_tr_base;
  319	natural_width guest_gdtr_base;
  320	natural_width guest_idtr_base;
  321	natural_width guest_dr7;
  322	natural_width guest_rsp;
  323	natural_width guest_rip;
  324	natural_width guest_rflags;
  325	natural_width guest_pending_dbg_exceptions;
  326	natural_width guest_sysenter_esp;
  327	natural_width guest_sysenter_eip;
  328	natural_width host_cr0;
  329	natural_width host_cr3;
  330	natural_width host_cr4;
  331	natural_width host_fs_base;
  332	natural_width host_gs_base;
  333	natural_width host_tr_base;
  334	natural_width host_gdtr_base;
  335	natural_width host_idtr_base;
  336	natural_width host_ia32_sysenter_esp;
  337	natural_width host_ia32_sysenter_eip;
  338	natural_width host_rsp;
  339	natural_width host_rip;
  340	natural_width paddingl[8]; /* room for future expansion */
  341	u32 pin_based_vm_exec_control;
  342	u32 cpu_based_vm_exec_control;
  343	u32 exception_bitmap;
  344	u32 page_fault_error_code_mask;
  345	u32 page_fault_error_code_match;
  346	u32 cr3_target_count;
  347	u32 vm_exit_controls;
  348	u32 vm_exit_msr_store_count;
  349	u32 vm_exit_msr_load_count;
  350	u32 vm_entry_controls;
  351	u32 vm_entry_msr_load_count;
  352	u32 vm_entry_intr_info_field;
  353	u32 vm_entry_exception_error_code;
  354	u32 vm_entry_instruction_len;
  355	u32 tpr_threshold;
  356	u32 secondary_vm_exec_control;
  357	u32 vm_instruction_error;
  358	u32 vm_exit_reason;
  359	u32 vm_exit_intr_info;
  360	u32 vm_exit_intr_error_code;
  361	u32 idt_vectoring_info_field;
  362	u32 idt_vectoring_error_code;
  363	u32 vm_exit_instruction_len;
  364	u32 vmx_instruction_info;
  365	u32 guest_es_limit;
  366	u32 guest_cs_limit;
  367	u32 guest_ss_limit;
  368	u32 guest_ds_limit;
  369	u32 guest_fs_limit;
  370	u32 guest_gs_limit;
  371	u32 guest_ldtr_limit;
  372	u32 guest_tr_limit;
  373	u32 guest_gdtr_limit;
  374	u32 guest_idtr_limit;
  375	u32 guest_es_ar_bytes;
  376	u32 guest_cs_ar_bytes;
  377	u32 guest_ss_ar_bytes;
  378	u32 guest_ds_ar_bytes;
  379	u32 guest_fs_ar_bytes;
  380	u32 guest_gs_ar_bytes;
  381	u32 guest_ldtr_ar_bytes;
  382	u32 guest_tr_ar_bytes;
  383	u32 guest_interruptibility_info;
  384	u32 guest_activity_state;
  385	u32 guest_sysenter_cs;
  386	u32 host_ia32_sysenter_cs;
  387	u32 vmx_preemption_timer_value;
  388	u32 padding32[7]; /* room for future expansion */
  389	u16 virtual_processor_id;
  390	u16 posted_intr_nv;
  391	u16 guest_es_selector;
  392	u16 guest_cs_selector;
  393	u16 guest_ss_selector;
  394	u16 guest_ds_selector;
  395	u16 guest_fs_selector;
  396	u16 guest_gs_selector;
  397	u16 guest_ldtr_selector;
  398	u16 guest_tr_selector;
  399	u16 guest_intr_status;
  400	u16 guest_pml_index;
  401	u16 host_es_selector;
  402	u16 host_cs_selector;
  403	u16 host_ss_selector;
  404	u16 host_ds_selector;
  405	u16 host_fs_selector;
  406	u16 host_gs_selector;
  407	u16 host_tr_selector;
  408};
  409
  410/*
  411 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
  412 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
  413 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
  414 */
  415#define VMCS12_REVISION 0x11e57ed0
  416
  417/*
  418 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
   419 * and any VMCS region. Although only sizeof(struct vmcs12) is used by the
   420 * current implementation, 4K is reserved to avoid future complications.
  421 */
  422#define VMCS12_SIZE 0x1000
  423
  424/*
  425 * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
  426 * supported VMCS12 field encoding.
  427 */
  428#define VMCS12_MAX_FIELD_INDEX 0x17
  429
  430struct nested_vmx_msrs {
  431	/*
  432	 * We only store the "true" versions of the VMX capability MSRs. We
  433	 * generate the "non-true" versions by setting the must-be-1 bits
  434	 * according to the SDM.
  435	 */
  436	u32 procbased_ctls_low;
  437	u32 procbased_ctls_high;
  438	u32 secondary_ctls_low;
  439	u32 secondary_ctls_high;
  440	u32 pinbased_ctls_low;
  441	u32 pinbased_ctls_high;
  442	u32 exit_ctls_low;
  443	u32 exit_ctls_high;
  444	u32 entry_ctls_low;
  445	u32 entry_ctls_high;
  446	u32 misc_low;
  447	u32 misc_high;
  448	u32 ept_caps;
  449	u32 vpid_caps;
  450	u64 basic;
  451	u64 cr0_fixed0;
  452	u64 cr0_fixed1;
  453	u64 cr4_fixed0;
  454	u64 cr4_fixed1;
  455	u64 vmcs_enum;
  456	u64 vmfunc_controls;
  457};
  458
  459/*
  460 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  461 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
  462 */
  463struct nested_vmx {
  464	/* Has the level1 guest done vmxon? */
  465	bool vmxon;
  466	gpa_t vmxon_ptr;
  467	bool pml_full;
  468
  469	/* The guest-physical address of the current VMCS L1 keeps for L2 */
  470	gpa_t current_vmptr;
  471	/*
  472	 * Cache of the guest's VMCS, existing outside of guest memory.
  473	 * Loaded from guest memory during VMPTRLD. Flushed to guest
  474	 * memory during VMCLEAR and VMPTRLD.
  475	 */
  476	struct vmcs12 *cached_vmcs12;
  477	/*
  478	 * Indicates if the shadow vmcs must be updated with the
   479	 * data held by vmcs12.
  480	 */
  481	bool sync_shadow_vmcs;
  482	bool dirty_vmcs12;
  483
  484	bool change_vmcs01_virtual_x2apic_mode;
  485	/* L2 must run next, and mustn't decide to exit to L1. */
  486	bool nested_run_pending;
  487
  488	struct loaded_vmcs vmcs02;
  489
  490	/*
  491	 * Guest pages referred to in the vmcs02 with host-physical
  492	 * pointers, so we must keep them pinned while L2 runs.
  493	 */
  494	struct page *apic_access_page;
  495	struct page *virtual_apic_page;
  496	struct page *pi_desc_page;
  497	struct pi_desc *pi_desc;
  498	bool pi_pending;
  499	u16 posted_intr_nv;
  500
  501	struct hrtimer preemption_timer;
  502	bool preemption_timer_expired;
  503
  504	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
  505	u64 vmcs01_debugctl;
  506
  507	u16 vpid02;
  508	u16 last_vpid;
  509
  510	struct nested_vmx_msrs msrs;
  511
  512	/* SMM related state */
  513	struct {
  514		/* in VMX operation on SMM entry? */
  515		bool vmxon;
  516		/* in guest mode on SMM entry? */
  517		bool guest_mode;
  518	} smm;
  519};
  520
  521#define POSTED_INTR_ON  0
  522#define POSTED_INTR_SN  1
  523
  524/* Posted-Interrupt Descriptor */
  525struct pi_desc {
  526	u32 pir[8];     /* Posted interrupt requested */
  527	union {
  528		struct {
  529				/* bit 256 - Outstanding Notification */
  530			u16	on	: 1,
  531				/* bit 257 - Suppress Notification */
  532				sn	: 1,
  533				/* bit 271:258 - Reserved */
  534				rsvd_1	: 14;
  535				/* bit 279:272 - Notification Vector */
  536			u8	nv;
  537				/* bit 287:280 - Reserved */
  538			u8	rsvd_2;
  539				/* bit 319:288 - Notification Destination */
  540			u32	ndst;
  541		};
  542		u64 control;
  543	};
  544	u32 rsvd[6];
  545} __aligned(64);
  546
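/*
 * Helpers for the descriptor above: ON and SN live at bits 0 and 1 of the
 * 'control' word (bits 256 and 257 of the descriptor), and the PIR occupies
 * the first 256 bits. Atomic bitops are used throughout because the CPU may
 * update the descriptor concurrently while posting interrupts.
 */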
  547static bool pi_test_and_set_on(struct pi_desc *pi_desc)
  548{
  549	return test_and_set_bit(POSTED_INTR_ON,
  550			(unsigned long *)&pi_desc->control);
  551}
  552
  553static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
  554{
  555	return test_and_clear_bit(POSTED_INTR_ON,
  556			(unsigned long *)&pi_desc->control);
  557}
  558
  559static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
  560{
  561	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
  562}
  563
  564static inline void pi_clear_sn(struct pi_desc *pi_desc)
  565{
  566	return clear_bit(POSTED_INTR_SN,
  567			(unsigned long *)&pi_desc->control);
  568}
  569
  570static inline void pi_set_sn(struct pi_desc *pi_desc)
  571{
  572	return set_bit(POSTED_INTR_SN,
  573			(unsigned long *)&pi_desc->control);
  574}
  575
  576static inline void pi_clear_on(struct pi_desc *pi_desc)
  577{
  578	clear_bit(POSTED_INTR_ON,
  579  		  (unsigned long *)&pi_desc->control);
  580}
  581
  582static inline int pi_test_on(struct pi_desc *pi_desc)
  583{
  584	return test_bit(POSTED_INTR_ON,
  585			(unsigned long *)&pi_desc->control);
  586}
  587
  588static inline int pi_test_sn(struct pi_desc *pi_desc)
  589{
  590	return test_bit(POSTED_INTR_SN,
  591			(unsigned long *)&pi_desc->control);
  592}
  593
  594struct vcpu_vmx {
  595	struct kvm_vcpu       vcpu;
  596	unsigned long         host_rsp;
  597	u8                    fail;
  598	u8		      msr_bitmap_mode;
  599	u32                   exit_intr_info;
  600	u32                   idt_vectoring_info;
  601	ulong                 rflags;
  602	struct shared_msr_entry *guest_msrs;
  603	int                   nmsrs;
  604	int                   save_nmsrs;
  605	unsigned long	      host_idt_base;
  606#ifdef CONFIG_X86_64
  607	u64 		      msr_host_kernel_gs_base;
  608	u64 		      msr_guest_kernel_gs_base;
  609#endif
  610
  611	u64 		      arch_capabilities;
  612	u64 		      spec_ctrl;
  613
  614	u32 vm_entry_controls_shadow;
  615	u32 vm_exit_controls_shadow;
  616	u32 secondary_exec_control;
  617
  618	/*
  619	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
  620	 * non-nested (L1) guest, it always points to vmcs01. For a nested
  621	 * guest (L2), it points to a different VMCS.
  622	 */
  623	struct loaded_vmcs    vmcs01;
  624	struct loaded_vmcs   *loaded_vmcs;
  625	bool                  __launched; /* temporary, used in vmx_vcpu_run */
  626	struct msr_autoload {
  627		unsigned nr;
  628		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
  629		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
  630	} msr_autoload;
  631	struct {
  632		int           loaded;
  633		u16           fs_sel, gs_sel, ldt_sel;
  634#ifdef CONFIG_X86_64
  635		u16           ds_sel, es_sel;
  636#endif
  637		int           gs_ldt_reload_needed;
  638		int           fs_reload_needed;
  639		u64           msr_host_bndcfgs;
  640	} host_state;
  641	struct {
  642		int vm86_active;
  643		ulong save_rflags;
  644		struct kvm_segment segs[8];
  645	} rmode;
  646	struct {
  647		u32 bitmask; /* 4 bits per segment (1 bit per field) */
  648		struct kvm_save_segment {
  649			u16 selector;
  650			unsigned long base;
  651			u32 limit;
  652			u32 ar;
  653		} seg[8];
  654	} segment_cache;
  655	int vpid;
  656	bool emulation_required;
  657
  658	u32 exit_reason;
  659
  660	/* Posted interrupt descriptor */
  661	struct pi_desc pi_desc;
  662
  663	/* Support for a guest hypervisor (nested VMX) */
  664	struct nested_vmx nested;
  665
  666	/* Dynamic PLE window. */
  667	int ple_window;
  668	bool ple_window_dirty;
  669
  670	/* Support for PML */
  671#define PML_ENTITY_NUM		512
  672	struct page *pml_pg;
  673
  674	/* apic deadline value in host tsc */
  675	u64 hv_deadline_tsc;
  676
  677	u64 current_tsc_ratio;
  678
  679	u32 host_pkru;
  680
  681	unsigned long host_debugctlmsr;
  682
  683	/*
  684	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
  685	 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
  686	 * in msr_ia32_feature_control_valid_bits.
  687	 */
  688	u64 msr_ia32_feature_control;
  689	u64 msr_ia32_feature_control_valid_bits;
  690};
  691
  692enum segment_cache_field {
  693	SEG_FIELD_SEL = 0,
  694	SEG_FIELD_BASE = 1,
  695	SEG_FIELD_LIMIT = 2,
  696	SEG_FIELD_AR = 3,
  697
  698	SEG_FIELD_NR = 4
  699};
  700
  701static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
  702{
  703	return container_of(kvm, struct kvm_vmx, kvm);
  704}
  705
  706static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
  707{
  708	return container_of(vcpu, struct vcpu_vmx, vcpu);
  709}
  710
  711static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
  712{
  713	return &(to_vmx(vcpu)->pi_desc);
  714}
  715
  716#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
  717#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
  718#define FIELD(number, name)	[ROL16(number, 6)] = VMCS12_OFFSET(name)
  719#define FIELD64(number, name)						\
  720	FIELD(number, name),						\
  721	[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
  722
  723
  724static u16 shadow_read_only_fields[] = {
  725#define SHADOW_FIELD_RO(x) x,
  726#include "vmx_shadow_fields.h"
  727};
  728static int max_shadow_read_only_fields =
  729	ARRAY_SIZE(shadow_read_only_fields);
  730
  731static u16 shadow_read_write_fields[] = {
  732#define SHADOW_FIELD_RW(x) x,
  733#include "vmx_shadow_fields.h"
  734};
  735static int max_shadow_read_write_fields =
  736	ARRAY_SIZE(shadow_read_write_fields);
  737
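/*
 * The table below is indexed by ROL16(encoding, 6): rotating the 16-bit VMCS
 * field encoding left by six bits moves its high bits (width/type) into the
 * low six bits and the field index (plus access-type bit) into the upper
 * bits. For example, GUEST_CR0 (encoding 0x6800) lands at
 * ((0x6800 << 6) | (0x6800 >> 10)) & 0xffff == 0x1a, so
 * FIELD(GUEST_CR0, guest_cr0) stores offsetof(struct vmcs12, guest_cr0) at
 * index 0x1a. FIELD64() additionally maps the _HIGH encoding of a 64-bit
 * field to the upper 32 bits of the same vmcs12 member.
 */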
  738static const unsigned short vmcs_field_to_offset_table[] = {
  739	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
  740	FIELD(POSTED_INTR_NV, posted_intr_nv),
  741	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
  742	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
  743	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
  744	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
  745	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
  746	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
  747	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
  748	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
  749	FIELD(GUEST_INTR_STATUS, guest_intr_status),
  750	FIELD(GUEST_PML_INDEX, guest_pml_index),
  751	FIELD(HOST_ES_SELECTOR, host_es_selector),
  752	FIELD(HOST_CS_SELECTOR, host_cs_selector),
  753	FIELD(HOST_SS_SELECTOR, host_ss_selector),
  754	FIELD(HOST_DS_SELECTOR, host_ds_selector),
  755	FIELD(HOST_FS_SELECTOR, host_fs_selector),
  756	FIELD(HOST_GS_SELECTOR, host_gs_selector),
  757	FIELD(HOST_TR_SELECTOR, host_tr_selector),
  758	FIELD64(IO_BITMAP_A, io_bitmap_a),
  759	FIELD64(IO_BITMAP_B, io_bitmap_b),
  760	FIELD64(MSR_BITMAP, msr_bitmap),
  761	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
  762	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
  763	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
  764	FIELD64(TSC_OFFSET, tsc_offset),
  765	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
  766	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
  767	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
  768	FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
  769	FIELD64(EPT_POINTER, ept_pointer),
  770	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
  771	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
  772	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
  773	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
  774	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
  775	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
  776	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
  777	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
  778	FIELD64(PML_ADDRESS, pml_address),
  779	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
  780	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
  781	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
  782	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
  783	FIELD64(GUEST_PDPTR0, guest_pdptr0),
  784	FIELD64(GUEST_PDPTR1, guest_pdptr1),
  785	FIELD64(GUEST_PDPTR2, guest_pdptr2),
  786	FIELD64(GUEST_PDPTR3, guest_pdptr3),
  787	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
  788	FIELD64(HOST_IA32_PAT, host_ia32_pat),
  789	FIELD64(HOST_IA32_EFER, host_ia32_efer),
  790	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
  791	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
  792	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
  793	FIELD(EXCEPTION_BITMAP, exception_bitmap),
  794	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
  795	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
  796	FIELD(CR3_TARGET_COUNT, cr3_target_count),
  797	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
  798	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
  799	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
  800	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
  801	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
  802	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
  803	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
  804	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
  805	FIELD(TPR_THRESHOLD, tpr_threshold),
  806	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
  807	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
  808	FIELD(VM_EXIT_REASON, vm_exit_reason),
  809	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
  810	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
  811	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
  812	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
  813	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
  814	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
  815	FIELD(GUEST_ES_LIMIT, guest_es_limit),
  816	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
  817	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
  818	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
  819	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
  820	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
  821	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
  822	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
  823	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
  824	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
  825	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
  826	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
  827	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
  828	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
  829	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
  830	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
  831	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
  832	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
  833	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
  834	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
  835	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
  836	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
  837	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
  838	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
  839	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
  840	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
  841	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
  842	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
  843	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
  844	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
  845	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
  846	FIELD(EXIT_QUALIFICATION, exit_qualification),
  847	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
  848	FIELD(GUEST_CR0, guest_cr0),
  849	FIELD(GUEST_CR3, guest_cr3),
  850	FIELD(GUEST_CR4, guest_cr4),
  851	FIELD(GUEST_ES_BASE, guest_es_base),
  852	FIELD(GUEST_CS_BASE, guest_cs_base),
  853	FIELD(GUEST_SS_BASE, guest_ss_base),
  854	FIELD(GUEST_DS_BASE, guest_ds_base),
  855	FIELD(GUEST_FS_BASE, guest_fs_base),
  856	FIELD(GUEST_GS_BASE, guest_gs_base),
  857	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
  858	FIELD(GUEST_TR_BASE, guest_tr_base),
  859	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
  860	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
  861	FIELD(GUEST_DR7, guest_dr7),
  862	FIELD(GUEST_RSP, guest_rsp),
  863	FIELD(GUEST_RIP, guest_rip),
  864	FIELD(GUEST_RFLAGS, guest_rflags),
  865	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
  866	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
  867	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
  868	FIELD(HOST_CR0, host_cr0),
  869	FIELD(HOST_CR3, host_cr3),
  870	FIELD(HOST_CR4, host_cr4),
  871	FIELD(HOST_FS_BASE, host_fs_base),
  872	FIELD(HOST_GS_BASE, host_gs_base),
  873	FIELD(HOST_TR_BASE, host_tr_base),
  874	FIELD(HOST_GDTR_BASE, host_gdtr_base),
  875	FIELD(HOST_IDTR_BASE, host_idtr_base),
  876	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
  877	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
  878	FIELD(HOST_RSP, host_rsp),
  879	FIELD(HOST_RIP, host_rip),
  880};
  881
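/*
 * Translate a VMCS field encoding into its byte offset within struct vmcs12,
 * or -ENOENT if the encoding is not recognized. A table entry of 0 can only
 * be an unused hole: offset 0 would correspond to revision_id, which has no
 * FIELD() entry.
 */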
  882static inline short vmcs_field_to_offset(unsigned long field)
  883{
  884	const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
  885	unsigned short offset;
  886	unsigned index;
  887
  888	if (field >> 15)
  889		return -ENOENT;
  890
  891	index = ROL16(field, 6);
  892	if (index >= size)
  893		return -ENOENT;
  894
  895	index = array_index_nospec(index, size);
  896	offset = vmcs_field_to_offset_table[index];
  897	if (offset == 0)
  898		return -ENOENT;
  899	return offset;
  900}
  901
  902static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
  903{
  904	return to_vmx(vcpu)->nested.cached_vmcs12;
  905}
  906
  907static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
  908static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
  909static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
  910static bool vmx_xsaves_supported(void);
  911static void vmx_set_segment(struct kvm_vcpu *vcpu,
  912			    struct kvm_segment *var, int seg);
  913static void vmx_get_segment(struct kvm_vcpu *vcpu,
  914			    struct kvm_segment *var, int seg);
  915static bool guest_state_valid(struct kvm_vcpu *vcpu);
  916static u32 vmx_segment_access_rights(struct kvm_segment *var);
  917static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
  918static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
  919static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
  920static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
  921					    u16 error_code);
  922static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
  923static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
  924							  u32 msr, int type);
  925
  926static DEFINE_PER_CPU(struct vmcs *, vmxarea);
  927static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
  928/*
   929 * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is needed
  930 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
  931 */
  932static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
  933
  934/*
   935 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
   936 * can find which vCPU should be woken up.
  937 */
  938static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
  939static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
  940
  941enum {
  942	VMX_VMREAD_BITMAP,
  943	VMX_VMWRITE_BITMAP,
  944	VMX_BITMAP_NR
  945};
  946
  947static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
  948
  949#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
  950#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
  951
  952static bool cpu_has_load_ia32_efer;
  953static bool cpu_has_load_perf_global_ctrl;
  954
  955static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
  956static DEFINE_SPINLOCK(vmx_vpid_lock);
  957
  958static struct vmcs_config {
  959	int size;
  960	int order;
  961	u32 basic_cap;
  962	u32 revision_id;
  963	u32 pin_based_exec_ctrl;
  964	u32 cpu_based_exec_ctrl;
  965	u32 cpu_based_2nd_exec_ctrl;
  966	u32 vmexit_ctrl;
  967	u32 vmentry_ctrl;
  968	struct nested_vmx_msrs nested;
  969} vmcs_config;
  970
  971static struct vmx_capability {
  972	u32 ept;
  973	u32 vpid;
  974} vmx_capability;
  975
  976#define VMX_SEGMENT_FIELD(seg)					\
  977	[VCPU_SREG_##seg] = {                                   \
  978		.selector = GUEST_##seg##_SELECTOR,		\
  979		.base = GUEST_##seg##_BASE,		   	\
  980		.limit = GUEST_##seg##_LIMIT,		   	\
  981		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
  982	}
  983
  984static const struct kvm_vmx_segment_field {
  985	unsigned selector;
  986	unsigned base;
  987	unsigned limit;
  988	unsigned ar_bytes;
  989} kvm_vmx_segment_fields[] = {
  990	VMX_SEGMENT_FIELD(CS),
  991	VMX_SEGMENT_FIELD(DS),
  992	VMX_SEGMENT_FIELD(ES),
  993	VMX_SEGMENT_FIELD(FS),
  994	VMX_SEGMENT_FIELD(GS),
  995	VMX_SEGMENT_FIELD(SS),
  996	VMX_SEGMENT_FIELD(TR),
  997	VMX_SEGMENT_FIELD(LDTR),
  998};
  999
 1000static u64 host_efer;
 1001
 1002static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 1003
 1004/*
 1005 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 1006 * away by decrementing the array size.
 1007 */
 1008static const u32 vmx_msr_index[] = {
 1009#ifdef CONFIG_X86_64
 1010	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 1011#endif
 1012	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 1013};
 1014
 1015DEFINE_STATIC_KEY_FALSE(enable_evmcs);
 1016
 1017#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
 1018
 1019#define KVM_EVMCS_VERSION 1
 1020
 1021#if IS_ENABLED(CONFIG_HYPERV)
 1022static bool __read_mostly enlightened_vmcs = true;
 1023module_param(enlightened_vmcs, bool, 0444);
 1024
 1025static inline void evmcs_write64(unsigned long field, u64 value)
 1026{
 1027	u16 clean_field;
 1028	int offset = get_evmcs_offset(field, &clean_field);
 1029
 1030	if (offset < 0)
 1031		return;
 1032
 1033	*(u64 *)((char *)current_evmcs + offset) = value;
 1034
 1035	current_evmcs->hv_clean_fields &= ~clean_field;
 1036}
 1037
 1038static inline void evmcs_write32(unsigned long field, u32 value)
 1039{
 1040	u16 clean_field;
 1041	int offset = get_evmcs_offset(field, &clean_field);
 1042
 1043	if (offset < 0)
 1044		return;
 1045
 1046	*(u32 *)((char *)current_evmcs + offset) = value;
 1047	current_evmcs->hv_clean_fields &= ~clean_field;
 1048}
 1049
 1050static inline void evmcs_write16(unsigned long field, u16 value)
 1051{
 1052	u16 clean_field;
 1053	int offset = get_evmcs_offset(field, &clean_field);
 1054
 1055	if (offset < 0)
 1056		return;
 1057
 1058	*(u16 *)((char *)current_evmcs + offset) = value;
 1059	current_evmcs->hv_clean_fields &= ~clean_field;
 1060}
 1061
 1062static inline u64 evmcs_read64(unsigned long field)
 1063{
 1064	int offset = get_evmcs_offset(field, NULL);
 1065
 1066	if (offset < 0)
 1067		return 0;
 1068
 1069	return *(u64 *)((char *)current_evmcs + offset);
 1070}
 1071
 1072static inline u32 evmcs_read32(unsigned long field)
 1073{
 1074	int offset = get_evmcs_offset(field, NULL);
 1075
 1076	if (offset < 0)
 1077		return 0;
 1078
 1079	return *(u32 *)((char *)current_evmcs + offset);
 1080}
 1081
 1082static inline u16 evmcs_read16(unsigned long field)
 1083{
 1084	int offset = get_evmcs_offset(field, NULL);
 1085
 1086	if (offset < 0)
 1087		return 0;
 1088
 1089	return *(u16 *)((char *)current_evmcs + offset);
 1090}
 1091
 1092static void evmcs_load(u64 phys_addr)
 1093{
 1094	struct hv_vp_assist_page *vp_ap =
 1095		hv_get_vp_assist_page(smp_processor_id());
 1096
 1097	vp_ap->current_nested_vmcs = phys_addr;
 1098	vp_ap->enlighten_vmentry = 1;
 1099}
 1100
 1101static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
 1102{
 1103	/*
 1104	 * Enlightened VMCSv1 doesn't support these:
 1105	 *
 1106	 *	POSTED_INTR_NV                  = 0x00000002,
 1107	 *	GUEST_INTR_STATUS               = 0x00000810,
 1108	 *	APIC_ACCESS_ADDR		= 0x00002014,
 1109	 *	POSTED_INTR_DESC_ADDR           = 0x00002016,
 1110	 *	EOI_EXIT_BITMAP0                = 0x0000201c,
 1111	 *	EOI_EXIT_BITMAP1                = 0x0000201e,
 1112	 *	EOI_EXIT_BITMAP2                = 0x00002020,
 1113	 *	EOI_EXIT_BITMAP3                = 0x00002022,
 1114	 */
 1115	vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
 1116	vmcs_conf->cpu_based_2nd_exec_ctrl &=
 1117		~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 1118	vmcs_conf->cpu_based_2nd_exec_ctrl &=
 1119		~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 1120	vmcs_conf->cpu_based_2nd_exec_ctrl &=
 1121		~SECONDARY_EXEC_APIC_REGISTER_VIRT;
 1122
 1123	/*
 1124	 *	GUEST_PML_INDEX			= 0x00000812,
 1125	 *	PML_ADDRESS			= 0x0000200e,
 1126	 */
 1127	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
 1128
 1129	/*	VM_FUNCTION_CONTROL             = 0x00002018, */
 1130	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
 1131
 1132	/*
 1133	 *	EPTP_LIST_ADDRESS               = 0x00002024,
 1134	 *	VMREAD_BITMAP                   = 0x00002026,
 1135	 *	VMWRITE_BITMAP                  = 0x00002028,
 1136	 */
 1137	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
 1138
 1139	/*
 1140	 *	TSC_MULTIPLIER                  = 0x00002032,
 1141	 */
 1142	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
 1143
 1144	/*
 1145	 *	PLE_GAP                         = 0x00004020,
 1146	 *	PLE_WINDOW                      = 0x00004022,
 1147	 */
 1148	vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 1149
 1150	/*
 1151	 *	VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
 1152	 */
 1153	vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 1154
 1155	/*
 1156	 *      GUEST_IA32_PERF_GLOBAL_CTRL     = 0x00002808,
 1157	 *      HOST_IA32_PERF_GLOBAL_CTRL      = 0x00002c04,
 1158	 */
 1159	vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
 1160	vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
 1161
 1162	/*
 1163	 * Currently unsupported in KVM:
 1164	 *	GUEST_IA32_RTIT_CTL		= 0x00002814,
 1165	 */
 1166}
 1167#else /* !IS_ENABLED(CONFIG_HYPERV) */
 1168static inline void evmcs_write64(unsigned long field, u64 value) {}
 1169static inline void evmcs_write32(unsigned long field, u32 value) {}
 1170static inline void evmcs_write16(unsigned long field, u16 value) {}
 1171static inline u64 evmcs_read64(unsigned long field) { return 0; }
 1172static inline u32 evmcs_read32(unsigned long field) { return 0; }
 1173static inline u16 evmcs_read16(unsigned long field) { return 0; }
 1174static inline void evmcs_load(u64 phys_addr) {}
 1175static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
 1176#endif /* IS_ENABLED(CONFIG_HYPERV) */
 1177
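/*
 * Exit interruption info decoding: each helper below masks intr_info down to
 * its validity, type and vector bits and compares against the expected
 * combination. For a guest #PF, for instance, the masked value is
 * INTR_INFO_VALID_MASK | INTR_TYPE_HARD_EXCEPTION | PF_VECTOR.
 */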
 1178static inline bool is_exception_n(u32 intr_info, u8 vector)
 1179{
 1180	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 1181			     INTR_INFO_VALID_MASK)) ==
 1182		(INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
 1183}
 1184
 1185static inline bool is_debug(u32 intr_info)
 1186{
 1187	return is_exception_n(intr_info, DB_VECTOR);
 1188}
 1189
 1190static inline bool is_breakpoint(u32 intr_info)
 1191{
 1192	return is_exception_n(intr_info, BP_VECTOR);
 1193}
 1194
 1195static inline bool is_page_fault(u32 intr_info)
 1196{
 1197	return is_exception_n(intr_info, PF_VECTOR);
 1198}
 1199
 1200static inline bool is_no_device(u32 intr_info)
 1201{
 1202	return is_exception_n(intr_info, NM_VECTOR);
 1203}
 1204
 1205static inline bool is_invalid_opcode(u32 intr_info)
 1206{
 1207	return is_exception_n(intr_info, UD_VECTOR);
 1208}
 1209
 1210static inline bool is_gp_fault(u32 intr_info)
 1211{
 1212	return is_exception_n(intr_info, GP_VECTOR);
 1213}
 1214
 1215static inline bool is_external_interrupt(u32 intr_info)
 1216{
 1217	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 1218		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 1219}
 1220
 1221static inline bool is_machine_check(u32 intr_info)
 1222{
 1223	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 1224			     INTR_INFO_VALID_MASK)) ==
 1225		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
 1226}
 1227
 1228/* Undocumented: icebp/int1 */
 1229static inline bool is_icebp(u32 intr_info)
 1230{
 1231	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 1232		== (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
 1233}
 1234
 1235static inline bool cpu_has_vmx_msr_bitmap(void)
 1236{
 1237	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
 1238}
 1239
 1240static inline bool cpu_has_vmx_tpr_shadow(void)
 1241{
 1242	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 1243}
 1244
 1245static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
 1246{
 1247	return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
 1248}
 1249
 1250static inline bool cpu_has_secondary_exec_ctrls(void)
 1251{
 1252	return vmcs_config.cpu_based_exec_ctrl &
 1253		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 1254}
 1255
 1256static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 1257{
 1258	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1259		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 1260}
 1261
 1262static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
 1263{
 1264	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1265		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 1266}
 1267
 1268static inline bool cpu_has_vmx_apic_register_virt(void)
 1269{
 1270	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1271		SECONDARY_EXEC_APIC_REGISTER_VIRT;
 1272}
 1273
 1274static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 1275{
 1276	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1277		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 1278}
 1279
 1280/*
  1281 * Comment format: document - errata name - stepping - processor name.
  1282 * Taken from
 1283 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 1284 */
 1285static u32 vmx_preemption_cpu_tfms[] = {
 1286/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
 12870x000206E6,
 1288/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
 1289/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
 1290/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
 12910x00020652,
 1292/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
 12930x00020655,
 1294/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
 1295/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
 1296/*
 1297 * 320767.pdf - AAP86  - B1 -
 1298 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 1299 */
 13000x000106E5,
 1301/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
 13020x000106A0,
 1303/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
 13040x000106A1,
 1305/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
 13060x000106A4,
 1307 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
 1308 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
 1309 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
 13100x000106A5,
 1311};
 1312
 1313static inline bool cpu_has_broken_vmx_preemption_timer(void)
 1314{
 1315	u32 eax = cpuid_eax(0x00000001), i;
 1316
 1317	/* Clear the reserved bits */
 1318	eax &= ~(0x3U << 14 | 0xfU << 28);
 1319	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
 1320		if (eax == vmx_preemption_cpu_tfms[i])
 1321			return true;
 1322
 1323	return false;
 1324}
 1325
 1326static inline bool cpu_has_vmx_preemption_timer(void)
 1327{
 1328	return vmcs_config.pin_based_exec_ctrl &
 1329		PIN_BASED_VMX_PREEMPTION_TIMER;
 1330}
 1331
 1332static inline bool cpu_has_vmx_posted_intr(void)
 1333{
 1334	return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
 1335		vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
 1336}
 1337
 1338static inline bool cpu_has_vmx_apicv(void)
 1339{
 1340	return cpu_has_vmx_apic_register_virt() &&
 1341		cpu_has_vmx_virtual_intr_delivery() &&
 1342		cpu_has_vmx_posted_intr();
 1343}
 1344
 1345static inline bool cpu_has_vmx_flexpriority(void)
 1346{
 1347	return cpu_has_vmx_tpr_shadow() &&
 1348		cpu_has_vmx_virtualize_apic_accesses();
 1349}
 1350
 1351static inline bool cpu_has_vmx_ept_execute_only(void)
 1352{
 1353	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
 1354}
 1355
 1356static inline bool cpu_has_vmx_ept_2m_page(void)
 1357{
 1358	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
 1359}
 1360
 1361static inline bool cpu_has_vmx_ept_1g_page(void)
 1362{
 1363	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
 1364}
 1365
 1366static inline bool cpu_has_vmx_ept_4levels(void)
 1367{
 1368	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
 1369}
 1370
 1371static inline bool cpu_has_vmx_ept_mt_wb(void)
 1372{
 1373	return vmx_capability.ept & VMX_EPTP_WB_BIT;
 1374}
 1375
 1376static inline bool cpu_has_vmx_ept_5levels(void)
 1377{
 1378	return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
 1379}
 1380
 1381static inline bool cpu_has_vmx_ept_ad_bits(void)
 1382{
 1383	return vmx_capability.ept & VMX_EPT_AD_BIT;
 1384}
 1385
 1386static inline bool cpu_has_vmx_invept_context(void)
 1387{
 1388	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
 1389}
 1390
 1391static inline bool cpu_has_vmx_invept_global(void)
 1392{
 1393	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
 1394}
 1395
 1396static inline bool cpu_has_vmx_invvpid_single(void)
 1397{
 1398	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
 1399}
 1400
 1401static inline bool cpu_has_vmx_invvpid_global(void)
 1402{
 1403	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
 1404}
 1405
 1406static inline bool cpu_has_vmx_invvpid(void)
 1407{
 1408	return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
 1409}
 1410
 1411static inline bool cpu_has_vmx_ept(void)
 1412{
 1413	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1414		SECONDARY_EXEC_ENABLE_EPT;
 1415}
 1416
 1417static inline bool cpu_has_vmx_unrestricted_guest(void)
 1418{
 1419	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1420		SECONDARY_EXEC_UNRESTRICTED_GUEST;
 1421}
 1422
 1423static inline bool cpu_has_vmx_ple(void)
 1424{
 1425	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1426		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 1427}
 1428
 1429static inline bool cpu_has_vmx_basic_inout(void)
 1430{
 1431	return	(((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
 1432}
 1433
 1434static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 1435{
 1436	return flexpriority_enabled && lapic_in_kernel(vcpu);
 1437}
 1438
 1439static inline bool cpu_has_vmx_vpid(void)
 1440{
 1441	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1442		SECONDARY_EXEC_ENABLE_VPID;
 1443}
 1444
 1445static inline bool cpu_has_vmx_rdtscp(void)
 1446{
 1447	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1448		SECONDARY_EXEC_RDTSCP;
 1449}
 1450
 1451static inline bool cpu_has_vmx_invpcid(void)
 1452{
 1453	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1454		SECONDARY_EXEC_ENABLE_INVPCID;
 1455}
 1456
 1457static inline bool cpu_has_virtual_nmis(void)
 1458{
 1459	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 1460}
 1461
 1462static inline bool cpu_has_vmx_wbinvd_exit(void)
 1463{
 1464	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1465		SECONDARY_EXEC_WBINVD_EXITING;
 1466}
 1467
 1468static inline bool cpu_has_vmx_shadow_vmcs(void)
 1469{
 1470	u64 vmx_msr;
 1471	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
 1472	/* check if the cpu supports writing r/o exit information fields */
 1473	if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
 1474		return false;
 1475
 1476	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1477		SECONDARY_EXEC_SHADOW_VMCS;
 1478}
 1479
 1480static inline bool cpu_has_vmx_pml(void)
 1481{
 1482	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
 1483}
 1484
 1485static inline bool cpu_has_vmx_tsc_scaling(void)
 1486{
 1487	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1488		SECONDARY_EXEC_TSC_SCALING;
 1489}
 1490
 1491static inline bool cpu_has_vmx_vmfunc(void)
 1492{
 1493	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1494		SECONDARY_EXEC_ENABLE_VMFUNC;
 1495}
 1496
 1497static bool vmx_umip_emulated(void)
 1498{
 1499	return vmcs_config.cpu_based_2nd_exec_ctrl &
 1500		SECONDARY_EXEC_DESC;
 1501}
 1502
 1503static inline bool report_flexpriority(void)
 1504{
 1505	return flexpriority_enabled;
 1506}
 1507
 1508static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
 1509{
 1510	return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
 1511}
 1512
 1513static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
 1514{
 1515	return vmcs12->cpu_based_vm_exec_control & bit;
 1516}
 1517
 1518static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
 1519{
 1520	return (vmcs12->cpu_based_vm_exec_control &
 1521			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
 1522		(vmcs12->secondary_vm_exec_control & bit);
 1523}
 1524
 1525static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
 1526{
 1527	return vmcs12->pin_based_vm_exec_control &
 1528		PIN_BASED_VMX_PREEMPTION_TIMER;
 1529}
 1530
 1531static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
 1532{
 1533	return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
 1534}
 1535
 1536static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 1537{
 1538	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 1539}
 1540
 1541static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
 1542{
 1543	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
 1544}
 1545
 1546static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
 1547{
 1548	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
 1549}
 1550
 1551static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
 1552{
 1553	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
 1554}
 1555
 1556static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
 1557{
 1558	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 1559}
 1560
 1561static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
 1562{
 1563	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
 1564}
 1565
 1566static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
 1567{
 1568	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
 1569}
 1570
 1571static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
 1572{
 1573	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 1574}
 1575
 1576static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
 1577{
 1578	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
 1579}
 1580
 1581static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
 1582{
 1583	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
 1584}
 1585
 1586static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
 1587{
 1588	return nested_cpu_has_vmfunc(vmcs12) &&
 1589		(vmcs12->vm_function_control &
 1590		 VMX_VMFUNC_EPTP_SWITCHING);
 1591}
 1592
 1593static inline bool is_nmi(u32 intr_info)
 1594{
 1595	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 1596		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
 1597}
 1598
 1599static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 1600			      u32 exit_intr_info,
 1601			      unsigned long exit_qualification);
 1602static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
 1603			struct vmcs12 *vmcs12,
 1604			u32 reason, unsigned long qualification);
 1605
 1606static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 1607{
 1608	int i;
 1609
 1610	for (i = 0; i < vmx->nmsrs; ++i)
 1611		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
 1612			return i;
 1613	return -1;
 1614}
 1615
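/*
 * INVVPID and INVEPT report failure through CF/ZF rather than a fault; the
 * "ja 1f ; ud2" sequence in the wrappers below turns any such failure into
 * an invalid opcode trap instead of silently continuing.
 */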
 1616static inline void __invvpid(int ext, u16 vpid, gva_t gva)
 1617{
 1618    struct {
 1619	u64 vpid : 16;
 1620	u64 rsvd : 48;
 1621	u64 gva;
 1622    } operand = { vpid, 0, gva };
 1623
 1624    asm volatile (__ex(ASM_VMX_INVVPID)
 1625		  /* CF==1 or ZF==1 --> rc = -1 */
 1626		  "; ja 1f ; ud2 ; 1:"
 1627		  : : "a"(&operand), "c"(ext) : "cc", "memory");
 1628}
 1629
 1630static inline void __invept(int ext, u64 eptp, gpa_t gpa)
 1631{
 1632	struct {
 1633		u64 eptp, gpa;
 1634	} operand = {eptp, gpa};
 1635
 1636	asm volatile (__ex(ASM_VMX_INVEPT)
 1637			/* CF==1 or ZF==1 --> rc = -1 */
 1638			"; ja 1f ; ud2 ; 1:\n"
 1639			: : "a" (&operand), "c" (ext) : "cc", "memory");
 1640}
 1641
 1642static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 1643{
 1644	int i;
 1645
 1646	i = __find_msr_index(vmx, msr);
 1647	if (i >= 0)
 1648		return &vmx->guest_msrs[i];
 1649	return NULL;
 1650}
 1651
 1652static void vmcs_clear(struct vmcs *vmcs)
 1653{
 1654	u64 phys_addr = __pa(vmcs);
 1655	u8 error;
 1656
 1657	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
 1658		      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 1659		      : "cc", "memory");
 1660	if (error)
 1661		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
 1662		       vmcs, phys_addr);
 1663}
 1664
 1665static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
 1666{
 1667	vmcs_clear(loaded_vmcs->vmcs);
 1668	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
 1669		vmcs_clear(loaded_vmcs->shadow_vmcs);
 1670	loaded_vmcs->cpu = -1;
 1671	loaded_vmcs->launched = 0;
 1672}
 1673
 1674static void vmcs_load(struct vmcs *vmcs)
 1675{
 1676	u64 phys_addr = __pa(vmcs);
 1677	u8 error;
 1678
 1679	if (static_branch_unlikely(&enable_evmcs))
 1680		return evmcs_load(phys_addr);
 1681
 1682	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
 1683			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 1684			: "cc", "memory");
 1685	if (error)
 1686		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
 1687		       vmcs, phys_addr);
 1688}
 1689
 1690#ifdef CONFIG_KEXEC_CORE
 1691/*
  1692 * This bitmap indicates whether the crash-time vmclear
  1693 * operation is enabled on a given cpu. All cpus are
  1694 * disabled by default.
 1695 */
 1696static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
 1697
 1698static inline void crash_enable_local_vmclear(int cpu)
 1699{
 1700	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
 1701}
 1702
 1703static inline void crash_disable_local_vmclear(int cpu)
 1704{
 1705	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
 1706}
 1707
 1708static inline int crash_local_vmclear_enabled(int cpu)
 1709{
 1710	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
 1711}
 1712
 1713static void crash_vmclear_local_loaded_vmcss(void)
 1714{
 1715	int cpu = raw_smp_processor_id();
 1716	struct loaded_vmcs *v;
 1717
 1718	if (!crash_local_vmclear_enabled(cpu))
 1719		return;
 1720
 1721	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
 1722			    loaded_vmcss_on_cpu_link)
 1723		vmcs_clear(v->vmcs);
 1724}
 1725#else
 1726static inline void crash_enable_local_vmclear(int cpu) { }
 1727static inline void crash_disable_local_vmclear(int cpu) { }
 1728#endif /* CONFIG_KEXEC_CORE */
 1729
 1730static void __loaded_vmcs_clear(void *arg)
 1731{
 1732	struct loaded_vmcs *loaded_vmcs = arg;
 1733	int cpu = raw_smp_processor_id();
 1734
 1735	if (loaded_vmcs->cpu != cpu)
 1736		return; /* vcpu migration can race with cpu offline */
 1737	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 1738		per_cpu(current_vmcs, cpu) = NULL;
 1739	crash_disable_local_vmclear(cpu);
 1740	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
 1741
 1742	/*
 1743	 * Ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
 1744	 * happens before setting loaded_vmcs->cpu to -1, which is done in
 1745	 * loaded_vmcs_init.  Otherwise, another CPU could see cpu == -1
 1746	 * first and add the vmcs to its percpu list before it is deleted.
 1747	 */
 1748	smp_wmb();
 1749
 1750	loaded_vmcs_init(loaded_vmcs);
 1751	crash_enable_local_vmclear(cpu);
 1752}
 1753
 1754static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 1755{
 1756	int cpu = loaded_vmcs->cpu;
 1757
 1758	if (cpu != -1)
 1759		smp_call_function_single(cpu,
 1760			 __loaded_vmcs_clear, loaded_vmcs, 1);
 1761}
 1762
 1763static inline void vpid_sync_vcpu_single(int vpid)
 1764{
 1765	if (vpid == 0)
 1766		return;
 1767
 1768	if (cpu_has_vmx_invvpid_single())
 1769		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
 1770}
 1771
 1772static inline void vpid_sync_vcpu_global(void)
 1773{
 1774	if (cpu_has_vmx_invvpid_global())
 1775		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
 1776}
 1777
 1778static inline void vpid_sync_context(int vpid)
 1779{
 1780	if (cpu_has_vmx_invvpid_single())
 1781		vpid_sync_vcpu_single(vpid);
 1782	else
 1783		vpid_sync_vcpu_global();
 1784}
 1785
 1786static inline void ept_sync_global(void)
 1787{
 1788	__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
 1789}
 1790
 1791static inline void ept_sync_context(u64 eptp)
 1792{
 1793	if (cpu_has_vmx_invept_context())
 1794		__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
 1795	else
 1796		ept_sync_global();
 1797}
 1798
 1799static __always_inline void vmcs_check16(unsigned long field)
 1800{
 1801	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
 1802			 "16-bit accessor invalid for 64-bit field");
 1803	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
 1804			 "16-bit accessor invalid for 64-bit high field");
 1805	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
 1806			 "16-bit accessor invalid for 32-bit field");
 1807	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
 1808			 "16-bit accessor invalid for natural width field");
 1809}
 1810
 1811static __always_inline void vmcs_check32(unsigned long field)
 1812{
 1813	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
 1814			 "32-bit accessor invalid for 16-bit field");
 1815	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
 1816			 "32-bit accessor invalid for natural width field");
 1817}
 1818
 1819static __always_inline void vmcs_check64(unsigned long field)
 1820{
 1821	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
 1822			 "64-bit accessor invalid for 16-bit field");
 1823	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
 1824			 "64-bit accessor invalid for 64-bit high field");
 1825	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
 1826			 "64-bit accessor invalid for 32-bit field");
 1827	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
 1828			 "64-bit accessor invalid for natural width field");
 1829}
 1830
 1831static __always_inline void vmcs_checkl(unsigned long field)
 1832{
 1833	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
 1834			 "Natural width accessor invalid for 16-bit field");
 1835	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
 1836			 "Natural width accessor invalid for 64-bit field");
 1837	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
 1838			 "Natural width accessor invalid for 64-bit high field");
 1839	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
 1840			 "Natural width accessor invalid for 32-bit field");
 1841}
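
/*
 * Illustrative note (not part of the original source): the vmcs_check*
 * helpers above decode the VMCS field encoding, where bits 14:13 give the
 * field width (0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width) and
 * bit 0 selects the high half of a 64-bit field.  A few examples:
 *
 *	GUEST_ES_SELECTOR = 0x0800: (0x0800 & 0x6000) == 0x0000 -> 16-bit
 *	TSC_OFFSET        = 0x2010: (0x2010 & 0x6001) == 0x2000 -> 64-bit
 *	TSC_OFFSET_HIGH   = 0x2011: (0x2011 & 0x6001) == 0x2001 -> 64-bit high
 *	EXCEPTION_BITMAP  = 0x4004: (0x4004 & 0x6000) == 0x4000 -> 32-bit
 *	GUEST_RIP         = 0x681e: (0x681e & 0x6000) == 0x6000 -> natural width
 *
 * Since the field argument is normally a compile-time constant, a mismatched
 * accessor such as vmcs_read16(GUEST_RIP) fails the build via BUILD_BUG_ON_MSG.
 */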
 1842
 1843static __always_inline unsigned long __vmcs_readl(unsigned long field)
 1844{
 1845	unsigned long value;
 1846
 1847	asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
 1848		      : "=a"(value) : "d"(field) : "cc");
 1849	return value;
 1850}
 1851
 1852static __always_inline u16 vmcs_read16(unsigned long field)
 1853{
 1854	vmcs_check16(field);
 1855	if (static_branch_unlikely(&enable_evmcs))
 1856		return evmcs_read16(field);
 1857	return __vmcs_readl(field);
 1858}
 1859
 1860static __always_inline u32 vmcs_read32(unsigned long field)
 1861{
 1862	vmcs_check32(field);
 1863	if (static_branch_unlikely(&enable_evmcs))
 1864		return evmcs_read32(field);
 1865	return __vmcs_readl(field);
 1866}
 1867
 1868static __always_inline u64 vmcs_read64(unsigned long field)
 1869{
 1870	vmcs_check64(field);
 1871	if (static_branch_unlikely(&enable_evmcs))
 1872		return evmcs_read64(field);
 1873#ifdef CONFIG_X86_64
 1874	return __vmcs_readl(field);
 1875#else
 1876	return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
 1877#endif
 1878}
 1879
 1880static __always_inline unsigned long vmcs_readl(unsigned long field)
 1881{
 1882	vmcs_checkl(field);
 1883	if (static_branch_unlikely(&enable_evmcs))
 1884		return evmcs_read64(field);
 1885	return __vmcs_readl(field);
 1886}
 1887
 1888static noinline void vmwrite_error(unsigned long field, unsigned long value)
 1889{
 1890	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
 1891	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
 1892	dump_stack();
 1893}
 1894
 1895static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
 1896{
 1897	u8 error;
 1898
 1899	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
 1900		       : "=q"(error) : "a"(value), "d"(field) : "cc");
 1901	if (unlikely(error))
 1902		vmwrite_error(field, value);
 1903}
 1904
 1905static __always_inline void vmcs_write16(unsigned long field, u16 value)
 1906{
 1907	vmcs_check16(field);
 1908	if (static_branch_unlikely(&enable_evmcs))
 1909		return evmcs_write16(field, value);
 1910
 1911	__vmcs_writel(field, value);
 1912}
 1913
 1914static __always_inline void vmcs_write32(unsigned long field, u32 value)
 1915{
 1916	vmcs_check32(field);
 1917	if (static_branch_unlikely(&enable_evmcs))
 1918		return evmcs_write32(field, value);
 1919
 1920	__vmcs_writel(field, value);
 1921}
 1922
 1923static __always_inline void vmcs_write64(unsigned long field, u64 value)
 1924{
 1925	vmcs_check64(field);
 1926	if (static_branch_unlikely(&enable_evmcs))
 1927		return evmcs_write64(field, value);
 1928
 1929	__vmcs_writel(field, value);
 1930#ifndef CONFIG_X86_64
 1931	asm volatile ("");
 1932	__vmcs_writel(field+1, value >> 32);
 1933#endif
 1934}
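
/*
 * Illustrative note (not part of the original source): on 32-bit hosts a
 * 64-bit VMCS field is accessed as two 32-bit halves, the upper half through
 * its "high" companion encoding (field + 1).  For example,
 * vmcs_write64(TSC_OFFSET, 0x1122334455667788ULL) writes 0x55667788 to field
 * 0x2010 and 0x11223344 to field 0x2011; vmcs_read64() above reassembles the
 * value from the same two reads.
 */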
 1935
 1936static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
 1937{
 1938	vmcs_checkl(field);
 1939	if (static_branch_unlikely(&enable_evmcs))
 1940		return evmcs_write64(field, value);
 1941
 1942	__vmcs_writel(field, value);
 1943}
 1944
 1945static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
 1946{
 1947	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
 1948			 "vmcs_clear_bits does not support 64-bit fields");
 1949	if (static_branch_unlikely(&enable_evmcs))
 1950		return evmcs_write32(field, evmcs_read32(field) & ~mask);
 1951
 1952	__vmcs_writel(field, __vmcs_readl(field) & ~mask);
 1953}
 1954
 1955static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
 1956{
 1957	BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
 1958			 "vmcs_set_bits does not support 64-bit fields");
 1959	if (static_branch_unlikely(&enable_evmcs))
 1960		return evmcs_write32(field, evmcs_read32(field) | mask);
 1961
 1962	__vmcs_writel(field, __vmcs_readl(field) | mask);
 1963}
 1964
 1965static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
 1966{
 1967	vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
 1968}
 1969
 1970static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
 1971{
 1972	vmcs_write32(VM_ENTRY_CONTROLS, val);
 1973	vmx->vm_entry_controls_shadow = val;
 1974}
 1975
 1976static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
 1977{
 1978	if (vmx->vm_entry_controls_shadow != val)
 1979		vm_entry_controls_init(vmx, val);
 1980}
 1981
 1982static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
 1983{
 1984	return vmx->vm_entry_controls_shadow;
 1985}
 1986
 1987
 1988static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
 1989{
 1990	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
 1991}
 1992
 1993static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
 1994{
 1995	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
 1996}
 1997
 1998static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
 1999{
 2000	vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
 2001}
 2002
 2003static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
 2004{
 2005	vmcs_write32(VM_EXIT_CONTROLS, val);
 2006	vmx->vm_exit_controls_shadow = val;
 2007}
 2008
 2009static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
 2010{
 2011	if (vmx->vm_exit_controls_shadow != val)
 2012		vm_exit_controls_init(vmx, val);
 2013}
 2014
 2015static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
 2016{
 2017	return vmx->vm_exit_controls_shadow;
 2018}
 2019
 2020
 2021static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
 2022{
 2023	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
 2024}
 2025
 2026static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
 2027{
 2028	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
 2029}
 2030
 2031static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
 2032{
 2033	vmx->segment_cache.bitmask = 0;
 2034}
 2035
 2036static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
 2037				       unsigned field)
 2038{
 2039	bool ret;
 2040	u32 mask = 1 << (seg * SEG_FIELD_NR + field);
 2041
 2042	if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
 2043		vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
 2044		vmx->segment_cache.bitmask = 0;
 2045	}
 2046	ret = vmx->segment_cache.bitmask & mask;
 2047	vmx->segment_cache.bitmask |= mask;
 2048	return ret;
 2049}
 2050
 2051static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
 2052{
 2053	u16 *p = &vmx->segment_cache.seg[seg].selector;
 2054
 2055	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
 2056		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
 2057	return *p;
 2058}
 2059
 2060static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
 2061{
 2062	ulong *p = &vmx->segment_cache.seg[seg].base;
 2063
 2064	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
 2065		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
 2066	return *p;
 2067}
 2068
 2069static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
 2070{
 2071	u32 *p = &vmx->segment_cache.seg[seg].limit;
 2072
 2073	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
 2074		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
 2075	return *p;
 2076}
 2077
 2078static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
 2079{
 2080	u32 *p = &vmx->segment_cache.seg[seg].ar;
 2081
 2082	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
 2083		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
 2084	return *p;
 2085}
 2086
 2087static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 2088{
 2089	u32 eb;
 2090
 2091	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 2092	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
 2093	/*
 2094	 * Guest access to VMware backdoor ports could legitimately
 2095	 * trigger #GP because of TSS I/O permission bitmap.
 2096	 * We intercept those #GP and allow access to them anyway
 2097	 * as VMware does.
 2098	 */
 2099	if (enable_vmware_backdoor)
 2100		eb |= (1u << GP_VECTOR);
 2101	if ((vcpu->guest_debug &
 2102	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
 2103	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
 2104		eb |= 1u << BP_VECTOR;
 2105	if (to_vmx(vcpu)->rmode.vm86_active)
 2106		eb = ~0;
 2107	if (enable_ept)
 2108		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 2109
 2110	/* When we are running a nested L2 guest and L1 specified for it a
 2111	 * certain exception bitmap, we must trap the same exceptions and pass
 2112	 * them to L1. When running L2, we will only handle the exceptions
 2113	 * specified above if L1 did not want them.
 2114	 */
 2115	if (is_guest_mode(vcpu))
 2116		eb |= get_vmcs12(vcpu)->exception_bitmap;
 2117
 2118	vmcs_write32(EXCEPTION_BITMAP, eb);
 2119}
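
/*
 * Illustrative note (not part of the original source): the exception bitmap
 * holds one bit per exception vector.  PF_VECTOR is 14, so bit 14 controls
 * whether a guest #PF causes a VM exit; with EPT enabled that bit is cleared
 * above because guest page faults no longer need to be intercepted, while in
 * vm86 mode every vector is intercepted (eb = ~0) so that real-mode
 * exceptions can be emulated.
 */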
 2120
 2121/*
 2122 * Check if a write to the MSR is intercepted by the currently loaded MSR bitmap.
 2123 */
 2124static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 2125{
 2126	unsigned long *msr_bitmap;
 2127	int f = sizeof(unsigned long);
 2128
 2129	if (!cpu_has_vmx_msr_bitmap())
 2130		return true;
 2131
 2132	msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
 2133
 2134	if (msr <= 0x1fff) {
 2135		return !!test_bit(msr, msr_bitmap + 0x800 / f);
 2136	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 2137		msr &= 0x1fff;
 2138		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
 2139	}
 2140
 2141	return true;
 2142}
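
/*
 * Illustrative note (not part of the original source): the VMX MSR bitmap is
 * a 4 KiB page made of four 1 KiB regions - read-low (offset 0x000),
 * read-high (0x400), write-low (0x800) and write-high (0xc00) - with one bit
 * per MSR for the ranges 0x0..0x1fff and 0xc0000000..0xc0001fff.  So a write
 * to MSR_IA32_SPEC_CTRL (0x48) is intercepted iff bit 0x48 of the region at
 * offset 0x800 is set, and a write to MSR_LSTAR (0xc0000082) maps to bit
 * 0x82 of the region at offset 0xc00, exactly as computed above.
 */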
 2143
 2144/*
 2145 * Check if a write to the MSR is intercepted by the L01 (vmcs01) MSR bitmap.
 2146 */
 2147static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
 2148{
 2149	unsigned long *msr_bitmap;
 2150	int f = sizeof(unsigned long);
 2151
 2152	if (!cpu_has_vmx_msr_bitmap())
 2153		return true;
 2154
 2155	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
 2156
 2157	if (msr <= 0x1fff) {
 2158		return !!test_bit(msr, msr_bitmap + 0x800 / f);
 2159	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 2160		msr &= 0x1fff;
 2161		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
 2162	}
 2163
 2164	return true;
 2165}
 2166
 2167static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 2168		unsigned long entry, unsigned long exit)
 2169{
 2170	vm_entry_controls_clearbit(vmx, entry);
 2171	vm_exit_controls_clearbit(vmx, exit);
 2172}
 2173
 2174static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 2175{
 2176	unsigned i;
 2177	struct msr_autoload *m = &vmx->msr_autoload;
 2178
 2179	switch (msr) {
 2180	case MSR_EFER:
 2181		if (cpu_has_load_ia32_efer) {
 2182			clear_atomic_switch_msr_special(vmx,
 2183					VM_ENTRY_LOAD_IA32_EFER,
 2184					VM_EXIT_LOAD_IA32_EFER);
 2185			return;
 2186		}
 2187		break;
 2188	case MSR_CORE_PERF_GLOBAL_CTRL:
 2189		if (cpu_has_load_perf_global_ctrl) {
 2190			clear_atomic_switch_msr_special(vmx,
 2191					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 2192					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 2193			return;
 2194		}
 2195		break;
 2196	}
 2197
 2198	for (i = 0; i < m->nr; ++i)
 2199		if (m->guest[i].index == msr)
 2200			break;
 2201
 2202	if (i == m->nr)
 2203		return;
 2204	--m->nr;
 2205	m->guest[i] = m->guest[m->nr];
 2206	m->host[i] = m->host[m->nr];
 2207	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
 2208	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
 2209}
 2210
 2211static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 2212		unsigned long entry, unsigned long exit,
 2213		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
 2214		u64 guest_val, u64 host_val)
 2215{
 2216	vmcs_write64(guest_val_vmcs, guest_val);
 2217	vmcs_write64(host_val_vmcs, host_val);
 2218	vm_entry_controls_setbit(vmx, entry);
 2219	vm_exit_controls_setbit(vmx, exit);
 2220}
 2221
 2222static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
 2223				  u64 guest_val, u64 host_val)
 2224{
 2225	unsigned i;
 2226	struct msr_autoload *m = &vmx->msr_autoload;
 2227
 2228	switch (msr) {
 2229	case MSR_EFER:
 2230		if (cpu_has_load_ia32_efer) {
 2231			add_atomic_switch_msr_special(vmx,
 2232					VM_ENTRY_LOAD_IA32_EFER,
 2233					VM_EXIT_LOAD_IA32_EFER,
 2234					GUEST_IA32_EFER,
 2235					HOST_IA32_EFER,
 2236					guest_val, host_val);
 2237			return;
 2238		}
 2239		break;
 2240	case MSR_CORE_PERF_GLOBAL_CTRL:
 2241		if (cpu_has_load_perf_global_ctrl) {
 2242			add_atomic_switch_msr_special(vmx,
 2243					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
 2244					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
 2245					GUEST_IA32_PERF_GLOBAL_CTRL,
 2246					HOST_IA32_PERF_GLOBAL_CTRL,
 2247					guest_val, host_val);
 2248			return;
 2249		}
 2250		break;
 2251	case MSR_IA32_PEBS_ENABLE:
 2252		/* PEBS needs a quiescent period after being disabled (to write
 2253		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
 2254		 * provide that period, so a CPU could write the host's record
 2255		 * into the guest's memory.
 2256		 */
 2257		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 2258	}
 2259
 2260	for (i = 0; i < m->nr; ++i)
 2261		if (m->guest[i].index == msr)
 2262			break;
 2263
 2264	if (i == NR_AUTOLOAD_MSRS) {
 2265		printk_once(KERN_WARNING "Not enough msr switch entries. "
 2266				"Can't add msr %x\n", msr);
 2267		return;
 2268	} else if (i == m->nr) {
 2269		++m->nr;
 2270		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
 2271		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
 2272	}
 2273
 2274	m->guest[i].index = msr;
 2275	m->guest[i].value = guest_val;
 2276	m->host[i].index = msr;
 2277	m->host[i].value = host_val;
 2278}
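
/*
 * Illustrative note (not part of the original source): the msr_autoload
 * arrays back the VM-entry/VM-exit MSR-load lists, so any MSR added here is
 * switched by the CPU itself on every transition.  For example,
 * update_transition_efer() below may call
 *	add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
 * after which the processor loads guest_efer on VM entry and host_efer on
 * VM exit with no further involvement from KVM.
 */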
 2279
 2280static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 2281{
 2282	u64 guest_efer = vmx->vcpu.arch.efer;
 2283	u64 ignore_bits = 0;
 2284
 2285	if (!enable_ept) {
 2286		/*
 2287		 * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
 2288		 * host CPUID is more efficient than testing guest CPUID
 2289		 * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
 2290		 */
 2291		if (boot_cpu_has(X86_FEATURE_SMEP))
 2292			guest_efer |= EFER_NX;
 2293		else if (!(guest_efer & EFER_NX))
 2294			ignore_bits |= EFER_NX;
 2295	}
 2296
 2297	/*
 2298	 * LMA and LME are handled by hardware; SCE is meaningless outside long mode.
 2299	 */
 2300	ignore_bits |= EFER_SCE;
 2301#ifdef CONFIG_X86_64
 2302	ignore_bits |= EFER_LMA | EFER_LME;
 2303	/* SCE is meaningful only in long mode on Intel */
 2304	if (guest_efer & EFER_LMA)
 2305		ignore_bits &= ~(u64)EFER_SCE;
 2306#endif
 2307
 2308	clear_atomic_switch_msr(vmx, MSR_EFER);
 2309
 2310	/*
 2311	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
 2312	 * On CPUs that support "load IA32_EFER", always switch EFER
 2313	 * atomically, since it's faster than switching it manually.
 2314	 */
 2315	if (cpu_has_load_ia32_efer ||
 2316	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
 2317		if (!(guest_efer & EFER_LMA))
 2318			guest_efer &= ~EFER_LME;
 2319		if (guest_efer != host_efer)
 2320			add_atomic_switch_msr(vmx, MSR_EFER,
 2321					      guest_efer, host_efer);
 2322		return false;
 2323	} else {
 2324		guest_efer &= ~ignore_bits;
 2325		guest_efer |= host_efer & ignore_bits;
 2326
 2327		vmx->guest_msrs[efer_offset].data = guest_efer;
 2328		vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
 2329
 2330		return true;
 2331	}
 2332}
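
/*
 * Illustrative note (not part of the original source): when the function
 * above returns true, EFER is switched lazily via the shared-MSR machinery
 * instead of atomically.  The mask records which bits matter: with
 * ignore_bits == EFER_SCE, for instance, the ignored SCE bit keeps the
 * host's value and the WRMSR in kvm_set_shared_msr() can be skipped whenever
 * guest and host EFER agree on all the remaining bits.
 */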
 2333
 2334#ifdef CONFIG_X86_32
 2335/*
 2336 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 2337 * VMCS rather than the segment table.  KVM uses this helper to figure
 2338 * out the current bases to poke them into the VMCS before entry.
 2339 */
 2340static unsigned long segment_base(u16 selector)
 2341{
 2342	struct desc_struct *table;
 2343	unsigned long v;
 2344
 2345	if (!(selector & ~SEGMENT_RPL_MASK))
 2346		return 0;
 2347
 2348	table = get_current_gdt_ro();
 2349
 2350	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
 2351		u16 ldt_selector = kvm_read_ldt();
 2352
 2353		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
 2354			return 0;
 2355
 2356		table = (struct desc_struct *)segment_base(ldt_selector);
 2357	}
 2358	v = get_desc_base(&table[selector >> 3]);
 2359	return v;
 2360}
 2361#endif
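
/*
 * Illustrative note (not part of the original source): segment_base() above
 * decodes a selector as index:TI:RPL.  Selector 0x2b, for example, has
 * RPL 3, TI 1 (LDT) and index 5, so the wanted descriptor is entry 5 of the
 * LDT, whose own base is first looked up through the GDT; get_desc_base()
 * then gathers the scattered base bits out of the 8-byte descriptor.
 */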
 2362
 2363static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 2364{
 2365	struct vcpu_vmx *vmx = to_vmx(vcpu);
 2366#ifdef CONFIG_X86_64
 2367	int cpu = raw_smp_processor_id();
 2368#endif
 2369	int i;
 2370
 2371	if (vmx->host_state.loaded)
 2372		return;
 2373
 2374	vmx->host_state.loaded = 1;
 2375	/*
 2376	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
 2377	 * allow segment selectors with cpl > 0 or ti == 1.
 2378	 */
 2379	vmx->host_state.ldt_sel = kvm_read_ldt();
 2380	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
 2381
 2382#ifdef CONFIG_X86_64
 2383	save_fsgs_for_kvm();
 2384	vmx->host_state.fs_sel = current->thread.fsindex;
 2385	vmx->host_state.gs_sel = current->thread.gsindex;
 2386#else
 2387	savesegment(fs, vmx->host_state.fs_sel);
 2388	savesegment(gs, vmx->host_state.gs_sel);
 2389#endif
 2390	if (!(vmx->host_state.fs_sel & 7)) {
 2391		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
 2392		vmx->host_state.fs_reload_needed = 0;
 2393	} else {
 2394		vmcs_write16(HOST_FS_SELECTOR, 0);
 2395		vmx->host_state.fs_reload_needed = 1;
 2396	}
 2397	if (!(vmx->host_state.gs_sel & 7)) {
 2398		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
 2399	} else {
 2400		vmcs_write16(HOST_GS_SELECTOR, 0);
 2401		vmx->host_state.gs_ldt_reload_needed = 1;
 2402	}
 2403
 2404#ifdef CONFIG_X86_64
 2405	savesegment(ds, vmx->host_state.ds_sel);
 2406	savesegment(es, vmx->host_state.es_sel);
 2407
 2408	vmcs_writel(HOST_FS_BASE, current->thread.fsbase);
 2409	vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
 2410
 2411	vmx->msr_host_kernel_gs_base = current->thread.gsbase;
 2412	if (is_long_mode(&vmx->vcpu))
 2413		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 2414#else
 2415	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
 2416	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
 2417#endif
 2418	if (boot_cpu_has(X86_FEATURE_MPX))
 2419		rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
 2420	for (i = 0; i < vmx->save_nmsrs; ++i)
 2421		kvm_set_shared_msr(vmx->guest_msrs[i].index,
 2422				   vmx->guest_msrs[i].data,
 2423				   vmx->guest_msrs[i].mask);
 2424}
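
/*
 * Illustrative note (not part of the original source): the "& 7" tests above
 * check a selector's TI and RPL bits (bits 2:0).  The host FS/GS selectors
 * stored in the VMCS must have TI == 0 and RPL == 0, so a selector such as
 * 0x63 (index 12, TI = 0, RPL = 3) cannot be written directly: KVM writes 0
 * into the VMCS instead and restores the real selector on the host-state
 * reload path below.
 */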
 2425
 2426static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 2427{
 2428	if (!vmx->host_state.loaded)
 2429		return;
 2430
 2431	++vmx->vcpu.stat.host_state_reload;
 2432	vmx->host_state.loaded = 0;
 2433#ifdef CONFIG_X86_64
 2434	if (is_long_mode(&vmx->vcpu))
 2435		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 2436#endif
 2437	if (vmx->host_state.gs_ldt_reload_needed) {
 2438		kvm_load_ldt(vmx->host_state.ldt_sel);
 2439#ifdef CONFIG_X86_64
 2440		load_gs_index(vmx->host_state.gs_sel);
 2441#else
 2442		loadsegment(gs, vmx->host_state.gs_sel);
 2443#endif
 2444	}
 2445	if (vmx->host_state.fs_reload_needed)
 2446		loadsegment(fs, vmx->host_state.fs_sel);
 2447#ifdef CONFIG_X86_64
 2448	if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
 2449		loadsegment(ds, vmx->host_state.ds_sel);
 2450		loadsegment(es, vmx->host_state.es_sel);
 2451	}
 2452#endif
 2453	invalidate_tss_limit();
 2454#ifdef CONFIG_X86_64
 2455	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 2456#endif
 2457	if (vmx->host_state.msr_host_bndcfgs)
 2458		wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
 2459	load_fixmap_gdt(raw_smp_processor_id());
 2460}
 2461
 2462static void vmx_load_host_state(struct vcpu_vmx *vmx)
 2463{
 2464	preempt_disable();
 2465	__vmx_load_host_state(vmx);
 2466	preempt_enable();
 2467}
 2468
 2469static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 2470{
 2471	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 2472	struct pi_desc old, new;
 2473	unsigned int dest;
 2474
 2475	/*
 2476	 * In case of hot-plug or hot-unplug, we may have to undo
 2477	 * vmx_vcpu_pi_put even if there is no assigned device.  And we
 2478	 * always keep PI.NDST up to date for simplicity: it makes the
 2479	 * code easier, and CPU migration is not a fast path.
 2480	 */
 2481	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
 2482		return;
 2483
 2484	/*
 2485	 * First handle the simple case where no cmpxchg is necessary; just
 2486	 * allow posting non-urgent interrupts.
 2487	 *
 2488	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
 2489	 * PI.NDST: pi_post_block will do it for us and the wakeup_handler
 2490	 * expects the VCPU to be on the blocked_vcpu_list that matches
 2491	 * PI.NDST.
 2492	 */
 2493	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
 2494	    vcpu->cpu == cpu) {
 2495		pi_clear_sn(pi_desc);
 2496		return;
 2497	}
 2498
 2499	/* The full case.  */
 2500	do {
 2501		old.control = new.control = pi_desc->control;
 2502
 2503		dest = cpu_physical_id(cpu);
 2504
 2505		if (x2apic_enabled())
 2506			new.ndst = dest;
 2507		else
 2508			new.ndst = (dest << 8) & 0xFF00;
 2509
 2510		new.sn = 0;
 2511	} while (cmpxchg64(&pi_desc->control, old.control,
 2512			   new.control) != old.control);
 2513}
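
/*
 * Illustrative note (not part of the original source): PI.NDST holds the
 * APIC ID that posted-interrupt notifications are sent to.  In x2APIC mode
 * the 32-bit ID is stored as-is, while in xAPIC mode it lives in bits 15:8,
 * so for a physical APIC ID of 5 the cmpxchg loop above stores 5 when
 * x2apic_enabled() and (5 << 8) == 0x500 otherwise.
 */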
 2514
 2515static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
 2516{
 2517	vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
 2518	vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
 2519}
 2520
 2521/*
 2522 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 2523 * vcpu mutex is already taken.
 2524 */
 2525static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 2526{
 2527	struct vcpu_vmx *vmx = to_vmx(vcpu);
 2528	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
 2529
 2530	if (!already_loaded) {
 2531		loaded_vmcs_clear(vmx->loaded_vmcs);
 2532		local_irq_disable();
 2533		crash_disable_local_vmclear(cpu);
 2534
 2535		/*
 2536		 * Read loaded_vmcs->cpu should be before fetching
 2537		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
 2538		 * See the comments in __loaded_vmcs_clear().
 2539		 */
 2540		smp_rmb();
 2541
 2542		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
 2543			 &per_cpu(loaded_vmcss_on_cpu, cpu));
 2544		crash_enable_local_vmclear(cpu);
 2545		local_irq_enable();
 2546	}
 2547
 2548	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
 2549		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
 2550		vmcs_load(vmx->loaded_vmcs->vmcs);
 2551		indirect_branch_prediction_barrier();
 2552	}
 2553
 2554	if (!already_loaded) {
 2555		void *gdt = get_current_gdt_ro();
 2556		unsigned long sysenter_esp;
 2557
 2558		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 2559
 2560		/*
 2561		 * Linux uses per-cpu TSS and GDT, so set these when switching
 2562		 * processors.  See 22.2.4.
 2563		 */
 2564		vmcs_writel(HOST_TR_BASE,
 2565			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
 2566		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
 2567
 2568		/*
 2569		 * VM exits change the host TR limit to 0x67.  This is okay,
 2570		 * since 0x67 covers everything except the IO bitmap, and we
 2571		 * have code to handle the IO bitmap being lost after a VM
 2572		 * exit.
 2573		 */
 2574		BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
 2575
 2576		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 2577		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
 2578
 2579		vmx->loaded_vmcs->cpu = cpu;
 2580	}
 2581
 2582	/* Setup TSC multiplier */
 2583	if (kvm_has_tsc_control &&
 2584	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
 2585		decache_tsc_multiplier(vmx);
 2586
 2587	vmx_vcpu_pi_load(vcpu, cpu);
 2588	vmx->host_pkru = read_pkru();
 2589	vmx->host_debugctlmsr = get_debugctlmsr();
 2590}
 2591
 2592static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 2593{
 2594	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 2595
 2596	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
 2597		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
 2598		!kvm_vcpu_apicv_active(vcpu))
 2599		return;
 2600
 2601	/* Set SN when the vCPU is preempted */
 2602	if (vcpu->preempted)
 2603		pi_set_sn(pi_desc);
 2604}
 2605
 2606static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 2607{
 2608	vmx_vcpu_pi_put(vcpu);
 2609
 2610	__vmx_load_host_state(to_vmx(vcpu));
 2611}
 2612
 2613static bool emulation_required(struct kvm_vcpu *vcpu)
 2614{
 2615	return emulate_invalid_guest_state && !guest_state_valid(vcpu);
 2616}
 2617
 2618static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 2619
 2620/*
 2621 * Return the cr0 value that a nested guest would read. This is a combination
 2622 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
 2623 * its hypervisor (cr0_read_shadow).
 2624 */
 2625static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
 2626{
 2627	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
 2628		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
 2629}
 2630static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
 2631{
 2632	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
 2633		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
 2634}
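
/*
 * Illustrative note (not part of the original source): cr0_guest_host_mask
 * selects, bit by bit, whether L1 owns a CR0 bit (1, read from the shadow)
 * or L2 does (0, read from guest_cr0).  With guest_cr0 = 0b1010,
 * cr0_read_shadow = 0b0101 and cr0_guest_host_mask = 0b1100, for example,
 * nested_read_cr0() returns (0b1010 & 0b0011) | (0b0101 & 0b1100) = 0b0110:
 * the shadowed value is seen only for the bits L1 chose to intercept.
 */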
 2635
 2636static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 2637{
 2638	unsigned long rflags, save_rflags;
 2639
 2640	if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
 2641		__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
 2642		rflags = vmcs_readl(GUEST_RFLAGS);
 2643		if (to_vmx(vcpu)->rmode.vm86_active) {
 2644			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
 2645			save_rflags = to_vmx(vcpu)->rmode.save_rflags;
 2646			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
 2647		}
 2648		to_vmx(vcpu)->rflags = rflags;
 2649	}
 2650	return to_vmx(vcpu)->rflags;
 2651}
 2652
 2653static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 2654{
 2655	unsigned long old_rflags = vmx_get_rflags(vcpu);
 2656
 2657	__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
 2658	to_vmx(vcpu)->rflags = rflags;
 2659	if (to_vmx(vcpu)->rmode.vm86_active) {
 2660		to_vmx(vcpu)->rmode.save_rflags = rflags;
 2661		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 2662	}
 2663	vmcs_writel(GUEST_RFLAGS, rflags);
 2664
 2665	if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
 2666		to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
 2667}
 2668
 2669static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 2670{
 2671	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 2672	int ret = 0;
 2673
 2674	if (interruptibility & GUEST_INTR_STATE_STI)
 2675		ret |= KVM_X86_SHADOW_INT_STI;
 2676	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
 2677		ret |= KVM_X86_SHADOW_INT_MOV_SS;
 2678
 2679	return ret;
 2680}
 2681
 2682static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 2683{
 2684	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
 2685	u32 interruptibility = interruptibility_old;
 2686
 2687	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
 2688
 2689	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
 2690		interruptibility |= GUEST_INTR_STATE_MOV_SS;
 2691	else if (mask & KVM_X86_SHADOW_INT_STI)
 2692		interruptibility |= GUEST_INTR_STATE_STI;
 2693
 2694	if (interruptibility != interruptibility_old)
 2695		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
 2696}
 2697
 2698static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 2699{
 2700	unsigned long rip;
 2701
 2702	rip = kvm_rip_read(vcpu);
 2703	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 2704	kvm_rip_write(vcpu, rip);
 2705
 2706	/* skipping an emulated instruction also counts */
 2707	vmx_set_interrupt_shadow(vcpu, 0);
 2708}
 2709
 2710static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
 2711					       unsigned long exit_qual)
 2712{
 2713	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 2714	unsigned int nr = vcpu->arch.exception.nr;
 2715	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 2716
 2717	if (vcpu->arch.exception.has_error_code) {
 2718		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
 2719		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
 2720	}
 2721
 2722	if (kvm_exception_is_soft(nr))
 2723		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
 2724	else
 2725		intr_info |= INTR_TYPE_HARD_EXCEPTION;
 2726
 2727	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
 2728	    vmx_get_nmi_mask(vcpu))
 2729		intr_info |= INTR_INFO_UNBLOCK_NMI;
 2730
 2731	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
 2732}
 2733
 2734/*
 2735 * KVM wants to inject page faults that it received into the guest. This
 2736 * function checks whether, in a nested guest, they need to be injected to L1 or L2.
 2737 */
 2738static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
 2739{
 2740	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 2741	unsigned int nr = vcpu->arch.exception.nr;
 2742
 2743	if (nr == PF_VECTOR) {
 2744		if (vcpu->arch.exception.nested_apf) {
 2745			*exit_qual = vcpu->arch.apf.nested_apf_token;
 2746			return 1;
 2747		}
 2748		/*
 2749		 * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
 2750		 * The fix is to add the ancillary datum (CR2 or DR6) to structs
 2751		 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
 2752		 * can be written only when inject_pending_event runs.  This should be
 2753		 * conditional on a new capability---if the capability is disabled,
 2754		 * kvm_multiple_exception would write the ancillary information to
 2755		 * CR2 or DR6, for backwards ABI-compatibility.
 2756		 */
 2757		if (nested_vmx_is_page_fault_vmexit(vmcs12,
 2758						    vcpu->arch.exception.error_code)) {
 2759			*exit_qual = vcpu->arch.cr2;
 2760			return 1;
 2761		}
 2762	} else {
 2763		if (vmcs12->exception_bitmap & (1u << nr)) {
 2764			if (nr == DB_VECTOR)
 2765				*exit_qual = vcpu->arch.dr6;
 2766			else
 2767				*exit_qual = 0;
 2768			return 1;
 2769		}
 2770	}
 2771
 2772	return 0;
 2773}
 2774
 2775static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
 2776{
 2777	/*
 2778	 * Ensure that we clear the HLT state in the VMCS.  We don't need to
 2779	 * explicitly skip the instruction because if the HLT state is set,
 2780	 * then the instruction is already executing and RIP has already been
 2781	 * advanced.
 2782	 */
 2783	if (kvm_hlt_in_guest(vcpu->kvm) &&
 2784			vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
 2785		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 2786}
 2787
 2788static void vmx_queue_exception(struct kvm_vcpu *vcpu)
 2789{
 2790	struct vcpu_vmx *vmx = to_vmx(vcpu);
 2791	unsigned nr = vcpu->arch.exception.nr;
 2792	bool has_error_code = vcpu->arch.exception.has_error_code;
 2793	u32 error_code = vcpu->arch.exception.error_code;
 2794	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 2795
 2796	if (has_error_code) {
 2797		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 2798		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
 2799	}
 2800
 2801	if (vmx->rmode.vm86_active) {
 2802		int inc_eip = 0;
 2803		if (kvm_exception_is_soft(nr))
 2804			inc_eip = vcpu->arch.event_exit_inst_len;
 2805		if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
 2806			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 2807		return;
 2808	}
 2809
 2810	WARN_ON_ONCE(vmx->emulation_required);
 2811
 2812	if (kvm_exception_is_soft(nr)) {
 2813		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
 2814			     vmx->vcpu.arch.event_exit_inst_len);
 2815		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
 2816	} else
 2817		intr_info |= INTR_TYPE_HARD_EXCEPTION;
 2818
 2819	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
 2820
 2821	vmx_clear_hlt(vcpu);
 2822}
 2823
 2824static bool vmx_rdtscp_supported(void)
 2825{
 2826	return cpu_has_vmx_rdtscp();
 2827}
 2828
 2829static bool vmx_invpcid_supported(void)
 2830{
 2831	return cpu_has_vmx_invpcid() && enable_ept;
 2832}
 2833
 2834/*
 2835 * Swap MSR entry in host/guest MSR entry array.
 2836 */
 2837static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 2838{
 2839	struct shared_msr_entry tmp;
 2840
 2841	tmp = vmx->guest_msrs[to];
 2842	vmx->guest_msrs[to] = vmx->guest_msrs[from];
 2843	vmx->guest_msrs[from] = tmp;
 2844}
 2845
 2846/*
 2847 * Set up the vmcs to automatically save and restore system
 2848 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
 2849 * mode, as fiddling with msrs is very expensive.
 2850 */
 2851static void setup_msrs(struct vcpu_vmx *vmx)
 2852{
 2853	int save_nmsrs, index;
 2854
 2855	save_nmsrs = 0;
 2856#ifdef CONFIG_X86_64
 2857	if (is_long_mode(&vmx->vcpu)) {
 2858		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
 2859		if (index >= 0)
 2860			move_msr_up(vmx, index, save_nmsrs++);
 2861		index = __find_msr_index(vmx, MSR_LSTAR);
 2862		if (index >= 0)
 2863			move_msr_up(vmx, index, save_nmsrs++);
 2864		index = __find_msr_index(vmx, MSR_CSTAR);
 2865		if (index >= 0)
 2866			move_msr_up(vmx, index, save_nmsrs++);
 2867		index = __find_msr_index(vmx, MSR_TSC_AUX);
 2868		if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
 2869			move_msr_up(vmx, index, save_nmsrs++);
 2870		/*
 2871		 * MSR_STAR is only needed on long mode guests, and only
 2872		 * if efer.sce is enabled.
 2873		 */
 2874		index = __find_msr_index(vmx, MSR_STAR);
 2875		if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
 2876			move_msr_up(vmx, index, save_nmsrs++);
 2877	}
 2878#endif
 2879	index = __find_msr_index(vmx, MSR_EFER);
 2880	if (index >= 0 && update_transition_efer(vmx, index))
 2881		move_msr_up(vmx, index, save_nmsrs++);
 2882
 2883	vmx->save_nmsrs = save_nmsrs;
 2884
 2885	if (cpu_has_vmx_msr_bitmap())
 2886		vmx_update_msr_bitmap(&vmx->vcpu);
 2887}
 2888
 2889static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
 2890{
 2891	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 2892
 2893	if (is_guest_mode(vcpu) &&
 2894	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
 2895		return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
 2896
 2897	return vcpu->arch.tsc_offset;
 2898}
 2899
 2900/*
 2901 * Writes 'offset' into the guest's TSC offset field in the VMCS.
 2902 */
 2903static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 2904{
 2905	if (is_guest_mode(vcpu)) {
 2906		/*
 2907		 * We're here if L1 chose not to trap WRMSR to TSC. According
 2908		 * to the spec, this should set L1's TSC; the offset that L1
 2909		 * set for L2 remains unchanged, and still needs to be added
 2910		 * to the newly set TSC to get L2's TSC.
 2911		 */
 2912		struct vmcs12 *vmcs12;
 2913		/* recalculate vmcs02.TSC_OFFSET: */
 2914		vmcs12 = get_vmcs12(vcpu);
 2915		vmcs_write64(TSC_OFFSET, offset +
 2916			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
 2917			 vmcs12->tsc_offset : 0));
 2918	} else {
 2919		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
 2920					   vmcs_read64(TSC_OFFSET), offset);
 2921		vmcs_write64(TSC_OFFSET, offset);
 2922	}
 2923}
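
/*
 * Illustrative note (not part of the original source): with nested TSC
 * offsetting the offsets compose.  If L0 programs an offset of 1000 for L1
 * and vmcs12->tsc_offset is 500, the code above writes 1000 + 500 = 1500
 * into vmcs02's TSC_OFFSET, so L2 observes host_tsc + 1500; conversely,
 * vmx_read_l1_tsc_offset() subtracts vmcs12->tsc_offset again to recover
 * L1's own offset while L2 is running.
 */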
 2924
 2925/*
 2926 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
 2927 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
 2928 * all guests if the "nested" module option is off, and can also be disabled
 2929 * for a single guest by disabling its VMX cpuid bit.
 2930 */
 2931static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
 2932{
 2933	return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
 2934}
 2935
 2936/*
 2937 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 2938 * returned for the various VMX controls MSRs when nested VMX is enabled.
 2939 * The same values should also be used to verify that vmcs12 control fields are
 2940 * valid during nested entry from L1 to L2.
 2941 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 2942 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 2943 * bit in the high half is on if the corresponding bit in the control field
 2944 * may be on. See also vmx_control_verify().
 2945 */
 2946static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
 2947{
 2948	if (!nested) {
 2949		memset(msrs, 0, sizeof(*msrs));
 2950		return;
 2951	}
 2952
 2953	/*
 2954	 * Note that as a general rule, the high half of the MSRs (bits in
 2955	 * the control fields which may be 1) should be initialized by the
 2956	 * intersection of the underlying hardware's MSR (i.e., features which
 2957	 * can be supported) and the list of features we want to expose -
 2958	 * because they are known to be properly supported in our code.
 2959	 * Also, usually, the low half of the MSRs (bits which must be 1) can
 2960	 * be set to 0, meaning that L1 may turn off any of these bits. The
 2961	 * reason is that if one of these bits is needed by L0, it will also
 2962	 * be set in vmcs01, and prepare_vmcs02, which bitwise-ORs the control
 2963	 * fields of vmcs01 and vmcs12, will keep it set in vmcs02 - and
 2964	 * nested_vmx_exit_reflected() will not pass the related exits to L1.
 2965	 * These rules have exceptions below.
 2966	 */
 2967
 2968	/* pin-based controls */
 2969	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
 2970		msrs->pinbased_ctls_low,
 2971		msrs->pinbased_ctls_high);
 2972	msrs->pinbased_ctls_low |=
 2973		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 2974	msrs->pinbased_ctls_high &=
 2975		PIN_BASED_EXT_INTR_MASK |
 2976		PIN_BASED_NMI_EXITING |
 2977		PIN_BASED_VIRTUAL_NMIS |
 2978		(apicv ? PIN_BASED_POSTED_INTR : 0);
 2979	msrs->pinbased_ctls_high |=
 2980		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 2981		PIN_BASED_VMX_PREEMPTION_TIMER;
 2982
 2983	/* exit controls */
 2984	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
 2985		msrs->exit_ctls_low,
 2986		msrs->exit_ctls_high);
 2987	msrs->exit_ctls_low =
 2988		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 2989
 2990	msrs->exit_ctls_high &=
 2991#ifdef CONFIG_X86_64
 2992		VM_EXIT_HOST_ADDR_SPACE_SIZE |
 2993#endif
 2994		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
 2995	msrs->exit_ctls_high |=
 2996		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 2997		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
 2998		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
 2999
 3000	if (kvm_mpx_supported())
 3001		msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 3002
 3003	/* We support free control of debug control saving. */
 3004	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
 3005
 3006	/* entry controls */
 3007	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
 3008		msrs->entry_ctls_low,
 3009		msrs->entry_ctls_high);
 3010	msrs->entry_ctls_low =
 3011		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 3012	msrs->entry_ctls_high &=
 3013#ifdef CONFIG_X86_64
 3014		VM_ENTRY_IA32E_MODE |
 3015#endif
 3016		VM_ENTRY_LOAD_IA32_PAT;
 3017	msrs->entry_ctls_high |=
 3018		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
 3019	if (kvm_mpx_supported())
 3020		msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 3021
 3022	/* We support free control of debug control loading. */
 3023	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
 3024
 3025	/* cpu-based controls */
 3026	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
 3027		msrs->procbased_ctls_low,
 3028		msrs->procbased_ctls_high);
 3029	msrs->procbased_ctls_low =
 3030		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 3031	msrs->procbased_ctls_high &=
 3032		CPU_BASED_VIRTUAL_INTR_PENDING |
 3033		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
 3034		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
 3035		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
 3036		CPU_BASED_CR3_STORE_EXITING |
 3037#ifdef CONFIG_X86_64
 3038		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
 3039#endif
 3040		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
 3041		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
 3042		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
 3043		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
 3044		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 3045	/*
 3046	 * We can allow some features even when not supported by the
 3047	 * hardware. For example, L1 can specify an MSR bitmap - and we
 3048	 * can use it to avoid exits to L1 - even when L0 runs L2
 3049	 * without MSR bitmaps.
 3050	 */
 3051	msrs->procbased_ctls_high |=
 3052		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 3053		CPU_BASED_USE_MSR_BITMAPS;
 3054
 3055	/* We support free control of CR3 access interception. */
 3056	msrs->procbased_ctls_low &=
 3057		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
 3058
 3059	/*
 3060	 * secondary cpu-based controls.  Do not include those that
 3061	 * depend on CPUID bits, they are added later by vmx_cpuid_update.
 3062	 */
 3063	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
 3064		msrs->secondary_ctls_low,
 3065		msrs->secondary_ctls_high);
 3066	msrs->secondary_ctls_low = 0;
 3067	msrs->secondary_ctls_high &=
 3068		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 3069		SECONDARY_EXEC_DESC |
 3070		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 3071		SECONDARY_EXEC_APIC_REGISTER_VIRT |
 3072		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 3073		SECONDARY_EXEC_WBINVD_EXITING;
 3074
 3075	if (enable_ept) {
 3076		/* nested EPT: emulate EPT also to L1 */
 3077		msrs->secondary_ctls_high |=
 3078			SECONDARY_EXEC_ENABLE_EPT;
 3079		msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
 3080			 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
 3081		if (cpu_has_vmx_ept_execute_only())
 3082			msrs->ept_caps |=
 3083				VMX_EPT_EXECUTE_ONLY_BIT;
 3084		msrs->ept_caps &= vmx_capability.ept;
 3085		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
 3086			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
 3087			VMX_EPT_1GB_PAGE_BIT;
 3088		if (enable_ept_ad_bits) {
 3089			msrs->secondary_ctls_high |=
 3090				SECONDARY_EXEC_ENABLE_PML;
 3091			msrs->ept_caps |= VMX_EPT_AD_BIT;
 3092		}
 3093	}
 3094
 3095	if (cpu_has_vmx_vmfunc()) {
 3096		msrs->secondary_ctls_high |=
 3097			SECONDARY_EXEC_ENABLE_VMFUNC;
 3098		/*
 3099		 * Advertise EPTP switching unconditionally
 3100		 * since we emulate it
 3101		 */
 3102		if (enable_ept)
 3103			msrs->vmfunc_controls =
 3104				VMX_VMFUNC_EPTP_SWITCHING;
 3105	}
 3106
 3107	/*
 3108	 * Old versions of KVM use the single-context version without
 3109	 * checking for support, so declare that it is supported even
 3110 * though it is treated as global context.  The alternative - not
 3111 * declaring support yet still accepting single-context invvpid - is worse.
 3112	 */
 3113	if (enable_vpid) {
 3114		msrs->secondary_ctls_high |=
 3115			SECONDARY_EXEC_ENABLE_VPID;
 3116		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
 3117			VMX_VPID_EXTENT_SUPPORTED_MASK;
 3118	}
 3119
 3120	if (enable_unrestricted_guest)
 3121		msrs->secondary_ctls_high |=
 3122			SECONDARY_EXEC_UNRESTRICTED_GUEST;
 3123
 3124	/* miscellaneous data */
 3125	rdmsr(MSR_IA32_VMX_MISC,
 3126		msrs->misc_low,
 3127		msrs->misc_high);
 3128	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
 3129	msrs->misc_low |=
 3130		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
 3131		VMX_MISC_ACTIVITY_HLT;
 3132	msrs->misc_high = 0;
 3133
 3134	/*
 3135	 * This MSR reports some information about VMX support. We
 3136	 * should return information about the VMX we emulate for the
 3137	 * guest, and the VMCS structure we give it - not about the
 3138	 * VMX support of the underlying hardware.
 3139	 */
 3140	msrs->basic =
 3141		VMCS12_REVISION |
 3142		VMX_BASIC_TRUE_CTLS |
 3143		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
 3144		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
 3145
 3146	if (cpu_has_vmx_basic_inout())
 3147		msrs->basic |= VMX_BASIC_INOUT;
 3148
 3149	/*
 3150	 * These MSRs specify bits which the guest must keep fixed on
 3151	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
 3152	 * We picked the standard core2 setting.
 3153	 */
 3154#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
 3155#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
 3156	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
 3157	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
 3158
 3159	/* These MSRs specify bits which the guest must keep fixed off. */
 3160	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
 3161	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
 3162
 3163	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
 3164	msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
 3165}
 3166
 3167/*
 3168 * if fixed0[i] == 1: val[i] must be 1
 3169 * if fixed1[i] == 0: val[i] must be 0
 3170 */
 3171static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
 3172{
 3173	return ((val & fixed1) | fixed0) == val;
 3174}
 3175
 3176static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
 3177{
 3178	return fixed_bits_valid(control, low, high);
 3179}
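
/*
 * Illustrative note (not part of the original source): for each control MSR
 * the low word is the must-be-1 mask and the high word the may-be-1 mask,
 * and fixed_bits_valid() checks a proposed value against them.  With
 * fixed0 = 0x1 (bit 0 required) and fixed1 = 0x3 (only bits 1:0 allowed),
 * val = 0x3 passes, val = 0x2 fails (a required bit is clear) and val = 0x5
 * fails (a disallowed bit is set), because ((val & fixed1) | fixed0) != val
 * in the last two cases.
 */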
 3180
 3181static inline u64 vmx_control_msr(u32 low, u32 high)
 3182{
 3183	return low | ((u64)high << 32);
 3184}
 3185
 3186static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
 3187{
 3188	superset &= mask;
 3189	subset &= mask;
 3190
 3191	return (superset | subset) == superset;
 3192}
 3193
 3194static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
 3195{
 3196	const u64 feature_and_reserved =
 3197		/* feature (except bit 48; see below) */
 3198		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
 3199		/* reserved */
 3200		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
 3201	u64 vmx_basic = vmx->nested.msrs.basic;
 3202
 3203	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
 3204		return -EINVAL;
 3205
 3206	/*
 3207	 * KVM does not emulate a version of VMX that constrains physical
 3208	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
 3209	 */
 3210	if (data & BIT_ULL(48))
 3211		return -EINVAL;
 3212
 3213	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
 3214	    vmx_basic_vmcs_revision_id(data))
 3215		return -EINVAL;
 3216
 3217	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
 3218		return -EINVAL;
 3219
 3220	vmx->nested.msrs.basic = data;
 3221	return 0;
 3222}
 3223
 3224static int
 3225vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
 3226{
 3227	u64 supported;
 3228	u32 *lowp, *highp;
 3229
 3230	switch (msr_index) {
 3231	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 3232		lowp = &vmx->nested.msrs.pinbased_ctls_low;
 3233		highp = &vmx->nested.msrs.pinbased_ctls_high;
 3234		break;
 3235	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
 3236		lowp = &vmx->nested.msrs.procbased_ctls_low;
 3237		highp = &vmx->nested.msrs.procbased_ctls_high;
 3238		break;
 3239	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
 3240		lowp = &vmx->nested.msrs.exit_ctls_low;
 3241		highp = &vmx->nested.msrs.exit_ctls_high;
 3242		break;
 3243	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
 3244		lowp = &vmx->nested.msrs.entry_ctls_low;
 3245		highp = &vmx->nested.msrs.entry_ctls_high;
 3246		break;
 3247	case MSR_IA32_VMX_PROCBASED_CTLS2:
 3248		lowp = &vmx->nested.msrs.secondary_ctls_low;
 3249		highp = &vmx->nested.msrs.secondary_ctls_high;
 3250		break;
 3251	default:
 3252		BUG();
 3253	}
 3254
 3255	supported = vmx_control_msr(*lowp, *highp);
 3256
 3257	/* Check must-be-1 bits are still 1. */
 3258	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
 3259		return -EINVAL;
 3260
 3261	/* Check must-be-0 bits are still 0. */
 3262	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
 3263		return -EINVAL;
 3264
 3265	*lowp = data;
 3266	*highp = data >> 32;
 3267	return 0;
 3268}
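
/*
 * Illustrative note (not part of the original source): userspace may only
 * shrink what KVM advertises.  The first is_bitwise_subset() call requires
 * every must-be-1 bit of the supported low word to remain 1 in the new low
 * word, and the second requires every may-be-1 bit of the new high word to
 * have been allowed already.  For a supported value of low = 0x16,
 * high = 0xff, restoring low = 0x16 / high = 0x7f succeeds, while
 * low = 0x06 (a required bit cleared) or high = 0x1ff (a new feature bit)
 * is rejected with -EINVAL.
 */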
 3269
 3270static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
 3271{
 3272	const u64 feature_and_reserved_bits =
 3273		/* feature */
 3274		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
 3275		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
 3276		/* reserved */
 3277		GENMASK_ULL(13, 9) | BIT_ULL(31);
 3278	u64 vmx_misc;
 3279
 3280	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
 3281				   vmx->nested.msrs.misc_high);
 3282
 3283	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
 3284		return -EINVAL;
 3285
 3286	if ((vmx->nested.msrs.pinbased_ctls_high &
 3287	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
 3288	    vmx_misc_preemption_timer_rate(data) !=
 3289	    vmx_misc_preemption_timer_rate(vmx_misc))
 3290		return -EINVAL;
 3291
 3292	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
 3293		return -EINVAL;
 3294
 3295	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
 3296		return -EINVAL;
 3297
 3298	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
 3299		return -EINVAL;
 3300
 3301	vmx->nested.msrs.misc_low = data;
 3302	vmx->nested.msrs.misc_high = data >> 32;
 3303	return 0;
 3304}
 3305
 3306static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
 3307{
 3308	u64 vmx_ept_vpid_cap;
 3309
 3310	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
 3311					   vmx->nested.msrs.vpid_caps);
 3312
 3313	/* Every bit is either reserved or a feature bit. */
 3314	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
 3315		return -EINVAL;
 3316
 3317	vmx->nested.msrs.ept_caps = data;
 3318	vmx->nested.msrs.vpid_caps = data >> 32;
 3319	return 0;
 3320}
 3321
 3322static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
 3323{
 3324	u64 *msr;
 3325
 3326	switch (msr_index) {
 3327	case MSR_IA32_VMX_CR0_FIXED0:
 3328		msr = &vmx->nested.msrs.cr0_fixed0;
 3329		break;
 3330	case MSR_IA32_VMX_CR4_FIXED0:
 3331		msr = &vmx->nested.msrs.cr4_fixed0;
 3332		break;
 3333	default:
 3334		BUG();
 3335	}
 3336
 3337	/*
 3338	 * Bits that are 1 in *msr (the "must-be-1" bits during VMX operation)
 3339	 * must also be 1 in the restored value.
 3340	 */
 3341	if (!is_bitwise_subset(data, *msr, -1ULL))
 3342		return -EINVAL;
 3343
 3344	*msr = data;
 3345	return 0;
 3346}
 3347
 3348/*
 3349 * Called when userspace is restoring VMX MSRs.
 3350 *
 3351 * Returns 0 on success, non-0 otherwise.
 3352 */
 3353static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 3354{
 3355	struct vcpu_vmx *vmx = to_vmx(vcpu);
 3356
 3357	switch (msr_index) {
 3358	case MSR_IA32_VMX_BASIC:
 3359		return vmx_restore_vmx_basic(vmx, data);
 3360	case MSR_IA32_VMX_PINBASED_CTLS:
 3361	case MSR_IA32_VMX_PROCBASED_CTLS:
 3362	case MSR_IA32_VMX_EXIT_CTLS:
 3363	case MSR_IA32_VMX_ENTRY_CTLS:
 3364		/*
 3365		 * The "non-true" VMX capability MSRs are generated from the
 3366		 * "true" MSRs, so we do not support restoring them directly.
 3367		 *
 3368		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
 3369		 * should restore the "true" MSRs with the must-be-1 bits
 3370		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
 3371		 * DEFAULT SETTINGS".
 3372		 */
 3373		return -EINVAL;
 3374	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 3375	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
 3376	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
 3377	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
 3378	case MSR_IA32_VMX_PROCBASED_CTLS2:
 3379		return vmx_restore_control_msr(vmx, msr_index, data);
 3380	case MSR_IA32_VMX_MISC:
 3381		return vmx_restore_vmx_misc(vmx, data);
 3382	case MSR_IA32_VMX_CR0_FIXED0:
 3383	case MSR_IA32_VMX_CR4_FIXED0:
 3384		return vmx_restore_fixed0_msr(vmx, msr_index, data);
 3385	case MSR_IA32_VMX_CR0_FIXED1:
 3386	case MSR_IA32_VMX_CR4_FIXED1:
 3387		/*
 3388		 * These MSRs are generated based on the vCPU's CPUID, so we
 3389		 * do not support restoring them directly.
 3390		 */
 3391		return -EINVAL;
 3392	case MSR_IA32_VMX_EPT_VPID_CAP:
 3393		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
 3394	case MSR_IA32_VMX_VMCS_ENUM:
 3395		vmx->nested.msrs.vmcs_enum = data;
 3396		return 0;
 3397	default:
 3398		/*
 3399		 * The rest of the VMX capability MSRs do not support restore.
 3400		 */
 3401		return -EINVAL;
 3402	}
 3403}
 3404
 3405/* Returns 0 on success, non-0 otherwise. */
 3406static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
 3407{
 3408	switch (msr_index) {
 3409	case MSR_IA32_VMX_BASIC:
 3410		*pdata = msrs->basic;
 3411		break;
 3412	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 3413	case MSR_IA32_VMX_PINBASED_CTLS:
 3414		*pdata = vmx_control_msr(
 3415			msrs->pinbased_ctls_low,
 3416			msrs->pinbased_ctls_high);
 3417		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
 3418			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 3419		break;
 3420	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
 3421	case MSR_IA32_VMX_PROCBASED_CTLS:
 3422		*pdata = vmx_control_msr(
 3423			msrs->procbased_ctls_low,
 3424			msrs->procbased_ctls_high);
 3425		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
 3426			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 3427		break;
 3428	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
 3429	case MSR_IA32_VMX_EXIT_CTLS:
 3430		*pdata = vmx_control_msr(
 3431			msrs->exit_ctls_low,
 3432			msrs->exit_ctls_high);
 3433		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
 3434			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 3435		break;
 3436	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
 3437	case MSR_IA32_VMX_ENTRY_CTLS:
 3438		*pdata = vmx_control_msr(
 3439			msrs->entry_ctls_low,
 3440			msrs->entry_ctls_high);
 3441		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
 3442			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 3443		break;
 3444	case MSR_IA32_VMX_MISC:
 3445		*pdata = vmx_control_msr(
 3446			msrs->misc_low,
 3447			msrs->misc_high);
 3448		break;
 3449	case MSR_IA32_VMX_CR0_FIXED0:
 3450		*pdata = msrs->cr0_fixed0;
 3451		break;
 3452	case MSR_IA32_VMX_CR0_FIXED1:
 3453		*pdata = msrs->cr0_fixed1;
 3454		break;
 3455	case MSR_IA32_VMX_CR4_FIXED0:
 3456		*pdata = msrs->cr4_fixed0;
 3457		break;
 3458	case MSR_IA32_VMX_CR4_FIXED1:
 3459		*pdata = msrs->cr4_fixed1;
 3460		break;
 3461	case MSR_IA32_VMX_VMCS_ENUM:
 3462		*pdata = msrs->vmcs_enum;
 3463		break;
 3464	case MSR_IA32_VMX_PROCBASED_CTLS2:
 3465		*pdata = vmx_control_msr(
 3466			msrs->secondary_ctls_low,
 3467			msrs->secondary_ctls_high);
 3468		break;
 3469	case MSR_IA32_VMX_EPT_VPID_CAP:
 3470		*pdata = msrs->ept_caps |
 3471			((u64)msrs->vpid_caps << 32);
 3472		break;
 3473	case MSR_IA32_VMX_VMFUNC:
 3474		*pdata = msrs->vmfunc_controls;
 3475		break;
 3476	default:
 3477		return 1;
 3478	}
 3479
 3480	return 0;
 3481}
 3482
 3483static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
 3484						 uint64_t val)
 3485{
 3486	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
 3487
 3488	return !(val & ~valid_bits);
 3489}
 3490
 3491static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
 3492{
 3493	switch (msr->index) {
 3494	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 3495		if (!nested)
 3496			return 1;
 3497		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
 3498	default:
 3499		return 1;
 3500	}
 3501
 3502	return 0;
 3503}
 3504
 3505/*
 3506 * Reads an msr value (of 'msr_index') into 'pdata'.
 3507 * Returns 0 on success, non-0 otherwise.
 3508 * Assumes vcpu_load() was already called.
 3509 */
 3510static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 3511{
 3512	struct vcpu_vmx *vmx = to_vmx(vcpu);
 3513	struct shared_msr_entry *msr;
 3514
 3515	switch (msr_info->index) {
 3516#ifdef CONFIG_X86_64
 3517	case MSR_FS_BASE:
 3518		msr_info->data = vmcs_readl(GUEST_FS_BASE);
 3519		break;
 3520	case MSR_GS_BASE:
 3521		msr_info->data = vmcs_readl(GUEST_GS_BASE);
 3522		break;
 3523	case MSR_KERNEL_GS_BASE:
 3524		vmx_load_host_state(vmx);
 3525		msr_info->data = vmx->msr_guest_kernel_gs_base;
 3526		break;
 3527#endif
 3528	case MSR_EFER:
 3529		return kvm_get_msr_common(vcpu, msr_info);
 3530	case MSR_IA32_SPEC_CTRL:
 3531		if (!msr_info->host_initiated &&
 3532		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 3533			return 1;
 3534
 3535		msr_info->data = to_vmx(vcpu)->spec_ctrl;
 3536		break;
 3537	case MSR_IA32_ARCH_CAPABILITIES:
 3538		if (!msr_info->host_initiated &&
 3539		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
 3540			return 1;
 3541		msr_info->data = to_vmx(vcpu)->arch_capabilities;
 3542		break;
 3543	case MSR_IA32_SYSENTER_CS:
 3544		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
 3545		break;
 3546	case MSR_IA32_SYSENTER_EIP:
 3547		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
 3548		break;
 3549	case MSR_IA32_SYSENTER_ESP:
 3550		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
 3551		break;
 3552	case MSR_IA32_BNDCFGS:
 3553		if (!kvm_mpx_supported() ||
 3554		    (!msr_info->host_initiated &&
 3555		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
 3556			return 1;
 3557		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
 3558		break;
 3559	case MSR_IA32_MCG_EXT_CTL:
 3560		if (!msr_info->host_initiated &&
 3561		    !(vmx->msr_ia32_feature_control &
 3562		      FEATURE_CONTROL_LMCE))
 3563			return 1;
 3564		msr_info->data = vcpu->arch.mcg_ext_ctl;
 3565		break;
 3566	case MSR_IA32_FEATURE_CONTROL:
 3567		msr_info->data = vmx->msr_ia32_feature_control;
 3568		break;
 3569	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 3570		if (!nested_vmx_allowed(vcpu))
 3571			return 1;
 3572		return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
 3573				       &msr_info->data);
 3574	case MSR_IA32_XSS:
 3575		if (!vmx_xsaves_supported())
 3576			return 1;
 3577		msr_info->data = vcpu->arch.ia32_xss;
 3578		break;
 3579	case MSR_TSC_AUX:
 3580		if (!msr_info->host_initiated &&
 3581		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
 3582			return 1;
 3583		/* Otherwise falls through */
 3584	default:
 3585		msr = find_msr_entry(vmx, msr_info->index);
 3586		if (msr) {
 3587			msr_info->data = msr->data;
 3588			break;
 3589		}
 3590		return kvm_get_msr_common(vcpu, msr_info);
 3591	}
 3592
 3593	return 0;
 3594}
 3595
 3596static void vmx_leave_nested(struct kvm_vcpu *vcpu);
 3597
 3598/*
  3599	 * Writes msr value into the appropriate "register".
 3600 * Returns 0 on success, non-0 otherwise.
 3601 * Assumes vcpu_load() was already called.
 3602 */
 3603static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 3604{
 3605	struct vcpu_vmx *vmx = to_vmx(vcpu);
 3606	struct shared_msr_entry *msr;
 3607	int ret = 0;
 3608	u32 msr_index = msr_info->index;
 3609	u64 data = msr_info->data;
 3610
 3611	switch (msr_index) {
 3612	case MSR_EFER:
 3613		ret = kvm_set_msr_common(vcpu, msr_info);
 3614		break;
 3615#ifdef CONFIG_X86_64
 3616	case MSR_FS_BASE:
 3617		vmx_segment_cache_clear(vmx);
 3618		vmcs_writel(GUEST_FS_BASE, data);
 3619		break;
 3620	case MSR_GS_BASE:
 3621		vmx_segment_cache_clear(vmx);
 3622		vmcs_writel(GUEST_GS_BASE, data);
 3623		break;
 3624	case MSR_KERNEL_GS_BASE:
 3625		vmx_load_host_state(vmx);
 3626		vmx->msr_guest_kernel_gs_base = data;
 3627		break;
 3628#endif
 3629	case MSR_IA32_SYSENTER_CS:
 3630		vmcs_write32(GUEST_SYSENTER_CS, data);
 3631		break;
 3632	case MSR_IA32_SYSENTER_EIP:
 3633		vmcs_writel(GUEST_SYSENTER_EIP, data);
 3634		break;
 3635	case MSR_IA32_SYSENTER_ESP:
 3636		vmcs_writel(GUEST_SYSENTER_ESP, data);
 3637		break;
 3638	case MSR_IA32_BNDCFGS:
 3639		if (!kvm_mpx_supported() ||
 3640		    (!msr_info->host_initiated &&
 3641		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
 3642			return 1;
 3643		if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
 3644		    (data & MSR_IA32_BNDCFGS_RSVD))
 3645			return 1;
 3646		vmcs_write64(GUEST_BNDCFGS, data);
 3647		break;
 3648	case MSR_IA32_SPEC_CTRL:
 3649		if (!msr_info->host_initiated &&
 3650		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 3651			return 1;
 3652
 3653		/* The STIBP bit doesn't fault even if it's not advertised */
 3654		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
 3655			return 1;
 3656
 3657		vmx->spec_ctrl = data;
 3658
 3659		if (!data)
 3660			break;
 3661
 3662		/*
 3663		 * For non-nested:
 3664		 * When it's written (to non-zero) for the first time, pass
 3665		 * it through.
 3666		 *
 3667		 * For nested:
 3668		 * The handling of the MSR bitmap for L2 guests is done in
 3669		 * nested_vmx_merge_msr_bitmap. We should not touch the
 3670		 * vmcs02.msr_bitmap here since it gets completely overwritten
 3671		 * in the merging. We update the vmcs01 here for L1 as well
 3672		 * since it will end up touching the MSR anyway now.
 3673		 */
 3674		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
 3675					      MSR_IA32_SPEC_CTRL,
 3676					      MSR_TYPE_RW);
 3677		break;
 3678	case MSR_IA32_PRED_CMD:
 3679		if (!msr_info->host_initiated &&
 3680		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
 3681			return 1;
 3682
 3683		if (data & ~PRED_CMD_IBPB)
 3684			return 1;
 3685
 3686		if (!data)
 3687			break;
 3688
 3689		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
 3690
 3691		/*
 3692		 * For non-nested:
 3693		 * When it's written (to non-zero) for the first time, pass
 3694		 * it through.
 3695		 *
 3696		 * For nested:
 3697		 * The handling of the MSR bitmap for L2 guests is done in
 3698		 * nested_vmx_merge_msr_bitmap. We should not touch the
 3699		 * vmcs02.msr_bitmap here since it gets completely overwritten
 3700		 * in the merging.
 3701		 */
 3702		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
 3703					      MSR_TYPE_W);
 3704		break;
 3705	case MSR_IA32_ARCH_CAPABILITIES:
 3706		if (!msr_info->host_initiated)
 3707			return 1;
 3708		vmx->arch_capabilities = data;
 3709		break;
 3710	case MSR_IA32_CR_PAT:
 3711		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
 3712			if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
 3713				return 1;
 3714			vmcs_write64(GUEST_IA32_PAT, data);
 3715			vcpu->arch.pat = data;
 3716			break;
 3717		}
 3718		ret = kvm_set_msr_common(vcpu, msr_info);
 3719		break;
 3720	case MSR_IA32_TSC_ADJUST:
 3721		ret = kvm_set_msr_common(vcpu, msr_info);
 3722		break;
 3723	case MSR_IA32_MCG_EXT_CTL:
 3724		if ((!msr_info->host_initiated &&
 3725		     !(to_vmx(vcpu)->msr_ia32_feature_control &
 3726		       FEATURE_CONTROL_LMCE)) ||
 3727		    (data & ~MCG_EXT_CTL_LMCE_EN))
 3728			return 1;
 3729		vcpu->arch.mcg_ext_ctl = data;
 3730		break;
 3731	case MSR_IA32_FEATURE_CONTROL:
 3732		if (!vmx_feature_control_msr_valid(vcpu, data) ||
 3733		    (to_vmx(vcpu)->msr_ia32_feature_control &
 3734		     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
 3735			return 1;
 3736		vmx->msr_ia32_feature_control = data;
 3737		if (msr_info->host_initiated && data == 0)
 3738			vmx_leave_nested(vcpu);
 3739		break;
 3740	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 3741		if (!msr_info->host_initiated)
 3742			return 1; /* they are read-only */
 3743		if (!nested_vmx_allowed(vcpu))
 3744			return 1;
 3745		return vmx_set_vmx_msr(vcpu, msr_index, data);
 3746	case MSR_IA32_XSS:
 3747		if (!vmx_xsaves_supported())
 3748			return 1;
 3749		/*
  3750		 * The only supported bit as of Skylake is bit 8, but
  3751		 * it is not supported in KVM.
 3752		 */
 3753		if (data != 0)
 3754			return 1;
 3755		vcpu->arch.ia32_xss = data;
 3756		if (vcpu->arch.ia32_xss != host_xss)
 3757			add_atomic_switch_msr(vmx, MSR_IA32_XSS,
 3758				vcpu->arch.ia32_xss, host_xss);
 3759		else
 3760			clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
 3761		break;
 3762	case MSR_TSC_AUX:
 3763		if (!msr_info->host_initiated &&
 3764		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
 3765			return 1;
 3766		/* Check reserved bit, higher 32 bits should be zero */
 3767		if ((data >> 32) != 0)
 3768			return 1;
 3769		/* Otherwise falls through */
 3770	default:
 3771		msr = find_msr_entry(vmx, msr_index);
 3772		if (msr) {
 3773			u64 old_msr_data = msr->data;
 3774			msr->data = data;
 3775			if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
 3776				preempt_disable();
 3777				ret = kvm_set_shared_msr(msr->index, msr->data,
 3778							 msr->mask);
 3779				preempt_enable();
 3780				if (ret)
 3781					msr->data = old_msr_data;
 3782			}
 3783			break;
 3784		}
 3785		ret = kvm_set_msr_common(vcpu, msr_info);
 3786	}
 3787
 3788	return ret;
 3789}
 3790
 3791static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 3792{
 3793	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
 3794	switch (reg) {
 3795	case VCPU_REGS_RSP:
 3796		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
 3797		break;
 3798	case VCPU_REGS_RIP:
 3799		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
 3800		break;
 3801	case VCPU_EXREG_PDPTR:
 3802		if (enable_ept)
 3803			ept_save_pdptrs(vcpu);
 3804		break;
 3805	default:
 3806		break;
 3807	}
 3808}
 3809
 3810static __init int cpu_has_kvm_support(void)
 3811{
 3812	return cpu_has_vmx();
 3813}
 3814
 3815static __init int vmx_disabled_by_bios(void)
 3816{
 3817	u64 msr;
 3818
 3819	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
 3820	if (msr & FEATURE_CONTROL_LOCKED) {
 3821		/* launched w/ TXT and VMX disabled */
 3822		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
 3823			&& tboot_enabled())
 3824			return 1;
 3825		/* launched w/o TXT and VMX only enabled w/ TXT */
 3826		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
 3827			&& (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
 3828			&& !tboot_enabled()) {
 3829			printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
 3830				"activate TXT before enabling KVM\n");
 3831			return 1;
 3832		}
 3833		/* launched w/o TXT and VMX disabled */
 3834		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
 3835			&& !tboot_enabled())
 3836			return 1;
 3837	}
 3838
 3839	return 0;
 3840}
 3841
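/*
 * Illustration of the IA32_FEATURE_CONTROL layout tested above: a
 * standalone sketch whose bit positions mirror the SDM/kernel definitions
 * (bit 0 lock, bit 1 VMXON-in-SMX, bit 2 VMXON-outside-SMX).  It only
 * models the non-TXT path of the check above.
 */
#include <stdint.h>
#include <stdio.h>

#define FC_LOCKED		(1u << 0)
#define FC_VMXON_OUTSIDE_SMX	(1u << 2)

static int vmx_usable_without_txt(uint64_t fc)
{
	if (!(fc & FC_LOCKED))
		return 1;	/* unlocked: hardware_enable() can still set it */
	return !!(fc & FC_VMXON_OUTSIDE_SMX);
}

int main(void)
{
	printf("%d\n", vmx_usable_without_txt(0x5)); /* typical BIOS-enabled: 1 */
	printf("%d\n", vmx_usable_without_txt(0x1)); /* locked, VMX off: 0 */
	return 0;
}
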
 3842static void kvm_cpu_vmxon(u64 addr)
 3843{
 3844	cr4_set_bits(X86_CR4_VMXE);
 3845	intel_pt_handle_vmx(1);
 3846
 3847	asm volatile (ASM_VMX_VMXON_RAX
 3848			: : "a"(&addr), "m"(addr)
 3849			: "memory", "cc");
 3850}
 3851
 3852static int hardware_enable(void)
 3853{
 3854	int cpu = raw_smp_processor_id();
 3855	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 3856	u64 old, test_bits;
 3857
 3858	if (cr4_read_shadow() & X86_CR4_VMXE)
 3859		return -EBUSY;
 3860
 3861	/*
 3862	 * This can happen if we hot-added a CPU but failed to allocate
 3863	 * VP assist page for it.
 3864	 */
 3865	if (static_branch_unlikely(&enable_evmcs) &&
 3866	    !hv_get_vp_assist_page(cpu))
 3867		return -EFAULT;
 3868
 3869	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
 3870	INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
 3871	spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 3872
 3873	/*
 3874	 * Now we can enable the vmclear operation in kdump
 3875	 * since the loaded_vmcss_on_cpu list on this cpu
 3876	 * has been initialized.
 3877	 *
  3878	 * Though the cpu is not in VMX operation now, there
  3879	 * is no problem in enabling the vmclear operation,
  3880	 * since the loaded_vmcss_on_cpu list is empty.
 3881	 */
 3882	crash_enable_local_vmclear(cpu);
 3883
 3884	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 3885
 3886	test_bits = FEATURE_CONTROL_LOCKED;
 3887	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 3888	if (tboot_enabled())
 3889		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
 3890
 3891	if ((old & test_bits) != test_bits) {
 3892		/* enable and lock */
 3893		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
 3894	}
 3895	kvm_cpu_vmxon(phys_addr);
 3896	if (enable_ept)
 3897		ept_sync_global();
 3898
 3899	return 0;
 3900}
 3901
 3902static void vmclear_local_loaded_vmcss(void)
 3903{
 3904	int cpu = raw_smp_processor_id();
 3905	struct loaded_vmcs *v, *n;
 3906
 3907	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
 3908				 loaded_vmcss_on_cpu_link)
 3909		__loaded_vmcs_clear(v);
 3910}
 3911
 3912
 3913/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
 3914 * tricks.
 3915 */
 3916static void kvm_cpu_vmxoff(void)
 3917{
 3918	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
 3919
 3920	intel_pt_handle_vmx(0);
 3921	cr4_clear_bits(X86_CR4_VMXE);
 3922}
 3923
 3924static void hardware_disable(void)
 3925{
 3926	vmclear_local_loaded_vmcss();
 3927	kvm_cpu_vmxoff();
 3928}
 3929
 3930static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
 3931				      u32 msr, u32 *result)
 3932{
 3933	u32 vmx_msr_low, vmx_msr_high;
 3934	u32 ctl = ctl_min | ctl_opt;
 3935
 3936	rdmsr(msr, vmx_msr_low, vmx_msr_high);
 3937
 3938	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
 3939	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
 3940
 3941	/* Ensure minimum (required) set of control bits are supported. */
 3942	if (ctl_min & ~ctl)
 3943		return -EIO;
 3944
 3945	*result = ctl;
 3946	return 0;
 3947}
 3948
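/*
 * Illustration of the adjustment above: a standalone sketch with a made-up
 * MSR value.  In the VMX control MSRs the low 32 bits are the bits that
 * must be 1 and the high 32 bits are the bits that are allowed to be 1.
 */
#include <stdint.h>
#include <stdio.h>

static int adjust(uint32_t min, uint32_t opt, uint64_t msr, uint32_t *out)
{
	uint32_t lo = (uint32_t)msr, hi = (uint32_t)(msr >> 32);
	uint32_t ctl = (min | opt) & hi;	/* drop unsupported optional bits */

	ctl |= lo;				/* force the must-be-1 bits */
	if (min & ~ctl)				/* a required bit is unsupported */
		return -1;
	*out = ctl;
	return 0;
}

int main(void)
{
	/* hypothetical MSR: must-be-1 = 0x16, allowed-1 = 0x7f */
	uint64_t msr = ((uint64_t)0x7f << 32) | 0x16;
	uint32_t ctl;

	if (!adjust(0x08, 0x40, msr, &ctl))
		printf("ctl = 0x%x\n", ctl);	/* 0x5e */
	return 0;
}
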
 3949static __init bool allow_1_setting(u32 msr, u32 ctl)
 3950{
 3951	u32 vmx_msr_low, vmx_msr_high;
 3952
 3953	rdmsr(msr, vmx_msr_low, vmx_msr_high);
 3954	return vmx_msr_high & ctl;
 3955}
 3956
 3957static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 3958{
 3959	u32 vmx_msr_low, vmx_msr_high;
 3960	u32 min, opt, min2, opt2;
 3961	u32 _pin_based_exec_control = 0;
 3962	u32 _cpu_based_exec_control = 0;
 3963	u32 _cpu_based_2nd_exec_control = 0;
 3964	u32 _vmexit_control = 0;
 3965	u32 _vmentry_control = 0;
 3966
 3967	memset(vmcs_conf, 0, sizeof(*vmcs_conf));
 3968	min = CPU_BASED_HLT_EXITING |
 3969#ifdef CONFIG_X86_64
 3970	      CPU_BASED_CR8_LOAD_EXITING |
 3971	      CPU_BASED_CR8_STORE_EXITING |
 3972#endif
 3973	      CPU_BASED_CR3_LOAD_EXITING |
 3974	      CPU_BASED_CR3_STORE_EXITING |
 3975	      CPU_BASED_UNCOND_IO_EXITING |
 3976	      CPU_BASED_MOV_DR_EXITING |
 3977	      CPU_BASED_USE_TSC_OFFSETING |
 3978	      CPU_BASED_MWAIT_EXITING |
 3979	      CPU_BASED_MONITOR_EXITING |
 3980	      CPU_BASED_INVLPG_EXITING |
 3981	      CPU_BASED_RDPMC_EXITING;
 3982
 3983	opt = CPU_BASED_TPR_SHADOW |
 3984	      CPU_BASED_USE_MSR_BITMAPS |
 3985	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 3986	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
 3987				&_cpu_based_exec_control) < 0)
 3988		return -EIO;
 3989#ifdef CONFIG_X86_64
 3990	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
 3991		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
 3992					   ~CPU_BASED_CR8_STORE_EXITING;
 3993#endif
 3994	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
 3995		min2 = 0;
 3996		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 3997			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 3998			SECONDARY_EXEC_WBINVD_EXITING |
 3999			SECONDARY_EXEC_ENABLE_VPID |
 4000			SECONDARY_EXEC_ENABLE_EPT |
 4001			SECONDARY_EXEC_UNRESTRICTED_GUEST |
 4002			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
 4003			SECONDARY_EXEC_DESC |
 4004			SECONDARY_EXEC_RDTSCP |
 4005			SECONDARY_EXEC_ENABLE_INVPCID |
 4006			SECONDARY_EXEC_APIC_REGISTER_VIRT |
 4007			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 4008			SECONDARY_EXEC_SHADOW_VMCS |
 4009			SECONDARY_EXEC_XSAVES |
 4010			SECONDARY_EXEC_RDSEED_EXITING |
 4011			SECONDARY_EXEC_RDRAND_EXITING |
 4012			SECONDARY_EXEC_ENABLE_PML |
 4013			SECONDARY_EXEC_TSC_SCALING |
 4014			SECONDARY_EXEC_ENABLE_VMFUNC;
 4015		if (adjust_vmx_controls(min2, opt2,
 4016					MSR_IA32_VMX_PROCBASED_CTLS2,
 4017					&_cpu_based_2nd_exec_control) < 0)
 4018			return -EIO;
 4019	}
 4020#ifndef CONFIG_X86_64
 4021	if (!(_cpu_based_2nd_exec_control &
 4022				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
 4023		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
 4024#endif
 4025
 4026	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
 4027		_cpu_based_2nd_exec_control &= ~(
 4028				SECONDARY_EXEC_APIC_REGISTER_VIRT |
 4029				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 4030				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 4031
 4032	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
 4033		&vmx_capability.ept, &vmx_capability.vpid);
 4034
 4035	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
  4036		/* CR3 accesses and invlpg don't need to cause VM Exits when
  4037		   EPT is enabled. */
 4038		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
 4039					     CPU_BASED_CR3_STORE_EXITING |
 4040					     CPU_BASED_INVLPG_EXITING);
 4041	} else if (vmx_capability.ept) {
 4042		vmx_capability.ept = 0;
  4043		pr_warn_once("EPT capabilities should not be reported when the "
  4044				"1-setting of the enable-EPT control is unsupported\n");
 4045	}
 4046	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
 4047		vmx_capability.vpid) {
 4048		vmx_capability.vpid = 0;
  4049		pr_warn_once("VPID capability should not be reported when the "
  4050				"1-setting of the enable-VPID control is unsupported\n");
 4051	}
 4052
 4053	min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
 4054#ifdef CONFIG_X86_64
 4055	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 4056#endif
 4057	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
 4058		VM_EXIT_CLEAR_BNDCFGS;
 4059	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 4060				&_vmexit_control) < 0)
 4061		return -EIO;
 4062
 4063	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
 4064	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
 4065		 PIN_BASED_VMX_PREEMPTION_TIMER;
 4066	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
 4067				&_pin_based_exec_control) < 0)
 4068		return -EIO;
 4069
 4070	if (cpu_has_broken_vmx_preemption_timer())
 4071		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 4072	if (!(_cpu_based_2nd_exec_control &
 4073		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
 4074		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
 4075
 4076	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
 4077	opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
 4078	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
 4079				&_vmentry_control) < 0)
 4080		return -EIO;
 4081
 4082	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
 4083
 4084	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
 4085	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
 4086		return -EIO;
 4087
 4088#ifdef CONFIG_X86_64
 4089	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
 4090	if (vmx_msr_high & (1u<<16))
 4091		return -EIO;
 4092#endif
 4093
 4094	/* Require Write-Back (WB) memory type for VMCS accesses. */
 4095	if (((vmx_msr_high >> 18) & 15) != 6)
 4096		return -EIO;
 4097
 4098	vmcs_conf->size = vmx_msr_high & 0x1fff;
 4099	vmcs_conf->order = get_order(vmcs_conf->size);
 4100	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
 4101
 4102	/* KVM supports Enlightened VMCS v1 only */
 4103	if (static_branch_unlikely(&enable_evmcs))
 4104		vmcs_conf->revision_id = KVM_EVMCS_VERSION;
 4105	else
 4106		vmcs_conf->revision_id = vmx_msr_low;
 4107
 4108	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
 4109	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
 4110	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
 4111	vmcs_conf->vmexit_ctrl         = _vmexit_control;
 4112	vmcs_conf->vmentry_ctrl        = _vmentry_control;
 4113
 4114	if (static_branch_unlikely(&enable_evmcs))
 4115		evmcs_sanitize_exec_ctrls(vmcs_conf);
 4116
 4117	cpu_has_load_ia32_efer =
 4118		allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
 4119				VM_ENTRY_LOAD_IA32_EFER)
 4120		&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
 4121				   VM_EXIT_LOAD_IA32_EFER);
 4122
 4123	cpu_has_load_perf_global_ctrl =
 4124		allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
 4125				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
 4126		&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
 4127				   VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
 4128
 4129	/*
  4130	 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL,
  4131	 * but due to the errata below it can't be used. The workaround is to
  4132	 * use the MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
 4133	 *
 4134	 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
 4135	 *
 4136	 * AAK155             (model 26)
 4137	 * AAP115             (model 30)
 4138	 * AAT100             (model 37)
 4139	 * BC86,AAY89,BD102   (model 44)
 4140	 * BA97               (model 46)
 4141	 *
 4142	 */
 4143	if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
 4144		switch (boot_cpu_data.x86_model) {
 4145		case 26:
 4146		case 30:
 4147		case 37:
 4148		case 44:
 4149		case 46:
 4150			cpu_has_load_perf_global_ctrl = false;
 4151			printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
 4152					"does not work properly. Using workaround\n");
 4153			break;
 4154		default:
 4155			break;
 4156		}
 4157	}
 4158
 4159	if (boot_cpu_has(X86_FEATURE_XSAVES))
 4160		rdmsrl(MSR_IA32_XSS, host_xss);
 4161
 4162	return 0;
 4163}
 4164
 4165static struct vmcs *alloc_vmcs_cpu(int cpu)
 4166{
 4167	int node = cpu_to_node(cpu);
 4168	struct page *pages;
 4169	struct vmcs *vmcs;
 4170
 4171	pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
 4172	if (!pages)
 4173		return NULL;
 4174	vmcs = page_address(pages);
 4175	memset(vmcs, 0, vmcs_config.size);
 4176	vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
 4177	return vmcs;
 4178}
 4179
 4180static void free_vmcs(struct vmcs *vmcs)
 4181{
 4182	free_pages((unsigned long)vmcs, vmcs_config.order);
 4183}
 4184
 4185/*
 4186 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
 4187 */
 4188static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 4189{
 4190	if (!loaded_vmcs->vmcs)
 4191		return;
 4192	loaded_vmcs_clear(loaded_vmcs);
 4193	free_vmcs(loaded_vmcs->vmcs);
 4194	loaded_vmcs->vmcs = NULL;
 4195	if (loaded_vmcs->msr_bitmap)
 4196		free_page((unsigned long)loaded_vmcs->msr_bitmap);
 4197	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
 4198}
 4199
 4200static struct vmcs *alloc_vmcs(void)
 4201{
 4202	return alloc_vmcs_cpu(raw_smp_processor_id());
 4203}
 4204
 4205static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 4206{
 4207	loaded_vmcs->vmcs = alloc_vmcs();
 4208	if (!loaded_vmcs->vmcs)
 4209		return -ENOMEM;
 4210
 4211	loaded_vmcs->shadow_vmcs = NULL;
 4212	loaded_vmcs_init(loaded_vmcs);
 4213
 4214	if (cpu_has_vmx_msr_bitmap()) {
 4215		loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 4216		if (!loaded_vmcs->msr_bitmap)
 4217			goto out_vmcs;
 4218		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
 4219	}
 4220	return 0;
 4221
 4222out_vmcs:
 4223	free_loaded_vmcs(loaded_vmcs);
 4224	return -ENOMEM;
 4225}
 4226
 4227static void free_kvm_area(void)
 4228{
 4229	int cpu;
 4230
 4231	for_each_possible_cpu(cpu) {
 4232		free_vmcs(per_cpu(vmxarea, cpu));
 4233		per_cpu(vmxarea, cpu) = NULL;
 4234	}
 4235}
 4236
 4237enum vmcs_field_width {
 4238	VMCS_FIELD_WIDTH_U16 = 0,
 4239	VMCS_FIELD_WIDTH_U64 = 1,
 4240	VMCS_FIELD_WIDTH_U32 = 2,
 4241	VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
 4242};
 4243
 4244static inline int vmcs_field_width(unsigned long field)
 4245{
 4246	if (0x1 & field)	/* the *_HIGH fields are all 32 bit */
 4247		return VMCS_FIELD_WIDTH_U32;
 4248	return (field >> 13) & 0x3 ;
 4249}
 4250
 4251static inline int vmcs_field_readonly(unsigned long field)
 4252{
 4253	return (((field >> 10) & 0x3) == 1);
 4254}
 4255
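/*
 * Illustration of the field-encoding bits decoded above: a standalone
 * sketch.  The two encodings are the architectural values for
 * GUEST_ES_SELECTOR (0x0800) and VM_EXIT_REASON (0x4402).
 */
#include <stdio.h>

static int width(unsigned long field)
{
	if (field & 0x1)		/* *_HIGH halves are read as u32 */
		return 2;
	return (field >> 13) & 0x3;
}

static int readonly(unsigned long field)
{
	return ((field >> 10) & 0x3) == 1;
}

int main(void)
{
	/* 0x0800: width 0 (u16), read/write */
	printf("0x0800: width %d ro %d\n", width(0x0800), readonly(0x0800));
	/* 0x4402: width 2 (u32), read-only data field */
	printf("0x4402: width %d ro %d\n", width(0x4402), readonly(0x4402));
	return 0;
}
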
 4256static void init_vmcs_shadow_fields(void)
 4257{
 4258	int i, j;
 4259
 4260	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
 4261		u16 field = shadow_read_only_fields[i];
 4262		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
 4263		    (i + 1 == max_shadow_read_only_fields ||
 4264		     shadow_read_only_fields[i + 1] != field + 1))
 4265			pr_err("Missing field from shadow_read_only_field %x\n",
 4266			       field + 1);
 4267
 4268		clear_bit(field, vmx_vmread_bitmap);
 4269#ifdef CONFIG_X86_64
 4270		if (field & 1)
 4271			continue;
 4272#endif
 4273		if (j < i)
 4274			shadow_read_only_fields[j] = field;
 4275		j++;
 4276	}
 4277	max_shadow_read_only_fields = j;
 4278
 4279	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
 4280		u16 field = shadow_read_write_fields[i];
 4281		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
 4282		    (i + 1 == max_shadow_read_write_fields ||
 4283		     shadow_read_write_fields[i + 1] != field + 1))
 4284			pr_err("Missing field from shadow_read_write_field %x\n",
 4285			       field + 1);
 4286
 4287		/*
 4288		 * PML and the preemption timer can be emulated, but the
 4289		 * processor cannot vmwrite to fields that don't exist
 4290		 * on bare metal.
 4291		 */
 4292		switch (field) {
 4293		case GUEST_PML_INDEX:
 4294			if (!cpu_has_vmx_pml())
 4295				continue;
 4296			break;
 4297		case VMX_PREEMPTION_TIMER_VALUE:
 4298			if (!cpu_has_vmx_preemption_timer())
 4299				continue;
 4300			break;
 4301		case GUEST_INTR_STATUS:
 4302			if (!cpu_has_vmx_apicv())
 4303				continue;
 4304			break;
 4305		default:
 4306			break;
 4307		}
 4308
 4309		clear_bit(field, vmx_vmwrite_bitmap);
 4310		clear_bit(field, vmx_vmread_bitmap);
 4311#ifdef CONFIG_X86_64
 4312		if (field & 1)
 4313			continue;
 4314#endif
 4315		if (j < i)
 4316			shadow_read_write_fields[j] = field;
 4317		j++;
 4318	}
 4319	max_shadow_read_write_fields = j;
 4320}
 4321
 4322static __init int alloc_kvm_area(void)
 4323{
 4324	int cpu;
 4325
 4326	for_each_possible_cpu(cpu) {
 4327		struct vmcs *vmcs;
 4328
 4329		vmcs = alloc_vmcs_cpu(cpu);
 4330		if (!vmcs) {
 4331			free_kvm_area();
 4332			return -ENOMEM;
 4333		}
 4334
 4335		per_cpu(vmxarea, cpu) = vmcs;
 4336	}
 4337	return 0;
 4338}
 4339
 4340static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
 4341		struct kvm_segment *save)
 4342{
 4343	if (!emulate_invalid_guest_state) {
 4344		/*
 4345		 * CS and SS RPL should be equal during guest entry according
 4346		 * to VMX spec, but in reality it is not always so. Since vcpu
 4347		 * is in the middle of the transition from real mode to
 4348		 * protected mode it is safe to assume that RPL 0 is a good
 4349		 * default value.
 4350		 */
 4351		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
 4352			save->selector &= ~SEGMENT_RPL_MASK;
 4353		save->dpl = save->selector & SEGMENT_RPL_MASK;
 4354		save->s = 1;
 4355	}
 4356	vmx_set_segment(vcpu, save, seg);
 4357}
 4358
 4359static void enter_pmode(struct kvm_vcpu *vcpu)
 4360{
 4361	unsigned long flags;
 4362	struct vcpu_vmx *vmx = to_vmx(vcpu);
 4363
 4364	/*
  4365	 * Update the real mode segment cache. It may not be up to date if a
  4366	 * segment register was written while the vcpu was in guest mode.
 4367	 */
 4368	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
 4369	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
 4370	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
 4371	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
 4372	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
 4373	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
 4374
 4375	vmx->rmode.vm86_active = 0;
 4376
 4377	vmx_segment_cache_clear(vmx);
 4378
 4379	vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
 4380
 4381	flags = vmcs_readl(GUEST_RFLAGS);
 4382	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
 4383	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
 4384	vmcs_writel(GUEST_RFLAGS, flags);
 4385
 4386	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
 4387			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
 4388
 4389	update_exception_bitmap(vcpu);
 4390
 4391	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
 4392	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
 4393	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
 4394	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
 4395	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
 4396	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
 4397}
 4398
 4399static void fix_rmode_seg(int seg, struct kvm_segment *save)
 4400{
 4401	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
 4402	struct kvm_segment var = *save;
 4403
 4404	var.dpl = 0x3;
 4405	if (seg == VCPU_SREG_CS)
 4406		var.type = 0x3;
 4407
 4408	if (!emulate_invalid_guest_state) {
 4409		var.selector = var.base >> 4;
 4410		var.base = var.base & 0xffff0;
 4411		var.limit = 0xffff;
 4412		var.g = 0;
 4413		var.db = 0;
 4414		var.present = 1;
 4415		var.s = 1;
 4416		var.l = 0;
 4417		var.unusable = 0;
 4418		var.type = 0x3;
 4419		var.avl = 0;
 4420		if (save->base & 0xf)
 4421			printk_once(KERN_WARNING "kvm: segment base is not "
 4422					"paragraph aligned when entering "
 4423					"protected mode (seg=%d)", seg);
 4424	}
 4425
 4426	vmcs_write16(sf->selector, var.selector);
 4427	vmcs_writel(sf->base, var.base);
 4428	vmcs_write32(sf->limit, var.limit);
 4429	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
 4430}
 4431
 4432static void enter_rmode(struct kvm_vcpu *vcpu)
 4433{
 4434	unsigned long flags;
 4435	struct vcpu_vmx *vmx = to_vmx(vcpu);
 4436	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
 4437
 4438	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
 4439	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
 4440	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
 4441	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
 4442	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
 4443	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
 4444	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
 4445
 4446	vmx->rmode.vm86_active = 1;
 4447
 4448	/*
 4449	 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
 4450	 * vcpu. Warn the user that an update is overdue.
 4451	 */
 4452	if (!kvm_vmx->tss_addr)
  4453		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
  4454			     "called before entering vcpu\n");
 4455
 4456	vmx_segment_cache_clear(vmx);
 4457
 4458	vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
 4459	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
 4460	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 4461
 4462	flags = vmcs_readl(GUEST_RFLAGS);
 4463	vmx->rmode.save_rflags = flags;
 4464
 4465	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 4466
 4467	vmcs_writel(GUEST_RFLAGS, flags);
 4468	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
 4469	update_exception_bitmap(vcpu);
 4470
 4471	fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
 4472	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
 4473	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
 4474	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
 4475	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
 4476	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
 4477
 4478	kvm_mmu_reset_context(vcpu);
 4479}
 4480
 4481static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 4482{
 4483	struct vcpu_vmx *vmx = to_vmx(vcpu);
 4484	struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
 4485
 4486	if (!msr)
 4487		return;
 4488
 4489	/*
 4490	 * Force kernel_gs_base reloading before EFER changes, as control
 4491	 * of this msr depends on is_long_mode().
 4492	 */
 4493	vmx_load_host_state(to_vmx(vcpu));
 4494	vcpu->arch.efer = efer;
 4495	if (efer & EFER_LMA) {
 4496		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 4497		msr->data = efer;
 4498	} else {
 4499		vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 4500
 4501		msr->data = efer & ~EFER_LME;
 4502	}
 4503	setup_msrs(vmx);
 4504}
 4505
 4506#ifdef CONFIG_X86_64
 4507
 4508static void enter_lmode(struct kvm_vcpu *vcpu)
 4509{
 4510	u32 guest_tr_ar;
 4511
 4512	vmx_segment_cache_clear(to_vmx(vcpu));
 4513
 4514	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
 4515	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
  4516		pr_debug_ratelimited("%s: tss fixup for long mode.\n",
 4517				     __func__);
 4518		vmcs_write32(GUEST_TR_AR_BYTES,
 4519			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
 4520			     | VMX_AR_TYPE_BUSY_64_TSS);
 4521	}
 4522	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
 4523}
 4524
 4525static void exit_lmode(struct kvm_vcpu *vcpu)
 4526{
 4527	vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 4528	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
 4529}
 4530
 4531#endif
 4532
 4533static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
 4534				bool invalidate_gpa)
 4535{
 4536	if (enable_ept && (invalidate_gpa || !enable_vpid)) {
 4537		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 4538			return;
 4539		ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
 4540	} else {
 4541		vpid_sync_context(vpid);
 4542	}
 4543}
 4544
 4545static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 4546{
 4547	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
 4548}
 4549
 4550static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 4551{
 4552	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
 4553
 4554	vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
 4555	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
 4556}
 4557
 4558static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
 4559{
 4560	if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
 4561		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
 4562	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 4563}
 4564
 4565static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 4566{
 4567	ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
 4568
 4569	vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
 4570	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
 4571}
 4572
 4573static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
 4574{
 4575	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 4576
 4577	if (!test_bit(VCPU_EXREG_PDPTR,
 4578		      (unsigned long *)&vcpu->arch.regs_dirty))
 4579		return;
 4580
 4581	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
 4582		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
 4583		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
 4584		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
 4585		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
 4586	}
 4587}
 4588
 4589static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 4590{
 4591	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 4592
 4593	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
 4594		mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
 4595		mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
 4596		mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
 4597		mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
 4598	}
 4599
 4600	__set_bit(VCPU_EXREG_PDPTR,
 4601		  (unsigned long *)&vcpu->arch.regs_avail);
 4602	__set_bit(VCPU_EXREG_PDPTR,
 4603		  (unsigned long *)&vcpu->arch.regs_dirty);
 4604}
 4605
 4606static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
 4607{
 4608	u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
 4609	u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
 4610	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 4611
 4612	if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
 4613		SECONDARY_EXEC_UNRESTRICTED_GUEST &&
 4614	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
 4615		fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
 4616
 4617	return fixed_bits_valid(val, fixed0, fixed1);
 4618}
 4619
 4620static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
 4621{
 4622	u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
 4623	u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
 4624
 4625	return fixed_bits_valid(val, fixed0, fixed1);
 4626}
 4627
 4628static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
 4629{
 4630	u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
 4631	u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
 4632
 4633	return fixed_bits_valid(val, fixed0, fixed1);
 4634}
 4635
 4636/* No difference in the restrictions on guest and host CR4 in VMX operation. */
 4637#define nested_guest_cr4_valid	nested_cr4_valid
 4638#define nested_host_cr4_valid	nested_cr4_valid
 4639
 4640static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 4641
 4642static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 4643					unsigned long cr0,
 4644					struct kvm_vcpu *vcpu)
 4645{
 4646	if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
 4647		vmx_decache_cr3(vcpu);
 4648	if (!(cr0 & X86_CR0_PG)) {
 4649		/* From paging/starting to nonpaging */
 4650		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
 4651			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
 4652			     (CPU_BASED_CR3_LOAD_EXITING |
 4653			      CPU_BASED_CR3_STORE_EXITING));
 4654		vcpu->arch.cr0 = cr0;
 4655		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
 4656	} else if (!is_paging(vcpu)) {
 4657		/* From nonpaging to paging */
 4658		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
 4659			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
 4660			     ~(CPU_BASED_CR3_LOAD_EXITING |
 4661			       CPU_BASED_CR3_STORE_EXITING));
 4662		vcpu->arch.cr0 = cr0;
 4663		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
 4664	}
 4665
 4666	if (!(cr0 & X86_CR0_WP))
 4667		*hw_cr0 &= ~X86_CR0_WP;
 4668}
 4669
 4670static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 4671{
 4672	struct vcpu_vmx *vmx = to_vmx(vcpu);
 4673	unsigned long hw_cr0;
 4674
 4675	hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
 4676	if (enable_unrestricted_guest)
 4677		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
 4678	else {
 4679		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
 4680
 4681		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
 4682			enter_pmode(vcpu);
 4683
 4684		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
 4685			enter_rmode(vcpu);
 4686	}
 4687
 4688#ifdef CONFIG_X86_64
 4689	if (vcpu->arch.efer & EFER_LME) {
 4690		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
 4691			enter_lmode(vcpu);
 4692		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
 4693			exit_lmode(vcpu);
 4694	}
 4695#endif
 4696
 4697	if (enable_ept && !enable_unrestricted_guest)
 4698		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
 4699
 4700	vmcs_writel(CR0_READ_SHADOW, cr0);
 4701	vmcs_writel(GUEST_CR0, hw_cr0);
 4702	vcpu->arch.cr0 = cr0;
 4703
 4704	/* depends on vcpu->arch.cr0 to be set to a new value */
 4705	vmx->emulation_required = emulation_required(vcpu);
 4706}
 4707
 4708static int get_ept_level(struct kvm_vcpu *vcpu)
 4709{
 4710	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
 4711		return 5;
 4712	return 4;
 4713}
 4714
 4715static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 4716{
 4717	u64 eptp = VMX_EPTP_MT_WB;
 4718
 4719	eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
 4720
 4721	if (enable_ept_ad_bits &&
 4722	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
 4723		eptp |= VMX_EPTP_AD_ENABLE_BIT;
 4724	eptp |= (root_hpa & PAGE_MASK);
 4725
 4726	return eptp;
 4727}
 4728
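/*
 * Illustration of the EPTP built above: a standalone sketch whose
 * constants mirror the VMX_EPTP_* definitions (bits 2:0 memory type,
 * bits 5:3 page-walk length minus one, bit 6 accessed/dirty enable),
 * using a made-up, page-aligned root.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t root_hpa = 0x123456000ULL;	/* hypothetical EPT root */
	uint64_t eptp = 6;			/* WB memory type */

	eptp |= 3ULL << 3;			/* 4-level page walk */
	eptp |= 1ULL << 6;			/* enable A/D bits */
	eptp |= root_hpa;			/* root is page aligned */

	printf("eptp = 0x%llx\n", (unsigned long long)eptp); /* 0x12345605e */
	return 0;
}
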
 4729static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 4730{
 4731	unsigned long guest_cr3;
 4732	u64 eptp;
 4733
 4734	guest_cr3 = cr3;
 4735	if (enable_ept) {
 4736		eptp = construct_eptp(vcpu, cr3);
 4737		vmcs_write64(EPT_POINTER, eptp);
 4738		if (enable_unrestricted_guest || is_paging(vcpu) ||
 4739		    is_guest_mode(vcpu))
 4740			guest_cr3 = kvm_read_cr3(vcpu);
 4741		else
 4742			guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr;
 4743		ept_load_pdptrs(vcpu);
 4744	}
 4745
 4746	vmx_flush_tlb(vcpu, true);
 4747	vmcs_writel(GUEST_CR3, guest_cr3);
 4748}
 4749
 4750static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 4751{
 4752	/*
 4753	 * Pass through host's Machine Check Enable value to hw_cr4, which
 4754	 * is in force while we are in guest mode.  Do not let guests control
 4755	 * this bit, even if host CR4.MCE == 0.
 4756	 */
 4757	unsigned long hw_cr4;
 4758
 4759	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
 4760	if (enable_unrestricted_guest)
 4761		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
 4762	else if (to_vmx(vcpu)->rmode.vm86_active)
 4763		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
 4764	else
 4765		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
 4766
 4767	if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
 4768		if (cr4 & X86_CR4_UMIP) {
 4769			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
 4770				SECONDARY_EXEC_DESC);
 4771			hw_cr4 &= ~X86_CR4_UMIP;
 4772		} else if (!is_guest_mode(vcpu) ||
 4773			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
 4774			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
 4775					SECONDARY_EXEC_DESC);
 4776	}
 4777
 4778	if (cr4 & X86_CR4_VMXE) {
 4779		/*
 4780		 * To use VMXON (and later other VMX instructions), a guest
 4781		 * must first be able to turn on cr4.VMXE (see handle_vmon()).
 4782		 * So basically the check on whether to allow nested VMX
 4783		 * is here.
 4784		 */
 4785		if (!nested_vmx_allowed(vcpu))
 4786			return 1;
 4787	}
 4788
 4789	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
 4790		return 1;
 4791
 4792	vcpu->arch.cr4 = cr4;
 4793
 4794	if (!enable_unrestricted_guest) {
 4795		if (enable_ept) {
 4796			if (!is_paging(vcpu)) {
 4797				hw_cr4 &= ~X86_CR4_PAE;
 4798				hw_cr4 |= X86_CR4_PSE;
 4799			} else if (!(cr4 & X86_CR4_PAE)) {
 4800				hw_cr4 &= ~X86_CR4_PAE;
 4801			}
 4802		}
 4803
 4804		/*
 4805		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
 4806		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
 4807		 * to be manually disabled when guest switches to non-paging
 4808		 * mode.
 4809		 *
 4810		 * If !enable_unrestricted_guest, the CPU is always running
 4811		 * with CR0.PG=1 and CR4 needs to be modified.
 4812		 * If enable_unrestricted_guest, the CPU automatically
 4813		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
 4814		 */
 4815		if (!is_paging(vcpu))
 4816			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
 4817	}
 4818
 4819	vmcs_writel(CR4_READ_SHADOW, cr4);
 4820	vmcs_writel(GUEST_CR4, hw_cr4);
 4821	return 0;
 4822}
 4823
 4824static void vmx_get_segment(struct kvm_vcpu *vcpu,
 4825			    struct kvm_segment *var, int seg)
 4826{
 4827	struct vcpu_vmx *vmx = to_vmx(vcpu);
 4828	u32 ar;
 4829
 4830	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
 4831		*var = vmx->rmode.segs[seg];
 4832		if (seg == VCPU_SREG_TR
 4833		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
 4834			return;
 4835		var->base = vmx_read_guest_seg_base(vmx, seg);
 4836		var->selector = vmx_read_guest_seg_selector(vmx, seg);
 4837		return;
 4838	}
 4839	var->base = vmx_read_guest_seg_base(vmx, seg);
 4840	var->limit = vmx_read_guest_seg_limit(vmx, seg);
 4841	var->selector = vmx_read_guest_seg_selector(vmx, seg);
 4842	ar = vmx_read_guest_seg_ar(vmx, seg);
 4843	var->unusable = (ar >> 16) & 1;
 4844	var->type = ar & 15;
 4845	var->s = (ar >> 4) & 1;
 4846	var->dpl = (ar >> 5) & 3;
 4847	/*
  4848	 * Some userspaces do not preserve the unusable property. Since a
  4849	 * usable segment has to be present according to the VMX spec, we can
  4850	 * use the present property to paper over that userspace bug by always
  4851	 * marking an unusable segment as nonpresent. vmx_segment_access_rights()
  4852	 * already marks a nonpresent segment as unusable.
 4853	 */
 4854	var->present = !var->unusable;
 4855	var->avl = (ar >> 12) & 1;
 4856	var->l = (ar >> 13) & 1;
 4857	var->db = (ar >> 14) & 1;
 4858	var->g = (ar >> 15) & 1;
 4859}
 4860
 4861static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
 4862{
 4863	struct kvm_segment s;
 4864
 4865	if (to_vmx(vcpu)->rmode.vm86_active) {
 4866		vmx_get_segment(vcpu, &s, seg);
 4867		return s.base;
 4868	}
 4869	return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
 4870}
 4871
 4872static int vmx_get_cpl(struct kvm_vcpu *vcpu)
 4873{
 4874	struct vcpu_vmx *vmx = to_vmx(vcpu);
 4875
 4876	if (unlikely(vmx->rmode.vm86_active))
 4877		return 0;
 4878	else {
 4879		int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
 4880		return VMX_AR_DPL(ar);
 4881	}
 4882}
 4883
 4884static u32 vmx_segment_access_rights(struct kvm_segment *var)
 4885{
 4886	u32 ar;
 4887
 4888	if (var->unusable || !var->present)
 4889		ar = 1 << 16;
 4890	else {
 4891		ar = var->type & 15;
 4892		ar |= (var->s & 1) << 4;
 4893		ar |= (var->dpl & 3) << 5;
 4894		ar |= (var->present & 1) << 7;
 4895		ar |= (var->avl & 1) << 12;
 4896		ar |= (var->l & 1) << 13;
 4897		ar |= (var->db & 1) << 14;
 4898		ar |= (var->g & 1) << 15;
 4899	}
 4900
 4901	return ar;
 4902}
 4903
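/*
 * Illustration of the AR packing above: a standalone sketch whose struct
 * only mirrors the kvm_segment bits used here.  A flat, present, DPL-3,
 * type-3 real-mode data segment packs to 0xf3, which is exactly the value
 * rmode_segment_valid() checks for below.
 */
#include <stdint.h>
#include <stdio.h>

struct seg {
	unsigned type:4, s:1, dpl:2, present:1;
	unsigned avl:1, l:1, db:1, g:1, unusable:1;
};

static uint32_t pack_ar(const struct seg *v)
{
	if (v->unusable || !v->present)
		return 1u << 16;
	return v->type | (v->s << 4) | (v->dpl << 5) | (v->present << 7) |
	       (v->avl << 12) | (v->l << 13) | (v->db << 14) | (v->g << 15);
}

int main(void)
{
	struct seg rm_data = { .type = 3, .s = 1, .dpl = 3, .present = 1 };

	printf("ar = 0x%x\n", pack_ar(&rm_data)); /* 0xf3 */
	return 0;
}
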
 4904static void vmx_set_segment(struct kvm_vcpu *vcpu,
 4905			    struct kvm_segment *var, int seg)
 4906{
 4907	struct vcpu_vmx *vmx = to_vmx(vcpu);
 4908	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
 4909
 4910	vmx_segment_cache_clear(vmx);
 4911
 4912	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
 4913		vmx->rmode.segs[seg] = *var;
 4914		if (seg == VCPU_SREG_TR)
 4915			vmcs_write16(sf->selector, var->selector);
 4916		else if (var->s)
 4917			fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
 4918		goto out;
 4919	}
 4920
 4921	vmcs_writel(sf->base, var->base);
 4922	vmcs_write32(sf->limit, var->limit);
 4923	vmcs_write16(sf->selector, var->selector);
 4924
 4925	/*
  4926	 *   Fix the "Accessed" bit in the AR field of segment registers for
  4927	 * older qemu binaries.
  4928	 *   The IA-32 architecture specifies that at processor reset the
  4929	 * "Accessed" bit in the AR field of segment registers is 1, but qemu
  4930	 * sets it to 0 in its userland code. This causes an invalid-guest-state
  4931	 * vmexit when "unrestricted guest" mode is turned on.
  4932	 *   A fix for this setup issue in cpu_reset is being pushed into the
  4933	 * qemu tree. Newer qemu binaries with that fix will not need this
  4934	 * kvm hack.
 4935	 */
 4936	if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
 4937		var->type |= 0x1; /* Accessed */
 4938
 4939	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
 4940
 4941out:
 4942	vmx->emulation_required = emulation_required(vcpu);
 4943}
 4944
 4945static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 4946{
 4947	u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
 4948
 4949	*db = (ar >> 14) & 1;
 4950	*l = (ar >> 13) & 1;
 4951}
 4952
 4953static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 4954{
 4955	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
 4956	dt->address = vmcs_readl(GUEST_IDTR_BASE);
 4957}
 4958
 4959static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 4960{
 4961	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
 4962	vmcs_writel(GUEST_IDTR_BASE, dt->address);
 4963}
 4964
 4965static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 4966{
 4967	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
 4968	dt->address = vmcs_readl(GUEST_GDTR_BASE);
 4969}
 4970
 4971static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 4972{
 4973	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
 4974	vmcs_writel(GUEST_GDTR_BASE, dt->address);
 4975}
 4976
 4977static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
 4978{
 4979	struct kvm_segment var;
 4980	u32 ar;
 4981
 4982	vmx_get_segment(vcpu, &var, seg);
 4983	var.dpl = 0x3;
 4984	if (seg == VCPU_SREG_CS)
 4985		var.type = 0x3;
 4986	ar = vmx_segment_access_rights(&var);
 4987
 4988	if (var.base != (var.selector << 4))
 4989		return false;
 4990	if (var.limit != 0xffff)
 4991		return false;
 4992	if (ar != 0xf3)
 4993		return false;
 4994
 4995	return true;
 4996}
 4997
 4998static bool code_segment_valid(struct kvm_vcpu *vcpu)
 4999{
 5000	struct kvm_segment cs;
 5001	unsigned int cs_rpl;
 5002
 5003	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
 5004	cs_rpl = cs.selector & SEGMENT_RPL_MASK;
 5005
 5006	if (cs.unusable)
 5007		return false;
 5008	if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
 5009		return false;
 5010	if (!cs.s)
 5011		return false;
 5012	if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
 5013		if (cs.dpl > cs_rpl)
 5014			return false;
 5015	} else {
 5016		if (cs.dpl != cs_rpl)
 5017			return false;
 5018	}
 5019	if (!cs.present)
 5020		return false;
 5021
 5022	/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
 5023	return true;
 5024}
 5025
 5026static bool stack_segment_valid(struct kvm_vcpu *vcpu)
 5027{
 5028	struct kvm_segment ss;
 5029	unsigned int ss_rpl;
 5030
 5031	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
 5032	ss_rpl = ss.selector & SEGMENT_RPL_MASK;
 5033
 5034	if (ss.unusable)
 5035		return true;
 5036	if (ss.type != 3 && ss.type != 7)
 5037		return false;
 5038	if (!ss.s)
 5039		return false;
 5040	if (ss.dpl != ss_rpl) /* DPL != RPL */
 5041		return false;
 5042	if (!ss.present)
 5043		return false;
 5044
 5045	return true;
 5046}
 5047
 5048static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
 5049{
 5050	struct kvm_segment var;
 5051	unsigned int rpl;
 5052
 5053	vmx_get_segment(vcpu, &var, seg);
 5054	rpl = var.selector & SEGMENT_RPL_MASK;
 5055
 5056	if (var.unusable)
 5057		return true;
 5058	if (!var.s)
 5059		return false;
 5060	if (!var.present)
 5061		return false;
 5062	if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
 5063		if (var.dpl < rpl) /* DPL < RPL */
 5064			return false;
 5065	}
 5066
 5067	/* TODO: Add other members to kvm_segment_field to allow checking for other access
 5068	 * rights flags
 5069	 */
 5070	return true;
 5071}
 5072
 5073static bool tr_valid(struct kvm_vcpu *vcpu)
 5074{
 5075	struct kvm_segment tr;
 5076
 5077	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
 5078
 5079	if (tr.unusable)
 5080		return false;
 5081	if (tr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
 5082		return false;
 5083	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
 5084		return false;
 5085	if (!tr.present)
 5086		return false;
 5087
 5088	return true;
 5089}
 5090
 5091static bool ldtr_valid(struct kvm_vcpu *vcpu)
 5092{
 5093	struct kvm_segment ldtr;
 5094
 5095	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
 5096
 5097	if (ldtr.unusable)
 5098		return true;
 5099	if (ldtr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
 5100		return false;
 5101	if (ldtr.type != 2)
 5102		return false;
 5103	if (!ldtr.present)
 5104		return false;
 5105
 5106	return true;
 5107}
 5108
 5109static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
 5110{
 5111	struct kvm_segment cs, ss;
 5112
 5113	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
 5114	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
 5115
 5116	return ((cs.selector & SEGMENT_RPL_MASK) ==
 5117		 (ss.selector & SEGMENT_RPL_MASK));
 5118}
 5119
 5120/*
 5121 * Check if guest state is valid. Returns true if valid, false if
 5122 * not.
 5123 * We assume that registers are always usable
 5124 */
 5125static bool guest_state_valid(struct kvm_vcpu *vcpu)
 5126{
 5127	if (enable_unrestricted_guest)
 5128		return true;
 5129
 5130	/* real mode guest state checks */
 5131	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
 5132		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
 5133			return false;
 5134		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
 5135			return false;
 5136		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
 5137			return false;
 5138		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
 5139			return false;
 5140		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
 5141			return false;
 5142		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
 5143			return false;
 5144	} else {
 5145	/* protected mode guest state checks */
 5146		if (!cs_ss_rpl_check(vcpu))
 5147			return false;
 5148		if (!code_segment_valid(vcpu))
 5149			return false;
 5150		if (!stack_segment_valid(vcpu))
 5151			return false;
 5152		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
 5153			return false;
 5154		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
 5155			return false;
 5156		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
 5157			return false;
 5158		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
 5159			return false;
 5160		if (!tr_valid(vcpu))
 5161			return false;
 5162		if (!ldtr_valid(vcpu))
 5163			return false;
 5164	}
 5165	/* TODO:
 5166	 * - Add checks on RIP
 5167	 * - Add checks on RFLAGS
 5168	 */
 5169
 5170	return true;
 5171}
 5172
 5173static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
 5174{
 5175	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
 5176}
 5177
 5178static int init_rmode_tss(struct kvm *kvm)
 5179{
 5180	gfn_t fn;
 5181	u16 data = 0;
 5182	int idx, r;
 5183
 5184	idx = srcu_read_lock(&kvm->srcu);
 5185	fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
 5186	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
 5187	if (r < 0)
 5188		goto out;
 5189	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
 5190	r = kvm_write_guest_page(kvm, fn++, &data,
 5191			TSS_IOPB_BASE_OFFSET, sizeof(u16));
 5192	if (r < 0)
 5193		goto out;
 5194	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
 5195	if (r < 0)
 5196		goto out;
 5197	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
 5198	if (r < 0)
 5199		goto out;
 5200	data = ~0;
 5201	r = kvm_write_guest_page(kvm, fn, &data,
 5202				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
 5203				 sizeof(u8));
 5204out:
 5205	srcu_read_unlock(&kvm->srcu, idx);
 5206	return r;
 5207}
 5208
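/*
 * Build a one-page, PSE-based page table that identity-maps the low 4GB
 * with 4MB pages.  It serves as the guest page table while an EPT guest
 * runs with paging disabled, so CR3 always points at something valid.
 */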
 5209static int init_rmode_identity_map(struct kvm *kvm)
 5210{
 5211	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
 5212	int i, idx, r = 0;
 5213	kvm_pfn_t identity_map_pfn;
 5214	u32 tmp;
 5215
 5216	/* Protect kvm_vmx->ept_identity_pagetable_done. */
 5217	mutex_lock(&kvm->slots_lock);
 5218
 5219	if (likely(kvm_vmx->ept_identity_pagetable_done))
 5220		goto out2;
 5221
 5222	if (!kvm_vmx->ept_identity_map_addr)
 5223		kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
 5224	identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
 5225
 5226	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
 5227				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
 5228	if (r < 0)
 5229		goto out2;
 5230
 5231	idx = srcu_read_lock(&kvm->srcu);
 5232	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
 5233	if (r < 0)
 5234		goto out;
 5235	/* Set up identity-mapping pagetable for EPT in real mode */
 5236	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
 5237		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
 5238			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
 5239		r = kvm_write_guest_page(kvm, identity_map_pfn,
 5240				&tmp, i * sizeof(tmp), sizeof(tmp));
 5241		if (r < 0)
 5242			goto out;
 5243	}
 5244	kvm_vmx->ept_identity_pagetable_done = true;
 5245
 5246out:
 5247	srcu_read_unlock(&kvm->srcu, idx);
 5248
 5249out2:
 5250	mutex_unlock(&kvm->slots_lock);
 5251	return r;
 5252}
 5253
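/*
 * Program a segment with selector 0, base 0, limit 0xffff and access
 * rights 0x93 (present, accessed, read/write data segment); CS
 * additionally gets the execute/code bit (0x08).
 */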
 5254static void seg_setup(int seg)
 5255{
 5256	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
 5257	unsigned int ar;
 5258
 5259	vmcs_write16(sf->selector, 0);
 5260	vmcs_writel(sf->base, 0);
 5261	vmcs_write32(sf->limit, 0xffff);
 5262	ar = 0x93;
 5263	if (seg == VCPU_SREG_CS)
 5264		ar |= 0x08; /* code segment */
 5265
 5266	vmcs_write32(sf->ar_bytes, ar);
 5267}
 5268
 5269static int alloc_apic_access_page(struct kvm *kvm)
 5270{
 5271	struct page *page;
 5272	int r = 0;
 5273
 5274	mutex_lock(&kvm->slots_lock);
 5275	if (kvm->arch.apic_access_page_done)
 5276		goto out;
 5277	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
 5278				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
 5279	if (r)
 5280		goto out;
 5281
 5282	page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 5283	if (is_error_page(page)) {
 5284		r = -EFAULT;
 5285		goto out;
 5286	}
 5287
 5288	/*
 5289	 * Do not pin the page in memory, so that memory hot-unplug
 5290	 * is able to migrate it.
 5291	 */
 5292	put_page(page);
 5293	kvm->arch.apic_access_page_done = true;
 5294out:
 5295	mutex_unlock(&kvm->slots_lock);
 5296	return r;
 5297}
 5298
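/*
 * VPID 0 doubles as "no VPID": it is returned when VPID support is
 * disabled or the bitmap is exhausted, and free_vpid() treats it as a
 * no-op.
 */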
 5299static int allocate_vpid(void)
 5300{
 5301	int vpid;
 5302
 5303	if (!enable_vpid)
 5304		return 0;
 5305	spin_lock(&vmx_vpid_lock);
 5306	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
 5307	if (vpid < VMX_NR_VPIDS)
 5308		__set_bit(vpid, vmx_vpid_bitmap);
 5309	else
 5310		vpid = 0;
 5311	spin_unlock(&vmx_vpid_lock);
 5312	return vpid;
 5313}
 5314
 5315static void free_vpid(int vpid)
 5316{
 5317	if (!enable_vpid || vpid == 0)
 5318		return;
 5319	spin_lock(&vmx_vpid_lock);
 5320	__clear_bit(vpid, vmx_vpid_bitmap);
 5321	spin_unlock(&vmx_vpid_lock);
 5322}
 5323
 5324static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
 5325							  u32 msr, int type)
 5326{
 5327	int f = sizeof(unsigned long);
 5328
 5329	if (!cpu_has_vmx_msr_bitmap())
 5330		return;
 5331
 5332	/*
 5333	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 5334	 * have the write-low and read-high bitmap offsets the wrong way round.
 5335	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 5336	 */
 5337	if (msr <= 0x1fff) {
 5338		if (type & MSR_TYPE_R)
 5339			/* read-low */
 5340			__clear_bit(msr, msr_bitmap + 0x000 / f);
 5341
 5342		if (type & MSR_TYPE_W)
 5343			/* write-low */
 5344			__clear_bit(msr, msr_bitmap + 0x800 / f);
 5345
 5346	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 5347		msr &= 0x1fff;
 5348		if (type & MSR_TYPE_R)
 5349			/* read-high */
 5350			__clear_bit(msr, msr_bitmap + 0x400 / f);
 5351
 5352		if (type & MSR_TYPE_W)
 5353			/* write-high */
 5354			__clear_bit(msr, msr_bitmap + 0xc00 / f);
 5355
 5356	}
 5357}
 5358
 5359static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
 5360							 u32 msr, int type)
 5361{
 5362	int f = sizeof(unsigned long);
 5363
 5364	if (!cpu_has_vmx_msr_bitmap())
 5365		return;
 5366
 5367	/*
 5368	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 5369	 * have the write-low and read-high bitmap offsets the wrong way round.
 5370	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 5371	 */
 5372	if (msr <= 0x1fff) {
 5373		if (type & MSR_TYPE_R)
 5374			/* read-low */
 5375			__set_bit(msr, msr_bitmap + 0x000 / f);
 5376
 5377		if (type & MSR_TYPE_W)
 5378			/* write-low */
 5379			__set_bit(msr, msr_bitmap + 0x800 / f);
 5380
 5381	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 5382		msr &= 0x1fff;
 5383		if (type & MSR_TYPE_R)
 5384			/* read-high */
 5385			__set_bit(msr, msr_bitmap + 0x400 / f);
 5386
 5387		if (type & MSR_TYPE_W)
 5388			/* write-high */
 5389			__set_bit(msr, msr_bitmap + 0xc00 / f);
 5390
 5391	}
 5392}
 5393
 5394static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
 5395						      u32 msr, int type, bool value)
 5396{
 5397	if (value)
 5398		vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
 5399	else
 5400		vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
 5401}
 5402
 5403/*
 5404 * If an MSR is allowed by L0, check whether it is also allowed by L1.  The
 5405 * corresponding bit is cleared (MSR passed through) only if both allow it.
 5406 */
 5407static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 5408					       unsigned long *msr_bitmap_nested,
 5409					       u32 msr, int type)
 5410{
 5411	int f = sizeof(unsigned long);
 5412
 5413	/*
 5414	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 5415	 * have the write-low and read-high bitmap offsets the wrong way round.
 5416	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 5417	 */
 5418	if (msr <= 0x1fff) {
 5419		if (type & MSR_TYPE_R &&
 5420		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
 5421			/* read-low */
 5422			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);
 5423
 5424		if (type & MSR_TYPE_W &&
 5425		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
 5426			/* write-low */
 5427			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);
 5428
 5429	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 5430		msr &= 0x1fff;
 5431		if (type & MSR_TYPE_R &&
 5432		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
 5433			/* read-high */
 5434			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);
 5435
 5436		if (type & MSR_TYPE_W &&
 5437		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
 5438			/* write-high */
 5439			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
 5440
 5441	}
 5442}
 5443
 5444static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
 5445{
 5446	u8 mode = 0;
 5447
 5448	if (cpu_has_secondary_exec_ctrls() &&
 5449	    (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
 5450	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
 5451		mode |= MSR_BITMAP_MODE_X2APIC;
 5452		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
 5453			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
 5454	}
 5455
 5456	if (is_long_mode(vcpu))
 5457		mode |= MSR_BITMAP_MODE_LM;
 5458
 5459	return mode;
 5460}
 5461
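/*
 * x2APIC registers start at MSR APIC_BASE_MSR (0x800), one MSR per 16-byte
 * xAPIC MMIO register, hence the MMIO offset is shifted right by 4.
 */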
 5462#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
 5463
 5464static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
 5465					 u8 mode)
 5466{
 5467	int msr;
 5468
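	/*
	 * Default policy for the x2APIC MSR range 0x800-0x8ff: reads are
	 * passed through only when APICv is in use, writes are always
	 * intercepted; the individual exceptions are applied below.
	 */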
 5469	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
 5470		unsigned word = msr / BITS_PER_LONG;
 5471		msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
 5472		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
 5473	}
 5474
 5475	if (mode & MSR_BITMAP_MODE_X2APIC) {
 5476		/*
 5477		 * TPR reads and writes can be virtualized even if virtual interrupt
 5478		 * delivery is not in use.
 5479		 */
 5480		vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
 5481		if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
 5482			vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
 5483			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
 5484			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
 5485		}
 5486	}
 5487}
 5488
 5489static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
 5490{
 5491	struct vcpu_vmx *vmx = to_vmx(vcpu);
 5492	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
 5493	u8 mode = vmx_msr_bitmap_mode(vcpu);
 5494	u8 changed = mode ^ vmx->msr_bitmap_mode;
 5495
 5496	if (!changed)
 5497		return;
 5498
 5499	vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
 5500				  !(mode & MSR_BITMAP_MODE_LM));
 5501
 5502	if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
 5503		vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
 5504
 5505	vmx->msr_bitmap_mode = mode;
 5506}
 5507
 5508static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
 5509{
 5510	return enable_apicv;
 5511}
 5512
 5513static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
 5514{
 5515	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 5516	gfn_t gfn;
 5517
 5518	/*
 5519	 * Don't need to mark the APIC access page dirty; it is never
 5520	 * written to by the CPU during APIC virtualization.
 5521	 */
 5522
 5523	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
 5524		gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
 5525		kvm_vcpu_mark_page_dirty(vcpu, gfn);
 5526	}
 5527
 5528	if (nested_cpu_has_posted_intr(vmcs12)) {
 5529		gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
 5530		kvm_vcpu_mark_page_dirty(vcpu, gfn);
 5531	}
 5532}
 5533
 5534
 5535static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 5536{
 5537	struct vcpu_vmx *vmx = to_vmx(vcpu);
 5538	int max_irr;
 5539	void *vapic_page;
 5540	u16 status;
 5541
 5542	if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
 5543		return;
 5544
 5545	vmx->nested.pi_pending = false;
 5546	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
 5547		return;
 5548
 5549	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
 5550	if (max_irr != 256) {
 5551		vapic_page = kmap(vmx->nested.virtual_apic_page);
 5552		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
 5553			vapic_page, &max_irr);
 5554		kunmap(vmx->nested.virtual_apic_page);
 5555
 5556		status = vmcs_read16(GUEST_INTR_STATUS);
 5557		if ((u8)max_irr > ((u8)status & 0xff)) {
 5558			status &= ~0xff;
 5559			status |= (u8)max_irr;
 5560			vmcs_write16(GUEST_INTR_STATUS, status);
 5561		}
 5562	}
 5563
 5564	nested_mark_vmcs12_pages_dirty(vcpu);
 5565}
 5566
 5567static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
 5568						     bool nested)
 5569{
 5570#ifdef CONFIG_SMP
 5571	int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
 5572
 5573	if (vcpu->mode == IN_GUEST_MODE) {
 5574		/*
 5575		 * The vector of the interrupt to be delivered to the vcpu
 5576		 * has already been set in the PIR before this function runs.
 5577		 *
 5578		 * The following cases can be reached in this block, and we
 5579		 * always send a notification event, as explained below.
 5580		 *
 5581		 * Case 1: the vcpu stays in non-root mode.  Sending a
 5582		 * notification event posts the interrupt to the vcpu.
 5583		 *
 5584		 * Case 2: the vcpu exits to root mode and is still
 5585		 * runnable.  The PIR will be synced to the vIRR before
 5586		 * the next vcpu entry.  Sending a notification event in
 5587		 * this case has no effect, because the posted-interrupt
 5588		 * notification vector is only recognized while the vcpu
 5589		 * is running in non-root mode.
 5590		 *
 5591		 * Case 3: the vcpu exits to root mode and is blocked.
 5592		 * vcpu_block() has already synced the PIR to the vIRR and
 5593		 * never blocks the vcpu if the vIRR is not clear.
 5594		 * Therefore, a blocked vcpu here is not waiting on any
 5595		 * interrupt requested in the PIR, and sending a
 5596		 * notification event, which has no effect, is safe here.
 5597		 */
 5598
 5599		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
 5600		return true;
 5601	}
 5602#endif
 5603	return false;
 5604}
 5605
 5606static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 5607						int vector)
 5608{
 5609	struct vcpu_vmx *vmx = to_vmx(vcpu);
 5610
 5611	if (is_guest_mode(vcpu) &&
 5612	    vector == vmx->nested.posted_intr_nv) {
 5613		/*
 5614		 * If the posted interrupt is not recognized by the hardware,
 5615		 * it will be delivered on the next vmentry.
 5616		 */
 5617		vmx->nested.pi_pending = true;
 5618		kvm_make_request(KVM_REQ_EVENT, vcpu);
 5619		/* The PIR and the ON bit have been set by L1. */
 5620		if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
 5621			kvm_vcpu_kick(vcpu);
 5622		return 0;
 5623	}
 5624	return -1;
 5625}
 5626/*
 5627 * Send an interrupt to the vcpu via the posted-interrupt mechanism:
 5628 * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
 5629 * notification and the hardware will sync the PIR to the vIRR atomically.
 5630 * 2. If the target vcpu isn't running (root mode), kick it so that it picks
 5631 * up the interrupt from the PIR on the next vmentry.
 5632 */
 5633static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 5634{
 5635	struct vcpu_vmx *vmx = to_vmx(vcpu);
 5636	int r;
 5637
 5638	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
 5639	if (!r)
 5640		return;
 5641
 5642	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
 5643		return;
 5644
 5645	/* If a previous notification has sent the IPI, nothing to do.  */
 5646	if (pi_test_and_set_on(&vmx->pi_desc))
 5647		return;
 5648
 5649	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
 5650		kvm_vcpu_kick(vcpu);
 5651}
 5652
 5653/*
 5654 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
 5655 * will not change in the lifetime of the guest.
 5656 * Note that host-state that does change is set elsewhere. E.g., host-state
 5657 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
 5658 */
 5659static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 5660{
 5661	u32 low32, high32;
 5662	unsigned long tmpl;
 5663	struct desc_ptr dt;
 5664	unsigned long cr0, cr3, cr4;
 5665
 5666	cr0 = read_cr0();
 5667	WARN_ON(cr0 & X86_CR0_TS);
 5668	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
 5669
 5670	/*
 5671	 * Save the most likely value for this task's CR3 in the VMCS.
 5672	 * We can't use __get_current_cr3_fast() because we're not atomic.
 5673	 */
 5674	cr3 = __read_cr3();
 5675	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
 5676	vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
 5677
 5678	/* Save the most likely value for this task's CR4 in the VMCS. */
 5679	cr4 = cr4_read_shadow();
 5680	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
 5681	vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
 5682
 5683	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
 5684#ifdef CONFIG_X86_64
 5685	/*
 5686	 * Load null selectors, so we can avoid reloading them in
 5687	 * __vmx_load_host_state(), in case userspace uses the null selectors
 5688	 * too (the expected case).
 5689	 */
 5690	vmcs_write16(HOST_DS_SELECTOR, 0);
 5691	vmcs_write16(HOST_ES_SELECTOR, 0);
 5692#else
 5693	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 5694	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 5695#endif
 5696	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 5697	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 5698
 5699	store_idt(&dt);
 5700	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
 5701	vmx->host_idt_base = dt.address;
 5702
 5703	vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
 5704
 5705	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
 5706	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
 5707	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
 5708	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
 5709
 5710	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
 5711		rdmsr(MSR_IA32_CR_PAT, low32, high32);
 5712		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
 5713	}
 5714}
 5715
 5716static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
 5717{
 5718	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
 5719	if (enable_ept)
 5720		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
 5721	if (is_guest_mode(&vmx->vcpu))
 5722		vmx->vcpu.arch.cr4_guest_owned_bits &=
 5723			~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
 5724	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 5725}
 5726
 5727static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 5728{
 5729	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
 5730
 5731	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
 5732		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
 5733
 5734	if (!enable_vnmi)
 5735		pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
 5736
 5737	/* Enable the preemption timer dynamically */
 5738	pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
 5739	return pin_based_exec_ctrl;
 5740}
 5741
 5742static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 5743{
 5744	struct vcpu_vmx *vmx = to_vmx(vcpu);
 5745
 5746	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
 5747	if (cpu_has_secondary_exec_ctrls()) {
 5748		if (kvm_vcpu_apicv_active(vcpu))
 5749			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
 5750				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
 5751				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 5752		else
 5753			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
 5754					SECONDARY_EXEC_APIC_REGISTER_VIRT |
 5755					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 5756	}
 5757
 5758	if (cpu_has_vmx_msr_bitmap())
 5759		vmx_update_msr_bitmap(vcpu);
 5760}
 5761
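/*
 * Primary processor-based VM-execution controls.  Start from the global
 * vmcs_config and adjust for this vcpu: no MOV-DR exits while the guest
 * owns the debug registers, CR8 exits instead of a TPR shadow when the
 * latter is not needed, CR3/INVLPG exits only without EPT, and no
 * MWAIT/MONITOR or HLT exits when those are allowed to run in the guest.
 */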
 5762static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 5763{
 5764	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
 5765
 5766	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
 5767		exec_control &= ~CPU_BASED_MOV_DR_EXITING;
 5768
 5769	if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
 5770		exec_control &= ~CPU_BASED_TPR_SHADOW;
 5771#ifdef CONFIG_X86_64
 5772		exec_control |= CPU_BASED_CR8_STORE_EXITING |
 5773				CPU_BASED_CR8_LOAD_EXITING;
 5774#endif
 5775	}
 5776	if (!enable_ept)
 5777		exec_control |= CPU_BASED_CR3_STORE_EXITING |
 5778				CPU_BASED_CR3_LOAD_EXITING  |
 5779				CPU_BASED_INVLPG_EXITING;
 5780	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
 5781		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
 5782				CPU_BASED_MONITOR_EXITING);
 5783	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
 5784		exec_control &= ~CPU_BASED_HLT_EXITING;
 5785	return exec_control;
 5786}
 5787
 5788static bool vmx_rdrand_supported(void)
 5789{
 5790	return vmcs_config.cpu_based_2nd_exec_ctrl &
 5791		SECONDARY_EXEC_RDRAND_EXITING;
 5792}
 5793
 5794static bool vmx_rdseed_supported(void)
 5795{
 5796	return vmcs_config.cpu_based_2nd_exec_ctrl &
 5797		SECONDARY_EXEC_RDSEED_EXITING;
 5798}
 5799
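/*
 * Compute the secondary execution controls for this vcpu: start from the
 * global vmcs_config, strip features that are disabled by module parameters
 * or hidden from the guest's CPUID, and, when nested is enabled, mirror the
 * per-feature decisions (XSAVES, RDTSCP, INVPCID, RDRAND, RDSEED) into the
 * VMX capability MSRs exposed to L1.
 */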
 5800static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 5801{
 5802	struct kvm_vcpu *vcpu = &vmx->vcpu;
 5803
 5804	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
 5805
 5806	if (!cpu_need_virtualize_apic_accesses(vcpu))
 5807		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 5808	if (vmx->vpid == 0)
 5809		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
 5810	if (!enable_ept) {
 5811		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
 5812		enable_unrestricted_guest = 0;
 5813		/* Enabling INVPCID for non-EPT guests may cause a performance regression. */
 5814		exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
 5815	}
 5816	if (!enable_unrestricted_guest)
 5817		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 5818	if (kvm_pause_in_guest(vmx->vcpu.kvm))
 5819		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 5820	if (!kvm_vcpu_apicv_active(vcpu))
 5821		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
 5822				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 5823	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 5824
 5825	/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
 5826	 * in vmx_set_cr4.  */
 5827	exec_control &= ~SECONDARY_EXEC_DESC;
 5828
 5829	/*
 5830	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
 5831	 * (handle_vmptrld).  We cannot enable shadow_vmcs here because we
 5832	 * don't yet have a current VMCS12.
 5833	 */
 5834	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 5835
 5836	if (!enable_pml)
 5837		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 5838
 5839	if (vmx_xsaves_supported()) {
 5840		/* Exposing XSAVES only when XSAVE is exposed */
 5841		bool xsaves_enabled =
 5842			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
 5843			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
 5844
 5845		if (!xsaves_enabled)
 5846			exec_control &= ~SECONDARY_EXEC_XSAVES;
 5847
 5848		if (nested) {
 5849			if (xsaves_enabled)
 5850				vmx->nested.msrs.secondary_ctls_high |=
 5851					SECONDARY_EXEC_XSAVES;
 5852			else
 5853				vmx->nested.msrs.secondary_ctls_high &=
 5854					~SECONDARY_EXEC_XSAVES;
 5855		}
 5856	}
 5857
 5858	if (vmx_rdtscp_supported()) {
 5859		bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
 5860		if (!rdtscp_enabled)
 5861			exec_control &= ~SECONDARY_EXEC_RDTSCP;
 5862
 5863		if (nested) {
 5864			if (rdtscp_enabled)
 5865				vmx->nested.msrs.secondary_ctls_high |=
 5866					SECONDARY_EXEC_RDTSCP;
 5867			else
 5868				vmx->nested.msrs.secondary_ctls_high &=
 5869					~SECONDARY_EXEC_RDTSCP;
 5870		}
 5871	}
 5872
 5873	if (vmx_invpcid_supported()) {
 5874		/* Exposing INVPCID only when PCID is exposed */
 5875		bool invpcid_enabled =
 5876			guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
 5877			guest_cpuid_has(vcpu, X86_FEATURE_PCID);
 5878
 5879		if (!invpcid_enabled) {
 5880			exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
 5881			guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
 5882		}
 5883
 5884		if (nested) {
 5885			if (invpcid_enabled)
 5886				vmx->nested.msrs.secondary_ctls_high |=
 5887					SECONDARY_EXEC_ENABLE_INVPCID;
 5888			else
 5889				vmx->nested.msrs.secondary_ctls_high &=
 5890					~SECONDARY_EXEC_ENABLE_INVPCID;
 5891		}
 5892	}
 5893
 5894	if (vmx_rdrand_supported()) {
 5895		bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
 5896		if (rdrand_enabled)
 5897			exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
 5898
 5899		if (nested) {
 5900			if (rdrand_enabled)
 5901				vmx->nested.msrs.secondary_ctls_high |=
 5902					SECONDARY_EXEC_RDRAND_EXITING;
 5903			else
 5904				vmx->nested.msrs.secondary_ctls_high &=
 5905					~SECONDARY_EXEC_RDRAND_EXITING;
 5906		}
 5907	}
 5908
 5909	if (vmx_rdseed_supported()) {
 5910		bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
 5911		if (rdseed_enabled)
 5912			exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
 5913
 5914		if (nested) {
 5915			if (rdseed_enabled)
 5916				vmx->nested.msrs.secondary_ctls_high |=
 5917					SECONDARY_EXEC_RDSEED_EXITING;
 5918			else
 5919				vmx->nested.msrs.secondary_ctls_high &=
 5920					~SECONDARY_EXEC_RDSEED_EXITING;
 5921		}
 5922	}
 5923
 5924	vmx->secondary_exec_control = exec_control;
 5925}
 5926
 5927static void ept_set_mmio_spte_mask(void)
 5928{
 5929	/*
 5930	 * EPT Misconfigurations can be generated if the value of bits 2:0
 5931	 * of an EPT paging-structure entry is 110b (write/execute).
 5932	 */
 5933	kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
 5934				   VMX_EPT_MISCONFIG_WX_VALUE);
 5935}
 5936
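/*
 * With a zero XSS-exiting bitmap, XSAVES/XRSTORS executed by the guest never
 * cause VM exits, which is why handle_xsaves()/handle_xrstors() only WARN.
 */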
 5937#define VMX_XSS_EXIT_BITMAP 0
 5938/*
 5939 * Sets up the vmcs for emulated real mode.
 5940 */
 5941static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 5942{
 5943#ifdef CONFIG_X86_64
 5944	unsigned long a;
 5945#endif
 5946	int i;
 5947
 5948	if (enable_shadow_vmcs) {
 5949		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
 5950		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
 5951	}
 5952	if (cpu_has_vmx_msr_bitmap())
 5953		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
 5954
 5955	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 5956
 5957	/* Control */
 5958	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
 5959	vmx->hv_deadline_tsc = -1;
 5960
 5961	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 5962
 5963	if (cpu_has_secondary_exec_ctrls()) {
 5964		vmx_compute_secondary_exec_control(vmx);
 5965		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
 5966			     vmx->secondary_exec_control);
 5967	}
 5968
 5969	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
 5970		vmcs_write64(EOI_EXIT_BITMAP0, 0);
 5971		vmcs_write64(EOI_EXIT_BITMAP1, 0);
 5972		vmcs_write64(EOI_EXIT_BITMAP2, 0);
 5973		vmcs_write64(EOI_EXIT_BITMAP3, 0);
 5974
 5975		vmcs_write16(GUEST_INTR_STATUS, 0);
 5976
 5977		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
 5978		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
 5979	}
 5980
 5981	if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
 5982		vmcs_write32(PLE_GAP, ple_gap);
 5983		vmx->ple_window = ple_window;
 5984		vmx->ple_window_dirty = true;
 5985	}
 5986
 5987	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
 5988	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
 5989	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 5990
 5991	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
 5992	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
 5993	vmx_set_constant_host_state(vmx);
 5994#ifdef CONFIG_X86_64
 5995	rdmsrl(MSR_FS_BASE, a);
 5996	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
 5997	rdmsrl(MSR_GS_BASE, a);
 5998	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
 5999#else
 6000	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
 6001	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
 6002#endif
 6003
 6004	if (cpu_has_vmx_vmfunc())
 6005		vmcs_write64(VM_FUNCTION_CONTROL, 0);
 6006
 6007	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 6008	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
 6009	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 6010	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
 6011	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 6012
 6013	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
 6014		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 6015
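	/*
	 * Probe each candidate MSR with rdmsr_safe()/wrmsr_safe() and add
	 * only the ones the host actually implements to the guest_msrs[]
	 * switch list.  Note that .index stores the position within
	 * vmx_msr_index[], not the MSR number itself.
	 */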
 6016	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
 6017		u32 index = vmx_msr_index[i];
 6018		u32 data_low, data_high;
 6019		int j = vmx->nmsrs;
 6020
 6021		if (rdmsr_safe(index, &data_low, &data_high) < 0)
 6022			continue;
 6023		if (wrmsr_safe(index, data_low, data_high) < 0)
 6024			continue;
 6025		vmx->guest_msrs[j].index = i;
 6026		vmx->guest_msrs[j].data = 0;
 6027		vmx->guest_msrs[j].mask = -1ull;
 6028		++vmx->nmsrs;
 6029	}
 6030
 6031	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
 6032		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
 6033
 6034	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
 6035
 6036	/* 22.2.1, 20.8.1 */
 6037	vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
 6038
 6039	vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
 6040	vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
 6041
 6042	set_cr4_guest_host_mask(vmx);
 6043
 6044	if (vmx_xsaves_supported())
 6045		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
 6046
 6047	if (enable_pml) {
 6048		ASSERT(vmx->pml_pg);
 6049		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 6050		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 6051	}
 6052}
 6053
 6054static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 6055{
 6056	struct vcpu_vmx *vmx = to_vmx(vcpu);
 6057	struct msr_data apic_base_msr;
 6058	u64 cr0;
 6059
 6060	vmx->rmode.vm86_active = 0;
 6061	vmx->spec_ctrl = 0;
 6062
 6063	vcpu->arch.microcode_version = 0x100000000ULL;
 6064	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 6065	kvm_set_cr8(vcpu, 0);
 6066
 6067	if (!init_event) {
 6068		apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
 6069				     MSR_IA32_APICBASE_ENABLE;
 6070		if (kvm_vcpu_is_reset_bsp(vcpu))
 6071			apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
 6072		apic_base_msr.host_initiated = true;
 6073		kvm_set_apic_base(vcpu, &apic_base_msr);
 6074	}
 6075
 6076	vmx_segment_cache_clear(vmx);
 6077
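	/*
	 * Architectural reset state: CS selector 0xf000 with base 0xffff0000,
	 * combined with RIP 0xfff0 (written below), makes the first fetch hit
	 * the reset vector at physical address 0xfffffff0.
	 */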
 6078	seg_setup(VCPU_SREG_CS);
 6079	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
 6080	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
 6081
 6082	seg_setup(VCPU_SREG_DS);
 6083	seg_setup(VCPU_SREG_ES);
 6084	seg_setup(VCPU_SREG_FS);
 6085	seg_setup(VCPU_SREG_GS);
 6086	seg_setup(VCPU_SREG_SS);
 6087
 6088	vmcs_write16(GUEST_TR_SELECTOR, 0);
 6089	vmcs_writel(GUEST_TR_BASE, 0);
 6090	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
 6091	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 6092
 6093	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
 6094	vmcs_writel(GUEST_LDTR_BASE, 0);
 6095	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
 6096	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
 6097
 6098	if (!init_event) {
 6099		vmcs_write32(GUEST_SYSENTER_CS, 0);
 6100		vmcs_writel(GUEST_SYSENTER_ESP, 0);
 6101		vmcs_writel(GUEST_SYSENTER_EIP, 0);
 6102		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 6103	}
 6104
 6105	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
 6106	kvm_rip_write(vcpu, 0xfff0);
 6107
 6108	vmcs_writel(GUEST_GDTR_BASE, 0);
 6109	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
 6110
 6111	vmcs_writel(GUEST_IDTR_BASE, 0);
 6112	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
 6113
 6114	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 6115	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
 6116	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 6117	if (kvm_mpx_supported())
 6118		vmcs_write64(GUEST_BNDCFGS, 0);
 6119
 6120	setup_msrs(vmx);
 6121
 6122	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
 6123
 6124	if (cpu_has_vmx_tpr_shadow() && !init_event) {
 6125		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
 6126		if (cpu_need_tpr_shadow(vcpu))
 6127			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
 6128				     __pa(vcpu->arch.apic->regs));
 6129		vmcs_write32(TPR_THRESHOLD, 0);
 6130	}
 6131
 6132	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 6133
 6134	if (vmx->vpid != 0)
 6135		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 6136
 6137	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
 6138	vmx->vcpu.arch.cr0 = cr0;
 6139	vmx_set_cr0(vcpu, cr0); /* enter rmode */
 6140	vmx_set_cr4(vcpu, 0);
 6141	vmx_set_efer(vcpu, 0);
 6142
 6143	update_exception_bitmap(vcpu);
 6144
 6145	vpid_sync_context(vmx->vpid);
 6146	if (init_event)
 6147		vmx_clear_hlt(vcpu);
 6148}
 6149
 6150/*
 6151 * In nested virtualization, check if L1 asked to exit on external interrupts.
 6152 * For most existing hypervisors, this will always return true.
 6153 */
 6154static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
 6155{
 6156	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
 6157		PIN_BASED_EXT_INTR_MASK;
 6158}
 6159
 6160/*
 6161 * In nested virtualization, check if L1 has set
 6162 * VM_EXIT_ACK_INTR_ON_EXIT
 6163 */
 6164static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
 6165{
 6166	return get_vmcs12(vcpu)->vm_exit_controls &
 6167		VM_EXIT_ACK_INTR_ON_EXIT;
 6168}
 6169
 6170static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
 6171{
 6172	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
 6173}
 6174
 6175static void enable_irq_window(struct kvm_vcpu *vcpu)
 6176{
 6177	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
 6178		      CPU_BASED_VIRTUAL_INTR_PENDING);
 6179}
 6180
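/*
 * When virtual NMIs are unavailable, or the guest is in an STI shadow, fall
 * back to requesting an interrupt window; otherwise request an NMI-window
 * exit.
 */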
 6181static void enable_nmi_window(struct kvm_vcpu *vcpu)
 6182{
 6183	if (!enable_vnmi ||
 6184	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
 6185		enable_irq_window(vcpu);
 6186		return;
 6187	}
 6188
 6189	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
 6190		      CPU_BASED_VIRTUAL_NMI_PENDING);
 6191}
 6192
 6193static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 6194{
 6195	struct vcpu_vmx *vmx = to_vmx(vcpu);
 6196	uint32_t intr;
 6197	int irq = vcpu->arch.interrupt.nr;
 6198
 6199	trace_kvm_inj_virq(irq);
 6200
 6201	++vcpu->stat.irq_injections;
 6202	if (vmx->rmode.vm86_active) {
 6203		int inc_eip = 0;
 6204		if (vcpu->arch.interrupt.soft)
 6205			inc_eip = vcpu->arch.event_exit_inst_len;
 6206		if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
 6207			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 6208		return;
 6209	}
 6210	intr = irq | INTR_INFO_VALID_MASK;
 6211	if (vcpu->arch.interrupt.soft) {
 6212		intr |= INTR_TYPE_SOFT_INTR;
 6213		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
 6214			     vmx->vcpu.arch.event_exit_inst_len);
 6215	} else
 6216		intr |= INTR_TYPE_EXT_INTR;
 6217	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
 6218
 6219	vmx_clear_hlt(vcpu);
 6220}
 6221
 6222static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 6223{
 6224	struct vcpu_vmx *vmx = to_vmx(vcpu);
 6225
 6226	if (!enable_vnmi) {
 6227		/*
 6228		 * Tracking the NMI-blocked state in software is built upon
 6229		 * finding the next open IRQ window. This, in turn, depends on
 6230		 * well-behaving guests: They have to keep IRQs disabled at
 6231		 * least as long as the NMI handler runs. Otherwise we may
 6232		 * cause NMI nesting, maybe breaking the guest. But as this is
 6233		 * highly unlikely, we can live with the residual risk.
 6234		 */
 6235		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
 6236		vmx->loaded_vmcs->vnmi_blocked_time = 0;
 6237	}
 6238
 6239	++vcpu->stat.nmi_injections;
 6240	vmx->loaded_vmcs->nmi_known_unmasked = false;
 6241
 6242	if (vmx->rmode.vm86_active) {
 6243		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
 6244			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 6245		return;
 6246	}
 6247
 6248	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 6249			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 6250
 6251	vmx_clear_hlt(vcpu);
 6252}
 6253
 6254static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
 6255{
 6256	struct vcpu_vmx *vmx = to_vmx(vcpu);
 6257	bool masked;
 6258
 6259	if (!enable_vnmi)
 6260		return vmx->loaded_vmcs->soft_vnmi_blocked;
 6261	if (vmx->loaded_vmcs->nmi_known_unmasked)
 6262		return false;
 6263	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
 6264	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
 6265	return masked;
 6266}
 6267
 6268static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 6269{
 6270	struct vcpu_vmx *vmx = to_vmx(vcpu);
 6271
 6272	if (!enable_vnmi) {
 6273		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
 6274			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
 6275			vmx->loaded_vmcs->vnmi_blocked_time = 0;
 6276		}
 6277	} else {
 6278		vmx->loaded_vmcs->nmi_known_unmasked = !masked;
 6279		if (masked)
 6280			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 6281				      GUEST_INTR_STATE_NMI);
 6282		else
 6283			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
 6284					GUEST_INTR_STATE_NMI);
 6285	}
 6286}
 6287
 6288static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 6289{
 6290	if (to_vmx(vcpu)->nested.nested_run_pending)
 6291		return 0;
 6292
 6293	if (!enable_vnmi &&
 6294	    to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
 6295		return 0;
 6296
 6297	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 6298		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
 6299		   | GUEST_INTR_STATE_NMI));
 6300}
 6301
 6302static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 6303{
 6304	return (!to_vmx(vcpu)->nested.nested_run_pending &&
 6305		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 6306		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 6307			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 6308}
 6309
 6310static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 6311{
 6312	int ret;
 6313
 6314	if (enable_unrestricted_guest)
 6315		return 0;
 6316
 6317	ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
 6318				    PAGE_SIZE * 3);
 6319	if (ret)
 6320		return ret;
 6321	to_kvm_vmx(kvm)->tss_addr = addr;
 6322	return init_rmode_tss(kvm);
 6323}
 6324
 6325static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
 6326{
 6327	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
 6328	return 0;
 6329}
 6330
 6331static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
 6332{
 6333	switch (vec) {
 6334	case BP_VECTOR:
 6335		/*
 6336		 * Update instruction length as we may reinject the exception
 6337		 * from user space while in guest debugging mode.
 6338		 */
 6339		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
 6340			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 6341		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
 6342			return false;
 6343		/* fall through */
 6344	case DB_VECTOR:
 6345		if (vcpu->guest_debug &
 6346			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
 6347			return false;
 6348		/* fall through */
 6349	case DE_VECTOR:
 6350	case OF_VECTOR:
 6351	case BR_VECTOR:
 6352	case UD_VECTOR:
 6353	case DF_VECTOR:
 6354	case SS_VECTOR:
 6355	case GP_VECTOR:
 6356	case MF_VECTOR:
 6357		return true;
 6358	break;
 6359	}
 6360	return false;
 6361}
 6362
 6363static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 6364				  int vec, u32 err_code)
 6365{
 6366	/*
 6367	 * An instruction with the address-size override prefix (opcode 0x67)
 6368	 * causes a #SS fault with error code 0 in VM86 mode.
 6369	 */
 6370	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
 6371		if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
 6372			if (vcpu->arch.halt_request) {
 6373				vcpu->arch.halt_request = 0;
 6374				return kvm_vcpu_halt(vcpu);
 6375			}
 6376			return 1;
 6377		}
 6378		return 0;
 6379	}
 6380
 6381	/*
 6382	 * Forward all other exceptions that are valid in real mode.
 6383	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
 6384	 *        the required debugging infrastructure rework.
 6385	 */
 6386	kvm_queue_exception(vcpu, vec);
 6387	return 1;
 6388}
 6389
 6390/*
 6391 * Trigger machine check on the host. We assume all the MSRs are already set up
 6392 * by the CPU and that we still run on the same CPU as the MCE occurred on.
 6393 * We pass a fake environment to the machine check handler because we want
 6394 * the guest to always be treated like user space, no matter what context
 6395 * it used internally.
 6396 */
 6397static void kvm_machine_check(void)
 6398{
 6399#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
 6400	struct pt_regs regs = {
 6401		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
 6402		.flags = X86_EFLAGS_IF,
 6403	};
 6404
 6405	do_machine_check(&regs, 0);
 6406#endif
 6407}
 6408
 6409static int handle_machine_check(struct kvm_vcpu *vcpu)
 6410{
 6411	/* already handled by vcpu_run */
 6412	return 1;
 6413}
 6414
 6415static int handle_exception(struct kvm_vcpu *vcpu)
 6416{
 6417	struct vcpu_vmx *vmx = to_vmx(vcpu);
 6418	struct kvm_run *kvm_run = vcpu->run;
 6419	u32 intr_info, ex_no, error_code;
 6420	unsigned long cr2, rip, dr6;
 6421	u32 vect_info;
 6422	enum emulation_result er;
 6423
 6424	vect_info = vmx->idt_vectoring_info;
 6425	intr_info = vmx->exit_intr_info;
 6426
 6427	if (is_machine_check(intr_info))
 6428		return handle_machine_check(vcpu);
 6429
 6430	if (is_nmi(intr_info))
 6431		return 1;  /* already handled by vmx_vcpu_run() */
 6432
 6433	if (is_invalid_opcode(intr_info))
 6434		return handle_ud(vcpu);
 6435
 6436	error_code = 0;
 6437	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
 6438		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 6439
 6440	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
 6441		WARN_ON_ONCE(!enable_vmware_backdoor);
 6442		er = emulate_instruction(vcpu,
 6443			EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
 6444		if (er == EMULATE_USER_EXIT)
 6445			return 0;
 6446		else if (er != EMULATE_DONE)
 6447			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 6448		return 1;
 6449	}
 6450
 6451	/*
 6452	 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
 6453	 * MMIO; in that case it is better to report an internal error.
 6454	 * See the comments in vmx_handle_exit.
 6455	 */
 6456	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
 6457	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
 6458		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 6459		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
 6460		vcpu->run->internal.ndata = 3;
 6461		vcpu->run->internal.data[0] = vect_info;
 6462		vcpu->run->internal.data[1] = intr_info;
 6463		vcpu->run->internal.data[2] = error_code;
 6464		return 0;
 6465	}
 6466
 6467	if (is_page_fault(intr_info)) {
 6468		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 6469		/* EPT won't cause page fault directly */
 6470		WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
 6471		return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
 6472	}
 6473
 6474	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
 6475
 6476	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
 6477		return handle_rmode_exception(vcpu, ex_no, error_code);
 6478
 6479	switch (ex_no) {
 6480	case AC_VECTOR:
 6481		kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
 6482		return 1;
 6483	case DB_VECTOR:
 6484		dr6 = vmcs_readl(EXIT_QUALIFICATION);
 6485		if (!(vcpu->guest_debug &
 6486		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
 6487			vcpu->arch.dr6 &= ~15;
 6488			vcpu->arch.dr6 |= dr6 | DR6_RTM;
 6489			if (is_icebp(intr_info))
 6490				skip_emulated_instruction(vcpu);
 6491
 6492			kvm_queue_exception(vcpu, DB_VECTOR);
 6493			return 1;
 6494		}
 6495		kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
 6496		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
 6497		/* fall through */
 6498	case BP_VECTOR:
 6499		/*
 6500		 * Update instruction length as we may reinject #BP from
 6501		 * user space while in guest debugging mode. Reading it for
 6502		 * #DB as well causes no harm; it is not used in that case.
 6503		 */
 6504		vmx->vcpu.arch.event_exit_inst_len =
 6505			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 6506		kvm_run->exit_reason = KVM_EXIT_DEBUG;
 6507		rip = kvm_rip_read(vcpu);
 6508		kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
 6509		kvm_run->debug.arch.exception = ex_no;
 6510		break;
 6511	default:
 6512		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
 6513		kvm_run->ex.exception = ex_no;
 6514		kvm_run->ex.error_code = error_code;
 6515		break;
 6516	}
 6517	return 0;
 6518}
 6519
 6520static int handle_external_interrupt(struct kvm_vcpu *vcpu)
 6521{
 6522	++vcpu->stat.irq_exits;
 6523	return 1;
 6524}
 6525
 6526static int handle_triple_fault(struct kvm_vcpu *vcpu)
 6527{
 6528	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
 6529	vcpu->mmio_needed = 0;
 6530	return 0;
 6531}
 6532
 6533static int handle_io(struct kvm_vcpu *vcpu)
 6534{
 6535	unsigned long exit_qualification;
 6536	int size, in, string;
 6537	unsigned port;
 6538
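	/*
	 * I/O-instruction exit qualification: bits 2:0 = access size - 1,
	 * bit 3 = IN, bit 4 = string instruction, bits 31:16 = port number.
	 */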
 6539	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 6540	string = (exit_qualification & 16) != 0;
 6541
 6542	++vcpu->stat.io_exits;
 6543
 6544	if (string)
 6545		return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 6546
 6547	port = exit_qualification >> 16;
 6548	size = (exit_qualification & 7) + 1;
 6549	in = (exit_qualification & 8) != 0;
 6550
 6551	return kvm_fast_pio(vcpu, size, port, in);
 6552}
 6553
 6554static void
 6555vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 6556{
 6557	/*
 6558	 * Patch in the VMCALL instruction:
 6559	 */
 6560	hypercall[0] = 0x0f;
 6561	hypercall[1] = 0x01;
 6562	hypercall[2] = 0xc1;
 6563}
 6564
 6565/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 6566static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 6567{
 6568	if (is_guest_mode(vcpu)) {
 6569		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 6570		unsigned long orig_val = val;
 6571
 6572		/*
 6573		 * We get here when L2 changed cr0 in a way that did not change
 6574		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
 6575		 * but did change L0 shadowed bits. So we first calculate the
 6576		 * effective cr0 value that L1 would like to write into the
 6577		 * hardware. It consists of the L2-owned bits from the new
 6578		 * value combined with the L1-owned bits from L1's guest_cr0.
 6579		 */
 6580		val = (val & ~vmcs12->cr0_guest_host_mask) |
 6581			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
 6582
 6583		if (!nested_guest_cr0_valid(vcpu, val))
 6584			return 1;
 6585
 6586		if (kvm_set_cr0(vcpu, val))
 6587			return 1;
 6588		vmcs_writel(CR0_READ_SHADOW, orig_val);
 6589		return 0;
 6590	} else {
 6591		if (to_vmx(vcpu)->nested.vmxon &&
 6592		    !nested_host_cr0_valid(vcpu, val))
 6593			return 1;
 6594
 6595		return kvm_set_cr0(vcpu, val);
 6596	}
 6597}
 6598
 6599static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
 6600{
 6601	if (is_guest_mode(vcpu)) {
 6602		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 6603		unsigned long orig_val = val;
 6604
 6605		/* analogously to handle_set_cr0 */
 6606		val = (val & ~vmcs12->cr4_guest_host_mask) |
 6607			(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
 6608		if (kvm_set_cr4(vcpu, val))
 6609			return 1;
 6610		vmcs_writel(CR4_READ_SHADOW, orig_val);
 6611		return 0;
 6612	} else
 6613		return kvm_set_cr4(vcpu, val);
 6614}
 6615
 6616static int handle_desc(struct kvm_vcpu *vcpu)
 6617{
 6618	WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
 6619	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 6620}
 6621
 6622static int handle_cr(struct kvm_vcpu *vcpu)
 6623{
 6624	unsigned long exit_qualification, val;
 6625	int cr;
 6626	int reg;
 6627	int err;
 6628	int ret;
 6629
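	/*
	 * CR-access exit qualification: bits 3:0 = CR number, bits 5:4 =
	 * access type (0 = MOV to CR, 1 = MOV from CR, 2 = CLTS, 3 = LMSW),
	 * bits 11:8 = GPR operand, bits 31:16 = LMSW source data.
	 */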
 6630	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 6631	cr = exit_qualification & 15;
 6632	reg = (exit_qualification >> 8) & 15;
 6633	switch ((exit_qualification >> 4) & 3) {
 6634	case 0: /* mov to cr */
 6635		val = kvm_register_readl(vcpu, reg);
 6636		trace_kvm_cr_write(cr, val);
 6637		switch (cr) {
 6638		case 0:
 6639			err = handle_set_cr0(vcpu, val);
 6640			return kvm_complete_insn_gp(vcpu, err);
 6641		case 3:
 6642			WARN_ON_ONCE(enable_unrestricted_guest);
 6643			err = kvm_set_cr3(vcpu, val);
 6644			return kvm_complete_insn_gp(vcpu, err);
 6645		case 4:
 6646			err = handle_set_cr4(vcpu, val);
 6647			return kvm_complete_insn_gp(vcpu, err);
 6648		case 8: {
 6649				u8 cr8_prev = kvm_get_cr8(vcpu);
 6650				u8 cr8 = (u8)val;
 6651				err = kvm_set_cr8(vcpu, cr8);
 6652				ret = kvm_complete_insn_gp(vcpu, err);
 6653				if (lapic_in_kernel(vcpu))
 6654					return ret;
 6655				if (cr8_prev <= cr8)
 6656					return ret;
 6657				/*
 6658				 * TODO: we might be squashing a
 6659				 * KVM_GUESTDBG_SINGLESTEP-triggered
 6660				 * KVM_EXIT_DEBUG here.
 6661				 */
 6662				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
 6663				return 0;
 6664			}
 6665		}
 6666		break;
 6667	case 2: /* clts */
 6668		WARN_ONCE(1, "Guest should always own CR0.TS");
 6669		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
 6670		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
 6671		return kvm_skip_emulated_instruction(vcpu);
 6672	case 1: /* mov from cr */
 6673		switch (cr) {
 6674		case 3:
 6675			WARN_ON_ONCE(enable_unrestricted_guest);
 6676			val = kvm_read_cr3(vcpu);
 6677			kvm_register_write(vcpu, reg, val);
 6678			trace_kvm_cr_read(cr, val);
 6679			return kvm_skip_emulated_instruction(vcpu);
 6680		case 8:
 6681			val = kvm_get_cr8(vcpu);
 6682			kvm_register_write(vcpu, reg, val);
 6683			trace_kvm_cr_read(cr, val);
 6684			return kvm_skip_emulated_instruction(vcpu);
 6685		}
 6686		break;
 6687	case 3: /* lmsw */
 6688		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
 6689		trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
 6690		kvm_lmsw(vcpu, val);
 6691
 6692		return kvm_skip_emulated_instruction(vcpu);
 6693	default:
 6694		break;
 6695	}
 6696	vcpu->run->exit_reason = 0;
 6697	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
 6698	       (int)(exit_qualification >> 4) & 3, cr);
 6699	return 0;
 6700}
 6701
 6702static int handle_dr(struct kvm_vcpu *vcpu)
 6703{
 6704	unsigned long exit_qualification;
 6705	int dr, dr7, reg;
 6706
 6707	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 6708	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
 6709
 6710	/* First, if DR does not exist, trigger UD */
 6711	if (!kvm_require_dr(vcpu, dr))
 6712		return 1;
 6713
 6714	/* Do not handle if CPL > 0; a #GP will be injected on re-entry */
 6715	if (!kvm_require_cpl(vcpu, 0))
 6716		return 1;
 6717	dr7 = vmcs_readl(GUEST_DR7);
 6718	if (dr7 & DR7_GD) {
 6719		/*
 6720		 * As the vm-exit takes precedence over the debug trap, we
 6721		 * need to emulate the latter, either for the host or the
 6722		 * guest debugging itself.
 6723		 */
 6724		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
 6725			vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
 6726			vcpu->run->debug.arch.dr7 = dr7;
 6727			vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
 6728			vcpu->run->debug.arch.exception = DB_VECTOR;
 6729			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
 6730			return 0;
 6731		} else {
 6732			vcpu->arch.dr6 &= ~15;
 6733			vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
 6734			kvm_queue_exception(vcpu, DB_VECTOR);
 6735			return 1;
 6736		}
 6737	}
 6738
 6739	if (vcpu->guest_debug == 0) {
 6740		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
 6741				CPU_BASED_MOV_DR_EXITING);
 6742
 6743		/*
 6744		 * No more DR vmexits; force a reload of the debug registers
 6745		 * and reenter on this instruction.  The next vmexit will
 6746		 * retrieve the full state of the debug registers.
 6747		 */
 6748		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
 6749		return 1;
 6750	}
 6751
 6752	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
 6753	if (exit_qualification & TYPE_MOV_FROM_DR) {
 6754		unsigned long val;
 6755
 6756		if (kvm_get_dr(vcpu, dr, &val))
 6757			return 1;
 6758		kvm_register_write(vcpu, reg, val);
 6759	} else
 6760		if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
 6761			return 1;
 6762
 6763	return kvm_skip_emulated_instruction(vcpu);
 6764}
 6765
 6766static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
 6767{
 6768	return vcpu->arch.dr6;
 6769}
 6770
 6771static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
 6772{
 6773}
 6774
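/*
 * Counterpart of handle_dr() dropping MOV-DR exits: read the live debug
 * registers back into vcpu->arch and re-enable MOV-DR exiting.
 */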
 6775static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 6776{
 6777	get_debugreg(vcpu->arch.db[0], 0);
 6778	get_debugreg(vcpu->arch.db[1], 1);
 6779	get_debugreg(vcpu->arch.db[2], 2);
 6780	get_debugreg(vcpu->arch.db[3], 3);
 6781	get_debugreg(vcpu->arch.dr6, 6);
 6782	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
 6783
 6784	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
 6785	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
 6786}
 6787
 6788static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
 6789{
 6790	vmcs_writel(GUEST_DR7, val);
 6791}
 6792
 6793static int handle_cpuid(struct kvm_vcpu *vcpu)
 6794{
 6795	return kvm_emulate_cpuid(vcpu);
 6796}
 6797
 6798static int handle_rdmsr(struct kvm_vcpu *vcpu)
 6799{
 6800	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
 6801	struct msr_data msr_info;
 6802
 6803	msr_info.index = ecx;
 6804	msr_info.host_initiated = false;
 6805	if (vmx_get_msr(vcpu, &msr_info)) {
 6806		trace_kvm_msr_read_ex(ecx);
 6807		kvm_inject_gp(vcpu, 0);
 6808		return 1;
 6809	}
 6810
 6811	trace_kvm_msr_read(ecx, msr_info.data);
 6812
 6813	/* FIXME: handling of bits 32:63 of rax, rdx */
 6814	vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
 6815	vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
 6816	return kvm_skip_emulated_instruction(vcpu);
 6817}
 6818
 6819static int handle_wrmsr(struct kvm_vcpu *vcpu)
 6820{
 6821	struct msr_data msr;
 6822	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
 6823	u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
 6824		| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 6825
 6826	msr.data = data;
 6827	msr.index = ecx;
 6828	msr.host_initiated = false;
 6829	if (kvm_set_msr(vcpu, &msr) != 0) {
 6830		trace_kvm_msr_write_ex(ecx, data);
 6831		kvm_inject_gp(vcpu, 0);
 6832		return 1;
 6833	}
 6834
 6835	trace_kvm_msr_write(ecx, data);
 6836	return kvm_skip_emulated_instruction(vcpu);
 6837}
 6838
 6839static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
 6840{
 6841	kvm_apic_update_ppr(vcpu);
 6842	return 1;
 6843}
 6844
 6845static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 6846{
 6847	vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
 6848			CPU_BASED_VIRTUAL_INTR_PENDING);
 6849
 6850	kvm_make_request(KVM_REQ_EVENT, vcpu);
 6851
 6852	++vcpu->stat.irq_window_exits;
 6853	return 1;
 6854}
 6855
 6856static int handle_halt(struct kvm_vcpu *vcpu)
 6857{
 6858	return kvm_emulate_halt(vcpu);
 6859}
 6860
 6861static int handle_vmcall(struct kvm_vcpu *vcpu)
 6862{
 6863	return kvm_emulate_hypercall(vcpu);
 6864}
 6865
 6866static int handle_invd(struct kvm_vcpu *vcpu)
 6867{
 6868	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 6869}
 6870
 6871static int handle_invlpg(struct kvm_vcpu *vcpu)
 6872{
 6873	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 6874
 6875	kvm_mmu_invlpg(vcpu, exit_qualification);
 6876	return kvm_skip_emulated_instruction(vcpu);
 6877}
 6878
 6879static int handle_rdpmc(struct kvm_vcpu *vcpu)
 6880{
 6881	int err;
 6882
 6883	err = kvm_rdpmc(vcpu);
 6884	return kvm_complete_insn_gp(vcpu, err);
 6885}
 6886
 6887static int handle_wbinvd(struct kvm_vcpu *vcpu)
 6888{
 6889	return kvm_emulate_wbinvd(vcpu);
 6890}
 6891
 6892static int handle_xsetbv(struct kvm_vcpu *vcpu)
 6893{
 6894	u64 new_bv = kvm_read_edx_eax(vcpu);
 6895	u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
 6896
 6897	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
 6898		return kvm_skip_emulated_instruction(vcpu);
 6899	return 1;
 6900}
 6901
 6902static int handle_xsaves(struct kvm_vcpu *vcpu)
 6903{
 6904	kvm_skip_emulated_instruction(vcpu);
 6905	WARN(1, "this should never happen\n");
 6906	return 1;
 6907}
 6908
 6909static int handle_xrstors(struct kvm_vcpu *vcpu)
 6910{
 6911	kvm_skip_emulated_instruction(vcpu);
 6912	WARN(1, "this should never happen\n");
 6913	return 1;
 6914}
 6915
 6916static int handle_apic_access(struct kvm_vcpu *vcpu)
 6917{
 6918	if (likely(fasteoi)) {
 6919		unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 6920		int access_type, offset;
 6921
 6922		access_type = exit_qualification & APIC_ACCESS_TYPE;
 6923		offset = exit_qualification & APIC_ACCESS_OFFSET;
 6924		/*
 6925		 * A sane guest uses MOV to write the EOI register, and the
 6926		 * written value does not matter.  Short-circuit that case
 6927		 * here to avoid heavy instruction emulation.
 6928		 */
 6929		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
 6930		    (offset == APIC_EOI)) {
 6931			kvm_lapic_set_eoi(vcpu);
 6932			return kvm_skip_emulated_instruction(vcpu);
 6933		}
 6934	}
 6935	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 6936}
 6937
 6938static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
 6939{
 6940	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 6941	int vector = exit_qualification & 0xff;
 6942
 6943	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
 6944	kvm_apic_set_eoi_accelerated(vcpu, vector);
 6945	return 1;
 6946}
 6947
 6948static int handle_apic_write(struct kvm_vcpu *vcpu)
 6949{
 6950	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 6951	u32 offset = exit_qualification & 0xfff;
 6952
 6953	/* APIC-write VM exit is trap-like and thus no need to adjust IP */
 6954	kvm_apic_write_nodecode(vcpu, offset);
 6955	return 1;
 6956}
 6957
 6958static int handle_task_switch(struct kvm_vcpu *vcpu)
 6959{
 6960	struct vcpu_vmx *vmx = to_vmx(vcpu);
 6961	unsigned long exit_qualification;
 6962	bool has_error_code = false;
 6963	u32 error_code = 0;
 6964	u16 tss_selector;
 6965	int reason, type, idt_v, idt_index;
 6966
 6967	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
 6968	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
 6969	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 6970
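	/*
	 * Task-switch exit qualification: bits 15:0 = target TSS selector,
	 * bits 31:30 = source of the task switch (0 = CALL, 1 = IRET,
	 * 2 = JMP, 3 = task gate in the IDT).
	 */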
 6971	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 6972
 6973	reason = (u32)exit_qualification >> 30;
 6974	if (reason == TASK_SWITCH_GATE && idt_v) {
 6975		switch (type) {
 6976		case INTR_TYPE_NMI_INTR:
 6977			vcpu->arch.nmi_injected = false;
 6978			vmx_set_nmi_mask(vcpu, true);
 6979			break;
 6980		case INTR_TYPE_EXT_INTR:
 6981		case INTR_TYPE_SOFT_INTR:
 6982			kvm_clear_interrupt_queue(vcpu);
 6983			break;
 6984		case INTR_TYPE_HARD_EXCEPTION:
 6985			if (vmx->idt_vectoring_info &
 6986			    VECTORING_INFO_DELIVER_CODE_MASK) {
 6987				has_error_code = true;
 6988				error_code =
 6989					vmcs_read32(IDT_VECTORING_ERROR_CODE);
 6990			}
 6991			/* fall through */
 6992		case INTR_TYPE_SOFT_EXCEPTION:
 6993			kvm_clear_exception_queue(vcpu);
 6994			break;
 6995		default:
 6996			break;
 6997		}
 6998	}
 6999	tss_selector = exit_qualification;
 7000
 7001	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
 7002		       type != INTR_TYPE_EXT_INTR &&
 7003		       type != INTR_TYPE_NMI_INTR))
 7004		skip_emulated_instruction(vcpu);
 7005
 7006	if (kvm_task_switch(vcpu, tss_selector,
 7007			    type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
 7008			    has_error_code, error_code) == EMULATE_FAIL) {
 7009		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 7010		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 7011		vcpu->run->internal.ndata = 0;
 7012		return 0;
 7013	}
 7014
 7015	/*
 7016	 * TODO: What about debug traps on tss switch?
 7017	 *       Are we supposed to inject them and update dr6?
 7018	 */
 7019
 7020	return 1;
 7021}
 7022
 7023static int handle_ept_violation(struct kvm_vcpu *vcpu)
 7024{
 7025	unsigned long exit_qualification;
 7026	gpa_t gpa;
 7027	u64 error_code;
 7028
 7029	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 7030
 7031	/*
  7032	 * If the EPT violation happened while executing IRET from an NMI,
  7033	 * the "blocked by NMI" bit has to be set before the next VM entry.
 7034	 * There are errata that may cause this bit to not be set:
 7035	 * AAK134, BY25.
 7036	 */
 7037	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
 7038			enable_vnmi &&
 7039			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
 7040		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
 7041
 7042	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 7043	trace_kvm_page_fault(gpa, exit_qualification);
 7044
 7045	/* Is it a read fault? */
 7046	error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
 7047		     ? PFERR_USER_MASK : 0;
 7048	/* Is it a write fault? */
 7049	error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
 7050		      ? PFERR_WRITE_MASK : 0;
 7051	/* Is it a fetch fault? */
 7052	error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
 7053		      ? PFERR_FETCH_MASK : 0;
  7054	/* Is the EPT page-table entry present? */
 7055	error_code |= (exit_qualification &
 7056		       (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
 7057			EPT_VIOLATION_EXECUTABLE))
 7058		      ? PFERR_PRESENT_MASK : 0;
 7059
 7060	error_code |= (exit_qualification & 0x100) != 0 ?
 7061	       PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
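	/*
	 * Example (illustrative): a guest data write that misses in the EPT
	 * reports EPT_VIOLATION_ACC_WRITE with the R/W/X permission bits
	 * clear, so the code above yields PFERR_WRITE_MASK without
	 * PFERR_PRESENT_MASK; bit 8 then selects PFERR_GUEST_FINAL_MASK,
	 * i.e. a fault on the final translation rather than on the guest's
	 * own page walk.
	 */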
 7062
 7063	vcpu->arch.exit_qualification = exit_qualification;
 7064	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 7065}
 7066
 7067static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 7068{
 7069	gpa_t gpa;
 7070
 7071	/*
 7072	 * A nested guest cannot optimize MMIO vmexits, because we have an
 7073	 * nGPA here instead of the required GPA.
 7074	 */
 7075	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 7076	if (!is_guest_mode(vcpu) &&
 7077	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
 7078		trace_kvm_fast_mmio(gpa);
 7079		/*
  7080		 * Doing kvm_skip_emulated_instruction() relies on undefined
  7081		 * behavior: Intel's manual doesn't mandate that
  7082		 * VM_EXIT_INSTRUCTION_LEN be set in the VMCS when an EPT
  7083		 * MISCONFIG occurs. While real hardware has been observed to
  7084		 * set it, other hypervisors (namely Hyper-V) don't, and we
  7085		 * would then advance RIP by some random value. Disable fast
  7086		 * MMIO when running nested and keep it for real hardware, in
  7087		 * the hope that VM_EXIT_INSTRUCTION_LEN is always set correctly.
 7088		 */
 7089		if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
 7090			return kvm_skip_emulated_instruction(vcpu);
 7091		else
 7092			return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
 7093						       NULL, 0) == EMULATE_DONE;
 7094	}
 7095
 7096	return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
 7097}
 7098
 7099static int handle_nmi_window(struct kvm_vcpu *vcpu)
 7100{
 7101	WARN_ON_ONCE(!enable_vnmi);
 7102	vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
 7103			CPU_BASED_VIRTUAL_NMI_PENDING);
 7104	++vcpu->stat.nmi_window_exits;
 7105	kvm_make_request(KVM_REQ_EVENT, vcpu);
 7106
 7107	return 1;
 7108}
 7109
 7110static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 7111{
 7112	struct vcpu_vmx *vmx = to_vmx(vcpu);
 7113	enum emulation_result err = EMULATE_DONE;
 7114	int ret = 1;
 7115	u32 cpu_exec_ctrl;
 7116	bool intr_window_requested;
 7117	unsigned count = 130;
 7118
 7119	/*
 7120	 * We should never reach the point where we are emulating L2
 7121	 * due to invalid guest state as that means we incorrectly
 7122	 * allowed a nested VMEntry with an invalid vmcs12.
 7123	 */
 7124	WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
 7125
 7126	cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 7127	intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
 7128
 7129	while (vmx->emulation_required && count-- != 0) {
 7130		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
 7131			return handle_interrupt_window(&vmx->vcpu);
 7132
 7133		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
 7134			return 1;
 7135
 7136		err = emulate_instruction(vcpu, 0);
 7137
 7138		if (err == EMULATE_USER_EXIT) {
 7139			++vcpu->stat.mmio_exits;
 7140			ret = 0;
 7141			goto out;
 7142		}
 7143
 7144		if (err != EMULATE_DONE)
 7145			goto emulation_error;
 7146
 7147		if (vmx->emulation_required && !vmx->rmode.vm86_active &&
 7148		    vcpu->arch.exception.pending)
 7149			goto emulation_error;
 7150
 7151		if (vcpu->arch.halt_request) {
 7152			vcpu->arch.halt_request = 0;
 7153			ret = kvm_vcpu_halt(vcpu);
 7154			goto out;
 7155		}
 7156
 7157		if (signal_pending(current))
 7158			goto out;
 7159		if (need_resched())
 7160			schedule();
 7161	}
 7162
 7163out:
 7164	return ret;
 7165
 7166emulation_error:
 7167	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 7168	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 7169	vcpu->run->internal.ndata = 0;
 7170	return 0;
 7171}
 7172
 7173static void grow_ple_window(struct kvm_vcpu *vcpu)
 7174{
 7175	struct vcpu_vmx *vmx = to_vmx(vcpu);
 7176	int old = vmx->ple_window;
 7177
 7178	vmx->ple_window = __grow_ple_window(old, ple_window,
 7179					    ple_window_grow,
 7180					    ple_window_max);
 7181
 7182	if (vmx->ple_window != old)
 7183		vmx->ple_window_dirty = true;
 7184
 7185	trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
 7186}
 7187
 7188static void shrink_ple_window(struct kvm_vcpu *vcpu)
 7189{
 7190	struct vcpu_vmx *vmx = to_vmx(vcpu);
 7191	int old = vmx->ple_window;
 7192
 7193	vmx->ple_window = __shrink_ple_window(old, ple_window,
 7194					      ple_window_shrink,
 7195					      ple_window);
 7196
 7197	if (vmx->ple_window != old)
 7198		vmx->ple_window_dirty = true;
 7199
 7200	trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
 7201}
 7202
 7203/*
 7204 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
 7205 */
 7206static void wakeup_handler(void)
 7207{
 7208	struct kvm_vcpu *vcpu;
 7209	int cpu = smp_processor_id();
 7210
 7211	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 7212	list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
 7213			blocked_vcpu_list) {
 7214		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 7215
 7216		if (pi_test_on(pi_desc) == 1)
 7217			kvm_vcpu_kick(vcpu);
 7218	}
 7219	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 7220}
 7221
 7222static void vmx_enable_tdp(void)
 7223{
 7224	kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
 7225		enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
 7226		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
 7227		0ull, VMX_EPT_EXECUTABLE_MASK,
 7228		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
 7229		VMX_EPT_RWX_MASK, 0ull);
 7230
 7231	ept_set_mmio_spte_mask();
 7232	kvm_enable_tdp();
 7233}
 7234
 7235static __init int hardware_setup(void)
 7236{
 7237	int r = -ENOMEM, i;
 7238
 7239	rdmsrl_safe(MSR_EFER, &host_efer);
 7240
 7241	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
 7242		kvm_define_shared_msr(i, vmx_msr_index[i]);
 7243
 7244	for (i = 0; i < VMX_BITMAP_NR; i++) {
 7245		vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
 7246		if (!vmx_bitmap[i])
 7247			goto out;
 7248	}
 7249
 7250	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
 7251	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
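	/*
	 * All-ones VMREAD/VMWRITE bitmaps make every field trap by default;
	 * init_vmcs_shadow_fields(), called below when shadow VMCS support
	 * is enabled, clears the bits for the fields that may be accessed
	 * through the shadow VMCS.
	 */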
 7252
 7253	if (setup_vmcs_config(&vmcs_config) < 0) {
 7254		r = -EIO;
 7255		goto out;
 7256	}
 7257
 7258	if (boot_cpu_has(X86_FEATURE_NX))
 7259		kvm_enable_efer_bits(EFER_NX);
 7260
 7261	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
 7262		!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
 7263		enable_vpid = 0;
 7264
 7265	if (!cpu_has_vmx_ept() ||
 7266	    !cpu_has_vmx_ept_4levels() ||
 7267	    !cpu_has_vmx_ept_mt_wb() ||
 7268	    !cpu_has_vmx_invept_global())
 7269		enable_ept = 0;
 7270
 7271	if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
 7272		enable_ept_ad_bits = 0;
 7273
 7274	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
 7275		enable_unrestricted_guest = 0;
 7276
 7277	if (!cpu_has_vmx_flexpriority())
 7278		flexpriority_enabled = 0;
 7279
 7280	if (!cpu_has_virtual_nmis())
 7281		enable_vnmi = 0;
 7282
 7283	/*
 7284	 * set_apic_access_page_addr() is used to reload apic access
 7285	 * page upon invalidation.  No need to do anything if not
 7286	 * using the APIC_ACCESS_ADDR VMCS field.
 7287	 */
 7288	if (!flexpriority_enabled)
 7289		kvm_x86_ops->set_apic_access_page_addr = NULL;
 7290
 7291	if (!cpu_has_vmx_tpr_shadow())
 7292		kvm_x86_ops->update_cr8_intercept = NULL;
 7293
 7294	if (enable_ept && !cpu_has_vmx_ept_2m_page())
 7295		kvm_disable_largepages();
 7296
 7297	if (!cpu_has_vmx_ple()) {
 7298		ple_gap = 0;
 7299		ple_window = 0;
 7300		ple_window_grow = 0;
 7301		ple_window_max = 0;
 7302		ple_window_shrink = 0;
 7303	}
 7304
 7305	if (!cpu_has_vmx_apicv()) {
 7306		enable_apicv = 0;
 7307		kvm_x86_ops->sync_pir_to_irr = NULL;
 7308	}
 7309
 7310	if (cpu_has_vmx_tsc_scaling()) {
 7311		kvm_has_tsc_control = true;
 7312		kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
 7313		kvm_tsc_scaling_ratio_frac_bits = 48;
 7314	}
 7315
 7316	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 7317
 7318	if (enable_ept)
 7319		vmx_enable_tdp();
 7320	else
 7321		kvm_disable_tdp();
 7322
 7323	/*
 7324	 * Only enable PML when hardware supports PML feature, and both EPT
 7325	 * and EPT A/D bit features are enabled -- PML depends on them to work.
 7326	 */
 7327	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
 7328		enable_pml = 0;
 7329
 7330	if (!enable_pml) {
 7331		kvm_x86_ops->slot_enable_log_dirty = NULL;
 7332		kvm_x86_ops->slot_disable_log_dirty = NULL;
 7333		kvm_x86_ops->flush_log_dirty = NULL;
 7334		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
 7335	}
 7336
 7337	if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
 7338		u64 vmx_msr;
 7339
 7340		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
 7341		cpu_preemption_timer_multi =
 7342			 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
 7343	} else {
 7344		kvm_x86_ops->set_hv_timer = NULL;
 7345		kvm_x86_ops->cancel_hv_timer = NULL;
 7346	}
 7347
 7348	if (!cpu_has_vmx_shadow_vmcs())
 7349		enable_shadow_vmcs = 0;
 7350	if (enable_shadow_vmcs)
 7351		init_vmcs_shadow_fields();
 7352
 7353	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 7354	nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
 7355
 7356	kvm_mce_cap_supported |= MCG_LMCE_P;
 7357
 7358	return alloc_kvm_area();
 7359
 7360out:
 7361	for (i = 0; i < VMX_BITMAP_NR; i++)
 7362		free_page((unsigned long)vmx_bitmap[i]);
 7363
  7364	return r;
 7365}
 7366
 7367static __exit void hardware_unsetup(void)
 7368{
 7369	int i;
 7370
 7371	for (i = 0; i < VMX_BITMAP_NR; i++)
 7372		free_page((unsigned long)vmx_bitmap[i]);
 7373
 7374	free_kvm_area();
 7375}
 7376
 7377/*
  7378 * Indicate that a vcpu is busy-waiting in a spinlock. We never enable plain
  7379 * PAUSE exiting, so we only get here on CPUs with PAUSE-loop exiting.
 7380 */
 7381static int handle_pause(struct kvm_vcpu *vcpu)
 7382{
 7383	if (!kvm_pause_in_guest(vcpu->kvm))
 7384		grow_ple_window(vcpu);
 7385
 7386	/*
  7387	 * Intel SDM vol. 3, ch. 25.1.3 says: the "PAUSE-loop exiting"
  7388	 * VM-execution control is ignored if CPL > 0. OTOH, KVM
  7389	 * never sets PAUSE_EXITING and only sets PLE when it is supported,
  7390	 * so the vcpu must be at CPL 0 if it gets a PAUSE exit.
 7391	 */
 7392	kvm_vcpu_on_spin(vcpu, true);
 7393	return kvm_skip_emulated_instruction(vcpu);
 7394}
 7395
 7396static int handle_nop(struct kvm_vcpu *vcpu)
 7397{
 7398	return kvm_skip_emulated_instruction(vcpu);
 7399}
 7400
 7401static int handle_mwait(struct kvm_vcpu *vcpu)
 7402{
 7403	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
 7404	return handle_nop(vcpu);
 7405}
 7406
 7407static int handle_invalid_op(struct kvm_vcpu *vcpu)
 7408{
 7409	kvm_queue_exception(vcpu, UD_VECTOR);
 7410	return 1;
 7411}
 7412
 7413static int handle_monitor_trap(struct kvm_vcpu *vcpu)
 7414{
 7415	return 1;
 7416}
 7417
 7418static int handle_monitor(struct kvm_vcpu *vcpu)
 7419{
 7420	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
 7421	return handle_nop(vcpu);
 7422}
 7423
 7424/*
 7425 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 7426 * set the success or error code of an emulated VMX instruction, as specified
 7427 * by Vol 2B, VMX Instruction Reference, "Conventions".
 7428 */
 7429static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
 7430{
 7431	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
 7432			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 7433			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
 7434}
 7435
 7436static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 7437{
 7438	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 7439			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
 7440			    X86_EFLAGS_SF | X86_EFLAGS_OF))
 7441			| X86_EFLAGS_CF);
 7442}
 7443
 7444static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
 7445					u32 vm_instruction_error)
 7446{
 7447	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
 7448		/*
 7449		 * failValid writes the error number to the current VMCS, which
  7450		 * can't be done if there isn't a current VMCS.
 7451		 */
 7452		nested_vmx_failInvalid(vcpu);
 7453		return;
 7454	}
 7455	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
 7456			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
 7457			    X86_EFLAGS_SF | X86_EFLAGS_OF))
 7458			| X86_EFLAGS_ZF);
 7459	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
 7460	/*
 7461	 * We don't need to force a shadow sync because
 7462	 * VM_INSTRUCTION_ERROR is not shadowed
 7463	 */
 7464}
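
/*
 * Example (illustrative): a VMWRITE to a read-only field while a current
 * VMCS exists ends up in nested_vmx_failValid(), i.e. RFLAGS.ZF is set,
 * the other arithmetic flags are cleared and vmcs12->vm_instruction_error
 * is set to VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT, matching what the
 * instruction reports on bare metal.
 */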
 7465
 7466static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 7467{
  7468	/* TODO: don't simply reset the guest here. */
 7469	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 7470	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 7471}
 7472
 7473static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
 7474{
 7475	struct vcpu_vmx *vmx =
 7476		container_of(timer, struct vcpu_vmx, nested.preemption_timer);
 7477
 7478	vmx->nested.preemption_timer_expired = true;
 7479	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
 7480	kvm_vcpu_kick(&vmx->vcpu);
 7481
 7482	return HRTIMER_NORESTART;
 7483}
 7484
 7485/*
 7486 * Decode the memory-address operand of a vmx instruction, as recorded on an
 7487 * exit caused by such an instruction (run by a guest hypervisor).
 7488 * On success, returns 0. When the operand is invalid, returns 1 and throws
 7489 * #UD or #GP.
 7490 */
 7491static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
 7492				 unsigned long exit_qualification,
 7493				 u32 vmx_instruction_info, bool wr, gva_t *ret)
 7494{
 7495	gva_t off;
 7496	bool exn;
 7497	struct kvm_segment s;
 7498
 7499	/*
 7500	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
 7501	 * Execution", on an exit, vmx_instruction_info holds most of the
 7502	 * addressing components of the operand. Only the displacement part
 7503	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
 7504	 * For how an actual address is calculated from all these components,
 7505	 * refer to Vol. 1, "Operand Addressing".
 7506	 */
 7507	int  scaling = vmx_instruction_info & 3;
 7508	int  addr_size = (vmx_instruction_info >> 7) & 7;
 7509	bool is_reg = vmx_instruction_info & (1u << 10);
 7510	int  seg_reg = (vmx_instruction_info >> 15) & 7;
 7511	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
 7512	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
 7513	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
 7514	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
 7515
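	/*
	 * Example decode (illustrative): for a memory operand such as
	 * ds:[rax + rbx*4 + disp], the instruction info encodes scaling = 2,
	 * seg_reg = 3 (DS), index_reg = RBX and base_reg = RAX, with both
	 * the base- and index-invalid bits clear; the displacement itself
	 * arrives separately in exit_qualification.
	 */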
 7516	if (is_reg) {
 7517		kvm_queue_exception(vcpu, UD_VECTOR);
 7518		return 1;
 7519	}
 7520
 7521	/* Addr = segment_base + offset */
 7522	/* offset = base + [index * scale] + displacement */
 7523	off = exit_qualification; /* holds the displacement */
 7524	if (base_is_valid)
 7525		off += kvm_register_read(vcpu, base_reg);
 7526	if (index_is_valid)
  7527		off += kvm_register_read(vcpu, index_reg) << scaling;
 7528	vmx_get_segment(vcpu, &s, seg_reg);
 7529	*ret = s.base + off;
 7530
 7531	if (addr_size == 1) /* 32 bit */
 7532		*ret &= 0xffffffff;
 7533
 7534	/* Checks for #GP/#SS exceptions. */
 7535	exn = false;
 7536	if (is_long_mode(vcpu)) {
 7537		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
 7538		 * non-canonical form. This is the only check on the memory
 7539		 * destination for long mode!
 7540		 */
 7541		exn = is_noncanonical_address(*ret, vcpu);
 7542	} else if (is_protmode(vcpu)) {
 7543		/* Protected mode: apply checks for segment validity in the
 7544		 * following order:
 7545		 * - segment type check (#GP(0) may be thrown)
 7546		 * - usability check (#GP(0)/#SS(0))
 7547		 * - limit check (#GP(0)/#SS(0))
 7548		 */
 7549		if (wr)
 7550			/* #GP(0) if the destination operand is located in a
 7551			 * read-only data segment or any code segment.
 7552			 */
 7553			exn = ((s.type & 0xa) == 0 || (s.type & 8));
 7554		else
 7555			/* #GP(0) if the source operand is located in an
 7556			 * execute-only code segment
 7557			 */
 7558			exn = ((s.type & 0xa) == 8);
 7559		if (exn) {
 7560			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 7561			return 1;
 7562		}
 7563		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
 7564		 */
 7565		exn = (s.unusable != 0);
 7566		/* Protected mode: #GP(0)/#SS(0) if the memory
 7567		 * operand is outside the segment limit.
 7568		 */
 7569		exn = exn || (off + sizeof(u64) > s.limit);
 7570	}
 7571	if (exn) {
 7572		kvm_queue_exception_e(vcpu,
 7573				      seg_reg == VCPU_SREG_SS ?
 7574						SS_VECTOR : GP_VECTOR,
 7575				      0);
 7576		return 1;
 7577	}
 7578
 7579	return 0;
 7580}
 7581
 7582static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
 7583{
 7584	gva_t gva;
 7585	struct x86_exception e;
 7586
 7587	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
 7588			vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
 7589		return 1;
 7590
 7591	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
 7592				sizeof(*vmpointer), &e)) {
 7593		kvm_inject_page_fault(vcpu, &e);
 7594		return 1;
 7595	}
 7596
 7597	return 0;
 7598}
 7599
 7600static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 7601{
 7602	struct vcpu_vmx *vmx = to_vmx(vcpu);
 7603	struct vmcs *shadow_vmcs;
 7604	int r;
 7605
 7606	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
 7607	if (r < 0)
 7608		goto out_vmcs02;
 7609
 7610	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
 7611	if (!vmx->nested.cached_vmcs12)
 7612		goto out_cached_vmcs12;
 7613
 7614	if (enable_shadow_vmcs) {
 7615		shadow_vmcs = alloc_vmcs();
 7616		if (!shadow_vmcs)
 7617			goto out_shadow_vmcs;
 7618		/* mark vmcs as shadow */
 7619		shadow_vmcs->revision_id |= (1u << 31);
 7620		/* init shadow vmcs */
 7621		vmcs_clear(shadow_vmcs);
 7622		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
 7623	}
 7624
 7625	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
 7626		     HRTIMER_MODE_REL_PINNED);
 7627	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
 7628
 7629	vmx->nested.vmxon = true;
 7630	return 0;
 7631
 7632out_shadow_vmcs:
 7633	kfree(vmx->nested.cached_vmcs12);
 7634
 7635out_cached_vmcs12:
 7636	free_loaded_vmcs(&vmx->nested.vmcs02);
 7637
 7638out_vmcs02:
 7639	return -ENOMEM;
 7640}
 7641
 7642/*
 7643 * Emulate the VMXON instruction.
 7644 * Currently, we just remember that VMX is active, and do not save or even
 7645 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
 7646 * do not currently need to store anything in that guest-allocated memory
  7647 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
  7648 * argument is different from the VMXON pointer (which the spec says they do).
 7649 */
 7650static int handle_vmon(struct kvm_vcpu *vcpu)
 7651{
 7652	int ret;
 7653	gpa_t vmptr;
 7654	struct page *page;
 7655	struct vcpu_vmx *vmx = to_vmx(vcpu);
 7656	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
 7657		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 7658
 7659	/*
 7660	 * The Intel VMX Instruction Reference lists a bunch of bits that are
 7661	 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
 7662	 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
 7663	 * Otherwise, we should fail with #UD.  But most faulting conditions
 7664	 * have already been checked by hardware, prior to the VM-exit for
 7665	 * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
 7666	 * that bit set to 1 in non-root mode.
 7667	 */
 7668	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
 7669		kvm_queue_exception(vcpu, UD_VECTOR);
 7670		return 1;
 7671	}
 7672
 7673	if (vmx->nested.vmxon) {
 7674		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 7675		return kvm_skip_emulated_instruction(vcpu);
 7676	}
 7677
 7678	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
 7679			!= VMXON_NEEDED_FEATURES) {
 7680		kvm_inject_gp(vcpu, 0);
 7681		return 1;
 7682	}
 7683
 7684	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 7685		return 1;
 7686
 7687	/*
 7688	 * SDM 3: 24.11.5
 7689	 * The first 4 bytes of VMXON region contain the supported
 7690	 * VMCS revision identifier
 7691	 *
  7692	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
  7693	 * so we never have to check against a 32-bit physical-address width here.
 7694	 */
 7695	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
 7696		nested_vmx_failInvalid(vcpu);
 7697		return kvm_skip_emulated_instruction(vcpu);
 7698	}
 7699
 7700	page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
 7701	if (is_error_page(page)) {
 7702		nested_vmx_failInvalid(vcpu);
 7703		return kvm_skip_emulated_instruction(vcpu);
 7704	}
 7705	if (*(u32 *)kmap(page) != VMCS12_REVISION) {
 7706		kunmap(page);
 7707		kvm_release_page_clean(page);
 7708		nested_vmx_failInvalid(vcpu);
 7709		return kvm_skip_emulated_instruction(vcpu);
 7710	}
 7711	kunmap(page);
 7712	kvm_release_page_clean(page);
 7713
 7714	vmx->nested.vmxon_ptr = vmptr;
 7715	ret = enter_vmx_operation(vcpu);
 7716	if (ret)
 7717		return ret;
 7718
 7719	nested_vmx_succeed(vcpu);
 7720	return kvm_skip_emulated_instruction(vcpu);
 7721}
 7722
 7723/*
 7724 * Intel's VMX Instruction Reference specifies a common set of prerequisites
 7725 * for running VMX instructions (except VMXON, whose prerequisites are
 7726 * slightly different). It also specifies what exception to inject otherwise.
 7727 * Note that many of these exceptions have priority over VM exits, so they
 7728 * don't have to be checked again here.
 7729 */
 7730static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 7731{
 7732	if (!to_vmx(vcpu)->nested.vmxon) {
 7733		kvm_queue_exception(vcpu, UD_VECTOR);
 7734		return 0;
 7735	}
 7736	return 1;
 7737}
 7738
 7739static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 7740{
 7741	vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
 7742	vmcs_write64(VMCS_LINK_POINTER, -1ull);
 7743}
 7744
 7745static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 7746{
 7747	if (vmx->nested.current_vmptr == -1ull)
 7748		return;
 7749
 7750	if (enable_shadow_vmcs) {
 7751		/* copy to memory all shadowed fields in case
 7752		   they were modified */
 7753		copy_shadow_to_vmcs12(vmx);
 7754		vmx->nested.sync_shadow_vmcs = false;
 7755		vmx_disable_shadow_vmcs(vmx);
 7756	}
 7757	vmx->nested.posted_intr_nv = -1;
 7758
 7759	/* Flush VMCS12 to guest memory */
 7760	kvm_vcpu_write_guest_page(&vmx->vcpu,
 7761				  vmx->nested.current_vmptr >> PAGE_SHIFT,
 7762				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 7763
 7764	vmx->nested.current_vmptr = -1ull;
 7765}
 7766
 7767/*
 7768 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 7769 * just stops using VMX.
 7770 */
 7771static void free_nested(struct vcpu_vmx *vmx)
 7772{
 7773	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
 7774		return;
 7775
 7776	vmx->nested.vmxon = false;
 7777	vmx->nested.smm.vmxon = false;
 7778	free_vpid(vmx->nested.vpid02);
 7779	vmx->nested.posted_intr_nv = -1;
 7780	vmx->nested.current_vmptr = -1ull;
 7781	if (enable_shadow_vmcs) {
 7782		vmx_disable_shadow_vmcs(vmx);
 7783		vmcs_clear(vmx->vmcs01.shadow_vmcs);
 7784		free_vmcs(vmx->vmcs01.shadow_vmcs);
 7785		vmx->vmcs01.shadow_vmcs = NULL;
 7786	}
 7787	kfree(vmx->nested.cached_vmcs12);
 7788	/* Unpin physical memory we referred to in the vmcs02 */
 7789	if (vmx->nested.apic_access_page) {
 7790		kvm_release_page_dirty(vmx->nested.apic_access_page);
 7791		vmx->nested.apic_access_page = NULL;
 7792	}
 7793	if (vmx->nested.virtual_apic_page) {
 7794		kvm_release_page_dirty(vmx->nested.virtual_apic_page);
 7795		vmx->nested.virtual_apic_page = NULL;
 7796	}
 7797	if (vmx->nested.pi_desc_page) {
 7798		kunmap(vmx->nested.pi_desc_page);
 7799		kvm_release_page_dirty(vmx->nested.pi_desc_page);
 7800		vmx->nested.pi_desc_page = NULL;
 7801		vmx->nested.pi_desc = NULL;
 7802	}
 7803
 7804	free_loaded_vmcs(&vmx->nested.vmcs02);
 7805}
 7806
 7807/* Emulate the VMXOFF instruction */
 7808static int handle_vmoff(struct kvm_vcpu *vcpu)
 7809{
 7810	if (!nested_vmx_check_permission(vcpu))
 7811		return 1;
 7812	free_nested(to_vmx(vcpu));
 7813	nested_vmx_succeed(vcpu);
 7814	return kvm_skip_emulated_instruction(vcpu);
 7815}
 7816
 7817/* Emulate the VMCLEAR instruction */
 7818static int handle_vmclear(struct kvm_vcpu *vcpu)
 7819{
 7820	struct vcpu_vmx *vmx = to_vmx(vcpu);
 7821	u32 zero = 0;
 7822	gpa_t vmptr;
 7823
 7824	if (!nested_vmx_check_permission(vcpu))
 7825		return 1;
 7826
 7827	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 7828		return 1;
 7829
 7830	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
 7831		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
 7832		return kvm_skip_emulated_instruction(vcpu);
 7833	}
 7834
 7835	if (vmptr == vmx->nested.vmxon_ptr) {
 7836		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
 7837		return kvm_skip_emulated_instruction(vcpu);
 7838	}
 7839
 7840	if (vmptr == vmx->nested.current_vmptr)
 7841		nested_release_vmcs12(vmx);
 7842
 7843	kvm_vcpu_write_guest(vcpu,
 7844			vmptr + offsetof(struct vmcs12, launch_state),
 7845			&zero, sizeof(zero));
 7846
 7847	nested_vmx_succeed(vcpu);
 7848	return kvm_skip_emulated_instruction(vcpu);
 7849}
 7850
 7851static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
 7852
 7853/* Emulate the VMLAUNCH instruction */
 7854static int handle_vmlaunch(struct kvm_vcpu *vcpu)
 7855{
 7856	return nested_vmx_run(vcpu, true);
 7857}
 7858
 7859/* Emulate the VMRESUME instruction */
 7860static int handle_vmresume(struct kvm_vcpu *vcpu)
 7861{
 7862
 7863	return nested_vmx_run(vcpu, false);
 7864}
 7865
 7866/*
 7867 * Read a vmcs12 field. Since these can have varying lengths and we return
 7868 * one type, we chose the biggest type (u64) and zero-extend the return value
 7869 * to that size. Note that the caller, handle_vmread, might need to use only
 7870 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
 7871 * 64-bit fields are to be returned).
 7872 */
 7873static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
 7874				  unsigned long field, u64 *ret)
 7875{
 7876	short offset = vmcs_field_to_offset(field);
 7877	char *p;
 7878
 7879	if (offset < 0)
 7880		return offset;
 7881
 7882	p = ((char *)(get_vmcs12(vcpu))) + offset;
 7883
 7884	switch (vmcs_field_width(field)) {
 7885	case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
 7886		*ret = *((natural_width *)p);
 7887		return 0;
 7888	case VMCS_FIELD_WIDTH_U16:
 7889		*ret = *((u16 *)p);
 7890		return 0;
 7891	case VMCS_FIELD_WIDTH_U32:
 7892		*ret = *((u32 *)p);
 7893		return 0;
 7894	case VMCS_FIELD_WIDTH_U64:
 7895		*ret = *((u64 *)p);
 7896		return 0;
 7897	default:
 7898		WARN_ON(1);
 7899		return -ENOENT;
 7900	}
 7901}
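
/*
 * For example (illustrative), reading the 32-bit field GUEST_ES_LIMIT
 * returns the u32 stored at its vmcs_field_to_offset() slot, zero-extended
 * to 64 bits; handle_vmread() then copies back only 32 of those bits when
 * the guest is not in 64-bit mode.
 */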
 7902
 7903
  7904static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
  7905				   unsigned long field, u64 field_value)
  7906{
  7907	short offset = vmcs_field_to_offset(field);
  7908	char *p = ((char *) get_vmcs12(vcpu)) + offset;
  7909	if (offset < 0)
  7910		return offset;
 7911	switch (vmcs_field_width(field)) {
 7912	case VMCS_FIELD_WIDTH_U16:
 7913		*(u16 *)p = field_value;
 7914		return 0;
 7915	case VMCS_FIELD_WIDTH_U32:
 7916		*(u32 *)p = field_value;
 7917		return 0;
 7918	case VMCS_FIELD_WIDTH_U64:
 7919		*(u64 *)p = field_value;
 7920		return 0;
 7921	case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
 7922		*(natural_width *)p = field_value;
 7923		return 0;
 7924	default:
 7925		WARN_ON(1);
 7926		return -ENOENT;
 7927	}
 7928
 7929}
 7930
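/*
 * Shadow VMCS synchronization works in both directions:
 * copy_shadow_to_vmcs12() pulls fields that L1 may have written directly
 * into the shadow VMCS (via VMWRITE, without a VM exit) back into the
 * cached vmcs12, while copy_vmcs12_to_shadow() pushes the cached values
 * out so that L1's VMREADs observe up-to-date contents.
 */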
 7931static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 7932{
 7933	int i;
 7934	unsigned long field;
 7935	u64 field_value;
 7936	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
 7937	const u16 *fields = shadow_read_write_fields;
 7938	const int num_fields = max_shadow_read_write_fields;
 7939
 7940	preempt_disable();
 7941
 7942	vmcs_load(shadow_vmcs);
 7943
 7944	for (i = 0; i < num_fields; i++) {
 7945		field = fields[i];
 7946		field_value = __vmcs_readl(field);
 7947		vmcs12_write_any(&vmx->vcpu, field, field_value);
 7948	}
 7949
 7950	vmcs_clear(shadow_vmcs);
 7951	vmcs_load(vmx->loaded_vmcs->vmcs);
 7952
 7953	preempt_enable();
 7954}
 7955
 7956static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 7957{
 7958	const u16 *fields[] = {
 7959		shadow_read_write_fields,
 7960		shadow_read_only_fields
 7961	};
 7962	const int max_fields[] = {
 7963		max_shadow_read_write_fields,
 7964		max_shadow_read_only_fields
 7965	};
 7966	int i, q;
 7967	unsigned long field;
 7968	u64 field_value = 0;
 7969	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
 7970
 7971	vmcs_load(shadow_vmcs);
 7972
 7973	for (q = 0; q < ARRAY_SIZE(fields); q++) {
 7974		for (i = 0; i < max_fields[q]; i++) {
 7975			field = fields[q][i];
 7976			vmcs12_read_any(&vmx->vcpu, field, &field_value);
 7977			__vmcs_writel(field, field_value);
 7978		}
 7979	}
 7980
 7981	vmcs_clear(shadow_vmcs);
 7982	vmcs_load(vmx->loaded_vmcs->vmcs);
 7983}
 7984
 7985/*
 7986 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
 7987 * used before) all generate the same failure when it is missing.
 7988 */
 7989static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
 7990{
 7991	struct vcpu_vmx *vmx = to_vmx(vcpu);
 7992	if (vmx->nested.current_vmptr == -1ull) {
 7993		nested_vmx_failInvalid(vcpu);
 7994		return 0;
 7995	}
 7996	return 1;
 7997}
 7998
 7999static int handle_vmread(struct kvm_vcpu *vcpu)
 8000{
 8001	unsigned long field;
 8002	u64 field_value;
 8003	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 8004	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 8005	gva_t gva = 0;
 8006
 8007	if (!nested_vmx_check_permission(vcpu))
 8008		return 1;
 8009
 8010	if (!nested_vmx_check_vmcs12(vcpu))
 8011		return kvm_skip_emulated_instruction(vcpu);
 8012
 8013	/* Decode instruction info and find the field to read */
 8014	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 8015	/* Read the field, zero-extended to a u64 field_value */
 8016	if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
 8017		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 8018		return kvm_skip_emulated_instruction(vcpu);
 8019	}
 8020	/*
 8021	 * Now copy part of this value to register or memory, as requested.
 8022	 * Note that the number of bits actually copied is 32 or 64 depending
 8023	 * on the guest's mode (32 or 64 bit), not on the given field's length.
 8024	 */
 8025	if (vmx_instruction_info & (1u << 10)) {
 8026		kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
 8027			field_value);
 8028	} else {
 8029		if (get_vmx_mem_address(vcpu, exit_qualification,
 8030				vmx_instruction_info, true, &gva))
 8031			return 1;
 8032		/* _system ok, as hardware has verified cpl=0 */
 8033		kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
 8034			     &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
 8035	}
 8036
 8037	nested_vmx_succeed(vcpu);
 8038	return kvm_skip_emulated_instruction(vcpu);
 8039}
 8040
 8041
 8042static int handle_vmwrite(struct kvm_vcpu *vcpu)
 8043{
 8044	unsigned long field;
 8045	gva_t gva;
 8046	struct vcpu_vmx *vmx = to_vmx(vcpu);
 8047	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 8048	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 8049
 8050	/* The value to write might be 32 or 64 bits, depending on L1's long
 8051	 * mode, and eventually we need to write that into a field of several
 8052	 * possible lengths. The code below first zero-extends the value to 64
  8053	 * bits (field_value), and then copies only the appropriate number of
 8054	 * bits into the vmcs12 field.
 8055	 */
 8056	u64 field_value = 0;
 8057	struct x86_exception e;
 8058
 8059	if (!nested_vmx_check_permission(vcpu))
 8060		return 1;
 8061
 8062	if (!nested_vmx_check_vmcs12(vcpu))
 8063		return kvm_skip_emulated_instruction(vcpu);
 8064
 8065	if (vmx_instruction_info & (1u << 10))
 8066		field_value = kvm_register_readl(vcpu,
 8067			(((vmx_instruction_info) >> 3) & 0xf));
 8068	else {
 8069		if (get_vmx_mem_address(vcpu, exit_qualification,
 8070				vmx_instruction_info, false, &gva))
 8071			return 1;
 8072		if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
 8073			   &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
 8074			kvm_inject_page_fault(vcpu, &e);
 8075			return 1;
 8076		}
 8077	}
 8078
 8079
 8080	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 8081	if (vmcs_field_readonly(field)) {
 8082		nested_vmx_failValid(vcpu,
 8083			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
 8084		return kvm_skip_emulated_instruction(vcpu);
 8085	}
 8086
 8087	if (vmcs12_write_any(vcpu, field, field_value) < 0) {
 8088		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 8089		return kvm_skip_emulated_instruction(vcpu);
 8090	}
 8091
 8092	switch (field) {
 8093#define SHADOW_FIELD_RW(x) case x:
 8094#include "vmx_shadow_fields.h"
 8095		/*
 8096		 * The fields that can be updated by L1 without a vmexit are
 8097		 * always updated in the vmcs02, the others go down the slow
 8098		 * path of prepare_vmcs02.
 8099		 */
 8100		break;
 8101	default:
 8102		vmx->nested.dirty_vmcs12 = true;
 8103		break;
 8104	}
 8105
 8106	nested_vmx_succeed(vcpu);
 8107	return kvm_skip_emulated_instruction(vcpu);
 8108}
 8109
 8110static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
 8111{
 8112	vmx->nested.current_vmptr = vmptr;
 8113	if (enable_shadow_vmcs) {
 8114		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
 8115			      SECONDARY_EXEC_SHADOW_VMCS);
 8116		vmcs_write64(VMCS_LINK_POINTER,
 8117			     __pa(vmx->vmcs01.shadow_vmcs));
 8118		vmx->nested.sync_shadow_vmcs = true;
 8119	}
 8120	vmx->nested.dirty_vmcs12 = true;
 8121}
 8122
 8123/* Emulate the VMPTRLD instruction */
 8124static int handle_vmptrld(struct kvm_vcpu *vcpu)
 8125{
 8126	struct vcpu_vmx *vmx = to_vmx(vcpu);
 8127	gpa_t vmptr;
 8128
 8129	if (!nested_vmx_check_permission(vcpu))
 8130		return 1;
 8131
 8132	if (nested_vmx_get_vmptr(vcpu, &vmptr))
 8133		return 1;
 8134
 8135	if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
 8136		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
 8137		return kvm_skip_emulated_instruction(vcpu);
 8138	}
 8139
 8140	if (vmptr == vmx->nested.vmxon_ptr) {
 8141		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
 8142		return kvm_skip_emulated_instruction(vcpu);
 8143	}
 8144
 8145	if (vmx->nested.current_vmptr != vmptr) {
 8146		struct vmcs12 *new_vmcs12;
 8147		struct page *page;
 8148		page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
 8149		if (is_error_page(page)) {
 8150			nested_vmx_failInvalid(vcpu);
 8151			return kvm_skip_emulated_instruction(vcpu);
 8152		}
 8153		new_vmcs12 = kmap(page);
 8154		if (new_vmcs12->revision_id != VMCS12_REVISION) {
 8155			kunmap(page);
 8156			kvm_release_page_clean(page);
 8157			nested_vmx_failValid(vcpu,
 8158				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
 8159			return kvm_skip_emulated_instruction(vcpu);
 8160		}
 8161
 8162		nested_release_vmcs12(vmx);
 8163		/*
 8164		 * Load VMCS12 from guest memory since it is not already
 8165		 * cached.
 8166		 */
 8167		memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
 8168		kunmap(page);
 8169		kvm_release_page_clean(page);
 8170
 8171		set_current_vmptr(vmx, vmptr);
 8172	}
 8173
 8174	nested_vmx_succeed(vcpu);
 8175	return kvm_skip_emulated_instruction(vcpu);
 8176}
 8177
 8178/* Emulate the VMPTRST instruction */
 8179static int handle_vmptrst(struct kvm_vcpu *vcpu)
 8180{
 8181	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 8182	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 8183	gva_t vmcs_gva;
 8184	struct x86_exception e;
 8185
 8186	if (!nested_vmx_check_permission(vcpu))
 8187		return 1;
 8188
 8189	if (get_vmx_mem_address(vcpu, exit_qualification,
 8190			vmx_instruction_info, true, &vmcs_gva))
 8191		return 1;
 8192	/* ok to use *_system, as hardware has verified cpl=0 */
 8193	if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
 8194				 (void *)&to_vmx(vcpu)->nested.current_vmptr,
 8195				 sizeof(u64), &e)) {
 8196		kvm_inject_page_fault(vcpu, &e);
 8197		return 1;
 8198	}
 8199	nested_vmx_succeed(vcpu);
 8200	return kvm_skip_emulated_instruction(vcpu);
 8201}
 8202
 8203/* Emulate the INVEPT instruction */
 8204static int handle_invept(struct kvm_vcpu *vcpu)
 8205{
 8206	struct vcpu_vmx *vmx = to_vmx(vcpu);
 8207	u32 vmx_instruction_info, types;
 8208	unsigned long type;
 8209	gva_t gva;
 8210	struct x86_exception e;
 8211	struct {
 8212		u64 eptp, gpa;
 8213	} operand;
 8214
 8215	if (!(vmx->nested.msrs.secondary_ctls_high &
 8216	      SECONDARY_EXEC_ENABLE_EPT) ||
 8217	    !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
 8218		kvm_queue_exception(vcpu, UD_VECTOR);
 8219		return 1;
 8220	}
 8221
 8222	if (!nested_vmx_check_permission(vcpu))
 8223		return 1;
 8224
 8225	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 8226	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
 8227
 8228	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
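	/*
	 * After the shift, bit N of 'types' is set iff INVEPT type N is
	 * advertised to L1: bit 1 for single-context and bit 2 for
	 * all-context (global) invalidation, hence the mask of 6.
	 */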
 8229
 8230	if (type >= 32 || !(types & (1 << type))) {
 8231		nested_vmx_failValid(vcpu,
 8232				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 8233		return kvm_skip_emulated_instruction(vcpu);
 8234	}
 8235
 8236	/* According to the Intel VMX instruction reference, the memory
 8237	 * operand is read even if it isn't needed (e.g., for type==global)
 8238	 */
 8239	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
 8240			vmx_instruction_info, false, &gva))
 8241		return 1;
 8242	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
 8243				sizeof(operand), &e)) {
 8244		kvm_inject_page_fault(vcpu, &e);
 8245		return 1;
 8246	}
 8247
 8248	switch (type) {
 8249	case VMX_EPT_EXTENT_GLOBAL:
 8250	/*
 8251	 * TODO: track mappings and invalidate
 8252	 * single context requests appropriately
 8253	 */
 8254	case VMX_EPT_EXTENT_CONTEXT:
 8255		kvm_mmu_sync_roots(vcpu);
 8256		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 8257		nested_vmx_succeed(vcpu);
 8258		break;
 8259	default:
 8260		BUG_ON(1);
 8261		break;
 8262	}
 8263
 8264	return kvm_skip_emulated_instruction(vcpu);
 8265}
 8266
 8267static int handle_invvpid(struct kvm_vcpu *vcpu)
 8268{
 8269	struct vcpu_vmx *vmx = to_vmx(vcpu);
 8270	u32 vmx_instruction_info;
 8271	unsigned long type, types;
 8272	gva_t gva;
 8273	struct x86_exception e;
 8274	struct {
 8275		u64 vpid;
 8276		u64 gla;
 8277	} operand;
 8278
 8279	if (!(vmx->nested.msrs.secondary_ctls_high &
 8280	      SECONDARY_EXEC_ENABLE_VPID) ||
 8281			!(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
 8282		kvm_queue_exception(vcpu, UD_VECTOR);
 8283		return 1;
 8284	}
 8285
 8286	if (!nested_vmx_check_permission(vcpu))
 8287		return 1;
 8288
 8289	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 8290	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
 8291
 8292	types = (vmx->nested.msrs.vpid_caps &
 8293			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
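	/*
	 * As with INVEPT above, bit N of 'types' is set iff INVVPID type N
	 * is advertised to L1: individual-address (0), single-context (1),
	 * all-context (2) and single-context-retaining-globals (3).
	 */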
 8294
 8295	if (type >= 32 || !(types & (1 << type))) {
 8296		nested_vmx_failValid(vcpu,
 8297			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 8298		return kvm_skip_emulated_instruction(vcpu);
 8299	}
 8300
  8301	/* According to the Intel VMX instruction reference, the memory
 8302	 * operand is read even if it isn't needed (e.g., for type==global)
 8303	 */
 8304	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
 8305			vmx_instruction_info, false, &gva))
 8306		return 1;
 8307	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
 8308				sizeof(operand), &e)) {
 8309		kvm_inject_page_fault(vcpu, &e);
 8310		return 1;
 8311	}
 8312	if (operand.vpid >> 16) {
 8313		nested_vmx_failValid(vcpu,
 8314			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 8315		return kvm_skip_emulated_instruction(vcpu);
 8316	}
 8317
 8318	switch (type) {
 8319	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
 8320		if (is_noncanonical_address(operand.gla, vcpu)) {
 8321			nested_vmx_failValid(vcpu,
 8322				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 8323			return kvm_skip_emulated_instruction(vcpu);
 8324		}
 8325		/* fall through */
 8326	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
 8327	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
 8328		if (!operand.vpid) {
 8329			nested_vmx_failValid(vcpu,
 8330				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
 8331			return kvm_skip_emulated_instruction(vcpu);
 8332		}
 8333		break;
 8334	case VMX_VPID_EXTENT_ALL_CONTEXT:
 8335		break;
 8336	default:
 8337		WARN_ON_ONCE(1);
 8338		return kvm_skip_emulated_instruction(vcpu);
 8339	}
 8340
 8341	__vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
 8342	nested_vmx_succeed(vcpu);
 8343
 8344	return kvm_skip_emulated_instruction(vcpu);
 8345}
 8346
 8347static int handle_pml_full(struct kvm_vcpu *vcpu)
 8348{
 8349	unsigned long exit_qualification;
 8350
 8351	trace_kvm_pml_full(vcpu->vcpu_id);
 8352
 8353	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 8354
 8355	/*
  8356	 * If the PML-buffer-full exit happened while executing IRET from an
  8357	 * NMI, the "blocked by NMI" bit has to be set before the next VM entry.
 8358	 */
 8359	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
 8360			enable_vnmi &&
 8361			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
 8362		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 8363				GUEST_INTR_STATE_NMI);
 8364
 8365	/*
  8366	 * The PML buffer was already flushed at the beginning of the VMEXIT,
  8367	 * so there is nothing to do here; PML needs no userspace involvement.
 8368	 */
 8369	return 1;
 8370}
 8371
 8372static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 8373{
 8374	kvm_lapic_expired_hv_timer(vcpu);
 8375	return 1;
 8376}
 8377
 8378static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
 8379{
 8380	struct vcpu_vmx *vmx = to_vmx(vcpu);
 8381	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 8382
 8383	/* Check for memory type validity */
 8384	switch (address & VMX_EPTP_MT_MASK) {
 8385	case VMX_EPTP_MT_UC:
 8386		if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
 8387			return false;
 8388		break;
 8389	case VMX_EPTP_MT_WB:
 8390		if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
 8391			return false;
 8392		break;
 8393	default:
 8394		return false;
 8395	}
 8396
  8397	/* only a 4-level page-walk length is valid */
 8398	if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
 8399		return false;
 8400
 8401	/* Reserved bits should not be set */
 8402	if (address >> maxphyaddr || ((address >> 7) & 0x1f))
 8403		return false;
 8404
 8405	/* AD, if set, should be supported */
 8406	if (address & VMX_EPTP_AD_ENABLE_BIT) {
 8407		if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
 8408			return false;
 8409	}
 8410
 8411	return true;
 8412}
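
/*
 * Example (illustrative): an EPTP of the form
 * (root_hpa | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 | VMX_EPTP_AD_ENABLE_BIT),
 * with root_hpa a page-aligned address below the guest's MAXPHYADDR,
 * passes the checks above provided the WB memory type and A/D bits are
 * advertised in vmx->nested.msrs.ept_caps.
 */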
 8413
 8414static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
 8415				     struct vmcs12 *vmcs12)
 8416{
 8417	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
 8418	u64 address;
 8419	bool accessed_dirty;
 8420	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
 8421
 8422	if (!nested_cpu_has_eptp_switching(vmcs12) ||
 8423	    !nested_cpu_has_ept(vmcs12))
 8424		return 1;
 8425
 8426	if (index >= VMFUNC_EPTP_ENTRIES)
 8427		return 1;
 8428
 8429
 8430	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
 8431				     &address, index * 8, 8))
 8432		return 1;
 8433
 8434	accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
 8435
 8436	/*
 8437	 * If the (L2) guest does a vmfunc to the currently
 8438	 * active ept pointer, we don't have to do anything else
 8439	 */
 8440	if (vmcs12->ept_pointer != address) {
 8441		if (!valid_ept_address(vcpu, address))
 8442			return 1;
 8443
 8444		kvm_mmu_unload(vcpu);
 8445		mmu->ept_ad = accessed_dirty;
 8446		mmu->base_role.ad_disabled = !accessed_dirty;
 8447		vmcs12->ept_pointer = address;
 8448		/*
 8449		 * TODO: Check what's the correct approach in case
 8450		 * mmu reload fails. Currently, we just let the next
 8451		 * reload potentially fail
 8452		 */
 8453		kvm_mmu_reload(vcpu);
 8454	}
 8455
 8456	return 0;
 8457}
 8458
 8459static int handle_vmfunc(struct kvm_vcpu *vcpu)
 8460{
 8461	struct vcpu_vmx *vmx = to_vmx(vcpu);
 8462	struct vmcs12 *vmcs12;
 8463	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
 8464
 8465	/*
 8466	 * VMFUNC is only supported for nested guests, but we always enable the
 8467	 * secondary control for simplicity; for non-nested mode, fake that we
  8468	 * didn't have it by injecting #UD.
 8469	 */
 8470	if (!is_guest_mode(vcpu)) {
 8471		kvm_queue_exception(vcpu, UD_VECTOR);
 8472		return 1;
 8473	}
 8474
 8475	vmcs12 = get_vmcs12(vcpu);
 8476	if ((vmcs12->vm_function_control & (1 << function)) == 0)
 8477		goto fail;
 8478
 8479	switch (function) {
 8480	case 0:
 8481		if (nested_vmx_eptp_switching(vcpu, vmcs12))
 8482			goto fail;
 8483		break;
 8484	default:
 8485		goto fail;
 8486	}
 8487	return kvm_skip_emulated_instruction(vcpu);
 8488
 8489fail:
 8490	nested_vmx_vmexit(vcpu, vmx->exit_reason,
 8491			  vmcs_read32(VM_EXIT_INTR_INFO),
 8492			  vmcs_readl(EXIT_QUALIFICATION));
 8493	return 1;
 8494}
 8495
 8496/*
 8497 * The exit handlers return 1 if the exit was handled fully and guest execution
 8498 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
 8499 * to be done to userspace and return 0.
 8500 */
 8501static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 8502	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
 8503	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
 8504	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
 8505	[EXIT_REASON_NMI_WINDOW]	      = handle_nmi_window,
 8506	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
 8507	[EXIT_REASON_CR_ACCESS]               = handle_cr,
 8508	[EXIT_REASON_DR_ACCESS]               = handle_dr,
 8509	[EXIT_REASON_CPUID]                   = handle_cpuid,
 8510	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
 8511	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
 8512	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
 8513	[EXIT_REASON_HLT]                     = handle_halt,
 8514	[EXIT_REASON_INVD]		      = handle_invd,
 8515	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 8516	[EXIT_REASON_RDPMC]                   = handle_rdpmc,
 8517	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 8518	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 8519	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
 8520	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
 8521	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
 8522	[EXIT_REASON_VMREAD]                  = handle_vmread,
 8523	[EXIT_REASON_VMRESUME]                = handle_vmresume,
 8524	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
 8525	[EXIT_REASON_VMOFF]                   = handle_vmoff,
 8526	[EXIT_REASON_VMON]                    = handle_vmon,
 8527	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 8528	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 8529	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
 8530	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
 8531	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
 8532	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
 8533	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 8534	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
 8535	[EXIT_REASON_GDTR_IDTR]		      = handle_desc,
 8536	[EXIT_REASON_LDTR_TR]		      = handle_desc,
 8537	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
 8538	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
 8539	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
 8540	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
 8541	[EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
 8542	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
 8543	[EXIT_REASON_INVEPT]                  = handle_invept,
 8544	[EXIT_REASON_INVVPID]                 = handle_invvpid,
 8545	[EXIT_REASON_RDRAND]                  = handle_invalid_op,
 8546	[EXIT_REASON_RDSEED]                  = handle_invalid_op,
 8547	[EXIT_REASON_XSAVES]                  = handle_xsaves,
 8548	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 8549	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 8550	[EXIT_REASON_VMFUNC]                  = handle_vmfunc,
 8551	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 8552};
 8553
 8554static const int kvm_vmx_max_exit_handlers =
 8555	ARRAY_SIZE(kvm_vmx_exit_handlers);
 8556
 8557static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
 8558				       struct vmcs12 *vmcs12)
 8559{
 8560	unsigned long exit_qualification;
 8561	gpa_t bitmap, last_bitmap;
 8562	unsigned int port;
 8563	int size;
 8564	u8 b;
 8565
 8566	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
 8567		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
 8568
 8569	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 8570
 8571	port = exit_qualification >> 16;
 8572	size = (exit_qualification & 7) + 1;
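	/*
	 * Example (illustrative): a one-byte OUT to port 0x3f8 yields
	 * port = 0x3f8 and size = 1, so the loop below tests a single bit
	 * of io_bitmap_a, at byte 0x3f8 / 8 = 0x7f, bit 0x3f8 & 7 = 0.
	 */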
 8573
 8574	last_bitmap = (gpa_t)-1;
 8575	b = -1;
 8576
 8577	while (size > 0) {
 8578		if (port < 0x8000)
 8579			bitmap = vmcs12->io_bitmap_a;
 8580		else if (port < 0x10000)
 8581			bitmap = vmcs12->io_bitmap_b;
 8582		else
 8583			return true;
 8584		bitmap += (port & 0x7fff) / 8;
 8585
 8586		if (last_bitmap != bitmap)
 8587			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
 8588				return true;
 8589		if (b & (1 << (port & 7)))
 8590			return true;
 8591
 8592		port++;
 8593		size--;
 8594		last_bitmap = bitmap;
 8595	}
 8596
 8597	return false;
 8598}
 8599
 8600/*
  8601 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
 8602 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
 8603 * disinterest in the current event (read or write a specific MSR) by using an
 8604 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
 8605 */
 8606static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
 8607	struct vmcs12 *vmcs12, u32 exit_reason)
 8608{
 8609	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
 8610	gpa_t bitmap;
 8611
 8612	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
 8613		return true;
 8614
 8615	/*
 8616	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
 8617	 * for the four combinations of read/write and low/high MSR numbers.
 8618	 * First we need to figure out which of the four to use:
 8619	 */
 8620	bitmap = vmcs12->msr_bitmap;
 8621	if (exit_reason == EXIT_REASON_MSR_WRITE)
 8622		bitmap += 2048;
 8623	if (msr_index >= 0xc0000000) {
 8624		msr_index -= 0xc0000000;
 8625		bitmap += 1024;
 8626	}
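	/*
	 * Worked example (illustrative): a WRMSR to MSR_LSTAR (0xc0000082)
	 * selects the high-MSR write bitmap, i.e. bitmap += 2048 + 1024,
	 * and is then tested at byte 0x82 / 8 = 16, bit 0x82 & 7 = 2.
	 */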
 8627
 8628	/* Then read the msr_index'th bit from this bitmap: */
 8629	if (msr_index < 1024*8) {
 8630		unsigned char b;
 8631		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
 8632			return true;
 8633		return 1 & (b >> (msr_index & 7));
 8634	} else
 8635		return true; /* let L1 handle the wrong parameter */
 8636}
 8637
 8638/*
 8639 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
 8640 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 8641 * intercept (via guest_host_mask etc.) the current event.
 8642 */
 8643static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 8644	struct vmcs12 *vmcs12)
 8645{
 8646	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 8647	int cr = exit_qualification & 15;
 8648	int reg;
 8649	unsigned long val;
 8650
 8651	switch ((exit_qualification >> 4) & 3) {
 8652	case 0: /* mov to cr */
 8653		reg = (exit_qualification >> 8) & 15;
 8654		val = kvm_register_readl(vcpu, reg);
 8655		switch (cr) {
 8656		case 0:
 8657			if (vmcs12->cr0_guest_host_mask &
 8658			    (val ^ vmcs12->cr0_read_shadow))
 8659				return true;
 8660			break;
 8661		case 3:
 8662			if ((vmcs12->cr3_target_count >= 1 &&
 8663					vmcs12->cr3_target_value0 == val) ||
 8664				(vmcs12->cr3_target_count >= 2 &&
 8665					vmcs12->cr3_target_value1 == val) ||
 8666				(vmcs12->cr3_target_count >= 3 &&
 8667					vmcs12->cr3_target_value2 == val) ||
 8668				(vmcs12->cr3_target_count >= 4 &&
 8669					vmcs12->cr3_target_value3 == val))
 8670				return false;
 8671			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
 8672				return true;
 8673			break;
 8674		case 4:
 8675			if (vmcs12->cr4_guest_host_mask &
 8676			    (vmcs12->cr4_read_shadow ^ val))
 8677				return true;
 8678			break;
 8679		case 8:
 8680			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
 8681				return true;
 8682			break;
 8683		}
 8684		break;
 8685	case 2: /* clts */
 8686		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
 8687		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
 8688			return true;
 8689		break;
 8690	case 1: /* mov from cr */
 8691		switch (cr) {
 8692		case 3:
 8693			if (vmcs12->cpu_based_vm_exec_control &
 8694			    CPU_BASED_CR3_STORE_EXITING)
 8695				return true;
 8696			break;
 8697		case 8:
 8698			if (vmcs12->cpu_based_vm_exec_control &
 8699			    CPU_BASED_CR8_STORE_EXITING)
 8700				return true;
 8701			break;
 8702		}
 8703		break;
 8704	case 3: /* lmsw */
 8705		/*
 8706		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
 8707		 * cr0. Other attempted changes are ignored, with no exit.
 8708		 */
 8709		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
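		/*
		 * Example (illustrative): if L1 shadows CR0.TS
		 * (cr0_guest_host_mask bit 3 set, read shadow TS = 0) and L2
		 * executes LMSW with a source value whose TS bit is set, the
		 * first check below reflects the exit to L1 because a
		 * shadowed bit would change.
		 */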
 8710		if (vmcs12->cr0_guest_host_mask & 0xe &
 8711		    (val ^ vmcs12->cr0_read_shadow))
 8712			return true;
 8713		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
 8714		    !(vmcs12->cr0_read_shadow & 0x1) &&
 8715		    (val & 0x1))
 8716			return true;
 8717		break;
 8718	}
 8719	return false;
 8720}
 8721
 8722/*
 8723 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
 8724 * should handle it ourselves in L0 (and then continue L2). Only call this
 8725 * when in is_guest_mode (L2).
 8726 */
 8727static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 8728{
 8729	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 8730	struct vcpu_vmx *vmx = to_vmx(vcpu);
 8731	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 8732
 8733	if (vmx->nested.nested_run_pending)
 8734		return false;
 8735
 8736	if (unlikely(vmx->fail)) {
 8737		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
 8738				    vmcs_read32(VM_INSTRUCTION_ERROR));
 8739		return true;
 8740	}
 8741
 8742	/*
 8743	 * The host physical addresses of some pages of guest memory
 8744	 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
 8745	 * Page). The CPU may write to these pages via their host
 8746	 * physical address while L2 is running, bypassing any
 8747	 * address-translation-based dirty tracking (e.g. EPT write
 8748	 * protection).
 8749	 *
 8750	 * Mark them dirty on every exit from L2 to prevent them from
 8751	 * getting out of sync with dirty tracking.
 8752	 */
 8753	nested_mark_vmcs12_pages_dirty(vcpu);
 8754
 8755	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
 8756				vmcs_readl(EXIT_QUALIFICATION),
 8757				vmx->idt_vectoring_info,
 8758				intr_info,
 8759				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
 8760				KVM_ISA_VMX);
 8761
 8762	switch (exit_reason) {
 8763	case EXIT_REASON_EXCEPTION_NMI:
 8764		if (is_nmi(intr_info))
 8765			return false;
 8766		else if (is_page_fault(intr_info))
 8767			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
 8768		else if (is_no_device(intr_info) &&
 8769			 !(vmcs12->guest_cr0 & X86_CR0_TS))
 8770			return false;
 8771		else if (is_debug(intr_info) &&
 8772			 vcpu->guest_debug &
 8773			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
 8774			return false;
 8775		else if (is_breakpoint(intr_info) &&
 8776			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
 8777			return false;
 8778		return vmcs12->exception_bitmap &
 8779				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
 8780	case EXIT_REASON_EXTERNAL_INTERRUPT:
 8781		return false;
 8782	case EXIT_REASON_TRIPLE_FAULT:
 8783		return true;
 8784	case EXIT_REASON_PENDING_INTERRUPT:
 8785		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
 8786	case EXIT_REASON_NMI_WINDOW:
 8787		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
 8788	case EXIT_REASON_TASK_SWITCH:
 8789		return true;
 8790	case EXIT_REASON_CPUID:
 8791		return true;
 8792	case EXIT_REASON_HLT:
 8793		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
 8794	case EXIT_REASON_INVD:
 8795		return true;
 8796	case EXIT_REASON_INVLPG:
 8797		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
 8798	case EXIT_REASON_RDPMC:
 8799		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
 8800	case EXIT_REASON_RDRAND:
 8801		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
 8802	case EXIT_REASON_RDSEED:
 8803		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
 8804	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
 8805		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
 8806	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
 8807	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
 8808	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
 8809	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
 8810	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
 8811	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
 8812		/*
 8813		 * VMX instructions trap unconditionally. This allows L1 to
 8814		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
 8815		 */
 8816		return true;
 8817	case EXIT_REASON_CR_ACCESS:
 8818		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
 8819	case EXIT_REASON_DR_ACCESS:
 8820		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
 8821	case EXIT_REASON_IO_INSTRUCTION:
 8822		return nested_vmx_exit_handled_io(vcpu, vmcs12);
 8823	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
 8824		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
 8825	case EXIT_REASON_MSR_READ:
 8826	case EXIT_REASON_MSR_WRITE:
 8827		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
 8828	case EXIT_REASON_INVALID_STATE:
 8829		return true;
 8830	case EXIT_REASON_MWAIT_INSTRUCTION:
 8831		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
 8832	case EXIT_REASON_MONITOR_TRAP_FLAG:
 8833		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
 8834	case EXIT_REASON_MONITOR_INSTRUCTION:
 8835		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
 8836	case EXIT_REASON_PAUSE_INSTRUCTION:
 8837		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
 8838			nested_cpu_has2(vmcs12,
 8839				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
 8840	case EXIT_REASON_MCE_DURING_VMENTRY:
 8841		return false;
 8842	case EXIT_REASON_TPR_BELOW_THRESHOLD:
 8843		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
 8844	case EXIT_REASON_APIC_ACCESS:
 8845		return nested_cpu_has2(vmcs12,
 8846			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
 8847	case EXIT_REASON_APIC_WRITE:
 8848	case EXIT_REASON_EOI_INDUCED:
 8849		/* apic_write and eoi_induced should exit unconditionally. */
 8850		return true;
 8851	case EXIT_REASON_EPT_VIOLATION:
 8852		/*
 8853		 * L0 always deals with the EPT violation. If nested EPT is
 8854		 * used, and the nested mmu code discovers that the address is
 8855		 * missing in the guest EPT table (EPT12), the EPT violation
 8856		 * will be injected with nested_ept_inject_page_fault()
 8857		 */
 8858		return false;
 8859	case EXIT_REASON_EPT_MISCONFIG:
 8860		/*
  8861		 * L2 never uses L1's EPT directly, but rather L0's own EPT
  8862		 * table (shadow on EPT) or a merged EPT table that L0 built
  8863		 * (EPT on EPT). So any problem with the structure of the
  8864		 * table is L0's fault.
 8865		 */
 8866		return false;
 8867	case EXIT_REASON_INVPCID:
 8868		return
 8869			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
 8870			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
 8871	case EXIT_REASON_WBINVD:
 8872		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
 8873	case EXIT_REASON_XSETBV:
 8874		return true;
 8875	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
 8876		/*
 8877		 * This should never happen, since it is not possible to
 8878		 * set XSS to a non-zero value---neither in L1 nor in L2.
  8879		 * If it were, XSS would have to be checked against
 8880		 * the XSS exit bitmap in vmcs12.
 8881		 */
 8882		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
 8883	case EXIT_REASON_PREEMPTION_TIMER:
 8884		return false;
 8885	case EXIT_REASON_PML_FULL:
 8886		/* We emulate PML support to L1. */
 8887		return false;
 8888	case EXIT_REASON_VMFUNC:
 8889		/* VM functions are emulated through L2->L0 vmexits. */
 8890		return false;
 8891	default:
 8892		return true;
 8893	}
 8894}
 8895
 8896static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
 8897{
 8898	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 8899
 8900	/*
 8901	 * At this point, the exit interruption info in exit_intr_info
 8902	 * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
 8903	 * we need to query the in-kernel LAPIC.
 8904	 */
 8905	WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
 8906	if ((exit_intr_info &
 8907	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
 8908	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
 8909		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 8910		vmcs12->vm_exit_intr_error_code =
 8911			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 8912	}
 8913
 8914	nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
 8915			  vmcs_readl(EXIT_QUALIFICATION));
 8916	return 1;
 8917}
 8918
 8919static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 8920{
 8921	*info1 = vmcs_readl(EXIT_QUALIFICATION);
 8922	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 8923}
 8924
 8925static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
 8926{
 8927	if (vmx->pml_pg) {
 8928		__free_page(vmx->pml_pg);
 8929		vmx->pml_pg = NULL;
 8930	}
 8931}
 8932
 8933static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
 8934{
 8935	struct vcpu_vmx *vmx = to_vmx(vcpu);
 8936	u64 *pml_buf;
 8937	u16 pml_idx;
 8938
 8939	pml_idx = vmcs_read16(GUEST_PML_INDEX);
 8940
 8941	/* Do nothing if PML buffer is empty */
 8942	if (pml_idx == (PML_ENTITY_NUM - 1))
 8943		return;
 8944
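      	/*
      	 * Hardware fills the buffer from the end: GUEST_PML_INDEX starts at
      	 * PML_ENTITY_NUM - 1 and is decremented after each logged write, so
      	 * the valid GPAs are the entries above the current index (all of
      	 * them once the index has wrapped below zero).
      	 */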
 8945	/* PML index always points to next available PML buffer entity */
 8946	if (pml_idx >= PML_ENTITY_NUM)
 8947		pml_idx = 0;
 8948	else
 8949		pml_idx++;
 8950
 8951	pml_buf = page_address(vmx->pml_pg);
 8952	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
 8953		u64 gpa;
 8954
 8955		gpa = pml_buf[pml_idx];
 8956		WARN_ON(gpa & (PAGE_SIZE - 1));
 8957		kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
 8958	}
 8959
 8960	/* reset PML index */
 8961	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 8962}
 8963
 8964/*
 8965 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
 8966 * Called before reporting dirty_bitmap to userspace.
 8967 */
 8968static void kvm_flush_pml_buffers(struct kvm *kvm)
 8969{
 8970	int i;
 8971	struct kvm_vcpu *vcpu;
 8972	/*
  8973	 * We only need to kick each vcpu out of guest mode here: the PML buffer
  8974	 * is flushed at the beginning of every VMEXIT, so only vcpus currently
  8975	 * running in guest mode can have unflushed GPAs in their PML
  8976	 * buffers.
 8977	 */
 8978	kvm_for_each_vcpu(i, vcpu, kvm)
 8979		kvm_vcpu_kick(vcpu);
 8980}
 8981
 8982static void vmx_dump_sel(char *name, uint32_t sel)
 8983{
 8984	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
 8985	       name, vmcs_read16(sel),
 8986	       vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
 8987	       vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
 8988	       vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
 8989}
 8990
 8991static void vmx_dump_dtsel(char *name, uint32_t limit)
 8992{
 8993	pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
 8994	       name, vmcs_read32(limit),
 8995	       vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
 8996}
 8997
 8998static void dump_vmcs(void)
 8999{
 9000	u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
 9001	u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
 9002	u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 9003	u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
 9004	u32 secondary_exec_control = 0;
 9005	unsigned long cr4 = vmcs_readl(GUEST_CR4);
 9006	u64 efer = vmcs_read64(GUEST_IA32_EFER);
 9007	int i, n;
 9008
 9009	if (cpu_has_secondary_exec_ctrls())
 9010		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 9011
 9012	pr_err("*** Guest State ***\n");
 9013	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
 9014	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
 9015	       vmcs_readl(CR0_GUEST_HOST_MASK));
 9016	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
 9017	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
 9018	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
 9019	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
 9020	    (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
 9021	{
 9022		pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
 9023		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
 9024		pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
 9025		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
 9026	}
 9027	pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
 9028	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
 9029	pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
 9030	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
 9031	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
 9032	       vmcs_readl(GUEST_SYSENTER_ESP),
 9033	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
 9034	vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
 9035	vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
 9036	vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
 9037	vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
 9038	vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
 9039	vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
 9040	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
 9041	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
 9042	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
 9043	vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
 9044	if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
 9045	    (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
 9046		pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
 9047		       efer, vmcs_read64(GUEST_IA32_PAT));
 9048	pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
 9049	       vmcs_read64(GUEST_IA32_DEBUGCTL),
 9050	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
 9051	if (cpu_has_load_perf_global_ctrl &&
 9052	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
 9053		pr_err("PerfGlobCtl = 0x%016llx\n",
 9054		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
 9055	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
 9056		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
 9057	pr_err("Interruptibility = %08x  ActivityState = %08x\n",
 9058	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
 9059	       vmcs_read32(GUEST_ACTIVITY_STATE));
 9060	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
 9061		pr_err("InterruptStatus = %04x\n",
 9062		       vmcs_read16(GUEST_INTR_STATUS));
 9063
 9064	pr_err("*** Host State ***\n");
 9065	pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
 9066	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
 9067	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
 9068	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
 9069	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
 9070	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
 9071	       vmcs_read16(HOST_TR_SELECTOR));
 9072	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
 9073	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
 9074	       vmcs_readl(HOST_TR_BASE));
 9075	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
 9076	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
 9077	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
 9078	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
 9079	       vmcs_readl(HOST_CR4));
 9080	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
 9081	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
 9082	       vmcs_read32(HOST_IA32_SYSENTER_CS),
 9083	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
 9084	if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
 9085		pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
 9086		       vmcs_read64(HOST_IA32_EFER),
 9087		       vmcs_read64(HOST_IA32_PAT));
 9088	if (cpu_has_load_perf_global_ctrl &&
 9089	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
 9090		pr_err("PerfGlobCtl = 0x%016llx\n",
 9091		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
 9092
 9093	pr_err("*** Control State ***\n");
 9094	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
 9095	       pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
 9096	pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
 9097	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
 9098	       vmcs_read32(EXCEPTION_BITMAP),
 9099	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
 9100	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
 9101	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
 9102	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
 9103	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
 9104	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
 9105	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
 9106	       vmcs_read32(VM_EXIT_INTR_INFO),
 9107	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
 9108	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
 9109	pr_err("        reason=%08x qualification=%016lx\n",
 9110	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
 9111	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
 9112	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
 9113	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
 9114	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
 9115	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
 9116		pr_err("TSC Multiplier = 0x%016llx\n",
 9117		       vmcs_read64(TSC_MULTIPLIER));
 9118	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
 9119		pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
 9120	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
 9121		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
 9122	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
 9123		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
 9124	n = vmcs_read32(CR3_TARGET_COUNT);
  9125	for (i = 0; i + 1 < n; i += 2)
 9126		pr_err("CR3 target%u=%016lx target%u=%016lx\n",
 9127		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
 9128		       i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
 9129	if (i < n)
 9130		pr_err("CR3 target%u=%016lx\n",
 9131		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
 9132	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
 9133		pr_err("PLE Gap=%08x Window=%08x\n",
 9134		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
 9135	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
 9136		pr_err("Virtual processor ID = 0x%04x\n",
 9137		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
 9138}
 9139
 9140/*
 9141 * The guest has exited.  See if we can fix it or if we need userspace
 9142 * assistance.
 9143 */
 9144static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 9145{
 9146	struct vcpu_vmx *vmx = to_vmx(vcpu);
 9147	u32 exit_reason = vmx->exit_reason;
 9148	u32 vectoring_info = vmx->idt_vectoring_info;
 9149
 9150	trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
 9151
 9152	/*
  9153	 * Flush the PML buffer of logged GPAs so that dirty_bitmap is kept up
  9154	 * to date. Another benefit: in kvm_vm_ioctl_get_dirty_log, before
  9155	 * querying dirty_bitmap, we only need to kick all vcpus out of guest
  9156	 * mode, because if a vcpu is in root mode its PML buffer must already
  9157	 * have been flushed.
 9158	 */
 9159	if (enable_pml)
 9160		vmx_flush_pml_buffer(vcpu);
 9161
 9162	/* If guest state is invalid, start emulating */
 9163	if (vmx->emulation_required)
 9164		return handle_invalid_guest_state(vcpu);
 9165
 9166	if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
 9167		return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 9168
 9169	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
 9170		dump_vmcs();
 9171		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 9172		vcpu->run->fail_entry.hardware_entry_failure_reason
 9173			= exit_reason;
 9174		return 0;
 9175	}
 9176
 9177	if (unlikely(vmx->fail)) {
 9178		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 9179		vcpu->run->fail_entry.hardware_entry_failure_reason
 9180			= vmcs_read32(VM_INSTRUCTION_ERROR);
 9181		return 0;
 9182	}
 9183
 9184	/*
 9185	 * Note:
  9186	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
  9187	 * event delivery, since it indicates the guest is accessing MMIO.
  9188	 * The vm-exit could be triggered again after returning to the guest,
  9189	 * which would cause an infinite loop.
 9190	 */
 9191	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 9192			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
 9193			exit_reason != EXIT_REASON_EPT_VIOLATION &&
 9194			exit_reason != EXIT_REASON_PML_FULL &&
 9195			exit_reason != EXIT_REASON_TASK_SWITCH)) {
 9196		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 9197		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
 9198		vcpu->run->internal.ndata = 3;
 9199		vcpu->run->internal.data[0] = vectoring_info;
 9200		vcpu->run->internal.data[1] = exit_reason;
 9201		vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
 9202		if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
 9203			vcpu->run->internal.ndata++;
 9204			vcpu->run->internal.data[3] =
 9205				vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 9206		}
 9207		return 0;
 9208	}
 9209
 9210	if (unlikely(!enable_vnmi &&
 9211		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
 9212		if (vmx_interrupt_allowed(vcpu)) {
 9213			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
 9214		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
 9215			   vcpu->arch.nmi_pending) {
 9216			/*
  9217			 * This CPU doesn't support us in finding the end of an
  9218			 * NMI-blocked window if the guest runs with IRQs
  9219			 * disabled. So we pull the trigger after 1 s of
  9220			 * futile waiting, but inform the user about it.
 9221			 */
 9222			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
 9223			       "state on VCPU %d after 1 s timeout\n",
 9224			       __func__, vcpu->vcpu_id);
 9225			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
 9226		}
 9227	}
 9228
 9229	if (exit_reason < kvm_vmx_max_exit_handlers
 9230	    && kvm_vmx_exit_handlers[exit_reason])
 9231		return kvm_vmx_exit_handlers[exit_reason](vcpu);
 9232	else {
 9233		vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
 9234				exit_reason);
 9235		kvm_queue_exception(vcpu, UD_VECTOR);
 9236		return 1;
 9237	}
 9238}
 9239
 9240static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 9241{
 9242	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 9243
 9244	if (is_guest_mode(vcpu) &&
 9245		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
 9246		return;
 9247
 9248	if (irr == -1 || tpr < irr) {
 9249		vmcs_write32(TPR_THRESHOLD, 0);
 9250		return;
 9251	}
 9252
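      	/*
      	 * With the TPR shadow in use, the CPU raises a TPR-below-threshold
      	 * exit once the guest lowers its TPR below this value, i.e. below
      	 * the priority of the highest pending interrupt.
      	 */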
 9253	vmcs_write32(TPR_THRESHOLD, irr);
 9254}
 9255
 9256static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 9257{
 9258	u32 sec_exec_control;
 9259
 9260	/* Postpone execution until vmcs01 is the current VMCS. */
 9261	if (is_guest_mode(vcpu)) {
 9262		to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
 9263		return;
 9264	}
 9265
 9266	if (!cpu_has_vmx_virtualize_x2apic_mode())
 9267		return;
 9268
 9269	if (!cpu_need_tpr_shadow(vcpu))
 9270		return;
 9271
 9272	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 9273
 9274	if (set) {
 9275		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 9276		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 9277	} else {
 9278		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 9279		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 9280		vmx_flush_tlb(vcpu, true);
 9281	}
 9282	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
 9283
 9284	vmx_update_msr_bitmap(vcpu);
 9285}
 9286
 9287static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
 9288{
 9289	struct vcpu_vmx *vmx = to_vmx(vcpu);
 9290
 9291	/*
 9292	 * Currently we do not handle the nested case where L2 has an
 9293	 * APIC access page of its own; that page is still pinned.
 9294	 * Hence, we skip the case where the VCPU is in guest mode _and_
 9295	 * L1 prepared an APIC access page for L2.
 9296	 *
 9297	 * For the case where L1 and L2 share the same APIC access page
 9298	 * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
 9299	 * in the vmcs12), this function will only update either the vmcs01
 9300	 * or the vmcs02.  If the former, the vmcs02 will be updated by
 9301	 * prepare_vmcs02.  If the latter, the vmcs01 will be updated in
 9302	 * the next L2->L1 exit.
 9303	 */
 9304	if (!is_guest_mode(vcpu) ||
 9305	    !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
 9306			     SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
 9307		vmcs_write64(APIC_ACCESS_ADDR, hpa);
 9308		vmx_flush_tlb(vcpu, true);
 9309	}
 9310}
 9311
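      /*
       * The 16-bit guest interrupt-status field holds RVI (requesting virtual
       * interrupt) in its low byte and SVI (servicing virtual interrupt) in its
       * high byte; the two helpers below each update one half.
       */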
 9312static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
 9313{
 9314	u16 status;
 9315	u8 old;
 9316
 9317	if (max_isr == -1)
 9318		max_isr = 0;
 9319
 9320	status = vmcs_read16(GUEST_INTR_STATUS);
 9321	old = status >> 8;
 9322	if (max_isr != old) {
 9323		status &= 0xff;
 9324		status |= max_isr << 8;
 9325		vmcs_write16(GUEST_INTR_STATUS, status);
 9326	}
 9327}
 9328
 9329static void vmx_set_rvi(int vector)
 9330{
 9331	u16 status;
 9332	u8 old;
 9333
 9334	if (vector == -1)
 9335		vector = 0;
 9336
 9337	status = vmcs_read16(GUEST_INTR_STATUS);
 9338	old = (u8)status & 0xff;
 9339	if ((u8)vector != old) {
 9340		status &= ~0xff;
 9341		status |= (u8)vector;
 9342		vmcs_write16(GUEST_INTR_STATUS, status);
 9343	}
 9344}
 9345
 9346static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 9347{
 9348	/*
  9349	 * When running L2, updating RVI is only relevant when
  9350	 * vmcs12 virtual-interrupt-delivery is enabled.
  9351	 * However, that can be enabled only when L1 also
  9352	 * intercepts external interrupts, and in that case
  9353	 * we should not update vmcs02 RVI but instead intercept
  9354	 * the interrupt. Therefore, do nothing when running L2.
 9355	 */
 9356	if (!is_guest_mode(vcpu))
 9357		vmx_set_rvi(max_irr);
 9358}
 9359
 9360static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 9361{
 9362	struct vcpu_vmx *vmx = to_vmx(vcpu);
 9363	int max_irr;
 9364	bool max_irr_updated;
 9365
 9366	WARN_ON(!vcpu->arch.apicv_active);
 9367	if (pi_test_on(&vmx->pi_desc)) {
 9368		pi_clear_on(&vmx->pi_desc);
 9369		/*
 9370		 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
 9371		 * But on x86 this is just a compiler barrier anyway.
 9372		 */
 9373		smp_mb__after_atomic();
 9374		max_irr_updated =
 9375			kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
 9376
 9377		/*
 9378		 * If we are running L2 and L1 has a new pending interrupt
 9379		 * which can be injected, we should re-evaluate
 9380		 * what should be done with this new L1 interrupt.
 9381		 * If L1 intercepts external-interrupts, we should
 9382		 * exit from L2 to L1. Otherwise, interrupt should be
 9383		 * delivered directly to L2.
 9384		 */
 9385		if (is_guest_mode(vcpu) && max_irr_updated) {
 9386			if (nested_exit_on_intr(vcpu))
 9387				kvm_vcpu_exiting_guest_mode(vcpu);
 9388			else
 9389				kvm_make_request(KVM_REQ_EVENT, vcpu);
 9390		}
 9391	} else {
 9392		max_irr = kvm_lapic_find_highest_irr(vcpu);
 9393	}
 9394	vmx_hwapic_irr_update(vcpu, max_irr);
 9395	return max_irr;
 9396}
 9397
 9398static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 9399{
 9400	if (!kvm_vcpu_apicv_active(vcpu))
 9401		return;
 9402
 9403	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
 9404	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
 9405	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
 9406	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
 9407}
 9408
 9409static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 9410{
 9411	struct vcpu_vmx *vmx = to_vmx(vcpu);
 9412
 9413	pi_clear_on(&vmx->pi_desc);
 9414	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
 9415}
 9416
 9417static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 9418{
 9419	u32 exit_intr_info = 0;
 9420	u16 basic_exit_reason = (u16)vmx->exit_reason;
 9421
 9422	if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
 9423	      || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI))
 9424		return;
 9425
 9426	if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
 9427		exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 9428	vmx->exit_intr_info = exit_intr_info;
 9429
 9430	/* if exit due to PF check for async PF */
 9431	if (is_page_fault(exit_intr_info))
 9432		vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
 9433
 9434	/* Handle machine checks before interrupts are enabled */
 9435	if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY ||
 9436	    is_machine_check(exit_intr_info))
 9437		kvm_machine_check();
 9438
 9439	/* We need to handle NMIs before interrupts are enabled */
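      	/* "int $2" re-delivers the NMI so the host's NMI handler runs now. */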
 9440	if (is_nmi(exit_intr_info)) {
 9441		kvm_before_interrupt(&vmx->vcpu);
 9442		asm("int $2");
 9443		kvm_after_interrupt(&vmx->vcpu);
 9444	}
 9445}
 9446
 9447static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 9448{
 9449	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 9450
 9451	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
 9452			== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
 9453		unsigned int vector;
 9454		unsigned long entry;
 9455		gate_desc *desc;
 9456		struct vcpu_vmx *vmx = to_vmx(vcpu);
 9457#ifdef CONFIG_X86_64
 9458		unsigned long tmp;
 9459#endif
 9460
 9461		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
 9462		desc = (gate_desc *)vmx->host_idt_base + vector;
 9463		entry = gate_offset(desc);
 9464		asm volatile(
 9465#ifdef CONFIG_X86_64
 9466			"mov %%" _ASM_SP ", %[sp]\n\t"
 9467			"and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
 9468			"push $%c[ss]\n\t"
 9469			"push %[sp]\n\t"
 9470#endif
 9471			"pushf\n\t"
 9472			__ASM_SIZE(push) " $%c[cs]\n\t"
 9473			CALL_NOSPEC
 9474			:
 9475#ifdef CONFIG_X86_64
 9476			[sp]"=&r"(tmp),
 9477#endif
 9478			ASM_CALL_CONSTRAINT
 9479			:
 9480			THUNK_TARGET(entry),
 9481			[ss]"i"(__KERNEL_DS),
 9482			[cs]"i"(__KERNEL_CS)
 9483			);
 9484	}
 9485}
 9486STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
 9487
 9488static bool vmx_has_emulated_msr(int index)
 9489{
 9490	switch (index) {
 9491	case MSR_IA32_SMBASE:
 9492		/*
 9493		 * We cannot do SMM unless we can run the guest in big
 9494		 * real mode.
 9495		 */
 9496		return enable_unrestricted_guest || emulate_invalid_guest_state;
 9497	case MSR_AMD64_VIRT_SPEC_CTRL:
 9498		/* This is AMD only.  */
 9499		return false;
 9500	default:
 9501		return true;
 9502	}
 9503}
 9504
 9505static bool vmx_mpx_supported(void)
 9506{
 9507	return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
 9508		(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
 9509}
 9510
 9511static bool vmx_xsaves_supported(void)
 9512{
 9513	return vmcs_config.cpu_based_2nd_exec_ctrl &
 9514		SECONDARY_EXEC_XSAVES;
 9515}
 9516
 9517static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 9518{
 9519	u32 exit_intr_info;
 9520	bool unblock_nmi;
 9521	u8 vector;
 9522	bool idtv_info_valid;
 9523
 9524	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 9525
 9526	if (enable_vnmi) {
 9527		if (vmx->loaded_vmcs->nmi_known_unmasked)
 9528			return;
 9529		/*
 9530		 * Can't use vmx->exit_intr_info since we're not sure what
 9531		 * the exit reason is.
 9532		 */
 9533		exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 9534		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
 9535		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
 9536		/*
 9537		 * SDM 3: 27.7.1.2 (September 2008)
 9538		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
 9539		 * a guest IRET fault.
 9540		 * SDM 3: 23.2.2 (September 2008)
 9541		 * Bit 12 is undefined in any of the following cases:
 9542		 *  If the VM exit sets the valid bit in the IDT-vectoring
 9543		 *   information field.
 9544		 *  If the VM exit is due to a double fault.
 9545		 */
 9546		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
 9547		    vector != DF_VECTOR && !idtv_info_valid)
 9548			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 9549				      GUEST_INTR_STATE_NMI);
 9550		else
 9551			vmx->loaded_vmcs->nmi_known_unmasked =
 9552				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
 9553				  & GUEST_INTR_STATE_NMI);
 9554	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
 9555		vmx->loaded_vmcs->vnmi_blocked_time +=
 9556			ktime_to_ns(ktime_sub(ktime_get(),
 9557					      vmx->loaded_vmcs->entry_time));
 9558}
 9559
 9560static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
 9561				      u32 idt_vectoring_info,
 9562				      int instr_len_field,
 9563				      int error_code_field)
 9564{
 9565	u8 vector;
 9566	int type;
 9567	bool idtv_info_valid;
 9568
 9569	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 9570
 9571	vcpu->arch.nmi_injected = false;
 9572	kvm_clear_exception_queue(vcpu);
 9573	kvm_clear_interrupt_queue(vcpu);
 9574
 9575	if (!idtv_info_valid)
 9576		return;
 9577
 9578	kvm_make_request(KVM_REQ_EVENT, vcpu);
 9579
 9580	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 9581	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 9582
 9583	switch (type) {
 9584	case INTR_TYPE_NMI_INTR:
 9585		vcpu->arch.nmi_injected = true;
 9586		/*
 9587		 * SDM 3: 27.7.1.2 (September 2008)
  9588		 * Clear bit "block by NMI" before VM entry if an NMI
 9589		 * delivery faulted.
 9590		 */
 9591		vmx_set_nmi_mask(vcpu, false);
 9592		break;
 9593	case INTR_TYPE_SOFT_EXCEPTION:
 9594		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
 9595		/* fall through */
 9596	case INTR_TYPE_HARD_EXCEPTION:
 9597		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
 9598			u32 err = vmcs_read32(error_code_field);
 9599			kvm_requeue_exception_e(vcpu, vector, err);
 9600		} else
 9601			kvm_requeue_exception(vcpu, vector);
 9602		break;
 9603	case INTR_TYPE_SOFT_INTR:
 9604		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
 9605		/* fall through */
 9606	case INTR_TYPE_EXT_INTR:
 9607		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
 9608		break;
 9609	default:
 9610		break;
 9611	}
 9612}
 9613
 9614static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 9615{
 9616	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
 9617				  VM_EXIT_INSTRUCTION_LEN,
 9618				  IDT_VECTORING_ERROR_CODE);
 9619}
 9620
 9621static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 9622{
 9623	__vmx_complete_interrupts(vcpu,
 9624				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
 9625				  VM_ENTRY_INSTRUCTION_LEN,
 9626				  VM_ENTRY_EXCEPTION_ERROR_CODE);
 9627
 9628	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
 9629}
 9630
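      /*
       * Program perf's host/guest MSR pairs (e.g. the global control MSR) into
       * the VMCS atomic MSR-switch lists; entries whose guest value already
       * matches the host value are dropped to keep VM entry/exit cheap.
       */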
 9631static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 9632{
 9633	int i, nr_msrs;
 9634	struct perf_guest_switch_msr *msrs;
 9635
 9636	msrs = perf_guest_get_msrs(&nr_msrs);
 9637
 9638	if (!msrs)
 9639		return;
 9640
 9641	for (i = 0; i < nr_msrs; i++)
 9642		if (msrs[i].host == msrs[i].guest)
 9643			clear_atomic_switch_msr(vmx, msrs[i].msr);
 9644		else
 9645			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
 9646					msrs[i].host);
 9647}
 9648
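      /*
       * Program the VMX preemption timer to fire when the TSC reaches
       * vmx->hv_deadline_tsc.  Per the SDM the timer counts down at the TSC
       * rate divided by 2^N, with N taken from IA32_VMX_MISC, hence the shift
       * by cpu_preemption_timer_multi below.
       */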
 9649static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
 9650{
 9651	struct vcpu_vmx *vmx = to_vmx(vcpu);
 9652	u64 tscl;
 9653	u32 delta_tsc;
 9654
 9655	if (vmx->hv_deadline_tsc == -1)
 9656		return;
 9657
 9658	tscl = rdtsc();
 9659	if (vmx->hv_deadline_tsc > tscl)
  9660		/* guaranteed to fit in 32 bits because it was checked in set_hv_timer */
 9661		delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
 9662			cpu_preemption_timer_multi);
 9663	else
 9664		delta_tsc = 0;
 9665
 9666	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
 9667}
 9668
 9669static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 9670{
 9671	struct vcpu_vmx *vmx = to_vmx(vcpu);
 9672	unsigned long cr3, cr4, evmcs_rsp;
 9673
 9674	/* Record the guest's net vcpu time for enforced NMI injections. */
 9675	if (unlikely(!enable_vnmi &&
 9676		     vmx->loaded_vmcs->soft_vnmi_blocked))
 9677		vmx->loaded_vmcs->entry_time = ktime_get();
 9678
  9679	/* Don't enter VMX if guest state is invalid; let the exit handler
  9680	   start emulation until we arrive back at a valid state */
 9681	if (vmx->emulation_required)
 9682		return;
 9683
 9684	if (vmx->ple_window_dirty) {
 9685		vmx->ple_window_dirty = false;
 9686		vmcs_write32(PLE_WINDOW, vmx->ple_window);
 9687	}
 9688
 9689	if (vmx->nested.sync_shadow_vmcs) {
 9690		copy_vmcs12_to_shadow(vmx);
 9691		vmx->nested.sync_shadow_vmcs = false;
 9692	}
 9693
 9694	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
 9695		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
 9696	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
 9697		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 9698
 9699	cr3 = __get_current_cr3_fast();
 9700	if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
 9701		vmcs_writel(HOST_CR3, cr3);
 9702		vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
 9703	}
 9704
 9705	cr4 = cr4_read_shadow();
 9706	if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
 9707		vmcs_writel(HOST_CR4, cr4);
 9708		vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
 9709	}
 9710
 9711	/* When single-stepping over STI and MOV SS, we must clear the
 9712	 * corresponding interruptibility bits in the guest state. Otherwise
 9713	 * vmentry fails as it then expects bit 14 (BS) in pending debug
  9714	 * exceptions to be set, but that's not correct for the guest debugging
 9715	 * case. */
 9716	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 9717		vmx_set_interrupt_shadow(vcpu, 0);
 9718
 9719	if (static_cpu_has(X86_FEATURE_PKU) &&
 9720	    kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
 9721	    vcpu->arch.pkru != vmx->host_pkru)
 9722		__write_pkru(vcpu->arch.pkru);
 9723
 9724	atomic_switch_perf_msrs(vmx);
 9725
 9726	vmx_arm_hv_timer(vcpu);
 9727
 9728	/*
 9729	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
 9730	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
 9731	 * is no need to worry about the conditional branch over the wrmsr
 9732	 * being speculatively taken.
 9733	 */
 9734	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
 9735
 9736	vmx->__launched = vmx->loaded_vmcs->launched;
 9737
 9738	evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
 9739		(unsigned long)&current_evmcs->host_rsp : 0;
 9740
 9741	asm(
 9742		/* Store host registers */
 9743		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
 9744		"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
 9745		"push %%" _ASM_CX " \n\t"
 9746		"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
 9747		"je 1f \n\t"
 9748		"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
 9749		/* Avoid VMWRITE when Enlightened VMCS is in use */
 9750		"test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
 9751		"jz 2f \n\t"
 9752		"mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
 9753		"jmp 1f \n\t"
 9754		"2: \n\t"
 9755		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
 9756		"1: \n\t"
 9757		/* Reload cr2 if changed */
 9758		"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
 9759		"mov %%cr2, %%" _ASM_DX " \n\t"
 9760		"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
 9761		"je 3f \n\t"
 9762		"mov %%" _ASM_AX", %%cr2 \n\t"
 9763		"3: \n\t"
  9764		/* Check if vmlaunch or vmresume is needed */
 9765		"cmpl $0, %c[launched](%0) \n\t"
 9766		/* Load guest registers.  Don't clobber flags. */
 9767		"mov %c[rax](%0), %%" _ASM_AX " \n\t"
 9768		"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
 9769		"mov %c[rdx](%0), %%" _ASM_DX " \n\t"
 9770		"mov %c[rsi](%0), %%" _ASM_SI " \n\t"
 9771		"mov %c[rdi](%0), %%" _ASM_DI " \n\t"
 9772		"mov %c[rbp](%0), %%" _ASM_BP " \n\t"
 9773#ifdef CONFIG_X86_64
 9774		"mov %c[r8](%0),  %%r8  \n\t"
 9775		"mov %c[r9](%0),  %%r9  \n\t"
 9776		"mov %c[r10](%0), %%r10 \n\t"
 9777		"mov %c[r11](%0), %%r11 \n\t"
 9778		"mov %c[r12](%0), %%r12 \n\t"
 9779		"mov %c[r13](%0), %%r13 \n\t"
 9780		"mov %c[r14](%0), %%r14 \n\t"
 9781		"mov %c[r15](%0), %%r15 \n\t"
 9782#endif
 9783		"mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
 9784
 9785		/* Enter guest mode */
 9786		"jne 1f \n\t"
 9787		__ex(ASM_VMX_VMLAUNCH) "\n\t"
 9788		"jmp 2f \n\t"
 9789		"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
 9790		"2: "
 9791		/* Save guest registers, load host registers, keep flags */
 9792		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
 9793		"pop %0 \n\t"
 9794		"setbe %c[fail](%0)\n\t"
 9795		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
 9796		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
 9797		__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
 9798		"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
 9799		"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
 9800		"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
 9801		"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
 9802#ifdef CONFIG_X86_64
 9803		"mov %%r8,  %c[r8](%0) \n\t"
 9804		"mov %%r9,  %c[r9](%0) \n\t"
 9805		"mov %%r10, %c[r10](%0) \n\t"
 9806		"mov %%r11, %c[r11](%0) \n\t"
 9807		"mov %%r12, %c[r12](%0) \n\t"
 9808		"mov %%r13, %c[r13](%0) \n\t"
 9809		"mov %%r14, %c[r14](%0) \n\t"
 9810		"mov %%r15, %c[r15](%0) \n\t"
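      		/*
      		 * Clear GPRs that held guest values so stale guest data cannot
      		 * be used speculatively (or leak) after the VM exit; the
      		 * remaining registers are cleared below.
      		 */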
 9811		"xor %%r8d,  %%r8d \n\t"
 9812		"xor %%r9d,  %%r9d \n\t"
 9813		"xor %%r10d, %%r10d \n\t"
 9814		"xor %%r11d, %%r11d \n\t"
 9815		"xor %%r12d, %%r12d \n\t"
 9816		"xor %%r13d, %%r13d \n\t"
 9817		"xor %%r14d, %%r14d \n\t"
 9818		"xor %%r15d, %%r15d \n\t"
 9819#endif
 9820		"mov %%cr2, %%" _ASM_AX "   \n\t"
 9821		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
 9822
 9823		"xor %%eax, %%eax \n\t"
 9824		"xor %%ebx, %%ebx \n\t"
 9825		"xor %%esi, %%esi \n\t"
 9826		"xor %%edi, %%edi \n\t"
 9827		"pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
 9828		".pushsection .rodata \n\t"
 9829		".global vmx_return \n\t"
 9830		"vmx_return: " _ASM_PTR " 2b \n\t"
 9831		".popsection"
 9832	      : : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
 9833		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
 9834		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
 9835		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
 9836		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
 9837		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
 9838		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
 9839		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
 9840		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
 9841		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
 9842		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
 9843#ifdef CONFIG_X86_64
 9844		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
 9845		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
 9846		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
 9847		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
 9848		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
 9849		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
 9850		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
 9851		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
 9852#endif
 9853		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
 9854		[wordsize]"i"(sizeof(ulong))
 9855	      : "cc", "memory"
 9856#ifdef CONFIG_X86_64
 9857		, "rax", "rbx", "rdi"
 9858		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
 9859#else
 9860		, "eax", "ebx", "edi"
 9861#endif
 9862	      );
 9863
 9864	/*
 9865	 * We do not use IBRS in the kernel. If this vCPU has used the
 9866	 * SPEC_CTRL MSR it may have left it on; save the value and
 9867	 * turn it off. This is much more efficient than blindly adding
 9868	 * it to the atomic save/restore list. Especially as the former
 9869	 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
 9870	 *
 9871	 * For non-nested case:
 9872	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
 9873	 * save it.
 9874	 *
 9875	 * For nested case:
 9876	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 9877	 * save it.
 9878	 */
 9879	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
 9880		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 9881
 9882	x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
 9883
 9884	/* Eliminate branch target predictions from guest mode */
 9885	vmexit_fill_RSB();
 9886
 9887	/* All fields are clean at this point */
 9888	if (static_branch_unlikely(&enable_evmcs))
 9889		current_evmcs->hv_clean_fields |=
 9890			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
 9891
 9892	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
 9893	if (vmx->host_debugctlmsr)
 9894		update_debugctlmsr(vmx->host_debugctlmsr);
 9895
 9896#ifndef CONFIG_X86_64
 9897	/*
 9898	 * The sysexit path does not restore ds/es, so we must set them to
 9899	 * a reasonable value ourselves.
 9900	 *
 9901	 * We can't defer this to vmx_load_host_state() since that function
  9902	 * may be executed in interrupt context, which saves and restores segments
 9903	 * around it, nullifying its effect.
 9904	 */
 9905	loadsegment(ds, __USER_DS);
 9906	loadsegment(es, __USER_DS);
 9907#endif
 9908
 9909	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
 9910				  | (1 << VCPU_EXREG_RFLAGS)
 9911				  | (1 << VCPU_EXREG_PDPTR)
 9912				  | (1 << VCPU_EXREG_SEGMENTS)
 9913				  | (1 << VCPU_EXREG_CR3));
 9914	vcpu->arch.regs_dirty = 0;
 9915
 9916	/*
  9917	 * The eager FPU is enabled if PKEY is supported and CR4 has been
  9918	 * switched back on the host, so it is safe to read the guest PKRU
  9919	 * from the current XSAVE area.
 9920	 */
 9921	if (static_cpu_has(X86_FEATURE_PKU) &&
 9922	    kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
 9923		vcpu->arch.pkru = __read_pkru();
 9924		if (vcpu->arch.pkru != vmx->host_pkru)
 9925			__write_pkru(vmx->host_pkru);
 9926	}
 9927
 9928	vmx->nested.nested_run_pending = 0;
 9929	vmx->idt_vectoring_info = 0;
 9930
 9931	vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
 9932	if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
 9933		return;
 9934
 9935	vmx->loaded_vmcs->launched = 1;
 9936	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 9937
 9938	vmx_complete_atomic_exit(vmx);
 9939	vmx_recover_nmi_blocking(vmx);
 9940	vmx_complete_interrupts(vmx);
 9941}
 9942STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
 9943
 9944static struct kvm *vmx_vm_alloc(void)
 9945{
 9946	struct kvm_vmx *kvm_vmx = kzalloc(sizeof(struct kvm_vmx), GFP_KERNEL);
 9947	return &kvm_vmx->kvm;
 9948}
 9949
 9950static void vmx_vm_free(struct kvm *kvm)
 9951{
 9952	kfree(to_kvm_vmx(kvm));
 9953}
 9954
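      /*
       * Make @vmcs the vcpu's current loaded VMCS: the put/load pair below
       * re-runs the regular vcpu load path so the new VMCS becomes current on
       * this CPU and its host-state fields are refreshed.
       */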
 9955static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 9956{
 9957	struct vcpu_vmx *vmx = to_vmx(vcpu);
 9958	int cpu;
 9959
 9960	if (vmx->loaded_vmcs == vmcs)
 9961		return;
 9962
 9963	cpu = get_cpu();
 9964	vmx->loaded_vmcs = vmcs;
 9965	vmx_vcpu_put(vcpu);
 9966	vmx_vcpu_load(vcpu, cpu);
 9967	put_cpu();
 9968}
 9969
 9970/*
 9971 * Ensure that the current vmcs of the logical processor is the
 9972 * vmcs01 of the vcpu before calling free_nested().
 9973 */
 9974static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 9975{
  9976	struct vcpu_vmx *vmx = to_vmx(vcpu);
  9977
  9978	vcpu_load(vcpu);
  9979	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
  9980	free_nested(vmx);
  9981	vcpu_put(vcpu);
 9982}
 9983
 9984static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 9985{
 9986	struct vcpu_vmx *vmx = to_vmx(vcpu);
 9987
 9988	if (enable_pml)
 9989		vmx_destroy_pml_buffer(vmx);
 9990	free_vpid(vmx->vpid);
 9991	leave_guest_mode(vcpu);
 9992	vmx_free_vcpu_nested(vcpu);
 9993	free_loaded_vmcs(vmx->loaded_vmcs);
 9994	kfree(vmx->guest_msrs);
 9995	kvm_vcpu_uninit(vcpu);
 9996	kmem_cache_free(kvm_vcpu_cache, vmx);
 9997}
 9998
 9999static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
10000{
10001	int err;
10002	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
10003	unsigned long *msr_bitmap;
10004	int cpu;
10005
10006	if (!vmx)
10007		return ERR_PTR(-ENOMEM);
10008
10009	vmx->vpid = allocate_vpid();
10010
10011	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
10012	if (err)
10013		goto free_vcpu;
10014
10015	err = -ENOMEM;
10016
10017	/*
 10018	 * If PML is turned on, failure to enable PML just results in failure
 10019	 * to create the vcpu, so we can simplify the PML logic (by avoiding
 10020	 * having to deal with cases such as enabling PML partially on vcpus
 10021	 * for the guest, etc.).
10022	 */
10023	if (enable_pml) {
10024		vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
10025		if (!vmx->pml_pg)
10026			goto uninit_vcpu;
10027	}
10028
10029	vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
10030	BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
10031		     > PAGE_SIZE);
10032
10033	if (!vmx->guest_msrs)
10034		goto free_pml;
10035
10036	err = alloc_loaded_vmcs(&vmx->vmcs01);
10037	if (err < 0)
10038		goto free_msrs;
10039
10040	msr_bitmap = vmx->vmcs01.msr_bitmap;
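      	/*
      	 * The bitmap is initialized with all MSRs intercepted; pass through
      	 * the handful of frequently accessed, non-sensitive MSRs below and
      	 * keep everything else intercepted by default.
      	 */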
10041	vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
10042	vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
10043	vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
10044	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
10045	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
10046	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
10047	vmx->msr_bitmap_mode = 0;
10048
10049	vmx->loaded_vmcs = &vmx->vmcs01;
10050	cpu = get_cpu();
10051	vmx_vcpu_load(&vmx->vcpu, cpu);
10052	vmx->vcpu.cpu = cpu;
10053	vmx_vcpu_setup(vmx);
10054	vmx_vcpu_put(&vmx->vcpu);
10055	put_cpu();
10056	if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
10057		err = alloc_apic_access_page(kvm);
10058		if (err)
10059			goto free_vmcs;
10060	}
10061
10062	if (enable_ept && !enable_unrestricted_guest) {
10063		err = init_rmode_identity_map(kvm);
10064		if (err)
10065			goto free_vmcs;
10066	}
10067
10068	if (nested) {
10069		nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
10070					   kvm_vcpu_apicv_active(&vmx->vcpu));
10071		vmx->nested.vpid02 = allocate_vpid();
10072	}
10073
10074	vmx->nested.posted_intr_nv = -1;
10075	vmx->nested.current_vmptr = -1ull;
10076
10077	vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
10078
10079	/*
10080	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
10081	 * or POSTED_INTR_WAKEUP_VECTOR.
10082	 */
10083	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
10084	vmx->pi_desc.sn = 1;
10085
10086	return &vmx->vcpu;
10087
10088free_vmcs:
10089	free_vpid(vmx->nested.vpid02);
10090	free_loaded_vmcs(vmx->loaded_vmcs);
10091free_msrs:
10092	kfree(vmx->guest_msrs);
10093free_pml:
10094	vmx_destroy_pml_buffer(vmx);
10095uninit_vcpu:
10096	kvm_vcpu_uninit(&vmx->vcpu);
10097free_vcpu:
10098	free_vpid(vmx->vpid);
10099	kmem_cache_free(kvm_vcpu_cache, vmx);
10100	return ERR_PTR(err);
10101}
10102
10103static int vmx_vm_init(struct kvm *kvm)
10104{
10105	if (!ple_gap)
10106		kvm->arch.pause_in_guest = true;
10107	return 0;
10108}
10109
10110static void __init vmx_check_processor_compat(void *rtn)
10111{
10112	struct vmcs_config vmcs_conf;
10113
10114	*(int *)rtn = 0;
10115	if (setup_vmcs_config(&vmcs_conf) < 0)
10116		*(int *)rtn = -EIO;
10117	nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
10118	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
10119		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
10120				smp_processor_id());
10121		*(int *)rtn = -EIO;
10122	}
10123}
10124
10125static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
10126{
10127	u8 cache;
10128	u64 ipat = 0;
10129
10130	/* For VT-d and EPT combination
10131	 * 1. MMIO: always map as UC
10132	 * 2. EPT with VT-d:
10133	 *   a. VT-d without snooping control feature: can't guarantee the
10134	 *	result, try to trust guest.
10135	 *   b. VT-d with snooping control feature: snooping control feature of
10136	 *	VT-d engine can guarantee the cache correctness. Just set it
10137	 *	to WB to keep consistent with host. So the same as item 3.
10138	 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
10139	 *    consistent with host MTRR
10140	 */
10141	if (is_mmio) {
10142		cache = MTRR_TYPE_UNCACHABLE;
10143		goto exit;
10144	}
10145
10146	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
10147		ipat = VMX_EPT_IPAT_BIT;
10148		cache = MTRR_TYPE_WRBACK;
10149		goto exit;
10150	}
10151
10152	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
10153		ipat = VMX_EPT_IPAT_BIT;
10154		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
10155			cache = MTRR_TYPE_WRBACK;
10156		else
10157			cache = MTRR_TYPE_UNCACHABLE;
10158		goto exit;
10159	}
10160
10161	cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
10162
10163exit:
10164	return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
10165}
10166
10167static int vmx_get_lpage_level(void)
10168{
10169	if (enable_ept && !cpu_has_vmx_ept_1g_page())
10170		return PT_DIRECTORY_LEVEL;
10171	else
 10172		/* Shadow paging, and EPT with 1GB page support, can map 1GB pages */
10173		return PT_PDPE_LEVEL;
10174}
10175
10176static void vmcs_set_secondary_exec_control(u32 new_ctl)
10177{
10178	/*
10179	 * These bits in the secondary execution controls field
10180	 * are dynamic, the others are mostly based on the hypervisor
10181	 * architecture and the guest's CPUID.  Do not touch the
10182	 * dynamic bits.
10183	 */
10184	u32 mask =
10185		SECONDARY_EXEC_SHADOW_VMCS |
10186		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
10187		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
10188		SECONDARY_EXEC_DESC;
10189
10190	u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
10191
10192	vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
10193		     (new_ctl & ~mask) | (cur_ctl & mask));
10194}
10195
10196/*
10197 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
10198 * (indicating "allowed-1") if they are supported in the guest's CPUID.
10199 */
10200static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
10201{
10202	struct vcpu_vmx *vmx = to_vmx(vcpu);
10203	struct kvm_cpuid_entry2 *entry;
10204
10205	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
10206	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
10207
10208#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
10209	if (entry && (entry->_reg & (_cpuid_mask)))			\
10210		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);	\
10211} while (0)
10212
10213	entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
10214	cr4_fixed1_update(X86_CR4_VME,        edx, bit(X86_FEATURE_VME));
10215	cr4_fixed1_update(X86_CR4_PVI,        edx, bit(X86_FEATURE_VME));
10216	cr4_fixed1_update(X86_CR4_TSD,        edx, bit(X86_FEATURE_TSC));
10217	cr4_fixed1_update(X86_CR4_DE,         edx, bit(X86_FEATURE_DE));
10218	cr4_fixed1_update(X86_CR4_PSE,        edx, bit(X86_FEATURE_PSE));
10219	cr4_fixed1_update(X86_CR4_PAE,        edx, bit(X86_FEATURE_PAE));
10220	cr4_fixed1_update(X86_CR4_MCE,        edx, bit(X86_FEATURE_MCE));
10221	cr4_fixed1_update(X86_CR4_PGE,        edx, bit(X86_FEATURE_PGE));
10222	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, bit(X86_FEATURE_FXSR));
10223	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
10224	cr4_fixed1_update(X86_CR4_VMXE,       ecx, bit(X86_FEATURE_VMX));
10225	cr4_fixed1_update(X86_CR4_SMXE,       ecx, bit(X86_FEATURE_SMX));
10226	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, bit(X86_FEATURE_PCID));
10227	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, bit(X86_FEATURE_XSAVE));
10228
10229	entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
10230	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, bit(X86_FEATURE_FSGSBASE));
10231	cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));
10232	cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));
10233	cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU));
10234	cr4_fixed1_update(X86_CR4_UMIP,       ecx, bit(X86_FEATURE_UMIP));
10235
10236#undef cr4_fixed1_update
10237}
10238
10239static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
10240{
10241	struct vcpu_vmx *vmx = to_vmx(vcpu);
10242
10243	if (cpu_has_secondary_exec_ctrls()) {
10244		vmx_compute_secondary_exec_control(vmx);
10245		vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
10246	}
10247
10248	if (nested_vmx_allowed(vcpu))
10249		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
10250			FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
10251	else
10252		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
10253			~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
10254
10255	if (nested_vmx_allowed(vcpu))
10256		nested_vmx_cr_fixed1_bits_update(vcpu);
10257}
10258
10259static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
10260{
10261	if (func == 1 && nested)
10262		entry->ecx |= bit(X86_FEATURE_VMX);
10263}
10264
10265static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
10266		struct x86_exception *fault)
10267{
10268	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10269	struct vcpu_vmx *vmx = to_vmx(vcpu);
10270	u32 exit_reason;
10271	unsigned long exit_qualification = vcpu->arch.exit_qualification;
10272
10273	if (vmx->nested.pml_full) {
10274		exit_reason = EXIT_REASON_PML_FULL;
10275		vmx->nested.pml_full = false;
10276		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
10277	} else if (fault->error_code & PFERR_RSVD_MASK)
10278		exit_reason = EXIT_REASON_EPT_MISCONFIG;
10279	else
10280		exit_reason = EXIT_REASON_EPT_VIOLATION;
10281
10282	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
10283	vmcs12->guest_physical_address = fault->address;
10284}
10285
10286static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
10287{
10288	return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
10289}
10290
10291/* Callbacks for nested_ept_init_mmu_context: */
10292
10293static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
10294{
10295	/* return the page table to be shadowed - in our case, EPT12 */
10296	return get_vmcs12(vcpu)->ept_pointer;
10297}
10298
10299static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
10300{
10301	WARN_ON(mmu_is_nested(vcpu));
10302	if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
10303		return 1;
10304
10305	kvm_mmu_unload(vcpu);
10306	kvm_init_shadow_ept_mmu(vcpu,
10307			to_vmx(vcpu)->nested.msrs.ept_caps &
10308			VMX_EPT_EXECUTE_ONLY_BIT,
10309			nested_ept_ad_enabled(vcpu));
10310	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
10311	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
10312	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
10313
10314	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
10315	return 0;
10316}
10317
10318static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
10319{
10320	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
10321}
10322
10323static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
10324					    u16 error_code)
10325{
10326	bool inequality, bit;
10327
10328	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
10329	inequality =
10330		(error_code & vmcs12->page_fault_error_code_mask) !=
10331		 vmcs12->page_fault_error_code_match;
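	/*
	 * For example, with EB.PF set and PFEC_MASK = PFEC_MATCH = 0, every
	 * error code "matches" (inequality is false), so inequality ^ bit is
	 * true and every L2 page fault is reflected to L1.
	 */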
10332	return inequality ^ bit;
10333}
10334
10335static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
10336		struct x86_exception *fault)
10337{
10338	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10339
10340	WARN_ON(!is_guest_mode(vcpu));
10341
10342	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
10343		!to_vmx(vcpu)->nested.nested_run_pending) {
10344		vmcs12->vm_exit_intr_error_code = fault->error_code;
10345		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
10346				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
10347				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
10348				  fault->address);
10349	} else {
10350		kvm_inject_page_fault(vcpu, fault);
10351	}
10352}
10353
10354static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
10355						 struct vmcs12 *vmcs12);
10356
10357static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
10358					struct vmcs12 *vmcs12)
10359{
10360	struct vcpu_vmx *vmx = to_vmx(vcpu);
10361	struct page *page;
10362	u64 hpa;
10363
10364	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
10365		/*
10366		 * Translate L1 physical address to host physical
10367		 * address for vmcs02. Keep the page pinned, so this
10368		 * physical address remains valid. We keep a reference
10369		 * to it so we can release it later.
10370		 */
10371		if (vmx->nested.apic_access_page) { /* shouldn't happen */
10372			kvm_release_page_dirty(vmx->nested.apic_access_page);
10373			vmx->nested.apic_access_page = NULL;
10374		}
10375		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
10376		/*
10377		 * If translation failed, no matter: This feature asks
10378		 * to exit when accessing the given address, and if it
10379		 * can never be accessed, this feature won't do
10380		 * anything anyway.
10381		 */
10382		if (!is_error_page(page)) {
10383			vmx->nested.apic_access_page = page;
10384			hpa = page_to_phys(vmx->nested.apic_access_page);
10385			vmcs_write64(APIC_ACCESS_ADDR, hpa);
10386		} else {
10387			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
10388					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
10389		}
10390	} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
10391		   cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
10392		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
10393			      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
10394		kvm_vcpu_reload_apic_access_page(vcpu);
10395	}
10396
10397	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
10398		if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
10399			kvm_release_page_dirty(vmx->nested.virtual_apic_page);
10400			vmx->nested.virtual_apic_page = NULL;
10401		}
10402		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
10403
10404		/*
10405		 * If translation failed, VM entry will fail because
10406		 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
10407		 * Failing the vm entry is _not_ what the processor
10408		 * does but it's basically the only possibility we
10409		 * have.  We could still enter the guest if CR8 load
10410		 * exits are enabled, CR8 store exits are enabled, and
10411		 * virtualize APIC access is disabled; in this case
10412		 * the processor would never use the TPR shadow and we
10413		 * could simply clear the bit from the execution
10414		 * control.  But such a configuration is useless, so
10415		 * let's keep the code simple.
10416		 */
10417		if (!is_error_page(page)) {
10418			vmx->nested.virtual_apic_page = page;
10419			hpa = page_to_phys(vmx->nested.virtual_apic_page);
10420			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
10421		}
10422	}
10423
10424	if (nested_cpu_has_posted_intr(vmcs12)) {
10425		if (vmx->nested.pi_desc_page) { /* shouldn't happen */
10426			kunmap(vmx->nested.pi_desc_page);
10427			kvm_release_page_dirty(vmx->nested.pi_desc_page);
10428			vmx->nested.pi_desc_page = NULL;
10429		}
10430		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
10431		if (is_error_page(page))
10432			return;
10433		vmx->nested.pi_desc_page = page;
10434		vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
10435		vmx->nested.pi_desc =
10436			(struct pi_desc *)((void *)vmx->nested.pi_desc +
10437			(unsigned long)(vmcs12->posted_intr_desc_addr &
10438			(PAGE_SIZE - 1)));
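		/*
		 * pi_desc now points at the descriptor inside the mapped
		 * page: kmap() above returns the page's kernel virtual
		 * address, and the page offset of the L1-supplied GPA
		 * locates the descriptor within it.
		 */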
10439		vmcs_write64(POSTED_INTR_DESC_ADDR,
10440			page_to_phys(vmx->nested.pi_desc_page) +
10441			(unsigned long)(vmcs12->posted_intr_desc_addr &
10442			(PAGE_SIZE - 1)));
10443	}
10444	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
10445		vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
10446			      CPU_BASED_USE_MSR_BITMAPS);
10447	else
10448		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
10449				CPU_BASED_USE_MSR_BITMAPS);
10450}
10451
10452static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
10453{
10454	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
10455	struct vcpu_vmx *vmx = to_vmx(vcpu);
10456
10457	if (vcpu->arch.virtual_tsc_khz == 0)
10458		return;
10459
10460	/* Make sure short timeouts reliably trigger an immediate vmexit.
10461	 * hrtimer_start does not guarantee this. */
10462	if (preemption_timeout <= 1) {
10463		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
10464		return;
10465	}
10466
10467	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
10468	preemption_timeout *= 1000000;
10469	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
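	/*
	 * Worked example (illustrative values): a vmcs12 timer value of 1000
	 * with the emulated rate of 5 is 1000 << 5 = 32000 TSC cycles; at
	 * virtual_tsc_khz = 2000000 (a 2 GHz guest TSC) the division above
	 * yields 32000 * 1000000 / 2000000 = 16000 ns, so the hrtimer below
	 * fires after roughly 16 us.
	 */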
10470	hrtimer_start(&vmx->nested.preemption_timer,
10471		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
10472}
10473
10474static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
10475					       struct vmcs12 *vmcs12)
10476{
10477	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
10478		return 0;
10479
10480	if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
10481	    !page_address_valid(vcpu, vmcs12->io_bitmap_b))
10482		return -EINVAL;
10483
10484	return 0;
10485}
10486
10487static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
10488						struct vmcs12 *vmcs12)
10489{
10490	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
10491		return 0;
10492
10493	if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
10494		return -EINVAL;
10495
10496	return 0;
10497}
10498
10499static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
10500						struct vmcs12 *vmcs12)
10501{
10502	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
10503		return 0;
10504
10505	if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
10506		return -EINVAL;
10507
10508	return 0;
10509}
10510
10511/*
10512 * Merge L0's and L1's MSR bitmaps; return false to indicate that
10513 * we do not use the hardware MSR bitmap.
10514 */
10515static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
10516						 struct vmcs12 *vmcs12)
10517{
10518	int msr;
10519	struct page *page;
10520	unsigned long *msr_bitmap_l1;
10521	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
10522	/*
10523	 * pred_cmd & spec_ctrl are trying to verify two things:
10524	 *
10525	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
10526	 *    ensures that we do not accidentally generate an L02 MSR bitmap
10527	 *    from the L12 MSR bitmap that is too permissive.
10528	 * 2. That L1 or L2s have actually used the MSR. This avoids
10529	 *    unnecessary merging of the bitmap if the MSR is unused. This
10530	 *    works properly because we only update the L01 MSR bitmap lazily.
10531	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
10532	 *    updated to reflect this when L1 (or its L2s) actually write to
10533	 *    the MSR.
10534	 */
10535	bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
10536	bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
10537
10538	/* Nothing to do if the MSR bitmap is not in use.  */
10539	if (!cpu_has_vmx_msr_bitmap() ||
10540	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
10541		return false;
10542
10543	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
10544	    !pred_cmd && !spec_ctrl)
10545		return false;
10546
10547	page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
10548	if (is_error_page(page))
10549		return false;
10550
10551	msr_bitmap_l1 = (unsigned long *)kmap(page);
10552	if (nested_cpu_has_apic_reg_virt(vmcs12)) {
10553		/*
10554		 * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it
10555		 * just lets the processor take the value from the virtual-APIC page;
10556		 * take those 256 bits directly from the L1 bitmap.
10557		 */
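		/*
		 * Layout note: msr_bitmap_l0[word] holds the read intercepts
		 * for MSRs 0x800-0x8ff, and the same word at byte offset
		 * 0x800 holds the write intercepts for those MSRs; writes
		 * stay intercepted (~0) here and are only opened up
		 * selectively below.
		 */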
10558		for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
10559			unsigned word = msr / BITS_PER_LONG;
10560			msr_bitmap_l0[word] = msr_bitmap_l1[word];
10561			msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
10562		}
10563	} else {
10564		for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
10565			unsigned word = msr / BITS_PER_LONG;
10566			msr_bitmap_l0[word] = ~0;
10567			msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
10568		}
10569	}
10570
10571	nested_vmx_disable_intercept_for_msr(
10572		msr_bitmap_l1, msr_bitmap_l0,
10573		X2APIC_MSR(APIC_TASKPRI),
10574		MSR_TYPE_W);
10575
10576	if (nested_cpu_has_vid(vmcs12)) {
10577		nested_vmx_disable_intercept_for_msr(
10578			msr_bitmap_l1, msr_bitmap_l0,
10579			X2APIC_MSR(APIC_EOI),
10580			MSR_TYPE_W);
10581		nested_vmx_disable_intercept_for_msr(
10582			msr_bitmap_l1, msr_bitmap_l0,
10583			X2APIC_MSR(APIC_SELF_IPI),
10584			MSR_TYPE_W);
10585	}
10586
10587	if (spec_ctrl)
10588		nested_vmx_disable_intercept_for_msr(
10589					msr_bitmap_l1, msr_bitmap_l0,
10590					MSR_IA32_SPEC_CTRL,
10591					MSR_TYPE_R | MSR_TYPE_W);
10592
10593	if (pred_cmd)
10594		nested_vmx_disable_intercept_for_msr(
10595					msr_bitmap_l1, msr_bitmap_l0,
10596					MSR_IA32_PRED_CMD,
10597					MSR_TYPE_W);
10598
10599	kunmap(page);
10600	kvm_release_page_clean(page);
10601
10602	return true;
10603}
10604
10605static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
10606					  struct vmcs12 *vmcs12)
10607{
10608	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
10609	    !page_address_valid(vcpu, vmcs12->apic_access_addr))
10610		return -EINVAL;
10611	else
10612		return 0;
10613}
10614
10615static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
10616					   struct vmcs12 *vmcs12)
10617{
10618	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
10619	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
10620	    !nested_cpu_has_vid(vmcs12) &&
10621	    !nested_cpu_has_posted_intr(vmcs12))
10622		return 0;
10623
10624	/*
10625	 * If virtualize x2apic mode is enabled,
10626	 * virtualize apic access must be disabled.
10627	 */
10628	if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
10629	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
10630		return -EINVAL;
10631
10632	/*
10633	 * If virtual interrupt delivery is enabled,
10634	 * we must exit on external interrupts.
10635	 */
10636	if (nested_cpu_has_vid(vmcs12) &&
10637	   !nested_exit_on_intr(vcpu))
10638		return -EINVAL;
10639
10640	/*
10641	 * Bits 15:8 must be zero in posted_intr_nv; the descriptor
10642	 * address has already been checked in
10643	 * nested_get_vmcs12_pages.
10644	 */
10645	if (nested_cpu_has_posted_intr(vmcs12) &&
10646	   (!nested_cpu_has_vid(vmcs12) ||
10647	    !nested_exit_intr_ack_set(vcpu) ||
10648	    vmcs12->posted_intr_nv & 0xff00))
10649		return -EINVAL;
10650
10651	/* tpr shadow is needed by all apicv features. */
10652	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
10653		return -EINVAL;
10654
10655	return 0;
10656}
10657
10658static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
10659				       unsigned long count_field,
10660				       unsigned long addr_field)
10661{
10662	int maxphyaddr;
10663	u64 count, addr;
10664
10665	if (vmcs12_read_any(vcpu, count_field, &count) ||
10666	    vmcs12_read_any(vcpu, addr_field, &addr)) {
10667		WARN_ON(1);
10668		return -EINVAL;
10669	}
10670	if (count == 0)
10671		return 0;
10672	maxphyaddr = cpuid_maxphyaddr(vcpu);
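	/*
	 * Each vmx_msr_entry is 16 bytes, so the list must be 16-byte
	 * aligned and the whole range [addr, addr + count * 16) must fit
	 * below 1 << maxphyaddr. E.g. with maxphyaddr = 36 and count = 512,
	 * the 8 KiB area must lie entirely under 64 GiB.
	 */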
10673	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
10674	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
10675		pr_debug_ratelimited(
10676			"nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
10677			addr_field, maxphyaddr, count, addr);
10678		return -EINVAL;
10679	}
10680	return 0;
10681}
10682
10683static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
10684						struct vmcs12 *vmcs12)
10685{
10686	if (vmcs12->vm_exit_msr_load_count == 0 &&
10687	    vmcs12->vm_exit_msr_store_count == 0 &&
10688	    vmcs12->vm_entry_msr_load_count == 0)
10689		return 0; /* Fast path */
10690	if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
10691					VM_EXIT_MSR_LOAD_ADDR) ||
10692	    nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
10693					VM_EXIT_MSR_STORE_ADDR) ||
10694	    nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
10695					VM_ENTRY_MSR_LOAD_ADDR))
10696		return -EINVAL;
10697	return 0;
10698}
10699
10700static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
10701					 struct vmcs12 *vmcs12)
10702{
10703	u64 address = vmcs12->pml_address;
10704	int maxphyaddr = cpuid_maxphyaddr(vcpu);
10705
10706	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
10707		if (!nested_cpu_has_ept(vmcs12) ||
10708		    !IS_ALIGNED(address, 4096)  ||
10709		    address >> maxphyaddr)
10710			return -EINVAL;
10711	}
10712
10713	return 0;
10714}
10715
10716static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
10717				       struct vmx_msr_entry *e)
10718{
10719	/* x2APIC MSR accesses are not allowed */
10720	if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
10721		return -EINVAL;
10722	if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
10723	    e->index == MSR_IA32_UCODE_REV)
10724		return -EINVAL;
10725	if (e->reserved != 0)
10726		return -EINVAL;
10727	return 0;
10728}
10729
10730static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
10731				     struct vmx_msr_entry *e)
10732{
10733	if (e->index == MSR_FS_BASE ||
10734	    e->index == MSR_GS_BASE ||
10735	    e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
10736	    nested_vmx_msr_check_common(vcpu, e))
10737		return -EINVAL;
10738	return 0;
10739}
10740
10741static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
10742				      struct vmx_msr_entry *e)
10743{
10744	if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
10745	    nested_vmx_msr_check_common(vcpu, e))
10746		return -EINVAL;
10747	return 0;
10748}
10749
10750/*
10751 * Load guest's/host's MSRs at nested entry/exit.
10752 * Return 0 on success, or the 1-based index of the failing entry on failure.
10753 */
10754static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
10755{
10756	u32 i;
10757	struct vmx_msr_entry e;
10758	struct msr_data msr;
10759
10760	msr.host_initiated = false;
10761	for (i = 0; i < count; i++) {
10762		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
10763					&e, sizeof(e))) {
10764			pr_debug_ratelimited(
10765				"%s cannot read MSR entry (%u, 0x%08llx)\n",
10766				__func__, i, gpa + i * sizeof(e));
10767			goto fail;
10768		}
10769		if (nested_vmx_load_msr_check(vcpu, &e)) {
10770			pr_debug_ratelimited(
10771				"%s check failed (%u, 0x%x, 0x%x)\n",
10772				__func__, i, e.index, e.reserved);
10773			goto fail;
10774		}
10775		msr.index = e.index;
10776		msr.data = e.value;
10777		if (kvm_set_msr(vcpu, &msr)) {
10778			pr_debug_ratelimited(
10779				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
10780				__func__, i, e.index, e.value);
10781			goto fail;
10782		}
10783	}
10784	return 0;
10785fail:
10786	return i + 1;
10787}
10788
10789static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
10790{
10791	u32 i;
10792	struct vmx_msr_entry e;
10793
10794	for (i = 0; i < count; i++) {
10795		struct msr_data msr_info;
10796		if (kvm_vcpu_read_guest(vcpu,
10797					gpa + i * sizeof(e),
10798					&e, 2 * sizeof(u32))) {
10799			pr_debug_ratelimited(
10800				"%s cannot read MSR entry (%u, 0x%08llx)\n",
10801				__func__, i, gpa + i * sizeof(e));
10802			return -EINVAL;
10803		}
10804		if (nested_vmx_store_msr_check(vcpu, &e)) {
10805			pr_debug_ratelimited(
10806				"%s check failed (%u, 0x%x, 0x%x)\n",
10807				__func__, i, e.index, e.reserved);
10808			return -EINVAL;
10809		}
10810		msr_info.host_initiated = false;
10811		msr_info.index = e.index;
10812		if (kvm_get_msr(vcpu, &msr_info)) {
10813			pr_debug_ratelimited(
10814				"%s cannot read MSR (%u, 0x%x)\n",
10815				__func__, i, e.index);
10816			return -EINVAL;
10817		}
10818		if (kvm_vcpu_write_guest(vcpu,
10819					 gpa + i * sizeof(e) +
10820					     offsetof(struct vmx_msr_entry, value),
10821					 &msr_info.data, sizeof(msr_info.data))) {
10822			pr_debug_ratelimited(
10823				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
10824				__func__, i, e.index, msr_info.data);
10825			return -EINVAL;
10826		}
10827	}
10828	return 0;
10829}
10830
10831static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
10832{
10833	unsigned long invalid_mask;
10834
10835	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
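	/*
	 * For example, with cpuid_maxphyaddr(vcpu) == 36 the mask covers
	 * bits 63:36, so any CR3 value with a bit set at or above bit 36
	 * exceeds the guest's physical address width and is rejected.
	 */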
10836	return (val & invalid_mask) == 0;
10837}
10838
10839/*
10840 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
10841 * emulating VM entry into a guest with EPT enabled.
10842 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
10843 * is assigned to entry_failure_code on failure.
10844 */
10845static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
10846			       u32 *entry_failure_code)
10847{
10848	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
10849		if (!nested_cr3_valid(vcpu, cr3)) {
10850			*entry_failure_code = ENTRY_FAIL_DEFAULT;
10851			return 1;
10852		}
10853
10854		/*
10855		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
10856		 * must not be dereferenced.
10857		 */
10858		if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
10859		    !nested_ept) {
10860			if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
10861				*entry_failure_code = ENTRY_FAIL_PDPTE;
10862				return 1;
10863			}
10864		}
10865
10866		vcpu->arch.cr3 = cr3;
10867		__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
10868	}
10869
10870	kvm_mmu_reset_context(vcpu);
10871	return 0;
10872}
10873
10874static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10875			       bool from_vmentry)
10876{
10877	struct vcpu_vmx *vmx = to_vmx(vcpu);
10878
10879	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
10880	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
10881	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
10882	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
10883	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
10884	vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
10885	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
10886	vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
10887	vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
10888	vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
10889	vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
10890	vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
10891	vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
10892	vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
10893	vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
10894	vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
10895	vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
10896	vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
10897	vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
10898	vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
10899	vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
10900	vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
10901	vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
10902	vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
10903	vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
10904	vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
10905	vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
10906	vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
10907	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
10908	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
10909	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
10910
10911	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
10912	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
10913		vmcs12->guest_pending_dbg_exceptions);
10914	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
10915	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
10916
10917	if (nested_cpu_has_xsaves(vmcs12))
10918		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
10919	vmcs_write64(VMCS_LINK_POINTER, -1ull);
10920
10921	if (cpu_has_vmx_posted_intr())
10922		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
10923
10924	/*
10925	 * Whether page-faults are trapped is determined by a combination of
10926	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
10927	 * If enable_ept, L0 doesn't care about page faults and we should
10928	 * set all of these to L1's desires. However, if !enable_ept, L0 does
10929	 * care about (at least some) page faults, and because it is not easy
10930	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
10931	 * to exit on each and every L2 page fault. This is done by setting
10932	 * MASK=MATCH=0 and (see below) EB.PF=1.
10933	 * Note that below we don't need special code to set EB.PF beyond the
10934	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
10935	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
10936	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
10937	 */
10938	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
10939		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
10940	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
10941		enable_ept ? vmcs12->page_fault_error_code_match : 0);
10942
10943	/* All VMFUNCs are currently emulated through L0 vmexits.  */
10944	if (cpu_has_vmx_vmfunc())
10945		vmcs_write64(VM_FUNCTION_CONTROL, 0);
10946
10947	if (cpu_has_vmx_apicv()) {
10948		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
10949		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
10950		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
10951		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
10952	}
10953
10954	/*
10955	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
10956	 * Some constant fields are set here by vmx_set_constant_host_state().
10957	 * Other fields are different per CPU, and will be set later when
10958	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
10959	 */
10960	vmx_set_constant_host_state(vmx);
10961
10962	/*
10963	 * Set the MSR load/store lists to match L0's settings.
10964	 */
10965	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
10966	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
10967	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
10968	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
10969	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
10970
10971	set_cr4_guest_host_mask(vmx);
10972
10973	if (vmx_mpx_supported())
10974		vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
10975
10976	if (enable_vpid) {
10977		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
10978			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
10979		else
10980			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
10981	}
10982
10983	/*
10984	 * L1 may access L2's PDPTRs, so save them to construct vmcs12
10985	 */
10986	if (enable_ept) {
10987		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
10988		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
10989		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
10990		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
10991	}
10992
10993	if (cpu_has_vmx_msr_bitmap())
10994		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
10995}
10996
10997/*
10998 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
10999 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
11000 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
11001 * guest in a way that will both be appropriate to L1's requests, and our
11002 * needs. In addition to modifying the active vmcs (which is vmcs02), this
11003 * function also has additional necessary side-effects, like setting various
11004 * vcpu->arch fields.
11005 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
11006 * is assigned to entry_failure_code on failure.
11007 */
11008static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
11009			  bool from_vmentry, u32 *entry_failure_code)
11010{
11011	struct vcpu_vmx *vmx = to_vmx(vcpu);
11012	u32 exec_control, vmcs12_exec_ctrl;
11013
11014	if (vmx->nested.dirty_vmcs12) {
11015		prepare_vmcs02_full(vcpu, vmcs12, from_vmentry);
11016		vmx->nested.dirty_vmcs12 = false;
11017	}
11018
11019	/*
11020	 * First, the fields that are shadowed.  This must be kept in sync
11021	 * with vmx_shadow_fields.h.
11022	 */
11023
11024	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
11025	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
11026	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
11027	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
11028	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
11029
11030	/*
11031	 * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR,
11032	 * HOST_FS_BASE, HOST_GS_BASE.
11033	 */
11034
11035	if (from_vmentry &&
11036	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
11037		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
11038		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
11039	} else {
11040		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
11041		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
11042	}
11043	if (from_vmentry) {
11044		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
11045			     vmcs12->vm_entry_intr_info_field);
11046		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
11047			     vmcs12->vm_entry_exception_error_code);
11048		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
11049			     vmcs12->vm_entry_instruction_len);
11050		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
11051			     vmcs12->guest_interruptibility_info);
11052		vmx->loaded_vmcs->nmi_known_unmasked =
11053			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
11054	} else {
11055		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
11056	}
11057	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
11058
11059	exec_control = vmcs12->pin_based_vm_exec_control;
11060
11061	/* Preemption timer setting is only taken from vmcs01.  */
11062	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
11063	exec_control |= vmcs_config.pin_based_exec_ctrl;
11064	if (vmx->hv_deadline_tsc == -1)
11065		exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
11066
11067	/* Posted interrupts setting is only taken from vmcs12.  */
11068	if (nested_cpu_has_posted_intr(vmcs12)) {
11069		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
11070		vmx->nested.pi_pending = false;
11071	} else {
11072		exec_control &= ~PIN_BASED_POSTED_INTR;
11073	}
11074
11075	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
11076
11077	vmx->nested.preemption_timer_expired = false;
11078	if (nested_cpu_has_preemption_timer(vmcs12))
11079		vmx_start_preemption_timer(vcpu);
11080
11081	if (cpu_has_secondary_exec_ctrls()) {
11082		exec_control = vmx->secondary_exec_control;
11083
11084		/* Take the following fields only from vmcs12 */
11085		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
11086				  SECONDARY_EXEC_ENABLE_INVPCID |
11087				  SECONDARY_EXEC_RDTSCP |
11088				  SECONDARY_EXEC_XSAVES |
11089				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
11090				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
11091				  SECONDARY_EXEC_ENABLE_VMFUNC);
11092		if (nested_cpu_has(vmcs12,
11093				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
11094			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
11095				~SECONDARY_EXEC_ENABLE_PML;
11096			exec_control |= vmcs12_exec_ctrl;
11097		}
11098
11099		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
11100			vmcs_write16(GUEST_INTR_STATUS,
11101				vmcs12->guest_intr_status);
11102
11103		/*
11104		 * Write an illegal value to APIC_ACCESS_ADDR. Later,
11105		 * nested_get_vmcs12_pages will either fix it up or
11106		 * remove the VM execution control.
11107		 */
11108		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
11109			vmcs_write64(APIC_ACCESS_ADDR, -1ull);
11110
11111		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
11112	}
11113
11114	/*
11115	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
11116	 * entry, but only if the current (host) sp changed from the value
11117	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
11118	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
11119	 * here we just force the write to happen on entry.
11120	 */
11121	vmx->host_rsp = 0;
11122
11123	exec_control = vmx_exec_control(vmx); /* L0's desires */
11124	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
11125	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
11126	exec_control &= ~CPU_BASED_TPR_SHADOW;
11127	exec_control |= vmcs12->cpu_based_vm_exec_control;
11128
11129	/*
11130	 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
11131	 * nested_get_vmcs12_pages can't fix it up, the illegal value
11132	 * will result in a VM entry failure.
11133	 */
11134	if (exec_control & CPU_BASED_TPR_SHADOW) {
11135		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
11136		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
11137	} else {
11138#ifdef CONFIG_X86_64
11139		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
11140				CPU_BASED_CR8_STORE_EXITING;
11141#endif
11142	}
11143
11144	/*
11145	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
11146	 * for I/O port accesses.
11147	 */
11148	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
11149	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
11150
11151	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
11152
11153	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
11154	 * bitwise-or of what L1 wants to trap for L2, and what we want to
11155	 * trap. Note that CR0.TS also needs updating - we do this later.
11156	 */
11157	update_exception_bitmap(vcpu);
11158	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
11159	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
11160
11161	/* L2->L1 exit controls are emulated - the hardware exit is to L0 so
11162	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
11163	 * bits are further modified by vmx_set_efer() below.
11164	 */
11165	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
11166
11167	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
11168	 * emulated by vmx_set_efer(), below.
11169	 */
11170	vm_entry_controls_init(vmx,
11171		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
11172			~VM_ENTRY_IA32E_MODE) |
11173		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
11174
11175	if (from_vmentry &&
11176	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
11177		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
11178		vcpu->arch.pat = vmcs12->guest_ia32_pat;
11179	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
11180		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
11181	}
11182
11183	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
11184
11185	if (kvm_has_tsc_control)
11186		decache_tsc_multiplier(vmx);
11187
11188	if (enable_vpid) {
11189		/*
11190		 * There is no direct mapping between vpid02 and vpid12: vpid02
11191		 * is per-vCPU, owned by L0 and reused across nested entries,
11192		 * while vpid12 may change, which requires one invvpid at nested
11193		 * vmentry. vpid12 is allocated by L1 for L2, so it does not
11194		 * influence the global bitmap (used for vpid01 and vpid02
11195		 * allocation) even if L1 spawns a lot of nested vCPUs.
11196		 */
11197		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
11198			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
11199				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
11200				__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true);
11201			}
11202		} else {
11203			vmx_flush_tlb(vcpu, true);
11204		}
11205	}
11206
11207	if (enable_pml) {
11208		/*
11209		 * Conceptually we want to copy the PML address and index from
11210		 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
11211		 * since we always flush the log on each vmexit, this happens
11212		 * to be equivalent to simply resetting the fields in vmcs02.
11213		 */
11214		ASSERT(vmx->pml_pg);
11215		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
11216		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
11217	}
11218
11219	if (nested_cpu_has_ept(vmcs12)) {
11220		if (nested_ept_init_mmu_context(vcpu)) {
11221			*entry_failure_code = ENTRY_FAIL_DEFAULT;
11222			return 1;
11223		}
11224	} else if (nested_cpu_has2(vmcs12,
11225				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
11226		vmx_flush_tlb(vcpu, true);
11227	}
11228
11229	/*
11230	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
11231	 * bits which we consider mandatory enabled.
11232	 * The CR0_READ_SHADOW is what L2 should have expected to read given
11233	 * the specifications by L1; It's not enough to take
11234	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we may
11235	 * have more bits than L1 expected.
11236	 */
11237	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
11238	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
11239
11240	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
11241	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
11242
11243	if (from_vmentry &&
11244	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
11245		vcpu->arch.efer = vmcs12->guest_ia32_efer;
11246	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
11247		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
11248	else
11249		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
11250	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
11251	vmx_set_efer(vcpu, vcpu->arch.efer);
11252
11253	/*
11254	 * Guest state is invalid and unrestricted guest is disabled,
11255	 * which means L1 attempted VMEntry to L2 with invalid state.
11256	 * Fail the VMEntry.
11257	 */
11258	if (vmx->emulation_required) {
11259		*entry_failure_code = ENTRY_FAIL_DEFAULT;
11260		return 1;
11261	}
11262
11263	/* Load the L2 CR3, which is backed by either EPT or shadow page tables. */
11264	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
11265				entry_failure_code))
11266		return 1;
11267
11268	if (!enable_ept)
11269		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
11270
11271	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
11272	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
11273	return 0;
11274}
11275
11276static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
11277{
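	/*
	 * Consistency rules from the SDM: the "virtual NMIs" pin-based
	 * control may be 1 only if "NMI exiting" is 1, and the "NMI-window
	 * exiting" CPU-based control may be 1 only if "virtual NMIs" is 1.
	 */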
11278	if (!nested_cpu_has_nmi_exiting(vmcs12) &&
11279	    nested_cpu_has_virtual_nmis(vmcs12))
11280		return -EINVAL;
11281
11282	if (!nested_cpu_has_virtual_nmis(vmcs12) &&
11283	    nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
11284		return -EINVAL;
11285
11286	return 0;
11287}
11288
11289static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
11290{
11291	struct vcpu_vmx *vmx = to_vmx(vcpu);
11292
11293	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
11294	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
11295		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11296
11297	if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
11298		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11299
11300	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
11301		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11302
11303	if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
11304		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11305
11306	if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
11307		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11308
11309	if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
11310		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11311
11312	if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
11313		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11314
11315	if (nested_vmx_check_pml_controls(vcpu, vmcs12))
11316		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11317
11318	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
11319				vmx->nested.msrs.procbased_ctls_low,
11320				vmx->nested.msrs.procbased_ctls_high) ||
11321	    (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
11322	     !vmx_control_verify(vmcs12->secondary_vm_exec_control,
11323				 vmx->nested.msrs.secondary_ctls_low,
11324				 vmx->nested.msrs.secondary_ctls_high)) ||
11325	    !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
11326				vmx->nested.msrs.pinbased_ctls_low,
11327				vmx->nested.msrs.pinbased_ctls_high) ||
11328	    !vmx_control_verify(vmcs12->vm_exit_controls,
11329				vmx->nested.msrs.exit_ctls_low,
11330				vmx->nested.msrs.exit_ctls_high) ||
11331	    !vmx_control_verify(vmcs12->vm_entry_controls,
11332				vmx->nested.msrs.entry_ctls_low,
11333				vmx->nested.msrs.entry_ctls_high))
11334		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11335
11336	if (nested_vmx_check_nmi_controls(vmcs12))
11337		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11338
11339	if (nested_cpu_has_vmfunc(vmcs12)) {
11340		if (vmcs12->vm_function_control &
11341		    ~vmx->nested.msrs.vmfunc_controls)
11342			return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11343
11344		if (nested_cpu_has_eptp_switching(vmcs12)) {
11345			if (!nested_cpu_has_ept(vmcs12) ||
11346			    !page_address_valid(vcpu, vmcs12->eptp_list_address))
11347				return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11348		}
11349	}
11350
11351	if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
11352		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
11353
11354	if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
11355	    !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
11356	    !nested_cr3_valid(vcpu, vmcs12->host_cr3))
11357		return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
11358
11359	return 0;
11360}
11361
11362static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
11363				  u32 *exit_qual)
11364{
11365	bool ia32e;
11366
11367	*exit_qual = ENTRY_FAIL_DEFAULT;
11368
11369	if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
11370	    !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
11371		return 1;
11372
11373	if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
11374	    vmcs12->vmcs_link_pointer != -1ull) {
11375		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
11376		return 1;
11377	}
11378
11379	/*
11380	 * If the load IA32_EFER VM-entry control is 1, the following checks
11381	 * are performed on the field for the IA32_EFER MSR:
11382	 * - Bits reserved in the IA32_EFER MSR must be 0.
11383	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
11384	 *   the IA-32e mode guest VM-exit control. It must also be identical
11385	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
11386	 *   CR0.PG) is 1.
11387	 */
11388	if (to_vmx(vcpu)->nested.nested_run_pending &&
11389	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
11390		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
11391		if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
11392		    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
11393		    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
11394		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
11395			return 1;
11396	}
11397
11398	/*
11399	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
11400	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
11401	 * the values of the LMA and LME bits in the field must each be that of
11402	 * the host address-space size VM-exit control.
11403	 */
11404	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
11405		ia32e = (vmcs12->vm_exit_controls &
11406			 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
11407		if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
11408		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
11409		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
11410			return 1;
11411	}
11412
11413	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
11414		(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
11415		(vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
11416			return 1;
11417
11418	return 0;
11419}
11420
11421static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
11422{
11423	struct vcpu_vmx *vmx = to_vmx(vcpu);
11424	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
11425	u32 msr_entry_idx;
11426	u32 exit_qual;
11427	int r;
11428
11429	enter_guest_mode(vcpu);
11430
11431	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
11432		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
11433
11434	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
11435	vmx_segment_cache_clear(vmx);
11436
11437	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
11438		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
11439
11440	r = EXIT_REASON_INVALID_STATE;
11441	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
11442		goto fail;
11443
11444	nested_get_vmcs12_pages(vcpu, vmcs12);
11445
11446	r = EXIT_REASON_MSR_LOAD_FAIL;
11447	msr_entry_idx = nested_vmx_load_msr(vcpu,
11448					    vmcs12->vm_entry_msr_load_addr,
11449					    vmcs12->vm_entry_msr_load_count);
11450	if (msr_entry_idx)
11451		goto fail;
11452
11453	/*
11454	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
11455	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
11456	 * returned as far as L1 is concerned. It will only return (and set
11457	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
11458	 */
11459	return 0;
11460
11461fail:
11462	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
11463		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
11464	leave_guest_mode(vcpu);
11465	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
11466	nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
11467	return 1;
11468}
11469
11470/*
11471 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
11472 * for running an L2 nested guest.
11473 */
11474static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
11475{
11476	struct vmcs12 *vmcs12;
11477	struct vcpu_vmx *vmx = to_vmx(vcpu);
11478	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
11479	u32 exit_qual;
11480	int ret;
11481
11482	if (!nested_vmx_check_permission(vcpu))
11483		return 1;
11484
11485	if (!nested_vmx_check_vmcs12(vcpu))
11486		goto out;
11487
11488	vmcs12 = get_vmcs12(vcpu);
11489
11490	if (enable_shadow_vmcs)
11491		copy_shadow_to_vmcs12(vmx);
11492
11493	/*
11494	 * The nested entry process starts with enforcing various prerequisites
11495	 * on vmcs12 as required by the Intel SDM, and acting appropriately when
11496	 * they fail: As the SDM explains, some conditions should cause the
11497	 * instruction to fail, while others will cause the instruction to seem
11498	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
11499	 * To speed up the normal (success) code path, we should avoid checking
11500	 * for misconfigurations which will anyway be caught by the processor
11501	 * when using the merged vmcs02.
11502	 */
11503	if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
11504		nested_vmx_failValid(vcpu,
11505				     VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
11506		goto out;
11507	}
11508
11509	if (vmcs12->launch_state == launch) {
11510		nested_vmx_failValid(vcpu,
11511			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
11512			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
11513		goto out;
11514	}
11515
11516	ret = check_vmentry_prereqs(vcpu, vmcs12);
11517	if (ret) {
11518		nested_vmx_failValid(vcpu, ret);
11519		goto out;
11520	}
11521
11522	/*
11523	 * After this point, the trap flag no longer triggers a singlestep trap
11524	 * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
11525	 * This is not 100% correct; for performance reasons, we delegate most
11526	 * of the checks on host state to the processor.  If those fail,
11527	 * the singlestep trap is missed.
11528	 */
11529	skip_emulated_instruction(vcpu);
11530
11531	ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
11532	if (ret) {
11533		nested_vmx_entry_failure(vcpu, vmcs12,
11534					 EXIT_REASON_INVALID_STATE, exit_qual);
11535		return 1;
11536	}
11537
11538	/*
11539	 * We're finally done with prerequisite checking, and can start with
11540	 * the nested entry.
11541	 */
11542
11543	ret = enter_vmx_non_root_mode(vcpu, true);
11544	if (ret)
11545		return ret;
11546
11547	/*
11548	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
11549	 * by event injection, halt the vcpu.
11550	 */
11551	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
11552	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK))
11553		return kvm_vcpu_halt(vcpu);
11554
11555	vmx->nested.nested_run_pending = 1;
11556
11557	return 1;
11558
11559out:
11560	return kvm_skip_emulated_instruction(vcpu);
11561}
11562
11563/*
11564 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
11565 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
11566 * This function returns the new value we should put in vmcs12.guest_cr0.
11567 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
11568 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
11569 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
11570 *     didn't trap the bit, because if L1 did, so would L0).
11571 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
11572 *     been modified by L2, and L1 knows it. So just leave the old value of
11573 *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
11574 *     isn't relevant, because if L0 traps this bit it can set it to anything.
11575 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
11576 *     changed these bits, and therefore they need to be updated, but L0
11577 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
11578 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
11579 */
11580static inline unsigned long
11581vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
11582{
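	/*
	 * For example, a CR0 bit that L1 left guest-owned but L0 chose to
	 * trap falls under case 3 above: its current value lives in vmcs02's
	 * CR0_READ_SHADOW, which is where the third term below takes it from.
	 */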
11583	return
11584	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
11585	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
11586	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
11587			vcpu->arch.cr0_guest_owned_bits));
11588}
11589
11590static inline unsigned long
11591vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
11592{
11593	return
11594	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
11595	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
11596	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
11597			vcpu->arch.cr4_guest_owned_bits));
11598}
11599
11600static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
11601				       struct vmcs12 *vmcs12)
11602{
11603	u32 idt_vectoring;
11604	unsigned int nr;
11605
11606	if (vcpu->arch.exception.injected) {
11607		nr = vcpu->arch.exception.nr;
11608		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
11609
11610		if (kvm_exception_is_soft(nr)) {
11611			vmcs12->vm_exit_instruction_len =
11612				vcpu->arch.event_exit_inst_len;
11613			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
11614		} else
11615			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
11616
11617		if (vcpu->arch.exception.has_error_code) {
11618			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
11619			vmcs12->idt_vectoring_error_code =
11620				vcpu->arch.exception.error_code;
11621		}
11622
11623		vmcs12->idt_vectoring_info_field = idt_vectoring;
11624	} else if (vcpu->arch.nmi_injected) {
11625		vmcs12->idt_vectoring_info_field =
11626			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
11627	} else if (vcpu->arch.interrupt.injected) {
11628		nr = vcpu->arch.interrupt.nr;
11629		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
11630
11631		if (vcpu->arch.interrupt.soft) {
11632			idt_vectoring |= INTR_TYPE_SOFT_INTR;
11633			vmcs12->vm_entry_instruction_len =
11634				vcpu->arch.event_exit_inst_len;
11635		} else
11636			idt_vectoring |= INTR_TYPE_EXT_INTR;
11637
11638		vmcs12->idt_vectoring_info_field = idt_vectoring;
11639	}
11640}
11641
11642static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
11643{
11644	struct vcpu_vmx *vmx = to_vmx(vcpu);
11645	unsigned long exit_qual;
11646	bool block_nested_events =
11647	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
11648
11649	if (vcpu->arch.exception.pending &&
11650		nested_vmx_check_exception(vcpu, &exit_qual)) {
11651		if (block_nested_events)
11652			return -EBUSY;
11653		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
11654		return 0;
11655	}
11656
11657	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
11658	    vmx->nested.preemption_timer_expired) {
11659		if (block_nested_events)
11660			return -EBUSY;
11661		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
11662		return 0;
11663	}
11664
11665	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
11666		if (block_nested_events)
11667			return -EBUSY;
11668		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
11669				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
11670				  INTR_INFO_VALID_MASK, 0);
11671		/*
11672		 * The NMI-triggered VM exit counts as injection:
11673		 * clear this one and block further NMIs.
11674		 */
11675		vcpu->arch.nmi_pending = 0;
11676		vmx_set_nmi_mask(vcpu, true);
11677		return 0;
11678	}
11679
11680	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
11681	    nested_exit_on_intr(vcpu)) {
11682		if (block_nested_events)
11683			return -EBUSY;
11684		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
11685		return 0;
11686	}
11687
11688	vmx_complete_nested_posted_interrupt(vcpu);
11689	return 0;
11690}
11691
11692static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
11693{
11694	ktime_t remaining =
11695		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
11696	u64 value;
11697
11698	if (ktime_to_ns(remaining) <= 0)
11699		return 0;
11700
11701	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
11702	do_div(value, 1000000);
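	/*
	 * This inverts the conversion in vmx_start_preemption_timer():
	 * remaining ns -> guest TSC cycles -> timer units. E.g. 16000 ns
	 * left at virtual_tsc_khz = 2000000 is 32000 cycles, returned as
	 * 32000 >> 5 = 1000 timer units (illustrative values, assuming the
	 * emulated rate of 5).
	 */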
11703	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
11704}
11705
11706/*
11707 * Update the guest state fields of vmcs12 to reflect changes that
11708 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
11709 * VM-entry controls is also updated, since this is really a guest
11710 * state bit.)
11711 */
11712static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
11713{
11714	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
11715	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
11716
11717	vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
11718	vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
11719	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
11720
11721	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
11722	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
11723	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
11724	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
11725	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
11726	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
11727	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
11728	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
11729	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
11730	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
11731	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
11732	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
11733	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
11734	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
11735	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
11736	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
11737	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
11738	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
11739	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
11740	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
11741	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
11742	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
11743	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
11744	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
11745	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
11746	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
11747	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
11748	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
11749	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
11750	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
11751	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
11752	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
11753	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
11754	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
11755	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
11756	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
11757
11758	vmcs12->guest_interruptibility_info =
11759		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
11760	vmcs12->guest_pending_dbg_exceptions =
11761		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
11762	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
11763		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
11764	else
11765		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
11766
11767	if (nested_cpu_has_preemption_timer(vmcs12)) {
11768		if (vmcs12->vm_exit_controls &
11769		    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
11770			vmcs12->vmx_preemption_timer_value =
11771				vmx_get_preemption_timer_value(vcpu);
11772		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
11773	}
11774
11775	/*
11776	 * In some cases (usually, nested EPT), L2 is allowed to change its
11777	 * own CR3 without exiting. If it has changed it, we must keep it.
11778	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
11779	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
11780	 *
11781	 * Additionally, restore L2's PDPTR to vmcs12.
11782	 */
11783	if (enable_ept) {
11784		vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
11785		vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
11786		vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
11787		vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
11788		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
11789	}
11790
11791	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
11792
11793	if (nested_cpu_has_vid(vmcs12))
11794		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
11795
11796	vmcs12->vm_entry_controls =
11797		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
11798		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
11799
11800	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
11801		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
11802		vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
11803	}
11804
11805	/* TODO: These cannot have changed unless we have MSR bitmaps and
11806	 * the relevant bitmap bit tells us not to intercept the write. */
11807	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
11808		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
11809	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
11810		vmcs12->guest_ia32_efer = vcpu->arch.efer;
11811	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
11812	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
11813	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
11814	if (kvm_mpx_supported())
11815		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
11816}
11817
11818/*
11819 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
11820 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
11821 * and this function updates it to reflect the changes to the guest state while
11822 * L2 was running (and perhaps made some exits which were handled directly by L0
11823 * without going back to L1), and to reflect the exit reason.
11824 * Note that we do not have to copy all VMCS fields here, just those that
11825 * could have been changed by the L2 guest or the exit - i.e., only the
11826 * guest-state and exit-information fields. Other fields are modified by
11827 * L1 with VMWRITE, which already writes to vmcs12 directly.
11828 */
11829static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
11830			   u32 exit_reason, u32 exit_intr_info,
11831			   unsigned long exit_qualification)
11832{
11833	/* update guest state fields: */
11834	sync_vmcs12(vcpu, vmcs12);
11835
11836	/* update exit information fields: */
11837
11838	vmcs12->vm_exit_reason = exit_reason;
11839	vmcs12->exit_qualification = exit_qualification;
11840	vmcs12->vm_exit_intr_info = exit_intr_info;
11841
11842	vmcs12->idt_vectoring_info_field = 0;
11843	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
11844	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
11845
11846	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
11847		vmcs12->launch_state = 1;
11848
11849		/* vm_entry_intr_info_field is cleared on exit. Emulate this
11850		 * instead of reading the real value. */
11851		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
11852
11853		/*
11854		 * Transfer the event that L0 or L1 may have wanted to inject into
11855		 * L2 to IDT_VECTORING_INFO_FIELD.
11856		 */
11857		vmcs12_save_pending_event(vcpu, vmcs12);
11858	}
11859
11860	/*
11861	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
11862	 * preserved above and would only end up incorrectly in L1.
11863	 */
11864	vcpu->arch.nmi_injected = false;
11865	kvm_clear_exception_queue(vcpu);
11866	kvm_clear_interrupt_queue(vcpu);
11867}
11868
11869static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
11870			struct vmcs12 *vmcs12)
11871{
11872	u32 entry_failure_code;
11873
11874	nested_ept_uninit_mmu_context(vcpu);
11875
11876	/*
11877	 * Only the PDPTE load can fail, as the value of CR3 was checked on
11878	 * entry and could not have changed.
11879	 */
11880	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
11881		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
11882
11883	if (!enable_ept)
11884		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
11885}
11886
11887/*
11888 * Part of what we need to do when the nested L2 guest exits and we want to
11889 * run its L1 parent is to reset L1's guest state to the host state specified
11890 * in vmcs12.
11891 * This function is to be called not only on normal nested exit, but also on
11892 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
11893 * Failures During or After Loading Guest State").
11894 * This function should be called when the active VMCS is L1's (vmcs01).
11895 */
11896static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
11897				   struct vmcs12 *vmcs12)
11898{
11899	struct kvm_segment seg;
11900
11901	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
11902		vcpu->arch.efer = vmcs12->host_ia32_efer;
11903	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
11904		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
11905	else
11906		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
11907	vmx_set_efer(vcpu, vcpu->arch.efer);
11908
11909	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
11910	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
11911	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
11912	/*
11913	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
11914	 * actually changed, because vmx_set_cr0 refers to the EFER value set above.
11915	 *
11916	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
11917	 * (KVM doesn't change it).
11918	 */
11919	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
11920	vmx_set_cr0(vcpu, vmcs12->host_cr0);
11921
11922	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
11923	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
11924	vmx_set_cr4(vcpu, vmcs12->host_cr4);
11925
11926	load_vmcs12_mmu_host_state(vcpu, vmcs12);
11927
11928	if (enable_vpid) {
11929		/*
11930		 * Trivially support vpid by letting L2s share their parent
11931		 * L1's vpid. TODO: move to a more elaborate solution, giving
11932		 * each L2 its own vpid and exposing the vpid feature to L1.
11933		 */
11934		vmx_flush_tlb(vcpu, true);
11935	}
11936
11937	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
11938	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
11939	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
11940	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
11941	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
11942	vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
11943	vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
11944
11945	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
11946	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
11947		vmcs_write64(GUEST_BNDCFGS, 0);
11948
11949	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
11950		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
11951		vcpu->arch.pat = vmcs12->host_ia32_pat;
11952	}
11953	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
11954		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
11955			vmcs12->host_ia32_perf_global_ctrl);
11956
11957	/* Set L1 segment info according to Intel SDM 27.5.2, "Loading Host
11958	 * Segment and Descriptor-Table Registers". */
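	/*
	 * In the kvm_segment initializers below, type 11 with .s = 1 is an
	 * accessed execute/read code segment and type 3 is an accessed
	 * read/write data segment; .g = 1 selects 4-KByte granularity for
	 * the 0xFFFFFFFF limit.  For TR, where .s stays clear, type 11
	 * denotes a busy TSS.
	 */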
11959	seg = (struct kvm_segment) {
11960		.base = 0,
11961		.limit = 0xFFFFFFFF,
11962		.selector = vmcs12->host_cs_selector,
11963		.type = 11,
11964		.present = 1,
11965		.s = 1,
11966		.g = 1
11967	};
11968	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
11969		seg.l = 1;
11970	else
11971		seg.db = 1;
11972	vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
11973	seg = (struct kvm_segment) {
11974		.base = 0,
11975		.limit = 0xFFFFFFFF,
11976		.type = 3,
11977		.present = 1,
11978		.s = 1,
11979		.db = 1,
11980		.g = 1
11981	};
11982	seg.selector = vmcs12->host_ds_selector;
11983	vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
11984	seg.selector = vmcs12->host_es_selector;
11985	vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
11986	seg.selector = vmcs12->host_ss_selector;
11987	vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
11988	seg.selector = vmcs12->host_fs_selector;
11989	seg.base = vmcs12->host_fs_base;
11990	vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
11991	seg.selector = vmcs12->host_gs_selector;
11992	seg.base = vmcs12->host_gs_base;
11993	vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
11994	seg = (struct kvm_segment) {
11995		.base = vmcs12->host_tr_base,
11996		.limit = 0x67,
11997		.selector = vmcs12->host_tr_selector,
11998		.type = 11,
11999		.present = 1
12000	};
12001	vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
12002
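	/*
	 * Per the SDM, VM exit loads DR7 with 0x400 (its reset value) and
	 * clears IA32_DEBUGCTL; emulate that for L1 here.
	 */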
12003	kvm_set_dr(vcpu, 7, 0x400);
12004	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
12005
12006	if (cpu_has_vmx_msr_bitmap())
12007		vmx_update_msr_bitmap(vcpu);
12008
12009	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
12010				vmcs12->vm_exit_msr_load_count))
12011		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
12012}
12013
12014/*
12015 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
12016 * and modify vmcs12 to make it see what it would expect to see there if
12017 * L2 were its real guest. Must only be called while in L2 (is_guest_mode()).
12018 */
12019static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
12020			      u32 exit_intr_info,
12021			      unsigned long exit_qualification)
12022{
12023	struct vcpu_vmx *vmx = to_vmx(vcpu);
12024	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
12025
12026	/* trying to cancel vmlaunch/vmresume is a bug */
12027	WARN_ON_ONCE(vmx->nested.nested_run_pending);
12028
12029	/*
12030	 * The only expected VM-instruction error is "VM entry with
12031	 * invalid control field(s)." Anything else indicates a
12032	 * problem with L0.
12033	 */
12034	WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
12035				   VMXERR_ENTRY_INVALID_CONTROL_FIELD));
12036
12037	leave_guest_mode(vcpu);
12038
12039	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
12040		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
12041
12042	if (likely(!vmx->fail)) {
12043		if (exit_reason == -1)
12044			sync_vmcs12(vcpu, vmcs12);
12045		else
12046			prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
12047				       exit_qualification);
12048
12049		if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
12050					 vmcs12->vm_exit_msr_store_count))
12051			nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
12052	}
12053
12054	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
12055	vm_entry_controls_reset_shadow(vmx);
12056	vm_exit_controls_reset_shadow(vmx);
12057	vmx_segment_cache_clear(vmx);
12058
12059	/* Update any VMCS fields that might have changed while L2 ran */
12060	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
12061	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
12062	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
12063	if (vmx->hv_deadline_tsc == -1)
12064		vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
12065				PIN_BASED_VMX_PREEMPTION_TIMER);
12066	else
12067		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
12068			      PIN_BASED_VMX_PREEMPTION_TIMER);
12069	if (kvm_has_tsc_control)
12070		decache_tsc_multiplier(vmx);
12071
12072	if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
12073		vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
12074		vmx_set_virtual_x2apic_mode(vcpu,
12075				vcpu->arch.apic_base & X2APIC_ENABLE);
12076	} else if (!nested_cpu_has_ept(vmcs12) &&
12077		   nested_cpu_has2(vmcs12,
12078				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
12079		vmx_flush_tlb(vcpu, true);
12080	}
12081
12082	/* This is needed for the same reason as it was needed in prepare_vmcs02 */
12083	vmx->host_rsp = 0;
12084
12085	/* Unpin physical memory we referred to in vmcs02 */
12086	if (vmx->nested.apic_access_page) {
12087		kvm_release_page_dirty(vmx->nested.apic_access_page);
12088		vmx->nested.apic_access_page = NULL;
12089	}
12090	if (vmx->nested.virtual_apic_page) {
12091		kvm_release_page_dirty(vmx->nested.virtual_apic_page);
12092		vmx->nested.virtual_apic_page = NULL;
12093	}
12094	if (vmx->nested.pi_desc_page) {
12095		kunmap(vmx->nested.pi_desc_page);
12096		kvm_release_page_dirty(vmx->nested.pi_desc_page);
12097		vmx->nested.pi_desc_page = NULL;
12098		vmx->nested.pi_desc = NULL;
12099	}
12100
12101	/*
12102	 * While L2 ran, an mmu_notifier may have forced a reload of the APIC
12103	 * access page's hpa for the L2 vmcs.  Reload it for L1 before entering L1.
12104	 */
12105	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
12106
12107	if (enable_shadow_vmcs && exit_reason != -1)
12108		vmx->nested.sync_shadow_vmcs = true;
12109
12110	/* in case we halted in L2 */
12111	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
12112
12113	if (likely(!vmx->fail)) {
12114		/*
12115		 * TODO: SDM says that with acknowledge interrupt on
12116		 * exit, bit 31 of the VM-exit interrupt information
12117		 * (valid interrupt) is always set to 1 on
12118		 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
12119		 * need kvm_cpu_has_interrupt().  See the commit
12120		 * message for details.
12121		 */
12122		if (nested_exit_intr_ack_set(vcpu) &&
12123		    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
12124		    kvm_cpu_has_interrupt(vcpu)) {
12125			int irq = kvm_cpu_get_interrupt(vcpu);
12126			WARN_ON(irq < 0);
12127			vmcs12->vm_exit_intr_info = irq |
12128				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
12129		}
12130
12131		if (exit_reason != -1)
12132			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
12133						       vmcs12->exit_qualification,
12134						       vmcs12->idt_vectoring_info_field,
12135						       vmcs12->vm_exit_intr_info,
12136						       vmcs12->vm_exit_intr_error_code,
12137						       KVM_ISA_VMX);
12138
12139		load_vmcs12_host_state(vcpu, vmcs12);
12140
12141		return;
12142	}
12143
12144	/*
12145	 * After an early L2 VM-entry failure, we're now back
12146	 * in L1 which thinks it just finished a VMLAUNCH or
12147	 * VMRESUME instruction, so we need to set the failure
12148	 * flag and the VM-instruction error field of the VMCS
12149	 * accordingly.
12150	 */
12151	nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
12152
12153	load_vmcs12_mmu_host_state(vcpu, vmcs12);
12154
12155	/*
12156	 * The emulated instruction was already skipped in
12157	 * nested_vmx_run, but the updated RIP was never
12158	 * written back to the vmcs01.
12159	 */
12160	skip_emulated_instruction(vcpu);
12161	vmx->fail = 0;
12162}
12163
12164/*
12165 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
12166 */
12167static void vmx_leave_nested(struct kvm_vcpu *vcpu)
12168{
12169	if (is_guest_mode(vcpu)) {
12170		to_vmx(vcpu)->nested.nested_run_pending = 0;
12171		nested_vmx_vmexit(vcpu, -1, 0, 0);
12172	}
12173	free_nested(to_vmx(vcpu));
12174}
12175
12176/*
12177 * L1's failure to enter L2 is a subset of a normal exit, as explained in
12178 * 23.7 "VM-entry failures during or after loading guest state" (this also
12179 * lists the acceptable exit-reason and exit-qualification parameters).
12180 * It should only be called before L2 has actually started to run, and when
12181 * vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs).
12182 */
12183static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
12184			struct vmcs12 *vmcs12,
12185			u32 reason, unsigned long qualification)
12186{
12187	load_vmcs12_host_state(vcpu, vmcs12);
12188	vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
12189	vmcs12->exit_qualification = qualification;
12190	nested_vmx_succeed(vcpu);
12191	if (enable_shadow_vmcs)
12192		to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
12193}
12194
12195static int vmx_check_intercept(struct kvm_vcpu *vcpu,
12196			       struct x86_instruction_info *info,
12197			       enum x86_intercept_stage stage)
12198{
12199	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
12200	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
12201
12202	/*
12203	 * RDPID causes #UD if disabled through secondary execution controls.
12204	 * Because it is marked as EmulateOnUD, we need to intercept it here.
12205	 */
12206	if (info->intercept == x86_intercept_rdtscp &&
12207	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
12208		ctxt->exception.vector = UD_VECTOR;
12209		ctxt->exception.error_code_valid = false;
12210		return X86EMUL_PROPAGATE_FAULT;
12211	}
12212
12213	/* TODO: check more intercepts... */
12214	return X86EMUL_CONTINUE;
12215}
12216
12217#ifdef CONFIG_X86_64
12218/* (a << shift) / divisor; returns 1 on overflow, otherwise 0 */
12219static inline int u64_shl_div_u64(u64 a, unsigned int shift,
12220				  u64 divisor, u64 *result)
12221{
12222	u64 low = a << shift, high = a >> (64 - shift);
12223
12224	/* divq faults if the quotient does not fit in 64 bits */
12225	if (high >= divisor)
12226		return 1;
12227
12228	/* low holds the quotient, high holds the remainder, which is discarded */
12229	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
12230	    "rm" (divisor), "0" (low), "1" (high));
12231	*result = low;
12232
12233	return 0;
12234}
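
/*
 * For comparison only (not part of the driver): with a compiler that
 * provides unsigned __int128, the same "(a << shift) / divisor" with an
 * overflow check can be written without inline assembly.  The kernel
 * avoids this form because 128-bit division pulls in libgcc's
 * __udivti3, which is not available to kernel code, so the sketch below
 * is compiled out and its name is made up purely for illustration.
 */
#if 0
static inline int u64_shl_div_u64_sketch(u64 a, unsigned int shift,
					 u64 divisor, u64 *result)
{
	unsigned __int128 dividend = (unsigned __int128)a << shift;

	/* Same overflow condition as the divq version: high >= divisor. */
	if ((u64)(dividend >> 64) >= divisor)
		return 1;

	*result = (u64)(dividend / divisor);
	return 0;
}
#endif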
12235
12236static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
12237{
12238	struct vcpu_vmx *vmx;
12239	u64 tscl, guest_tscl, delta_tsc;
12240
12241	if (kvm_mwait_in_guest(vcpu->kvm))
12242		return -EOPNOTSUPP;
12243
12244	vmx = to_vmx(vcpu);
12245	tscl = rdtsc();
12246	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
12247	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
12248
12249	/* Convert to host delta tsc if tsc scaling is enabled */
12250	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
12251			u64_shl_div_u64(delta_tsc,
12252				kvm_tsc_scaling_ratio_frac_bits,
12253				vcpu->arch.tsc_scaling_ratio,
12254				&delta_tsc))
12255		return -ERANGE;
12256
12257	/*
12258	 * If delta_tsc doesn't fit in 32 bits after the preemption timer
12259	 * rate shift, we can't use the preemption timer at all.
12260	 * It might fit on later vmentries, but checking on every vmentry
12261	 * is costly, so we just fall back to an hrtimer.
12262	 */
12263	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
12264		return -ERANGE;
12265
12266	vmx->hv_deadline_tsc = tscl + delta_tsc;
12267	vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
12268			PIN_BASED_VMX_PREEMPTION_TIMER);
12269
12270	return delta_tsc == 0;
12271}
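
/*
 * Worked example (illustrative numbers only): with TSC scaling active,
 * guest_tsc = host_tsc * ratio >> frac_bits, so the conversion above is
 * host_delta = (guest_delta << frac_bits) / ratio.  With frac_bits = 48
 * and ratio = 3 << 47 (the guest TSC runs at 1.5x the host's), a guest
 * delta of 300000 cycles becomes a host delta of 200000 cycles.  The
 * final range check reflects the fact that the VMX-preemption timer is
 * a 32-bit down-counter ticking once every 2^cpu_preemption_timer_multi
 * host TSC cycles, so delta_tsc must fit in (multi + 32) bits.
 */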
12272
12273static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
12274{
12275	struct vcpu_vmx *vmx = to_vmx(vcpu);
12276	vmx->hv_deadline_tsc = -1;
12277	vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
12278			PIN_BASED_VMX_PREEMPTION_TIMER);
12279}
12280#endif
12281
12282static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
12283{
12284	if (!kvm_pause_in_guest(vcpu->kvm))
12285		shrink_ple_window(vcpu);
12286}
12287
12288static void vmx_slot_enable_log_dirty(struct kvm *kvm,
12289				     struct kvm_memory_slot *slot)
12290{
12291	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
12292	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
12293}
12294
12295static void vmx_slot_disable_log_dirty(struct kvm *kvm,
12296				       struct kvm_memory_slot *slot)
12297{
12298	kvm_mmu_slot_set_dirty(kvm, slot);
12299}
12300
12301static void vmx_flush_log_dirty(struct kvm *kvm)
12302{
12303	kvm_flush_pml_buffers(kvm);
12304}
12305
12306static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
12307{
12308	struct vmcs12 *vmcs12;
12309	struct vcpu_vmx *vmx = to_vmx(vcpu);
12310	gpa_t gpa;
12311	struct page *page = NULL;
12312	u64 *pml_address;
12313
12314	if (is_guest_mode(vcpu)) {
12315		WARN_ON_ONCE(vmx->nested.pml_full);
12316
12317		/*
12318		 * Check if PML is enabled for the nested guest.
12319		 * Whether eptp bit 6 is set is already checked
12320		 * as part of A/D emulation.
12321		 */
12322		vmcs12 = get_vmcs12(vcpu);
12323		if (!nested_cpu_has_pml(vmcs12))
12324			return 0;
12325
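		/*
		 * guest_pml_index counts down from PML_ENTITY_NUM - 1; once it
		 * has wrapped past zero (or was set out of range by L1) it is
		 * >= PML_ENTITY_NUM, and a PML-full exit must be reflected to
		 * L1.
		 */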
12326		if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
12327			vmx->nested.pml_full = true;
12328			return 1;
12329		}
12330
12331		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
12332
12333		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
12334		if (is_error_page(page))
12335			return 0;
12336
12337		pml_address = kmap(page);
12338		pml_address[vmcs12->guest_pml_index--] = gpa;
12339		kunmap(page);
12340		kvm_release_page_clean(page);
12341	}
12342
12343	return 0;
12344}
12345
12346static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
12347					   struct kvm_memory_slot *memslot,
12348					   gfn_t offset, unsigned long mask)
12349{
12350	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
12351}
12352
12353static void __pi_post_block(struct kvm_vcpu *vcpu)
12354{
12355	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
12356	struct pi_desc old, new;
12357	unsigned int dest;
12358
12359	do {
12360		old.control = new.control = pi_desc->control;
12361		WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
12362		     "Wakeup handler not enabled while the VCPU is blocked\n");
12363
12364		dest = cpu_physical_id(vcpu->cpu);
12365
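		/*
		 * NDST is in APIC-ID format: the full 32-bit ID in x2APIC
		 * mode, or the 8-bit xAPIC ID placed in bits 15:8 otherwise.
		 */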
12366		if (x2apic_enabled())
12367			new.ndst = dest;
12368		else
12369			new.ndst = (dest << 8) & 0xFF00;
12370
12371		/* set 'NV' to 'notification vector' */
12372		new.nv = POSTED_INTR_VECTOR;
12373	} while (cmpxchg64(&pi_desc->control, old.control,
12374			   new.control) != old.control);
12375
12376	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
12377		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
12378		list_del(&vcpu->blocked_vcpu_list);
12379		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
12380		vcpu->pre_pcpu = -1;
12381	}
12382}
12383
12384/*
12385 * This routine does the following things for a vCPU that is about to
12386 * block when VT-d posted interrupts are enabled:
12387 * - Add the vCPU to the per-CPU wakeup list, so that when an interrupt
12388 *   arrives we can find the right vCPU to wake up.
12389 * - Change the posted-interrupt descriptor as follows:
12390 *      'NDST' <-- vcpu->pre_pcpu
12391 *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
12392 * - If 'ON' is set during this process, at least one interrupt has
12393 *   already been posted for this vCPU and it cannot block; return 1
12394 *   in that case, otherwise return 0.
12395 *
12396 */
12397static int pi_pre_block(struct kvm_vcpu *vcpu)
12398{
12399	unsigned int dest;
12400	struct pi_desc old, new;
12401	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
12402
12403	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
12404		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
12405		!kvm_vcpu_apicv_active(vcpu))
12406		return 0;
12407
12408	WARN_ON(irqs_disabled());
12409	local_irq_disable();
12410	if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
12411		vcpu->pre_pcpu = vcpu->cpu;
12412		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
12413		list_add_tail(&vcpu->blocked_vcpu_list,
12414			      &per_cpu(blocked_vcpu_on_cpu,
12415				       vcpu->pre_pcpu));
12416		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
12417	}
12418
12419	do {
12420		old.control = new.control = pi_desc->control;
12421
12422		WARN((pi_desc->sn == 1),
12423		     "Warning: SN field of posted-interrupts "
12424		     "is set before blocking\n");
12425
12426		/*
12427		 * Since the vCPU can be preempted during this process,
12428		 * vcpu->cpu could differ from pre_pcpu.  We therefore set
12429		 * pre_pcpu as the destination of the wakeup notification
12430		 * event, so that the wakeup handler can find the right vCPU
12431		 * to wake up if an interrupt arrives while the vCPU is
12432		 * blocked.
12433		 */
12434		dest = cpu_physical_id(vcpu->pre_pcpu);
12435
12436		if (x2apic_enabled())
12437			new.ndst = dest;
12438		else
12439			new.ndst = (dest << 8) & 0xFF00;
12440
12441		/* set 'NV' to 'wakeup vector' */
12442		new.nv = POSTED_INTR_WAKEUP_VECTOR;
12443	} while (cmpxchg64(&pi_desc->control, old.control,
12444			   new.control) != old.control);
12445
12446	/* We should not block the vCPU if an interrupt is posted for it.  */
12447	if (pi_test_on(pi_desc) == 1)
12448		__pi_post_block(vcpu);
12449
12450	local_irq_enable();
12451	return (vcpu->pre_pcpu == -1);
12452}
12453
12454static int vmx_pre_block(struct kvm_vcpu *vcpu)
12455{
12456	if (pi_pre_block(vcpu))
12457		return 1;
12458
12459	if (kvm_lapic_hv_timer_in_use(vcpu))
12460		kvm_lapic_switch_to_sw_timer(vcpu);
12461
12462	return 0;
12463}
12464
12465static void pi_post_block(struct kvm_vcpu *vcpu)
12466{
12467	if (vcpu->pre_pcpu == -1)
12468		return;
12469
12470	WARN_ON(irqs_disabled());
12471	local_irq_disable();
12472	__pi_post_block(vcpu);
12473	local_irq_enable();
12474}
12475
12476static void vmx_post_block(struct kvm_vcpu *vcpu)
12477{
12478	if (kvm_x86_ops->set_hv_timer)
12479		kvm_lapic_switch_to_hv_timer(vcpu);
12480
12481	pi_post_block(vcpu);
12482}
12483
12484/*
12485 * vmx_update_pi_irte - set IRTE for Posted-Interrupts
12486 *
12487 * @kvm: kvm
12488 * @host_irq: host irq of the interrupt
12489 * @guest_irq: gsi of the interrupt
12490 * @set: set or unset PI
12491 * returns 0 on success, < 0 on failure
12492 */
12493static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
12494			      uint32_t guest_irq, bool set)
12495{
12496	struct kvm_kernel_irq_routing_entry *e;
12497	struct kvm_irq_routing_table *irq_rt;
12498	struct kvm_lapic_irq irq;
12499	struct kvm_vcpu *vcpu;
12500	struct vcpu_data vcpu_info;
12501	int idx, ret = 0;
12502
12503	if (!kvm_arch_has_assigned_device(kvm) ||
12504		!irq_remapping_cap(IRQ_POSTING_CAP) ||
12505		!kvm_vcpu_apicv_active(kvm->vcpus[0]))
12506		return 0;
12507
12508	idx = srcu_read_lock(&kvm->irq_srcu);
12509	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
12510	if (guest_irq >= irq_rt->nr_rt_entries ||
12511	    hlist_empty(&irq_rt->map[guest_irq])) {
12512		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
12513			     guest_irq, irq_rt->nr_rt_entries);
12514		goto out;
12515	}
12516
12517	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
12518		if (e->type != KVM_IRQ_ROUTING_MSI)
12519			continue;
12520		/*
12521		 * VT-d PI cannot post multicast/broadcast interrupts to a
12522		 * vCPU, so we keep using interrupt remapping for those
12523		 * kinds of interrupts.
12524		 *
12525		 * For lowest-priority interrupts, we only support those
12526		 * with a single CPU as the destination, e.g. the user
12527		 * configures the interrupt via /proc/irq or uses irqbalance
12528		 * to make it single-CPU.
12529		 *
12530		 * We will add full lowest-priority interrupt support later.
12531		 */
12532
12533		kvm_set_msi_irq(kvm, e, &irq);
12534		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
12535			/*
12536			 * Make sure the IRTE is in remapped mode if
12537			 * we don't handle it in posted mode.
12538			 */
12539			ret = irq_set_vcpu_affinity(host_irq, NULL);
12540			if (ret < 0) {
12541				printk(KERN_INFO
12542				   "failed to fall back to remapped mode, irq: %u\n",
12543				   host_irq);
12544				goto out;
12545			}
12546
12547			continue;
12548		}
12549
12550		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
12551		vcpu_info.vector = irq.vector;
12552
12553		trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
12554				vcpu_info.vector, vcpu_info.pi_desc_addr, set);
12555
12556		if (set)
12557			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
12558		else
12559			ret = irq_set_vcpu_affinity(host_irq, NULL);
12560
12561		if (ret < 0) {
12562			printk(KERN_INFO "%s: failed to update PI IRTE\n",
12563					__func__);
12564			goto out;
12565		}
12566	}
12567
12568	ret = 0;
12569out:
12570	srcu_read_unlock(&kvm->irq_srcu, idx);
12571	return ret;
12572}
12573
12574static void vmx_setup_mce(struct kvm_vcpu *vcpu)
12575{
12576	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
12577		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
12578			FEATURE_CONTROL_LMCE;
12579	else
12580		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
12581			~FEATURE_CONTROL_LMCE;
12582}
12583
12584static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
12585{
12586	/* we need a nested vmexit to enter SMM, so postpone if a nested run is pending */
12587	if (to_vmx(vcpu)->nested.nested_run_pending)
12588		return 0;
12589	return 1;
12590}
12591
12592static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
12593{
12594	struct vcpu_vmx *vmx = to_vmx(vcpu);
12595
12596	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
12597	if (vmx->nested.smm.guest_mode)
12598		nested_vmx_vmexit(vcpu, -1, 0, 0);
12599
12600	vmx->nested.smm.vmxon = vmx->nested.vmxon;
12601	vmx->nested.vmxon = false;
12602	vmx_clear_hlt(vcpu);
12603	return 0;
12604}
12605
12606static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
12607{
12608	struct vcpu_vmx *vmx = to_vmx(vcpu);
12609	int ret;
12610
12611	if (vmx->nested.smm.vmxon) {
12612		vmx->nested.vmxon = true;
12613		vmx->nested.smm.vmxon = false;
12614	}
12615
12616	if (vmx->nested.smm.guest_mode) {
12617		vcpu->arch.hflags &= ~HF_SMM_MASK;
12618		ret = enter_vmx_non_root_mode(vcpu, false);
12619		vcpu->arch.hflags |= HF_SMM_MASK;
12620		if (ret)
12621			return ret;
12622
12623		vmx->nested.smm.guest_mode = false;
12624	}
12625	return 0;
12626}
12627
12628static int enable_smi_window(struct kvm_vcpu *vcpu)
12629{
12630	return 0;
12631}
12632
12633static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
12634	.cpu_has_kvm_support = cpu_has_kvm_support,
12635	.disabled_by_bios = vmx_disabled_by_bios,
12636	.hardware_setup = hardware_setup,
12637	.hardware_unsetup = hardware_unsetup,
12638	.check_processor_compatibility = vmx_check_processor_compat,
12639	.hardware_enable = hardware_enable,
12640	.hardware_disable = hardware_disable,
12641	.cpu_has_accelerated_tpr = report_flexpriority,
12642	.has_emulated_msr = vmx_has_emulated_msr,
12643
12644	.vm_init = vmx_vm_init,
12645	.vm_alloc = vmx_vm_alloc,
12646	.vm_free = vmx_vm_free,
12647
12648	.vcpu_create = vmx_create_vcpu,
12649	.vcpu_free = vmx_free_vcpu,
12650	.vcpu_reset = vmx_vcpu_reset,
12651
12652	.prepare_guest_switch = vmx_save_host_state,
12653	.vcpu_load = vmx_vcpu_load,
12654	.vcpu_put = vmx_vcpu_put,
12655
12656	.update_bp_intercept = update_exception_bitmap,
12657	.get_msr_feature = vmx_get_msr_feature,
12658	.get_msr = vmx_get_msr,
12659	.set_msr = vmx_set_msr,
12660	.get_segment_base = vmx_get_segment_base,
12661	.get_segment = vmx_get_segment,
12662	.set_segment = vmx_set_segment,
12663	.get_cpl = vmx_get_cpl,
12664	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
12665	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
12666	.decache_cr3 = vmx_decache_cr3,
12667	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
12668	.set_cr0 = vmx_set_cr0,
12669	.set_cr3 = vmx_set_cr3,
12670	.set_cr4 = vmx_set_cr4,
12671	.set_efer = vmx_set_efer,
12672	.get_idt = vmx_get_idt,
12673	.set_idt = vmx_set_idt,
12674	.get_gdt = vmx_get_gdt,
12675	.set_gdt = vmx_set_gdt,
12676	.get_dr6 = vmx_get_dr6,
12677	.set_dr6 = vmx_set_dr6,
12678	.set_dr7 = vmx_set_dr7,
12679	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
12680	.cache_reg = vmx_cache_reg,
12681	.get_rflags = vmx_get_rflags,
12682	.set_rflags = vmx_set_rflags,
12683
12684	.tlb_flush = vmx_flush_tlb,
12685
12686	.run = vmx_vcpu_run,
12687	.handle_exit = vmx_handle_exit,
12688	.skip_emulated_instruction = skip_emulated_instruction,
12689	.set_interrupt_shadow = vmx_set_interrupt_shadow,
12690	.get_interrupt_shadow = vmx_get_interrupt_shadow,
12691	.patch_hypercall = vmx_patch_hypercall,
12692	.set_irq = vmx_inject_irq,
12693	.set_nmi = vmx_inject_nmi,
12694	.queue_exception = vmx_queue_exception,
12695	.cancel_injection = vmx_cancel_injection,
12696	.interrupt_allowed = vmx_interrupt_allowed,
12697	.nmi_allowed = vmx_nmi_allowed,
12698	.get_nmi_mask = vmx_get_nmi_mask,
12699	.set_nmi_mask = vmx_set_nmi_mask,
12700	.enable_nmi_window = enable_nmi_window,
12701	.enable_irq_window = enable_irq_window,
12702	.update_cr8_intercept = update_cr8_intercept,
12703	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
12704	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
12705	.get_enable_apicv = vmx_get_enable_apicv,
12706	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
12707	.load_eoi_exitmap = vmx_load_eoi_exitmap,
12708	.apicv_post_state_restore = vmx_apicv_post_state_restore,
12709	.hwapic_irr_update = vmx_hwapic_irr_update,
12710	.hwapic_isr_update = vmx_hwapic_isr_update,
12711	.sync_pir_to_irr = vmx_sync_pir_to_irr,
12712	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
12713
12714	.set_tss_addr = vmx_set_tss_addr,
12715	.set_identity_map_addr = vmx_set_identity_map_addr,
12716	.get_tdp_level = get_ept_level,
12717	.get_mt_mask = vmx_get_mt_mask,
12718
12719	.get_exit_info = vmx_get_exit_info,
12720
12721	.get_lpage_level = vmx_get_lpage_level,
12722
12723	.cpuid_update = vmx_cpuid_update,
12724
12725	.rdtscp_supported = vmx_rdtscp_supported,
12726	.invpcid_supported = vmx_invpcid_supported,
12727
12728	.set_supported_cpuid = vmx_set_supported_cpuid,
12729
12730	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
12731
12732	.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
12733	.write_tsc_offset = vmx_write_tsc_offset,
12734
12735	.set_tdp_cr3 = vmx_set_cr3,
12736
12737	.check_intercept = vmx_check_intercept,
12738	.handle_external_intr = vmx_handle_external_intr,
12739	.mpx_supported = vmx_mpx_supported,
12740	.xsaves_supported = vmx_xsaves_supported,
12741	.umip_emulated = vmx_umip_emulated,
12742
12743	.check_nested_events = vmx_check_nested_events,
12744
12745	.sched_in = vmx_sched_in,
12746
12747	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
12748	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
12749	.flush_log_dirty = vmx_flush_log_dirty,
12750	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
12751	.write_log_dirty = vmx_write_pml_buffer,
12752
12753	.pre_block = vmx_pre_block,
12754	.post_block = vmx_post_block,
12755
12756	.pmu_ops = &intel_pmu_ops,
12757
12758	.update_pi_irte = vmx_update_pi_irte,
12759
12760#ifdef CONFIG_X86_64
12761	.set_hv_timer = vmx_set_hv_timer,
12762	.cancel_hv_timer = vmx_cancel_hv_timer,
12763#endif
12764
12765	.setup_mce = vmx_setup_mce,
12766
12767	.smi_allowed = vmx_smi_allowed,
12768	.pre_enter_smm = vmx_pre_enter_smm,
12769	.pre_leave_smm = vmx_pre_leave_smm,
12770	.enable_smi_window = enable_smi_window,
12771};
12772
12773static int __init vmx_init(void)
12774{
12775	int r;
12776
12777#if IS_ENABLED(CONFIG_HYPERV)
12778	/*
12779	 * Enlightened VMCS usage must be recommended by Hyper-V and the host
12780	 * needs to support eVMCS v1 or above.  eVMCS support can also be
12781	 * disabled with the enlightened_vmcs module parameter.
12782	 */
12783	if (enlightened_vmcs &&
12784	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
12785	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
12786	    KVM_EVMCS_VERSION) {
12787		int cpu;
12788
12789		/* Check that we have assist pages on all online CPUs */
12790		for_each_online_cpu(cpu) {
12791			if (!hv_get_vp_assist_page(cpu)) {
12792				enlightened_vmcs = false;
12793				break;
12794			}
12795		}
12796
12797		if (enlightened_vmcs) {
12798			pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
12799			static_branch_enable(&enable_evmcs);
12800		}
12801	} else {
12802		enlightened_vmcs = false;
12803	}
12804#endif
12805
12806	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
12807                     __alignof__(struct vcpu_vmx), THIS_MODULE);
12808	if (r)
12809		return r;
12810
12811#ifdef CONFIG_KEXEC_CORE
12812	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
12813			   crash_vmclear_local_loaded_vmcss);
12814#endif
12815
12816	return 0;
12817}
12818
12819static void __exit vmx_exit(void)
12820{
12821#ifdef CONFIG_KEXEC_CORE
12822	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
12823	synchronize_rcu();
12824#endif
12825
12826	kvm_exit();
12827
12828#if IS_ENABLED(CONFIG_HYPERV)
12829	if (static_branch_unlikely(&enable_evmcs)) {
12830		int cpu;
12831		struct hv_vp_assist_page *vp_ap;
12832		/*
12833		 * Reset everything to support using non-enlightened VMCS
12834		 * access later (e.g. when we reload the module with
12835		 * enlightened_vmcs=0)
12836		 */
12837		for_each_online_cpu(cpu) {
12838			vp_ap = hv_get_vp_assist_page(cpu);
12839
12840			if (!vp_ap)
12841				continue;
12842
12843			vp_ap->current_nested_vmcs = 0;
12844			vp_ap->enlighten_vmentry = 0;
12845		}
12846
12847		static_branch_disable(&enable_evmcs);
12848	}
12849#endif
12850}
12851
12852module_init(vmx_init)
12853module_exit(vmx_exit)