   1#define pr_fmt(fmt) "SVM: " fmt
   2
   3#include <linux/kvm_host.h>
   4
   5#include "irq.h"
   6#include "mmu.h"
   7#include "kvm_cache_regs.h"
   8#include "x86.h"
   9#include "smm.h"
  10#include "cpuid.h"
  11#include "pmu.h"
  12
  13#include <linux/module.h>
  14#include <linux/mod_devicetable.h>
  15#include <linux/kernel.h>
  16#include <linux/vmalloc.h>
  17#include <linux/highmem.h>
  18#include <linux/amd-iommu.h>
  19#include <linux/sched.h>
  20#include <linux/trace_events.h>
  21#include <linux/slab.h>
  22#include <linux/hashtable.h>
  23#include <linux/objtool.h>
  24#include <linux/psp-sev.h>
  25#include <linux/file.h>
  26#include <linux/pagemap.h>
  27#include <linux/swap.h>
  28#include <linux/rwsem.h>
  29#include <linux/cc_platform.h>
  30
  31#include <asm/apic.h>
  32#include <asm/perf_event.h>
  33#include <asm/tlbflush.h>
  34#include <asm/desc.h>
  35#include <asm/debugreg.h>
  36#include <asm/kvm_para.h>
  37#include <asm/irq_remapping.h>
  38#include <asm/spec-ctrl.h>
  39#include <asm/cpu_device_id.h>
  40#include <asm/traps.h>
  41#include <asm/fpu/api.h>
  42
  43#include <asm/virtext.h>
  44#include "trace.h"
  45
  46#include "svm.h"
  47#include "svm_ops.h"
  48
  49#include "kvm_onhyperv.h"
  50#include "svm_onhyperv.h"
  51
  52MODULE_AUTHOR("Qumranet");
  53MODULE_LICENSE("GPL");
  54
  55#ifdef MODULE
  56static const struct x86_cpu_id svm_cpu_id[] = {
  57	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
  58	{}
  59};
  60MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  61#endif
  62
  63#define SEG_TYPE_LDT 2
  64#define SEG_TYPE_BUSY_TSS16 3
  65
  66static bool erratum_383_found __read_mostly;
  67
  68u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  69
  70/*
   71 * Set osvw_len to a higher value when updated Revision Guides
   72 * are published and we know what the new status bits are.
  73 */
  74static uint64_t osvw_len = 4, osvw_status;
  75
  76static DEFINE_PER_CPU(u64, current_tsc_ratio);
  77
  78#define X2APIC_MSR(x)	(APIC_BASE_MSR + (x >> 4))
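/*
 * For illustration: an x2APIC register MSR is APIC_BASE_MSR plus the APIC
 * MMIO offset shifted right by 4 (assuming APIC_BASE_MSR is 0x800), e.g.
 * APIC_ID at MMIO offset 0x20 maps to MSR 0x802.
 */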
  79
  80static const struct svm_direct_access_msrs {
  81	u32 index;   /* Index of the MSR */
  82	bool always; /* True if intercept is initially cleared */
  83} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
  84	{ .index = MSR_STAR,				.always = true  },
  85	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
  86	{ .index = MSR_IA32_SYSENTER_EIP,		.always = false },
  87	{ .index = MSR_IA32_SYSENTER_ESP,		.always = false },
  88#ifdef CONFIG_X86_64
  89	{ .index = MSR_GS_BASE,				.always = true  },
  90	{ .index = MSR_FS_BASE,				.always = true  },
  91	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
  92	{ .index = MSR_LSTAR,				.always = true  },
  93	{ .index = MSR_CSTAR,				.always = true  },
  94	{ .index = MSR_SYSCALL_MASK,			.always = true  },
  95#endif
  96	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
  97	{ .index = MSR_IA32_PRED_CMD,			.always = false },
  98	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
  99	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
 100	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
 101	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
 102	{ .index = MSR_EFER,				.always = false },
 103	{ .index = MSR_IA32_CR_PAT,			.always = false },
 104	{ .index = MSR_AMD64_SEV_ES_GHCB,		.always = true  },
 105	{ .index = MSR_TSC_AUX,				.always = false },
 106	{ .index = X2APIC_MSR(APIC_ID),			.always = false },
 107	{ .index = X2APIC_MSR(APIC_LVR),		.always = false },
 108	{ .index = X2APIC_MSR(APIC_TASKPRI),		.always = false },
 109	{ .index = X2APIC_MSR(APIC_ARBPRI),		.always = false },
 110	{ .index = X2APIC_MSR(APIC_PROCPRI),		.always = false },
 111	{ .index = X2APIC_MSR(APIC_EOI),		.always = false },
 112	{ .index = X2APIC_MSR(APIC_RRR),		.always = false },
 113	{ .index = X2APIC_MSR(APIC_LDR),		.always = false },
 114	{ .index = X2APIC_MSR(APIC_DFR),		.always = false },
 115	{ .index = X2APIC_MSR(APIC_SPIV),		.always = false },
 116	{ .index = X2APIC_MSR(APIC_ISR),		.always = false },
 117	{ .index = X2APIC_MSR(APIC_TMR),		.always = false },
 118	{ .index = X2APIC_MSR(APIC_IRR),		.always = false },
 119	{ .index = X2APIC_MSR(APIC_ESR),		.always = false },
 120	{ .index = X2APIC_MSR(APIC_ICR),		.always = false },
 121	{ .index = X2APIC_MSR(APIC_ICR2),		.always = false },
 122
 123	/*
 124	 * Note:
 125	 * AMD does not virtualize APIC TSC-deadline timer mode, but it is
 126	 * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
  127	 * the AVIC hardware would generate a #GP fault. Therefore, always
  128	 * intercept MSR 0x832 and do not set up a direct_access_msr entry for it.
 129	 */
 130	{ .index = X2APIC_MSR(APIC_LVTTHMR),		.always = false },
 131	{ .index = X2APIC_MSR(APIC_LVTPC),		.always = false },
 132	{ .index = X2APIC_MSR(APIC_LVT0),		.always = false },
 133	{ .index = X2APIC_MSR(APIC_LVT1),		.always = false },
 134	{ .index = X2APIC_MSR(APIC_LVTERR),		.always = false },
 135	{ .index = X2APIC_MSR(APIC_TMICT),		.always = false },
 136	{ .index = X2APIC_MSR(APIC_TMCCT),		.always = false },
 137	{ .index = X2APIC_MSR(APIC_TDCR),		.always = false },
 138	{ .index = MSR_INVALID,				.always = false },
 139};
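/*
 * A note on .always, based on how the table is consumed below: entries with
 * .always = true have their read/write intercepts cleared for every vCPU in
 * svm_vcpu_init_msrpm(), i.e. they are passed through from the start.  All
 * other entries stay intercepted (the MSRPM is initialized to all ones in
 * svm_vcpu_alloc_msrpm()) until code such as svm_enable_lbrv() or
 * init_vmcb_after_set_cpuid() changes them at runtime.
 */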
 140
 141/*
  142 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
  143 * pause_filter_count: On processors that support PAUSE filtering (indicated
  144 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
  145 *	count value. On VMRUN this value is loaded into an internal counter.
  146 *	Each time a PAUSE instruction is executed, this counter is decremented
  147 *	until it reaches zero, at which time a #VMEXIT is generated if the
  148 *	PAUSE intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4
  149 *	Pause Intercept Filtering for more details.
  150 *	A nonzero value also indicates that the PLE logic is enabled.
  151 *
  152 * pause_filter_thresh: In addition, some processor families support advanced
  153 *	pause filtering (also indicated by CPUID Fn8000_000A_EDX), which places
  154 *	an upper bound on the amount of time a guest is allowed to execute in
  155 *	a pause loop. In this mode, a 16-bit pause filter threshold field is
  156 *	added to the VMCB. The threshold value is a cycle count that is used
  157 *	to reset the pause counter. As with simple pause filtering, VMRUN loads
  158 *	the pause count value from the VMCB into an internal counter. Then, on
  159 *	each PAUSE instruction the hardware checks the elapsed number of cycles
  160 *	since the most recent PAUSE instruction against the pause filter
  161 *	threshold. If the elapsed cycle count is greater than the pause filter
  162 *	threshold, then the internal pause count is reloaded from the VMCB and
  163 *	execution continues. If the elapsed cycle count is less than the pause
  164 *	filter threshold, then the internal pause count is decremented. If the
  165 *	count value is less than zero and the PAUSE intercept is enabled, a
  166 *	#VMEXIT is triggered. If advanced pause filtering is supported and the
  167 *	pause filter threshold field is set to zero, the filter will operate in
  168 *	the simpler, count-only mode.
 169 */
 170
 171static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
 172module_param(pause_filter_thresh, ushort, 0444);
 173
 174static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
 175module_param(pause_filter_count, ushort, 0444);
 176
 177/* Default doubles per-vcpu window every exit. */
 178static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 179module_param(pause_filter_count_grow, ushort, 0444);
 180
 181/* Default resets per-vcpu window every exit to pause_filter_count. */
 182static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 183module_param(pause_filter_count_shrink, ushort, 0444);
 184
 185/* Default is to compute the maximum so we can never overflow. */
 186static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 187module_param(pause_filter_count_max, ushort, 0444);
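/*
 * Rough sketch of how these knobs interact (the exact arithmetic lives in
 * the shared __grow_ple_window()/__shrink_ple_window() helpers used by
 * grow_ple_window()/shrink_ple_window() below): with the defaults, each
 * PAUSE-filter exit doubles the vCPU's current window, capped at
 * pause_filter_count_max, while a shrink resets it back toward the base
 * pause_filter_count, so a PAUSE-heavy vCPU is throttled gradually.
 */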
 188
 189/*
 190 * Use nested page tables by default.  Note, NPT may get forced off by
 191 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 192 */
 193bool npt_enabled = true;
 194module_param_named(npt, npt_enabled, bool, 0444);
 195
 196/* allow nested virtualization in KVM/SVM */
 197static int nested = true;
 198module_param(nested, int, S_IRUGO);
 199
 200/* enable/disable Next RIP Save */
 201static int nrips = true;
 202module_param(nrips, int, 0444);
 203
 204/* enable/disable Virtual VMLOAD VMSAVE */
 205static int vls = true;
 206module_param(vls, int, 0444);
 207
 208/* enable/disable Virtual GIF */
 209int vgif = true;
 210module_param(vgif, int, 0444);
 211
 212/* enable/disable LBR virtualization */
 213static int lbrv = true;
 214module_param(lbrv, int, 0444);
 215
 216static int tsc_scaling = true;
 217module_param(tsc_scaling, int, 0444);
 218
 219/*
  220 * Enable/disable AVIC.  Because the defaults differ for APICv
  221 * support between VMX and SVM, we cannot use module_param_named.
 222 */
 223static bool avic;
 224module_param(avic, bool, 0444);
 225
 226bool __read_mostly dump_invalid_vmcb;
 227module_param(dump_invalid_vmcb, bool, 0644);
 228
 229
 230bool intercept_smi = true;
 231module_param(intercept_smi, bool, 0444);
 232
 233
 234static bool svm_gp_erratum_intercept = true;
 235
 236static u8 rsm_ins_bytes[] = "\x0f\xaa";
 237
 238static unsigned long iopm_base;
 239
 240struct kvm_ldttss_desc {
 241	u16 limit0;
 242	u16 base0;
 243	unsigned base1:8, type:5, dpl:2, p:1;
 244	unsigned limit1:4, zero0:3, g:1, base2:8;
 245	u32 base3;
 246	u32 zero1;
 247} __attribute__((packed));
 248
 249DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
 250
 251/*
 252 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 253 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 254 *
 255 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 256 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 257 */
 258static int tsc_aux_uret_slot __read_mostly = -1;
 259
 260static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 261
 262#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 263#define MSRS_RANGE_SIZE 2048
 264#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
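/*
 * In other words: the MSR permission map is split into 2 KiB chunks, one per
 * architectural MSR range above, each covering 8192 MSRs at two intercept
 * bits (read + write) per MSR; hence MSRS_IN_RANGE = 2048 * 8 / 2.
 */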
 265
 266u32 svm_msrpm_offset(u32 msr)
 267{
 268	u32 offset;
 269	int i;
 270
 271	for (i = 0; i < NUM_MSR_MAPS; i++) {
 272		if (msr < msrpm_ranges[i] ||
 273		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 274			continue;
 275
 276		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 277		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 278
 279		/* Now we have the u8 offset - but need the u32 offset */
 280		return offset / 4;
 281	}
 282
 283	/* MSR not in any range */
 284	return MSR_INVALID;
 285}
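/*
 * Worked example for the arithmetic above: MSR_STAR (0xc0000081) falls in
 * the second range (base 0xc0000000), so its byte offset in the MSRPM is
 * 0x81 / 4 + 2048 = 0x820 and the returned u32 offset is 0x820 / 4 = 0x208.
 * The individual read/write bits within that u32 are then picked by the low
 * nibble of the MSR number, see set_msr_interception_bitmap() below.
 */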
 286
 287static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
 288
 289static int get_npt_level(void)
 290{
 291#ifdef CONFIG_X86_64
 292	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 293#else
 294	return PT32E_ROOT_LEVEL;
 295#endif
 296}
 297
 298int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 299{
 300	struct vcpu_svm *svm = to_svm(vcpu);
 301	u64 old_efer = vcpu->arch.efer;
 302	vcpu->arch.efer = efer;
 303
 304	if (!npt_enabled) {
 305		/* Shadow paging assumes NX to be available.  */
 306		efer |= EFER_NX;
 307
 308		if (!(efer & EFER_LMA))
 309			efer &= ~EFER_LME;
 310	}
 311
 312	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
 313		if (!(efer & EFER_SVME)) {
 314			svm_leave_nested(vcpu);
 315			svm_set_gif(svm, true);
 316			/* #GP intercept is still needed for vmware backdoor */
 317			if (!enable_vmware_backdoor)
 318				clr_exception_intercept(svm, GP_VECTOR);
 319
 320			/*
 321			 * Free the nested guest state, unless we are in SMM.
 322			 * In this case we will return to the nested guest
 323			 * as soon as we leave SMM.
 324			 */
 325			if (!is_smm(vcpu))
 326				svm_free_nested(svm);
 327
 328		} else {
 329			int ret = svm_allocate_nested(svm);
 330
 331			if (ret) {
 332				vcpu->arch.efer = old_efer;
 333				return ret;
 334			}
 335
 336			/*
  337			 * Never intercept #GP for SEV guests, as KVM can't
  338			 * decrypt guest memory to work around the erratum.
 339			 */
 340			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
 341				set_exception_intercept(svm, GP_VECTOR);
 342		}
 343	}
 344
 345	svm->vmcb->save.efer = efer | EFER_SVME;
 346	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 347	return 0;
 348}
 349
 350static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 351{
 352	struct vcpu_svm *svm = to_svm(vcpu);
 353	u32 ret = 0;
 354
 355	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 356		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 357	return ret;
 358}
 359
 360static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 361{
 362	struct vcpu_svm *svm = to_svm(vcpu);
 363
 364	if (mask == 0)
 365		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 366	else
 367		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 368
 369}
 370
 371static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
 372					   bool commit_side_effects)
 373{
 374	struct vcpu_svm *svm = to_svm(vcpu);
 375	unsigned long old_rflags;
 376
 377	/*
 378	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
 379	 * the type of exit and the #VC handler in the guest.
 380	 */
 381	if (sev_es_guest(vcpu->kvm))
 382		goto done;
 383
 384	if (nrips && svm->vmcb->control.next_rip != 0) {
 385		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 386		svm->next_rip = svm->vmcb->control.next_rip;
 387	}
 388
 389	if (!svm->next_rip) {
 390		if (unlikely(!commit_side_effects))
 391			old_rflags = svm->vmcb->save.rflags;
 392
 393		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
 394			return 0;
 395
 396		if (unlikely(!commit_side_effects))
 397			svm->vmcb->save.rflags = old_rflags;
 398	} else {
 399		kvm_rip_write(vcpu, svm->next_rip);
 400	}
 401
 402done:
 403	if (likely(commit_side_effects))
 404		svm_set_interrupt_shadow(vcpu, 0);
 405
 406	return 1;
 407}
 408
 409static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 410{
 411	return __svm_skip_emulated_instruction(vcpu, true);
 412}
 413
 414static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
 415{
 416	unsigned long rip, old_rip = kvm_rip_read(vcpu);
 417	struct vcpu_svm *svm = to_svm(vcpu);
 418
 419	/*
 420	 * Due to architectural shortcomings, the CPU doesn't always provide
 421	 * NextRIP, e.g. if KVM intercepted an exception that occurred while
 422	 * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
 423	 * the instruction even if NextRIP is supported to acquire the next
 424	 * RIP so that it can be shoved into the NextRIP field, otherwise
 425	 * hardware will fail to advance guest RIP during event injection.
 426	 * Drop the exception/interrupt if emulation fails and effectively
 427	 * retry the instruction, it's the least awful option.  If NRIPS is
 428	 * in use, the skip must not commit any side effects such as clearing
 429	 * the interrupt shadow or RFLAGS.RF.
 430	 */
 431	if (!__svm_skip_emulated_instruction(vcpu, !nrips))
 432		return -EIO;
 433
 434	rip = kvm_rip_read(vcpu);
 435
 436	/*
 437	 * Save the injection information, even when using next_rip, as the
 438	 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
 439	 * doesn't complete due to a VM-Exit occurring while the CPU is
 440	 * vectoring the event.   Decoding the instruction isn't guaranteed to
 441	 * work as there may be no backing instruction, e.g. if the event is
 442	 * being injected by L1 for L2, or if the guest is patching INT3 into
 443	 * a different instruction.
 444	 */
 445	svm->soft_int_injected = true;
 446	svm->soft_int_csbase = svm->vmcb->save.cs.base;
 447	svm->soft_int_old_rip = old_rip;
 448	svm->soft_int_next_rip = rip;
 449
 450	if (nrips)
 451		kvm_rip_write(vcpu, old_rip);
 452
 453	if (static_cpu_has(X86_FEATURE_NRIPS))
 454		svm->vmcb->control.next_rip = rip;
 455
 456	return 0;
 457}
 458
 459static void svm_inject_exception(struct kvm_vcpu *vcpu)
 460{
 461	struct kvm_queued_exception *ex = &vcpu->arch.exception;
 462	struct vcpu_svm *svm = to_svm(vcpu);
 463
 464	kvm_deliver_exception_payload(vcpu, ex);
 465
 466	if (kvm_exception_is_soft(ex->vector) &&
 467	    svm_update_soft_interrupt_rip(vcpu))
 468		return;
 469
 470	svm->vmcb->control.event_inj = ex->vector
 471		| SVM_EVTINJ_VALID
 472		| (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 473		| SVM_EVTINJ_TYPE_EXEPT;
 474	svm->vmcb->control.event_inj_err = ex->error_code;
 475}
 476
 477static void svm_init_erratum_383(void)
 478{
 479	u32 low, high;
 480	int err;
 481	u64 val;
 482
 483	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 484		return;
 485
 486	/* Use _safe variants to not break nested virtualization */
 487	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 488	if (err)
 489		return;
 490
 491	val |= (1ULL << 47);
 492
 493	low  = lower_32_bits(val);
 494	high = upper_32_bits(val);
 495
 496	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 497
 498	erratum_383_found = true;
 499}
 500
 501static void svm_init_osvw(struct kvm_vcpu *vcpu)
 502{
 503	/*
 504	 * Guests should see errata 400 and 415 as fixed (assuming that
 505	 * HLT and IO instructions are intercepted).
 506	 */
 507	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 508	vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 509
 510	/*
 511	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
 512	 * all osvw.status bits inside that length, including bit 0 (which is
 513	 * reserved for erratum 298), are valid. However, if host processor's
 514	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
 515	 * be conservative here and therefore we tell the guest that erratum 298
 516	 * is present (because we really don't know).
 517	 */
 518	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 519		vcpu->arch.osvw.status |= 1;
 520}
 521
 522static int has_svm(void)
 523{
 524	const char *msg;
 525
 526	if (!cpu_has_svm(&msg)) {
 527		printk(KERN_INFO "has_svm: %s\n", msg);
 528		return 0;
 529	}
 530
 531	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
 532		pr_info("KVM is unsupported when running as an SEV guest\n");
 533		return 0;
 534	}
 535
 536	return 1;
 537}
 538
 539void __svm_write_tsc_multiplier(u64 multiplier)
 540{
 541	preempt_disable();
 542
 543	if (multiplier == __this_cpu_read(current_tsc_ratio))
 544		goto out;
 545
 546	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
 547	__this_cpu_write(current_tsc_ratio, multiplier);
 548out:
 549	preempt_enable();
 550}
 551
 552static void svm_hardware_disable(void)
 553{
 554	/* Make sure we clean up behind us */
 555	if (tsc_scaling)
 556		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 557
 558	cpu_svm_disable();
 559
 560	amd_pmu_disable_virt();
 561}
 562
 563static int svm_hardware_enable(void)
 564{
 565
 566	struct svm_cpu_data *sd;
 567	uint64_t efer;
 568	struct desc_struct *gdt;
 569	int me = raw_smp_processor_id();
 570
 571	rdmsrl(MSR_EFER, efer);
 572	if (efer & EFER_SVME)
 573		return -EBUSY;
 574
 575	if (!has_svm()) {
 576		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
 577		return -EINVAL;
 578	}
 579	sd = per_cpu_ptr(&svm_data, me);
 580	sd->asid_generation = 1;
 581	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 582	sd->next_asid = sd->max_asid + 1;
 583	sd->min_asid = max_sev_asid + 1;
 584
 585	gdt = get_current_gdt_rw();
 586	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 587
 588	wrmsrl(MSR_EFER, efer | EFER_SVME);
 589
 590	wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
 591
 592	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 593		/*
  594		 * Set the default value, even if we don't use TSC scaling,
  595		 * to avoid leaving a stale value in the MSR.
 596		 */
 597		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 598	}
 599
 600
 601	/*
 602	 * Get OSVW bits.
 603	 *
 604	 * Note that it is possible to have a system with mixed processor
 605	 * revisions and therefore different OSVW bits. If bits are not the same
 606	 * on different processors then choose the worst case (i.e. if erratum
 607	 * is present on one processor and not on another then assume that the
 608	 * erratum is present everywhere).
 609	 */
 610	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 611		uint64_t len, status = 0;
 612		int err;
 613
 614		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 615		if (!err)
 616			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 617						      &err);
 618
 619		if (err)
 620			osvw_status = osvw_len = 0;
 621		else {
 622			if (len < osvw_len)
 623				osvw_len = len;
 624			osvw_status |= status;
 625			osvw_status &= (1ULL << osvw_len) - 1;
 626		}
 627	} else
 628		osvw_status = osvw_len = 0;
 629
 630	svm_init_erratum_383();
 631
 632	amd_pmu_enable_virt();
 633
 634	return 0;
 635}
 636
 637static void svm_cpu_uninit(int cpu)
 638{
 639	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 640
 641	if (!sd->save_area)
 642		return;
 643
 644	kfree(sd->sev_vmcbs);
 645	__free_page(sd->save_area);
 646	sd->save_area_pa = 0;
 647	sd->save_area = NULL;
 648}
 649
 650static int svm_cpu_init(int cpu)
 651{
 652	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 653	int ret = -ENOMEM;
 654
 655	memset(sd, 0, sizeof(struct svm_cpu_data));
 656	sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
 657	if (!sd->save_area)
 658		return ret;
 659
 660	ret = sev_cpu_init(sd);
 661	if (ret)
 662		goto free_save_area;
 663
 664	sd->save_area_pa = __sme_page_pa(sd->save_area);
 665	return 0;
 666
 667free_save_area:
 668	__free_page(sd->save_area);
 669	sd->save_area = NULL;
 670	return ret;
 671
 672}
 673
 674static int direct_access_msr_slot(u32 msr)
 675{
 676	u32 i;
 677
 678	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 679		if (direct_access_msrs[i].index == msr)
 680			return i;
 681
 682	return -ENOENT;
 683}
 684
 685static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
 686				     int write)
 687{
 688	struct vcpu_svm *svm = to_svm(vcpu);
 689	int slot = direct_access_msr_slot(msr);
 690
 691	if (slot == -ENOENT)
 692		return;
 693
 694	/* Set the shadow bitmaps to the desired intercept states */
 695	if (read)
 696		set_bit(slot, svm->shadow_msr_intercept.read);
 697	else
 698		clear_bit(slot, svm->shadow_msr_intercept.read);
 699
 700	if (write)
 701		set_bit(slot, svm->shadow_msr_intercept.write);
 702	else
 703		clear_bit(slot, svm->shadow_msr_intercept.write);
 704}
 705
 706static bool valid_msr_intercept(u32 index)
 707{
 708	return direct_access_msr_slot(index) != -ENOENT;
 709}
 710
 711static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 712{
 713	u8 bit_write;
 714	unsigned long tmp;
 715	u32 offset;
 716	u32 *msrpm;
 717
 718	/*
  719	 * For the non-nested case:
  720	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
  721	 * save it.
  722	 *
  723	 * For the nested case:
 724	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 725	 * save it.
 726	 */
 727	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
 728				      to_svm(vcpu)->msrpm;
 729
 730	offset    = svm_msrpm_offset(msr);
 731	bit_write = 2 * (msr & 0x0f) + 1;
 732	tmp       = msrpm[offset];
 733
 734	BUG_ON(offset == MSR_INVALID);
 735
 736	return !!test_bit(bit_write,  &tmp);
 737}
 738
 739static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 740					u32 msr, int read, int write)
 741{
 742	struct vcpu_svm *svm = to_svm(vcpu);
 743	u8 bit_read, bit_write;
 744	unsigned long tmp;
 745	u32 offset;
 746
 747	/*
  748	 * If this warning triggers, extend the direct_access_msrs list at the
  749	 * beginning of the file.
 750	 */
 751	WARN_ON(!valid_msr_intercept(msr));
 752
  753	/* Force interception of MSRs that are not allowed by the MSR filter */
 754	if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
 755		read = 0;
 756
 757	if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
 758		write = 0;
 759
 760	offset    = svm_msrpm_offset(msr);
 761	bit_read  = 2 * (msr & 0x0f);
 762	bit_write = 2 * (msr & 0x0f) + 1;
 763	tmp       = msrpm[offset];
 764
 765	BUG_ON(offset == MSR_INVALID);
 766
 767	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 768	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 769
 770	msrpm[offset] = tmp;
 771
 772	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
 773	svm->nested.force_msr_bitmap_recalc = true;
 774}
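/*
 * For illustration of the bit layout used above: each MSR owns two adjacent
 * bits in its 32-bit MSRPM word, selected by the MSR's low nibble, with an
 * even "read" bit and an odd "write" bit.  A set bit means the access is
 * intercepted (the bitmap starts out as all ones), so passing read/write = 1
 * clears the bit and gives the guest direct access.  E.g. for an MSR whose
 * low nibble is 0x3, the read intercept is bit 6 and the write intercept is
 * bit 7 of that word.
 */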
 775
 776void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 777			  int read, int write)
 778{
 779	set_shadow_msr_intercept(vcpu, msr, read, write);
 780	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
 781}
 782
 783u32 *svm_vcpu_alloc_msrpm(void)
 784{
 785	unsigned int order = get_order(MSRPM_SIZE);
 786	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 787	u32 *msrpm;
 788
 789	if (!pages)
 790		return NULL;
 791
 792	msrpm = page_address(pages);
 793	memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 794
 795	return msrpm;
 796}
 797
 798void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 799{
 800	int i;
 801
 802	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 803		if (!direct_access_msrs[i].always)
 804			continue;
 805		set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
 806	}
 807}
 808
 809void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
 810{
 811	int i;
 812
 813	if (intercept == svm->x2avic_msrs_intercepted)
 814		return;
 815
 816	if (avic_mode != AVIC_MODE_X2 ||
 817	    !apic_x2apic_mode(svm->vcpu.arch.apic))
 818		return;
 819
 820	for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
 821		int index = direct_access_msrs[i].index;
 822
 823		if ((index < APIC_BASE_MSR) ||
 824		    (index > APIC_BASE_MSR + 0xff))
 825			continue;
 826		set_msr_interception(&svm->vcpu, svm->msrpm, index,
 827				     !intercept, !intercept);
 828	}
 829
 830	svm->x2avic_msrs_intercepted = intercept;
 831}
 832
 833void svm_vcpu_free_msrpm(u32 *msrpm)
 834{
 835	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 836}
 837
 838static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
 839{
 840	struct vcpu_svm *svm = to_svm(vcpu);
 841	u32 i;
 842
 843	/*
 844	 * Set intercept permissions for all direct access MSRs again. They
 845	 * will automatically get filtered through the MSR filter, so we are
 846	 * back in sync after this.
 847	 */
 848	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 849		u32 msr = direct_access_msrs[i].index;
 850		u32 read = test_bit(i, svm->shadow_msr_intercept.read);
 851		u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 852
 853		set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
 854	}
 855}
 856
 857static void add_msr_offset(u32 offset)
 858{
 859	int i;
 860
 861	for (i = 0; i < MSRPM_OFFSETS; ++i) {
 862
 863		/* Offset already in list? */
 864		if (msrpm_offsets[i] == offset)
 865			return;
 866
 867		/* Slot used by another offset? */
 868		if (msrpm_offsets[i] != MSR_INVALID)
 869			continue;
 870
 871		/* Add offset to list */
 872		msrpm_offsets[i] = offset;
 873
 874		return;
 875	}
 876
 877	/*
  878	 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
 879	 * increase MSRPM_OFFSETS in this case.
 880	 */
 881	BUG();
 882}
 883
 884static void init_msrpm_offsets(void)
 885{
 886	int i;
 887
 888	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 889
 890	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 891		u32 offset;
 892
 893		offset = svm_msrpm_offset(direct_access_msrs[i].index);
 894		BUG_ON(offset == MSR_INVALID);
 895
 896		add_msr_offset(offset);
 897	}
 898}
 899
 900void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 901{
 902	to_vmcb->save.dbgctl		= from_vmcb->save.dbgctl;
 903	to_vmcb->save.br_from		= from_vmcb->save.br_from;
 904	to_vmcb->save.br_to		= from_vmcb->save.br_to;
 905	to_vmcb->save.last_excp_from	= from_vmcb->save.last_excp_from;
 906	to_vmcb->save.last_excp_to	= from_vmcb->save.last_excp_to;
 907
 908	vmcb_mark_dirty(to_vmcb, VMCB_LBR);
 909}
 910
 911static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 912{
 913	struct vcpu_svm *svm = to_svm(vcpu);
 914
 915	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
 916	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 917	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 918	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 919	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 920
 921	/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
 922	if (is_guest_mode(vcpu))
 923		svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
 924}
 925
 926static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
 927{
 928	struct vcpu_svm *svm = to_svm(vcpu);
 929
 930	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
 931	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 932	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 933	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 934	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 935
 936	/*
 937	 * Move the LBR msrs back to the vmcb01 to avoid copying them
 938	 * on nested guest entries.
 939	 */
 940	if (is_guest_mode(vcpu))
 941		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
 942}
 943
 944static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
 945{
 946	/*
 947	 * If the LBR virtualization is disabled, the LBR msrs are always
 948	 * kept in the vmcb01 to avoid copying them on nested guest entries.
 949	 *
 950	 * If nested, and the LBR virtualization is enabled/disabled, the msrs
 951	 * are moved between the vmcb01 and vmcb02 as needed.
 952	 */
 953	struct vmcb *vmcb =
 954		(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
 955			svm->vmcb : svm->vmcb01.ptr;
 956
 957	switch (index) {
 958	case MSR_IA32_DEBUGCTLMSR:
 959		return vmcb->save.dbgctl;
 960	case MSR_IA32_LASTBRANCHFROMIP:
 961		return vmcb->save.br_from;
 962	case MSR_IA32_LASTBRANCHTOIP:
 963		return vmcb->save.br_to;
 964	case MSR_IA32_LASTINTFROMIP:
 965		return vmcb->save.last_excp_from;
 966	case MSR_IA32_LASTINTTOIP:
 967		return vmcb->save.last_excp_to;
 968	default:
 969		KVM_BUG(false, svm->vcpu.kvm,
 970			"%s: Unknown MSR 0x%x", __func__, index);
 971		return 0;
 972	}
 973}
 974
 975void svm_update_lbrv(struct kvm_vcpu *vcpu)
 976{
 977	struct vcpu_svm *svm = to_svm(vcpu);
 978
 979	bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
 980					   DEBUGCTLMSR_LBR;
 981
 982	bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
 983				      LBR_CTL_ENABLE_MASK);
 984
 985	if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
 986		if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
 987			enable_lbrv = true;
 988
 989	if (enable_lbrv == current_enable_lbrv)
 990		return;
 991
 992	if (enable_lbrv)
 993		svm_enable_lbrv(vcpu);
 994	else
 995		svm_disable_lbrv(vcpu);
 996}
 997
 998void disable_nmi_singlestep(struct vcpu_svm *svm)
 999{
1000	svm->nmi_singlestep = false;
1001
1002	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1003		/* Clear our flags if they were not set by the guest */
1004		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1005			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1006		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1007			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1008	}
1009}
1010
1011static void grow_ple_window(struct kvm_vcpu *vcpu)
1012{
1013	struct vcpu_svm *svm = to_svm(vcpu);
1014	struct vmcb_control_area *control = &svm->vmcb->control;
1015	int old = control->pause_filter_count;
1016
1017	if (kvm_pause_in_guest(vcpu->kvm))
1018		return;
1019
1020	control->pause_filter_count = __grow_ple_window(old,
1021							pause_filter_count,
1022							pause_filter_count_grow,
1023							pause_filter_count_max);
1024
1025	if (control->pause_filter_count != old) {
1026		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1027		trace_kvm_ple_window_update(vcpu->vcpu_id,
1028					    control->pause_filter_count, old);
1029	}
1030}
1031
1032static void shrink_ple_window(struct kvm_vcpu *vcpu)
1033{
1034	struct vcpu_svm *svm = to_svm(vcpu);
1035	struct vmcb_control_area *control = &svm->vmcb->control;
1036	int old = control->pause_filter_count;
1037
1038	if (kvm_pause_in_guest(vcpu->kvm))
1039		return;
1040
1041	control->pause_filter_count =
1042				__shrink_ple_window(old,
1043						    pause_filter_count,
1044						    pause_filter_count_shrink,
1045						    pause_filter_count);
1046	if (control->pause_filter_count != old) {
1047		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1048		trace_kvm_ple_window_update(vcpu->vcpu_id,
1049					    control->pause_filter_count, old);
1050	}
1051}
1052
1053static void svm_hardware_unsetup(void)
1054{
1055	int cpu;
1056
1057	sev_hardware_unsetup();
1058
1059	for_each_possible_cpu(cpu)
1060		svm_cpu_uninit(cpu);
1061
1062	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
 1063		     get_order(IOPM_SIZE));
1064	iopm_base = 0;
1065}
1066
1067static void init_seg(struct vmcb_seg *seg)
1068{
1069	seg->selector = 0;
1070	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1071		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1072	seg->limit = 0xffff;
1073	seg->base = 0;
1074}
1075
1076static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1077{
1078	seg->selector = 0;
1079	seg->attrib = SVM_SELECTOR_P_MASK | type;
1080	seg->limit = 0xffff;
1081	seg->base = 0;
1082}
1083
1084static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1085{
1086	struct vcpu_svm *svm = to_svm(vcpu);
1087
1088	return svm->nested.ctl.tsc_offset;
1089}
1090
1091static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1092{
1093	struct vcpu_svm *svm = to_svm(vcpu);
1094
1095	return svm->tsc_ratio_msr;
1096}
1097
1098static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1099{
1100	struct vcpu_svm *svm = to_svm(vcpu);
1101
1102	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1103	svm->vmcb->control.tsc_offset = offset;
1104	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1105}
1106
1107static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1108{
1109	__svm_write_tsc_multiplier(multiplier);
1110}
1111
1112
1113/* Evaluate instruction intercepts that depend on guest CPUID features. */
1114static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1115					      struct vcpu_svm *svm)
1116{
1117	/*
1118	 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1119	 * roots, or if INVPCID is disabled in the guest to inject #UD.
1120	 */
1121	if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1122		if (!npt_enabled ||
1123		    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1124			svm_set_intercept(svm, INTERCEPT_INVPCID);
1125		else
1126			svm_clr_intercept(svm, INTERCEPT_INVPCID);
1127	}
1128
1129	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1130		if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1131			svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1132		else
1133			svm_set_intercept(svm, INTERCEPT_RDTSCP);
1134	}
1135}
1136
1137static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1138{
1139	struct vcpu_svm *svm = to_svm(vcpu);
1140
1141	if (guest_cpuid_is_intel(vcpu)) {
1142		/*
1143		 * We must intercept SYSENTER_EIP and SYSENTER_ESP
1144		 * accesses because the processor only stores 32 bits.
1145		 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1146		 */
1147		svm_set_intercept(svm, INTERCEPT_VMLOAD);
1148		svm_set_intercept(svm, INTERCEPT_VMSAVE);
1149		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1150
1151		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1152		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1153
1154		svm->v_vmload_vmsave_enabled = false;
1155	} else {
1156		/*
1157		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1158		 * in VMCB and clear intercepts to avoid #VMEXIT.
1159		 */
1160		if (vls) {
1161			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1162			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1163			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1164		}
1165		/* No need to intercept these MSRs */
1166		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1167		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1168	}
1169}
1170
1171static void init_vmcb(struct kvm_vcpu *vcpu)
1172{
1173	struct vcpu_svm *svm = to_svm(vcpu);
1174	struct vmcb *vmcb = svm->vmcb01.ptr;
1175	struct vmcb_control_area *control = &vmcb->control;
1176	struct vmcb_save_area *save = &vmcb->save;
1177
1178	svm_set_intercept(svm, INTERCEPT_CR0_READ);
1179	svm_set_intercept(svm, INTERCEPT_CR3_READ);
1180	svm_set_intercept(svm, INTERCEPT_CR4_READ);
1181	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1182	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1183	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1184	if (!kvm_vcpu_apicv_active(vcpu))
1185		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1186
1187	set_dr_intercepts(svm);
1188
1189	set_exception_intercept(svm, PF_VECTOR);
1190	set_exception_intercept(svm, UD_VECTOR);
1191	set_exception_intercept(svm, MC_VECTOR);
1192	set_exception_intercept(svm, AC_VECTOR);
1193	set_exception_intercept(svm, DB_VECTOR);
1194	/*
1195	 * Guest access to VMware backdoor ports could legitimately
 1196	 * trigger #GP because of the TSS I/O permission bitmap.
 1197	 * We intercept those #GPs and allow access to the ports anyway,
 1198	 * as VMware does.  Don't intercept #GP for SEV guests, as KVM can't
1199	 * decrypt guest memory to decode the faulting instruction.
1200	 */
1201	if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
1202		set_exception_intercept(svm, GP_VECTOR);
1203
1204	svm_set_intercept(svm, INTERCEPT_INTR);
1205	svm_set_intercept(svm, INTERCEPT_NMI);
1206
1207	if (intercept_smi)
1208		svm_set_intercept(svm, INTERCEPT_SMI);
1209
1210	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1211	svm_set_intercept(svm, INTERCEPT_RDPMC);
1212	svm_set_intercept(svm, INTERCEPT_CPUID);
1213	svm_set_intercept(svm, INTERCEPT_INVD);
1214	svm_set_intercept(svm, INTERCEPT_INVLPG);
1215	svm_set_intercept(svm, INTERCEPT_INVLPGA);
1216	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1217	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1218	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1219	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1220	svm_set_intercept(svm, INTERCEPT_VMRUN);
1221	svm_set_intercept(svm, INTERCEPT_VMMCALL);
1222	svm_set_intercept(svm, INTERCEPT_VMLOAD);
1223	svm_set_intercept(svm, INTERCEPT_VMSAVE);
1224	svm_set_intercept(svm, INTERCEPT_STGI);
1225	svm_set_intercept(svm, INTERCEPT_CLGI);
1226	svm_set_intercept(svm, INTERCEPT_SKINIT);
1227	svm_set_intercept(svm, INTERCEPT_WBINVD);
1228	svm_set_intercept(svm, INTERCEPT_XSETBV);
1229	svm_set_intercept(svm, INTERCEPT_RDPRU);
1230	svm_set_intercept(svm, INTERCEPT_RSM);
1231
1232	if (!kvm_mwait_in_guest(vcpu->kvm)) {
1233		svm_set_intercept(svm, INTERCEPT_MONITOR);
1234		svm_set_intercept(svm, INTERCEPT_MWAIT);
1235	}
1236
1237	if (!kvm_hlt_in_guest(vcpu->kvm))
1238		svm_set_intercept(svm, INTERCEPT_HLT);
1239
1240	control->iopm_base_pa = __sme_set(iopm_base);
1241	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1242	control->int_ctl = V_INTR_MASKING_MASK;
1243
1244	init_seg(&save->es);
1245	init_seg(&save->ss);
1246	init_seg(&save->ds);
1247	init_seg(&save->fs);
1248	init_seg(&save->gs);
1249
1250	save->cs.selector = 0xf000;
1251	save->cs.base = 0xffff0000;
1252	/* Executable/Readable Code Segment */
1253	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1254		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1255	save->cs.limit = 0xffff;
1256
1257	save->gdtr.base = 0;
1258	save->gdtr.limit = 0xffff;
1259	save->idtr.base = 0;
1260	save->idtr.limit = 0xffff;
1261
1262	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1263	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1264
1265	if (npt_enabled) {
1266		/* Setup VMCB for Nested Paging */
1267		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1268		svm_clr_intercept(svm, INTERCEPT_INVLPG);
1269		clr_exception_intercept(svm, PF_VECTOR);
1270		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1271		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1272		save->g_pat = vcpu->arch.pat;
1273		save->cr3 = 0;
1274	}
1275	svm->current_vmcb->asid_generation = 0;
1276	svm->asid = 0;
1277
1278	svm->nested.vmcb12_gpa = INVALID_GPA;
1279	svm->nested.last_vmcb12_gpa = INVALID_GPA;
1280
1281	if (!kvm_pause_in_guest(vcpu->kvm)) {
1282		control->pause_filter_count = pause_filter_count;
1283		if (pause_filter_thresh)
1284			control->pause_filter_thresh = pause_filter_thresh;
1285		svm_set_intercept(svm, INTERCEPT_PAUSE);
1286	} else {
1287		svm_clr_intercept(svm, INTERCEPT_PAUSE);
1288	}
1289
1290	svm_recalc_instruction_intercepts(vcpu, svm);
1291
1292	/*
1293	 * If the host supports V_SPEC_CTRL then disable the interception
1294	 * of MSR_IA32_SPEC_CTRL.
1295	 */
1296	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1297		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1298
1299	if (kvm_vcpu_apicv_active(vcpu))
1300		avic_init_vmcb(svm, vmcb);
1301
1302	if (vgif) {
1303		svm_clr_intercept(svm, INTERCEPT_STGI);
1304		svm_clr_intercept(svm, INTERCEPT_CLGI);
1305		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1306	}
1307
1308	if (sev_guest(vcpu->kvm))
1309		sev_init_vmcb(svm);
1310
1311	svm_hv_init_vmcb(vmcb);
1312	init_vmcb_after_set_cpuid(vcpu);
1313
1314	vmcb_mark_all_dirty(vmcb);
1315
1316	enable_gif(svm);
1317}
1318
1319static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1320{
1321	struct vcpu_svm *svm = to_svm(vcpu);
1322
1323	svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1324
1325	svm_init_osvw(vcpu);
1326	vcpu->arch.microcode_version = 0x01000065;
1327	svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1328
1329	if (sev_es_guest(vcpu->kvm))
1330		sev_es_vcpu_reset(svm);
1331}
1332
1333static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1334{
1335	struct vcpu_svm *svm = to_svm(vcpu);
1336
1337	svm->spec_ctrl = 0;
1338	svm->virt_spec_ctrl = 0;
1339
1340	init_vmcb(vcpu);
1341
1342	if (!init_event)
1343		__svm_vcpu_reset(vcpu);
1344}
1345
1346void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1347{
1348	svm->current_vmcb = target_vmcb;
1349	svm->vmcb = target_vmcb->ptr;
1350}
1351
1352static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1353{
1354	struct vcpu_svm *svm;
1355	struct page *vmcb01_page;
1356	struct page *vmsa_page = NULL;
1357	int err;
1358
1359	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1360	svm = to_svm(vcpu);
1361
1362	err = -ENOMEM;
1363	vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1364	if (!vmcb01_page)
1365		goto out;
1366
1367	if (sev_es_guest(vcpu->kvm)) {
1368		/*
1369		 * SEV-ES guests require a separate VMSA page used to contain
1370		 * the encrypted register state of the guest.
1371		 */
1372		vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1373		if (!vmsa_page)
1374			goto error_free_vmcb_page;
1375
1376		/*
1377		 * SEV-ES guests maintain an encrypted version of their FPU
1378		 * state which is restored and saved on VMRUN and VMEXIT.
1379		 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1380		 * do xsave/xrstor on it.
1381		 */
1382		fpstate_set_confidential(&vcpu->arch.guest_fpu);
1383	}
1384
1385	err = avic_init_vcpu(svm);
1386	if (err)
1387		goto error_free_vmsa_page;
1388
1389	svm->msrpm = svm_vcpu_alloc_msrpm();
1390	if (!svm->msrpm) {
1391		err = -ENOMEM;
1392		goto error_free_vmsa_page;
1393	}
1394
1395	svm->x2avic_msrs_intercepted = true;
1396
1397	svm->vmcb01.ptr = page_address(vmcb01_page);
1398	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1399	svm_switch_vmcb(svm, &svm->vmcb01);
1400
1401	if (vmsa_page)
1402		svm->sev_es.vmsa = page_address(vmsa_page);
1403
1404	svm->guest_state_loaded = false;
1405
1406	return 0;
1407
1408error_free_vmsa_page:
1409	if (vmsa_page)
1410		__free_page(vmsa_page);
1411error_free_vmcb_page:
1412	__free_page(vmcb01_page);
1413out:
1414	return err;
1415}
1416
1417static void svm_clear_current_vmcb(struct vmcb *vmcb)
1418{
1419	int i;
1420
1421	for_each_online_cpu(i)
1422		cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
1423}
1424
1425static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1426{
1427	struct vcpu_svm *svm = to_svm(vcpu);
1428
1429	/*
1430	 * The vmcb page can be recycled, causing a false negative in
1431	 * svm_vcpu_load(). So, ensure that no logical CPU has this
1432	 * vmcb page recorded as its current vmcb.
1433	 */
1434	svm_clear_current_vmcb(svm->vmcb);
1435
1436	svm_leave_nested(vcpu);
1437	svm_free_nested(svm);
1438
1439	sev_free_vcpu(vcpu);
1440
1441	__free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1442	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1443}
1444
1445static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1446{
1447	struct vcpu_svm *svm = to_svm(vcpu);
1448	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
1449
1450	if (sev_es_guest(vcpu->kvm))
1451		sev_es_unmap_ghcb(svm);
1452
1453	if (svm->guest_state_loaded)
1454		return;
1455
1456	/*
1457	 * Save additional host state that will be restored on VMEXIT (sev-es)
1458	 * or subsequent vmload of host save area.
1459	 */
1460	vmsave(sd->save_area_pa);
1461	if (sev_es_guest(vcpu->kvm)) {
1462		struct sev_es_save_area *hostsa;
1463		hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1464
1465		sev_es_prepare_switch_to_guest(hostsa);
1466	}
1467
1468	if (tsc_scaling)
1469		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1470
1471	if (likely(tsc_aux_uret_slot >= 0))
1472		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1473
1474	svm->guest_state_loaded = true;
1475}
1476
1477static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1478{
1479	to_svm(vcpu)->guest_state_loaded = false;
1480}
1481
1482static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1483{
1484	struct vcpu_svm *svm = to_svm(vcpu);
1485	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
1486
1487	if (sd->current_vmcb != svm->vmcb) {
1488		sd->current_vmcb = svm->vmcb;
1489		indirect_branch_prediction_barrier();
1490	}
1491	if (kvm_vcpu_apicv_active(vcpu))
1492		avic_vcpu_load(vcpu, cpu);
1493}
1494
1495static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1496{
1497	if (kvm_vcpu_apicv_active(vcpu))
1498		avic_vcpu_put(vcpu);
1499
1500	svm_prepare_host_switch(vcpu);
1501
1502	++vcpu->stat.host_state_reload;
1503}
1504
1505static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1506{
1507	struct vcpu_svm *svm = to_svm(vcpu);
1508	unsigned long rflags = svm->vmcb->save.rflags;
1509
1510	if (svm->nmi_singlestep) {
1511		/* Hide our flags if they were not set by the guest */
1512		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1513			rflags &= ~X86_EFLAGS_TF;
1514		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1515			rflags &= ~X86_EFLAGS_RF;
1516	}
1517	return rflags;
1518}
1519
1520static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1521{
1522	if (to_svm(vcpu)->nmi_singlestep)
1523		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1524
 1525	/*
 1526	 * Any change of EFLAGS.VM is accompanied by a reload of SS
 1527	 * (caused by either a task switch or an inter-privilege IRET),
 1528	 * so we do not need to update the CPL here.
 1529	 */
1530	to_svm(vcpu)->vmcb->save.rflags = rflags;
1531}
1532
1533static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1534{
1535	struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1536
1537	return sev_es_guest(vcpu->kvm)
1538		? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1539		: kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1540}
1541
1542static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1543{
1544	kvm_register_mark_available(vcpu, reg);
1545
1546	switch (reg) {
1547	case VCPU_EXREG_PDPTR:
1548		/*
1549		 * When !npt_enabled, mmu->pdptrs[] is already available since
1550		 * it is always updated per SDM when moving to CRs.
1551		 */
1552		if (npt_enabled)
1553			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1554		break;
1555	default:
1556		KVM_BUG_ON(1, vcpu->kvm);
1557	}
1558}
1559
1560static void svm_set_vintr(struct vcpu_svm *svm)
1561{
1562	struct vmcb_control_area *control;
1563
1564	/*
1565	 * The following fields are ignored when AVIC is enabled
1566	 */
1567	WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1568
1569	svm_set_intercept(svm, INTERCEPT_VINTR);
1570
1571	/*
1572	 * This is just a dummy VINTR to actually cause a vmexit to happen.
1573	 * Actual injection of virtual interrupts happens through EVENTINJ.
1574	 */
1575	control = &svm->vmcb->control;
1576	control->int_vector = 0x0;
1577	control->int_ctl &= ~V_INTR_PRIO_MASK;
1578	control->int_ctl |= V_IRQ_MASK |
1579		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1580	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1581}
1582
1583static void svm_clear_vintr(struct vcpu_svm *svm)
1584{
1585	svm_clr_intercept(svm, INTERCEPT_VINTR);
1586
1587	/* Drop int_ctl fields related to VINTR injection.  */
1588	svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1589	if (is_guest_mode(&svm->vcpu)) {
1590		svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1591
1592		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1593			(svm->nested.ctl.int_ctl & V_TPR_MASK));
1594
1595		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1596			V_IRQ_INJECTION_BITS_MASK;
1597
1598		svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1599	}
1600
1601	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1602}
1603
1604static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1605{
1606	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1607	struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1608
1609	switch (seg) {
1610	case VCPU_SREG_CS: return &save->cs;
1611	case VCPU_SREG_DS: return &save->ds;
1612	case VCPU_SREG_ES: return &save->es;
1613	case VCPU_SREG_FS: return &save01->fs;
1614	case VCPU_SREG_GS: return &save01->gs;
1615	case VCPU_SREG_SS: return &save->ss;
1616	case VCPU_SREG_TR: return &save01->tr;
1617	case VCPU_SREG_LDTR: return &save01->ldtr;
1618	}
1619	BUG();
1620	return NULL;
1621}
1622
1623static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1624{
1625	struct vmcb_seg *s = svm_seg(vcpu, seg);
1626
1627	return s->base;
1628}
1629
1630static void svm_get_segment(struct kvm_vcpu *vcpu,
1631			    struct kvm_segment *var, int seg)
1632{
1633	struct vmcb_seg *s = svm_seg(vcpu, seg);
1634
1635	var->base = s->base;
1636	var->limit = s->limit;
1637	var->selector = s->selector;
1638	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1639	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1640	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1641	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1642	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1643	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1644	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1645
1646	/*
1647	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
1648	 * However, the SVM spec states that the G bit is not observed by the
1649	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1650	 * So let's synthesize a legal G bit for all segments, this helps
1651	 * running KVM nested. It also helps cross-vendor migration, because
1652	 * Intel's vmentry has a check on the 'G' bit.
1653	 */
1654	var->g = s->limit > 0xfffff;
1655
1656	/*
1657	 * AMD's VMCB does not have an explicit unusable field, so emulate it
 1658	 * for cross-vendor migration purposes by deriving it from "not present".
1659	 */
1660	var->unusable = !var->present;
1661
1662	switch (seg) {
1663	case VCPU_SREG_TR:
1664		/*
1665		 * Work around a bug where the busy flag in the tr selector
1666		 * isn't exposed
1667		 */
1668		var->type |= 0x2;
1669		break;
1670	case VCPU_SREG_DS:
1671	case VCPU_SREG_ES:
1672	case VCPU_SREG_FS:
1673	case VCPU_SREG_GS:
1674		/*
1675		 * The accessed bit must always be set in the segment
 1676		 * descriptor cache: although it can be cleared in the
 1677		 * descriptor itself, the cached bit always remains 1. Since
1678		 * Intel has a check on this, set it here to support
1679		 * cross-vendor migration.
1680		 */
1681		if (!var->unusable)
1682			var->type |= 0x1;
1683		break;
1684	case VCPU_SREG_SS:
1685		/*
1686		 * On AMD CPUs sometimes the DB bit in the segment
1687		 * descriptor is left as 1, although the whole segment has
1688		 * been made unusable. Clear it here to pass an Intel VMX
1689		 * entry check when cross vendor migrating.
1690		 */
1691		if (var->unusable)
1692			var->db = 0;
1693		/* This is symmetric with svm_set_segment() */
1694		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1695		break;
1696	}
1697}
1698
1699static int svm_get_cpl(struct kvm_vcpu *vcpu)
1700{
1701	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1702
1703	return save->cpl;
1704}
1705
1706static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1707{
1708	struct kvm_segment cs;
1709
1710	svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1711	*db = cs.db;
1712	*l = cs.l;
1713}
1714
1715static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1716{
1717	struct vcpu_svm *svm = to_svm(vcpu);
1718
1719	dt->size = svm->vmcb->save.idtr.limit;
1720	dt->address = svm->vmcb->save.idtr.base;
1721}
1722
1723static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1724{
1725	struct vcpu_svm *svm = to_svm(vcpu);
1726
1727	svm->vmcb->save.idtr.limit = dt->size;
 1728	svm->vmcb->save.idtr.base = dt->address;
1729	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1730}
1731
1732static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1733{
1734	struct vcpu_svm *svm = to_svm(vcpu);
1735
1736	dt->size = svm->vmcb->save.gdtr.limit;
1737	dt->address = svm->vmcb->save.gdtr.base;
1738}
1739
1740static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1741{
1742	struct vcpu_svm *svm = to_svm(vcpu);
1743
1744	svm->vmcb->save.gdtr.limit = dt->size;
 1745	svm->vmcb->save.gdtr.base = dt->address;
1746	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1747}
1748
1749static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1750{
1751	struct vcpu_svm *svm = to_svm(vcpu);
1752
1753	/*
1754	 * For guests that don't set guest_state_protected, the cr3 update is
1755	 * handled via kvm_mmu_load() while entering the guest. For guests
1756	 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1757	 * VMCB save area now, since the save area will become the initial
1758	 * contents of the VMSA, and future VMCB save area updates won't be
1759	 * seen.
1760	 */
1761	if (sev_es_guest(vcpu->kvm)) {
1762		svm->vmcb->save.cr3 = cr3;
1763		vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1764	}
1765}
1766
1767void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1768{
1769	struct vcpu_svm *svm = to_svm(vcpu);
1770	u64 hcr0 = cr0;
1771	bool old_paging = is_paging(vcpu);
1772
1773#ifdef CONFIG_X86_64
1774	if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1775		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1776			vcpu->arch.efer |= EFER_LMA;
1777			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1778		}
1779
1780		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1781			vcpu->arch.efer &= ~EFER_LMA;
1782			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1783		}
1784	}
1785#endif
1786	vcpu->arch.cr0 = cr0;
1787
1788	if (!npt_enabled) {
1789		hcr0 |= X86_CR0_PG | X86_CR0_WP;
1790		if (old_paging != is_paging(vcpu))
1791			svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1792	}
1793
1794	/*
1795	 * re-enable caching here because the QEMU bios
1796	 * does not do it - this results in some delay at
1797	 * reboot
1798	 */
1799	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1800		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1801
1802	svm->vmcb->save.cr0 = hcr0;
1803	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1804
1805	/*
1806	 * SEV-ES guests must always keep the CR intercepts cleared. CR
1807	 * tracking is done using the CR write traps.
1808	 */
1809	if (sev_es_guest(vcpu->kvm))
1810		return;
1811
1812	if (hcr0 == cr0) {
1813		/* Selective CR0 write remains on.  */
1814		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1815		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1816	} else {
1817		svm_set_intercept(svm, INTERCEPT_CR0_READ);
1818		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1819	}
1820}
1821
1822static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1823{
1824	return true;
1825}
1826
1827void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1828{
1829	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1830	unsigned long old_cr4 = vcpu->arch.cr4;
1831
1832	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1833		svm_flush_tlb_current(vcpu);
1834
1835	vcpu->arch.cr4 = cr4;
1836	if (!npt_enabled) {
1837		cr4 |= X86_CR4_PAE;
1838
1839		if (!is_paging(vcpu))
1840			cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1841	}
1842	cr4 |= host_cr4_mce;
1843	to_svm(vcpu)->vmcb->save.cr4 = cr4;
1844	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1845
1846	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1847		kvm_update_cpuid_runtime(vcpu);
1848}
1849
1850static void svm_set_segment(struct kvm_vcpu *vcpu,
1851			    struct kvm_segment *var, int seg)
1852{
1853	struct vcpu_svm *svm = to_svm(vcpu);
1854	struct vmcb_seg *s = svm_seg(vcpu, seg);
1855
1856	s->base = var->base;
1857	s->limit = var->limit;
1858	s->selector = var->selector;
1859	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1860	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1861	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1862	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1863	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1864	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1865	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1866	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1867
1868	/*
1869	 * This is always accurate, except if SYSRET returned to a segment
1870	 * with SS.DPL != 3.  Intel does not have this quirk, and always
1871	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1872	 * would entail passing the CPL to userspace and back.
1873	 */
1874	if (seg == VCPU_SREG_SS)
1875		/* This is symmetric with svm_get_segment() */
1876		svm->vmcb->save.cpl = (var->dpl & 3);
1877
1878	vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1879}
1880
1881static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1882{
1883	struct vcpu_svm *svm = to_svm(vcpu);
1884
1885	clr_exception_intercept(svm, BP_VECTOR);
1886
1887	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1888		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1889			set_exception_intercept(svm, BP_VECTOR);
1890	}
1891}
1892
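    /*
     * Allocate a fresh ASID from the per-CPU pool.  When the pool
     * [min_asid, max_asid] is exhausted, bump the generation, restart at
     * min_asid and have the next VMRUN flush all ASIDs so that recycled
     * ASIDs cannot match stale TLB entries.
     */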
1893static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1894{
1895	if (sd->next_asid > sd->max_asid) {
1896		++sd->asid_generation;
1897		sd->next_asid = sd->min_asid;
1898		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1899		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1900	}
1901
1902	svm->current_vmcb->asid_generation = sd->asid_generation;
1903	svm->asid = sd->next_asid++;
1904}
1905
1906static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1907{
1908	struct vmcb *vmcb = svm->vmcb;
1909
1910	if (svm->vcpu.arch.guest_state_protected)
1911		return;
1912
1913	if (unlikely(value != vmcb->save.dr6)) {
1914		vmcb->save.dr6 = value;
1915		vmcb_mark_dirty(vmcb, VMCB_DR);
1916	}
1917}
1918
1919static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1920{
1921	struct vcpu_svm *svm = to_svm(vcpu);
1922
1923	if (vcpu->arch.guest_state_protected)
1924		return;
1925
1926	get_debugreg(vcpu->arch.db[0], 0);
1927	get_debugreg(vcpu->arch.db[1], 1);
1928	get_debugreg(vcpu->arch.db[2], 2);
1929	get_debugreg(vcpu->arch.db[3], 3);
1930	/*
1931	 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
1932	 * because db_interception might need it.  We can do it before vmentry.
1933	 */
1934	vcpu->arch.dr6 = svm->vmcb->save.dr6;
1935	vcpu->arch.dr7 = svm->vmcb->save.dr7;
1936	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1937	set_dr_intercepts(svm);
1938}
1939
1940static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1941{
1942	struct vcpu_svm *svm = to_svm(vcpu);
1943
1944	if (vcpu->arch.guest_state_protected)
1945		return;
1946
1947	svm->vmcb->save.dr7 = value;
1948	vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1949}
1950
1951static int pf_interception(struct kvm_vcpu *vcpu)
1952{
1953	struct vcpu_svm *svm = to_svm(vcpu);
1954
1955	u64 fault_address = svm->vmcb->control.exit_info_2;
1956	u64 error_code = svm->vmcb->control.exit_info_1;
1957
1958	return kvm_handle_page_fault(vcpu, error_code, fault_address,
1959			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1960			svm->vmcb->control.insn_bytes : NULL,
1961			svm->vmcb->control.insn_len);
1962}
1963
1964static int npf_interception(struct kvm_vcpu *vcpu)
1965{
1966	struct vcpu_svm *svm = to_svm(vcpu);
1967
1968	u64 fault_address = svm->vmcb->control.exit_info_2;
1969	u64 error_code = svm->vmcb->control.exit_info_1;
1970
1971	trace_kvm_page_fault(vcpu, fault_address, error_code);
1972	return kvm_mmu_page_fault(vcpu, fault_address, error_code,
1973			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1974			svm->vmcb->control.insn_bytes : NULL,
1975			svm->vmcb->control.insn_len);
1976}
1977
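    /*
     * #DB intercept: the exception may belong to the guest, to a host
     * userspace debugger (KVM_GUESTDBG_*), or to KVM's own NMI single-step
     * workaround; figure out which and route it accordingly.
     */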
1978static int db_interception(struct kvm_vcpu *vcpu)
1979{
1980	struct kvm_run *kvm_run = vcpu->run;
1981	struct vcpu_svm *svm = to_svm(vcpu);
1982
1983	if (!(vcpu->guest_debug &
1984	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1985	    !svm->nmi_singlestep) {
1986		u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
1987		kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
1988		return 1;
1989	}
1990
1991	if (svm->nmi_singlestep) {
1992		disable_nmi_singlestep(svm);
1993		/* Make sure we check for pending NMIs upon entry */
1994		kvm_make_request(KVM_REQ_EVENT, vcpu);
1995	}
1996
1997	if (vcpu->guest_debug &
1998	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1999		kvm_run->exit_reason = KVM_EXIT_DEBUG;
2000		kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2001		kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
2002		kvm_run->debug.arch.pc =
2003			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2004		kvm_run->debug.arch.exception = DB_VECTOR;
2005		return 0;
2006	}
2007
2008	return 1;
2009}
2010
2011static int bp_interception(struct kvm_vcpu *vcpu)
2012{
2013	struct vcpu_svm *svm = to_svm(vcpu);
2014	struct kvm_run *kvm_run = vcpu->run;
2015
2016	kvm_run->exit_reason = KVM_EXIT_DEBUG;
2017	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2018	kvm_run->debug.arch.exception = BP_VECTOR;
2019	return 0;
2020}
2021
2022static int ud_interception(struct kvm_vcpu *vcpu)
2023{
2024	return handle_ud(vcpu);
2025}
2026
2027static int ac_interception(struct kvm_vcpu *vcpu)
2028{
2029	kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2030	return 1;
2031}
2032
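    /*
     * Check whether an intercepted #MC was caused by AMD erratum 383 by
     * looking for its characteristic MC0_STATUS signature (ignoring bit 62,
     * which may or may not be set).  If it matches, clear the MCE status
     * banks and flush the TLB to evict multi-match entries.
     */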
2033static bool is_erratum_383(void)
2034{
2035	int err, i;
2036	u64 value;
2037
2038	if (!erratum_383_found)
2039		return false;
2040
2041	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2042	if (err)
2043		return false;
2044
2045	/* Bit 62 may or may not be set for this mce */
2046	value &= ~(1ULL << 62);
2047
2048	if (value != 0xb600000000010015ULL)
2049		return false;
2050
2051	/* Clear MCi_STATUS registers */
2052	for (i = 0; i < 6; ++i)
2053		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2054
2055	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2056	if (!err) {
2057		u32 low, high;
2058
2059		value &= ~(1ULL << 2);
2060		low    = lower_32_bits(value);
2061		high   = upper_32_bits(value);
2062
2063		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2064	}
2065
2066	/* Flush tlb to evict multi-match entries */
2067	__flush_tlb_all();
2068
2069	return true;
2070}
2071
2072static void svm_handle_mce(struct kvm_vcpu *vcpu)
2073{
2074	if (is_erratum_383()) {
2075		/*
2076		 * Erratum 383 triggered. Guest state is corrupt so kill the
2077		 * guest.
2078		 */
2079		pr_err("KVM: Guest triggered AMD Erratum 383\n");
2080
2081		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2082
2083		return;
2084	}
2085
2086	/*
2087	 * On an #MC intercept the MCE handler is not called automatically in
2088	 * the host. So do it by hand here.
2089	 */
2090	kvm_machine_check();
2091}
2092
2093static int mc_interception(struct kvm_vcpu *vcpu)
2094{
2095	return 1;
2096}
2097
2098static int shutdown_interception(struct kvm_vcpu *vcpu)
2099{
2100	struct kvm_run *kvm_run = vcpu->run;
2101	struct vcpu_svm *svm = to_svm(vcpu);
2102
2103	/*
2104	 * The VM save area has already been encrypted so it
2105	 * cannot be reinitialized - just terminate.
2106	 */
2107	if (sev_es_guest(vcpu->kvm))
2108		return -EINVAL;
2109
2110	/*
2111	 * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2112	 * the VMCB in a known good state.  Unfortunately, KVM doesn't have
2113	 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2114	 * userspace.  At a platform view, INIT is acceptable behavior as
2115	 * there exist bare metal platforms that automatically INIT the CPU
2116	 * in response to shutdown.
2117	 */
2118	clear_page(svm->vmcb);
2119	kvm_vcpu_reset(vcpu, true);
2120
2121	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2122	return 0;
2123}
2124
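    /*
     * IOIO intercept: EXITINFO1 describes the access (direction, string,
     * operand size and port number) and EXITINFO2 holds the rIP of the
     * instruction following the IN/OUT, which becomes next_rip for the
     * fast path below.
     */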
2125static int io_interception(struct kvm_vcpu *vcpu)
2126{
2127	struct vcpu_svm *svm = to_svm(vcpu);
2128	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2129	int size, in, string;
2130	unsigned port;
2131
2132	++vcpu->stat.io_exits;
2133	string = (io_info & SVM_IOIO_STR_MASK) != 0;
2134	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2135	port = io_info >> 16;
2136	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2137
2138	if (string) {
2139		if (sev_es_guest(vcpu->kvm))
2140			return sev_es_string_io(svm, size, port, in);
2141		else
2142			return kvm_emulate_instruction(vcpu, 0);
2143	}
2144
2145	svm->next_rip = svm->vmcb->control.exit_info_2;
2146
2147	return kvm_fast_pio(vcpu, size, port, in);
2148}
2149
2150static int nmi_interception(struct kvm_vcpu *vcpu)
2151{
2152	return 1;
2153}
2154
2155static int smi_interception(struct kvm_vcpu *vcpu)
2156{
2157	return 1;
2158}
2159
2160static int intr_interception(struct kvm_vcpu *vcpu)
2161{
2162	++vcpu->stat.irq_exits;
2163	return 1;
2164}
2165
2166static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2167{
2168	struct vcpu_svm *svm = to_svm(vcpu);
2169	struct vmcb *vmcb12;
2170	struct kvm_host_map map;
2171	int ret;
2172
2173	if (nested_svm_check_permissions(vcpu))
2174		return 1;
2175
2176	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2177	if (ret) {
2178		if (ret == -EINVAL)
2179			kvm_inject_gp(vcpu, 0);
2180		return 1;
2181	}
2182
2183	vmcb12 = map.hva;
2184
2185	ret = kvm_skip_emulated_instruction(vcpu);
2186
2187	if (vmload) {
2188		svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2189		svm->sysenter_eip_hi = 0;
2190		svm->sysenter_esp_hi = 0;
2191	} else {
2192		svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2193	}
2194
2195	kvm_vcpu_unmap(vcpu, &map, true);
2196
2197	return ret;
2198}
2199
2200static int vmload_interception(struct kvm_vcpu *vcpu)
2201{
2202	return vmload_vmsave_interception(vcpu, true);
2203}
2204
2205static int vmsave_interception(struct kvm_vcpu *vcpu)
2206{
2207	return vmload_vmsave_interception(vcpu, false);
2208}
2209
2210static int vmrun_interception(struct kvm_vcpu *vcpu)
2211{
2212	if (nested_svm_check_permissions(vcpu))
2213		return 1;
2214
2215	return nested_svm_vmrun(vcpu);
2216}
2217
2218enum {
2219	NONE_SVM_INSTR,
2220	SVM_INSTR_VMRUN,
2221	SVM_INSTR_VMLOAD,
2222	SVM_INSTR_VMSAVE,
2223};
2224
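    /*
     * VMRUN, VMLOAD and VMSAVE all share the two-byte opcode 0F 01 and are
     * distinguished only by their ModRM byte (0xD8, 0xDA and 0xDB
     * respectively), which is what the decode below matches against.
     */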
2225/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2226static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2227{
2228	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2229
2230	if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2231		return NONE_SVM_INSTR;
2232
2233	switch (ctxt->modrm) {
2234	case 0xd8: /* VMRUN */
2235		return SVM_INSTR_VMRUN;
2236	case 0xda: /* VMLOAD */
2237		return SVM_INSTR_VMLOAD;
2238	case 0xdb: /* VMSAVE */
2239		return SVM_INSTR_VMSAVE;
2240	default:
2241		break;
2242	}
2243
2244	return NONE_SVM_INSTR;
2245}
2246
2247static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2248{
2249	const int guest_mode_exit_codes[] = {
2250		[SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2251		[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2252		[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2253	};
2254	int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2255		[SVM_INSTR_VMRUN] = vmrun_interception,
2256		[SVM_INSTR_VMLOAD] = vmload_interception,
2257		[SVM_INSTR_VMSAVE] = vmsave_interception,
2258	};
2259	struct vcpu_svm *svm = to_svm(vcpu);
2260	int ret;
2261
2262	if (is_guest_mode(vcpu)) {
2263		/* Returns '1' or -errno on failure, '0' on success. */
2264		ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2265		if (ret)
2266			return ret;
2267		return 1;
2268	}
2269	return svm_instr_handlers[opcode](vcpu);
2270}
2271
2272/*
2273 * #GP handling code. Note that #GP can be triggered under the following two
2274 * cases:
2275 *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2276 *      some AMD CPUs when EAX of these instructions are in the reserved memory
2277 *      regions (e.g. SMM memory on host).
2278 *   2) VMware backdoor
2279 */
2280static int gp_interception(struct kvm_vcpu *vcpu)
2281{
2282	struct vcpu_svm *svm = to_svm(vcpu);
2283	u32 error_code = svm->vmcb->control.exit_info_1;
2284	int opcode;
2285
2286	/* Both #GP cases have zero error_code */
2287	if (error_code)
2288		goto reinject;
2289
2290	/* Decode the instruction for usage later */
2291	if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2292		goto reinject;
2293
2294	opcode = svm_instr_opcode(vcpu);
2295
2296	if (opcode == NONE_SVM_INSTR) {
2297		if (!enable_vmware_backdoor)
2298			goto reinject;
2299
2300		/*
2301		 * VMware backdoor emulation on #GP interception only handles
2302		 * IN{S}, OUT{S}, and RDPMC.
2303		 */
2304		if (!is_guest_mode(vcpu))
2305			return kvm_emulate_instruction(vcpu,
2306				EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2307	} else {
2308		/* All SVM instructions expect page aligned RAX */
2309		if (svm->vmcb->save.rax & ~PAGE_MASK)
2310			goto reinject;
2311
2312		return emulate_svm_instr(vcpu, opcode);
2313	}
2314
2315reinject:
2316	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2317	return 1;
2318}
2319
2320void svm_set_gif(struct vcpu_svm *svm, bool value)
2321{
2322	if (value) {
2323		/*
2324		 * If VGIF is enabled, the STGI intercept is only added to
2325		 * detect the opening of the SMI/NMI window; remove it now.
2326		 * Likewise, clear the VINTR intercept, we will set it
2327		 * again while processing KVM_REQ_EVENT if needed.
2328		 */
2329		if (vgif)
2330			svm_clr_intercept(svm, INTERCEPT_STGI);
2331		if (svm_is_intercept(svm, INTERCEPT_VINTR))
2332			svm_clear_vintr(svm);
2333
2334		enable_gif(svm);
2335		if (svm->vcpu.arch.smi_pending ||
2336		    svm->vcpu.arch.nmi_pending ||
2337		    kvm_cpu_has_injectable_intr(&svm->vcpu) ||
2338		    kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
2339			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2340	} else {
2341		disable_gif(svm);
2342
2343		/*
2344		 * After a CLGI no interrupts should come.  But if vGIF is
2345		 * in use, we still rely on the VINTR intercept (rather than
2346		 * STGI) to detect an open interrupt window.
2347		 */
2348		if (!vgif)
2349			svm_clear_vintr(svm);
2350	}
2351}
2352
2353static int stgi_interception(struct kvm_vcpu *vcpu)
2354{
2355	int ret;
2356
2357	if (nested_svm_check_permissions(vcpu))
2358		return 1;
2359
2360	ret = kvm_skip_emulated_instruction(vcpu);
2361	svm_set_gif(to_svm(vcpu), true);
2362	return ret;
2363}
2364
2365static int clgi_interception(struct kvm_vcpu *vcpu)
2366{
2367	int ret;
2368
2369	if (nested_svm_check_permissions(vcpu))
2370		return 1;
2371
2372	ret = kvm_skip_emulated_instruction(vcpu);
2373	svm_set_gif(to_svm(vcpu), false);
2374	return ret;
2375}
2376
2377static int invlpga_interception(struct kvm_vcpu *vcpu)
2378{
2379	gva_t gva = kvm_rax_read(vcpu);
2380	u32 asid = kvm_rcx_read(vcpu);
2381
2382	/* FIXME: Handle an address size prefix. */
2383	if (!is_long_mode(vcpu))
2384		gva = (u32)gva;
2385
2386	trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2387
2388	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2389	kvm_mmu_invlpg(vcpu, gva);
2390
2391	return kvm_skip_emulated_instruction(vcpu);
2392}
2393
2394static int skinit_interception(struct kvm_vcpu *vcpu)
2395{
2396	trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2397
2398	kvm_queue_exception(vcpu, UD_VECTOR);
2399	return 1;
2400}
2401
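    /*
     * Task switch intercept: EXITINFO1 holds the target TSS selector,
     * EXITINFO2 encodes how the switch was initiated (IRET/JMP) plus an
     * optional error code, and EXITINTINFO describes any event that was
     * being delivered through a task gate and whose queue must be cleared
     * before the switch is emulated.
     */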
2402static int task_switch_interception(struct kvm_vcpu *vcpu)
2403{
2404	struct vcpu_svm *svm = to_svm(vcpu);
2405	u16 tss_selector;
2406	int reason;
2407	int int_type = svm->vmcb->control.exit_int_info &
2408		SVM_EXITINTINFO_TYPE_MASK;
2409	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2410	uint32_t type =
2411		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2412	uint32_t idt_v =
2413		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2414	bool has_error_code = false;
2415	u32 error_code = 0;
2416
2417	tss_selector = (u16)svm->vmcb->control.exit_info_1;
2418
2419	if (svm->vmcb->control.exit_info_2 &
2420	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2421		reason = TASK_SWITCH_IRET;
2422	else if (svm->vmcb->control.exit_info_2 &
2423		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2424		reason = TASK_SWITCH_JMP;
2425	else if (idt_v)
2426		reason = TASK_SWITCH_GATE;
2427	else
2428		reason = TASK_SWITCH_CALL;
2429
2430	if (reason == TASK_SWITCH_GATE) {
2431		switch (type) {
2432		case SVM_EXITINTINFO_TYPE_NMI:
2433			vcpu->arch.nmi_injected = false;
2434			break;
2435		case SVM_EXITINTINFO_TYPE_EXEPT:
2436			if (svm->vmcb->control.exit_info_2 &
2437			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2438				has_error_code = true;
2439				error_code =
2440					(u32)svm->vmcb->control.exit_info_2;
2441			}
2442			kvm_clear_exception_queue(vcpu);
2443			break;
2444		case SVM_EXITINTINFO_TYPE_INTR:
2445		case SVM_EXITINTINFO_TYPE_SOFT:
2446			kvm_clear_interrupt_queue(vcpu);
2447			break;
2448		default:
2449			break;
2450		}
2451	}
2452
2453	if (reason != TASK_SWITCH_GATE ||
2454	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2455	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2456	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2457		if (!svm_skip_emulated_instruction(vcpu))
2458			return 0;
2459	}
2460
2461	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2462		int_vec = -1;
2463
2464	return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2465			       has_error_code, error_code);
2466}
2467
2468static int iret_interception(struct kvm_vcpu *vcpu)
2469{
2470	struct vcpu_svm *svm = to_svm(vcpu);
2471
2472	++vcpu->stat.nmi_window_exits;
2473	vcpu->arch.hflags |= HF_IRET_MASK;
2474	if (!sev_es_guest(vcpu->kvm)) {
2475		svm_clr_intercept(svm, INTERCEPT_IRET);
2476		svm->nmi_iret_rip = kvm_rip_read(vcpu);
2477	}
2478	kvm_make_request(KVM_REQ_EVENT, vcpu);
2479	return 1;
2480}
2481
2482static int invlpg_interception(struct kvm_vcpu *vcpu)
2483{
2484	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2485		return kvm_emulate_instruction(vcpu, 0);
2486
2487	kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2488	return kvm_skip_emulated_instruction(vcpu);
2489}
2490
2491static int emulate_on_interception(struct kvm_vcpu *vcpu)
2492{
2493	return kvm_emulate_instruction(vcpu, 0);
2494}
2495
2496static int rsm_interception(struct kvm_vcpu *vcpu)
2497{
2498	return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2499}
2500
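    /*
     * The selective CR0 write intercept ignores changes that only touch
     * CR0.TS and CR0.MP (SVM_CR0_SELECTIVE_MASK).  If L1 requested the
     * intercept and L2's write changes any other CR0 bit, reflect the write
     * back to L1 as an SVM_EXIT_CR0_SEL_WRITE vmexit instead of handling it
     * in KVM.
     */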
2501static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2502					    unsigned long val)
2503{
2504	struct vcpu_svm *svm = to_svm(vcpu);
2505	unsigned long cr0 = vcpu->arch.cr0;
2506	bool ret = false;
2507
2508	if (!is_guest_mode(vcpu) ||
2509	    (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2510		return false;
2511
2512	cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2513	val &= ~SVM_CR0_SELECTIVE_MASK;
2514
2515	if (cr0 ^ val) {
2516		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2517		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2518	}
2519
2520	return ret;
2521}
2522
2523#define CR_VALID (1ULL << 63)
2524
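    /*
     * CR access intercept with decode assists: bit 63 of EXITINFO1 is set
     * for MOV CRx instructions, in which case the low bits identify the GPR
     * operand; anything else (LMSW, CLTS, ...) lacks that information and
     * is punted to the instruction emulator.  Read and write exit codes are
     * 16 apart, hence the "cr >= 16" test for writes.
     */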
2525static int cr_interception(struct kvm_vcpu *vcpu)
2526{
2527	struct vcpu_svm *svm = to_svm(vcpu);
2528	int reg, cr;
2529	unsigned long val;
2530	int err;
2531
2532	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2533		return emulate_on_interception(vcpu);
2534
2535	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2536		return emulate_on_interception(vcpu);
2537
2538	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2539	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2540		cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2541	else
2542		cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2543
2544	err = 0;
2545	if (cr >= 16) { /* mov to cr */
2546		cr -= 16;
2547		val = kvm_register_read(vcpu, reg);
2548		trace_kvm_cr_write(cr, val);
2549		switch (cr) {
2550		case 0:
2551			if (!check_selective_cr0_intercepted(vcpu, val))
2552				err = kvm_set_cr0(vcpu, val);
2553			else
2554				return 1;
2555
2556			break;
2557		case 3:
2558			err = kvm_set_cr3(vcpu, val);
2559			break;
2560		case 4:
2561			err = kvm_set_cr4(vcpu, val);
2562			break;
2563		case 8:
2564			err = kvm_set_cr8(vcpu, val);
2565			break;
2566		default:
2567			WARN(1, "unhandled write to CR%d", cr);
2568			kvm_queue_exception(vcpu, UD_VECTOR);
2569			return 1;
2570		}
2571	} else { /* mov from cr */
2572		switch (cr) {
2573		case 0:
2574			val = kvm_read_cr0(vcpu);
2575			break;
2576		case 2:
2577			val = vcpu->arch.cr2;
2578			break;
2579		case 3:
2580			val = kvm_read_cr3(vcpu);
2581			break;
2582		case 4:
2583			val = kvm_read_cr4(vcpu);
2584			break;
2585		case 8:
2586			val = kvm_get_cr8(vcpu);
2587			break;
2588		default:
2589			WARN(1, "unhandled read from CR%d", cr);
2590			kvm_queue_exception(vcpu, UD_VECTOR);
2591			return 1;
2592		}
2593		kvm_register_write(vcpu, reg, val);
2594		trace_kvm_cr_read(cr, val);
2595	}
2596	return kvm_complete_insn_gp(vcpu, err);
2597}
2598
2599static int cr_trap(struct kvm_vcpu *vcpu)
2600{
2601	struct vcpu_svm *svm = to_svm(vcpu);
2602	unsigned long old_value, new_value;
2603	unsigned int cr;
2604	int ret = 0;
2605
2606	new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2607
2608	cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2609	switch (cr) {
2610	case 0:
2611		old_value = kvm_read_cr0(vcpu);
2612		svm_set_cr0(vcpu, new_value);
2613
2614		kvm_post_set_cr0(vcpu, old_value, new_value);
2615		break;
2616	case 4:
2617		old_value = kvm_read_cr4(vcpu);
2618		svm_set_cr4(vcpu, new_value);
2619
2620		kvm_post_set_cr4(vcpu, old_value, new_value);
2621		break;
2622	case 8:
2623		ret = kvm_set_cr8(vcpu, new_value);
2624		break;
2625	default:
2626		WARN(1, "unhandled CR%d write trap", cr);
2627		kvm_queue_exception(vcpu, UD_VECTOR);
2628		return 1;
2629	}
2630
2631	return kvm_complete_insn_gp(vcpu, ret);
2632}
2633
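    /*
     * DR access intercept.  If userspace isn't debugging the guest, DR
     * intercepts are dropped entirely and the instruction is re-executed
     * with the guest owning the debug registers.  Otherwise, with decode
     * assists, EXITINFO1 supplies the GPR operand and the exit code tells
     * us which DR was accessed (write exit codes are 16 above the reads).
     */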
2634static int dr_interception(struct kvm_vcpu *vcpu)
2635{
2636	struct vcpu_svm *svm = to_svm(vcpu);
2637	int reg, dr;
2638	unsigned long val;
2639	int err = 0;
2640
2641	if (vcpu->guest_debug == 0) {
2642		/*
2643		 * No more DR vmexits; force a reload of the debug registers
2644		 * and reenter on this instruction.  The next vmexit will
2645		 * retrieve the full state of the debug registers.
2646		 */
2647		clr_dr_intercepts(svm);
2648		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2649		return 1;
2650	}
2651
2652	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2653		return emulate_on_interception(vcpu);
2654
2655	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2656	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2657	if (dr >= 16) { /* mov to DRn  */
2658		dr -= 16;
2659		val = kvm_register_read(vcpu, reg);
2660		err = kvm_set_dr(vcpu, dr, val);
2661	} else {
2662		kvm_get_dr(vcpu, dr, &val);
2663		kvm_register_write(vcpu, reg, val);
2664	}
2665
2666	return kvm_complete_insn_gp(vcpu, err);
2667}
2668
2669static int cr8_write_interception(struct kvm_vcpu *vcpu)
2670{
2671	int r;
2672
2673	u8 cr8_prev = kvm_get_cr8(vcpu);
2674	/* instruction emulation calls kvm_set_cr8() */
2675	r = cr_interception(vcpu);
2676	if (lapic_in_kernel(vcpu))
2677		return r;
2678	if (cr8_prev <= kvm_get_cr8(vcpu))
2679		return r;
2680	vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2681	return 0;
2682}
2683
2684static int efer_trap(struct kvm_vcpu *vcpu)
2685{
2686	struct msr_data msr_info;
2687	int ret;
2688
2689	/*
2690	 * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2691	 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2692	 * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2693	 * the guest doesn't have X86_FEATURE_SVM.
2694	 */
2695	msr_info.host_initiated = false;
2696	msr_info.index = MSR_EFER;
2697	msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2698	ret = kvm_set_msr_common(vcpu, &msr_info);
2699
2700	return kvm_complete_insn_gp(vcpu, ret);
2701}
2702
2703static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2704{
2705	msr->data = 0;
2706
2707	switch (msr->index) {
2708	case MSR_AMD64_DE_CFG:
2709		if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
2710			msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
2711		break;
2712	default:
2713		return KVM_MSR_RET_INVALID;
2714	}
2715
2716	return 0;
2717}
2718
2719static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2720{
2721	struct vcpu_svm *svm = to_svm(vcpu);
2722
2723	switch (msr_info->index) {
2724	case MSR_AMD64_TSC_RATIO:
2725		if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2726			return 1;
2727		msr_info->data = svm->tsc_ratio_msr;
2728		break;
2729	case MSR_STAR:
2730		msr_info->data = svm->vmcb01.ptr->save.star;
2731		break;
2732#ifdef CONFIG_X86_64
2733	case MSR_LSTAR:
2734		msr_info->data = svm->vmcb01.ptr->save.lstar;
2735		break;
2736	case MSR_CSTAR:
2737		msr_info->data = svm->vmcb01.ptr->save.cstar;
2738		break;
2739	case MSR_KERNEL_GS_BASE:
2740		msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2741		break;
2742	case MSR_SYSCALL_MASK:
2743		msr_info->data = svm->vmcb01.ptr->save.sfmask;
2744		break;
2745#endif
2746	case MSR_IA32_SYSENTER_CS:
2747		msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2748		break;
2749	case MSR_IA32_SYSENTER_EIP:
2750		msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2751		if (guest_cpuid_is_intel(vcpu))
2752			msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2753		break;
2754	case MSR_IA32_SYSENTER_ESP:
2755		msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2756		if (guest_cpuid_is_intel(vcpu))
2757			msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2758		break;
2759	case MSR_TSC_AUX:
2760		msr_info->data = svm->tsc_aux;
2761		break;
2762	case MSR_IA32_DEBUGCTLMSR:
2763	case MSR_IA32_LASTBRANCHFROMIP:
2764	case MSR_IA32_LASTBRANCHTOIP:
2765	case MSR_IA32_LASTINTFROMIP:
2766	case MSR_IA32_LASTINTTOIP:
2767		msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
2768		break;
2769	case MSR_VM_HSAVE_PA:
2770		msr_info->data = svm->nested.hsave_msr;
2771		break;
2772	case MSR_VM_CR:
2773		msr_info->data = svm->nested.vm_cr_msr;
2774		break;
2775	case MSR_IA32_SPEC_CTRL:
2776		if (!msr_info->host_initiated &&
2777		    !guest_has_spec_ctrl_msr(vcpu))
2778			return 1;
2779
2780		if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2781			msr_info->data = svm->vmcb->save.spec_ctrl;
2782		else
2783			msr_info->data = svm->spec_ctrl;
2784		break;
2785	case MSR_AMD64_VIRT_SPEC_CTRL:
2786		if (!msr_info->host_initiated &&
2787		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2788			return 1;
2789
2790		msr_info->data = svm->virt_spec_ctrl;
2791		break;
2792	case MSR_F15H_IC_CFG: {
2793
2794		int family, model;
2795
2796		family = guest_cpuid_family(vcpu);
2797		model  = guest_cpuid_model(vcpu);
2798
2799		if (family < 0 || model < 0)
2800			return kvm_get_msr_common(vcpu, msr_info);
2801
2802		msr_info->data = 0;
2803
2804		if (family == 0x15 &&
2805		    (model >= 0x2 && model < 0x20))
2806			msr_info->data = 0x1E;
2807		}
2808		break;
2809	case MSR_AMD64_DE_CFG:
2810		msr_info->data = svm->msr_decfg;
2811		break;
2812	default:
2813		return kvm_get_msr_common(vcpu, msr_info);
2814	}
2815	return 0;
2816}
2817
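    /*
     * For SEV-ES guests a failed emulated MSR access can't be completed by
     * injecting #GP directly, since the real guest state is encrypted.
     * Instead, the error is reported back through the GHCB by setting
     * SW_EXITINFO1 to 1 and encoding a #GP event in SW_EXITINFO2.
     */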
2818static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2819{
2820	struct vcpu_svm *svm = to_svm(vcpu);
2821	if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2822		return kvm_complete_insn_gp(vcpu, err);
2823
2824	ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2825	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2826				X86_TRAP_GP |
2827				SVM_EVTINJ_TYPE_EXEPT |
2828				SVM_EVTINJ_VALID);
2829	return 1;
2830}
2831
2832static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2833{
2834	struct vcpu_svm *svm = to_svm(vcpu);
2835	int svm_dis, chg_mask;
2836
2837	if (data & ~SVM_VM_CR_VALID_MASK)
2838		return 1;
2839
2840	chg_mask = SVM_VM_CR_VALID_MASK;
2841
2842	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2843		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2844
2845	svm->nested.vm_cr_msr &= ~chg_mask;
2846	svm->nested.vm_cr_msr |= (data & chg_mask);
2847
2848	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2849
2850	/* check for svm_disable while efer.svme is set */
2851	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2852		return 1;
2853
2854	return 0;
2855}
2856
2857static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2858{
2859	struct vcpu_svm *svm = to_svm(vcpu);
2860	int r;
2861
2862	u32 ecx = msr->index;
2863	u64 data = msr->data;
2864	switch (ecx) {
2865	case MSR_AMD64_TSC_RATIO:
2866
2867		if (!svm->tsc_scaling_enabled) {
2868
2869			if (!msr->host_initiated)
2870				return 1;
2871			/*
2872			 * In case TSC scaling is not enabled, always
2873			 * leave this MSR at the default value.
2874			 *
2875			 * Due to a bug in qemu 6.2.0, it tries to set
2876			 * this MSR to 0 when TSC scaling is not enabled.
2877			 * Ignore that value as well.
2878			 */
2879			if (data != 0 && data != svm->tsc_ratio_msr)
2880				return 1;
2881			break;
2882		}
2883
2884		if (data & SVM_TSC_RATIO_RSVD)
2885			return 1;
2886
2887		svm->tsc_ratio_msr = data;
2888
2889		if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2890			nested_svm_update_tsc_ratio_msr(vcpu);
2891
2892		break;
2893	case MSR_IA32_CR_PAT:
2894		if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2895			return 1;
2896		vcpu->arch.pat = data;
2897		svm->vmcb01.ptr->save.g_pat = data;
2898		if (is_guest_mode(vcpu))
2899			nested_vmcb02_compute_g_pat(svm);
2900		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2901		break;
2902	case MSR_IA32_SPEC_CTRL:
2903		if (!msr->host_initiated &&
2904		    !guest_has_spec_ctrl_msr(vcpu))
2905			return 1;
2906
2907		if (kvm_spec_ctrl_test_value(data))
2908			return 1;
2909
2910		if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2911			svm->vmcb->save.spec_ctrl = data;
2912		else
2913			svm->spec_ctrl = data;
2914		if (!data)
2915			break;
2916
2917		/*
2918		 * For non-nested:
2919		 * When it's written (to non-zero) for the first time, pass
2920		 * it through.
2921		 *
2922		 * For nested:
2923		 * The handling of the MSR bitmap for L2 guests is done in
2924		 * nested_svm_vmrun_msrpm.
2925		 * We update the L1 MSR bit as well since it will end up
2926		 * touching the MSR anyway now.
2927		 */
2928		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2929		break;
2930	case MSR_IA32_PRED_CMD:
2931		if (!msr->host_initiated &&
2932		    !guest_has_pred_cmd_msr(vcpu))
2933			return 1;
2934
2935		if (data & ~PRED_CMD_IBPB)
2936			return 1;
2937		if (!boot_cpu_has(X86_FEATURE_IBPB))
2938			return 1;
2939		if (!data)
2940			break;
2941
2942		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2943		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
2944		break;
2945	case MSR_AMD64_VIRT_SPEC_CTRL:
2946		if (!msr->host_initiated &&
2947		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2948			return 1;
2949
2950		if (data & ~SPEC_CTRL_SSBD)
2951			return 1;
2952
2953		svm->virt_spec_ctrl = data;
2954		break;
2955	case MSR_STAR:
2956		svm->vmcb01.ptr->save.star = data;
2957		break;
2958#ifdef CONFIG_X86_64
2959	case MSR_LSTAR:
2960		svm->vmcb01.ptr->save.lstar = data;
2961		break;
2962	case MSR_CSTAR:
2963		svm->vmcb01.ptr->save.cstar = data;
2964		break;
2965	case MSR_KERNEL_GS_BASE:
2966		svm->vmcb01.ptr->save.kernel_gs_base = data;
2967		break;
2968	case MSR_SYSCALL_MASK:
2969		svm->vmcb01.ptr->save.sfmask = data;
2970		break;
2971#endif
2972	case MSR_IA32_SYSENTER_CS:
2973		svm->vmcb01.ptr->save.sysenter_cs = data;
2974		break;
2975	case MSR_IA32_SYSENTER_EIP:
2976		svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
2977		/*
2978		 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
2979		 * when we spoof an Intel vendor ID (for cross vendor migration).
2980		 * In this case we use this intercept to track the high
2981		 * 32-bit part of these MSRs to support Intel's
2982		 * implementation of SYSENTER/SYSEXIT.
2983		 */
2984		svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2985		break;
2986	case MSR_IA32_SYSENTER_ESP:
2987		svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
2988		svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2989		break;
2990	case MSR_TSC_AUX:
2991		/*
2992		 * TSC_AUX is usually changed only during boot and never read
2993		 * directly.  Intercept TSC_AUX instead of exposing it to the
2994		 * guest via direct_access_msrs, and switch it via user return.
2995		 */
2996		preempt_disable();
2997		r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
2998		preempt_enable();
2999		if (r)
3000			return 1;
3001
3002		svm->tsc_aux = data;
3003		break;
3004	case MSR_IA32_DEBUGCTLMSR:
3005		if (!lbrv) {
3006			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3007				    __func__, data);
3008			break;
3009		}
3010		if (data & DEBUGCTL_RESERVED_BITS)
3011			return 1;
3012
3013		if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
3014			svm->vmcb->save.dbgctl = data;
3015		else
3016			svm->vmcb01.ptr->save.dbgctl = data;
3017
3018		svm_update_lbrv(vcpu);
3019
3020		break;
3021	case MSR_VM_HSAVE_PA:
3022		/*
3023		 * Old kernels did not validate the value written to
3024		 * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
3025		 * value to allow live migrating buggy or malicious guests
3026		 * originating from those kernels.
3027		 */
3028		if (!msr->host_initiated && !page_address_valid(vcpu, data))
3029			return 1;
3030
3031		svm->nested.hsave_msr = data & PAGE_MASK;
3032		break;
3033	case MSR_VM_CR:
3034		return svm_set_vm_cr(vcpu, data);
3035	case MSR_VM_IGNNE:
3036		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3037		break;
3038	case MSR_AMD64_DE_CFG: {
3039		struct kvm_msr_entry msr_entry;
3040
3041		msr_entry.index = msr->index;
3042		if (svm_get_msr_feature(&msr_entry))
3043			return 1;
3044
3045		/* Check the supported bits */
3046		if (data & ~msr_entry.data)
3047			return 1;
3048
3049		/* Don't allow the guest to change a bit, #GP */
3050		if (!msr->host_initiated && (data ^ msr_entry.data))
3051			return 1;
3052
3053		svm->msr_decfg = data;
3054		break;
3055	}
3056	default:
3057		return kvm_set_msr_common(vcpu, msr);
3058	}
3059	return 0;
3060}
3061
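    /* For MSR intercepts, EXITINFO1 is 1 for WRMSR and 0 for RDMSR. */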
3062static int msr_interception(struct kvm_vcpu *vcpu)
3063{
3064	if (to_svm(vcpu)->vmcb->control.exit_info_1)
3065		return kvm_emulate_wrmsr(vcpu);
3066	else
3067		return kvm_emulate_rdmsr(vcpu);
3068}
3069
3070static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3071{
3072	kvm_make_request(KVM_REQ_EVENT, vcpu);
3073	svm_clear_vintr(to_svm(vcpu));
3074
3075	/*
3076	 * If not running nested, then for AVIC the only reason to end up here
3077	 * is ExtINTs: AVIC was temporarily disabled in order to request the
3078	 * IRQ window, so re-enable it now.
3079	 *
3080	 * If running nested, still remove the VM wide AVIC inhibit to support
3081	 * the case in which the interrupt window was requested while the vCPU
3082	 * was not running nested.
3083	 *
3084	 * vCPUs that are still running nested keep AVIC inhibited due to the
3085	 * per-vCPU AVIC inhibition.
3086	 */
3087	kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3088
3089	++vcpu->stat.irq_window_exits;
3090	return 1;
3091}
3092
3093static int pause_interception(struct kvm_vcpu *vcpu)
3094{
3095	bool in_kernel;
3096	/*
3097	 * CPL is not made available for an SEV-ES guest, therefore
3098	 * vcpu->arch.preempted_in_kernel can never be true.  Just
3099	 * set in_kernel to false as well.
3100	 */
3101	in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3102
3103	grow_ple_window(vcpu);
3104
3105	kvm_vcpu_on_spin(vcpu, in_kernel);
3106	return kvm_skip_emulated_instruction(vcpu);
3107}
3108
3109static int invpcid_interception(struct kvm_vcpu *vcpu)
3110{
3111	struct vcpu_svm *svm = to_svm(vcpu);
3112	unsigned long type;
3113	gva_t gva;
3114
3115	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3116		kvm_queue_exception(vcpu, UD_VECTOR);
3117		return 1;
3118	}
3119
3120	/*
3121	 * For an INVPCID intercept:
3122	 * EXITINFO1 provides the linear address of the memory operand.
3123	 * EXITINFO2 provides the contents of the register operand.
3124	 */
3125	type = svm->vmcb->control.exit_info_2;
3126	gva = svm->vmcb->control.exit_info_1;
3127
3128	return kvm_handle_invpcid(vcpu, type, gva);
3129}
3130
3131static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3132	[SVM_EXIT_READ_CR0]			= cr_interception,
3133	[SVM_EXIT_READ_CR3]			= cr_interception,
3134	[SVM_EXIT_READ_CR4]			= cr_interception,
3135	[SVM_EXIT_READ_CR8]			= cr_interception,
3136	[SVM_EXIT_CR0_SEL_WRITE]		= cr_interception,
3137	[SVM_EXIT_WRITE_CR0]			= cr_interception,
3138	[SVM_EXIT_WRITE_CR3]			= cr_interception,
3139	[SVM_EXIT_WRITE_CR4]			= cr_interception,
3140	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
3141	[SVM_EXIT_READ_DR0]			= dr_interception,
3142	[SVM_EXIT_READ_DR1]			= dr_interception,
3143	[SVM_EXIT_READ_DR2]			= dr_interception,
3144	[SVM_EXIT_READ_DR3]			= dr_interception,
3145	[SVM_EXIT_READ_DR4]			= dr_interception,
3146	[SVM_EXIT_READ_DR5]			= dr_interception,
3147	[SVM_EXIT_READ_DR6]			= dr_interception,
3148	[SVM_EXIT_READ_DR7]			= dr_interception,
3149	[SVM_EXIT_WRITE_DR0]			= dr_interception,
3150	[SVM_EXIT_WRITE_DR1]			= dr_interception,
3151	[SVM_EXIT_WRITE_DR2]			= dr_interception,
3152	[SVM_EXIT_WRITE_DR3]			= dr_interception,
3153	[SVM_EXIT_WRITE_DR4]			= dr_interception,
3154	[SVM_EXIT_WRITE_DR5]			= dr_interception,
3155	[SVM_EXIT_WRITE_DR6]			= dr_interception,
3156	[SVM_EXIT_WRITE_DR7]			= dr_interception,
3157	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
3158	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
3159	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
3160	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
3161	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
3162	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
3163	[SVM_EXIT_EXCP_BASE + GP_VECTOR]	= gp_interception,
3164	[SVM_EXIT_INTR]				= intr_interception,
3165	[SVM_EXIT_NMI]				= nmi_interception,
3166	[SVM_EXIT_SMI]				= smi_interception,
3167	[SVM_EXIT_VINTR]			= interrupt_window_interception,
3168	[SVM_EXIT_RDPMC]			= kvm_emulate_rdpmc,
3169	[SVM_EXIT_CPUID]			= kvm_emulate_cpuid,
3170	[SVM_EXIT_IRET]                         = iret_interception,
3171	[SVM_EXIT_INVD]                         = kvm_emulate_invd,
3172	[SVM_EXIT_PAUSE]			= pause_interception,
3173	[SVM_EXIT_HLT]				= kvm_emulate_halt,
3174	[SVM_EXIT_INVLPG]			= invlpg_interception,
3175	[SVM_EXIT_INVLPGA]			= invlpga_interception,
3176	[SVM_EXIT_IOIO]				= io_interception,
3177	[SVM_EXIT_MSR]				= msr_interception,
3178	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
3179	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
3180	[SVM_EXIT_VMRUN]			= vmrun_interception,
3181	[SVM_EXIT_VMMCALL]			= kvm_emulate_hypercall,
3182	[SVM_EXIT_VMLOAD]			= vmload_interception,
3183	[SVM_EXIT_VMSAVE]			= vmsave_interception,
3184	[SVM_EXIT_STGI]				= stgi_interception,
3185	[SVM_EXIT_CLGI]				= clgi_interception,
3186	[SVM_EXIT_SKINIT]			= skinit_interception,
3187	[SVM_EXIT_RDTSCP]			= kvm_handle_invalid_op,
3188	[SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3189	[SVM_EXIT_MONITOR]			= kvm_emulate_monitor,
3190	[SVM_EXIT_MWAIT]			= kvm_emulate_mwait,
3191	[SVM_EXIT_XSETBV]			= kvm_emulate_xsetbv,
3192	[SVM_EXIT_RDPRU]			= kvm_handle_invalid_op,
3193	[SVM_EXIT_EFER_WRITE_TRAP]		= efer_trap,
3194	[SVM_EXIT_CR0_WRITE_TRAP]		= cr_trap,
3195	[SVM_EXIT_CR4_WRITE_TRAP]		= cr_trap,
3196	[SVM_EXIT_CR8_WRITE_TRAP]		= cr_trap,
3197	[SVM_EXIT_INVPCID]                      = invpcid_interception,
3198	[SVM_EXIT_NPF]				= npf_interception,
3199	[SVM_EXIT_RSM]                          = rsm_interception,
3200	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
3201	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
3202	[SVM_EXIT_VMGEXIT]			= sev_handle_vmgexit,
3203};
3204
3205static void dump_vmcb(struct kvm_vcpu *vcpu)
3206{
3207	struct vcpu_svm *svm = to_svm(vcpu);
3208	struct vmcb_control_area *control = &svm->vmcb->control;
3209	struct vmcb_save_area *save = &svm->vmcb->save;
3210	struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3211
3212	if (!dump_invalid_vmcb) {
3213		pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3214		return;
3215	}
3216
3217	pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3218	       svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3219	pr_err("VMCB Control Area:\n");
3220	pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3221	pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3222	pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3223	pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3224	pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3225	pr_err("%-20s%08x %08x\n", "intercepts:",
3226	       control->intercepts[INTERCEPT_WORD3],
3227	       control->intercepts[INTERCEPT_WORD4]);
3228	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3229	pr_err("%-20s%d\n", "pause filter threshold:",
3230	       control->pause_filter_thresh);
3231	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3232	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3233	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3234	pr_err("%-20s%d\n", "asid:", control->asid);
3235	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3236	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3237	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3238	pr_err("%-20s%08x\n", "int_state:", control->int_state);
3239	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3240	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3241	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3242	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3243	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3244	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3245	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3246	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3247	pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3248	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3249	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3250	pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3251	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3252	pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3253	pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3254	pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3255	pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3256	pr_err("VMCB State Save Area:\n");
3257	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3258	       "es:",
3259	       save->es.selector, save->es.attrib,
3260	       save->es.limit, save->es.base);
3261	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3262	       "cs:",
3263	       save->cs.selector, save->cs.attrib,
3264	       save->cs.limit, save->cs.base);
3265	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3266	       "ss:",
3267	       save->ss.selector, save->ss.attrib,
3268	       save->ss.limit, save->ss.base);
3269	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3270	       "ds:",
3271	       save->ds.selector, save->ds.attrib,
3272	       save->ds.limit, save->ds.base);
3273	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3274	       "fs:",
3275	       save01->fs.selector, save01->fs.attrib,
3276	       save01->fs.limit, save01->fs.base);
3277	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3278	       "gs:",
3279	       save01->gs.selector, save01->gs.attrib,
3280	       save01->gs.limit, save01->gs.base);
3281	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3282	       "gdtr:",
3283	       save->gdtr.selector, save->gdtr.attrib,
3284	       save->gdtr.limit, save->gdtr.base);
3285	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3286	       "ldtr:",
3287	       save01->ldtr.selector, save01->ldtr.attrib,
3288	       save01->ldtr.limit, save01->ldtr.base);
3289	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3290	       "idtr:",
3291	       save->idtr.selector, save->idtr.attrib,
3292	       save->idtr.limit, save->idtr.base);
3293	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3294	       "tr:",
3295	       save01->tr.selector, save01->tr.attrib,
3296	       save01->tr.limit, save01->tr.base);
3297	pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3298	       save->vmpl, save->cpl, save->efer);
3299	pr_err("%-15s %016llx %-13s %016llx\n",
3300	       "cr0:", save->cr0, "cr2:", save->cr2);
3301	pr_err("%-15s %016llx %-13s %016llx\n",
3302	       "cr3:", save->cr3, "cr4:", save->cr4);
3303	pr_err("%-15s %016llx %-13s %016llx\n",
3304	       "dr6:", save->dr6, "dr7:", save->dr7);
3305	pr_err("%-15s %016llx %-13s %016llx\n",
3306	       "rip:", save->rip, "rflags:", save->rflags);
3307	pr_err("%-15s %016llx %-13s %016llx\n",
3308	       "rsp:", save->rsp, "rax:", save->rax);
3309	pr_err("%-15s %016llx %-13s %016llx\n",
3310	       "star:", save01->star, "lstar:", save01->lstar);
3311	pr_err("%-15s %016llx %-13s %016llx\n",
3312	       "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3313	pr_err("%-15s %016llx %-13s %016llx\n",
3314	       "kernel_gs_base:", save01->kernel_gs_base,
3315	       "sysenter_cs:", save01->sysenter_cs);
3316	pr_err("%-15s %016llx %-13s %016llx\n",
3317	       "sysenter_esp:", save01->sysenter_esp,
3318	       "sysenter_eip:", save01->sysenter_eip);
3319	pr_err("%-15s %016llx %-13s %016llx\n",
3320	       "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3321	pr_err("%-15s %016llx %-13s %016llx\n",
3322	       "br_from:", save->br_from, "br_to:", save->br_to);
3323	pr_err("%-15s %016llx %-13s %016llx\n",
3324	       "excp_from:", save->last_excp_from,
3325	       "excp_to:", save->last_excp_to);
3326}
3327
3328static bool svm_check_exit_valid(u64 exit_code)
3329{
3330	return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3331		svm_exit_handlers[exit_code]);
3332}
3333
3334static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3335{
3336	vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3337	dump_vmcb(vcpu);
3338	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3339	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3340	vcpu->run->internal.ndata = 2;
3341	vcpu->run->internal.data[0] = exit_code;
3342	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3343	return 0;
3344}
3345
3346int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3347{
3348	if (!svm_check_exit_valid(exit_code))
3349		return svm_handle_invalid_exit(vcpu, exit_code);
3350
3351#ifdef CONFIG_RETPOLINE
3352	if (exit_code == SVM_EXIT_MSR)
3353		return msr_interception(vcpu);
3354	else if (exit_code == SVM_EXIT_VINTR)
3355		return interrupt_window_interception(vcpu);
3356	else if (exit_code == SVM_EXIT_INTR)
3357		return intr_interception(vcpu);
3358	else if (exit_code == SVM_EXIT_HLT)
3359		return kvm_emulate_halt(vcpu);
3360	else if (exit_code == SVM_EXIT_NPF)
3361		return npf_interception(vcpu);
3362#endif
3363	return svm_exit_handlers[exit_code](vcpu);
3364}
3365
3366static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3367			      u64 *info1, u64 *info2,
3368			      u32 *intr_info, u32 *error_code)
3369{
3370	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3371
3372	*reason = control->exit_code;
3373	*info1 = control->exit_info_1;
3374	*info2 = control->exit_info_2;
3375	*intr_info = control->exit_int_info;
3376	if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3377	    (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3378		*error_code = control->exit_int_info_err;
3379	else
3380		*error_code = 0;
3381}
3382
3383static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3384{
3385	struct vcpu_svm *svm = to_svm(vcpu);
3386	struct kvm_run *kvm_run = vcpu->run;
3387	u32 exit_code = svm->vmcb->control.exit_code;
3388
3389	trace_kvm_exit(vcpu, KVM_ISA_SVM);
3390
3391	/* SEV-ES guests must use the CR write traps to track CR registers. */
3392	if (!sev_es_guest(vcpu->kvm)) {
3393		if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3394			vcpu->arch.cr0 = svm->vmcb->save.cr0;
3395		if (npt_enabled)
3396			vcpu->arch.cr3 = svm->vmcb->save.cr3;
3397	}
3398
3399	if (is_guest_mode(vcpu)) {
3400		int vmexit;
3401
3402		trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3403
3404		vmexit = nested_svm_exit_special(svm);
3405
3406		if (vmexit == NESTED_EXIT_CONTINUE)
3407			vmexit = nested_svm_exit_handled(svm);
3408
3409		if (vmexit == NESTED_EXIT_DONE)
3410			return 1;
3411	}
3412
3413	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3414		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3415		kvm_run->fail_entry.hardware_entry_failure_reason
3416			= svm->vmcb->control.exit_code;
3417		kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3418		dump_vmcb(vcpu);
3419		return 0;
3420	}
3421
3422	if (exit_fastpath != EXIT_FASTPATH_NONE)
3423		return 1;
3424
3425	return svm_invoke_exit_handler(vcpu, exit_code);
3426}
3427
3428static void reload_tss(struct kvm_vcpu *vcpu)
3429{
3430	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3431
3432	sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3433	load_TR_desc();
3434}
3435
3436static void pre_svm_run(struct kvm_vcpu *vcpu)
3437{
3438	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3439	struct vcpu_svm *svm = to_svm(vcpu);
3440
3441	/*
3442	 * If the previous vmrun of the vmcb occurred on a different physical
3443	 * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3444	 * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3445	 */
3446	if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3447		svm->current_vmcb->asid_generation = 0;
3448		vmcb_mark_all_dirty(svm->vmcb);
3449		svm->current_vmcb->cpu = vcpu->cpu;
3450	}
3451
3452	if (sev_guest(vcpu->kvm))
3453		return pre_sev_run(svm, vcpu->cpu);
3454
3455	/* FIXME: handle wraparound of asid_generation */
3456	if (svm->current_vmcb->asid_generation != sd->asid_generation)
3457		new_asid(svm, sd);
3458}
3459
3460static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3461{
3462	struct vcpu_svm *svm = to_svm(vcpu);
3463
3464	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3465
3466	if (svm->nmi_l1_to_l2)
3467		return;
3468
3469	vcpu->arch.hflags |= HF_NMI_MASK;
3470	if (!sev_es_guest(vcpu->kvm))
3471		svm_set_intercept(svm, INTERCEPT_IRET);
3472	++vcpu->stat.nmi_injections;
3473}
3474
3475static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3476{
3477	struct vcpu_svm *svm = to_svm(vcpu);
3478	u32 type;
3479
3480	if (vcpu->arch.interrupt.soft) {
3481		if (svm_update_soft_interrupt_rip(vcpu))
3482			return;
3483
3484		type = SVM_EVTINJ_TYPE_SOFT;
3485	} else {
3486		type = SVM_EVTINJ_TYPE_INTR;
3487	}
3488
3489	trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3490			   vcpu->arch.interrupt.soft, reinjected);
3491	++vcpu->stat.irq_injections;
3492
3493	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3494				       SVM_EVTINJ_VALID | type;
3495}
3496
3497void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3498				     int trig_mode, int vector)
3499{
3500	/*
3501	 * apic->apicv_active must be read after vcpu->mode.
3502	 * Pairs with smp_store_release in vcpu_enter_guest.
3503	 */
3504	bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3505
3506	/* Note, this is called iff the local APIC is in-kernel. */
3507	if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3508		/* Process the interrupt via kvm_check_and_inject_events(). */
3509		kvm_make_request(KVM_REQ_EVENT, vcpu);
3510		kvm_vcpu_kick(vcpu);
3511		return;
3512	}
3513
3514	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3515	if (in_guest_mode) {
3516		/*
3517		 * Signal the doorbell to tell hardware to inject the IRQ.  If
3518		 * the vCPU exits the guest before the doorbell chimes, hardware
3519		 * will automatically process AVIC interrupts at the next VMRUN.
3520		 */
3521		avic_ring_doorbell(vcpu);
3522	} else {
3523		/*
3524		 * Wake the vCPU if it was blocking.  KVM will then detect the
3525		 * pending IRQ when checking if the vCPU has a wake event.
3526		 */
3527		kvm_vcpu_wake_up(vcpu);
3528	}
3529}
3530
3531static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3532				  int trig_mode, int vector)
3533{
3534	kvm_lapic_set_irr(vector, apic);
3535
3536	/*
3537	 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3538	 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3539	 * the read of guest_mode.  This guarantees that either VMRUN will see
3540	 * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3541	 * will signal the doorbell if the CPU has already entered the guest.
3542	 */
3543	smp_mb__after_atomic();
3544	svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3545}
3546
3547static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3548{
3549	struct vcpu_svm *svm = to_svm(vcpu);
3550
3551	/*
3552	 * SEV-ES guests must always keep the CR intercepts cleared. CR
3553	 * tracking is done using the CR write traps.
3554	 */
3555	if (sev_es_guest(vcpu->kvm))
3556		return;
3557
3558	if (nested_svm_virtualize_tpr(vcpu))
3559		return;
3560
3561	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3562
3563	if (irr == -1)
3564		return;
3565
3566	if (tpr >= irr)
3567		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3568}
3569
3570bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3571{
3572	struct vcpu_svm *svm = to_svm(vcpu);
3573	struct vmcb *vmcb = svm->vmcb;
3574	bool ret;
3575
3576	if (!gif_set(svm))
3577		return true;
3578
3579	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3580		return false;
3581
3582	ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
3583	      (vcpu->arch.hflags & HF_NMI_MASK);
3584
3585	return ret;
3586}
3587
3588static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3589{
3590	struct vcpu_svm *svm = to_svm(vcpu);
3591	if (svm->nested.nested_run_pending)
3592		return -EBUSY;
3593
3594	if (svm_nmi_blocked(vcpu))
3595		return 0;
3596
3597	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3598	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3599		return -EBUSY;
3600	return 1;
3601}
3602
3603static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3604{
3605	return !!(vcpu->arch.hflags & HF_NMI_MASK);
3606}
3607
3608static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3609{
3610	struct vcpu_svm *svm = to_svm(vcpu);
3611
3612	if (masked) {
3613		vcpu->arch.hflags |= HF_NMI_MASK;
3614		if (!sev_es_guest(vcpu->kvm))
3615			svm_set_intercept(svm, INTERCEPT_IRET);
3616	} else {
3617		vcpu->arch.hflags &= ~HF_NMI_MASK;
3618		if (!sev_es_guest(vcpu->kvm))
3619			svm_clr_intercept(svm, INTERCEPT_IRET);
3620	}
3621}
3622
3623bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3624{
3625	struct vcpu_svm *svm = to_svm(vcpu);
3626	struct vmcb *vmcb = svm->vmcb;
3627
3628	if (!gif_set(svm))
3629		return true;
3630
3631	if (is_guest_mode(vcpu)) {
3632		/* As long as interrupts are being delivered...  */
3633		if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3634		    ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3635		    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3636			return true;
3637
3638		/* ... vmexits aren't blocked by the interrupt shadow  */
3639		if (nested_exit_on_intr(svm))
3640			return false;
3641	} else {
3642		if (!svm_get_if_flag(vcpu))
3643			return true;
3644	}
3645
3646	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3647}
3648
3649static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3650{
3651	struct vcpu_svm *svm = to_svm(vcpu);
3652
3653	if (svm->nested.nested_run_pending)
3654		return -EBUSY;
3655
3656	if (svm_interrupt_blocked(vcpu))
3657		return 0;
3658
3659	/*
3660	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3661	 * e.g. if the IRQ arrived asynchronously after checking nested events.
3662	 */
3663	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3664		return -EBUSY;
3665
3666	return 1;
3667}
3668
3669static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3670{
3671	struct vcpu_svm *svm = to_svm(vcpu);
3672
3673	/*
3674	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3675	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3676	 * get that intercept, this function will be called again though and
3677	 * we'll get the vintr intercept. However, if the vGIF feature is
3678	 * enabled, the STGI interception will not occur. Enable the irq
3679	 * window under the assumption that the hardware will set the GIF.
3680	 */
3681	if (vgif || gif_set(svm)) {
3682		/*
3683		 * IRQ window is not needed when AVIC is enabled,
3684		 * unless we have pending ExtINT since it cannot be injected
3685		 * via AVIC. In such case, KVM needs to temporarily disable AVIC,
3686		 * and fallback to injecting IRQ via V_IRQ.
3687		 *
3688		 * If running nested, AVIC is already locally inhibited
3689		 * on this vCPU, therefore there is no need to request
3690		 * the VM wide AVIC inhibition.
3691		 */
3692		if (!is_guest_mode(vcpu))
3693			kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3694
3695		svm_set_vintr(svm);
3696	}
3697}
3698
3699static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3700{
3701	struct vcpu_svm *svm = to_svm(vcpu);
3702
3703	if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
3704		return; /* IRET will cause a vm exit */
3705
3706	if (!gif_set(svm)) {
3707		if (vgif)
3708			svm_set_intercept(svm, INTERCEPT_STGI);
3709		return; /* STGI will cause a vm exit */
3710	}
3711
3712	/*
3713	 * Something prevents NMI from being injected. Single step over the
3714	 * possible problem (IRET or exception injection or interrupt shadow).
3715	 */
3716	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3717	svm->nmi_singlestep = true;
3718	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3719}
3720
3721static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3722{
3723	struct vcpu_svm *svm = to_svm(vcpu);
3724
3725	/*
3726	 * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
3727	 * A TLB flush for the current ASID flushes both "host" and "guest" TLB
3728	 * entries, and thus is a superset of Hyper-V's fine grained flushing.
3729	 */
3730	kvm_hv_vcpu_purge_flush_tlb(vcpu);
3731
3732	/*
3733	 * Flush only the current ASID even if the TLB flush was invoked via
3734	 * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3735	 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3736	 * unconditionally does a TLB flush on both nested VM-Enter and nested
3737	 * VM-Exit (via kvm_mmu_reset_context()).
3738	 */
3739	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3740		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3741	else
3742		svm->current_vmcb->asid_generation--;
3743}
3744
3745static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3746{
3747	struct vcpu_svm *svm = to_svm(vcpu);
3748
3749	invlpga(gva, svm->vmcb->control.asid);
3750}
3751
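/*
 * Propagate the guest's V_TPR (updated by hardware on non-intercepted CR8
 * writes) back into the emulated local APIC's TPR after a VM-Exit.
 */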
3752static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3753{
3754	struct vcpu_svm *svm = to_svm(vcpu);
3755
3756	if (nested_svm_virtualize_tpr(vcpu))
3757		return;
3758
3759	if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3760		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3761		kvm_set_cr8(vcpu, cr8);
3762	}
3763}
3764
3765static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3766{
3767	struct vcpu_svm *svm = to_svm(vcpu);
3768	u64 cr8;
3769
3770	if (nested_svm_virtualize_tpr(vcpu) ||
3771	    kvm_vcpu_apicv_active(vcpu))
3772		return;
3773
3774	cr8 = kvm_get_cr8(vcpu);
3775	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3776	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3777}
3778
3779static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3780					int type)
3781{
3782	bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3783	bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3784	struct vcpu_svm *svm = to_svm(vcpu);
3785
3786	/*
3787	 * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3788	 * associated with the original soft exception/interrupt.  next_rip is
3789	 * cleared on all exits that can occur while vectoring an event, so KVM
3790	 * needs to manually set next_rip for re-injection.  Unlike the !nrips
3791	 * case below, this needs to be done if and only if KVM is re-injecting
3792	 * the same event, i.e. if the event is a soft exception/interrupt,
3793	 * otherwise next_rip is unused on VMRUN.
3794	 */
3795	if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3796	    kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3797		svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3798	/*
3799	 * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3800	 * injecting the soft exception/interrupt.  That advancement needs to
3801	 * be unwound if vectoring didn't complete.  Note, the new event may
3802	 * not be the injected event, e.g. if KVM injected an INTn, the INTn
3803	 * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3804	 * be the reported vectored event, but RIP still needs to be unwound.
3805	 */
3806	else if (!nrips && (is_soft || is_exception) &&
3807		 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3808		kvm_rip_write(vcpu, svm->soft_int_old_rip);
3809}
3810
3811static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3812{
3813	struct vcpu_svm *svm = to_svm(vcpu);
3814	u8 vector;
3815	int type;
3816	u32 exitintinfo = svm->vmcb->control.exit_int_info;
3817	bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
3818	bool soft_int_injected = svm->soft_int_injected;
3819
3820	svm->nmi_l1_to_l2 = false;
3821	svm->soft_int_injected = false;
3822
3823	/*
3824	 * If we've made progress since setting HF_IRET_MASK, we've
3825	 * executed an IRET and can allow NMI injection.
3826	 */
3827	if ((vcpu->arch.hflags & HF_IRET_MASK) &&
3828	    (sev_es_guest(vcpu->kvm) ||
3829	     kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3830		vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3831		kvm_make_request(KVM_REQ_EVENT, vcpu);
3832	}
3833
3834	vcpu->arch.nmi_injected = false;
3835	kvm_clear_exception_queue(vcpu);
3836	kvm_clear_interrupt_queue(vcpu);
3837
3838	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3839		return;
3840
3841	kvm_make_request(KVM_REQ_EVENT, vcpu);
3842
3843	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3844	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3845
3846	if (soft_int_injected)
3847		svm_complete_soft_interrupt(vcpu, vector, type);
3848
3849	switch (type) {
3850	case SVM_EXITINTINFO_TYPE_NMI:
3851		vcpu->arch.nmi_injected = true;
3852		svm->nmi_l1_to_l2 = nmi_l1_to_l2;
3853		break;
3854	case SVM_EXITINTINFO_TYPE_EXEPT:
3855		/*
3856		 * Never re-inject a #VC exception.
3857		 */
3858		if (vector == X86_TRAP_VC)
3859			break;
3860
3861		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3862			u32 err = svm->vmcb->control.exit_int_info_err;
3863			kvm_requeue_exception_e(vcpu, vector, err);
3864
3865		} else
3866			kvm_requeue_exception(vcpu, vector);
3867		break;
3868	case SVM_EXITINTINFO_TYPE_INTR:
3869		kvm_queue_interrupt(vcpu, vector, false);
3870		break;
3871	case SVM_EXITINTINFO_TYPE_SOFT:
3872		kvm_queue_interrupt(vcpu, vector, true);
3873		break;
3874	default:
3875		break;
3876	}
3877
3878}
3879
3880static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3881{
3882	struct vcpu_svm *svm = to_svm(vcpu);
3883	struct vmcb_control_area *control = &svm->vmcb->control;
3884
3885	control->exit_int_info = control->event_inj;
3886	control->exit_int_info_err = control->event_inj_err;
3887	control->event_inj = 0;
3888	svm_complete_interrupts(vcpu);
3889}
3890
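/* Nothing to do before entering the guest; a positive return lets the common
 * run loop proceed. */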
3891static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
3892{
3893	return 1;
3894}
3895
3896static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
3897{
3898	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3899
3900	/*
3901	 * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
3902	 * can't read guest memory (dereference memslots) to decode the WRMSR.
3903	 */
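	/* exit_info_1 is non-zero only for WRMSR; RDMSR never takes the fastpath. */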
3904	if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
3905	    nrips && control->next_rip)
3906		return handle_fastpath_set_msr_irqoff(vcpu);
3907
3908	return EXIT_FASTPATH_NONE;
3909}
3910
3911static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
3912{
3913	struct vcpu_svm *svm = to_svm(vcpu);
3914
3915	guest_state_enter_irqoff();
3916
3917	if (sev_es_guest(vcpu->kvm))
3918		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
3919	else
3920		__svm_vcpu_run(svm, spec_ctrl_intercepted);
3921
3922	guest_state_exit_irqoff();
3923}
3924
3925static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
3926{
3927	struct vcpu_svm *svm = to_svm(vcpu);
3928	bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
3929
3930	trace_kvm_entry(vcpu);
3931
3932	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3933	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3934	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3935
3936	/*
3937	 * Disable singlestep if we're injecting an interrupt/exception.
3938	 * We don't want our modified rflags to be pushed on the stack where
3939	 * we might not be able to easily reset them if we disabled NMI
3940	 * singlestep later.
3941	 */
3942	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3943		/*
3944		 * Event injection happens before external interrupts cause a
3945		 * vmexit and interrupts are disabled here, so smp_send_reschedule
3946		 * is enough to force an immediate vmexit.
3947		 */
3948		disable_nmi_singlestep(svm);
3949		smp_send_reschedule(vcpu->cpu);
3950	}
3951
3952	pre_svm_run(vcpu);
3953
3954	sync_lapic_to_cr8(vcpu);
3955
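	/* pre_svm_run() may have assigned a new ASID; propagate it to the VMCB. */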
3956	if (unlikely(svm->asid != svm->vmcb->control.asid)) {
3957		svm->vmcb->control.asid = svm->asid;
3958		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3959	}
3960	svm->vmcb->save.cr2 = vcpu->arch.cr2;
3961
3962	svm_hv_update_vp_id(svm->vmcb, vcpu);
3963
3964	/*
3965	 * Run with all-zero DR6 unless needed, so that we can get the exact cause
3966	 * of a #DB.
3967	 */
3968	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
3969		svm_set_dr6(svm, vcpu->arch.dr6);
3970	else
3971		svm_set_dr6(svm, DR6_ACTIVE_LOW);
3972
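	/*
	 * Clear GIF so that interrupts, NMIs and SMIs are held pending across
	 * the world switch; they are delivered after the stgi() below.
	 */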
3973	clgi();
3974	kvm_load_guest_xsave_state(vcpu);
3975
3976	kvm_wait_lapic_expire(vcpu);
3977
3978	/*
3979	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3980	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
3981	 * is no need to worry about the conditional branch over the wrmsr
3982	 * being speculatively taken.
3983	 */
3984	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3985		x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
3986
3987	svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
3988
3989	if (!sev_es_guest(vcpu->kvm))
3990		reload_tss(vcpu);
3991
3992	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3993		x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
3994
3995	if (!sev_es_guest(vcpu->kvm)) {
3996		vcpu->arch.cr2 = svm->vmcb->save.cr2;
3997		vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3998		vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3999		vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4000	}
4001	vcpu->arch.regs_dirty = 0;
4002
4003	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4004		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
4005
4006	kvm_load_host_xsave_state(vcpu);
4007	stgi();
4008
4009	/* Any pending NMI will happen here */
4010
4011	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4012		kvm_after_interrupt(vcpu);
4013
4014	sync_cr8_to_lapic(vcpu);
4015
4016	svm->next_rip = 0;
4017	if (is_guest_mode(vcpu)) {
4018		nested_sync_control_from_vmcb02(svm);
4019
4020		/* Track VMRUNs that have made it past consistency checking */
4021		if (svm->nested.nested_run_pending &&
4022		    svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4023			++vcpu->stat.nested_run;
4024
4025		svm->nested.nested_run_pending = 0;
4026	}
4027
4028	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4029	vmcb_mark_all_clean(svm->vmcb);
4030
4031	/* if exit due to PF check for async PF */
4032	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4033		vcpu->arch.apf.host_apf_flags =
4034			kvm_read_and_reset_apf_flags();
4035
4036	vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4037
4038	/*
4039	 * We need to handle MC intercepts here before the vcpu has a chance to
4040	 * change the physical cpu
4041	 */
4042	if (unlikely(svm->vmcb->control.exit_code ==
4043		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
4044		svm_handle_mce(vcpu);
4045
4046	svm_complete_interrupts(vcpu);
4047
4048	if (is_guest_mode(vcpu))
4049		return EXIT_FASTPATH_NONE;
4050
4051	return svm_exit_handlers_fastpath(vcpu);
4052}
4053
4054static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4055			     int root_level)
4056{
4057	struct vcpu_svm *svm = to_svm(vcpu);
4058	unsigned long cr3;
4059
4060	if (npt_enabled) {
4061		svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4062		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4063
4064		hv_track_root_tdp(vcpu, root_hpa);
4065
4066		cr3 = vcpu->arch.cr3;
4067	} else if (root_level >= PT64_ROOT_4LEVEL) {
4068		cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4069	} else {
4070		/* PCID in the guest should be impossible with a 32-bit MMU. */
4071		WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4072		cr3 = root_hpa;
4073	}
4074
4075	svm->vmcb->save.cr3 = cr3;
4076	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4077}
4078
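/* Check whether the BIOS has disabled SVM via the VM_CR MSR's SVMDIS bit. */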
4079static int is_disabled(void)
4080{
4081	u64 vm_cr;
4082
4083	rdmsrl(MSR_VM_CR, vm_cr);
4084	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4085		return 1;
4086
4087	return 0;
4088}
4089
4090static void
4091svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4092{
4093	/*
4094	 * Patch in the VMMCALL instruction:
4095	 */
4096	hypercall[0] = 0x0f;
4097	hypercall[1] = 0x01;
4098	hypercall[2] = 0xd9;
4099}
4100
4101static int __init svm_check_processor_compat(void)
4102{
4103	return 0;
4104}
4105
4106/*
4107 * The kvm parameter can be NULL (module initialization, or invocation before
4108 * VM creation). Be sure to check the kvm parameter before using it.
4109 */
4110static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4111{
4112	switch (index) {
4113	case MSR_IA32_MCG_EXT_CTL:
4114	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4115		return false;
4116	case MSR_IA32_SMBASE:
4117		if (!IS_ENABLED(CONFIG_KVM_SMM))
4118			return false;
4119		/* SEV-ES guests do not support SMM, so report false */
4120		if (kvm && sev_es_guest(kvm))
4121			return false;
4122		break;
4123	default:
4124		break;
4125	}
4126
4127	return true;
4128}
4129
4130static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4131{
4132	struct vcpu_svm *svm = to_svm(vcpu);
4133	struct kvm_cpuid_entry2 *best;
4134
4135	vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4136				    boot_cpu_has(X86_FEATURE_XSAVE) &&
4137				    boot_cpu_has(X86_FEATURE_XSAVES);
4138
4139	/* Update nrips enabled cache */
4140	svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
4141			     guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
4142
4143	svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
4144	svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
4145
4146	svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4147
4148	svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
4149			guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
4150
4151	svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
4152			guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
4153
4154	svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
4155
4156	svm_recalc_instruction_intercepts(vcpu, svm);
4157
4158	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
4159	if (sev_guest(vcpu->kvm)) {
4160		best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4161		if (best)
4162			vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4163	}
4164
4165	init_vmcb_after_set_cpuid(vcpu);
4166}
4167
4168static bool svm_has_wbinvd_exit(void)
4169{
4170	return true;
4171}
4172
4173#define PRE_EX(exit)  { .exit_code = (exit), \
4174			.stage = X86_ICPT_PRE_EXCEPT, }
4175#define POST_EX(exit) { .exit_code = (exit), \
4176			.stage = X86_ICPT_POST_EXCEPT, }
4177#define POST_MEM(exit) { .exit_code = (exit), \
4178			.stage = X86_ICPT_POST_MEMACCESS, }
4179
4180static const struct __x86_intercept {
4181	u32 exit_code;
4182	enum x86_intercept_stage stage;
4183} x86_intercept_map[] = {
4184	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
4185	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
4186	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
4187	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
4188	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
4189	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
4190	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
4191	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
4192	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
4193	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
4194	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
4195	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
4196	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
4197	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
4198	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
4199	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
4200	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
4201	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
4202	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
4203	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
4204	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
4205	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
4206	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
4207	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
4208	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
4209	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
4210	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
4211	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
4212	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
4213	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
4214	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
4215	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
4216	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
4217	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
4218	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
4219	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
4220	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
4221	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
4222	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
4223	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
4224	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
4225	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
4226	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
4227	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
4228	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
4229	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
4230	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
4231};
4232
4233#undef PRE_EX
4234#undef POST_EX
4235#undef POST_MEM
4236
4237static int svm_check_intercept(struct kvm_vcpu *vcpu,
4238			       struct x86_instruction_info *info,
4239			       enum x86_intercept_stage stage,
4240			       struct x86_exception *exception)
4241{
4242	struct vcpu_svm *svm = to_svm(vcpu);
4243	int vmexit, ret = X86EMUL_CONTINUE;
4244	struct __x86_intercept icpt_info;
4245	struct vmcb *vmcb = svm->vmcb;
4246
4247	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4248		goto out;
4249
4250	icpt_info = x86_intercept_map[info->intercept];
4251
4252	if (stage != icpt_info.stage)
4253		goto out;
4254
4255	switch (icpt_info.exit_code) {
4256	case SVM_EXIT_READ_CR0:
4257		if (info->intercept == x86_intercept_cr_read)
4258			icpt_info.exit_code += info->modrm_reg;
4259		break;
4260	case SVM_EXIT_WRITE_CR0: {
4261		unsigned long cr0, val;
4262
4263		if (info->intercept == x86_intercept_cr_write)
4264			icpt_info.exit_code += info->modrm_reg;
4265
4266		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4267		    info->intercept == x86_intercept_clts)
4268			break;
4269
4270		if (!(vmcb12_is_intercept(&svm->nested.ctl,
4271					INTERCEPT_SELECTIVE_CR0)))
4272			break;
4273
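		/*
		 * The selective CR0 write intercept fires only when bits other
		 * than TS and MP change, so compare the old and new CR0 values
		 * with SVM_CR0_SELECTIVE_MASK (TS and MP) masked off.
		 */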
4274		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4275		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4276
4277		if (info->intercept == x86_intercept_lmsw) {
4278			cr0 &= 0xfUL;
4279			val &= 0xfUL;
4280			/* lmsw can't clear PE - catch this here */
4281			if (cr0 & X86_CR0_PE)
4282				val |= X86_CR0_PE;
4283		}
4284
4285		if (cr0 ^ val)
4286			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4287
4288		break;
4289	}
4290	case SVM_EXIT_READ_DR0:
4291	case SVM_EXIT_WRITE_DR0:
4292		icpt_info.exit_code += info->modrm_reg;
4293		break;
4294	case SVM_EXIT_MSR:
4295		if (info->intercept == x86_intercept_wrmsr)
4296			vmcb->control.exit_info_1 = 1;
4297		else
4298			vmcb->control.exit_info_1 = 0;
4299		break;
4300	case SVM_EXIT_PAUSE:
4301		/*
4302		 * The emulator reports this intercept for the NOP opcode, but
4303		 * PAUSE is REP NOP, so check for the REP prefix here.
4304		 */
4305		if (info->rep_prefix != REPE_PREFIX)
4306			goto out;
4307		break;
4308	case SVM_EXIT_IOIO: {
4309		u64 exit_info;
4310		u32 bytes;
4311
4312		if (info->intercept == x86_intercept_in ||
4313		    info->intercept == x86_intercept_ins) {
4314			exit_info = ((info->src_val & 0xffff) << 16) |
4315				SVM_IOIO_TYPE_MASK;
4316			bytes = info->dst_bytes;
4317		} else {
4318			exit_info = (info->dst_val & 0xffff) << 16;
4319			bytes = info->src_bytes;
4320		}
4321
4322		if (info->intercept == x86_intercept_outs ||
4323		    info->intercept == x86_intercept_ins)
4324			exit_info |= SVM_IOIO_STR_MASK;
4325
4326		if (info->rep_prefix)
4327			exit_info |= SVM_IOIO_REP_MASK;
4328
4329		bytes = min(bytes, 4u);
4330
4331		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4332
4333		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4334
4335		vmcb->control.exit_info_1 = exit_info;
4336		vmcb->control.exit_info_2 = info->next_rip;
4337
4338		break;
4339	}
4340	default:
4341		break;
4342	}
4343
4344	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4345	if (static_cpu_has(X86_FEATURE_NRIPS))
4346		vmcb->control.next_rip  = info->next_rip;
4347	vmcb->control.exit_code = icpt_info.exit_code;
4348	vmexit = nested_svm_exit_handled(svm);
4349
4350	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4351					   : X86EMUL_CONTINUE;
4352
4353out:
4354	return ret;
4355}
4356
4357static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4358{
4359	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4360		vcpu->arch.at_instruction_boundary = true;
4361}
4362
4363static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4364{
4365	if (!kvm_pause_in_guest(vcpu->kvm))
4366		shrink_ple_window(vcpu);
4367}
4368
4369static void svm_setup_mce(struct kvm_vcpu *vcpu)
4370{
4371	/* [63:9] are reserved. */
4372	vcpu->arch.mcg_cap &= 0x1ff;
4373}
4374
4375#ifdef CONFIG_KVM_SMM
4376bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4377{
4378	struct vcpu_svm *svm = to_svm(vcpu);
4379
4380	/* Per APM Vol.2 15.22.2 "Response to SMI" */
4381	if (!gif_set(svm))
4382		return true;
4383
4384	return is_smm(vcpu);
4385}
4386
4387static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4388{
4389	struct vcpu_svm *svm = to_svm(vcpu);
4390	if (svm->nested.nested_run_pending)
4391		return -EBUSY;
4392
4393	if (svm_smi_blocked(vcpu))
4394		return 0;
4395
4396	/* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4397	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4398		return -EBUSY;
4399
4400	return 1;
4401}
4402
4403static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
4404{
4405	struct vcpu_svm *svm = to_svm(vcpu);
4406	struct kvm_host_map map_save;
4407	int ret;
4408
4409	if (!is_guest_mode(vcpu))
4410		return 0;
4411
4412	/*
4413	 * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
4414	 * responsible for ensuring nested SVM and SMIs are mutually exclusive.
4415	 */
4416
4417	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4418		return 1;
4419
4420	smram->smram64.svm_guest_flag = 1;
4421	smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
4422
4423	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4424	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4425	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4426
4427	ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4428	if (ret)
4429		return ret;
4430
4431	/*
4432	 * KVM uses VMCB01 to store L1 host state while L2 runs but
4433	 * VMCB01 is going to be used during SMM and thus the state will
4434	 * be lost. Temporarily save the non-VMLOAD/VMSAVE state to the host
4435	 * save area pointed to by MSR_VM_HSAVE_PA. The APM guarantees that the
4436	 * format of the area is identical to the guest save area offset
4437	 * by 0x400 (matching the offset of 'struct vmcb_save_area'
4438	 * within 'struct vmcb'). Note: HSAVE area may also be used by
4439	 * L1 hypervisor to save additional host context (e.g. KVM does
4440	 * that, see svm_prepare_switch_to_guest()) which must be
4441	 * preserved.
4442	 */
4443	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4444		return 1;
4445
4446	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4447
4448	svm_copy_vmrun_state(map_save.hva + 0x400,
4449			     &svm->vmcb01.ptr->save);
4450
4451	kvm_vcpu_unmap(vcpu, &map_save, true);
4452	return 0;
4453}
4454
4455static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
4456{
4457	struct vcpu_svm *svm = to_svm(vcpu);
4458	struct kvm_host_map map, map_save;
4459	struct vmcb *vmcb12;
4460	int ret;
4461
4462	const struct kvm_smram_state_64 *smram64 = &smram->smram64;
4463
4464	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4465		return 0;
4466
4467	/* Non-zero if SMI arrived while vCPU was in guest mode. */
4468	if (!smram64->svm_guest_flag)
4469		return 0;
4470
4471	if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4472		return 1;
4473
4474	if (!(smram64->efer & EFER_SVME))
4475		return 1;
4476
4477	if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4478		return 1;
4479
4480	ret = 1;
4481	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4482		goto unmap_map;
4483
4484	if (svm_allocate_nested(svm))
4485		goto unmap_save;
4486
4487	/*
4488	 * Restore L1 host state from L1 HSAVE area as VMCB01 was
4489	 * used during SMM (see svm_enter_smm())
4490	 */
4491
4492	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4493
4494	/*
4495	 * Enter the nested guest now
4496	 */
4497
4498	vmcb_mark_all_dirty(svm->vmcb01.ptr);
4499
4500	vmcb12 = map.hva;
4501	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4502	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4503	ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
4504
4505	if (ret)
4506		goto unmap_save;
4507
4508	svm->nested.nested_run_pending = 1;
4509
4510unmap_save:
4511	kvm_vcpu_unmap(vcpu, &map_save, true);
4512unmap_map:
4513	kvm_vcpu_unmap(vcpu, &map, true);
4514	return ret;
4515}
4516
4517static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4518{
4519	struct vcpu_svm *svm = to_svm(vcpu);
4520
4521	if (!gif_set(svm)) {
4522		if (vgif)
4523			svm_set_intercept(svm, INTERCEPT_STGI);
4524		/* STGI will cause a vm exit */
4525	} else {
4526		/* We must be in SMM; RSM will cause a vmexit anyway.  */
4527	}
4528}
4529#endif
4530
4531static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4532					void *insn, int insn_len)
4533{
4534	bool smep, smap, is_user;
4535	unsigned long cr4;
4536	u64 error_code;
4537
4538	/* Emulation is always possible when KVM has access to all guest state. */
4539	if (!sev_guest(vcpu->kvm))
4540		return true;
4541
4542	/* #UD and #GP should never be intercepted for SEV guests. */
4543	WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4544				  EMULTYPE_TRAP_UD_FORCED |
4545				  EMULTYPE_VMWARE_GP));
4546
4547	/*
4548	 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4549	 * to guest register state.
4550	 */
4551	if (sev_es_guest(vcpu->kvm))
4552		return false;
4553
4554	/*
4555	 * Emulation is possible if the instruction is already decoded, e.g.
4556	 * when completing I/O after returning from userspace.
4557	 */
4558	if (emul_type & EMULTYPE_NO_DECODE)
4559		return true;
4560
4561	/*
4562	 * Emulation is possible for SEV guests if and only if a prefilled
4563	 * buffer containing the bytes of the intercepted instruction is
4564	 * available. SEV guest memory is encrypted with a guest specific key
4565	 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4566	 * decode garbage.
4567	 *
4568	 * Inject #UD if KVM reached this point without an instruction buffer.
4569	 * In practice, this path should never be hit by a well-behaved guest,
4570	 * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4571	 * is still theoretically reachable, e.g. via unaccelerated fault-like
4572	 * AVIC access, and needs to be handled by KVM to avoid putting the
4573	 * guest into an infinite loop.  Injecting #UD is somewhat arbitrary,
4574	 * but it's the least awful option given the lack of insight into the guest.
4575	 */
4576	if (unlikely(!insn)) {
4577		kvm_queue_exception(vcpu, UD_VECTOR);
4578		return false;
4579	}
4580
4581	/*
4582	 * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4583	 * will be empty if the DecodeAssist microcode cannot fetch bytes for
4584	 * the faulting instruction because the code fetch itself faulted, e.g.
4585	 * the guest attempted to fetch from emulated MMIO or a guest page
4586	 * table used to translate CS:RIP resides in emulated MMIO.
4587	 */
4588	if (likely(insn_len))
4589		return true;
4590
4591	/*
4592	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4593	 *
4594	 * Errata:
4595	 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4596	 * possible that CPU microcode implementing DecodeAssist will fail to
4597	 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4598	 * be '0'.  This happens because microcode reads CS:RIP using a _data_
4599	 * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
4600	 * gives up and does not fill the instruction bytes buffer.
4601	 *
4602	 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4603	 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4604	 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4605	 * GuestIntrBytes field of the VMCB.
4606	 *
4607	 * This does _not_ mean that the erratum has been encountered, as the
4608	 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4609	 * #PF, e.g. if the guest attempted to execute from emulated MMIO and
4610	 * encountered a reserved/not-present #PF.
4611	 *
4612	 * To hit the erratum, the following conditions must be true:
4613	 *    1. CR4.SMAP=1 (obviously).
4614	 *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4615	 *       have been hit as the guest would have encountered a SMEP
4616	 *       violation #PF, not a #NPF.
4617	 *    3. The #NPF is not due to a code fetch, in which case failure to
4618	 *       retrieve the instruction bytes is legitimate (see above).
4619	 *
4620	 * In addition, don't apply the erratum workaround if the #NPF occurred
4621	 * while translating guest page tables (see below).
4622	 */
4623	error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4624	if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4625		goto resume_guest;
4626
4627	cr4 = kvm_read_cr4(vcpu);
4628	smep = cr4 & X86_CR4_SMEP;
4629	smap = cr4 & X86_CR4_SMAP;
4630	is_user = svm_get_cpl(vcpu) == 3;
4631	if (smap && (!smep || is_user)) {
4632		pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
4633
4634		/*
4635		 * If the fault occurred in userspace, arbitrarily inject #GP
4636		 * to avoid killing the guest and to hopefully avoid confusing
4637		 * the guest kernel too much, e.g. injecting #PF would not be
4638		 * coherent with respect to the guest's page tables.  Request
4639		 * triple fault if the fault occurred in the kernel as there's
4640		 * no fault that KVM can inject without confusing the guest.
4641		 * In practice, the triple fault is moot as no sane SEV kernel
4642		 * will execute from user memory while also running with SMAP=1.
4643		 */
4644		if (is_user)
4645			kvm_inject_gp(vcpu, 0);
4646		else
4647			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4648	}
4649
4650resume_guest:
4651	/*
4652	 * If the erratum was not hit, simply resume the guest and let it fault
4653	 * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4654	 * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4655	 * userspace will kill the guest, and letting the emulator read garbage
4656	 * will yield random behavior and potentially corrupt the guest.
4657	 *
4658	 * Simply resuming the guest is technically not a violation of the SEV
4659	 * architecture.  AMD's APM states that all code fetches and page table
4660	 * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
4661	 * APM also states that encrypted accesses to MMIO are "ignored", but
4662	 * doesn't explicitly define "ignored", i.e. doing nothing and letting
4663	 * the guest spin is technically "ignoring" the access.
4664	 */
4665	return false;
4666}
4667
4668static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4669{
4670	struct vcpu_svm *svm = to_svm(vcpu);
4671
4672	return !gif_set(svm);
4673}
4674
4675static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4676{
4677	if (!sev_es_guest(vcpu->kvm))
4678		return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4679
4680	sev_vcpu_deliver_sipi_vector(vcpu, vector);
4681}
4682
4683static void svm_vm_destroy(struct kvm *kvm)
4684{
4685	avic_vm_destroy(kvm);
4686	sev_vm_destroy(kvm);
4687}
4688
4689static int svm_vm_init(struct kvm *kvm)
4690{
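	/*
	 * If PAUSE filtering is unusable (count or threshold forced to zero),
	 * let the guest execute PAUSE without any intercepts.
	 */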
4691	if (!pause_filter_count || !pause_filter_thresh)
4692		kvm->arch.pause_in_guest = true;
4693
4694	if (enable_apicv) {
4695		int ret = avic_vm_init(kvm);
4696		if (ret)
4697			return ret;
4698	}
4699
4700	return 0;
4701}
4702
4703static struct kvm_x86_ops svm_x86_ops __initdata = {
4704	.name = "kvm_amd",
4705
4706	.hardware_unsetup = svm_hardware_unsetup,
4707	.hardware_enable = svm_hardware_enable,
4708	.hardware_disable = svm_hardware_disable,
4709	.has_emulated_msr = svm_has_emulated_msr,
4710
4711	.vcpu_create = svm_vcpu_create,
4712	.vcpu_free = svm_vcpu_free,
4713	.vcpu_reset = svm_vcpu_reset,
4714
4715	.vm_size = sizeof(struct kvm_svm),
4716	.vm_init = svm_vm_init,
4717	.vm_destroy = svm_vm_destroy,
4718
4719	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
4720	.vcpu_load = svm_vcpu_load,
4721	.vcpu_put = svm_vcpu_put,
4722	.vcpu_blocking = avic_vcpu_blocking,
4723	.vcpu_unblocking = avic_vcpu_unblocking,
4724
4725	.update_exception_bitmap = svm_update_exception_bitmap,
4726	.get_msr_feature = svm_get_msr_feature,
4727	.get_msr = svm_get_msr,
4728	.set_msr = svm_set_msr,
4729	.get_segment_base = svm_get_segment_base,
4730	.get_segment = svm_get_segment,
4731	.set_segment = svm_set_segment,
4732	.get_cpl = svm_get_cpl,
4733	.get_cs_db_l_bits = svm_get_cs_db_l_bits,
4734	.set_cr0 = svm_set_cr0,
4735	.post_set_cr3 = sev_post_set_cr3,
4736	.is_valid_cr4 = svm_is_valid_cr4,
4737	.set_cr4 = svm_set_cr4,
4738	.set_efer = svm_set_efer,
4739	.get_idt = svm_get_idt,
4740	.set_idt = svm_set_idt,
4741	.get_gdt = svm_get_gdt,
4742	.set_gdt = svm_set_gdt,
4743	.set_dr7 = svm_set_dr7,
4744	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4745	.cache_reg = svm_cache_reg,
4746	.get_rflags = svm_get_rflags,
4747	.set_rflags = svm_set_rflags,
4748	.get_if_flag = svm_get_if_flag,
4749
4750	.flush_tlb_all = svm_flush_tlb_current,
4751	.flush_tlb_current = svm_flush_tlb_current,
4752	.flush_tlb_gva = svm_flush_tlb_gva,
4753	.flush_tlb_guest = svm_flush_tlb_current,
4754
4755	.vcpu_pre_run = svm_vcpu_pre_run,
4756	.vcpu_run = svm_vcpu_run,
4757	.handle_exit = svm_handle_exit,
4758	.skip_emulated_instruction = svm_skip_emulated_instruction,
4759	.update_emulated_instruction = NULL,
4760	.set_interrupt_shadow = svm_set_interrupt_shadow,
4761	.get_interrupt_shadow = svm_get_interrupt_shadow,
4762	.patch_hypercall = svm_patch_hypercall,
4763	.inject_irq = svm_inject_irq,
4764	.inject_nmi = svm_inject_nmi,
4765	.inject_exception = svm_inject_exception,
4766	.cancel_injection = svm_cancel_injection,
4767	.interrupt_allowed = svm_interrupt_allowed,
4768	.nmi_allowed = svm_nmi_allowed,
4769	.get_nmi_mask = svm_get_nmi_mask,
4770	.set_nmi_mask = svm_set_nmi_mask,
4771	.enable_nmi_window = svm_enable_nmi_window,
4772	.enable_irq_window = svm_enable_irq_window,
4773	.update_cr8_intercept = svm_update_cr8_intercept,
4774	.set_virtual_apic_mode = avic_set_virtual_apic_mode,
4775	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4776	.check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
4777	.apicv_post_state_restore = avic_apicv_post_state_restore,
4778
4779	.get_exit_info = svm_get_exit_info,
4780
4781	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4782
4783	.has_wbinvd_exit = svm_has_wbinvd_exit,
4784
4785	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
4786	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4787	.write_tsc_offset = svm_write_tsc_offset,
4788	.write_tsc_multiplier = svm_write_tsc_multiplier,
4789
4790	.load_mmu_pgd = svm_load_mmu_pgd,
4791
4792	.check_intercept = svm_check_intercept,
4793	.handle_exit_irqoff = svm_handle_exit_irqoff,
4794
4795	.request_immediate_exit = __kvm_request_immediate_exit,
4796
4797	.sched_in = svm_sched_in,
4798
4799	.nested_ops = &svm_nested_ops,
4800
4801	.deliver_interrupt = svm_deliver_interrupt,
4802	.pi_update_irte = avic_pi_update_irte,
4803	.setup_mce = svm_setup_mce,
4804
4805#ifdef CONFIG_KVM_SMM
4806	.smi_allowed = svm_smi_allowed,
4807	.enter_smm = svm_enter_smm,
4808	.leave_smm = svm_leave_smm,
4809	.enable_smi_window = svm_enable_smi_window,
4810#endif
4811
4812	.mem_enc_ioctl = sev_mem_enc_ioctl,
4813	.mem_enc_register_region = sev_mem_enc_register_region,
4814	.mem_enc_unregister_region = sev_mem_enc_unregister_region,
4815	.guest_memory_reclaimed = sev_guest_memory_reclaimed,
4816
4817	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4818	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
4819
4820	.can_emulate_instruction = svm_can_emulate_instruction,
4821
4822	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
4823
4824	.msr_filter_changed = svm_msr_filter_changed,
4825	.complete_emulated_msr = svm_complete_emulated_msr,
4826
4827	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
4828	.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
4829};
4830
4831/*
4832 * The default MMIO mask is a single bit (excluding the present bit),
4833 * which could conflict with the memory encryption bit. Check for
4834 * memory encryption support and override the default MMIO mask if
4835 * memory encryption is enabled.
4836 */
4837static __init void svm_adjust_mmio_mask(void)
4838{
4839	unsigned int enc_bit, mask_bit;
4840	u64 msr, mask;
4841
4842	/* If there is no memory encryption support, use existing mask */
4843	if (cpuid_eax(0x80000000) < 0x8000001f)
4844		return;
4845
4846	/* If memory encryption is not enabled, use existing mask */
4847	rdmsrl(MSR_AMD64_SYSCFG, msr);
4848	if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
4849		return;
4850
4851	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
4852	mask_bit = boot_cpu_data.x86_phys_bits;
4853
4854	/* Increment the mask bit if it is the same as the encryption bit */
4855	if (enc_bit == mask_bit)
4856		mask_bit++;
4857
4858	/*
4859	 * If the mask bit location is below 52, then some bits above the
4860	 * physical addressing limit will always be reserved, so use the
4861	 * rsvd_bits() function to generate the mask. This mask, along with
4862	 * the present bit, will be used to generate a page fault with
4863	 * PFER.RSV = 1.
4864	 *
4865	 * If the mask bit location is 52 (or above), then clear the mask.
4866	 */
4867	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
4868
4869	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
4870}
4871
4872static __init void svm_set_cpu_caps(void)
4873{
4874	kvm_set_cpu_caps();
4875
4876	kvm_caps.supported_perf_cap = 0;
4877	kvm_caps.supported_xss = 0;
4878
4879	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
4880	if (nested) {
4881		kvm_cpu_cap_set(X86_FEATURE_SVM);
4882		kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
4883
4884		if (nrips)
4885			kvm_cpu_cap_set(X86_FEATURE_NRIPS);
4886
4887		if (npt_enabled)
4888			kvm_cpu_cap_set(X86_FEATURE_NPT);
4889
4890		if (tsc_scaling)
4891			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
4892
4893		if (vls)
4894			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
4895		if (lbrv)
4896			kvm_cpu_cap_set(X86_FEATURE_LBRV);
4897
4898		if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
4899			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
4900
4901		if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
4902			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
4903
4904		if (vgif)
4905			kvm_cpu_cap_set(X86_FEATURE_VGIF);
4906
4907		/* Nested VM can receive #VMEXIT instead of triggering #GP */
4908		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
4909	}
4910
4911	/* CPUID 0x80000008 */
4912	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
4913	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
4914		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
4915
4916	/* AMD PMU PERFCTR_CORE CPUID */
4917	if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
4918		kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
4919
4920	/* CPUID 0x8000001F (SME/SEV features) */
4921	sev_set_cpu_caps();
4922}
4923
4924static __init int svm_hardware_setup(void)
4925{
4926	int cpu;
4927	struct page *iopm_pages;
4928	void *iopm_va;
4929	int r;
4930	unsigned int order = get_order(IOPM_SIZE);
4931
4932	/*
4933	 * NX is required for shadow paging and for NPT if the NX huge pages
4934	 * mitigation is enabled.
4935	 */
4936	if (!boot_cpu_has(X86_FEATURE_NX)) {
4937		pr_err_ratelimited("NX (Execute Disable) not supported\n");
4938		return -EOPNOTSUPP;
4939	}
4940	kvm_enable_efer_bits(EFER_NX);
4941
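	/*
	 * Allocate the I/O permissions map and set every bit so that all I/O
	 * port accesses by the guest are intercepted by default.
	 */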
4942	iopm_pages = alloc_pages(GFP_KERNEL, order);
4943
4944	if (!iopm_pages)
4945		return -ENOMEM;
4946
4947	iopm_va = page_address(iopm_pages);
4948	memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
4949	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
4950
4951	init_msrpm_offsets();
4952
4953	kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
4954				     XFEATURE_MASK_BNDCSR);
4955
4956	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
4957		kvm_enable_efer_bits(EFER_FFXSR);
4958
4959	if (tsc_scaling) {
4960		if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
4961			tsc_scaling = false;
4962		} else {
4963			pr_info("TSC scaling supported\n");
4964			kvm_caps.has_tsc_control = true;
4965		}
4966	}
4967	kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
4968	kvm_caps.tsc_scaling_ratio_frac_bits = 32;
4969
4970	tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
4971
4972	/* Check for pause filtering support */
4973	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
4974		pause_filter_count = 0;
4975		pause_filter_thresh = 0;
4976	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
4977		pause_filter_thresh = 0;
4978	}
4979
4980	if (nested) {
4981		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
4982		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
4983	}
4984
4985	/*
4986	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
4987	 * NPT isn't supported if the host is using 2-level paging since host
4988	 * CR4 is unchanged on VMRUN.
4989	 */
4990	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
4991		npt_enabled = false;
4992
4993	if (!boot_cpu_has(X86_FEATURE_NPT))
4994		npt_enabled = false;
4995
4996	/* Force VM NPT level equal to the host's paging level */
4997	kvm_configure_mmu(npt_enabled, get_npt_level(),
4998			  get_npt_level(), PG_LEVEL_1G);
4999	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5000
5001	/* Setup shadow_me_value and shadow_me_mask */
5002	kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5003
5004	svm_adjust_mmio_mask();
5005
5006	/*
5007	 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
5008	 * may be modified by svm_adjust_mmio_mask()).
5009	 */
5010	sev_hardware_setup();
5011
5012	svm_hv_hardware_setup();
5013
5014	for_each_possible_cpu(cpu) {
5015		r = svm_cpu_init(cpu);
5016		if (r)
5017			goto err;
5018	}
5019
5020	if (nrips) {
5021		if (!boot_cpu_has(X86_FEATURE_NRIPS))
5022			nrips = false;
5023	}
5024
5025	enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
5026
5027	if (!enable_apicv) {
5028		svm_x86_ops.vcpu_blocking = NULL;
5029		svm_x86_ops.vcpu_unblocking = NULL;
5030		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5031	}
5032
5033	if (vls) {
5034		if (!npt_enabled ||
5035		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5036		    !IS_ENABLED(CONFIG_X86_64)) {
5037			vls = false;
5038		} else {
5039			pr_info("Virtual VMLOAD VMSAVE supported\n");
5040		}
5041	}
5042
5043	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5044		svm_gp_erratum_intercept = false;
5045
5046	if (vgif) {
5047		if (!boot_cpu_has(X86_FEATURE_VGIF))
5048			vgif = false;
5049		else
5050			pr_info("Virtual GIF supported\n");
5051	}
5052
5053	if (lbrv) {
5054		if (!boot_cpu_has(X86_FEATURE_LBRV))
5055			lbrv = false;
5056		else
5057			pr_info("LBR virtualization supported\n");
5058	}
5059
5060	if (!enable_pmu)
5061		pr_info("PMU virtualization is disabled\n");
5062
5063	svm_set_cpu_caps();
5064
5065	/*
5066	 * It seems that on AMD processors the PTE's accessed bit is
5067	 * set by the CPU hardware before the NPF vmexit.
5068	 * This is not the expected behaviour and our tests fail because
5069	 * of it.
5070	 * The workaround here is to disable support for
5071	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
5072	 * In this case userspace can query the KVM_CAP_SMALLER_MAXPHYADDR
5073	 * extension to learn whether the support is present and decide
5074	 * how to handle it.
5075	 * If future AMD CPU models change the behaviour described above,
5076	 * this variable can be changed accordingly.
5077	 */
5078	allow_smaller_maxphyaddr = !npt_enabled;
5079
5080	return 0;
5081
5082err:
5083	svm_hardware_unsetup();
5084	return r;
5085}
5086
5087
5088static struct kvm_x86_init_ops svm_init_ops __initdata = {
5089	.cpu_has_kvm_support = has_svm,
5090	.disabled_by_bios = is_disabled,
5091	.hardware_setup = svm_hardware_setup,
5092	.check_processor_compatibility = svm_check_processor_compat,
5093
5094	.runtime_ops = &svm_x86_ops,
5095	.pmu_ops = &amd_pmu_ops,
5096};
5097
5098static int __init svm_init(void)
5099{
5100	__unused_size_checks();
5101
5102	return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
5103			__alignof__(struct vcpu_svm), THIS_MODULE);
5104}
5105
5106static void __exit svm_exit(void)
5107{
5108	kvm_exit();
5109}
5110
5111module_init(svm_init)
5112module_exit(svm_exit)