   1#define pr_fmt(fmt) "SVM: " fmt
   2
   3#include <linux/kvm_host.h>
   4
   5#include "irq.h"
   6#include "mmu.h"
   7#include "kvm_cache_regs.h"
   8#include "x86.h"
   9#include "smm.h"
  10#include "cpuid.h"
  11#include "pmu.h"
  12
  13#include <linux/module.h>
  14#include <linux/mod_devicetable.h>
  15#include <linux/kernel.h>
  16#include <linux/vmalloc.h>
  17#include <linux/highmem.h>
  18#include <linux/amd-iommu.h>
  19#include <linux/sched.h>
  20#include <linux/trace_events.h>
  21#include <linux/slab.h>
  22#include <linux/hashtable.h>
  23#include <linux/objtool.h>
  24#include <linux/psp-sev.h>
  25#include <linux/file.h>
  26#include <linux/pagemap.h>
  27#include <linux/swap.h>
  28#include <linux/rwsem.h>
  29#include <linux/cc_platform.h>
  30
  31#include <asm/apic.h>
  32#include <asm/perf_event.h>
  33#include <asm/tlbflush.h>
  34#include <asm/desc.h>
  35#include <asm/debugreg.h>
  36#include <asm/kvm_para.h>
  37#include <asm/irq_remapping.h>
  38#include <asm/spec-ctrl.h>
  39#include <asm/cpu_device_id.h>
  40#include <asm/traps.h>
  41#include <asm/fpu/api.h>
  42
  43#include <asm/virtext.h>
  44#include "trace.h"
  45
  46#include "svm.h"
  47#include "svm_ops.h"
  48
  49#include "kvm_onhyperv.h"
  50#include "svm_onhyperv.h"
  51
  52MODULE_AUTHOR("Qumranet");
  53MODULE_LICENSE("GPL");
  54
  55#ifdef MODULE
  56static const struct x86_cpu_id svm_cpu_id[] = {
  57	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
  58	{}
  59};
  60MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  61#endif
  62
  63#define SEG_TYPE_LDT 2
  64#define SEG_TYPE_BUSY_TSS16 3
  65
  66static bool erratum_383_found __read_mostly;
  67
  68u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  69
  70/*
  71 * Set osvw_len to a higher value when updated Revision Guides
  72 * are published and we know what the new status bits are.
  73 */
  74static uint64_t osvw_len = 4, osvw_status;
  75
  76static DEFINE_PER_CPU(u64, current_tsc_ratio);
  77
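    /*
     * Each 16-byte MMIO APIC register maps to a single x2APIC MSR, hence the
     * shift by 4: APIC_TASKPRI (offset 0x80), for example, becomes MSR 0x808
     * (APIC_BASE_MSR 0x800 + 0x8).
     */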
  78#define X2APIC_MSR(x)	(APIC_BASE_MSR + (x >> 4))
  79
  80static const struct svm_direct_access_msrs {
  81	u32 index;   /* Index of the MSR */
  82	bool always; /* True if intercept is initially cleared */
  83} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
  84	{ .index = MSR_STAR,				.always = true  },
  85	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
  86	{ .index = MSR_IA32_SYSENTER_EIP,		.always = false },
  87	{ .index = MSR_IA32_SYSENTER_ESP,		.always = false },
  88#ifdef CONFIG_X86_64
  89	{ .index = MSR_GS_BASE,				.always = true  },
  90	{ .index = MSR_FS_BASE,				.always = true  },
  91	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
  92	{ .index = MSR_LSTAR,				.always = true  },
  93	{ .index = MSR_CSTAR,				.always = true  },
  94	{ .index = MSR_SYSCALL_MASK,			.always = true  },
  95#endif
  96	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
  97	{ .index = MSR_IA32_PRED_CMD,			.always = false },
  98	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
  99	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
 100	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
 101	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
 102	{ .index = MSR_EFER,				.always = false },
 103	{ .index = MSR_IA32_CR_PAT,			.always = false },
 104	{ .index = MSR_AMD64_SEV_ES_GHCB,		.always = true  },
 105	{ .index = MSR_TSC_AUX,				.always = false },
 106	{ .index = X2APIC_MSR(APIC_ID),			.always = false },
 107	{ .index = X2APIC_MSR(APIC_LVR),		.always = false },
 108	{ .index = X2APIC_MSR(APIC_TASKPRI),		.always = false },
 109	{ .index = X2APIC_MSR(APIC_ARBPRI),		.always = false },
 110	{ .index = X2APIC_MSR(APIC_PROCPRI),		.always = false },
 111	{ .index = X2APIC_MSR(APIC_EOI),		.always = false },
 112	{ .index = X2APIC_MSR(APIC_RRR),		.always = false },
 113	{ .index = X2APIC_MSR(APIC_LDR),		.always = false },
 114	{ .index = X2APIC_MSR(APIC_DFR),		.always = false },
 115	{ .index = X2APIC_MSR(APIC_SPIV),		.always = false },
 116	{ .index = X2APIC_MSR(APIC_ISR),		.always = false },
 117	{ .index = X2APIC_MSR(APIC_TMR),		.always = false },
 118	{ .index = X2APIC_MSR(APIC_IRR),		.always = false },
 119	{ .index = X2APIC_MSR(APIC_ESR),		.always = false },
 120	{ .index = X2APIC_MSR(APIC_ICR),		.always = false },
 121	{ .index = X2APIC_MSR(APIC_ICR2),		.always = false },
 122
 123	/*
 124	 * Note:
 125	 * AMD does not virtualize APIC TSC-deadline timer mode, but it is
 126	 * emulated by KVM. Setting bit 18 of the APIC LVTT register (MSR 0x832)
 127	 * would make the AVIC hardware generate a #GP fault. Therefore, always
 128	 * intercept MSR 0x832 and do not set up a direct_access_msrs entry for it.
 129	 */
 130	{ .index = X2APIC_MSR(APIC_LVTTHMR),		.always = false },
 131	{ .index = X2APIC_MSR(APIC_LVTPC),		.always = false },
 132	{ .index = X2APIC_MSR(APIC_LVT0),		.always = false },
 133	{ .index = X2APIC_MSR(APIC_LVT1),		.always = false },
 134	{ .index = X2APIC_MSR(APIC_LVTERR),		.always = false },
 135	{ .index = X2APIC_MSR(APIC_TMICT),		.always = false },
 136	{ .index = X2APIC_MSR(APIC_TMCCT),		.always = false },
 137	{ .index = X2APIC_MSR(APIC_TDCR),		.always = false },
 138	{ .index = MSR_INVALID,				.always = false },
 139};
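    /*
     * Entries with .always == true have their read/write intercepts cleared
     * for every vCPU in svm_vcpu_init_msrpm(); the remaining entries are only
     * opened up on demand, e.g. SPEC_CTRL once the guest starts using it, or
     * the LBR MSRs when LBR virtualization is enabled (svm_enable_lbrv()).
     */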
 140
 141/*
 142 * These two parameters are used to configure the controls for Pause-Loop Exiting:
 143 * pause_filter_count: On processors that support Pause filtering (indicated
 144 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
 145 *	count value. On VMRUN this value is loaded into an internal counter.
 146 *	Each time a pause instruction is executed, this counter is decremented
 147 *	until it reaches zero, at which time a #VMEXIT is generated if pause
 148 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 149 *	Intercept Filtering for more details.
 150 *	This also indicates whether the PLE logic is enabled.
 151 *
 152 * pause_filter_thresh: In addition, some processor families support advanced
 153 *	pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an upper
 154 *	bound on the amount of time a guest is allowed to execute in a pause loop.
 155 *	In this mode, a 16-bit pause filter threshold field is added in the
 156 *	VMCB. The threshold value is a cycle count that is used to reset the
 157 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 158 *	count value from VMCB into an internal counter. Then, on each pause
 159 *	instruction the hardware checks the elapsed number of cycles since
 160 *	the most recent pause instruction against the pause filter threshold.
 161 *	If the elapsed cycle count is greater than the pause filter threshold,
 162 *	then the internal pause count is reloaded from the VMCB and execution
 163 *	continues. If the elapsed cycle count is less than the pause filter
 164 *	threshold, then the internal pause count is decremented. If the count
 165 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 166 *	triggered. If advanced pause filtering is supported and pause filter
 167 *	threshold field is set to zero, the filter will operate in the simpler,
 168 *	count only mode.
 169 */
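    /*
     * In short: PAUSEs spaced further apart than pause_filter_thresh cycles
     * reload the counter, while tightly spaced PAUSEs (a likely spin on a
     * contended lock) drain it until a PAUSE #VMEXIT fires, which KVM handles
     * by yielding the vCPU and growing the window via grow_ple_window().
     */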
 170
 171static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
 172module_param(pause_filter_thresh, ushort, 0444);
 173
 174static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
 175module_param(pause_filter_count, ushort, 0444);
 176
 177/* Default doubles per-vcpu window every exit. */
 178static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 179module_param(pause_filter_count_grow, ushort, 0444);
 180
 181/* Default resets per-vcpu window every exit to pause_filter_count. */
 182static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 183module_param(pause_filter_count_shrink, ushort, 0444);
 184
 185/* Default is to compute the maximum so we can never overflow. */
 186static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 187module_param(pause_filter_count_max, ushort, 0444);
 188
 189/*
 190 * Use nested page tables by default.  Note, NPT may get forced off by
 191 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 192 */
 193bool npt_enabled = true;
 194module_param_named(npt, npt_enabled, bool, 0444);
 195
 196/* allow nested virtualization in KVM/SVM */
 197static int nested = true;
 198module_param(nested, int, S_IRUGO);
 199
 200/* enable/disable Next RIP Save */
 201static int nrips = true;
 202module_param(nrips, int, 0444);
 203
 204/* enable/disable Virtual VMLOAD VMSAVE */
 205static int vls = true;
 206module_param(vls, int, 0444);
 207
 208/* enable/disable Virtual GIF */
 209int vgif = true;
 210module_param(vgif, int, 0444);
 211
 212/* enable/disable LBR virtualization */
 213static int lbrv = true;
 214module_param(lbrv, int, 0444);
 215
 216static int tsc_scaling = true;
 217module_param(tsc_scaling, int, 0444);
 218
 219/*
 220 * enable / disable AVIC.  Because the defaults differ for APICv
 221 * support between VMX and SVM we cannot use module_param_named.
 222 */
 223static bool avic;
 224module_param(avic, bool, 0444);
 225
 226bool __read_mostly dump_invalid_vmcb;
 227module_param(dump_invalid_vmcb, bool, 0644);
 228
 229
 230bool intercept_smi = true;
 231module_param(intercept_smi, bool, 0444);
 232
 233
 234static bool svm_gp_erratum_intercept = true;
 235
 236static u8 rsm_ins_bytes[] = "\x0f\xaa";
 237
 238static unsigned long iopm_base;
 239
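    /*
     * Layout of a 16-byte long mode system-segment (LDT/TSS) descriptor as it
     * appears in the host GDT; svm_hardware_enable() uses it to locate the
     * host TSS descriptor (sd->tss_desc).
     */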
 240struct kvm_ldttss_desc {
 241	u16 limit0;
 242	u16 base0;
 243	unsigned base1:8, type:5, dpl:2, p:1;
 244	unsigned limit1:4, zero0:3, g:1, base2:8;
 245	u32 base3;
 246	u32 zero1;
 247} __attribute__((packed));
 248
 249DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
 250
 251/*
 252 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 253 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 254 *
 255 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 256 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 257 */
 258static int tsc_aux_uret_slot __read_mostly = -1;
 259
 260static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 261
 262#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 263#define MSRS_RANGE_SIZE 2048
 264#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
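    /*
     * Each MSR owns two bits (read + write intercept) in the permission map,
     * so a 2048-byte range covers 2048 * 8 / 2 = 8192 MSRs.
     */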
 265
 266u32 svm_msrpm_offset(u32 msr)
 267{
 268	u32 offset;
 269	int i;
 270
 271	for (i = 0; i < NUM_MSR_MAPS; i++) {
 272		if (msr < msrpm_ranges[i] ||
 273		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 274			continue;
 275
 276		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 277		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 278
 279		/* Now we have the u8 offset - but need the u32 offset */
 280		return offset / 4;
 281	}
 282
 283	/* MSR not in any range */
 284	return MSR_INVALID;
 285}
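    /*
     * Worked example for svm_msrpm_offset(): MSR_STAR (0xc0000081) falls in
     * range 1 (base 0xc0000000), giving byte offset 0x81 / 4 = 0x20 within
     * the range, 0x820 within the bitmap, and thus a u32 offset of 0x208.
     */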
 286
 287static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
 288
 289static int get_npt_level(void)
 290{
 291#ifdef CONFIG_X86_64
 292	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 293#else
 294	return PT32E_ROOT_LEVEL;
 295#endif
 296}
 297
 298int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 299{
 300	struct vcpu_svm *svm = to_svm(vcpu);
 301	u64 old_efer = vcpu->arch.efer;
 302	vcpu->arch.efer = efer;
 303
 304	if (!npt_enabled) {
 305		/* Shadow paging assumes NX to be available.  */
 306		efer |= EFER_NX;
 307
 308		if (!(efer & EFER_LMA))
 309			efer &= ~EFER_LME;
 310	}
 311
 312	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
 313		if (!(efer & EFER_SVME)) {
 314			svm_leave_nested(vcpu);
 315			svm_set_gif(svm, true);
 316			/* #GP intercept is still needed for vmware backdoor */
 317			if (!enable_vmware_backdoor)
 318				clr_exception_intercept(svm, GP_VECTOR);
 319
 320			/*
 321			 * Free the nested guest state, unless we are in SMM.
 322			 * In this case we will return to the nested guest
 323			 * as soon as we leave SMM.
 324			 */
 325			if (!is_smm(vcpu))
 326				svm_free_nested(svm);
 327
 328		} else {
 329			int ret = svm_allocate_nested(svm);
 330
 331			if (ret) {
 332				vcpu->arch.efer = old_efer;
 333				return ret;
 334			}
 335
 336			/*
 337			 * Never intercept #GP for SEV guests, KVM can't
 338			 * decrypt guest memory to workaround the erratum.
 339			 */
 340			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
 341				set_exception_intercept(svm, GP_VECTOR);
 342		}
 343	}
 344
 345	svm->vmcb->save.efer = efer | EFER_SVME;
 346	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 347	return 0;
 348}
 349
 350static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 351{
 352	struct vcpu_svm *svm = to_svm(vcpu);
 353	u32 ret = 0;
 354
 355	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 356		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 357	return ret;
 358}
 359
 360static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 361{
 362	struct vcpu_svm *svm = to_svm(vcpu);
 363
 364	if (mask == 0)
 365		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 366	else
 367		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 368
 369}
 370
 371static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
 372					   bool commit_side_effects)
 373{
 374	struct vcpu_svm *svm = to_svm(vcpu);
 375	unsigned long old_rflags;
 376
 377	/*
 378	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
 379	 * the type of exit and the #VC handler in the guest.
 380	 */
 381	if (sev_es_guest(vcpu->kvm))
 382		goto done;
 383
 384	if (nrips && svm->vmcb->control.next_rip != 0) {
 385		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 386		svm->next_rip = svm->vmcb->control.next_rip;
 387	}
 388
 389	if (!svm->next_rip) {
 390		if (unlikely(!commit_side_effects))
 391			old_rflags = svm->vmcb->save.rflags;
 392
 393		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
 394			return 0;
 395
 396		if (unlikely(!commit_side_effects))
 397			svm->vmcb->save.rflags = old_rflags;
 398	} else {
 399		kvm_rip_write(vcpu, svm->next_rip);
 400	}
 401
 402done:
 403	if (likely(commit_side_effects))
 404		svm_set_interrupt_shadow(vcpu, 0);
 405
 406	return 1;
 407}
 408
 409static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 410{
 411	return __svm_skip_emulated_instruction(vcpu, true);
 412}
 413
 414static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
 415{
 416	unsigned long rip, old_rip = kvm_rip_read(vcpu);
 417	struct vcpu_svm *svm = to_svm(vcpu);
 418
 419	/*
 420	 * Due to architectural shortcomings, the CPU doesn't always provide
 421	 * NextRIP, e.g. if KVM intercepted an exception that occurred while
 422	 * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
 423	 * the instruction even if NextRIP is supported to acquire the next
 424	 * RIP so that it can be shoved into the NextRIP field, otherwise
 425	 * hardware will fail to advance guest RIP during event injection.
 426	 * Drop the exception/interrupt if emulation fails and effectively
 427	 * retry the instruction, it's the least awful option.  If NRIPS is
 428	 * in use, the skip must not commit any side effects such as clearing
 429	 * the interrupt shadow or RFLAGS.RF.
 430	 */
 431	if (!__svm_skip_emulated_instruction(vcpu, !nrips))
 432		return -EIO;
 433
 434	rip = kvm_rip_read(vcpu);
 435
 436	/*
 437	 * Save the injection information, even when using next_rip, as the
 438	 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
 439	 * doesn't complete due to a VM-Exit occurring while the CPU is
 440	 * vectoring the event.   Decoding the instruction isn't guaranteed to
 441	 * work as there may be no backing instruction, e.g. if the event is
 442	 * being injected by L1 for L2, or if the guest is patching INT3 into
 443	 * a different instruction.
 444	 */
 445	svm->soft_int_injected = true;
 446	svm->soft_int_csbase = svm->vmcb->save.cs.base;
 447	svm->soft_int_old_rip = old_rip;
 448	svm->soft_int_next_rip = rip;
 449
 450	if (nrips)
 451		kvm_rip_write(vcpu, old_rip);
 452
 453	if (static_cpu_has(X86_FEATURE_NRIPS))
 454		svm->vmcb->control.next_rip = rip;
 455
 456	return 0;
 457}
 458
 459static void svm_inject_exception(struct kvm_vcpu *vcpu)
 460{
 461	struct kvm_queued_exception *ex = &vcpu->arch.exception;
 462	struct vcpu_svm *svm = to_svm(vcpu);
 463
 464	kvm_deliver_exception_payload(vcpu, ex);
 465
 466	if (kvm_exception_is_soft(ex->vector) &&
 467	    svm_update_soft_interrupt_rip(vcpu))
 468		return;
 469
 470	svm->vmcb->control.event_inj = ex->vector
 471		| SVM_EVTINJ_VALID
 472		| (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 473		| SVM_EVTINJ_TYPE_EXEPT;
 474	svm->vmcb->control.event_inj_err = ex->error_code;
 475}
 476
 477static void svm_init_erratum_383(void)
 478{
 479	u32 low, high;
 480	int err;
 481	u64 val;
 482
 483	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 484		return;
 485
 486	/* Use _safe variants to not break nested virtualization */
 487	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 488	if (err)
 489		return;
 490
 491	val |= (1ULL << 47);
 492
 493	low  = lower_32_bits(val);
 494	high = upper_32_bits(val);
 495
 496	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 497
 498	erratum_383_found = true;
 499}
 500
 501static void svm_init_osvw(struct kvm_vcpu *vcpu)
 502{
 503	/*
 504	 * Guests should see errata 400 and 415 as fixed (assuming that
 505	 * HLT and IO instructions are intercepted).
 506	 */
 507	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 508	vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 509
 510	/*
 511	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
 512	 * all osvw.status bits inside that length, including bit 0 (which is
 513	 * reserved for erratum 298), are valid. However, if host processor's
 514	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
 515	 * be conservative here and therefore we tell the guest that erratum 298
 516	 * is present (because we really don't know).
 517	 */
 518	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 519		vcpu->arch.osvw.status |= 1;
 520}
 521
 522static int has_svm(void)
 523{
 524	const char *msg;
 525
 526	if (!cpu_has_svm(&msg)) {
 527		printk(KERN_INFO "has_svm: %s\n", msg);
 528		return 0;
 529	}
 530
 531	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
 532		pr_info("KVM is unsupported when running as an SEV guest\n");
 533		return 0;
 534	}
 535
 536	return 1;
 537}
 538
 539void __svm_write_tsc_multiplier(u64 multiplier)
 540{
 541	preempt_disable();
 542
 543	if (multiplier == __this_cpu_read(current_tsc_ratio))
 544		goto out;
 545
 546	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
 547	__this_cpu_write(current_tsc_ratio, multiplier);
 548out:
 549	preempt_enable();
 550}
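    /*
     * current_tsc_ratio caches the last multiplier written on this CPU so
     * redundant MSR_AMD64_TSC_RATIO writes are skipped; preemption is
     * disabled above so the cached value and the WRMSR target the same CPU.
     */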
 551
 552static void svm_hardware_disable(void)
 553{
 554	/* Make sure we clean up behind us */
 555	if (tsc_scaling)
 556		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 557
 558	cpu_svm_disable();
 559
 560	amd_pmu_disable_virt();
 561}
 562
 563static int svm_hardware_enable(void)
 564{
 565
 566	struct svm_cpu_data *sd;
 567	uint64_t efer;
 568	struct desc_struct *gdt;
 569	int me = raw_smp_processor_id();
 570
 571	rdmsrl(MSR_EFER, efer);
 572	if (efer & EFER_SVME)
 573		return -EBUSY;
 574
 575	if (!has_svm()) {
 576		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
 577		return -EINVAL;
 578	}
 579	sd = per_cpu_ptr(&svm_data, me);
 580	sd->asid_generation = 1;
 581	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 582	sd->next_asid = sd->max_asid + 1;
 583	sd->min_asid = max_sev_asid + 1;
 584
 585	gdt = get_current_gdt_rw();
 586	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 587
 588	wrmsrl(MSR_EFER, efer | EFER_SVME);
 589
 590	wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
 591
 592	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 593		/*
 594		 * Set the default value, even if we don't use TSC scaling
 595		 * to avoid having stale value in the msr
 596		 */
 597		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 598	}
 599
 600
 601	/*
 602	 * Get OSVW bits.
 603	 *
 604	 * Note that it is possible to have a system with mixed processor
 605	 * revisions and therefore different OSVW bits. If bits are not the same
 606	 * on different processors then choose the worst case (i.e. if erratum
 607	 * is present on one processor and not on another then assume that the
 608	 * erratum is present everywhere).
 609	 */
 610	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 611		uint64_t len, status = 0;
 612		int err;
 613
 614		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 615		if (!err)
 616			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 617						      &err);
 618
 619		if (err)
 620			osvw_status = osvw_len = 0;
 621		else {
 622			if (len < osvw_len)
 623				osvw_len = len;
 624			osvw_status |= status;
 625			osvw_status &= (1ULL << osvw_len) - 1;
 626		}
 627	} else
 628		osvw_status = osvw_len = 0;
 629
 630	svm_init_erratum_383();
 631
 632	amd_pmu_enable_virt();
 633
 634	return 0;
 635}
 636
 637static void svm_cpu_uninit(int cpu)
 638{
 639	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 640
 641	if (!sd->save_area)
 642		return;
 643
 644	kfree(sd->sev_vmcbs);
 645	__free_page(sd->save_area);
 646	sd->save_area_pa = 0;
 647	sd->save_area = NULL;
 648}
 649
 650static int svm_cpu_init(int cpu)
 651{
 652	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 653	int ret = -ENOMEM;
 654
 655	memset(sd, 0, sizeof(struct svm_cpu_data));
 656	sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
 657	if (!sd->save_area)
 658		return ret;
 659
 660	ret = sev_cpu_init(sd);
 661	if (ret)
 662		goto free_save_area;
 663
 664	sd->save_area_pa = __sme_page_pa(sd->save_area);
 665	return 0;
 666
 667free_save_area:
 668	__free_page(sd->save_area);
 669	sd->save_area = NULL;
 670	return ret;
 671
 672}
 673
 674static int direct_access_msr_slot(u32 msr)
 675{
 676	u32 i;
 677
 678	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 679		if (direct_access_msrs[i].index == msr)
 680			return i;
 681
 682	return -ENOENT;
 683}
 684
 685static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
 686				     int write)
 687{
 688	struct vcpu_svm *svm = to_svm(vcpu);
 689	int slot = direct_access_msr_slot(msr);
 690
 691	if (slot == -ENOENT)
 692		return;
 693
 694	/* Set the shadow bitmaps to the desired intercept states */
 695	if (read)
 696		set_bit(slot, svm->shadow_msr_intercept.read);
 697	else
 698		clear_bit(slot, svm->shadow_msr_intercept.read);
 699
 700	if (write)
 701		set_bit(slot, svm->shadow_msr_intercept.write);
 702	else
 703		clear_bit(slot, svm->shadow_msr_intercept.write);
 704}
 705
 706static bool valid_msr_intercept(u32 index)
 707{
 708	return direct_access_msr_slot(index) != -ENOENT;
 709}
 710
 711static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 712{
 713	u8 bit_write;
 714	unsigned long tmp;
 715	u32 offset;
 716	u32 *msrpm;
 717
 718	/*
 719	 * For non-nested case:
 720	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
 721	 * save it.
 722	 *
 723	 * For nested case:
 724	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 725	 * save it.
 726	 */
 727	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
 728				      to_svm(vcpu)->msrpm;
 729
 730	offset    = svm_msrpm_offset(msr);
 731	bit_write = 2 * (msr & 0x0f) + 1;
 732	tmp       = msrpm[offset];
 733
 734	BUG_ON(offset == MSR_INVALID);
 735
 736	return !!test_bit(bit_write,  &tmp);
 737}
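    /*
     * Each u32 chunk of the permission map covers 16 MSRs at two bits apiece:
     * for MSR n within the chunk (n = msr & 0xf), bit 2n intercepts reads and
     * bit 2n + 1 intercepts writes.  This is the encoding used above and in
     * set_msr_interception_bitmap() below.
     */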
 738
 739static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 740					u32 msr, int read, int write)
 741{
 742	struct vcpu_svm *svm = to_svm(vcpu);
 743	u8 bit_read, bit_write;
 744	unsigned long tmp;
 745	u32 offset;
 746
 747	/*
 748	 * If this warning triggers, extend the direct_access_msrs list at the
 749	 * beginning of the file.
 750	 */
 751	WARN_ON(!valid_msr_intercept(msr));
 752
 753	/* Force MSRs disallowed by the userspace MSR filter to be intercepted */
 754	if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
 755		read = 0;
 756
 757	if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
 758		write = 0;
 759
 760	offset    = svm_msrpm_offset(msr);
 761	bit_read  = 2 * (msr & 0x0f);
 762	bit_write = 2 * (msr & 0x0f) + 1;
 763	tmp       = msrpm[offset];
 764
 765	BUG_ON(offset == MSR_INVALID);
 766
 767	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 768	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 769
 770	msrpm[offset] = tmp;
 771
 772	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
 773	svm->nested.force_msr_bitmap_recalc = true;
 774}
 775
 776void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 777			  int read, int write)
 778{
 779	set_shadow_msr_intercept(vcpu, msr, read, write);
 780	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
 781}
 782
 783u32 *svm_vcpu_alloc_msrpm(void)
 784{
 785	unsigned int order = get_order(MSRPM_SIZE);
 786	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 787	u32 *msrpm;
 788
 789	if (!pages)
 790		return NULL;
 791
 792	msrpm = page_address(pages);
 793	memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 794
 795	return msrpm;
 796}
 797
 798void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 799{
 800	int i;
 801
 802	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 803		if (!direct_access_msrs[i].always)
 804			continue;
 805		set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
 806	}
 807}
 808
 809void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
 810{
 811	int i;
 812
 813	if (intercept == svm->x2avic_msrs_intercepted)
 814		return;
 815
 816	if (avic_mode != AVIC_MODE_X2 ||
 817	    !apic_x2apic_mode(svm->vcpu.arch.apic))
 818		return;
 819
 820	for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
 821		int index = direct_access_msrs[i].index;
 822
 823		if ((index < APIC_BASE_MSR) ||
 824		    (index > APIC_BASE_MSR + 0xff))
 825			continue;
 826		set_msr_interception(&svm->vcpu, svm->msrpm, index,
 827				     !intercept, !intercept);
 828	}
 829
 830	svm->x2avic_msrs_intercepted = intercept;
 831}
 832
 833void svm_vcpu_free_msrpm(u32 *msrpm)
 834{
 835	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 836}
 837
 838static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
 839{
 840	struct vcpu_svm *svm = to_svm(vcpu);
 841	u32 i;
 842
 843	/*
 844	 * Set intercept permissions for all direct access MSRs again. They
 845	 * will automatically get filtered through the MSR filter, so we are
 846	 * back in sync after this.
 847	 */
 848	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 849		u32 msr = direct_access_msrs[i].index;
 850		u32 read = test_bit(i, svm->shadow_msr_intercept.read);
 851		u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 852
 853		set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
 854	}
 855}
 856
 857static void add_msr_offset(u32 offset)
 858{
 859	int i;
 860
 861	for (i = 0; i < MSRPM_OFFSETS; ++i) {
 862
 863		/* Offset already in list? */
 864		if (msrpm_offsets[i] == offset)
 865			return;
 866
 867		/* Slot used by another offset? */
 868		if (msrpm_offsets[i] != MSR_INVALID)
 869			continue;
 870
 871		/* Add offset to list */
 872		msrpm_offsets[i] = offset;
 873
 874		return;
 875	}
 876
 877	/*
 878	 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
 879	 * increase MSRPM_OFFSETS in that case.
 880	 */
 881	BUG();
 882}
 883
 884static void init_msrpm_offsets(void)
 885{
 886	int i;
 887
 888	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 889
 890	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 891		u32 offset;
 892
 893		offset = svm_msrpm_offset(direct_access_msrs[i].index);
 894		BUG_ON(offset == MSR_INVALID);
 895
 896		add_msr_offset(offset);
 897	}
 898}
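    /*
     * msrpm_offsets[] ends up holding only the u32 chunks of the permission
     * map that back direct_access_msrs, which lets the nested-SVM code merge
     * an L1 MSR bitmap by scanning just these offsets rather than the whole
     * multi-page bitmap.
     */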
 899
 900void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 901{
 902	to_vmcb->save.dbgctl		= from_vmcb->save.dbgctl;
 903	to_vmcb->save.br_from		= from_vmcb->save.br_from;
 904	to_vmcb->save.br_to		= from_vmcb->save.br_to;
 905	to_vmcb->save.last_excp_from	= from_vmcb->save.last_excp_from;
 906	to_vmcb->save.last_excp_to	= from_vmcb->save.last_excp_to;
 907
 908	vmcb_mark_dirty(to_vmcb, VMCB_LBR);
 909}
 910
 911static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 912{
 913	struct vcpu_svm *svm = to_svm(vcpu);
 914
 915	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
 916	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 917	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 918	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 919	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 920
 921	/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
 922	if (is_guest_mode(vcpu))
 923		svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
 924}
 925
 926static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
 927{
 928	struct vcpu_svm *svm = to_svm(vcpu);
 929
 930	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
 931	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 932	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 933	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 934	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 935
 936	/*
 937	 * Move the LBR msrs back to the vmcb01 to avoid copying them
 938	 * on nested guest entries.
 939	 */
 940	if (is_guest_mode(vcpu))
 941		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
 942}
 943
 944static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
 945{
 946	/*
 947	 * If LBR virtualization is disabled, the LBR MSRs are always kept
 948	 * in the vmcb01 to avoid copying them on nested guest entries.
 949	 *
 950	 * If nested, and LBR virtualization is enabled/disabled, the MSRs
 951	 * are moved between the vmcb01 and vmcb02 as needed.
 952	 */
 953	struct vmcb *vmcb =
 954		(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
 955			svm->vmcb : svm->vmcb01.ptr;
 956
 957	switch (index) {
 958	case MSR_IA32_DEBUGCTLMSR:
 959		return vmcb->save.dbgctl;
 960	case MSR_IA32_LASTBRANCHFROMIP:
 961		return vmcb->save.br_from;
 962	case MSR_IA32_LASTBRANCHTOIP:
 963		return vmcb->save.br_to;
 964	case MSR_IA32_LASTINTFROMIP:
 965		return vmcb->save.last_excp_from;
 966	case MSR_IA32_LASTINTTOIP:
 967		return vmcb->save.last_excp_to;
 968	default:
 969		KVM_BUG(false, svm->vcpu.kvm,
 970			"%s: Unknown MSR 0x%x", __func__, index);
 971		return 0;
 972	}
 973}
 974
 975void svm_update_lbrv(struct kvm_vcpu *vcpu)
 976{
 977	struct vcpu_svm *svm = to_svm(vcpu);
 978
 979	bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
 980					   DEBUGCTLMSR_LBR;
 981
 982	bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
 983				      LBR_CTL_ENABLE_MASK);
 984
 985	if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
 986		if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
 987			enable_lbrv = true;
 988
 989	if (enable_lbrv == current_enable_lbrv)
 990		return;
 991
 992	if (enable_lbrv)
 993		svm_enable_lbrv(vcpu);
 994	else
 995		svm_disable_lbrv(vcpu);
 996}
 997
 998void disable_nmi_singlestep(struct vcpu_svm *svm)
 999{
1000	svm->nmi_singlestep = false;
1001
1002	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1003		/* Clear our flags if they were not set by the guest */
1004		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1005			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1006		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1007			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1008	}
1009}
1010
1011static void grow_ple_window(struct kvm_vcpu *vcpu)
1012{
1013	struct vcpu_svm *svm = to_svm(vcpu);
1014	struct vmcb_control_area *control = &svm->vmcb->control;
1015	int old = control->pause_filter_count;
1016
1017	if (kvm_pause_in_guest(vcpu->kvm))
1018		return;
1019
1020	control->pause_filter_count = __grow_ple_window(old,
1021							pause_filter_count,
1022							pause_filter_count_grow,
1023							pause_filter_count_max);
1024
1025	if (control->pause_filter_count != old) {
1026		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1027		trace_kvm_ple_window_update(vcpu->vcpu_id,
1028					    control->pause_filter_count, old);
1029	}
1030}
1031
1032static void shrink_ple_window(struct kvm_vcpu *vcpu)
1033{
1034	struct vcpu_svm *svm = to_svm(vcpu);
1035	struct vmcb_control_area *control = &svm->vmcb->control;
1036	int old = control->pause_filter_count;
1037
1038	if (kvm_pause_in_guest(vcpu->kvm))
1039		return;
1040
1041	control->pause_filter_count =
1042				__shrink_ple_window(old,
1043						    pause_filter_count,
1044						    pause_filter_count_shrink,
1045						    pause_filter_count);
1046	if (control->pause_filter_count != old) {
1047		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1048		trace_kvm_ple_window_update(vcpu->vcpu_id,
1049					    control->pause_filter_count, old);
1050	}
1051}
1052
1053static void svm_hardware_unsetup(void)
1054{
1055	int cpu;
1056
1057	sev_hardware_unsetup();
1058
1059	for_each_possible_cpu(cpu)
1060		svm_cpu_uninit(cpu);
1061
1062	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1063	get_order(IOPM_SIZE));
1064	iopm_base = 0;
1065}
1066
1067static void init_seg(struct vmcb_seg *seg)
1068{
1069	seg->selector = 0;
1070	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1071		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1072	seg->limit = 0xffff;
1073	seg->base = 0;
1074}
1075
1076static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1077{
1078	seg->selector = 0;
1079	seg->attrib = SVM_SELECTOR_P_MASK | type;
1080	seg->limit = 0xffff;
1081	seg->base = 0;
1082}
1083
1084static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1085{
1086	struct vcpu_svm *svm = to_svm(vcpu);
1087
1088	return svm->nested.ctl.tsc_offset;
1089}
1090
1091static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1092{
1093	struct vcpu_svm *svm = to_svm(vcpu);
1094
1095	return svm->tsc_ratio_msr;
1096}
1097
1098static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1099{
1100	struct vcpu_svm *svm = to_svm(vcpu);
1101
1102	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1103	svm->vmcb->control.tsc_offset = offset;
1104	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1105}
1106
1107static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1108{
1109	__svm_write_tsc_multiplier(multiplier);
1110}
1111
1112
1113/* Evaluate instruction intercepts that depend on guest CPUID features. */
1114static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1115					      struct vcpu_svm *svm)
1116{
1117	/*
1118	 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1119	 * roots, or if INVPCID is disabled in the guest to inject #UD.
1120	 */
1121	if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1122		if (!npt_enabled ||
1123		    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1124			svm_set_intercept(svm, INTERCEPT_INVPCID);
1125		else
1126			svm_clr_intercept(svm, INTERCEPT_INVPCID);
1127	}
1128
1129	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1130		if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1131			svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1132		else
1133			svm_set_intercept(svm, INTERCEPT_RDTSCP);
1134	}
1135}
1136
1137static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1138{
1139	struct vcpu_svm *svm = to_svm(vcpu);
1140
1141	if (guest_cpuid_is_intel(vcpu)) {
1142		/*
1143		 * We must intercept SYSENTER_EIP and SYSENTER_ESP
1144		 * accesses because the processor only stores 32 bits.
1145		 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1146		 */
1147		svm_set_intercept(svm, INTERCEPT_VMLOAD);
1148		svm_set_intercept(svm, INTERCEPT_VMSAVE);
1149		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1150
1151		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1152		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1153
1154		svm->v_vmload_vmsave_enabled = false;
1155	} else {
1156		/*
1157		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1158		 * in VMCB and clear intercepts to avoid #VMEXIT.
1159		 */
1160		if (vls) {
1161			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1162			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1163			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1164		}
1165		/* No need to intercept these MSRs */
1166		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1167		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1168	}
1169}
1170
1171static void init_vmcb(struct kvm_vcpu *vcpu)
1172{
1173	struct vcpu_svm *svm = to_svm(vcpu);
1174	struct vmcb *vmcb = svm->vmcb01.ptr;
1175	struct vmcb_control_area *control = &vmcb->control;
1176	struct vmcb_save_area *save = &vmcb->save;
1177
1178	svm_set_intercept(svm, INTERCEPT_CR0_READ);
1179	svm_set_intercept(svm, INTERCEPT_CR3_READ);
1180	svm_set_intercept(svm, INTERCEPT_CR4_READ);
1181	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1182	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1183	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1184	if (!kvm_vcpu_apicv_active(vcpu))
1185		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1186
1187	set_dr_intercepts(svm);
1188
1189	set_exception_intercept(svm, PF_VECTOR);
1190	set_exception_intercept(svm, UD_VECTOR);
1191	set_exception_intercept(svm, MC_VECTOR);
1192	set_exception_intercept(svm, AC_VECTOR);
1193	set_exception_intercept(svm, DB_VECTOR);
1194	/*
1195	 * Guest access to VMware backdoor ports could legitimately
1196	 * trigger #GP because of TSS I/O permission bitmap.
1197	 * We intercept those #GP and allow access to them anyway
1198	 * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
1199	 * decrypt guest memory to decode the faulting instruction.
1200	 */
1201	if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
1202		set_exception_intercept(svm, GP_VECTOR);
1203
1204	svm_set_intercept(svm, INTERCEPT_INTR);
1205	svm_set_intercept(svm, INTERCEPT_NMI);
1206
1207	if (intercept_smi)
1208		svm_set_intercept(svm, INTERCEPT_SMI);
1209
1210	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1211	svm_set_intercept(svm, INTERCEPT_RDPMC);
1212	svm_set_intercept(svm, INTERCEPT_CPUID);
1213	svm_set_intercept(svm, INTERCEPT_INVD);
1214	svm_set_intercept(svm, INTERCEPT_INVLPG);
1215	svm_set_intercept(svm, INTERCEPT_INVLPGA);
1216	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1217	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1218	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1219	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1220	svm_set_intercept(svm, INTERCEPT_VMRUN);
1221	svm_set_intercept(svm, INTERCEPT_VMMCALL);
1222	svm_set_intercept(svm, INTERCEPT_VMLOAD);
1223	svm_set_intercept(svm, INTERCEPT_VMSAVE);
1224	svm_set_intercept(svm, INTERCEPT_STGI);
1225	svm_set_intercept(svm, INTERCEPT_CLGI);
1226	svm_set_intercept(svm, INTERCEPT_SKINIT);
1227	svm_set_intercept(svm, INTERCEPT_WBINVD);
1228	svm_set_intercept(svm, INTERCEPT_XSETBV);
1229	svm_set_intercept(svm, INTERCEPT_RDPRU);
1230	svm_set_intercept(svm, INTERCEPT_RSM);
1231
1232	if (!kvm_mwait_in_guest(vcpu->kvm)) {
1233		svm_set_intercept(svm, INTERCEPT_MONITOR);
1234		svm_set_intercept(svm, INTERCEPT_MWAIT);
1235	}
1236
1237	if (!kvm_hlt_in_guest(vcpu->kvm))
1238		svm_set_intercept(svm, INTERCEPT_HLT);
1239
1240	control->iopm_base_pa = __sme_set(iopm_base);
1241	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1242	control->int_ctl = V_INTR_MASKING_MASK;
1243
1244	init_seg(&save->es);
1245	init_seg(&save->ss);
1246	init_seg(&save->ds);
1247	init_seg(&save->fs);
1248	init_seg(&save->gs);
1249
1250	save->cs.selector = 0xf000;
1251	save->cs.base = 0xffff0000;
1252	/* Executable/Readable Code Segment */
1253	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1254		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1255	save->cs.limit = 0xffff;
1256
1257	save->gdtr.base = 0;
1258	save->gdtr.limit = 0xffff;
1259	save->idtr.base = 0;
1260	save->idtr.limit = 0xffff;
1261
1262	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1263	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1264
1265	if (npt_enabled) {
1266		/* Setup VMCB for Nested Paging */
1267		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1268		svm_clr_intercept(svm, INTERCEPT_INVLPG);
1269		clr_exception_intercept(svm, PF_VECTOR);
1270		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1271		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1272		save->g_pat = vcpu->arch.pat;
1273		save->cr3 = 0;
1274	}
1275	svm->current_vmcb->asid_generation = 0;
1276	svm->asid = 0;
1277
1278	svm->nested.vmcb12_gpa = INVALID_GPA;
1279	svm->nested.last_vmcb12_gpa = INVALID_GPA;
1280
1281	if (!kvm_pause_in_guest(vcpu->kvm)) {
1282		control->pause_filter_count = pause_filter_count;
1283		if (pause_filter_thresh)
1284			control->pause_filter_thresh = pause_filter_thresh;
1285		svm_set_intercept(svm, INTERCEPT_PAUSE);
1286	} else {
1287		svm_clr_intercept(svm, INTERCEPT_PAUSE);
1288	}
1289
1290	svm_recalc_instruction_intercepts(vcpu, svm);
1291
1292	/*
1293	 * If the host supports V_SPEC_CTRL then disable the interception
1294	 * of MSR_IA32_SPEC_CTRL.
1295	 */
1296	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1297		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1298
1299	if (kvm_vcpu_apicv_active(vcpu))
1300		avic_init_vmcb(svm, vmcb);
1301
1302	if (vgif) {
1303		svm_clr_intercept(svm, INTERCEPT_STGI);
1304		svm_clr_intercept(svm, INTERCEPT_CLGI);
1305		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1306	}
1307
1308	if (sev_guest(vcpu->kvm))
1309		sev_init_vmcb(svm);
1310
1311	svm_hv_init_vmcb(vmcb);
1312	init_vmcb_after_set_cpuid(vcpu);
1313
1314	vmcb_mark_all_dirty(vmcb);
1315
1316	enable_gif(svm);
1317}
1318
1319static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1320{
1321	struct vcpu_svm *svm = to_svm(vcpu);
1322
1323	svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1324
1325	svm_init_osvw(vcpu);
1326	vcpu->arch.microcode_version = 0x01000065;
1327	svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1328
1329	if (sev_es_guest(vcpu->kvm))
1330		sev_es_vcpu_reset(svm);
1331}
1332
1333static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1334{
1335	struct vcpu_svm *svm = to_svm(vcpu);
1336
1337	svm->spec_ctrl = 0;
1338	svm->virt_spec_ctrl = 0;
1339
1340	init_vmcb(vcpu);
1341
1342	if (!init_event)
1343		__svm_vcpu_reset(vcpu);
1344}
1345
1346void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1347{
1348	svm->current_vmcb = target_vmcb;
1349	svm->vmcb = target_vmcb->ptr;
1350}
1351
1352static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1353{
1354	struct vcpu_svm *svm;
1355	struct page *vmcb01_page;
1356	struct page *vmsa_page = NULL;
1357	int err;
1358
1359	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1360	svm = to_svm(vcpu);
1361
1362	err = -ENOMEM;
1363	vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1364	if (!vmcb01_page)
1365		goto out;
1366
1367	if (sev_es_guest(vcpu->kvm)) {
1368		/*
1369		 * SEV-ES guests require a separate VMSA page used to contain
1370		 * the encrypted register state of the guest.
1371		 */
1372		vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1373		if (!vmsa_page)
1374			goto error_free_vmcb_page;
1375
1376		/*
1377		 * SEV-ES guests maintain an encrypted version of their FPU
1378		 * state which is restored and saved on VMRUN and VMEXIT.
1379		 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1380		 * do xsave/xrstor on it.
1381		 */
1382		fpstate_set_confidential(&vcpu->arch.guest_fpu);
1383	}
1384
1385	err = avic_init_vcpu(svm);
1386	if (err)
1387		goto error_free_vmsa_page;
1388
1389	svm->msrpm = svm_vcpu_alloc_msrpm();
1390	if (!svm->msrpm) {
1391		err = -ENOMEM;
1392		goto error_free_vmsa_page;
1393	}
1394
1395	svm->x2avic_msrs_intercepted = true;
1396
1397	svm->vmcb01.ptr = page_address(vmcb01_page);
1398	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1399	svm_switch_vmcb(svm, &svm->vmcb01);
1400
1401	if (vmsa_page)
1402		svm->sev_es.vmsa = page_address(vmsa_page);
1403
1404	svm->guest_state_loaded = false;
1405
1406	return 0;
1407
1408error_free_vmsa_page:
1409	if (vmsa_page)
1410		__free_page(vmsa_page);
1411error_free_vmcb_page:
1412	__free_page(vmcb01_page);
1413out:
1414	return err;
1415}
1416
1417static void svm_clear_current_vmcb(struct vmcb *vmcb)
1418{
1419	int i;
1420
1421	for_each_online_cpu(i)
1422		cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
1423}
1424
1425static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1426{
1427	struct vcpu_svm *svm = to_svm(vcpu);
1428
1429	/*
1430	 * The vmcb page can be recycled, causing a false negative in
1431	 * svm_vcpu_load(). So, ensure that no logical CPU has this
1432	 * vmcb page recorded as its current vmcb.
1433	 */
1434	svm_clear_current_vmcb(svm->vmcb);
1435
1436	svm_leave_nested(vcpu);
1437	svm_free_nested(svm);
1438
1439	sev_free_vcpu(vcpu);
1440
1441	__free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1442	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1443}
1444
1445static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1446{
1447	struct vcpu_svm *svm = to_svm(vcpu);
1448	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
1449
1450	if (sev_es_guest(vcpu->kvm))
1451		sev_es_unmap_ghcb(svm);
1452
1453	if (svm->guest_state_loaded)
1454		return;
1455
1456	/*
1457	 * Save additional host state that will be restored on VMEXIT (sev-es)
1458	 * or subsequent vmload of host save area.
1459	 */
1460	vmsave(sd->save_area_pa);
1461	if (sev_es_guest(vcpu->kvm)) {
1462		struct sev_es_save_area *hostsa;
1463		hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1464
1465		sev_es_prepare_switch_to_guest(hostsa);
1466	}
1467
1468	if (tsc_scaling)
1469		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1470
1471	if (likely(tsc_aux_uret_slot >= 0))
1472		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1473
1474	svm->guest_state_loaded = true;
1475}
1476
1477static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1478{
1479	to_svm(vcpu)->guest_state_loaded = false;
1480}
1481
1482static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1483{
1484	struct vcpu_svm *svm = to_svm(vcpu);
1485	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
1486
1487	if (sd->current_vmcb != svm->vmcb) {
1488		sd->current_vmcb = svm->vmcb;
1489		indirect_branch_prediction_barrier();
1490	}
1491	if (kvm_vcpu_apicv_active(vcpu))
1492		avic_vcpu_load(vcpu, cpu);
1493}
1494
1495static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1496{
1497	if (kvm_vcpu_apicv_active(vcpu))
1498		avic_vcpu_put(vcpu);
1499
1500	svm_prepare_host_switch(vcpu);
1501
1502	++vcpu->stat.host_state_reload;
1503}
1504
1505static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1506{
1507	struct vcpu_svm *svm = to_svm(vcpu);
1508	unsigned long rflags = svm->vmcb->save.rflags;
1509
1510	if (svm->nmi_singlestep) {
1511		/* Hide our flags if they were not set by the guest */
1512		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1513			rflags &= ~X86_EFLAGS_TF;
1514		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1515			rflags &= ~X86_EFLAGS_RF;
1516	}
1517	return rflags;
1518}
1519
1520static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1521{
1522	if (to_svm(vcpu)->nmi_singlestep)
1523		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1524
1525	/*
1526	 * Any change of EFLAGS.VM is accompanied by a reload of SS
1527	 * (caused by either a task switch or an inter-privilege IRET),
1528	 * so we do not need to update the CPL here.
1529	 */
1530	to_svm(vcpu)->vmcb->save.rflags = rflags;
1531}
1532
1533static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1534{
1535	struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1536
1537	return sev_es_guest(vcpu->kvm)
1538		? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1539		: kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1540}
1541
1542static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1543{
1544	kvm_register_mark_available(vcpu, reg);
1545
1546	switch (reg) {
1547	case VCPU_EXREG_PDPTR:
1548		/*
1549		 * When !npt_enabled, mmu->pdptrs[] is already available since
1550		 * it is always updated per SDM when moving to CRs.
1551		 */
1552		if (npt_enabled)
1553			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1554		break;
1555	default:
1556		KVM_BUG_ON(1, vcpu->kvm);
1557	}
1558}
1559
1560static void svm_set_vintr(struct vcpu_svm *svm)
1561{
1562	struct vmcb_control_area *control;
1563
1564	/*
1565	 * The following fields are ignored when AVIC is enabled
1566	 */
1567	WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1568
1569	svm_set_intercept(svm, INTERCEPT_VINTR);
1570
1571	/*
1572	 * This is just a dummy VINTR to actually cause a vmexit to happen.
1573	 * Actual injection of virtual interrupts happens through EVENTINJ.
1574	 */
1575	control = &svm->vmcb->control;
1576	control->int_vector = 0x0;
1577	control->int_ctl &= ~V_INTR_PRIO_MASK;
1578	control->int_ctl |= V_IRQ_MASK |
1579		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1580	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1581}
1582
1583static void svm_clear_vintr(struct vcpu_svm *svm)
1584{
1585	svm_clr_intercept(svm, INTERCEPT_VINTR);
1586
1587	/* Drop int_ctl fields related to VINTR injection.  */
1588	svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1589	if (is_guest_mode(&svm->vcpu)) {
1590		svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1591
1592		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1593			(svm->nested.ctl.int_ctl & V_TPR_MASK));
1594
1595		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1596			V_IRQ_INJECTION_BITS_MASK;
1597
1598		svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1599	}
1600
1601	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1602}
1603
1604static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1605{
1606	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1607	struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1608
1609	switch (seg) {
1610	case VCPU_SREG_CS: return &save->cs;
1611	case VCPU_SREG_DS: return &save->ds;
1612	case VCPU_SREG_ES: return &save->es;
1613	case VCPU_SREG_FS: return &save01->fs;
1614	case VCPU_SREG_GS: return &save01->gs;
1615	case VCPU_SREG_SS: return &save->ss;
1616	case VCPU_SREG_TR: return &save01->tr;
1617	case VCPU_SREG_LDTR: return &save01->ldtr;
1618	}
1619	BUG();
1620	return NULL;
1621}
1622
1623static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1624{
1625	struct vmcb_seg *s = svm_seg(vcpu, seg);
1626
1627	return s->base;
1628}
1629
1630static void svm_get_segment(struct kvm_vcpu *vcpu,
1631			    struct kvm_segment *var, int seg)
1632{
1633	struct vmcb_seg *s = svm_seg(vcpu, seg);
1634
1635	var->base = s->base;
1636	var->limit = s->limit;
1637	var->selector = s->selector;
1638	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1639	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1640	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1641	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1642	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1643	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1644	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1645
1646	/*
1647	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
1648	 * However, the SVM spec states that the G bit is not observed by the
1649	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1650	 * So let's synthesize a legal G bit for all segments, this helps
1651	 * running KVM nested. It also helps cross-vendor migration, because
1652	 * Intel's vmentry has a check on the 'G' bit.
1653	 */
1654	var->g = s->limit > 0xfffff;
1655
1656	/*
1657	 * AMD's VMCB does not have an explicit unusable field, so emulate it
1658	 * for cross vendor migration purposes by "not present"
1659	 */
1660	var->unusable = !var->present;
1661
1662	switch (seg) {
1663	case VCPU_SREG_TR:
1664		/*
1665		 * Work around a bug where the busy flag in the tr selector
1666		 * isn't exposed
1667		 */
1668		var->type |= 0x2;
1669		break;
1670	case VCPU_SREG_DS:
1671	case VCPU_SREG_ES:
1672	case VCPU_SREG_FS:
1673	case VCPU_SREG_GS:
1674		/*
1675		 * The accessed bit must always be set in the segment
1676		 * descriptor cache, although it can be cleared in the
1677		 * descriptor, the cached bit always remains at 1. Since
1678		 * Intel has a check on this, set it here to support
1679		 * cross-vendor migration.
1680		 */
1681		if (!var->unusable)
1682			var->type |= 0x1;
1683		break;
1684	case VCPU_SREG_SS:
1685		/*
1686		 * On AMD CPUs sometimes the DB bit in the segment
1687		 * descriptor is left as 1, although the whole segment has
1688		 * been made unusable. Clear it here to pass an Intel VMX
1689		 * entry check when cross vendor migrating.
1690		 */
1691		if (var->unusable)
1692			var->db = 0;
1693		/* This is symmetric with svm_set_segment() */
1694		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1695		break;
1696	}
1697}
1698
1699static int svm_get_cpl(struct kvm_vcpu *vcpu)
1700{
1701	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1702
1703	return save->cpl;
1704}
1705
1706static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1707{
1708	struct kvm_segment cs;
1709
1710	svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1711	*db = cs.db;
1712	*l = cs.l;
1713}
1714
1715static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1716{
1717	struct vcpu_svm *svm = to_svm(vcpu);
1718
1719	dt->size = svm->vmcb->save.idtr.limit;
1720	dt->address = svm->vmcb->save.idtr.base;
1721}
1722
1723static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1724{
1725	struct vcpu_svm *svm = to_svm(vcpu);
1726
1727	svm->vmcb->save.idtr.limit = dt->size;
1728	svm->vmcb->save.idtr.base = dt->address;
1729	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1730}
1731
1732static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1733{
1734	struct vcpu_svm *svm = to_svm(vcpu);
1735
1736	dt->size = svm->vmcb->save.gdtr.limit;
1737	dt->address = svm->vmcb->save.gdtr.base;
1738}
1739
1740static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1741{
1742	struct vcpu_svm *svm = to_svm(vcpu);
1743
1744	svm->vmcb->save.gdtr.limit = dt->size;
1745	svm->vmcb->save.gdtr.base = dt->address;
1746	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1747}
1748
1749static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1750{
1751	struct vcpu_svm *svm = to_svm(vcpu);
1752
1753	/*
1754	 * For guests that don't set guest_state_protected, the cr3 update is
1755	 * handled via kvm_mmu_load() while entering the guest. For guests
1756	 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1757	 * VMCB save area now, since the save area will become the initial
1758	 * contents of the VMSA, and future VMCB save area updates won't be
1759	 * seen.
1760	 */
1761	if (sev_es_guest(vcpu->kvm)) {
1762		svm->vmcb->save.cr3 = cr3;
1763		vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1764	}
1765}
1766
1767void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1768{
1769	struct vcpu_svm *svm = to_svm(vcpu);
1770	u64 hcr0 = cr0;
1771	bool old_paging = is_paging(vcpu);
1772
1773#ifdef CONFIG_X86_64
1774	if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1775		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1776			vcpu->arch.efer |= EFER_LMA;
1777			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1778		}
1779
1780		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1781			vcpu->arch.efer &= ~EFER_LMA;
1782			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1783		}
1784	}
1785#endif
1786	vcpu->arch.cr0 = cr0;
1787
1788	if (!npt_enabled) {
1789		hcr0 |= X86_CR0_PG | X86_CR0_WP;
1790		if (old_paging != is_paging(vcpu))
1791			svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1792	}
1793
1794	/*
1795	 * Re-enable caching here because the QEMU BIOS
1796	 * does not do it - otherwise this results in some
1797	 * delay at reboot.
1798	 */
1799	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1800		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1801
1802	svm->vmcb->save.cr0 = hcr0;
1803	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1804
1805	/*
1806	 * SEV-ES guests must always keep the CR intercepts cleared. CR
1807	 * tracking is done using the CR write traps.
1808	 */
1809	if (sev_es_guest(vcpu->kvm))
1810		return;
1811
1812	if (hcr0 == cr0) {
1813		/* Selective CR0 write remains on.  */
1814		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1815		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1816	} else {
1817		svm_set_intercept(svm, INTERCEPT_CR0_READ);
1818		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1819	}
1820}
1821
1822static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1823{
1824	return true;
1825}
1826
1827void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1828{
1829	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1830	unsigned long old_cr4 = vcpu->arch.cr4;
1831
1832	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1833		svm_flush_tlb_current(vcpu);
1834
1835	vcpu->arch.cr4 = cr4;
1836	if (!npt_enabled) {
1837		cr4 |= X86_CR4_PAE;
1838
1839		if (!is_paging(vcpu))
1840			cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1841	}
1842	cr4 |= host_cr4_mce;
1843	to_svm(vcpu)->vmcb->save.cr4 = cr4;
1844	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1845
1846	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1847		kvm_update_cpuid_runtime(vcpu);
1848}
1849
1850static void svm_set_segment(struct kvm_vcpu *vcpu,
1851			    struct kvm_segment *var, int seg)
1852{
1853	struct vcpu_svm *svm = to_svm(vcpu);
1854	struct vmcb_seg *s = svm_seg(vcpu, seg);
1855
1856	s->base = var->base;
1857	s->limit = var->limit;
1858	s->selector = var->selector;
1859	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1860	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1861	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1862	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1863	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1864	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1865	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1866	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1867
1868	/*
1869	 * This is always accurate, except if SYSRET returned to a segment
1870	 * with SS.DPL != 3.  Intel does not have this quirk, and always
1871	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1872	 * would entail passing the CPL to userspace and back.
1873	 */
1874	if (seg == VCPU_SREG_SS)
1875		/* This is symmetric with svm_get_segment() */
1876		svm->vmcb->save.cpl = (var->dpl & 3);
1877
1878	vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1879}
1880
1881static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1882{
1883	struct vcpu_svm *svm = to_svm(vcpu);
1884
1885	clr_exception_intercept(svm, BP_VECTOR);
1886
1887	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1888		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1889			set_exception_intercept(svm, BP_VECTOR);
1890	}
1891}
1892
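/*
 * Allocate a fresh ASID from this CPU's pool.  When the pool is exhausted,
 * bump the generation, restart at min_asid and request a full TLB flush on
 * the next VMRUN so translations from the previous generation cannot leak.
 */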
1893static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1894{
1895	if (sd->next_asid > sd->max_asid) {
1896		++sd->asid_generation;
1897		sd->next_asid = sd->min_asid;
1898		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1899		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1900	}
1901
1902	svm->current_vmcb->asid_generation = sd->asid_generation;
1903	svm->asid = sd->next_asid++;
1904}
1905
1906static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1907{
1908	struct vmcb *vmcb = svm->vmcb;
1909
1910	if (svm->vcpu.arch.guest_state_protected)
1911		return;
1912
1913	if (unlikely(value != vmcb->save.dr6)) {
1914		vmcb->save.dr6 = value;
1915		vmcb_mark_dirty(vmcb, VMCB_DR);
1916	}
1917}
1918
1919static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1920{
1921	struct vcpu_svm *svm = to_svm(vcpu);
1922
1923	if (vcpu->arch.guest_state_protected)
1924		return;
1925
1926	get_debugreg(vcpu->arch.db[0], 0);
1927	get_debugreg(vcpu->arch.db[1], 1);
1928	get_debugreg(vcpu->arch.db[2], 2);
1929	get_debugreg(vcpu->arch.db[3], 3);
1930	/*
1931	 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
1932	 * because db_interception might need it.  We can do it before vmentry.
1933	 */
1934	vcpu->arch.dr6 = svm->vmcb->save.dr6;
1935	vcpu->arch.dr7 = svm->vmcb->save.dr7;
1936	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1937	set_dr_intercepts(svm);
1938}
1939
1940static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1941{
1942	struct vcpu_svm *svm = to_svm(vcpu);
1943
1944	if (vcpu->arch.guest_state_protected)
1945		return;
1946
1947	svm->vmcb->save.dr7 = value;
1948	vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1949}
1950
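/*
 * For both #PF and nested page fault intercepts, exit_info_2 holds the
 * faulting address and exit_info_1 the page fault error code.  The
 * instruction bytes are only available if the CPU supports decode assists.
 */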
1951static int pf_interception(struct kvm_vcpu *vcpu)
1952{
1953	struct vcpu_svm *svm = to_svm(vcpu);
1954
1955	u64 fault_address = svm->vmcb->control.exit_info_2;
1956	u64 error_code = svm->vmcb->control.exit_info_1;
1957
1958	return kvm_handle_page_fault(vcpu, error_code, fault_address,
1959			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1960			svm->vmcb->control.insn_bytes : NULL,
1961			svm->vmcb->control.insn_len);
1962}
1963
1964static int npf_interception(struct kvm_vcpu *vcpu)
1965{
1966	struct vcpu_svm *svm = to_svm(vcpu);
1967
1968	u64 fault_address = svm->vmcb->control.exit_info_2;
1969	u64 error_code = svm->vmcb->control.exit_info_1;
1970
1971	trace_kvm_page_fault(vcpu, fault_address, error_code);
1972	return kvm_mmu_page_fault(vcpu, fault_address, error_code,
1973			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1974			svm->vmcb->control.insn_bytes : NULL,
1975			svm->vmcb->control.insn_len);
1976}
1977
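/*
 * #DB intercepts serve two masters: the guest's own debug exceptions, which
 * are queued back with the DR6 payload, and host-side debugging (single-step
 * or hardware breakpoints), which exits to userspace.  The NMI single-stepping
 * set up by svm_enable_nmi_window() is also wound down here.
 */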
1978static int db_interception(struct kvm_vcpu *vcpu)
1979{
1980	struct kvm_run *kvm_run = vcpu->run;
1981	struct vcpu_svm *svm = to_svm(vcpu);
1982
1983	if (!(vcpu->guest_debug &
1984	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1985		!svm->nmi_singlestep) {
1986		u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
1987		kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
1988		return 1;
1989	}
1990
1991	if (svm->nmi_singlestep) {
1992		disable_nmi_singlestep(svm);
1993		/* Make sure we check for pending NMIs upon entry */
1994		kvm_make_request(KVM_REQ_EVENT, vcpu);
1995	}
1996
1997	if (vcpu->guest_debug &
1998	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1999		kvm_run->exit_reason = KVM_EXIT_DEBUG;
2000		kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2001		kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
2002		kvm_run->debug.arch.pc =
2003			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2004		kvm_run->debug.arch.exception = DB_VECTOR;
2005		return 0;
2006	}
2007
2008	return 1;
2009}
2010
2011static int bp_interception(struct kvm_vcpu *vcpu)
2012{
2013	struct vcpu_svm *svm = to_svm(vcpu);
2014	struct kvm_run *kvm_run = vcpu->run;
2015
2016	kvm_run->exit_reason = KVM_EXIT_DEBUG;
2017	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2018	kvm_run->debug.arch.exception = BP_VECTOR;
2019	return 0;
2020}
2021
2022static int ud_interception(struct kvm_vcpu *vcpu)
2023{
2024	return handle_ud(vcpu);
2025}
2026
2027static int ac_interception(struct kvm_vcpu *vcpu)
2028{
2029	kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2030	return 1;
2031}
2032
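/*
 * Check whether an intercepted #MC matches the known signature of AMD
 * erratum 383.  If it does, clear the machine check status registers and
 * flush the TLB to evict the offending multi-match entries; svm_handle_mce()
 * then kills the guest instead of forwarding the #MC to the host handler.
 */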
2033static bool is_erratum_383(void)
2034{
2035	int err, i;
2036	u64 value;
2037
2038	if (!erratum_383_found)
2039		return false;
2040
2041	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2042	if (err)
2043		return false;
2044
2045	/* Bit 62 may or may not be set for this mce */
2046	value &= ~(1ULL << 62);
2047
2048	if (value != 0xb600000000010015ULL)
2049		return false;
2050
2051	/* Clear MCi_STATUS registers */
2052	for (i = 0; i < 6; ++i)
2053		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2054
2055	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2056	if (!err) {
2057		u32 low, high;
2058
2059		value &= ~(1ULL << 2);
2060		low    = lower_32_bits(value);
2061		high   = upper_32_bits(value);
2062
2063		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2064	}
2065
2066	/* Flush tlb to evict multi-match entries */
2067	__flush_tlb_all();
2068
2069	return true;
2070}
2071
2072static void svm_handle_mce(struct kvm_vcpu *vcpu)
2073{
2074	if (is_erratum_383()) {
2075		/*
2076		 * Erratum 383 triggered. Guest state is corrupt so kill the
2077		 * guest.
2078		 */
2079		pr_err("KVM: Guest triggered AMD Erratum 383\n");
2080
2081		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2082
2083		return;
2084	}
2085
2086	/*
2087	 * On an #MC intercept the MCE handler is not called automatically in
2088	 * the host. So do it by hand here.
2089	 */
2090	kvm_machine_check();
2091}
2092
2093static int mc_interception(struct kvm_vcpu *vcpu)
2094{
2095	return 1;
2096}
2097
2098static int shutdown_interception(struct kvm_vcpu *vcpu)
2099{
2100	struct kvm_run *kvm_run = vcpu->run;
2101	struct vcpu_svm *svm = to_svm(vcpu);
2102
2103	/*
2104	 * The VM save area has already been encrypted so it
2105	 * cannot be reinitialized - just terminate.
2106	 */
2107	if (sev_es_guest(vcpu->kvm))
2108		return -EINVAL;
2109
2110	/*
2111	 * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2112	 * the VMCB in a known good state.  Unfortunately, KVM doesn't have
2113	 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2114	 * userspace.  At a platform view, INIT is acceptable behavior as
2115	 * there exist bare metal platforms that automatically INIT the CPU
2116	 * in response to shutdown.
2117	 */
2118	clear_page(svm->vmcb);
2119	kvm_vcpu_reset(vcpu, true);
2120
2121	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2122	return 0;
2123}
2124
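/*
 * exit_info_1 describes the intercepted I/O instruction: port, operand size,
 * direction, and whether it is a string operation.  String I/O requires full
 * emulation (or the SEV-ES GHCB path), while simple IN/OUT uses the fast PIO
 * path with next_rip taken from exit_info_2.
 */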
2125static int io_interception(struct kvm_vcpu *vcpu)
2126{
2127	struct vcpu_svm *svm = to_svm(vcpu);
2128	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2129	int size, in, string;
2130	unsigned port;
2131
2132	++vcpu->stat.io_exits;
2133	string = (io_info & SVM_IOIO_STR_MASK) != 0;
2134	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2135	port = io_info >> 16;
2136	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2137
2138	if (string) {
2139		if (sev_es_guest(vcpu->kvm))
2140			return sev_es_string_io(svm, size, port, in);
2141		else
2142			return kvm_emulate_instruction(vcpu, 0);
2143	}
2144
2145	svm->next_rip = svm->vmcb->control.exit_info_2;
2146
2147	return kvm_fast_pio(vcpu, size, port, in);
2148}
2149
2150static int nmi_interception(struct kvm_vcpu *vcpu)
2151{
2152	return 1;
2153}
2154
2155static int smi_interception(struct kvm_vcpu *vcpu)
2156{
2157	return 1;
2158}
2159
2160static int intr_interception(struct kvm_vcpu *vcpu)
2161{
2162	++vcpu->stat.irq_exits;
2163	return 1;
2164}
2165
2166static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2167{
2168	struct vcpu_svm *svm = to_svm(vcpu);
2169	struct vmcb *vmcb12;
2170	struct kvm_host_map map;
2171	int ret;
2172
2173	if (nested_svm_check_permissions(vcpu))
2174		return 1;
2175
2176	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2177	if (ret) {
2178		if (ret == -EINVAL)
2179			kvm_inject_gp(vcpu, 0);
2180		return 1;
2181	}
2182
2183	vmcb12 = map.hva;
2184
2185	ret = kvm_skip_emulated_instruction(vcpu);
2186
2187	if (vmload) {
2188		svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2189		svm->sysenter_eip_hi = 0;
2190		svm->sysenter_esp_hi = 0;
2191	} else {
2192		svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2193	}
2194
2195	kvm_vcpu_unmap(vcpu, &map, true);
2196
2197	return ret;
2198}
2199
2200static int vmload_interception(struct kvm_vcpu *vcpu)
2201{
2202	return vmload_vmsave_interception(vcpu, true);
2203}
2204
2205static int vmsave_interception(struct kvm_vcpu *vcpu)
2206{
2207	return vmload_vmsave_interception(vcpu, false);
2208}
2209
2210static int vmrun_interception(struct kvm_vcpu *vcpu)
2211{
2212	if (nested_svm_check_permissions(vcpu))
2213		return 1;
2214
2215	return nested_svm_vmrun(vcpu);
2216}
2217
2218enum {
2219	NONE_SVM_INSTR,
2220	SVM_INSTR_VMRUN,
2221	SVM_INSTR_VMLOAD,
2222	SVM_INSTR_VMSAVE,
2223};
2224
2225/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2226static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2227{
2228	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2229
2230	if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2231		return NONE_SVM_INSTR;
2232
2233	switch (ctxt->modrm) {
2234	case 0xd8: /* VMRUN */
2235		return SVM_INSTR_VMRUN;
2236	case 0xda: /* VMLOAD */
2237		return SVM_INSTR_VMLOAD;
2238	case 0xdb: /* VMSAVE */
2239		return SVM_INSTR_VMSAVE;
2240	default:
2241		break;
2242	}
2243
2244	return NONE_SVM_INSTR;
2245}
2246
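/*
 * Emulate VMRUN/VMLOAD/VMSAVE on behalf of the guest after a spurious #GP
 * (see gp_interception() below).  If the instruction was executed by L2,
 * synthesize the corresponding nested vmexit into L1 instead.
 */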
2247static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2248{
2249	const int guest_mode_exit_codes[] = {
2250		[SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2251		[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2252		[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2253	};
2254	int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2255		[SVM_INSTR_VMRUN] = vmrun_interception,
2256		[SVM_INSTR_VMLOAD] = vmload_interception,
2257		[SVM_INSTR_VMSAVE] = vmsave_interception,
2258	};
2259	struct vcpu_svm *svm = to_svm(vcpu);
2260	int ret;
2261
2262	if (is_guest_mode(vcpu)) {
2263		/* Returns '1' or -errno on failure, '0' on success. */
2264		ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2265		if (ret)
2266			return ret;
2267		return 1;
2268	}
2269	return svm_instr_handlers[opcode](vcpu);
2270}
2271
2272/*
2273 * #GP handling code. Note that #GP can be triggered under the following two
2274 * cases:
2275 *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2276 *      some AMD CPUs when the EAX of these instructions points into a
2277 *      reserved memory region (e.g. SMM memory on the host).
2278 *   2) VMware backdoor
2279 */
2280static int gp_interception(struct kvm_vcpu *vcpu)
2281{
2282	struct vcpu_svm *svm = to_svm(vcpu);
2283	u32 error_code = svm->vmcb->control.exit_info_1;
2284	int opcode;
2285
2286	/* Both #GP cases have zero error_code */
2287	if (error_code)
2288		goto reinject;
2289
2290	/* Decode the instruction for usage later */
2291	if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2292		goto reinject;
2293
2294	opcode = svm_instr_opcode(vcpu);
2295
2296	if (opcode == NONE_SVM_INSTR) {
2297		if (!enable_vmware_backdoor)
2298			goto reinject;
2299
2300		/*
2301		 * VMware backdoor emulation on #GP interception only handles
2302		 * IN{S}, OUT{S}, and RDPMC.
2303		 */
2304		if (!is_guest_mode(vcpu))
2305			return kvm_emulate_instruction(vcpu,
2306				EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2307	} else {
2308		/* All SVM instructions expect page aligned RAX */
2309		if (svm->vmcb->save.rax & ~PAGE_MASK)
2310			goto reinject;
2311
2312		return emulate_svm_instr(vcpu, opcode);
2313	}
2314
2315reinject:
2316	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2317	return 1;
2318}
2319
2320void svm_set_gif(struct vcpu_svm *svm, bool value)
2321{
2322	if (value) {
2323		/*
2324		 * If VGIF is enabled, the STGI intercept is only added to
2325		 * detect the opening of the SMI/NMI window; remove it now.
2326		 * Likewise, clear the VINTR intercept, we will set it
2327		 * again while processing KVM_REQ_EVENT if needed.
2328		 */
2329		if (vgif)
2330			svm_clr_intercept(svm, INTERCEPT_STGI);
2331		if (svm_is_intercept(svm, INTERCEPT_VINTR))
2332			svm_clear_vintr(svm);
2333
2334		enable_gif(svm);
2335		if (svm->vcpu.arch.smi_pending ||
2336		    svm->vcpu.arch.nmi_pending ||
2337		    kvm_cpu_has_injectable_intr(&svm->vcpu) ||
2338		    kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
2339			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2340	} else {
2341		disable_gif(svm);
2342
2343		/*
2344		 * After a CLGI no interrupts should come.  But if vGIF is
2345		 * in use, we still rely on the VINTR intercept (rather than
2346		 * STGI) to detect an open interrupt window.
2347		 */
2348		if (!vgif)
2349			svm_clear_vintr(svm);
2350	}
2351}
2352
2353static int stgi_interception(struct kvm_vcpu *vcpu)
2354{
2355	int ret;
2356
2357	if (nested_svm_check_permissions(vcpu))
2358		return 1;
2359
2360	ret = kvm_skip_emulated_instruction(vcpu);
2361	svm_set_gif(to_svm(vcpu), true);
2362	return ret;
2363}
2364
2365static int clgi_interception(struct kvm_vcpu *vcpu)
2366{
2367	int ret;
2368
2369	if (nested_svm_check_permissions(vcpu))
2370		return 1;
2371
2372	ret = kvm_skip_emulated_instruction(vcpu);
2373	svm_set_gif(to_svm(vcpu), false);
2374	return ret;
2375}
2376
2377static int invlpga_interception(struct kvm_vcpu *vcpu)
2378{
2379	gva_t gva = kvm_rax_read(vcpu);
2380	u32 asid = kvm_rcx_read(vcpu);
2381
2382	/* FIXME: Handle an address size prefix. */
2383	if (!is_long_mode(vcpu))
2384		gva = (u32)gva;
2385
2386	trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2387
2388	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2389	kvm_mmu_invlpg(vcpu, gva);
2390
2391	return kvm_skip_emulated_instruction(vcpu);
2392}
2393
2394static int skinit_interception(struct kvm_vcpu *vcpu)
2395{
2396	trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2397
2398	kvm_queue_exception(vcpu, UD_VECTOR);
2399	return 1;
2400}
2401
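/*
 * Decode the reason for the intercepted task switch (IRET, far jump, gate,
 * or call) plus any pending event information, then hand the switch to the
 * common emulation code in kvm_task_switch().
 */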
2402static int task_switch_interception(struct kvm_vcpu *vcpu)
2403{
2404	struct vcpu_svm *svm = to_svm(vcpu);
2405	u16 tss_selector;
2406	int reason;
2407	int int_type = svm->vmcb->control.exit_int_info &
2408		SVM_EXITINTINFO_TYPE_MASK;
2409	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2410	uint32_t type =
2411		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2412	uint32_t idt_v =
2413		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2414	bool has_error_code = false;
2415	u32 error_code = 0;
2416
2417	tss_selector = (u16)svm->vmcb->control.exit_info_1;
2418
2419	if (svm->vmcb->control.exit_info_2 &
2420	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2421		reason = TASK_SWITCH_IRET;
2422	else if (svm->vmcb->control.exit_info_2 &
2423		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2424		reason = TASK_SWITCH_JMP;
2425	else if (idt_v)
2426		reason = TASK_SWITCH_GATE;
2427	else
2428		reason = TASK_SWITCH_CALL;
2429
2430	if (reason == TASK_SWITCH_GATE) {
2431		switch (type) {
2432		case SVM_EXITINTINFO_TYPE_NMI:
2433			vcpu->arch.nmi_injected = false;
2434			break;
2435		case SVM_EXITINTINFO_TYPE_EXEPT:
2436			if (svm->vmcb->control.exit_info_2 &
2437			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2438				has_error_code = true;
2439				error_code =
2440					(u32)svm->vmcb->control.exit_info_2;
2441			}
2442			kvm_clear_exception_queue(vcpu);
2443			break;
2444		case SVM_EXITINTINFO_TYPE_INTR:
2445		case SVM_EXITINTINFO_TYPE_SOFT:
2446			kvm_clear_interrupt_queue(vcpu);
2447			break;
2448		default:
2449			break;
2450		}
2451	}
2452
2453	if (reason != TASK_SWITCH_GATE ||
2454	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2455	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2456	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2457		if (!svm_skip_emulated_instruction(vcpu))
2458			return 0;
2459	}
2460
2461	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2462		int_vec = -1;
2463
2464	return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2465			       has_error_code, error_code);
2466}
2467
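/*
 * The IRET intercept is only armed while an NMI is being handled; it fires
 * when the guest executes IRET, i.e. (approximately) when the NMI handler
 * returns and the NMI window opens again.
 */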
2468static int iret_interception(struct kvm_vcpu *vcpu)
2469{
2470	struct vcpu_svm *svm = to_svm(vcpu);
2471
2472	++vcpu->stat.nmi_window_exits;
2473	vcpu->arch.hflags |= HF_IRET_MASK;
2474	if (!sev_es_guest(vcpu->kvm)) {
2475		svm_clr_intercept(svm, INTERCEPT_IRET);
2476		svm->nmi_iret_rip = kvm_rip_read(vcpu);
2477	}
2478	kvm_make_request(KVM_REQ_EVENT, vcpu);
2479	return 1;
2480}
2481
2482static int invlpg_interception(struct kvm_vcpu *vcpu)
2483{
2484	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2485		return kvm_emulate_instruction(vcpu, 0);
2486
2487	kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2488	return kvm_skip_emulated_instruction(vcpu);
2489}
2490
2491static int emulate_on_interception(struct kvm_vcpu *vcpu)
2492{
2493	return kvm_emulate_instruction(vcpu, 0);
2494}
2495
2496static int rsm_interception(struct kvm_vcpu *vcpu)
2497{
2498	return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2499}
2500
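/*
 * With the selective CR0 intercept, L1 only wants to see CR0 writes that
 * change bits other than CR0.TS and CR0.MP.  Compare the old and new values
 * with those bits masked off; if they differ and L1 has the intercept
 * enabled, reflect the write to L1 as a CR0_SEL_WRITE vmexit.
 */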
2501static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2502					    unsigned long val)
2503{
2504	struct vcpu_svm *svm = to_svm(vcpu);
2505	unsigned long cr0 = vcpu->arch.cr0;
2506	bool ret = false;
2507
2508	if (!is_guest_mode(vcpu) ||
2509	    (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2510		return false;
2511
2512	cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2513	val &= ~SVM_CR0_SELECTIVE_MASK;
2514
2515	if (cr0 ^ val) {
2516		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2517		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2518	}
2519
2520	return ret;
2521}
2522
2523#define CR_VALID (1ULL << 63)
2524
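/*
 * Handle intercepted CR accesses using the decode assists: exit_info_1
 * encodes the GPR involved and the exit code identifies the CR and the
 * direction (read vs. write).  Without decode assists, fall back to full
 * instruction emulation.
 */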
2525static int cr_interception(struct kvm_vcpu *vcpu)
2526{
2527	struct vcpu_svm *svm = to_svm(vcpu);
2528	int reg, cr;
2529	unsigned long val;
2530	int err;
2531
2532	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2533		return emulate_on_interception(vcpu);
2534
2535	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2536		return emulate_on_interception(vcpu);
2537
2538	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2539	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2540		cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2541	else
2542		cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2543
2544	err = 0;
2545	if (cr >= 16) { /* mov to cr */
2546		cr -= 16;
2547		val = kvm_register_read(vcpu, reg);
2548		trace_kvm_cr_write(cr, val);
2549		switch (cr) {
2550		case 0:
2551			if (!check_selective_cr0_intercepted(vcpu, val))
2552				err = kvm_set_cr0(vcpu, val);
2553			else
2554				return 1;
2555
2556			break;
2557		case 3:
2558			err = kvm_set_cr3(vcpu, val);
2559			break;
2560		case 4:
2561			err = kvm_set_cr4(vcpu, val);
2562			break;
2563		case 8:
2564			err = kvm_set_cr8(vcpu, val);
2565			break;
2566		default:
2567			WARN(1, "unhandled write to CR%d", cr);
2568			kvm_queue_exception(vcpu, UD_VECTOR);
2569			return 1;
2570		}
2571	} else { /* mov from cr */
2572		switch (cr) {
2573		case 0:
2574			val = kvm_read_cr0(vcpu);
2575			break;
2576		case 2:
2577			val = vcpu->arch.cr2;
2578			break;
2579		case 3:
2580			val = kvm_read_cr3(vcpu);
2581			break;
2582		case 4:
2583			val = kvm_read_cr4(vcpu);
2584			break;
2585		case 8:
2586			val = kvm_get_cr8(vcpu);
2587			break;
2588		default:
2589			WARN(1, "unhandled read from CR%d", cr);
2590			kvm_queue_exception(vcpu, UD_VECTOR);
2591			return 1;
2592		}
2593		kvm_register_write(vcpu, reg, val);
2594		trace_kvm_cr_read(cr, val);
2595	}
2596	return kvm_complete_insn_gp(vcpu, err);
2597}
2598
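/*
 * CR write traps (used for SEV-ES guests) fire after the register has
 * already been updated: exit_info_1 holds the new value, so just propagate
 * it and run the common post-write hooks.
 */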
2599static int cr_trap(struct kvm_vcpu *vcpu)
2600{
2601	struct vcpu_svm *svm = to_svm(vcpu);
2602	unsigned long old_value, new_value;
2603	unsigned int cr;
2604	int ret = 0;
2605
2606	new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2607
2608	cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2609	switch (cr) {
2610	case 0:
2611		old_value = kvm_read_cr0(vcpu);
2612		svm_set_cr0(vcpu, new_value);
2613
2614		kvm_post_set_cr0(vcpu, old_value, new_value);
2615		break;
2616	case 4:
2617		old_value = kvm_read_cr4(vcpu);
2618		svm_set_cr4(vcpu, new_value);
2619
2620		kvm_post_set_cr4(vcpu, old_value, new_value);
2621		break;
2622	case 8:
2623		ret = kvm_set_cr8(vcpu, new_value);
2624		break;
2625	default:
2626		WARN(1, "unhandled CR%d write trap", cr);
2627		kvm_queue_exception(vcpu, UD_VECTOR);
2628		return 1;
2629	}
2630
2631	return kvm_complete_insn_gp(vcpu, ret);
2632}
2633
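/*
 * Handle intercepted debug register accesses.  If userspace is not debugging
 * the guest, drop the DR intercepts entirely and let the guest own the debug
 * registers; they are read back in svm_sync_dirty_debug_regs().
 */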
2634static int dr_interception(struct kvm_vcpu *vcpu)
2635{
2636	struct vcpu_svm *svm = to_svm(vcpu);
2637	int reg, dr;
2638	unsigned long val;
2639	int err = 0;
2640
2641	if (vcpu->guest_debug == 0) {
2642		/*
2643		 * No more DR vmexits; force a reload of the debug registers
2644		 * and reenter on this instruction.  The next vmexit will
2645		 * retrieve the full state of the debug registers.
2646		 */
2647		clr_dr_intercepts(svm);
2648		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2649		return 1;
2650	}
2651
2652	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2653		return emulate_on_interception(vcpu);
2654
2655	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2656	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2657	if (dr >= 16) { /* mov to DRn  */
2658		dr -= 16;
2659		val = kvm_register_read(vcpu, reg);
2660		err = kvm_set_dr(vcpu, dr, val);
2661	} else {
2662		kvm_get_dr(vcpu, dr, &val);
2663		kvm_register_write(vcpu, reg, val);
2664	}
2665
2666	return kvm_complete_insn_gp(vcpu, err);
2667}
2668
2669static int cr8_write_interception(struct kvm_vcpu *vcpu)
2670{
2671	int r;
2672
2673	u8 cr8_prev = kvm_get_cr8(vcpu);
2674	/* instruction emulation calls kvm_set_cr8() */
2675	r = cr_interception(vcpu);
2676	if (lapic_in_kernel(vcpu))
2677		return r;
2678	if (cr8_prev <= kvm_get_cr8(vcpu))
2679		return r;
2680	vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2681	return 0;
2682}
2683
2684static int efer_trap(struct kvm_vcpu *vcpu)
2685{
2686	struct msr_data msr_info;
2687	int ret;
2688
2689	/*
2690	 * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2691	 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2692	 * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2693	 * the guest doesn't have X86_FEATURE_SVM.
2694	 */
2695	msr_info.host_initiated = false;
2696	msr_info.index = MSR_EFER;
2697	msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2698	ret = kvm_set_msr_common(vcpu, &msr_info);
2699
2700	return kvm_complete_insn_gp(vcpu, ret);
2701}
2702
2703static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2704{
2705	msr->data = 0;
2706
2707	switch (msr->index) {
2708	case MSR_AMD64_DE_CFG:
2709		if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
2710			msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
2711		break;
2712	default:
2713		return KVM_MSR_RET_INVALID;
2714	}
2715
2716	return 0;
2717}
2718
2719static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2720{
2721	struct vcpu_svm *svm = to_svm(vcpu);
2722
2723	switch (msr_info->index) {
2724	case MSR_AMD64_TSC_RATIO:
2725		if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2726			return 1;
2727		msr_info->data = svm->tsc_ratio_msr;
2728		break;
2729	case MSR_STAR:
2730		msr_info->data = svm->vmcb01.ptr->save.star;
2731		break;
2732#ifdef CONFIG_X86_64
2733	case MSR_LSTAR:
2734		msr_info->data = svm->vmcb01.ptr->save.lstar;
2735		break;
2736	case MSR_CSTAR:
2737		msr_info->data = svm->vmcb01.ptr->save.cstar;
2738		break;
2739	case MSR_KERNEL_GS_BASE:
2740		msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2741		break;
2742	case MSR_SYSCALL_MASK:
2743		msr_info->data = svm->vmcb01.ptr->save.sfmask;
2744		break;
2745#endif
2746	case MSR_IA32_SYSENTER_CS:
2747		msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2748		break;
2749	case MSR_IA32_SYSENTER_EIP:
2750		msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2751		if (guest_cpuid_is_intel(vcpu))
2752			msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2753		break;
2754	case MSR_IA32_SYSENTER_ESP:
2755		msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2756		if (guest_cpuid_is_intel(vcpu))
2757			msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2758		break;
2759	case MSR_TSC_AUX:
2760		msr_info->data = svm->tsc_aux;
2761		break;
2762	case MSR_IA32_DEBUGCTLMSR:
2763	case MSR_IA32_LASTBRANCHFROMIP:
2764	case MSR_IA32_LASTBRANCHTOIP:
2765	case MSR_IA32_LASTINTFROMIP:
2766	case MSR_IA32_LASTINTTOIP:
2767		msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
2768		break;
2769	case MSR_VM_HSAVE_PA:
2770		msr_info->data = svm->nested.hsave_msr;
2771		break;
2772	case MSR_VM_CR:
2773		msr_info->data = svm->nested.vm_cr_msr;
2774		break;
2775	case MSR_IA32_SPEC_CTRL:
2776		if (!msr_info->host_initiated &&
2777		    !guest_has_spec_ctrl_msr(vcpu))
2778			return 1;
2779
2780		if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2781			msr_info->data = svm->vmcb->save.spec_ctrl;
2782		else
2783			msr_info->data = svm->spec_ctrl;
2784		break;
2785	case MSR_AMD64_VIRT_SPEC_CTRL:
2786		if (!msr_info->host_initiated &&
2787		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2788			return 1;
2789
2790		msr_info->data = svm->virt_spec_ctrl;
2791		break;
2792	case MSR_F15H_IC_CFG: {
2793
2794		int family, model;
2795
2796		family = guest_cpuid_family(vcpu);
2797		model  = guest_cpuid_model(vcpu);
2798
2799		if (family < 0 || model < 0)
2800			return kvm_get_msr_common(vcpu, msr_info);
2801
2802		msr_info->data = 0;
2803
2804		if (family == 0x15 &&
2805		    (model >= 0x2 && model < 0x20))
2806			msr_info->data = 0x1E;
2807		}
2808		break;
2809	case MSR_AMD64_DE_CFG:
2810		msr_info->data = svm->msr_decfg;
2811		break;
2812	default:
2813		return kvm_get_msr_common(vcpu, msr_info);
2814	}
2815	return 0;
2816}
2817
2818static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2819{
2820	struct vcpu_svm *svm = to_svm(vcpu);
2821	if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2822		return kvm_complete_insn_gp(vcpu, err);
2823
2824	ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2825	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2826				X86_TRAP_GP |
2827				SVM_EVTINJ_TYPE_EXEPT |
2828				SVM_EVTINJ_VALID);
2829	return 1;
2830}
2831
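/*
 * Emulate writes to MSR_VM_CR.  Once SVMDIS has been set, the SVM_LOCK and
 * SVMDIS bits become read-only, and SVMDIS cannot be set while the guest
 * still has EFER.SVME enabled.
 */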
2832static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2833{
2834	struct vcpu_svm *svm = to_svm(vcpu);
2835	int svm_dis, chg_mask;
2836
2837	if (data & ~SVM_VM_CR_VALID_MASK)
2838		return 1;
2839
2840	chg_mask = SVM_VM_CR_VALID_MASK;
2841
2842	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2843		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2844
2845	svm->nested.vm_cr_msr &= ~chg_mask;
2846	svm->nested.vm_cr_msr |= (data & chg_mask);
2847
2848	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2849
2850	/* check for svm_disable while efer.svme is set */
2851	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2852		return 1;
2853
2854	return 0;
2855}
2856
2857static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2858{
2859	struct vcpu_svm *svm = to_svm(vcpu);
2860	int r;
2861
2862	u32 ecx = msr->index;
2863	u64 data = msr->data;
2864	switch (ecx) {
2865	case MSR_AMD64_TSC_RATIO:
2866
2867		if (!svm->tsc_scaling_enabled) {
2868
2869			if (!msr->host_initiated)
2870				return 1;
2871			/*
2872			 * In case TSC scaling is not enabled, always
2873			 * leave this MSR at the default value.
2874			 *
2875			 * Due to a bug in qemu 6.2.0, it tries to set
2876			 * this MSR to 0 if TSC scaling is not enabled.
2877			 * Ignore that value as well.
2878			 */
2879			if (data != 0 && data != svm->tsc_ratio_msr)
2880				return 1;
2881			break;
2882		}
2883
2884		if (data & SVM_TSC_RATIO_RSVD)
2885			return 1;
2886
2887		svm->tsc_ratio_msr = data;
2888
2889		if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2890			nested_svm_update_tsc_ratio_msr(vcpu);
2891
2892		break;
2893	case MSR_IA32_CR_PAT:
2894		if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2895			return 1;
2896		vcpu->arch.pat = data;
2897		svm->vmcb01.ptr->save.g_pat = data;
2898		if (is_guest_mode(vcpu))
2899			nested_vmcb02_compute_g_pat(svm);
2900		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2901		break;
2902	case MSR_IA32_SPEC_CTRL:
2903		if (!msr->host_initiated &&
2904		    !guest_has_spec_ctrl_msr(vcpu))
2905			return 1;
2906
2907		if (kvm_spec_ctrl_test_value(data))
2908			return 1;
2909
2910		if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2911			svm->vmcb->save.spec_ctrl = data;
2912		else
2913			svm->spec_ctrl = data;
2914		if (!data)
2915			break;
2916
2917		/*
2918		 * For non-nested:
2919		 * When it's written (to non-zero) for the first time, pass
2920		 * it through.
2921		 *
2922		 * For nested:
2923		 * The handling of the MSR bitmap for L2 guests is done in
2924		 * nested_svm_vmrun_msrpm.
2925		 * We update the L1 MSR bit as well since it will end up
2926		 * touching the MSR anyway now.
2927		 */
2928		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2929		break;
2930	case MSR_IA32_PRED_CMD:
2931		if (!msr->host_initiated &&
2932		    !guest_has_pred_cmd_msr(vcpu))
2933			return 1;
2934
2935		if (data & ~PRED_CMD_IBPB)
2936			return 1;
2937		if (!boot_cpu_has(X86_FEATURE_IBPB))
2938			return 1;
2939		if (!data)
2940			break;
2941
2942		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2943		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
2944		break;
2945	case MSR_AMD64_VIRT_SPEC_CTRL:
2946		if (!msr->host_initiated &&
2947		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2948			return 1;
2949
2950		if (data & ~SPEC_CTRL_SSBD)
2951			return 1;
2952
2953		svm->virt_spec_ctrl = data;
2954		break;
2955	case MSR_STAR:
2956		svm->vmcb01.ptr->save.star = data;
2957		break;
2958#ifdef CONFIG_X86_64
2959	case MSR_LSTAR:
2960		svm->vmcb01.ptr->save.lstar = data;
2961		break;
2962	case MSR_CSTAR:
2963		svm->vmcb01.ptr->save.cstar = data;
2964		break;
2965	case MSR_KERNEL_GS_BASE:
2966		svm->vmcb01.ptr->save.kernel_gs_base = data;
2967		break;
2968	case MSR_SYSCALL_MASK:
2969		svm->vmcb01.ptr->save.sfmask = data;
2970		break;
2971#endif
2972	case MSR_IA32_SYSENTER_CS:
2973		svm->vmcb01.ptr->save.sysenter_cs = data;
2974		break;
2975	case MSR_IA32_SYSENTER_EIP:
2976		svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
2977		/*
2978		 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
2979		 * when we spoof an Intel vendor ID (for cross vendor migration).
2980		 * In this case we use this intercept to track the high
2981		 * 32 bit part of these msrs to support Intel's
2982		 * implementation of SYSENTER/SYSEXIT.
2983		 */
2984		svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2985		break;
2986	case MSR_IA32_SYSENTER_ESP:
2987		svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
2988		svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2989		break;
2990	case MSR_TSC_AUX:
2991		/*
2992		 * TSC_AUX is usually changed only during boot and never read
2993		 * directly.  Intercept TSC_AUX instead of exposing it to the
2994		 * guest via direct_access_msrs, and switch it via the user return MSR mechanism.
2995		 */
2996		preempt_disable();
2997		r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
2998		preempt_enable();
2999		if (r)
3000			return 1;
3001
3002		svm->tsc_aux = data;
3003		break;
3004	case MSR_IA32_DEBUGCTLMSR:
3005		if (!lbrv) {
3006			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3007				    __func__, data);
3008			break;
3009		}
3010		if (data & DEBUGCTL_RESERVED_BITS)
3011			return 1;
3012
3013		if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
3014			svm->vmcb->save.dbgctl = data;
3015		else
3016			svm->vmcb01.ptr->save.dbgctl = data;
3017
3018		svm_update_lbrv(vcpu);
3019
3020		break;
3021	case MSR_VM_HSAVE_PA:
3022		/*
3023		 * Old kernels did not validate the value written to
3024		 * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
3025		 * value to allow live migrating buggy or malicious guests
3026		 * originating from those kernels.
3027		 */
3028		if (!msr->host_initiated && !page_address_valid(vcpu, data))
3029			return 1;
3030
3031		svm->nested.hsave_msr = data & PAGE_MASK;
3032		break;
3033	case MSR_VM_CR:
3034		return svm_set_vm_cr(vcpu, data);
3035	case MSR_VM_IGNNE:
3036		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3037		break;
3038	case MSR_AMD64_DE_CFG: {
3039		struct kvm_msr_entry msr_entry;
3040
3041		msr_entry.index = msr->index;
3042		if (svm_get_msr_feature(&msr_entry))
3043			return 1;
3044
3045		/* Check the supported bits */
3046		if (data & ~msr_entry.data)
3047			return 1;
3048
3049		/* Don't allow the guest to change a bit, #GP */
3050		if (!msr->host_initiated && (data ^ msr_entry.data))
3051			return 1;
3052
3053		svm->msr_decfg = data;
3054		break;
3055	}
3056	default:
3057		return kvm_set_msr_common(vcpu, msr);
3058	}
3059	return 0;
3060}
3061
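/* exit_info_1 is 1 for an intercepted WRMSR and 0 for an intercepted RDMSR. */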
3062static int msr_interception(struct kvm_vcpu *vcpu)
3063{
3064	if (to_svm(vcpu)->vmcb->control.exit_info_1)
3065		return kvm_emulate_wrmsr(vcpu);
3066	else
3067		return kvm_emulate_rdmsr(vcpu);
3068}
3069
3070static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3071{
3072	kvm_make_request(KVM_REQ_EVENT, vcpu);
3073	svm_clear_vintr(to_svm(vcpu));
3074
3075	/*
3076	 * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
3077	 * In this case AVIC was temporarily disabled for
3078	 * requesting the IRQ window and we have to re-enable it.
3079	 *
3080	 * If running nested, still remove the VM wide AVIC inhibit to
3081	 * support the case in which the interrupt window was requested when
3082	 * the vCPU was not running nested.
3083	 *
3084	 * vCPUs that are still running nested will keep their AVIC
3085	 * inhibited due to the per-vCPU AVIC inhibition.
3086	 */
3087	kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3088
3089	++vcpu->stat.irq_window_exits;
3090	return 1;
3091}
3092
3093static int pause_interception(struct kvm_vcpu *vcpu)
3094{
3095	bool in_kernel;
3096	/*
3097	 * CPL is not made available for an SEV-ES guest, therefore
3098	 * vcpu->arch.preempted_in_kernel can never be true.  Just
3099	 * set in_kernel to false as well.
3100	 */
3101	in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3102
3103	grow_ple_window(vcpu);
3104
3105	kvm_vcpu_on_spin(vcpu, in_kernel);
3106	return kvm_skip_emulated_instruction(vcpu);
3107}
3108
3109static int invpcid_interception(struct kvm_vcpu *vcpu)
3110{
3111	struct vcpu_svm *svm = to_svm(vcpu);
3112	unsigned long type;
3113	gva_t gva;
3114
3115	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3116		kvm_queue_exception(vcpu, UD_VECTOR);
3117		return 1;
3118	}
3119
3120	/*
3121	 * For an INVPCID intercept:
3122	 * EXITINFO1 provides the linear address of the memory operand.
3123	 * EXITINFO2 provides the contents of the register operand.
3124	 */
3125	type = svm->vmcb->control.exit_info_2;
3126	gva = svm->vmcb->control.exit_info_1;
3127
3128	return kvm_handle_invpcid(vcpu, type, gva);
3129}
3130
3131static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3132	[SVM_EXIT_READ_CR0]			= cr_interception,
3133	[SVM_EXIT_READ_CR3]			= cr_interception,
3134	[SVM_EXIT_READ_CR4]			= cr_interception,
3135	[SVM_EXIT_READ_CR8]			= cr_interception,
3136	[SVM_EXIT_CR0_SEL_WRITE]		= cr_interception,
3137	[SVM_EXIT_WRITE_CR0]			= cr_interception,
3138	[SVM_EXIT_WRITE_CR3]			= cr_interception,
3139	[SVM_EXIT_WRITE_CR4]			= cr_interception,
3140	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
3141	[SVM_EXIT_READ_DR0]			= dr_interception,
3142	[SVM_EXIT_READ_DR1]			= dr_interception,
3143	[SVM_EXIT_READ_DR2]			= dr_interception,
3144	[SVM_EXIT_READ_DR3]			= dr_interception,
3145	[SVM_EXIT_READ_DR4]			= dr_interception,
3146	[SVM_EXIT_READ_DR5]			= dr_interception,
3147	[SVM_EXIT_READ_DR6]			= dr_interception,
3148	[SVM_EXIT_READ_DR7]			= dr_interception,
3149	[SVM_EXIT_WRITE_DR0]			= dr_interception,
3150	[SVM_EXIT_WRITE_DR1]			= dr_interception,
3151	[SVM_EXIT_WRITE_DR2]			= dr_interception,
3152	[SVM_EXIT_WRITE_DR3]			= dr_interception,
3153	[SVM_EXIT_WRITE_DR4]			= dr_interception,
3154	[SVM_EXIT_WRITE_DR5]			= dr_interception,
3155	[SVM_EXIT_WRITE_DR6]			= dr_interception,
3156	[SVM_EXIT_WRITE_DR7]			= dr_interception,
3157	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
3158	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
3159	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
3160	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
3161	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
3162	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
3163	[SVM_EXIT_EXCP_BASE + GP_VECTOR]	= gp_interception,
3164	[SVM_EXIT_INTR]				= intr_interception,
3165	[SVM_EXIT_NMI]				= nmi_interception,
3166	[SVM_EXIT_SMI]				= smi_interception,
3167	[SVM_EXIT_VINTR]			= interrupt_window_interception,
3168	[SVM_EXIT_RDPMC]			= kvm_emulate_rdpmc,
3169	[SVM_EXIT_CPUID]			= kvm_emulate_cpuid,
3170	[SVM_EXIT_IRET]                         = iret_interception,
3171	[SVM_EXIT_INVD]                         = kvm_emulate_invd,
3172	[SVM_EXIT_PAUSE]			= pause_interception,
3173	[SVM_EXIT_HLT]				= kvm_emulate_halt,
3174	[SVM_EXIT_INVLPG]			= invlpg_interception,
3175	[SVM_EXIT_INVLPGA]			= invlpga_interception,
3176	[SVM_EXIT_IOIO]				= io_interception,
3177	[SVM_EXIT_MSR]				= msr_interception,
3178	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
3179	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
3180	[SVM_EXIT_VMRUN]			= vmrun_interception,
3181	[SVM_EXIT_VMMCALL]			= kvm_emulate_hypercall,
3182	[SVM_EXIT_VMLOAD]			= vmload_interception,
3183	[SVM_EXIT_VMSAVE]			= vmsave_interception,
3184	[SVM_EXIT_STGI]				= stgi_interception,
3185	[SVM_EXIT_CLGI]				= clgi_interception,
3186	[SVM_EXIT_SKINIT]			= skinit_interception,
3187	[SVM_EXIT_RDTSCP]			= kvm_handle_invalid_op,
3188	[SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3189	[SVM_EXIT_MONITOR]			= kvm_emulate_monitor,
3190	[SVM_EXIT_MWAIT]			= kvm_emulate_mwait,
3191	[SVM_EXIT_XSETBV]			= kvm_emulate_xsetbv,
3192	[SVM_EXIT_RDPRU]			= kvm_handle_invalid_op,
3193	[SVM_EXIT_EFER_WRITE_TRAP]		= efer_trap,
3194	[SVM_EXIT_CR0_WRITE_TRAP]		= cr_trap,
3195	[SVM_EXIT_CR4_WRITE_TRAP]		= cr_trap,
3196	[SVM_EXIT_CR8_WRITE_TRAP]		= cr_trap,
3197	[SVM_EXIT_INVPCID]                      = invpcid_interception,
3198	[SVM_EXIT_NPF]				= npf_interception,
3199	[SVM_EXIT_RSM]                          = rsm_interception,
3200	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
3201	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
3202	[SVM_EXIT_VMGEXIT]			= sev_handle_vmgexit,
3203};
3204
3205static void dump_vmcb(struct kvm_vcpu *vcpu)
3206{
3207	struct vcpu_svm *svm = to_svm(vcpu);
3208	struct vmcb_control_area *control = &svm->vmcb->control;
3209	struct vmcb_save_area *save = &svm->vmcb->save;
3210	struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3211
3212	if (!dump_invalid_vmcb) {
3213		pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3214		return;
3215	}
3216
3217	pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3218	       svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3219	pr_err("VMCB Control Area:\n");
3220	pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3221	pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3222	pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3223	pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3224	pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3225	pr_err("%-20s%08x %08x\n", "intercepts:",
3226	       control->intercepts[INTERCEPT_WORD3],
3227	       control->intercepts[INTERCEPT_WORD4]);
3228	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3229	pr_err("%-20s%d\n", "pause filter threshold:",
3230	       control->pause_filter_thresh);
3231	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3232	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3233	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3234	pr_err("%-20s%d\n", "asid:", control->asid);
3235	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3236	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3237	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3238	pr_err("%-20s%08x\n", "int_state:", control->int_state);
3239	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3240	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3241	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3242	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3243	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3244	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3245	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3246	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3247	pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3248	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3249	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3250	pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3251	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3252	pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3253	pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3254	pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3255	pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3256	pr_err("VMCB State Save Area:\n");
3257	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3258	       "es:",
3259	       save->es.selector, save->es.attrib,
3260	       save->es.limit, save->es.base);
3261	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3262	       "cs:",
3263	       save->cs.selector, save->cs.attrib,
3264	       save->cs.limit, save->cs.base);
3265	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3266	       "ss:",
3267	       save->ss.selector, save->ss.attrib,
3268	       save->ss.limit, save->ss.base);
3269	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3270	       "ds:",
3271	       save->ds.selector, save->ds.attrib,
3272	       save->ds.limit, save->ds.base);
3273	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3274	       "fs:",
3275	       save01->fs.selector, save01->fs.attrib,
3276	       save01->fs.limit, save01->fs.base);
3277	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3278	       "gs:",
3279	       save01->gs.selector, save01->gs.attrib,
3280	       save01->gs.limit, save01->gs.base);
3281	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3282	       "gdtr:",
3283	       save->gdtr.selector, save->gdtr.attrib,
3284	       save->gdtr.limit, save->gdtr.base);
3285	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3286	       "ldtr:",
3287	       save01->ldtr.selector, save01->ldtr.attrib,
3288	       save01->ldtr.limit, save01->ldtr.base);
3289	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3290	       "idtr:",
3291	       save->idtr.selector, save->idtr.attrib,
3292	       save->idtr.limit, save->idtr.base);
3293	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3294	       "tr:",
3295	       save01->tr.selector, save01->tr.attrib,
3296	       save01->tr.limit, save01->tr.base);
3297	pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3298	       save->vmpl, save->cpl, save->efer);
3299	pr_err("%-15s %016llx %-13s %016llx\n",
3300	       "cr0:", save->cr0, "cr2:", save->cr2);
3301	pr_err("%-15s %016llx %-13s %016llx\n",
3302	       "cr3:", save->cr3, "cr4:", save->cr4);
3303	pr_err("%-15s %016llx %-13s %016llx\n",
3304	       "dr6:", save->dr6, "dr7:", save->dr7);
3305	pr_err("%-15s %016llx %-13s %016llx\n",
3306	       "rip:", save->rip, "rflags:", save->rflags);
3307	pr_err("%-15s %016llx %-13s %016llx\n",
3308	       "rsp:", save->rsp, "rax:", save->rax);
3309	pr_err("%-15s %016llx %-13s %016llx\n",
3310	       "star:", save01->star, "lstar:", save01->lstar);
3311	pr_err("%-15s %016llx %-13s %016llx\n",
3312	       "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3313	pr_err("%-15s %016llx %-13s %016llx\n",
3314	       "kernel_gs_base:", save01->kernel_gs_base,
3315	       "sysenter_cs:", save01->sysenter_cs);
3316	pr_err("%-15s %016llx %-13s %016llx\n",
3317	       "sysenter_esp:", save01->sysenter_esp,
3318	       "sysenter_eip:", save01->sysenter_eip);
3319	pr_err("%-15s %016llx %-13s %016llx\n",
3320	       "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3321	pr_err("%-15s %016llx %-13s %016llx\n",
3322	       "br_from:", save->br_from, "br_to:", save->br_to);
3323	pr_err("%-15s %016llx %-13s %016llx\n",
3324	       "excp_from:", save->last_excp_from,
3325	       "excp_to:", save->last_excp_to);
3326}
3327
3328static bool svm_check_exit_valid(u64 exit_code)
3329{
3330	return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3331		svm_exit_handlers[exit_code]);
3332}
3333
3334static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3335{
3336	vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3337	dump_vmcb(vcpu);
3338	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3339	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3340	vcpu->run->internal.ndata = 2;
3341	vcpu->run->internal.data[0] = exit_code;
3342	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3343	return 0;
3344}
3345
3346int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3347{
3348	if (!svm_check_exit_valid(exit_code))
3349		return svm_handle_invalid_exit(vcpu, exit_code);
3350
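	/*
	 * With retpolines, indirect calls are expensive; dispatch the most
	 * common exit reasons directly instead of going through the
	 * svm_exit_handlers table.
	 */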
3351#ifdef CONFIG_RETPOLINE
3352	if (exit_code == SVM_EXIT_MSR)
3353		return msr_interception(vcpu);
3354	else if (exit_code == SVM_EXIT_VINTR)
3355		return interrupt_window_interception(vcpu);
3356	else if (exit_code == SVM_EXIT_INTR)
3357		return intr_interception(vcpu);
3358	else if (exit_code == SVM_EXIT_HLT)
3359		return kvm_emulate_halt(vcpu);
3360	else if (exit_code == SVM_EXIT_NPF)
3361		return npf_interception(vcpu);
3362#endif
3363	return svm_exit_handlers[exit_code](vcpu);
3364}
3365
3366static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3367			      u64 *info1, u64 *info2,
3368			      u32 *intr_info, u32 *error_code)
3369{
3370	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3371
3372	*reason = control->exit_code;
3373	*info1 = control->exit_info_1;
3374	*info2 = control->exit_info_2;
3375	*intr_info = control->exit_int_info;
3376	if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3377	    (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3378		*error_code = control->exit_int_info_err;
3379	else
3380		*error_code = 0;
3381}
3382
3383static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3384{
3385	struct vcpu_svm *svm = to_svm(vcpu);
3386	struct kvm_run *kvm_run = vcpu->run;
3387	u32 exit_code = svm->vmcb->control.exit_code;
3388
3389	trace_kvm_exit(vcpu, KVM_ISA_SVM);
3390
3391	/* SEV-ES guests must use the CR write traps to track CR registers. */
3392	if (!sev_es_guest(vcpu->kvm)) {
3393		if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3394			vcpu->arch.cr0 = svm->vmcb->save.cr0;
3395		if (npt_enabled)
3396			vcpu->arch.cr3 = svm->vmcb->save.cr3;
3397	}
3398
3399	if (is_guest_mode(vcpu)) {
3400		int vmexit;
3401
3402		trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3403
3404		vmexit = nested_svm_exit_special(svm);
3405
3406		if (vmexit == NESTED_EXIT_CONTINUE)
3407			vmexit = nested_svm_exit_handled(svm);
3408
3409		if (vmexit == NESTED_EXIT_DONE)
3410			return 1;
3411	}
3412
3413	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3414		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3415		kvm_run->fail_entry.hardware_entry_failure_reason
3416			= svm->vmcb->control.exit_code;
3417		kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3418		dump_vmcb(vcpu);
3419		return 0;
3420	}
3421
3422	if (exit_fastpath != EXIT_FASTPATH_NONE)
3423		return 1;
3424
3425	return svm_invoke_exit_handler(vcpu, exit_code);
3426}
3427
3428static void reload_tss(struct kvm_vcpu *vcpu)
3429{
3430	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3431
3432	sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3433	load_TR_desc();
3434}
3435
3436static void pre_svm_run(struct kvm_vcpu *vcpu)
3437{
3438	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3439	struct vcpu_svm *svm = to_svm(vcpu);
3440
3441	/*
3442	 * If the previous vmrun of the vmcb occurred on a different physical
3443	 * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3444	 * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3445	 */
3446	if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3447		svm->current_vmcb->asid_generation = 0;
3448		vmcb_mark_all_dirty(svm->vmcb);
3449		svm->current_vmcb->cpu = vcpu->cpu;
3450	}
3451
3452	if (sev_guest(vcpu->kvm))
3453		return pre_sev_run(svm, vcpu->cpu);
3454
3455	/* FIXME: handle wraparound of asid_generation */
3456	if (svm->current_vmcb->asid_generation != sd->asid_generation)
3457		new_asid(svm, sd);
3458}
3459
3460static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3461{
3462	struct vcpu_svm *svm = to_svm(vcpu);
3463
3464	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3465
3466	if (svm->nmi_l1_to_l2)
3467		return;
3468
3469	vcpu->arch.hflags |= HF_NMI_MASK;
3470	if (!sev_es_guest(vcpu->kvm))
3471		svm_set_intercept(svm, INTERCEPT_IRET);
3472	++vcpu->stat.nmi_injections;
3473}
3474
3475static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3476{
3477	struct vcpu_svm *svm = to_svm(vcpu);
3478	u32 type;
3479
3480	if (vcpu->arch.interrupt.soft) {
3481		if (svm_update_soft_interrupt_rip(vcpu))
3482			return;
3483
3484		type = SVM_EVTINJ_TYPE_SOFT;
3485	} else {
3486		type = SVM_EVTINJ_TYPE_INTR;
3487	}
3488
3489	trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3490			   vcpu->arch.interrupt.soft, reinjected);
3491	++vcpu->stat.irq_injections;
3492
3493	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3494				       SVM_EVTINJ_VALID | type;
3495}
3496
3497void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3498				     int trig_mode, int vector)
3499{
3500	/*
3501	 * apic->apicv_active must be read after vcpu->mode.
3502	 * Pairs with smp_store_release in vcpu_enter_guest.
3503	 */
3504	bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3505
3506	/* Note, this is called iff the local APIC is in-kernel. */
3507	if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3508		/* Process the interrupt via kvm_check_and_inject_events(). */
3509		kvm_make_request(KVM_REQ_EVENT, vcpu);
3510		kvm_vcpu_kick(vcpu);
3511		return;
3512	}
3513
3514	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3515	if (in_guest_mode) {
3516		/*
3517		 * Signal the doorbell to tell hardware to inject the IRQ.  If
3518		 * the vCPU exits the guest before the doorbell chimes, hardware
3519		 * will automatically process AVIC interrupts at the next VMRUN.
3520		 */
3521		avic_ring_doorbell(vcpu);
3522	} else {
3523		/*
3524		 * Wake the vCPU if it was blocking.  KVM will then detect the
3525		 * pending IRQ when checking if the vCPU has a wake event.
3526		 */
3527		kvm_vcpu_wake_up(vcpu);
3528	}
3529}
3530
3531static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3532				  int trig_mode, int vector)
3533{
3534	kvm_lapic_set_irr(vector, apic);
3535
3536	/*
3537	 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3538	 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3539	 * the read of guest_mode.  This guarantees that either VMRUN will see
3540	 * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3541	 * will signal the doorbell if the CPU has already entered the guest.
3542	 */
3543	smp_mb__after_atomic();
3544	svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3545}
3546
3547static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3548{
3549	struct vcpu_svm *svm = to_svm(vcpu);
3550
3551	/*
3552	 * SEV-ES guests must always keep the CR intercepts cleared. CR
3553	 * tracking is done using the CR write traps.
3554	 */
3555	if (sev_es_guest(vcpu->kvm))
3556		return;
3557
3558	if (nested_svm_virtualize_tpr(vcpu))
3559		return;
3560
3561	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3562
3563	if (irr == -1)
3564		return;
3565
3566	if (tpr >= irr)
3567		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3568}
3569
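/*
 * An NMI cannot be injected if GIF is clear, if the vCPU is in an interrupt
 * shadow, or if a previous NMI is still being handled (HF_NMI_MASK).  If L2
 * is running and L1 intercepts NMIs, blocking is L1's problem, not KVM's.
 */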
3570bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3571{
3572	struct vcpu_svm *svm = to_svm(vcpu);
3573	struct vmcb *vmcb = svm->vmcb;
3574	bool ret;
3575
3576	if (!gif_set(svm))
3577		return true;
3578
3579	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3580		return false;
3581
3582	ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
3583	      (vcpu->arch.hflags & HF_NMI_MASK);
3584
3585	return ret;
3586}
3587
3588static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3589{
3590	struct vcpu_svm *svm = to_svm(vcpu);
3591	if (svm->nested.nested_run_pending)
3592		return -EBUSY;
3593
3594	if (svm_nmi_blocked(vcpu))
3595		return 0;
3596
3597	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3598	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3599		return -EBUSY;
3600	return 1;
3601}
3602
3603static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3604{
3605	return !!(vcpu->arch.hflags & HF_NMI_MASK);
3606}
3607
3608static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3609{
3610	struct vcpu_svm *svm = to_svm(vcpu);
3611
3612	if (masked) {
3613		vcpu->arch.hflags |= HF_NMI_MASK;
3614		if (!sev_es_guest(vcpu->kvm))
3615			svm_set_intercept(svm, INTERCEPT_IRET);
3616	} else {
3617		vcpu->arch.hflags &= ~HF_NMI_MASK;
3618		if (!sev_es_guest(vcpu->kvm))
3619			svm_clr_intercept(svm, INTERCEPT_IRET);
3620	}
3621}
3622
3623bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3624{
3625	struct vcpu_svm *svm = to_svm(vcpu);
3626	struct vmcb *vmcb = svm->vmcb;
3627
3628	if (!gif_set(svm))
3629		return true;
3630
3631	if (is_guest_mode(vcpu)) {
3632		/* As long as interrupts are being delivered...  */
3633		if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3634		    ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3635		    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3636			return true;
3637
3638		/* ... vmexits aren't blocked by the interrupt shadow  */
3639		if (nested_exit_on_intr(svm))
3640			return false;
3641	} else {
3642		if (!svm_get_if_flag(vcpu))
3643			return true;
3644	}
3645
3646	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3647}
3648
3649static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3650{
3651	struct vcpu_svm *svm = to_svm(vcpu);
3652
3653	if (svm->nested.nested_run_pending)
3654		return -EBUSY;
3655
3656	if (svm_interrupt_blocked(vcpu))
3657		return 0;
3658
3659	/*
3660	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3661	 * e.g. if the IRQ arrived asynchronously after checking nested events.
3662	 */
3663	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3664		return -EBUSY;
3665
3666	return 1;
3667}
3668
3669static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3670{
3671	struct vcpu_svm *svm = to_svm(vcpu);
3672
3673	/*
3674	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3675	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3676	 * get that intercept, this function will be called again, though, and
3677	 * we'll get the vintr intercept. However, if the vGIF feature is
3678	 * enabled, the STGI interception will not occur. Enable the irq
3679	 * window under the assumption that the hardware will set the GIF.
3680	 */
3681	if (vgif || gif_set(svm)) {
3682		/*
3683		 * IRQ window is not needed when AVIC is enabled,
3684		 * unless we have pending ExtINT since it cannot be injected
3685		 * via AVIC. In that case, KVM needs to temporarily disable AVIC,
3686		 * and fall back to injecting the IRQ via V_IRQ.
3687		 *
3688		 * If running nested, AVIC is already locally inhibited
3689		 * on this vCPU, therefore there is no need to request
3690		 * the VM wide AVIC inhibition.
3691		 */
3692		if (!is_guest_mode(vcpu))
3693			kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3694
3695		svm_set_vintr(svm);
3696	}
3697}
3698
3699static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3700{
3701	struct vcpu_svm *svm = to_svm(vcpu);
3702
3703	if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
3704		return; /* IRET will cause a vm exit */
3705
3706	if (!gif_set(svm)) {
3707		if (vgif)
3708			svm_set_intercept(svm, INTERCEPT_STGI);
3709		return; /* STGI will cause a vm exit */
3710	}
3711
3712	/*
3713	 * Something prevents the NMI from being injected. Single-step over the possible
3714	 * problem (IRET or exception injection or interrupt shadow)
3715	 */
3716	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3717	svm->nmi_singlestep = true;
3718	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3719}
3720
3721static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3722{
3723	struct vcpu_svm *svm = to_svm(vcpu);
3724
3725	/*
3726	 * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
3727	 * A TLB flush for the current ASID flushes both "host" and "guest" TLB
3728	 * entries, and thus is a superset of Hyper-V's fine grained flushing.
3729	 */
3730	kvm_hv_vcpu_purge_flush_tlb(vcpu);
3731
3732	/*
3733	 * Flush only the current ASID even if the TLB flush was invoked via
3734	 * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3735	 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3736	 * unconditionally does a TLB flush on both nested VM-Enter and nested
3737	 * VM-Exit (via kvm_mmu_reset_context()).
3738	 */
3739	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3740		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3741	else
3742		svm->current_vmcb->asid_generation--;
3743}
3744
3745static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3746{
3747	struct vcpu_svm *svm = to_svm(vcpu);
3748
3749	invlpga(gva, svm->vmcb->control.asid);
3750}
3751
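/*
 * After a VM-Exit, propagate the guest's V_TPR value back into the emulated
 * local APIC's TPR (CR8), unless CR8 writes are being intercepted.
 */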
3752static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3753{
3754	struct vcpu_svm *svm = to_svm(vcpu);
3755
3756	if (nested_svm_virtualize_tpr(vcpu))
3757		return;
3758
3759	if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3760		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3761		kvm_set_cr8(vcpu, cr8);
3762	}
3763}
3764
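/*
 * Before VMRUN, mirror the emulated local APIC's TPR (CR8) into V_TPR so the
 * hardware sees an up-to-date task priority.  Skipped when the nested guest
 * virtualizes the TPR itself or when APICv/AVIC keeps it in sync.
 */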
3765static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3766{
3767	struct vcpu_svm *svm = to_svm(vcpu);
3768	u64 cr8;
3769
3770	if (nested_svm_virtualize_tpr(vcpu) ||
3771	    kvm_vcpu_apicv_active(vcpu))
3772		return;
3773
3774	cr8 = kvm_get_cr8(vcpu);
3775	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3776	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3777}
3778
3779static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3780					int type)
3781{
3782	bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3783	bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3784	struct vcpu_svm *svm = to_svm(vcpu);
3785
3786	/*
3787	 * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3788	 * associated with the original soft exception/interrupt.  next_rip is
3789	 * cleared on all exits that can occur while vectoring an event, so KVM
3790	 * needs to manually set next_rip for re-injection.  Unlike the !nrips
3791	 * case below, this needs to be done if and only if KVM is re-injecting
3792	 * the same event, i.e. if the event is a soft exception/interrupt,
3793	 * otherwise next_rip is unused on VMRUN.
3794	 */
3795	if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3796	    kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3797		svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3798	/*
3799	 * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3800	 * injecting the soft exception/interrupt.  That advancement needs to
3801	 * be unwound if vectoring didn't complete.  Note, the new event may
3802	 * not be the injected event, e.g. if KVM injected an INTn, the INTn
3803	 * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3804	 * be the reported vectored event, but RIP still needs to be unwound.
3805	 */
3806	else if (!nrips && (is_soft || is_exception) &&
3807		 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3808		kvm_rip_write(vcpu, svm->soft_int_old_rip);
3809}
3810
3811static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3812{
3813	struct vcpu_svm *svm = to_svm(vcpu);
3814	u8 vector;
3815	int type;
3816	u32 exitintinfo = svm->vmcb->control.exit_int_info;
3817	bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
3818	bool soft_int_injected = svm->soft_int_injected;
3819
3820	svm->nmi_l1_to_l2 = false;
3821	svm->soft_int_injected = false;
3822
3823	/*
3824	 * If we've made progress since setting HF_IRET_MASK, we've
3825	 * executed an IRET and can allow NMI injection.
3826	 */
3827	if ((vcpu->arch.hflags & HF_IRET_MASK) &&
3828	    (sev_es_guest(vcpu->kvm) ||
3829	     kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3830		vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3831		kvm_make_request(KVM_REQ_EVENT, vcpu);
3832	}
3833
3834	vcpu->arch.nmi_injected = false;
3835	kvm_clear_exception_queue(vcpu);
3836	kvm_clear_interrupt_queue(vcpu);
3837
3838	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3839		return;
3840
3841	kvm_make_request(KVM_REQ_EVENT, vcpu);
3842
3843	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3844	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3845
3846	if (soft_int_injected)
3847		svm_complete_soft_interrupt(vcpu, vector, type);
3848
3849	switch (type) {
3850	case SVM_EXITINTINFO_TYPE_NMI:
3851		vcpu->arch.nmi_injected = true;
3852		svm->nmi_l1_to_l2 = nmi_l1_to_l2;
3853		break;
3854	case SVM_EXITINTINFO_TYPE_EXEPT:
3855		/*
3856		 * Never re-inject a #VC exception.
3857		 */
3858		if (vector == X86_TRAP_VC)
3859			break;
3860
3861		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3862			u32 err = svm->vmcb->control.exit_int_info_err;
3863			kvm_requeue_exception_e(vcpu, vector, err);
3864
3865		} else
3866			kvm_requeue_exception(vcpu, vector);
3867		break;
3868	case SVM_EXITINTINFO_TYPE_INTR:
3869		kvm_queue_interrupt(vcpu, vector, false);
3870		break;
3871	case SVM_EXITINTINFO_TYPE_SOFT:
3872		kvm_queue_interrupt(vcpu, vector, true);
3873		break;
3874	default:
3875		break;
3876	}
3877
3878}
3879
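/*
 * Undo a pending event injection when VM-Entry is aborted: move the event
 * from EVENTINJ into EXITINTINFO so that svm_complete_interrupts() re-queues
 * it for the next VM-Entry.
 */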
3880static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3881{
3882	struct vcpu_svm *svm = to_svm(vcpu);
3883	struct vmcb_control_area *control = &svm->vmcb->control;
3884
3885	control->exit_int_info = control->event_inj;
3886	control->exit_int_info_err = control->event_inj_err;
3887	control->event_inj = 0;
3888	svm_complete_interrupts(vcpu);
3889}
3890
3891static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
3892{
3893	return 1;
3894}
3895
3896static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
3897{
3898	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3899
3900	/*
3901	 * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
3902	 * can't read guest memory (dereference memslots) to decode the WRMSR.
3903	 */
3904	if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
3905	    nrips && control->next_rip)
3906		return handle_fastpath_set_msr_irqoff(vcpu);
3907
3908	return EXIT_FASTPATH_NONE;
3909}
3910
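/*
 * The low-level world switch, run with IRQs disabled and instrumentation off:
 * SEV-ES guests use the dedicated VMSA-based entry path, all other guests go
 * through the common __svm_vcpu_run() assembly routine.
 */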
3911static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
3912{
3913	struct vcpu_svm *svm = to_svm(vcpu);
3914
3915	guest_state_enter_irqoff();
3916
3917	if (sev_es_guest(vcpu->kvm))
3918		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
3919	else
3920		__svm_vcpu_run(svm, spec_ctrl_intercepted);
3921
3922	guest_state_exit_irqoff();
3923}
3924
3925static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
3926{
3927	struct vcpu_svm *svm = to_svm(vcpu);
3928	bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
3929
3930	trace_kvm_entry(vcpu);
3931
3932	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3933	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3934	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3935
3936	/*
3937	 * Disable singlestep if we're injecting an interrupt/exception.
3938	 * We don't want our modified rflags to be pushed on the stack where
3939	 * we might not be able to easily reset them if we disabled NMI
3940	 * singlestep later.
3941	 */
3942	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3943		/*
3944		 * Event injection happens before external interrupts cause a
3945		 * vmexit and interrupts are disabled here, so smp_send_reschedule
3946		 * is enough to force an immediate vmexit.
3947		 */
3948		disable_nmi_singlestep(svm);
3949		smp_send_reschedule(vcpu->cpu);
3950	}
3951
3952	pre_svm_run(vcpu);
3953
3954	sync_lapic_to_cr8(vcpu);
3955
3956	if (unlikely(svm->asid != svm->vmcb->control.asid)) {
3957		svm->vmcb->control.asid = svm->asid;
3958		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3959	}
3960	svm->vmcb->save.cr2 = vcpu->arch.cr2;
3961
3962	svm_hv_update_vp_id(svm->vmcb, vcpu);
3963
3964	/*
3965	 * Run with all-zero DR6 unless needed, so that we can get the exact cause
3966	 * of a #DB.
3967	 */
3968	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
3969		svm_set_dr6(svm, vcpu->arch.dr6);
3970	else
3971		svm_set_dr6(svm, DR6_ACTIVE_LOW);
3972
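	/*
	 * Clear GIF so that interrupts, NMIs and SMIs are held pending across
	 * the world switch; stgi() below sets GIF again after VMRUN returns.
	 */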
3973	clgi();
3974	kvm_load_guest_xsave_state(vcpu);
3975
3976	kvm_wait_lapic_expire(vcpu);
3977
3978	/*
3979	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3980	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
3981	 * is no need to worry about the conditional branch over the wrmsr
3982	 * being speculatively taken.
3983	 */
3984	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3985		x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
3986
3987	svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
3988
3989	if (!sev_es_guest(vcpu->kvm))
3990		reload_tss(vcpu);
3991
3992	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3993		x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
3994
3995	if (!sev_es_guest(vcpu->kvm)) {
3996		vcpu->arch.cr2 = svm->vmcb->save.cr2;
3997		vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3998		vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3999		vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4000	}
4001	vcpu->arch.regs_dirty = 0;
4002
4003	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4004		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
4005
4006	kvm_load_host_xsave_state(vcpu);
4007	stgi();
4008
4009	/* Any pending NMI will happen here */
4010
4011	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4012		kvm_after_interrupt(vcpu);
4013
4014	sync_cr8_to_lapic(vcpu);
4015
4016	svm->next_rip = 0;
4017	if (is_guest_mode(vcpu)) {
4018		nested_sync_control_from_vmcb02(svm);
4019
4020		/* Track VMRUNs that have made it past consistency checking */
4021		if (svm->nested.nested_run_pending &&
4022		    svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4023			++vcpu->stat.nested_run;
4024
4025		svm->nested.nested_run_pending = 0;
4026	}
4027
4028	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4029	vmcb_mark_all_clean(svm->vmcb);
4030
4031	/* if exit due to PF check for async PF */
4032	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4033		vcpu->arch.apf.host_apf_flags =
4034			kvm_read_and_reset_apf_flags();
4035
4036	vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4037
4038	/*
4039	 * We need to handle MC intercepts here before the vcpu has a chance to
4040	 * change the physical cpu
4041	 */
4042	if (unlikely(svm->vmcb->control.exit_code ==
4043		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
4044		svm_handle_mce(vcpu);
4045
4046	svm_complete_interrupts(vcpu);
4047
4048	if (is_guest_mode(vcpu))
4049		return EXIT_FASTPATH_NONE;
4050
4051	return svm_exit_handlers_fastpath(vcpu);
4052}
4053
4054static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4055			     int root_level)
4056{
4057	struct vcpu_svm *svm = to_svm(vcpu);
4058	unsigned long cr3;
4059
4060	if (npt_enabled) {
4061		svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4062		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4063
4064		hv_track_root_tdp(vcpu, root_hpa);
4065
4066		cr3 = vcpu->arch.cr3;
4067	} else if (root_level >= PT64_ROOT_4LEVEL) {
4068		cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4069	} else {
4070		/* PCID in the guest should be impossible with a 32-bit MMU. */
4071		WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4072		cr3 = root_hpa;
4073	}
4074
4075	svm->vmcb->save.cr3 = cr3;
4076	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4077}
4078
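/*
 * Report SVM as unusable if the BIOS has disabled it via the SVM_DISABLE bit
 * in the VM_CR MSR.
 */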
4079static int is_disabled(void)
4080{
4081	u64 vm_cr;
4082
4083	rdmsrl(MSR_VM_CR, vm_cr);
4084	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4085		return 1;
4086
4087	return 0;
4088}
4089
4090static void
4091svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4092{
4093	/*
4094	 * Patch in the VMMCALL instruction:
4095	 */
4096	hypercall[0] = 0x0f;
4097	hypercall[1] = 0x01;
4098	hypercall[2] = 0xd9;
4099}
4100
4101static int __init svm_check_processor_compat(void)
4102{
4103	return 0;
4104}
4105
4106/*
4107 * The kvm parameter can be NULL (module initialization, or invocation before
4108 * VM creation). Be sure to check the kvm parameter before using it.
4109 */
4110static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4111{
4112	switch (index) {
4113	case MSR_IA32_MCG_EXT_CTL:
4114	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4115		return false;
4116	case MSR_IA32_SMBASE:
4117		if (!IS_ENABLED(CONFIG_KVM_SMM))
4118			return false;
4119		/* SEV-ES guests do not support SMM, so report false */
4120		if (kvm && sev_es_guest(kvm))
4121			return false;
4122		break;
4123	default:
4124		break;
4125	}
4126
4127	return true;
4128}
4129
4130static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4131{
4132	struct vcpu_svm *svm = to_svm(vcpu);
4133	struct kvm_cpuid_entry2 *best;
4134
4135	vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4136				    boot_cpu_has(X86_FEATURE_XSAVE) &&
4137				    boot_cpu_has(X86_FEATURE_XSAVES);
4138
4139	/* Update nrips enabled cache */
4140	svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
4141			     guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
4142
4143	svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
4144	svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
4145
4146	svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4147
4148	svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
4149			guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
4150
4151	svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
4152			guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
4153
4154	svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
4155
4156	svm_recalc_instruction_intercepts(vcpu, svm);
4157
4158	/* For sev guests, the memory encryption bit is not reserved in CR3.  */
4159	if (sev_guest(vcpu->kvm)) {
4160		best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
4161		if (best)
4162			vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4163	}
4164
4165	init_vmcb_after_set_cpuid(vcpu);
4166}
4167
4168static bool svm_has_wbinvd_exit(void)
4169{
4170	return true;
4171}
4172
4173#define PRE_EX(exit)  { .exit_code = (exit), \
4174			.stage = X86_ICPT_PRE_EXCEPT, }
4175#define POST_EX(exit) { .exit_code = (exit), \
4176			.stage = X86_ICPT_POST_EXCEPT, }
4177#define POST_MEM(exit) { .exit_code = (exit), \
4178			.stage = X86_ICPT_POST_MEMACCESS, }
4179
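/*
 * Map each emulator intercept to its SVM exit code and to the emulation stage
 * at which a nested hypervisor's intercept of the instruction is checked.
 */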
4180static const struct __x86_intercept {
4181	u32 exit_code;
4182	enum x86_intercept_stage stage;
4183} x86_intercept_map[] = {
4184	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
4185	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
4186	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
4187	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
4188	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
4189	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
4190	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
4191	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
4192	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
4193	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
4194	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
4195	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
4196	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
4197	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
4198	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
4199	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
4200	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
4201	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
4202	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
4203	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
4204	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
4205	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
4206	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
4207	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
4208	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
4209	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
4210	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
4211	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
4212	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
4213	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
4214	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
4215	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
4216	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
4217	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
4218	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
4219	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
4220	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
4221	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
4222	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
4223	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
4224	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
4225	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
4226	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
4227	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
4228	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
4229	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
4230	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
4231};
4232
4233#undef PRE_EX
4234#undef POST_EX
4235#undef POST_MEM
4236
4237static int svm_check_intercept(struct kvm_vcpu *vcpu,
4238			       struct x86_instruction_info *info,
4239			       enum x86_intercept_stage stage,
4240			       struct x86_exception *exception)
4241{
4242	struct vcpu_svm *svm = to_svm(vcpu);
4243	int vmexit, ret = X86EMUL_CONTINUE;
4244	struct __x86_intercept icpt_info;
4245	struct vmcb *vmcb = svm->vmcb;
4246
4247	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4248		goto out;
4249
4250	icpt_info = x86_intercept_map[info->intercept];
4251
4252	if (stage != icpt_info.stage)
4253		goto out;
4254
4255	switch (icpt_info.exit_code) {
4256	case SVM_EXIT_READ_CR0:
4257		if (info->intercept == x86_intercept_cr_read)
4258			icpt_info.exit_code += info->modrm_reg;
4259		break;
4260	case SVM_EXIT_WRITE_CR0: {
4261		unsigned long cr0, val;
4262
4263		if (info->intercept == x86_intercept_cr_write)
4264			icpt_info.exit_code += info->modrm_reg;
4265
4266		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4267		    info->intercept == x86_intercept_clts)
4268			break;
4269
4270		if (!(vmcb12_is_intercept(&svm->nested.ctl,
4271					INTERCEPT_SELECTIVE_CR0)))
4272			break;
4273
4274		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4275		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4276
4277		if (info->intercept == x86_intercept_lmsw) {
4278			cr0 &= 0xfUL;
4279			val &= 0xfUL;
4280			/* lmsw can't clear PE - catch this here */
4281			if (cr0 & X86_CR0_PE)
4282				val |= X86_CR0_PE;
4283		}
4284
4285		if (cr0 ^ val)
4286			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4287
4288		break;
4289	}
4290	case SVM_EXIT_READ_DR0:
4291	case SVM_EXIT_WRITE_DR0:
4292		icpt_info.exit_code += info->modrm_reg;
4293		break;
4294	case SVM_EXIT_MSR:
4295		if (info->intercept == x86_intercept_wrmsr)
4296			vmcb->control.exit_info_1 = 1;
4297		else
4298			vmcb->control.exit_info_1 = 0;
4299		break;
4300	case SVM_EXIT_PAUSE:
4301		/*
4302		 * We get this for NOP only, but PAUSE
4303		 * is REP NOP, so check for the REP prefix here.
4304		 */
4305		if (info->rep_prefix != REPE_PREFIX)
4306			goto out;
4307		break;
4308	case SVM_EXIT_IOIO: {
4309		u64 exit_info;
4310		u32 bytes;
4311
4312		if (info->intercept == x86_intercept_in ||
4313		    info->intercept == x86_intercept_ins) {
4314			exit_info = ((info->src_val & 0xffff) << 16) |
4315				SVM_IOIO_TYPE_MASK;
4316			bytes = info->dst_bytes;
4317		} else {
4318			exit_info = (info->dst_val & 0xffff) << 16;
4319			bytes = info->src_bytes;
4320		}
4321
4322		if (info->intercept == x86_intercept_outs ||
4323		    info->intercept == x86_intercept_ins)
4324			exit_info |= SVM_IOIO_STR_MASK;
4325
4326		if (info->rep_prefix)
4327			exit_info |= SVM_IOIO_REP_MASK;
4328
4329		bytes = min(bytes, 4u);
4330
4331		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4332
4333		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4334
4335		vmcb->control.exit_info_1 = exit_info;
4336		vmcb->control.exit_info_2 = info->next_rip;
4337
4338		break;
4339	}
4340	default:
4341		break;
4342	}
4343
4344	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4345	if (static_cpu_has(X86_FEATURE_NRIPS))
4346		vmcb->control.next_rip  = info->next_rip;
4347	vmcb->control.exit_code = icpt_info.exit_code;
4348	vmexit = nested_svm_exit_handled(svm);
4349
4350	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4351					   : X86EMUL_CONTINUE;
4352
4353out:
4354	return ret;
4355}
4356
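/*
 * A physical interrupt (SVM_EXIT_INTR) exit can only occur on an instruction
 * boundary; note that fact so the vCPU can be reported as preempted here.
 */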
4357static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4358{
4359	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4360		vcpu->arch.at_instruction_boundary = true;
4361}
4362
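/*
 * Shrink the pause-filter (PLE) window when the vCPU is scheduled back in,
 * unless PAUSE exiting is disabled for this VM.
 */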
4363static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4364{
4365	if (!kvm_pause_in_guest(vcpu->kvm))
4366		shrink_ple_window(vcpu);
4367}
4368
4369static void svm_setup_mce(struct kvm_vcpu *vcpu)
4370{
4371	/* [63:9] are reserved. */
4372	vcpu->arch.mcg_cap &= 0x1ff;
4373}
4374
4375#ifdef CONFIG_KVM_SMM
4376bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4377{
4378	struct vcpu_svm *svm = to_svm(vcpu);
4379
4380	/* Per APM Vol.2 15.22.2 "Response to SMI" */
4381	if (!gif_set(svm))
4382		return true;
4383
4384	return is_smm(vcpu);
4385}
4386
4387static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4388{
4389	struct vcpu_svm *svm = to_svm(vcpu);
4390	if (svm->nested.nested_run_pending)
4391		return -EBUSY;
4392
4393	if (svm_smi_blocked(vcpu))
4394		return 0;
4395
4396	/* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4397	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4398		return -EBUSY;
4399
4400	return 1;
4401}
4402
4403static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
4404{
4405	struct vcpu_svm *svm = to_svm(vcpu);
4406	struct kvm_host_map map_save;
4407	int ret;
4408
4409	if (!is_guest_mode(vcpu))
4410		return 0;
4411
4412	/*
4413	 * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
4414	 * responsible for ensuring nested SVM and SMIs are mutually exclusive.
4415	 */
4416
4417	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4418		return 1;
4419
4420	smram->smram64.svm_guest_flag = 1;
4421	smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
4422
4423	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4424	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4425	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4426
4427	ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4428	if (ret)
4429		return ret;
4430
4431	/*
4432	 * KVM uses VMCB01 to store L1 host state while L2 runs but
4433	 * VMCB01 is going to be used during SMM and thus the state will
4434	 * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
4435	 * area pointed to by MSR_VM_HSAVE_PA. The APM guarantees that the
4436	 * format of the area is identical to the guest save area, offset
4437	 * by 0x400 (matches the offset of 'struct vmcb_save_area'
4438	 * within 'struct vmcb'). Note: HSAVE area may also be used by
4439	 * L1 hypervisor to save additional host context (e.g. KVM does
4440	 * that, see svm_prepare_switch_to_guest()) which must be
4441	 * preserved.
4442	 */
4443	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4444		return 1;
4445
4446	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4447
4448	svm_copy_vmrun_state(map_save.hva + 0x400,
4449			     &svm->vmcb01.ptr->save);
4450
4451	kvm_vcpu_unmap(vcpu, &map_save, true);
4452	return 0;
4453}
4454
4455static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
4456{
4457	struct vcpu_svm *svm = to_svm(vcpu);
4458	struct kvm_host_map map, map_save;
4459	struct vmcb *vmcb12;
4460	int ret;
4461
4462	const struct kvm_smram_state_64 *smram64 = &smram->smram64;
4463
4464	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4465		return 0;
4466
4467	/* Non-zero if SMI arrived while vCPU was in guest mode. */
4468	if (!smram64->svm_guest_flag)
4469		return 0;
4470
4471	if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4472		return 1;
4473
4474	if (!(smram64->efer & EFER_SVME))
4475		return 1;
4476
4477	if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4478		return 1;
4479
4480	ret = 1;
4481	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4482		goto unmap_map;
4483
4484	if (svm_allocate_nested(svm))
4485		goto unmap_save;
4486
4487	/*
4488	 * Restore L1 host state from L1 HSAVE area as VMCB01 was
4489	 * used during SMM (see svm_enter_smm())
4490	 */
4491
4492	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4493
4494	/*
4495	 * Enter the nested guest now
4496	 */
4497
4498	vmcb_mark_all_dirty(svm->vmcb01.ptr);
4499
4500	vmcb12 = map.hva;
4501	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4502	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4503	ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
4504
4505	if (ret)
4506		goto unmap_save;
4507
4508	svm->nested.nested_run_pending = 1;
4509
4510unmap_save:
4511	kvm_vcpu_unmap(vcpu, &map_save, true);
4512unmap_map:
4513	kvm_vcpu_unmap(vcpu, &map, true);
4514	return ret;
4515}
4516
4517static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4518{
4519	struct vcpu_svm *svm = to_svm(vcpu);
4520
4521	if (!gif_set(svm)) {
4522		if (vgif)
4523			svm_set_intercept(svm, INTERCEPT_STGI);
4524		/* STGI will cause a vm exit */
4525	} else {
4526		/* We must be in SMM; RSM will cause a vmexit anyway.  */
4527	}
4528}
4529#endif
4530
4531static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4532					void *insn, int insn_len)
4533{
4534	bool smep, smap, is_user;
4535	unsigned long cr4;
4536	u64 error_code;
4537
4538	/* Emulation is always possible when KVM has access to all guest state. */
4539	if (!sev_guest(vcpu->kvm))
4540		return true;
4541
4542	/* #UD and #GP should never be intercepted for SEV guests. */
4543	WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4544				  EMULTYPE_TRAP_UD_FORCED |
4545				  EMULTYPE_VMWARE_GP));
4546
4547	/*
4548	 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4549	 * to guest register state.
4550	 */
4551	if (sev_es_guest(vcpu->kvm))
4552		return false;
4553
4554	/*
4555	 * Emulation is possible if the instruction is already decoded, e.g.
4556	 * when completing I/O after returning from userspace.
4557	 */
4558	if (emul_type & EMULTYPE_NO_DECODE)
4559		return true;
4560
4561	/*
4562	 * Emulation is possible for SEV guests if and only if a prefilled
4563	 * buffer containing the bytes of the intercepted instruction is
4564	 * available. SEV guest memory is encrypted with a guest specific key
4565	 * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
4566	 * decode garbage.
4567	 *
4568	 * Inject #UD if KVM reached this point without an instruction buffer.
4569	 * In practice, this path should never be hit by a well-behaved guest,
4570	 * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4571	 * is still theoretically reachable, e.g. via unaccelerated fault-like
4572	 * AVIC access, and needs to be handled by KVM to avoid putting the
4573	 * guest into an infinite loop.   Injecting #UD is somewhat arbitrary,
4574	 * but it's the least awful option given the lack of insight into the guest.
4575	 */
4576	if (unlikely(!insn)) {
4577		kvm_queue_exception(vcpu, UD_VECTOR);
4578		return false;
4579	}
4580
4581	/*
4582	 * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4583	 * will be empty if the DecodeAssist microcode cannot fetch bytes for
4584	 * the faulting instruction because the code fetch itself faulted, e.g.
4585	 * the guest attempted to fetch from emulated MMIO or a guest page
4586	 * table used to translate CS:RIP resides in emulated MMIO.
4587	 */
4588	if (likely(insn_len))
4589		return true;
4590
4591	/*
4592	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4593	 *
4594	 * Errata:
4595	 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4596	 * possible that CPU microcode implementing DecodeAssist will fail to
4597	 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4598	 * be '0'.  This happens because microcode reads CS:RIP using a _data_
4599	 * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
4600	 * gives up and does not fill the instruction bytes buffer.
4601	 *
4602	 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4603	 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4604	 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4605	 * GuestIntrBytes field of the VMCB.
4606	 *
4607	 * This does _not_ mean that the erratum has been encountered, as the
4608	 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4609	 * #PF, e.g. if the guest attempt to execute from emulated MMIO and
4610	 * encountered a reserved/not-present #PF.
4611	 *
4612	 * To hit the erratum, the following conditions must be true:
4613	 *    1. CR4.SMAP=1 (obviously).
4614	 *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4615	 *       have been hit as the guest would have encountered a SMEP
4616	 *       violation #PF, not a #NPF.
4617	 *    3. The #NPF is not due to a code fetch, in which case failure to
4618	 *       retrieve the instruction bytes is legitimate (see above).
4619	 *
4620	 * In addition, don't apply the erratum workaround if the #NPF occurred
4621	 * while translating guest page tables (see below).
4622	 */
4623	error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4624	if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4625		goto resume_guest;
4626
4627	cr4 = kvm_read_cr4(vcpu);
4628	smep = cr4 & X86_CR4_SMEP;
4629	smap = cr4 & X86_CR4_SMAP;
4630	is_user = svm_get_cpl(vcpu) == 3;
4631	if (smap && (!smep || is_user)) {
4632		pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
4633
4634		/*
4635		 * If the fault occurred in userspace, arbitrarily inject #GP
4636		 * to avoid killing the guest and to hopefully avoid confusing
4637		 * the guest kernel too much, e.g. injecting #PF would not be
4638		 * coherent with respect to the guest's page tables.  Request
4639		 * triple fault if the fault occurred in the kernel as there's
4640		 * no fault that KVM can inject without confusing the guest.
4641		 * In practice, the triple fault is moot as no sane SEV kernel
4642		 * will execute from user memory while also running with SMAP=1.
4643		 */
4644		if (is_user)
4645			kvm_inject_gp(vcpu, 0);
4646		else
4647			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4648	}
4649
4650resume_guest:
4651	/*
4652	 * If the erratum was not hit, simply resume the guest and let it fault
4653	 * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4654	 * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4655	 * userspace will kill the guest, and letting the emulator read garbage
4656	 * will yield random behavior and potentially corrupt the guest.
4657	 *
4658	 * Simply resuming the guest is technically not a violation of the SEV
4659	 * architecture.  AMD's APM states that all code fetches and page table
4660	 * accesses for SEV guest are encrypted, regardless of the C-Bit.  The
4661	 * APM also states that encrypted accesses to MMIO are "ignored", but
4662	 * doesn't explicitly define "ignored", i.e. doing nothing and letting
4663	 * the guest spin is technically "ignoring" the access.
4664	 */
4665	return false;
4666}
4667
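/*
 * Per the APM, INIT (like NMI and SMI) is held pending while GIF is clear,
 * so report INIT signals as blocked whenever GIF=0.
 */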
4668static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4669{
4670	struct vcpu_svm *svm = to_svm(vcpu);
4671
4672	return !gif_set(svm);
4673}
4674
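/*
 * KVM cannot directly rewrite an SEV-ES vCPU's encrypted register state, so
 * SIPI delivery for SEV-ES guests is handed off to the SEV-specific handler
 * instead of the common kvm_vcpu_deliver_sipi_vector().
 */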
4675static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4676{
4677	if (!sev_es_guest(vcpu->kvm))
4678		return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4679
4680	sev_vcpu_deliver_sipi_vector(vcpu, vector);
4681}
4682
4683static void svm_vm_destroy(struct kvm *kvm)
4684{
4685	avic_vm_destroy(kvm);
4686	sev_vm_destroy(kvm);
4687}
4688
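/*
 * Per-VM setup: disable PAUSE exiting entirely if pause filtering is
 * unavailable or disabled, and initialize AVIC state when APICv is enabled.
 */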
4689static int svm_vm_init(struct kvm *kvm)
4690{
4691	if (!pause_filter_count || !pause_filter_thresh)
4692		kvm->arch.pause_in_guest = true;
4693
4694	if (enable_apicv) {
4695		int ret = avic_vm_init(kvm);
4696		if (ret)
4697			return ret;
4698	}
4699
4700	return 0;
4701}
4702
4703static struct kvm_x86_ops svm_x86_ops __initdata = {
4704	.name = "kvm_amd",
4705
4706	.hardware_unsetup = svm_hardware_unsetup,
4707	.hardware_enable = svm_hardware_enable,
4708	.hardware_disable = svm_hardware_disable,
4709	.has_emulated_msr = svm_has_emulated_msr,
4710
4711	.vcpu_create = svm_vcpu_create,
4712	.vcpu_free = svm_vcpu_free,
4713	.vcpu_reset = svm_vcpu_reset,
4714
4715	.vm_size = sizeof(struct kvm_svm),
4716	.vm_init = svm_vm_init,
4717	.vm_destroy = svm_vm_destroy,
4718
4719	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
4720	.vcpu_load = svm_vcpu_load,
4721	.vcpu_put = svm_vcpu_put,
4722	.vcpu_blocking = avic_vcpu_blocking,
4723	.vcpu_unblocking = avic_vcpu_unblocking,
4724
4725	.update_exception_bitmap = svm_update_exception_bitmap,
4726	.get_msr_feature = svm_get_msr_feature,
4727	.get_msr = svm_get_msr,
4728	.set_msr = svm_set_msr,
4729	.get_segment_base = svm_get_segment_base,
4730	.get_segment = svm_get_segment,
4731	.set_segment = svm_set_segment,
4732	.get_cpl = svm_get_cpl,
4733	.get_cs_db_l_bits = svm_get_cs_db_l_bits,
4734	.set_cr0 = svm_set_cr0,
4735	.post_set_cr3 = sev_post_set_cr3,
4736	.is_valid_cr4 = svm_is_valid_cr4,
4737	.set_cr4 = svm_set_cr4,
4738	.set_efer = svm_set_efer,
4739	.get_idt = svm_get_idt,
4740	.set_idt = svm_set_idt,
4741	.get_gdt = svm_get_gdt,
4742	.set_gdt = svm_set_gdt,
4743	.set_dr7 = svm_set_dr7,
4744	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4745	.cache_reg = svm_cache_reg,
4746	.get_rflags = svm_get_rflags,
4747	.set_rflags = svm_set_rflags,
4748	.get_if_flag = svm_get_if_flag,
4749
4750	.flush_tlb_all = svm_flush_tlb_current,
4751	.flush_tlb_current = svm_flush_tlb_current,
4752	.flush_tlb_gva = svm_flush_tlb_gva,
4753	.flush_tlb_guest = svm_flush_tlb_current,
4754
4755	.vcpu_pre_run = svm_vcpu_pre_run,
4756	.vcpu_run = svm_vcpu_run,
4757	.handle_exit = svm_handle_exit,
4758	.skip_emulated_instruction = svm_skip_emulated_instruction,
4759	.update_emulated_instruction = NULL,
4760	.set_interrupt_shadow = svm_set_interrupt_shadow,
4761	.get_interrupt_shadow = svm_get_interrupt_shadow,
4762	.patch_hypercall = svm_patch_hypercall,
4763	.inject_irq = svm_inject_irq,
4764	.inject_nmi = svm_inject_nmi,
4765	.inject_exception = svm_inject_exception,
4766	.cancel_injection = svm_cancel_injection,
4767	.interrupt_allowed = svm_interrupt_allowed,
4768	.nmi_allowed = svm_nmi_allowed,
4769	.get_nmi_mask = svm_get_nmi_mask,
4770	.set_nmi_mask = svm_set_nmi_mask,
4771	.enable_nmi_window = svm_enable_nmi_window,
4772	.enable_irq_window = svm_enable_irq_window,
4773	.update_cr8_intercept = svm_update_cr8_intercept,
4774	.set_virtual_apic_mode = avic_set_virtual_apic_mode,
4775	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4776	.check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
4777	.apicv_post_state_restore = avic_apicv_post_state_restore,
4778
4779	.get_exit_info = svm_get_exit_info,
4780
4781	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4782
4783	.has_wbinvd_exit = svm_has_wbinvd_exit,
4784
4785	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
4786	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4787	.write_tsc_offset = svm_write_tsc_offset,
4788	.write_tsc_multiplier = svm_write_tsc_multiplier,
4789
4790	.load_mmu_pgd = svm_load_mmu_pgd,
4791
4792	.check_intercept = svm_check_intercept,
4793	.handle_exit_irqoff = svm_handle_exit_irqoff,
4794
4795	.request_immediate_exit = __kvm_request_immediate_exit,
4796
4797	.sched_in = svm_sched_in,
4798
4799	.nested_ops = &svm_nested_ops,
4800
4801	.deliver_interrupt = svm_deliver_interrupt,
4802	.pi_update_irte = avic_pi_update_irte,
4803	.setup_mce = svm_setup_mce,
4804
4805#ifdef CONFIG_KVM_SMM
4806	.smi_allowed = svm_smi_allowed,
4807	.enter_smm = svm_enter_smm,
4808	.leave_smm = svm_leave_smm,
4809	.enable_smi_window = svm_enable_smi_window,
4810#endif
4811
4812	.mem_enc_ioctl = sev_mem_enc_ioctl,
4813	.mem_enc_register_region = sev_mem_enc_register_region,
4814	.mem_enc_unregister_region = sev_mem_enc_unregister_region,
4815	.guest_memory_reclaimed = sev_guest_memory_reclaimed,
4816
4817	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4818	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
4819
4820	.can_emulate_instruction = svm_can_emulate_instruction,
4821
4822	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
4823
4824	.msr_filter_changed = svm_msr_filter_changed,
4825	.complete_emulated_msr = svm_complete_emulated_msr,
4826
4827	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
4828	.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
4829};
4830
4831/*
4832 * The default MMIO mask is a single bit (excluding the present bit),
4833 * which could conflict with the memory encryption bit. Check for
4834 * memory encryption support and override the default MMIO mask if
4835 * memory encryption is enabled.
4836 */
4837static __init void svm_adjust_mmio_mask(void)
4838{
4839	unsigned int enc_bit, mask_bit;
4840	u64 msr, mask;
4841
4842	/* If there is no memory encryption support, use existing mask */
4843	if (cpuid_eax(0x80000000) < 0x8000001f)
4844		return;
4845
4846	/* If memory encryption is not enabled, use existing mask */
4847	rdmsrl(MSR_AMD64_SYSCFG, msr);
4848	if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
4849		return;
4850
4851	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
4852	mask_bit = boot_cpu_data.x86_phys_bits;
4853
4854	/* Increment the mask bit if it is the same as the encryption bit */
4855	if (enc_bit == mask_bit)
4856		mask_bit++;
4857
4858	/*
4859	 * If the mask bit location is below 52, then some bits above the
4860	 * physical addressing limit will always be reserved, so use the
4861	 * rsvd_bits() function to generate the mask. This mask, along with
4862	 * the present bit, will be used to generate a page fault with
4863	 * PFER.RSV = 1.
4864	 *
4865	 * If the mask bit location is 52 (or above), then clear the mask.
4866	 */
4867	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
4868
4869	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
4870}
4871
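/*
 * Compute the CPUID capabilities KVM advertises to userspace, factoring in
 * host support and the SVM module parameters.
 */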
4872static __init void svm_set_cpu_caps(void)
4873{
4874	kvm_set_cpu_caps();
4875
4876	kvm_caps.supported_perf_cap = 0;
4877	kvm_caps.supported_xss = 0;
4878
4879	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
4880	if (nested) {
4881		kvm_cpu_cap_set(X86_FEATURE_SVM);
4882		kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
4883
4884		if (nrips)
4885			kvm_cpu_cap_set(X86_FEATURE_NRIPS);
4886
4887		if (npt_enabled)
4888			kvm_cpu_cap_set(X86_FEATURE_NPT);
4889
4890		if (tsc_scaling)
4891			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
4892
4893		if (vls)
4894			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
4895		if (lbrv)
4896			kvm_cpu_cap_set(X86_FEATURE_LBRV);
4897
4898		if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
4899			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
4900
4901		if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
4902			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
4903
4904		if (vgif)
4905			kvm_cpu_cap_set(X86_FEATURE_VGIF);
4906
4907		/* Nested VM can receive #VMEXIT instead of triggering #GP */
4908		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
4909	}
4910
4911	/* CPUID 0x80000008 */
4912	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
4913	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
4914		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
4915
4916	/* AMD PMU PERFCTR_CORE CPUID */
4917	if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
4918		kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
4919
4920	/* CPUID 0x8000001F (SME/SEV features) */
4921	sev_set_cpu_caps();
4922}
4923
4924static __init int svm_hardware_setup(void)
4925{
4926	int cpu;
4927	struct page *iopm_pages;
4928	void *iopm_va;
4929	int r;
4930	unsigned int order = get_order(IOPM_SIZE);
4931
4932	/*
4933	 * NX is required for shadow paging and for NPT if the NX huge pages
4934	 * mitigation is enabled.
4935	 */
4936	if (!boot_cpu_has(X86_FEATURE_NX)) {
4937		pr_err_ratelimited("NX (Execute Disable) not supported\n");
4938		return -EOPNOTSUPP;
4939	}
4940	kvm_enable_efer_bits(EFER_NX);
4941
4942	iopm_pages = alloc_pages(GFP_KERNEL, order);
4943
4944	if (!iopm_pages)
4945		return -ENOMEM;
4946
4947	iopm_va = page_address(iopm_pages);
4948	memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
4949	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
4950
4951	init_msrpm_offsets();
4952
4953	kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
4954				     XFEATURE_MASK_BNDCSR);
4955
4956	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
4957		kvm_enable_efer_bits(EFER_FFXSR);
4958
4959	if (tsc_scaling) {
4960		if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
4961			tsc_scaling = false;
4962		} else {
4963			pr_info("TSC scaling supported\n");
4964			kvm_caps.has_tsc_control = true;
4965		}
4966	}
4967	kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
4968	kvm_caps.tsc_scaling_ratio_frac_bits = 32;
4969
4970	tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
4971
4972	/* Check for pause filtering support */
4973	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
4974		pause_filter_count = 0;
4975		pause_filter_thresh = 0;
4976	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
4977		pause_filter_thresh = 0;
4978	}
4979
4980	if (nested) {
4981		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
4982		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
4983	}
4984
4985	/*
4986	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
4987	 * NPT isn't supported if the host is using 2-level paging since host
4988	 * CR4 is unchanged on VMRUN.
4989	 */
4990	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
4991		npt_enabled = false;
4992
4993	if (!boot_cpu_has(X86_FEATURE_NPT))
4994		npt_enabled = false;
4995
4996	/* Force VM NPT level equal to the host's paging level */
4997	kvm_configure_mmu(npt_enabled, get_npt_level(),
4998			  get_npt_level(), PG_LEVEL_1G);
4999	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5000
5001	/* Setup shadow_me_value and shadow_me_mask */
5002	kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5003
5004	svm_adjust_mmio_mask();
5005
5006	/*
5007	 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
5008	 * may be modified by svm_adjust_mmio_mask()).
5009	 */
5010	sev_hardware_setup();
5011
5012	svm_hv_hardware_setup();
5013
5014	for_each_possible_cpu(cpu) {
5015		r = svm_cpu_init(cpu);
5016		if (r)
5017			goto err;
5018	}
5019
5020	if (nrips) {
5021		if (!boot_cpu_has(X86_FEATURE_NRIPS))
5022			nrips = false;
5023	}
5024
5025	enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
5026
5027	if (!enable_apicv) {
5028		svm_x86_ops.vcpu_blocking = NULL;
5029		svm_x86_ops.vcpu_unblocking = NULL;
5030		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5031	}
5032
5033	if (vls) {
5034		if (!npt_enabled ||
5035		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5036		    !IS_ENABLED(CONFIG_X86_64)) {
5037			vls = false;
5038		} else {
5039			pr_info("Virtual VMLOAD VMSAVE supported\n");
5040		}
5041	}
5042
5043	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5044		svm_gp_erratum_intercept = false;
5045
5046	if (vgif) {
5047		if (!boot_cpu_has(X86_FEATURE_VGIF))
5048			vgif = false;
5049		else
5050			pr_info("Virtual GIF supported\n");
5051	}
5052
5053	if (lbrv) {
5054		if (!boot_cpu_has(X86_FEATURE_LBRV))
5055			lbrv = false;
5056		else
5057			pr_info("LBR virtualization supported\n");
5058	}
5059
5060	if (!enable_pmu)
5061		pr_info("PMU virtualization is disabled\n");
5062
5063	svm_set_cpu_caps();
5064
5065	/*
5066	 * It seems that on AMD processors the PTE's Accessed bit is
5067	 * being set by the CPU hardware before the NPF vmexit.
5068	 * This is not expected behaviour and our tests fail because
5069	 * of it.
5070	 * A workaround here is to disable support for
5071	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
5072	 * In this case userspace can query support via the
5073	 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
5074	 * it.
5075	 * If future AMD CPU models change the behaviour described above,
5076	 * this variable can be changed accordingly.
5077	 */
5078	allow_smaller_maxphyaddr = !npt_enabled;
5079
5080	return 0;
5081
5082err:
5083	svm_hardware_unsetup();
5084	return r;
5085}
5086
5087
5088static struct kvm_x86_init_ops svm_init_ops __initdata = {
5089	.cpu_has_kvm_support = has_svm,
5090	.disabled_by_bios = is_disabled,
5091	.hardware_setup = svm_hardware_setup,
5092	.check_processor_compatibility = svm_check_processor_compat,
5093
5094	.runtime_ops = &svm_x86_ops,
5095	.pmu_ops = &amd_pmu_ops,
5096};
5097
5098static int __init svm_init(void)
5099{
5100	__unused_size_checks();
5101
5102	return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
5103			__alignof__(struct vcpu_svm), THIS_MODULE);
5104}
5105
5106static void __exit svm_exit(void)
5107{
5108	kvm_exit();
5109}
5110
5111module_init(svm_init)
5112module_exit(svm_exit)
v6.8
   1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   2
   3#include <linux/kvm_host.h>
   4
   5#include "irq.h"
   6#include "mmu.h"
   7#include "kvm_cache_regs.h"
   8#include "x86.h"
   9#include "smm.h"
  10#include "cpuid.h"
  11#include "pmu.h"
  12
  13#include <linux/module.h>
  14#include <linux/mod_devicetable.h>
  15#include <linux/kernel.h>
  16#include <linux/vmalloc.h>
  17#include <linux/highmem.h>
  18#include <linux/amd-iommu.h>
  19#include <linux/sched.h>
  20#include <linux/trace_events.h>
  21#include <linux/slab.h>
  22#include <linux/hashtable.h>
  23#include <linux/objtool.h>
  24#include <linux/psp-sev.h>
  25#include <linux/file.h>
  26#include <linux/pagemap.h>
  27#include <linux/swap.h>
  28#include <linux/rwsem.h>
  29#include <linux/cc_platform.h>
  30#include <linux/smp.h>
  31
  32#include <asm/apic.h>
  33#include <asm/perf_event.h>
  34#include <asm/tlbflush.h>
  35#include <asm/desc.h>
  36#include <asm/debugreg.h>
  37#include <asm/kvm_para.h>
  38#include <asm/irq_remapping.h>
  39#include <asm/spec-ctrl.h>
  40#include <asm/cpu_device_id.h>
  41#include <asm/traps.h>
  42#include <asm/reboot.h>
  43#include <asm/fpu/api.h>
  44
  45#include <trace/events/ipi.h>
  46
  47#include "trace.h"
  48
  49#include "svm.h"
  50#include "svm_ops.h"
  51
  52#include "kvm_onhyperv.h"
  53#include "svm_onhyperv.h"
  54
  55MODULE_AUTHOR("Qumranet");
  56MODULE_LICENSE("GPL");
  57
  58#ifdef MODULE
  59static const struct x86_cpu_id svm_cpu_id[] = {
  60	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
  61	{}
  62};
  63MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  64#endif
  65
  66#define SEG_TYPE_LDT 2
  67#define SEG_TYPE_BUSY_TSS16 3
  68
  69static bool erratum_383_found __read_mostly;
  70
  71u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
  72
  73/*
  74 * Set osvw_len to higher value when updated Revision Guides
  75 * are published and we know what the new status bits are
  76 */
  77static uint64_t osvw_len = 4, osvw_status;
  78
  79static DEFINE_PER_CPU(u64, current_tsc_ratio);
  80
  81#define X2APIC_MSR(x)	(APIC_BASE_MSR + (x >> 4))
  82
  83static const struct svm_direct_access_msrs {
  84	u32 index;   /* Index of the MSR */
  85	bool always; /* True if intercept is initially cleared */
  86} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
  87	{ .index = MSR_STAR,				.always = true  },
  88	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
  89	{ .index = MSR_IA32_SYSENTER_EIP,		.always = false },
  90	{ .index = MSR_IA32_SYSENTER_ESP,		.always = false },
  91#ifdef CONFIG_X86_64
  92	{ .index = MSR_GS_BASE,				.always = true  },
  93	{ .index = MSR_FS_BASE,				.always = true  },
  94	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
  95	{ .index = MSR_LSTAR,				.always = true  },
  96	{ .index = MSR_CSTAR,				.always = true  },
  97	{ .index = MSR_SYSCALL_MASK,			.always = true  },
  98#endif
  99	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
 100	{ .index = MSR_IA32_PRED_CMD,			.always = false },
 101	{ .index = MSR_IA32_FLUSH_CMD,			.always = false },
 102	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
 103	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
 104	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
 105	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
 106	{ .index = MSR_IA32_XSS,			.always = false },
 107	{ .index = MSR_EFER,				.always = false },
 108	{ .index = MSR_IA32_CR_PAT,			.always = false },
 109	{ .index = MSR_AMD64_SEV_ES_GHCB,		.always = true  },
 110	{ .index = MSR_TSC_AUX,				.always = false },
 111	{ .index = X2APIC_MSR(APIC_ID),			.always = false },
 112	{ .index = X2APIC_MSR(APIC_LVR),		.always = false },
 113	{ .index = X2APIC_MSR(APIC_TASKPRI),		.always = false },
 114	{ .index = X2APIC_MSR(APIC_ARBPRI),		.always = false },
 115	{ .index = X2APIC_MSR(APIC_PROCPRI),		.always = false },
 116	{ .index = X2APIC_MSR(APIC_EOI),		.always = false },
 117	{ .index = X2APIC_MSR(APIC_RRR),		.always = false },
 118	{ .index = X2APIC_MSR(APIC_LDR),		.always = false },
 119	{ .index = X2APIC_MSR(APIC_DFR),		.always = false },
 120	{ .index = X2APIC_MSR(APIC_SPIV),		.always = false },
 121	{ .index = X2APIC_MSR(APIC_ISR),		.always = false },
 122	{ .index = X2APIC_MSR(APIC_TMR),		.always = false },
 123	{ .index = X2APIC_MSR(APIC_IRR),		.always = false },
 124	{ .index = X2APIC_MSR(APIC_ESR),		.always = false },
 125	{ .index = X2APIC_MSR(APIC_ICR),		.always = false },
 126	{ .index = X2APIC_MSR(APIC_ICR2),		.always = false },
 127
 128	/*
 129	 * Note:
 130	 * AMD does not virtualize APIC TSC-deadline timer mode, but it is
 131	 * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
 132	 * the AVIC hardware would generate GP fault. Therefore, always
 133	 * intercept the MSR 0x832, and do not setup direct_access_msr.
 134	 */
 135	{ .index = X2APIC_MSR(APIC_LVTTHMR),		.always = false },
 136	{ .index = X2APIC_MSR(APIC_LVTPC),		.always = false },
 137	{ .index = X2APIC_MSR(APIC_LVT0),		.always = false },
 138	{ .index = X2APIC_MSR(APIC_LVT1),		.always = false },
 139	{ .index = X2APIC_MSR(APIC_LVTERR),		.always = false },
 140	{ .index = X2APIC_MSR(APIC_TMICT),		.always = false },
 141	{ .index = X2APIC_MSR(APIC_TMCCT),		.always = false },
 142	{ .index = X2APIC_MSR(APIC_TDCR),		.always = false },
 143	{ .index = MSR_INVALID,				.always = false },
 144};
 145
 146/*
 147 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 148 * pause_filter_count: On processors that support Pause filtering(indicated
 149 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 150 *	count value. On VMRUN this value is loaded into an internal counter.
 151 *	Each time a pause instruction is executed, this counter is decremented
 152 *	until it reaches zero at which time a #VMEXIT is generated if pause
 153 *	intercept is enabled. Refer to  AMD APM Vol 2 Section 15.14.4 Pause
 154 *	Intercept Filtering for more details.
 155 *	This also indicates whether PLE logic is enabled.
 156 *
 157 * pause_filter_thresh: In addition, some processor families support advanced
 158 *	pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
 159 *	the amount of time a guest is allowed to execute in a pause loop.
 160 *	In this mode, a 16-bit pause filter threshold field is added in the
 161 *	VMCB. The threshold value is a cycle count that is used to reset the
 162 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 163 *	count value from VMCB into an internal counter. Then, on each pause
 164 *	instruction the hardware checks the elapsed number of cycles since
 165 *	the most recent pause instruction against the pause filter threshold.
 166 *	If the elapsed cycle count is greater than the pause filter threshold,
 167 *	then the internal pause count is reloaded from the VMCB and execution
 168 *	continues. If the elapsed cycle count is less than the pause filter
 169 *	threshold, then the internal pause count is decremented. If the count
 170 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 171 *	triggered. If advanced pause filtering is supported and pause filter
 172 *	threshold field is set to zero, the filter will operate in the simpler,
 173 *	count only mode.
 174 */
 175
 176static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
 177module_param(pause_filter_thresh, ushort, 0444);
 178
 179static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
 180module_param(pause_filter_count, ushort, 0444);
 181
 182/* Default doubles per-vcpu window every exit. */
 183static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
 184module_param(pause_filter_count_grow, ushort, 0444);
 185
 186/* Default resets per-vcpu window every exit to pause_filter_count. */
 187static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
 188module_param(pause_filter_count_shrink, ushort, 0444);
 189
 190/* Default is to compute the maximum so we can never overflow. */
 191static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
 192module_param(pause_filter_count_max, ushort, 0444);
 193
 194/*
 195 * Use nested page tables by default.  Note, NPT may get forced off by
 196 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 197 */
 198bool npt_enabled = true;
 199module_param_named(npt, npt_enabled, bool, 0444);
 200
 201/* allow nested virtualization in KVM/SVM */
 202static int nested = true;
 203module_param(nested, int, 0444);
 204
 205/* enable/disable Next RIP Save */
 206int nrips = true;
 207module_param(nrips, int, 0444);
 208
 209/* enable/disable Virtual VMLOAD VMSAVE */
 210static int vls = true;
 211module_param(vls, int, 0444);
 212
 213/* enable/disable Virtual GIF */
 214int vgif = true;
 215module_param(vgif, int, 0444);
 216
 217/* enable/disable LBR virtualization */
 218static int lbrv = true;
 219module_param(lbrv, int, 0444);
 220
 221static int tsc_scaling = true;
 222module_param(tsc_scaling, int, 0444);
 223
 224/*
 225 * Enable/disable AVIC.  Because the defaults for APICv support
 226 * differ between VMX and SVM, we cannot use module_param_named.
 227 */
 228static bool avic;
 229module_param(avic, bool, 0444);
 230
 231bool __read_mostly dump_invalid_vmcb;
 232module_param(dump_invalid_vmcb, bool, 0644);
 233
 234
 235bool intercept_smi = true;
 236module_param(intercept_smi, bool, 0444);
 237
 238bool vnmi = true;
 239module_param(vnmi, bool, 0444);
 240
 241static bool svm_gp_erratum_intercept = true;
 242
 243static u8 rsm_ins_bytes[] = "\x0f\xaa";
 244
 245static unsigned long iopm_base;
 246
 247DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
 248
 249/*
 250 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 251 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 252 *
 253 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 254 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 255 */
 256static int tsc_aux_uret_slot __read_mostly = -1;
 257
 258static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 259
 260#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 261#define MSRS_RANGE_SIZE 2048
 262#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 263
 264u32 svm_msrpm_offset(u32 msr)
 265{
 266	u32 offset;
 267	int i;
 268
 269	for (i = 0; i < NUM_MSR_MAPS; i++) {
 270		if (msr < msrpm_ranges[i] ||
 271		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 272			continue;
 273
 274		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 275		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 276
 277		/* Now we have the u8 offset - but need the u32 offset */
 278		return offset / 4;
 279	}
 280
 281	/* MSR not in any range */
 282	return MSR_INVALID;
 283}
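/*
 * Worked example, derived from the constants above: for MSR_LSTAR
 * (0xc0000082) the second range (base 0xc0000000) matches, so
 * offset = (0x82 / 4) + 1 * MSRS_RANGE_SIZE = 32 + 2048 = 2080 bytes, and the
 * function returns 2080 / 4 = 520.  In other words, the u32 at index 520 of
 * the MSR permission map holds the read/write intercept bits for the MSR
 * group 0xc0000080..0xc000008f.
 */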
 284
 285static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
 286
 287static int get_npt_level(void)
 288{
 289#ifdef CONFIG_X86_64
 290	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
 291#else
 292	return PT32E_ROOT_LEVEL;
 293#endif
 294}
 295
 296int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 297{
 298	struct vcpu_svm *svm = to_svm(vcpu);
 299	u64 old_efer = vcpu->arch.efer;
 300	vcpu->arch.efer = efer;
 301
 302	if (!npt_enabled) {
 303		/* Shadow paging assumes NX to be available.  */
 304		efer |= EFER_NX;
 305
 306		if (!(efer & EFER_LMA))
 307			efer &= ~EFER_LME;
 308	}
 309
 310	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
 311		if (!(efer & EFER_SVME)) {
 312			svm_leave_nested(vcpu);
 313			svm_set_gif(svm, true);
 314			/* #GP intercept is still needed for vmware backdoor */
 315			if (!enable_vmware_backdoor)
 316				clr_exception_intercept(svm, GP_VECTOR);
 317
 318			/*
 319			 * Free the nested guest state, unless we are in SMM.
 320			 * In this case we will return to the nested guest
 321			 * as soon as we leave SMM.
 322			 */
 323			if (!is_smm(vcpu))
 324				svm_free_nested(svm);
 325
 326		} else {
 327			int ret = svm_allocate_nested(svm);
 328
 329			if (ret) {
 330				vcpu->arch.efer = old_efer;
 331				return ret;
 332			}
 333
 334			/*
 335			 * Never intercept #GP for SEV guests, KVM can't
 336			 * decrypt guest memory to workaround the erratum.
 337			 */
 338			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
 339				set_exception_intercept(svm, GP_VECTOR);
 340		}
 341	}
 342
 343	svm->vmcb->save.efer = efer | EFER_SVME;
 344	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
 345	return 0;
 346}
 347
 348static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 349{
 350	struct vcpu_svm *svm = to_svm(vcpu);
 351	u32 ret = 0;
 352
 353	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 354		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 355	return ret;
 356}
 357
 358static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 359{
 360	struct vcpu_svm *svm = to_svm(vcpu);
 361
 362	if (mask == 0)
 363		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 364	else
 365		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 366
 367}
 368
 369static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
 370					   bool commit_side_effects)
 371{
 372	struct vcpu_svm *svm = to_svm(vcpu);
 373	unsigned long old_rflags;
 374
 375	/*
 376	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
 377	 * the type of exit and the #VC handler in the guest.
 378	 */
 379	if (sev_es_guest(vcpu->kvm))
 380		goto done;
 381
 382	if (nrips && svm->vmcb->control.next_rip != 0) {
 383		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 384		svm->next_rip = svm->vmcb->control.next_rip;
 385	}
 386
 387	if (!svm->next_rip) {
 388		if (unlikely(!commit_side_effects))
 389			old_rflags = svm->vmcb->save.rflags;
 390
 391		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
 392			return 0;
 393
 394		if (unlikely(!commit_side_effects))
 395			svm->vmcb->save.rflags = old_rflags;
 396	} else {
 397		kvm_rip_write(vcpu, svm->next_rip);
 398	}
 399
 400done:
 401	if (likely(commit_side_effects))
 402		svm_set_interrupt_shadow(vcpu, 0);
 403
 404	return 1;
 405}
 406
 407static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 408{
 409	return __svm_skip_emulated_instruction(vcpu, true);
 410}
 411
 412static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
 413{
 414	unsigned long rip, old_rip = kvm_rip_read(vcpu);
 415	struct vcpu_svm *svm = to_svm(vcpu);
 416
 417	/*
 418	 * Due to architectural shortcomings, the CPU doesn't always provide
 419	 * NextRIP, e.g. if KVM intercepted an exception that occurred while
 420	 * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
 421	 * the instruction even if NextRIP is supported to acquire the next
 422	 * RIP so that it can be shoved into the NextRIP field, otherwise
 423	 * hardware will fail to advance guest RIP during event injection.
 424	 * Drop the exception/interrupt if emulation fails and effectively
 425	 * retry the instruction, it's the least awful option.  If NRIPS is
 426	 * in use, the skip must not commit any side effects such as clearing
 427	 * the interrupt shadow or RFLAGS.RF.
 428	 */
 429	if (!__svm_skip_emulated_instruction(vcpu, !nrips))
 430		return -EIO;
 431
 432	rip = kvm_rip_read(vcpu);
 433
 434	/*
 435	 * Save the injection information, even when using next_rip, as the
 436	 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
 437	 * doesn't complete due to a VM-Exit occurring while the CPU is
 438	 * vectoring the event.   Decoding the instruction isn't guaranteed to
 439	 * work as there may be no backing instruction, e.g. if the event is
 440	 * being injected by L1 for L2, or if the guest is patching INT3 into
 441	 * a different instruction.
 442	 */
 443	svm->soft_int_injected = true;
 444	svm->soft_int_csbase = svm->vmcb->save.cs.base;
 445	svm->soft_int_old_rip = old_rip;
 446	svm->soft_int_next_rip = rip;
 447
 448	if (nrips)
 449		kvm_rip_write(vcpu, old_rip);
 450
 451	if (static_cpu_has(X86_FEATURE_NRIPS))
 452		svm->vmcb->control.next_rip = rip;
 453
 454	return 0;
 455}
 456
 457static void svm_inject_exception(struct kvm_vcpu *vcpu)
 458{
 459	struct kvm_queued_exception *ex = &vcpu->arch.exception;
 460	struct vcpu_svm *svm = to_svm(vcpu);
 461
 462	kvm_deliver_exception_payload(vcpu, ex);
 463
 464	if (kvm_exception_is_soft(ex->vector) &&
 465	    svm_update_soft_interrupt_rip(vcpu))
 466		return;
 467
 468	svm->vmcb->control.event_inj = ex->vector
 469		| SVM_EVTINJ_VALID
 470		| (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 471		| SVM_EVTINJ_TYPE_EXEPT;
 472	svm->vmcb->control.event_inj_err = ex->error_code;
 473}
 474
 475static void svm_init_erratum_383(void)
 476{
 477	u32 low, high;
 478	int err;
 479	u64 val;
 480
 481	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 482		return;
 483
 484	/* Use _safe variants to not break nested virtualization */
 485	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 486	if (err)
 487		return;
 488
 489	val |= (1ULL << 47);
 490
 491	low  = lower_32_bits(val);
 492	high = upper_32_bits(val);
 493
 494	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 495
 496	erratum_383_found = true;
 497}
 498
 499static void svm_init_osvw(struct kvm_vcpu *vcpu)
 500{
 501	/*
 502	 * Guests should see errata 400 and 415 as fixed (assuming that
 503	 * HLT and IO instructions are intercepted).
 504	 */
 505	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 506	vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 507
 508	/*
 509	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
 510	 * all osvw.status bits inside that length, including bit 0 (which is
 511	 * reserved for erratum 298), are valid. However, if host processor's
 512	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
 513	 * be conservative here and therefore we tell the guest that erratum 298
 514	 * is present (because we really don't know).
 515	 */
 516	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 517		vcpu->arch.osvw.status |= 1;
 518}
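/*
 * Example of the resulting guest view (assuming, per the comment above, that
 * osvw.status bits 1 and 2 correspond to errata 400 and 415): on a host with
 * osvw_len == 0 and family 0x10 the guest sees osvw.length == 3, bits 1-2
 * clear (errata 400/415 reported as fixed, since ~6ULL masks them off), and
 * bit 0 set (erratum 298 conservatively reported as present).
 */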
 519
 520static bool __kvm_is_svm_supported(void)
 521{
 522	int cpu = smp_processor_id();
 523	struct cpuinfo_x86 *c = &cpu_data(cpu);
 524
 525	if (c->x86_vendor != X86_VENDOR_AMD &&
 526	    c->x86_vendor != X86_VENDOR_HYGON) {
 527		pr_err("CPU %d isn't AMD or Hygon\n", cpu);
 528		return false;
 529	}
 530
 531	if (!cpu_has(c, X86_FEATURE_SVM)) {
 532		pr_err("SVM not supported by CPU %d\n", cpu);
 533		return false;
 534	}
 535
 536	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
 537		pr_info("KVM is unsupported when running as an SEV guest\n");
 538		return false;
 539	}
 540
 541	return true;
 542}
 543
 544static bool kvm_is_svm_supported(void)
 545{
 546	bool supported;
 547
 548	migrate_disable();
 549	supported = __kvm_is_svm_supported();
 550	migrate_enable();
 551
 552	return supported;
 553}
 554
 555static int svm_check_processor_compat(void)
 556{
 557	if (!__kvm_is_svm_supported())
 558		return -EIO;
 559
 560	return 0;
 561}
 562
 563static void __svm_write_tsc_multiplier(u64 multiplier)
 564{
 565	if (multiplier == __this_cpu_read(current_tsc_ratio))
 566		return;
 567
 568	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
 569	__this_cpu_write(current_tsc_ratio, multiplier);
 570}
 571
 572static inline void kvm_cpu_svm_disable(void)
 573{
 574	uint64_t efer;
 575
 576	wrmsrl(MSR_VM_HSAVE_PA, 0);
 577	rdmsrl(MSR_EFER, efer);
 578	if (efer & EFER_SVME) {
 579		/*
 580		 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
 581		 * NMI aren't blocked.
 582		 */
 583		stgi();
 584		wrmsrl(MSR_EFER, efer & ~EFER_SVME);
 585	}
 586}
 587
 588static void svm_emergency_disable(void)
 589{
 590	kvm_rebooting = true;
 591
 592	kvm_cpu_svm_disable();
 593}
 594
 595static void svm_hardware_disable(void)
 596{
 597	/* Make sure we clean up behind us */
 598	if (tsc_scaling)
 599		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 600
 601	kvm_cpu_svm_disable();
 602
 603	amd_pmu_disable_virt();
 604}
 605
 606static int svm_hardware_enable(void)
 607{
 608
 609	struct svm_cpu_data *sd;
 610	uint64_t efer;
 611	int me = raw_smp_processor_id();
 612
 613	rdmsrl(MSR_EFER, efer);
 614	if (efer & EFER_SVME)
 615		return -EBUSY;
 616
 617	sd = per_cpu_ptr(&svm_data, me);
 618	sd->asid_generation = 1;
 619	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 620	sd->next_asid = sd->max_asid + 1;
 621	sd->min_asid = max_sev_asid + 1;
 622
 623	wrmsrl(MSR_EFER, efer | EFER_SVME);
 624
 625	wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
 626
 627	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 628		/*
 629		 * Set the default value, even if we don't use TSC scaling,
 630		 * to avoid leaving a stale value in the MSR.
 631		 */
 632		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
 633	}
 634
 635
 636	/*
 637	 * Get OSVW bits.
 638	 *
 639	 * Note that it is possible to have a system with mixed processor
 640	 * revisions and therefore different OSVW bits. If bits are not the same
 641	 * on different processors then choose the worst case (i.e. if erratum
 642	 * is present on one processor and not on another then assume that the
 643	 * erratum is present everywhere).
 644	 */
 645	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 646		uint64_t len, status = 0;
 647		int err;
 648
 649		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 650		if (!err)
 651			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 652						      &err);
 653
 654		if (err)
 655			osvw_status = osvw_len = 0;
 656		else {
 657			if (len < osvw_len)
 658				osvw_len = len;
 659			osvw_status |= status;
 660			osvw_status &= (1ULL << osvw_len) - 1;
 661		}
 662	} else
 663		osvw_status = osvw_len = 0;
 664
 665	svm_init_erratum_383();
 666
 667	amd_pmu_enable_virt();
 668
 669	/*
 670	 * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type
 671	 * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests.
 672	 * Since Linux does not change the value of TSC_AUX once set, prime the
 673	 * TSC_AUX field now to avoid a RDMSR on every vCPU run.
 674	 */
 675	if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
 676		struct sev_es_save_area *hostsa;
 677		u32 __maybe_unused msr_hi;
 678
 679		hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
 680
 681		rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi);
 682	}
 683
 684	return 0;
 685}
 686
 687static void svm_cpu_uninit(int cpu)
 688{
 689	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 690
 691	if (!sd->save_area)
 692		return;
 693
 694	kfree(sd->sev_vmcbs);
 695	__free_page(sd->save_area);
 696	sd->save_area_pa = 0;
 697	sd->save_area = NULL;
 698}
 699
 700static int svm_cpu_init(int cpu)
 701{
 702	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
 703	int ret = -ENOMEM;
 704
 705	memset(sd, 0, sizeof(struct svm_cpu_data));
 706	sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
 707	if (!sd->save_area)
 708		return ret;
 709
 710	ret = sev_cpu_init(sd);
 711	if (ret)
 712		goto free_save_area;
 713
 714	sd->save_area_pa = __sme_page_pa(sd->save_area);
 715	return 0;
 716
 717free_save_area:
 718	__free_page(sd->save_area);
 719	sd->save_area = NULL;
 720	return ret;
 721
 722}
 723
 724static void set_dr_intercepts(struct vcpu_svm *svm)
 725{
 726	struct vmcb *vmcb = svm->vmcb01.ptr;
 727
 728	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
 729	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
 730	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
 731	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
 732	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
 733	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
 734	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
 735	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
 736	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
 737	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
 738	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
 739	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
 740	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
 741	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
 742	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
 743	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
 744
 745	recalc_intercepts(svm);
 746}
 747
 748static void clr_dr_intercepts(struct vcpu_svm *svm)
 749{
 750	struct vmcb *vmcb = svm->vmcb01.ptr;
 751
 752	vmcb->control.intercepts[INTERCEPT_DR] = 0;
 753
 754	recalc_intercepts(svm);
 755}
 756
 757static int direct_access_msr_slot(u32 msr)
 758{
 759	u32 i;
 760
 761	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 762		if (direct_access_msrs[i].index == msr)
 763			return i;
 764
 765	return -ENOENT;
 766}
 767
 768static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
 769				     int write)
 770{
 771	struct vcpu_svm *svm = to_svm(vcpu);
 772	int slot = direct_access_msr_slot(msr);
 773
 774	if (slot == -ENOENT)
 775		return;
 776
 777	/* Set the shadow bitmaps to the desired intercept states */
 778	if (read)
 779		set_bit(slot, svm->shadow_msr_intercept.read);
 780	else
 781		clear_bit(slot, svm->shadow_msr_intercept.read);
 782
 783	if (write)
 784		set_bit(slot, svm->shadow_msr_intercept.write);
 785	else
 786		clear_bit(slot, svm->shadow_msr_intercept.write);
 787}
 788
 789static bool valid_msr_intercept(u32 index)
 790{
 791	return direct_access_msr_slot(index) != -ENOENT;
 792}
 793
 794static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
 795{
 796	u8 bit_write;
 797	unsigned long tmp;
 798	u32 offset;
 799	u32 *msrpm;
 800
 801	/*
 802	 * For non-nested case:
 803	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
 804	 * save it.
 805	 *
 806	 * For nested case:
 807	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 808	 * save it.
 809	 */
 810	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
 811				      to_svm(vcpu)->msrpm;
 812
 813	offset    = svm_msrpm_offset(msr);
 814	bit_write = 2 * (msr & 0x0f) + 1;
 815	tmp       = msrpm[offset];
 816
 817	BUG_ON(offset == MSR_INVALID);
 818
 819	return test_bit(bit_write, &tmp);
 820}
 821
 822static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
 823					u32 msr, int read, int write)
 824{
 825	struct vcpu_svm *svm = to_svm(vcpu);
 826	u8 bit_read, bit_write;
 827	unsigned long tmp;
 828	u32 offset;
 829
 830	/*
 831	 * If this warning triggers, extend the direct_access_msrs list at the
 832	 * beginning of the file.
 833	 */
 834	WARN_ON(!valid_msr_intercept(msr));
 835
 836	/* Force intercepts for MSRs that the MSR filter does not allow */
 837	if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
 838		read = 0;
 839
 840	if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
 841		write = 0;
 842
 843	offset    = svm_msrpm_offset(msr);
 844	bit_read  = 2 * (msr & 0x0f);
 845	bit_write = 2 * (msr & 0x0f) + 1;
 846	tmp       = msrpm[offset];
 847
 848	BUG_ON(offset == MSR_INVALID);
 849
 850	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 851	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 852
 853	msrpm[offset] = tmp;
 854
 855	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
 856	svm->nested.force_msr_bitmap_recalc = true;
 857}
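/*
 * Bitmap layout sketch, derived from the arithmetic above: every MSR owns two
 * consecutive bits in the permission map, the even bit for reads and the odd
 * bit for writes, and a set bit means "intercept".  E.g. for MSR_LSTAR
 * (0xc0000082), offset is 520 (see svm_msrpm_offset()), bit_read = 2 * 0x2 = 4
 * and bit_write = 5, so calling this with read=1/write=1 clears bits 4 and 5
 * of msrpm[520] and lets the guest access LSTAR without a #VMEXIT, unless the
 * MSR filter check above forced the bits back to intercepted.
 */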
 858
 859void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
 860			  int read, int write)
 861{
 862	set_shadow_msr_intercept(vcpu, msr, read, write);
 863	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
 864}
 865
 866u32 *svm_vcpu_alloc_msrpm(void)
 867{
 868	unsigned int order = get_order(MSRPM_SIZE);
 869	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
 870	u32 *msrpm;
 871
 872	if (!pages)
 873		return NULL;
 874
 875	msrpm = page_address(pages);
 876	memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
 877
 878	return msrpm;
 879}
 880
 881void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
 882{
 883	int i;
 884
 885	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 886		if (!direct_access_msrs[i].always)
 887			continue;
 888		set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
 889	}
 890}
 891
 892void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
 893{
 894	int i;
 895
 896	if (intercept == svm->x2avic_msrs_intercepted)
 897		return;
 898
 899	if (!x2avic_enabled)
 900		return;
 901
 902	for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
 903		int index = direct_access_msrs[i].index;
 904
 905		if ((index < APIC_BASE_MSR) ||
 906		    (index > APIC_BASE_MSR + 0xff))
 907			continue;
 908		set_msr_interception(&svm->vcpu, svm->msrpm, index,
 909				     !intercept, !intercept);
 910	}
 911
 912	svm->x2avic_msrs_intercepted = intercept;
 913}
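/*
 * Illustrative note: X2APIC_MSR() maps an APIC register offset to its x2APIC
 * MSR, e.g. APIC_TASKPRI (0x80) becomes MSR 0x808, so the range check above
 * simply selects the x2APIC MSR window 0x800..0x8ff out of
 * direct_access_msrs.  With intercept == false (presumably while x2AVIC is in
 * use) those MSRs are switched to pass-through; with intercept == true they
 * trap again.
 */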
 914
 915void svm_vcpu_free_msrpm(u32 *msrpm)
 916{
 917	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
 918}
 919
 920static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
 921{
 922	struct vcpu_svm *svm = to_svm(vcpu);
 923	u32 i;
 924
 925	/*
 926	 * Set intercept permissions for all direct access MSRs again. They
 927	 * will automatically get filtered through the MSR filter, so we are
 928	 * back in sync after this.
 929	 */
 930	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 931		u32 msr = direct_access_msrs[i].index;
 932		u32 read = test_bit(i, svm->shadow_msr_intercept.read);
 933		u32 write = test_bit(i, svm->shadow_msr_intercept.write);
 934
 935		set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
 936	}
 937}
 938
 939static void add_msr_offset(u32 offset)
 940{
 941	int i;
 942
 943	for (i = 0; i < MSRPM_OFFSETS; ++i) {
 944
 945		/* Offset already in list? */
 946		if (msrpm_offsets[i] == offset)
 947			return;
 948
 949		/* Slot used by another offset? */
 950		if (msrpm_offsets[i] != MSR_INVALID)
 951			continue;
 952
 953		/* Add offset to list */
 954		msrpm_offsets[i] = offset;
 955
 956		return;
 957	}
 958
 959	/*
 960	 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
 961	 * increase MSRPM_OFFSETS in that case.
 962	 */
 963	BUG();
 964}
 965
 966static void init_msrpm_offsets(void)
 967{
 968	int i;
 969
 970	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 971
 972	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 973		u32 offset;
 974
 975		offset = svm_msrpm_offset(direct_access_msrs[i].index);
 976		BUG_ON(offset == MSR_INVALID);
 977
 978		add_msr_offset(offset);
 979	}
 980}
 981
 982void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
 983{
 984	to_vmcb->save.dbgctl		= from_vmcb->save.dbgctl;
 985	to_vmcb->save.br_from		= from_vmcb->save.br_from;
 986	to_vmcb->save.br_to		= from_vmcb->save.br_to;
 987	to_vmcb->save.last_excp_from	= from_vmcb->save.last_excp_from;
 988	to_vmcb->save.last_excp_to	= from_vmcb->save.last_excp_to;
 989
 990	vmcb_mark_dirty(to_vmcb, VMCB_LBR);
 991}
 992
 993static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
 994{
 995	struct vcpu_svm *svm = to_svm(vcpu);
 996
 997	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
 998	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 999	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
1000	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
1001	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
1002
1003	/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
1004	if (is_guest_mode(vcpu))
1005		svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
1006}
1007
1008static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
1009{
1010	struct vcpu_svm *svm = to_svm(vcpu);
1011
1012	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
1013	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
1014	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
1015	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
1016	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
1017
1018	/*
1019	 * Move the LBR msrs back to the vmcb01 to avoid copying them
1020	 * on nested guest entries.
1021	 */
1022	if (is_guest_mode(vcpu))
1023		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
1024}
1025
1026static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
1027{
1028	/*
1029	 * If LBR virtualization is disabled, the LBR MSRs are always kept in
1030	 * vmcb01.  If LBR virtualization is enabled and L1 is running VMs of
1031	 * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
1032	 */
1033	return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
1034								   svm->vmcb01.ptr;
1035}
1036
1037void svm_update_lbrv(struct kvm_vcpu *vcpu)
1038{
1039	struct vcpu_svm *svm = to_svm(vcpu);
1040	bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
1041	bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
1042			    (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
1043			    (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
1044
1045	if (enable_lbrv == current_enable_lbrv)
1046		return;
1047
1048	if (enable_lbrv)
1049		svm_enable_lbrv(vcpu);
1050	else
1051		svm_disable_lbrv(vcpu);
1052}
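/*
 * Rough summary of the LBR bookkeeping above (an interpretation of the code,
 * not authoritative): while LBR virtualization is off the LBR MSRs stay in
 * vmcb01; enabling it while L2 runs copies them into vmcb02 so the guest
 * observes up-to-date values, and disabling it copies them back so nested
 * entries don't have to shuffle them.  svm_get_lbr_vmcb() picks whichever
 * VMCB currently holds the live copy, and is presumably what the LBR MSR
 * read/write paths elsewhere in this file rely on.
 */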
1053
1054void disable_nmi_singlestep(struct vcpu_svm *svm)
1055{
1056	svm->nmi_singlestep = false;
1057
1058	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1059		/* Clear our flags if they were not set by the guest */
1060		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1061			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1062		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1063			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1064	}
1065}
1066
1067static void grow_ple_window(struct kvm_vcpu *vcpu)
1068{
1069	struct vcpu_svm *svm = to_svm(vcpu);
1070	struct vmcb_control_area *control = &svm->vmcb->control;
1071	int old = control->pause_filter_count;
1072
1073	if (kvm_pause_in_guest(vcpu->kvm))
1074		return;
1075
1076	control->pause_filter_count = __grow_ple_window(old,
1077							pause_filter_count,
1078							pause_filter_count_grow,
1079							pause_filter_count_max);
1080
1081	if (control->pause_filter_count != old) {
1082		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1083		trace_kvm_ple_window_update(vcpu->vcpu_id,
1084					    control->pause_filter_count, old);
1085	}
1086}
1087
1088static void shrink_ple_window(struct kvm_vcpu *vcpu)
1089{
1090	struct vcpu_svm *svm = to_svm(vcpu);
1091	struct vmcb_control_area *control = &svm->vmcb->control;
1092	int old = control->pause_filter_count;
1093
1094	if (kvm_pause_in_guest(vcpu->kvm))
1095		return;
1096
1097	control->pause_filter_count =
1098				__shrink_ple_window(old,
1099						    pause_filter_count,
1100						    pause_filter_count_shrink,
1101						    pause_filter_count);
1102	if (control->pause_filter_count != old) {
1103		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1104		trace_kvm_ple_window_update(vcpu->vcpu_id,
1105					    control->pause_filter_count, old);
1106	}
1107}
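/*
 * Illustrative numbers, assuming the shared __grow_ple_window() and
 * __shrink_ple_window() helpers multiply/divide by the modifier when it is
 * smaller than the base value, and that the defaults are count=3000, grow=2,
 * shrink=0, max=USHRT_MAX: repeated PAUSE exits grow the window
 * 3000 -> 6000 -> 12000 -> ... capped at 65535, while a shrink modifier of 0
 * snaps the window straight back to pause_filter_count, as the comments on
 * the module parameters above state.
 */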
1108
1109static void svm_hardware_unsetup(void)
1110{
1111	int cpu;
1112
1113	sev_hardware_unsetup();
1114
1115	for_each_possible_cpu(cpu)
1116		svm_cpu_uninit(cpu);
1117
1118	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1119	get_order(IOPM_SIZE));
1120	iopm_base = 0;
1121}
1122
1123static void init_seg(struct vmcb_seg *seg)
1124{
1125	seg->selector = 0;
1126	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1127		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1128	seg->limit = 0xffff;
1129	seg->base = 0;
1130}
1131
1132static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1133{
1134	seg->selector = 0;
1135	seg->attrib = SVM_SELECTOR_P_MASK | type;
1136	seg->limit = 0xffff;
1137	seg->base = 0;
1138}
1139
1140static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1141{
1142	struct vcpu_svm *svm = to_svm(vcpu);
1143
1144	return svm->nested.ctl.tsc_offset;
1145}
1146
1147static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1148{
1149	struct vcpu_svm *svm = to_svm(vcpu);
1150
1151	return svm->tsc_ratio_msr;
1152}
1153
1154static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
1155{
1156	struct vcpu_svm *svm = to_svm(vcpu);
1157
1158	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1159	svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
1160	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1161}
1162
1163void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
1164{
1165	preempt_disable();
1166	if (to_svm(vcpu)->guest_state_loaded)
1167		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1168	preempt_enable();
1169}
1170
1171/* Evaluate instruction intercepts that depend on guest CPUID features. */
1172static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1173					      struct vcpu_svm *svm)
1174{
1175	/*
1176	 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1177	 * roots, or if INVPCID is disabled in the guest to inject #UD.
1178	 */
1179	if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1180		if (!npt_enabled ||
1181		    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1182			svm_set_intercept(svm, INTERCEPT_INVPCID);
1183		else
1184			svm_clr_intercept(svm, INTERCEPT_INVPCID);
1185	}
1186
1187	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1188		if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1189			svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1190		else
1191			svm_set_intercept(svm, INTERCEPT_RDTSCP);
1192	}
1193}
1194
1195static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1196{
1197	struct vcpu_svm *svm = to_svm(vcpu);
1198
1199	if (guest_cpuid_is_intel(vcpu)) {
1200		/*
1201		 * We must intercept SYSENTER_EIP and SYSENTER_ESP
1202		 * accesses because the processor only stores 32 bits.
1203		 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1204		 */
1205		svm_set_intercept(svm, INTERCEPT_VMLOAD);
1206		svm_set_intercept(svm, INTERCEPT_VMSAVE);
1207		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1208
1209		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1210		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1211	} else {
1212		/*
1213		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
1214		 * in VMCB and clear intercepts to avoid #VMEXIT.
1215		 */
1216		if (vls) {
1217			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1218			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1219			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1220		}
1221		/* No need to intercept these MSRs */
1222		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1223		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1224	}
1225}
1226
1227static void init_vmcb(struct kvm_vcpu *vcpu)
1228{
1229	struct vcpu_svm *svm = to_svm(vcpu);
1230	struct vmcb *vmcb = svm->vmcb01.ptr;
1231	struct vmcb_control_area *control = &vmcb->control;
1232	struct vmcb_save_area *save = &vmcb->save;
1233
1234	svm_set_intercept(svm, INTERCEPT_CR0_READ);
1235	svm_set_intercept(svm, INTERCEPT_CR3_READ);
1236	svm_set_intercept(svm, INTERCEPT_CR4_READ);
1237	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1238	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1239	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1240	if (!kvm_vcpu_apicv_active(vcpu))
1241		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1242
1243	set_dr_intercepts(svm);
1244
1245	set_exception_intercept(svm, PF_VECTOR);
1246	set_exception_intercept(svm, UD_VECTOR);
1247	set_exception_intercept(svm, MC_VECTOR);
1248	set_exception_intercept(svm, AC_VECTOR);
1249	set_exception_intercept(svm, DB_VECTOR);
1250	/*
1251	 * Guest access to VMware backdoor ports could legitimately
1252	 * trigger #GP because of the TSS I/O permission bitmap.
1253	 * We intercept those #GPs and allow access anyway,
1254	 * as VMware does.
1255	 */
1256	if (enable_vmware_backdoor)
1257		set_exception_intercept(svm, GP_VECTOR);
1258
1259	svm_set_intercept(svm, INTERCEPT_INTR);
1260	svm_set_intercept(svm, INTERCEPT_NMI);
1261
1262	if (intercept_smi)
1263		svm_set_intercept(svm, INTERCEPT_SMI);
1264
1265	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1266	svm_set_intercept(svm, INTERCEPT_RDPMC);
1267	svm_set_intercept(svm, INTERCEPT_CPUID);
1268	svm_set_intercept(svm, INTERCEPT_INVD);
1269	svm_set_intercept(svm, INTERCEPT_INVLPG);
1270	svm_set_intercept(svm, INTERCEPT_INVLPGA);
1271	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1272	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1273	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1274	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1275	svm_set_intercept(svm, INTERCEPT_VMRUN);
1276	svm_set_intercept(svm, INTERCEPT_VMMCALL);
1277	svm_set_intercept(svm, INTERCEPT_VMLOAD);
1278	svm_set_intercept(svm, INTERCEPT_VMSAVE);
1279	svm_set_intercept(svm, INTERCEPT_STGI);
1280	svm_set_intercept(svm, INTERCEPT_CLGI);
1281	svm_set_intercept(svm, INTERCEPT_SKINIT);
1282	svm_set_intercept(svm, INTERCEPT_WBINVD);
1283	svm_set_intercept(svm, INTERCEPT_XSETBV);
1284	svm_set_intercept(svm, INTERCEPT_RDPRU);
1285	svm_set_intercept(svm, INTERCEPT_RSM);
1286
1287	if (!kvm_mwait_in_guest(vcpu->kvm)) {
1288		svm_set_intercept(svm, INTERCEPT_MONITOR);
1289		svm_set_intercept(svm, INTERCEPT_MWAIT);
1290	}
1291
1292	if (!kvm_hlt_in_guest(vcpu->kvm))
1293		svm_set_intercept(svm, INTERCEPT_HLT);
1294
1295	control->iopm_base_pa = __sme_set(iopm_base);
1296	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1297	control->int_ctl = V_INTR_MASKING_MASK;
1298
1299	init_seg(&save->es);
1300	init_seg(&save->ss);
1301	init_seg(&save->ds);
1302	init_seg(&save->fs);
1303	init_seg(&save->gs);
1304
1305	save->cs.selector = 0xf000;
1306	save->cs.base = 0xffff0000;
1307	/* Executable/Readable Code Segment */
1308	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1309		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1310	save->cs.limit = 0xffff;
1311
1312	save->gdtr.base = 0;
1313	save->gdtr.limit = 0xffff;
1314	save->idtr.base = 0;
1315	save->idtr.limit = 0xffff;
1316
1317	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1318	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1319
1320	if (npt_enabled) {
1321		/* Setup VMCB for Nested Paging */
1322		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1323		svm_clr_intercept(svm, INTERCEPT_INVLPG);
1324		clr_exception_intercept(svm, PF_VECTOR);
1325		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1326		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1327		save->g_pat = vcpu->arch.pat;
1328		save->cr3 = 0;
1329	}
1330	svm->current_vmcb->asid_generation = 0;
1331	svm->asid = 0;
1332
1333	svm->nested.vmcb12_gpa = INVALID_GPA;
1334	svm->nested.last_vmcb12_gpa = INVALID_GPA;
1335
1336	if (!kvm_pause_in_guest(vcpu->kvm)) {
1337		control->pause_filter_count = pause_filter_count;
1338		if (pause_filter_thresh)
1339			control->pause_filter_thresh = pause_filter_thresh;
1340		svm_set_intercept(svm, INTERCEPT_PAUSE);
1341	} else {
1342		svm_clr_intercept(svm, INTERCEPT_PAUSE);
1343	}
1344
1345	svm_recalc_instruction_intercepts(vcpu, svm);
1346
1347	/*
1348	 * If the host supports V_SPEC_CTRL then disable the interception
1349	 * of MSR_IA32_SPEC_CTRL.
1350	 */
1351	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1352		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1353
1354	if (kvm_vcpu_apicv_active(vcpu))
1355		avic_init_vmcb(svm, vmcb);
1356
1357	if (vnmi)
1358		svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
1359
1360	if (vgif) {
1361		svm_clr_intercept(svm, INTERCEPT_STGI);
1362		svm_clr_intercept(svm, INTERCEPT_CLGI);
1363		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1364	}
1365
1366	if (sev_guest(vcpu->kvm))
1367		sev_init_vmcb(svm);
1368
1369	svm_hv_init_vmcb(vmcb);
1370	init_vmcb_after_set_cpuid(vcpu);
1371
1372	vmcb_mark_all_dirty(vmcb);
1373
1374	enable_gif(svm);
1375}
1376
1377static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1378{
1379	struct vcpu_svm *svm = to_svm(vcpu);
1380
1381	svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1382
1383	svm_init_osvw(vcpu);
1384	vcpu->arch.microcode_version = 0x01000065;
1385	svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1386
1387	svm->nmi_masked = false;
1388	svm->awaiting_iret_completion = false;
1389
1390	if (sev_es_guest(vcpu->kvm))
1391		sev_es_vcpu_reset(svm);
1392}
1393
1394static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1395{
1396	struct vcpu_svm *svm = to_svm(vcpu);
1397
1398	svm->spec_ctrl = 0;
1399	svm->virt_spec_ctrl = 0;
1400
1401	init_vmcb(vcpu);
1402
1403	if (!init_event)
1404		__svm_vcpu_reset(vcpu);
1405}
1406
1407void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1408{
1409	svm->current_vmcb = target_vmcb;
1410	svm->vmcb = target_vmcb->ptr;
1411}
1412
1413static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1414{
1415	struct vcpu_svm *svm;
1416	struct page *vmcb01_page;
1417	struct page *vmsa_page = NULL;
1418	int err;
1419
1420	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1421	svm = to_svm(vcpu);
1422
1423	err = -ENOMEM;
1424	vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1425	if (!vmcb01_page)
1426		goto out;
1427
1428	if (sev_es_guest(vcpu->kvm)) {
1429		/*
1430		 * SEV-ES guests require a separate VMSA page used to contain
1431		 * the encrypted register state of the guest.
1432		 */
1433		vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1434		if (!vmsa_page)
1435			goto error_free_vmcb_page;
1436
1437		/*
1438		 * SEV-ES guests maintain an encrypted version of their FPU
1439		 * state which is restored and saved on VMRUN and VMEXIT.
1440		 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1441		 * do xsave/xrstor on it.
1442		 */
1443		fpstate_set_confidential(&vcpu->arch.guest_fpu);
1444	}
1445
1446	err = avic_init_vcpu(svm);
1447	if (err)
1448		goto error_free_vmsa_page;
1449
1450	svm->msrpm = svm_vcpu_alloc_msrpm();
1451	if (!svm->msrpm) {
1452		err = -ENOMEM;
1453		goto error_free_vmsa_page;
1454	}
1455
1456	svm->x2avic_msrs_intercepted = true;
1457
1458	svm->vmcb01.ptr = page_address(vmcb01_page);
1459	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1460	svm_switch_vmcb(svm, &svm->vmcb01);
1461
1462	if (vmsa_page)
1463		svm->sev_es.vmsa = page_address(vmsa_page);
1464
1465	svm->guest_state_loaded = false;
1466
1467	return 0;
1468
1469error_free_vmsa_page:
1470	if (vmsa_page)
1471		__free_page(vmsa_page);
1472error_free_vmcb_page:
1473	__free_page(vmcb01_page);
1474out:
1475	return err;
1476}
1477
1478static void svm_clear_current_vmcb(struct vmcb *vmcb)
1479{
1480	int i;
1481
1482	for_each_online_cpu(i)
1483		cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
1484}
1485
1486static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1487{
1488	struct vcpu_svm *svm = to_svm(vcpu);
1489
1490	/*
1491	 * The vmcb page can be recycled, causing a false negative in
1492	 * svm_vcpu_load(). So, ensure that no logical CPU has this
1493	 * vmcb page recorded as its current vmcb.
1494	 */
1495	svm_clear_current_vmcb(svm->vmcb);
1496
1497	svm_leave_nested(vcpu);
1498	svm_free_nested(svm);
1499
1500	sev_free_vcpu(vcpu);
1501
1502	__free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1503	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1504}
1505
1506static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1507{
1508	struct vcpu_svm *svm = to_svm(vcpu);
1509	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
1510
1511	if (sev_es_guest(vcpu->kvm))
1512		sev_es_unmap_ghcb(svm);
1513
1514	if (svm->guest_state_loaded)
1515		return;
1516
1517	/*
1518	 * Save additional host state that will be restored on VMEXIT (sev-es)
1519	 * or subsequent vmload of host save area.
1520	 */
1521	vmsave(sd->save_area_pa);
1522	if (sev_es_guest(vcpu->kvm)) {
1523		struct sev_es_save_area *hostsa;
1524		hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1525
1526		sev_es_prepare_switch_to_guest(hostsa);
1527	}
1528
1529	if (tsc_scaling)
1530		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1531
1532	/*
1533	 * TSC_AUX is always virtualized for SEV-ES guests when the feature is
1534	 * available. The user return MSR support is not required in this case
1535	 * because TSC_AUX is restored on #VMEXIT from the host save area
1536	 * (which has been initialized in svm_hardware_enable()).
1537	 */
1538	if (likely(tsc_aux_uret_slot >= 0) &&
1539	    (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
1540		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1541
1542	svm->guest_state_loaded = true;
1543}
1544
1545static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1546{
1547	to_svm(vcpu)->guest_state_loaded = false;
1548}
1549
1550static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1551{
1552	struct vcpu_svm *svm = to_svm(vcpu);
1553	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
1554
1555	if (sd->current_vmcb != svm->vmcb) {
1556		sd->current_vmcb = svm->vmcb;
1557
1558		if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
1559			indirect_branch_prediction_barrier();
1560	}
1561	if (kvm_vcpu_apicv_active(vcpu))
1562		avic_vcpu_load(vcpu, cpu);
1563}
1564
1565static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1566{
1567	if (kvm_vcpu_apicv_active(vcpu))
1568		avic_vcpu_put(vcpu);
1569
1570	svm_prepare_host_switch(vcpu);
1571
1572	++vcpu->stat.host_state_reload;
1573}
1574
1575static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1576{
1577	struct vcpu_svm *svm = to_svm(vcpu);
1578	unsigned long rflags = svm->vmcb->save.rflags;
1579
1580	if (svm->nmi_singlestep) {
1581		/* Hide our flags if they were not set by the guest */
1582		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1583			rflags &= ~X86_EFLAGS_TF;
1584		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1585			rflags &= ~X86_EFLAGS_RF;
1586	}
1587	return rflags;
1588}
1589
1590static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1591{
1592	if (to_svm(vcpu)->nmi_singlestep)
1593		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1594
1595	/*
1596	 * Any change of EFLAGS.VM is accompanied by a reload of SS
1597	 * (caused by either a task switch or an inter-privilege IRET),
1598	 * so we do not need to update the CPL here.
1599	 */
1600	to_svm(vcpu)->vmcb->save.rflags = rflags;
1601}
1602
1603static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1604{
1605	struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1606
1607	return sev_es_guest(vcpu->kvm)
1608		? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1609		: kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1610}
1611
1612static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1613{
1614	kvm_register_mark_available(vcpu, reg);
1615
1616	switch (reg) {
1617	case VCPU_EXREG_PDPTR:
1618		/*
1619		 * When !npt_enabled, mmu->pdptrs[] is already available since
1620		 * it is always updated per SDM when moving to CRs.
1621		 */
1622		if (npt_enabled)
1623			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1624		break;
1625	default:
1626		KVM_BUG_ON(1, vcpu->kvm);
1627	}
1628}
1629
1630static void svm_set_vintr(struct vcpu_svm *svm)
1631{
1632	struct vmcb_control_area *control;
1633
1634	/*
1635	 * The following fields are ignored when AVIC is enabled
1636	 */
1637	WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1638
1639	svm_set_intercept(svm, INTERCEPT_VINTR);
1640
1641	/*
1642	 * Recalculating intercepts may have cleared the VINTR intercept.  If
1643	 * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
1644	 * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
1645	 * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
1646	 * interrupts will never be unblocked while L2 is running.
1647	 */
1648	if (!svm_is_intercept(svm, INTERCEPT_VINTR))
1649		return;
1650
1651	/*
1652	 * This is just a dummy VINTR to actually cause a vmexit to happen.
1653	 * Actual injection of virtual interrupts happens through EVENTINJ.
1654	 */
1655	control = &svm->vmcb->control;
1656	control->int_vector = 0x0;
1657	control->int_ctl &= ~V_INTR_PRIO_MASK;
1658	control->int_ctl |= V_IRQ_MASK |
1659		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1660	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1661}
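/*
 * Side note on the priority written above: 0xf is the highest possible
 * virtual interrupt priority, presumably chosen so that no guest V_TPR value
 * can mask the dummy virtual interrupt, guaranteeing the VINTR intercept
 * fires as soon as the guest is able to take interrupts.
 */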
1662
1663static void svm_clear_vintr(struct vcpu_svm *svm)
1664{
1665	svm_clr_intercept(svm, INTERCEPT_VINTR);
1666
1667	/* Drop int_ctl fields related to VINTR injection.  */
1668	svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1669	if (is_guest_mode(&svm->vcpu)) {
1670		svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1671
1672		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1673			(svm->nested.ctl.int_ctl & V_TPR_MASK));
1674
1675		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1676			V_IRQ_INJECTION_BITS_MASK;
1677
1678		svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1679	}
1680
1681	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1682}
1683
1684static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1685{
1686	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1687	struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1688
1689	switch (seg) {
1690	case VCPU_SREG_CS: return &save->cs;
1691	case VCPU_SREG_DS: return &save->ds;
1692	case VCPU_SREG_ES: return &save->es;
1693	case VCPU_SREG_FS: return &save01->fs;
1694	case VCPU_SREG_GS: return &save01->gs;
1695	case VCPU_SREG_SS: return &save->ss;
1696	case VCPU_SREG_TR: return &save01->tr;
1697	case VCPU_SREG_LDTR: return &save01->ldtr;
1698	}
1699	BUG();
1700	return NULL;
1701}
1702
1703static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1704{
1705	struct vmcb_seg *s = svm_seg(vcpu, seg);
1706
1707	return s->base;
1708}
1709
1710static void svm_get_segment(struct kvm_vcpu *vcpu,
1711			    struct kvm_segment *var, int seg)
1712{
1713	struct vmcb_seg *s = svm_seg(vcpu, seg);
1714
1715	var->base = s->base;
1716	var->limit = s->limit;
1717	var->selector = s->selector;
1718	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1719	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1720	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1721	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1722	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1723	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1724	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1725
1726	/*
1727	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
1728	 * However, the SVM spec states that the G bit is not observed by the
1729	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1730	 * So let's synthesize a legal G bit for all segments, this helps
1731	 * running KVM nested. It also helps cross-vendor migration, because
1732	 * Intel's vmentry has a check on the 'G' bit.
1733	 */
1734	var->g = s->limit > 0xfffff;
1735
1736	/*
1737	 * AMD's VMCB does not have an explicit unusable field, so emulate it
1738	 * for cross vendor migration purposes by "not present"
1739	 */
1740	var->unusable = !var->present;
1741
1742	switch (seg) {
1743	case VCPU_SREG_TR:
1744		/*
1745		 * Work around a bug where the busy flag in the tr selector
1746		 * isn't exposed
1747		 */
1748		var->type |= 0x2;
1749		break;
1750	case VCPU_SREG_DS:
1751	case VCPU_SREG_ES:
1752	case VCPU_SREG_FS:
1753	case VCPU_SREG_GS:
1754		/*
1755		 * The accessed bit must always be set in the segment
1756		 * descriptor cache: even if it is cleared in the
1757		 * descriptor itself, the cached bit remains 1. Since
1758		 * Intel checks this, set it here to support
1759		 * cross-vendor migration.
1760		 */
1761		if (!var->unusable)
1762			var->type |= 0x1;
1763		break;
1764	case VCPU_SREG_SS:
1765		/*
1766		 * On AMD CPUs sometimes the DB bit in the segment
1767		 * descriptor is left as 1, although the whole segment has
1768		 * been made unusable. Clear it here to pass an Intel VMX
1769		 * entry check when cross vendor migrating.
1770		 */
1771		if (var->unusable)
1772			var->db = 0;
1773		/* This is symmetric with svm_set_segment() */
1774		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1775		break;
1776	}
1777}
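/*
 * Example of the G bit synthesis above: a flat segment whose cached limit is
 * 0xffffffff exceeds 0xfffff and is therefore reported with var->g = 1 (page
 * granularity), while a real-mode style 64KiB segment (limit 0xffff) is
 * reported with byte granularity.  Either way the result is legal for an
 * Intel VM entry, which is the point of synthesizing it.
 */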
1778
1779static int svm_get_cpl(struct kvm_vcpu *vcpu)
1780{
1781	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1782
1783	return save->cpl;
1784}
1785
1786static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1787{
1788	struct kvm_segment cs;
1789
1790	svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1791	*db = cs.db;
1792	*l = cs.l;
1793}
1794
1795static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1796{
1797	struct vcpu_svm *svm = to_svm(vcpu);
1798
1799	dt->size = svm->vmcb->save.idtr.limit;
1800	dt->address = svm->vmcb->save.idtr.base;
1801}
1802
1803static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1804{
1805	struct vcpu_svm *svm = to_svm(vcpu);
1806
1807	svm->vmcb->save.idtr.limit = dt->size;
1808	svm->vmcb->save.idtr.base = dt->address;
1809	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1810}
1811
1812static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1813{
1814	struct vcpu_svm *svm = to_svm(vcpu);
1815
1816	dt->size = svm->vmcb->save.gdtr.limit;
1817	dt->address = svm->vmcb->save.gdtr.base;
1818}
1819
1820static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1821{
1822	struct vcpu_svm *svm = to_svm(vcpu);
1823
1824	svm->vmcb->save.gdtr.limit = dt->size;
1825	svm->vmcb->save.gdtr.base = dt->address;
1826	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1827}
1828
1829static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1830{
1831	struct vcpu_svm *svm = to_svm(vcpu);
1832
1833	/*
1834	 * For guests that don't set guest_state_protected, the cr3 update is
1835	 * handled via kvm_mmu_load() while entering the guest. For guests
1836	 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1837	 * VMCB save area now, since the save area will become the initial
1838	 * contents of the VMSA, and future VMCB save area updates won't be
1839	 * seen.
1840	 */
1841	if (sev_es_guest(vcpu->kvm)) {
1842		svm->vmcb->save.cr3 = cr3;
1843		vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1844	}
1845}
1846
1847static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1848{
1849	return true;
1850}
1851
1852void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1853{
1854	struct vcpu_svm *svm = to_svm(vcpu);
1855	u64 hcr0 = cr0;
1856	bool old_paging = is_paging(vcpu);
1857
1858#ifdef CONFIG_X86_64
1859	if (vcpu->arch.efer & EFER_LME) {
1860		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1861			vcpu->arch.efer |= EFER_LMA;
1862			if (!vcpu->arch.guest_state_protected)
1863				svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1864		}
1865
1866		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1867			vcpu->arch.efer &= ~EFER_LMA;
1868			if (!vcpu->arch.guest_state_protected)
1869				svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1870		}
1871	}
1872#endif
1873	vcpu->arch.cr0 = cr0;
1874
1875	if (!npt_enabled) {
1876		hcr0 |= X86_CR0_PG | X86_CR0_WP;
1877		if (old_paging != is_paging(vcpu))
1878			svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1879	}
1880
1881	/*
1882	 * Re-enable caching here because the QEMU BIOS
1883	 * does not do it - otherwise there is a noticeable
1884	 * delay at reboot.
1885	 */
1886	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1887		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1888
1889	svm->vmcb->save.cr0 = hcr0;
1890	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1891
1892	/*
1893	 * SEV-ES guests must always keep the CR intercepts cleared. CR
1894	 * tracking is done using the CR write traps.
1895	 */
1896	if (sev_es_guest(vcpu->kvm))
1897		return;
1898
1899	if (hcr0 == cr0) {
1900		/* Selective CR0 write remains on.  */
1901		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1902		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1903	} else {
1904		svm_set_intercept(svm, INTERCEPT_CR0_READ);
1905		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1906	}
1907}
1908
1909static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1910{
1911	return true;
1912}
1913
1914void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1915{
1916	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1917	unsigned long old_cr4 = vcpu->arch.cr4;
1918
1919	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1920		svm_flush_tlb_current(vcpu);
1921
1922	vcpu->arch.cr4 = cr4;
1923	if (!npt_enabled) {
1924		cr4 |= X86_CR4_PAE;
1925
1926		if (!is_paging(vcpu))
1927			cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1928	}
1929	cr4 |= host_cr4_mce;
1930	to_svm(vcpu)->vmcb->save.cr4 = cr4;
1931	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1932
1933	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1934		kvm_update_cpuid_runtime(vcpu);
1935}
1936
1937static void svm_set_segment(struct kvm_vcpu *vcpu,
1938			    struct kvm_segment *var, int seg)
1939{
1940	struct vcpu_svm *svm = to_svm(vcpu);
1941	struct vmcb_seg *s = svm_seg(vcpu, seg);
1942
1943	s->base = var->base;
1944	s->limit = var->limit;
1945	s->selector = var->selector;
1946	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1947	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1948	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1949	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1950	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1951	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1952	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1953	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1954
1955	/*
1956	 * This is always accurate, except if SYSRET returned to a segment
1957	 * with SS.DPL != 3.  Intel does not have this quirk, and always
1958	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1959	 * would entail passing the CPL to userspace and back.
1960	 */
1961	if (seg == VCPU_SREG_SS)
1962		/* This is symmetric with svm_get_segment() */
1963		svm->vmcb->save.cpl = (var->dpl & 3);
1964
1965	vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1966}
1967
1968static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1969{
1970	struct vcpu_svm *svm = to_svm(vcpu);
1971
1972	clr_exception_intercept(svm, BP_VECTOR);
1973
1974	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1975		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1976			set_exception_intercept(svm, BP_VECTOR);
1977	}
1978}
1979
1980static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1981{
1982	if (sd->next_asid > sd->max_asid) {
1983		++sd->asid_generation;
1984		sd->next_asid = sd->min_asid;
1985		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1986		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1987	}
1988
1989	svm->current_vmcb->asid_generation = sd->asid_generation;
1990	svm->asid = sd->next_asid++;
1991}
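/*
 * Sketch of the roll-over behaviour (illustrative values: without SEV,
 * min_asid is 1 and max_asid comes from CPUID 0x8000000A EBX minus one): once
 * next_asid walks past max_asid, the per-CPU generation is bumped, a
 * flush-all-ASIDs is requested in the VMCB, and allocation restarts at
 * min_asid.  vCPUs whose cached asid_generation no longer matches are
 * expected to be handed a fresh ASID before their next VMRUN.
 */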
1992
1993static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1994{
1995	struct vmcb *vmcb = svm->vmcb;
1996
1997	if (svm->vcpu.arch.guest_state_protected)
1998		return;
1999
2000	if (unlikely(value != vmcb->save.dr6)) {
2001		vmcb->save.dr6 = value;
2002		vmcb_mark_dirty(vmcb, VMCB_DR);
2003	}
2004}
2005
2006static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
2007{
2008	struct vcpu_svm *svm = to_svm(vcpu);
2009
2010	if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
2011		return;
2012
2013	get_debugreg(vcpu->arch.db[0], 0);
2014	get_debugreg(vcpu->arch.db[1], 1);
2015	get_debugreg(vcpu->arch.db[2], 2);
2016	get_debugreg(vcpu->arch.db[3], 3);
2017	/*
2018	 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
2019	 * because db_interception might need it.  We can do it before vmentry.
2020	 */
2021	vcpu->arch.dr6 = svm->vmcb->save.dr6;
2022	vcpu->arch.dr7 = svm->vmcb->save.dr7;
2023	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2024	set_dr_intercepts(svm);
2025}
2026
2027static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
2028{
2029	struct vcpu_svm *svm = to_svm(vcpu);
2030
2031	if (vcpu->arch.guest_state_protected)
2032		return;
2033
2034	svm->vmcb->save.dr7 = value;
2035	vmcb_mark_dirty(svm->vmcb, VMCB_DR);
2036}
2037
2038static int pf_interception(struct kvm_vcpu *vcpu)
2039{
2040	struct vcpu_svm *svm = to_svm(vcpu);
2041
2042	u64 fault_address = svm->vmcb->control.exit_info_2;
2043	u64 error_code = svm->vmcb->control.exit_info_1;
2044
2045	return kvm_handle_page_fault(vcpu, error_code, fault_address,
2046			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2047			svm->vmcb->control.insn_bytes : NULL,
2048			svm->vmcb->control.insn_len);
2049}
2050
2051static int npf_interception(struct kvm_vcpu *vcpu)
2052{
2053	struct vcpu_svm *svm = to_svm(vcpu);
2054
2055	u64 fault_address = svm->vmcb->control.exit_info_2;
2056	u64 error_code = svm->vmcb->control.exit_info_1;
2057
2058	trace_kvm_page_fault(vcpu, fault_address, error_code);
2059	return kvm_mmu_page_fault(vcpu, fault_address, error_code,
2060			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2061			svm->vmcb->control.insn_bytes : NULL,
2062			svm->vmcb->control.insn_len);
2063}
2064
2065static int db_interception(struct kvm_vcpu *vcpu)
2066{
2067	struct kvm_run *kvm_run = vcpu->run;
2068	struct vcpu_svm *svm = to_svm(vcpu);
2069
2070	if (!(vcpu->guest_debug &
2071	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2072		!svm->nmi_singlestep) {
2073		u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
2074		kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
2075		return 1;
2076	}
2077
2078	if (svm->nmi_singlestep) {
2079		disable_nmi_singlestep(svm);
2080		/* Make sure we check for pending NMIs upon entry */
2081		kvm_make_request(KVM_REQ_EVENT, vcpu);
2082	}
2083
2084	if (vcpu->guest_debug &
2085	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2086		kvm_run->exit_reason = KVM_EXIT_DEBUG;
2087		kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
2088		kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
2089		kvm_run->debug.arch.pc =
2090			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2091		kvm_run->debug.arch.exception = DB_VECTOR;
2092		return 0;
2093	}
2094
2095	return 1;
2096}
2097
2098static int bp_interception(struct kvm_vcpu *vcpu)
2099{
2100	struct vcpu_svm *svm = to_svm(vcpu);
2101	struct kvm_run *kvm_run = vcpu->run;
2102
2103	kvm_run->exit_reason = KVM_EXIT_DEBUG;
2104	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2105	kvm_run->debug.arch.exception = BP_VECTOR;
2106	return 0;
2107}
2108
2109static int ud_interception(struct kvm_vcpu *vcpu)
2110{
2111	return handle_ud(vcpu);
2112}
2113
2114static int ac_interception(struct kvm_vcpu *vcpu)
2115{
2116	kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2117	return 1;
2118}
2119
2120static bool is_erratum_383(void)
2121{
2122	int err, i;
2123	u64 value;
2124
2125	if (!erratum_383_found)
2126		return false;
2127
2128	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2129	if (err)
2130		return false;
2131
2132	/* Bit 62 may or may not be set for this mce */
2133	value &= ~(1ULL << 62);
2134
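	/*
	 * Compare against the MC0_STATUS signature that identifies erratum
	 * 383; bit 62 (the overflow bit) was cleared above because it may or
	 * may not be set.
	 */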
2135	if (value != 0xb600000000010015ULL)
2136		return false;
2137
2138	/* Clear MCi_STATUS registers */
2139	for (i = 0; i < 6; ++i)
2140		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2141
2142	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2143	if (!err) {
2144		u32 low, high;
2145
2146		value &= ~(1ULL << 2);
2147		low    = lower_32_bits(value);
2148		high   = upper_32_bits(value);
2149
2150		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2151	}
2152
2153	/* Flush tlb to evict multi-match entries */
2154	__flush_tlb_all();
2155
2156	return true;
2157}
2158
2159static void svm_handle_mce(struct kvm_vcpu *vcpu)
2160{
2161	if (is_erratum_383()) {
2162		/*
2163		 * Erratum 383 triggered. Guest state is corrupt so kill the
2164		 * guest.
2165		 */
2166		pr_err("Guest triggered AMD Erratum 383\n");
2167
2168		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2169
2170		return;
2171	}
2172
2173	/*
2174	 * On an #MC intercept the MCE handler is not called automatically in
2175	 * the host. So do it by hand here.
2176	 */
2177	kvm_machine_check();
2178}
2179
2180static int mc_interception(struct kvm_vcpu *vcpu)
2181{
2182	return 1;
2183}
2184
2185static int shutdown_interception(struct kvm_vcpu *vcpu)
2186{
2187	struct kvm_run *kvm_run = vcpu->run;
2188	struct vcpu_svm *svm = to_svm(vcpu);
2189
2190
2191	/*
2192	 * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2193	 * the VMCB in a known good state.  Unfortunately, KVM doesn't have
2194	 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2195	 * userspace.  From a platform perspective, INIT is acceptable behavior
2196	 * as there exist bare metal platforms that automatically INIT the CPU
2197	 * in response to shutdown.
2198	 *
2199	 * The VM save area for SEV-ES guests has already been encrypted so it
2200	 * cannot be reinitialized, i.e. synthesizing INIT is futile.
2201	 */
2202	if (!sev_es_guest(vcpu->kvm)) {
2203		clear_page(svm->vmcb);
2204		kvm_vcpu_reset(vcpu, true);
2205	}
2206
2207	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2208	return 0;
2209}
2210
2211static int io_interception(struct kvm_vcpu *vcpu)
2212{
2213	struct vcpu_svm *svm = to_svm(vcpu);
2214	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2215	int size, in, string;
2216	unsigned port;
2217
2218	++vcpu->stat.io_exits;
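	/*
	 * EXITINFO1 encodes the I/O port, the operand size, the direction and
	 * the string/REP attributes of the intercepted instruction.
	 */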
2219	string = (io_info & SVM_IOIO_STR_MASK) != 0;
2220	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2221	port = io_info >> 16;
2222	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2223
2224	if (string) {
2225		if (sev_es_guest(vcpu->kvm))
2226			return sev_es_string_io(svm, size, port, in);
2227		else
2228			return kvm_emulate_instruction(vcpu, 0);
2229	}
2230
2231	svm->next_rip = svm->vmcb->control.exit_info_2;
2232
2233	return kvm_fast_pio(vcpu, size, port, in);
2234}
2235
2236static int nmi_interception(struct kvm_vcpu *vcpu)
2237{
2238	return 1;
2239}
2240
2241static int smi_interception(struct kvm_vcpu *vcpu)
2242{
2243	return 1;
2244}
2245
2246static int intr_interception(struct kvm_vcpu *vcpu)
2247{
2248	++vcpu->stat.irq_exits;
2249	return 1;
2250}
2251
2252static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2253{
2254	struct vcpu_svm *svm = to_svm(vcpu);
2255	struct vmcb *vmcb12;
2256	struct kvm_host_map map;
2257	int ret;
2258
2259	if (nested_svm_check_permissions(vcpu))
2260		return 1;
2261
2262	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2263	if (ret) {
2264		if (ret == -EINVAL)
2265			kvm_inject_gp(vcpu, 0);
2266		return 1;
2267	}
2268
2269	vmcb12 = map.hva;
2270
2271	ret = kvm_skip_emulated_instruction(vcpu);
2272
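	/*
	 * VMLOAD/VMSAVE transfer only the "extra" state that VMRUN/#VMEXIT
	 * don't: FS, GS, TR and LDTR (including hidden state), KERNEL_GS_BASE
	 * and the SYSCALL/SYSENTER MSRs.
	 */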
2273	if (vmload) {
2274		svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2275		svm->sysenter_eip_hi = 0;
2276		svm->sysenter_esp_hi = 0;
2277	} else {
2278		svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2279	}
2280
2281	kvm_vcpu_unmap(vcpu, &map, true);
2282
2283	return ret;
2284}
2285
2286static int vmload_interception(struct kvm_vcpu *vcpu)
2287{
2288	return vmload_vmsave_interception(vcpu, true);
2289}
2290
2291static int vmsave_interception(struct kvm_vcpu *vcpu)
2292{
2293	return vmload_vmsave_interception(vcpu, false);
2294}
2295
2296static int vmrun_interception(struct kvm_vcpu *vcpu)
2297{
2298	if (nested_svm_check_permissions(vcpu))
2299		return 1;
2300
2301	return nested_svm_vmrun(vcpu);
2302}
2303
2304enum {
2305	NONE_SVM_INSTR,
2306	SVM_INSTR_VMRUN,
2307	SVM_INSTR_VMLOAD,
2308	SVM_INSTR_VMSAVE,
2309};
2310
2311/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2312static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2313{
2314	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2315
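	/*
	 * VMRUN, VMLOAD and VMSAVE all share the two-byte 0x0F 0x01 opcode;
	 * the ModRM byte distinguishes them.
	 */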
2316	if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2317		return NONE_SVM_INSTR;
2318
2319	switch (ctxt->modrm) {
2320	case 0xd8: /* VMRUN */
2321		return SVM_INSTR_VMRUN;
2322	case 0xda: /* VMLOAD */
2323		return SVM_INSTR_VMLOAD;
2324	case 0xdb: /* VMSAVE */
2325		return SVM_INSTR_VMSAVE;
2326	default:
2327		break;
2328	}
2329
2330	return NONE_SVM_INSTR;
2331}
2332
2333static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2334{
2335	const int guest_mode_exit_codes[] = {
2336		[SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2337		[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2338		[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2339	};
2340	int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2341		[SVM_INSTR_VMRUN] = vmrun_interception,
2342		[SVM_INSTR_VMLOAD] = vmload_interception,
2343		[SVM_INSTR_VMSAVE] = vmsave_interception,
2344	};
2345	struct vcpu_svm *svm = to_svm(vcpu);
2346	int ret;
2347
2348	if (is_guest_mode(vcpu)) {
2349		/* Returns '1' or -errno on failure, '0' on success. */
2350		ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2351		if (ret)
2352			return ret;
2353		return 1;
2354	}
2355	return svm_instr_handlers[opcode](vcpu);
2356}
2357
2358/*
2359 * #GP handling code. Note that #GP can be triggered under the following two
2360 * cases:
2361 *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2362 *      some AMD CPUs when EAX of these instructions are in the reserved memory
2363 *      regions (e.g. SMM memory on host).
2364 *   2) VMware backdoor
2365 */
2366static int gp_interception(struct kvm_vcpu *vcpu)
2367{
2368	struct vcpu_svm *svm = to_svm(vcpu);
2369	u32 error_code = svm->vmcb->control.exit_info_1;
2370	int opcode;
2371
2372	/* Both #GP cases have zero error_code */
2373	if (error_code)
2374		goto reinject;
2375
2376	/* Decode the instruction for usage later */
2377	if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2378		goto reinject;
2379
2380	opcode = svm_instr_opcode(vcpu);
2381
2382	if (opcode == NONE_SVM_INSTR) {
2383		if (!enable_vmware_backdoor)
2384			goto reinject;
2385
2386		/*
2387		 * VMware backdoor emulation on #GP interception only handles
2388		 * IN{S}, OUT{S}, and RDPMC.
2389		 */
2390		if (!is_guest_mode(vcpu))
2391			return kvm_emulate_instruction(vcpu,
2392				EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2393	} else {
2394		/* All SVM instructions expect page aligned RAX */
2395		if (svm->vmcb->save.rax & ~PAGE_MASK)
2396			goto reinject;
2397
2398		return emulate_svm_instr(vcpu, opcode);
2399	}
2400
2401reinject:
2402	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2403	return 1;
2404}
2405
2406void svm_set_gif(struct vcpu_svm *svm, bool value)
2407{
2408	if (value) {
2409		/*
2410		 * If VGIF is enabled, the STGI intercept is only added to
2411		 * detect the opening of the SMI/NMI window; remove it now.
2412		 * Likewise, clear the VINTR intercept, we will set it
2413		 * again while processing KVM_REQ_EVENT if needed.
2414		 */
2415		if (vgif)
2416			svm_clr_intercept(svm, INTERCEPT_STGI);
2417		if (svm_is_intercept(svm, INTERCEPT_VINTR))
2418			svm_clear_vintr(svm);
2419
2420		enable_gif(svm);
2421		if (svm->vcpu.arch.smi_pending ||
2422		    svm->vcpu.arch.nmi_pending ||
2423		    kvm_cpu_has_injectable_intr(&svm->vcpu) ||
2424		    kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
2425			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2426	} else {
2427		disable_gif(svm);
2428
2429		/*
2430		 * After a CLGI no interrupts should come.  But if vGIF is
2431		 * in use, we still rely on the VINTR intercept (rather than
2432		 * STGI) to detect an open interrupt window.
2433		 */
2434		if (!vgif)
2435			svm_clear_vintr(svm);
2436	}
2437}
2438
2439static int stgi_interception(struct kvm_vcpu *vcpu)
2440{
2441	int ret;
2442
2443	if (nested_svm_check_permissions(vcpu))
2444		return 1;
2445
2446	ret = kvm_skip_emulated_instruction(vcpu);
2447	svm_set_gif(to_svm(vcpu), true);
2448	return ret;
2449}
2450
2451static int clgi_interception(struct kvm_vcpu *vcpu)
2452{
2453	int ret;
2454
2455	if (nested_svm_check_permissions(vcpu))
2456		return 1;
2457
2458	ret = kvm_skip_emulated_instruction(vcpu);
2459	svm_set_gif(to_svm(vcpu), false);
2460	return ret;
2461}
2462
2463static int invlpga_interception(struct kvm_vcpu *vcpu)
2464{
2465	gva_t gva = kvm_rax_read(vcpu);
2466	u32 asid = kvm_rcx_read(vcpu);
2467
2468	/* FIXME: Handle an address size prefix. */
2469	if (!is_long_mode(vcpu))
2470		gva = (u32)gva;
2471
2472	trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2473
2474	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2475	kvm_mmu_invlpg(vcpu, gva);
2476
2477	return kvm_skip_emulated_instruction(vcpu);
2478}
2479
2480static int skinit_interception(struct kvm_vcpu *vcpu)
2481{
2482	trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2483
2484	kvm_queue_exception(vcpu, UD_VECTOR);
2485	return 1;
2486}
2487
2488static int task_switch_interception(struct kvm_vcpu *vcpu)
2489{
2490	struct vcpu_svm *svm = to_svm(vcpu);
2491	u16 tss_selector;
2492	int reason;
2493	int int_type = svm->vmcb->control.exit_int_info &
2494		SVM_EXITINTINFO_TYPE_MASK;
2495	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2496	uint32_t type =
2497		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2498	uint32_t idt_v =
2499		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2500	bool has_error_code = false;
2501	u32 error_code = 0;
2502
2503	tss_selector = (u16)svm->vmcb->control.exit_info_1;
2504
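	/*
	 * EXITINFO2 flags whether the task switch was initiated by IRET or a
	 * far JMP; otherwise infer "gate" vs. "call" from the pending event
	 * information in EXITINTINFO.
	 */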
2505	if (svm->vmcb->control.exit_info_2 &
2506	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2507		reason = TASK_SWITCH_IRET;
2508	else if (svm->vmcb->control.exit_info_2 &
2509		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2510		reason = TASK_SWITCH_JMP;
2511	else if (idt_v)
2512		reason = TASK_SWITCH_GATE;
2513	else
2514		reason = TASK_SWITCH_CALL;
2515
2516	if (reason == TASK_SWITCH_GATE) {
2517		switch (type) {
2518		case SVM_EXITINTINFO_TYPE_NMI:
2519			vcpu->arch.nmi_injected = false;
2520			break;
2521		case SVM_EXITINTINFO_TYPE_EXEPT:
2522			if (svm->vmcb->control.exit_info_2 &
2523			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2524				has_error_code = true;
2525				error_code =
2526					(u32)svm->vmcb->control.exit_info_2;
2527			}
2528			kvm_clear_exception_queue(vcpu);
2529			break;
2530		case SVM_EXITINTINFO_TYPE_INTR:
2531		case SVM_EXITINTINFO_TYPE_SOFT:
2532			kvm_clear_interrupt_queue(vcpu);
2533			break;
2534		default:
2535			break;
2536		}
2537	}
2538
2539	if (reason != TASK_SWITCH_GATE ||
2540	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2541	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2542	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2543		if (!svm_skip_emulated_instruction(vcpu))
2544			return 0;
2545	}
2546
2547	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2548		int_vec = -1;
2549
2550	return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2551			       has_error_code, error_code);
2552}
2553
2554static void svm_clr_iret_intercept(struct vcpu_svm *svm)
2555{
2556	if (!sev_es_guest(svm->vcpu.kvm))
2557		svm_clr_intercept(svm, INTERCEPT_IRET);
2558}
2559
2560static void svm_set_iret_intercept(struct vcpu_svm *svm)
2561{
2562	if (!sev_es_guest(svm->vcpu.kvm))
2563		svm_set_intercept(svm, INTERCEPT_IRET);
2564}
2565
2566static int iret_interception(struct kvm_vcpu *vcpu)
2567{
2568	struct vcpu_svm *svm = to_svm(vcpu);
2569
2570	WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
2571
2572	++vcpu->stat.nmi_window_exits;
2573	svm->awaiting_iret_completion = true;
2574
2575	svm_clr_iret_intercept(svm);
2576	svm->nmi_iret_rip = kvm_rip_read(vcpu);
2577
2578	kvm_make_request(KVM_REQ_EVENT, vcpu);
2579	return 1;
2580}
2581
2582static int invlpg_interception(struct kvm_vcpu *vcpu)
2583{
2584	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2585		return kvm_emulate_instruction(vcpu, 0);
2586
2587	kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2588	return kvm_skip_emulated_instruction(vcpu);
2589}
2590
2591static int emulate_on_interception(struct kvm_vcpu *vcpu)
2592{
2593	return kvm_emulate_instruction(vcpu, 0);
2594}
2595
2596static int rsm_interception(struct kvm_vcpu *vcpu)
2597{
2598	return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2599}
2600
2601static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2602					    unsigned long val)
2603{
2604	struct vcpu_svm *svm = to_svm(vcpu);
2605	unsigned long cr0 = vcpu->arch.cr0;
2606	bool ret = false;
2607
2608	if (!is_guest_mode(vcpu) ||
2609	    (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2610		return false;
2611
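	/*
	 * The selective CR0 write intercept only fires when bits other than
	 * TS and MP change, so mask those bits off before comparing.
	 */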
2612	cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2613	val &= ~SVM_CR0_SELECTIVE_MASK;
2614
2615	if (cr0 ^ val) {
2616		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2617		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2618	}
2619
2620	return ret;
2621}
2622
2623#define CR_VALID (1ULL << 63)
2624
2625static int cr_interception(struct kvm_vcpu *vcpu)
2626{
2627	struct vcpu_svm *svm = to_svm(vcpu);
2628	int reg, cr;
2629	unsigned long val;
2630	int err;
2631
2632	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2633		return emulate_on_interception(vcpu);
2634
2635	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2636		return emulate_on_interception(vcpu);
2637
2638	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2639	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2640		cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2641	else
2642		cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2643
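	/*
	 * The CR write exit codes are offset by 16 from the corresponding
	 * read exit codes, hence the "cr >= 16" check below.
	 */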
2644	err = 0;
2645	if (cr >= 16) { /* mov to cr */
2646		cr -= 16;
2647		val = kvm_register_read(vcpu, reg);
2648		trace_kvm_cr_write(cr, val);
2649		switch (cr) {
2650		case 0:
2651			if (!check_selective_cr0_intercepted(vcpu, val))
2652				err = kvm_set_cr0(vcpu, val);
2653			else
2654				return 1;
2655
2656			break;
2657		case 3:
2658			err = kvm_set_cr3(vcpu, val);
2659			break;
2660		case 4:
2661			err = kvm_set_cr4(vcpu, val);
2662			break;
2663		case 8:
2664			err = kvm_set_cr8(vcpu, val);
2665			break;
2666		default:
2667			WARN(1, "unhandled write to CR%d", cr);
2668			kvm_queue_exception(vcpu, UD_VECTOR);
2669			return 1;
2670		}
2671	} else { /* mov from cr */
2672		switch (cr) {
2673		case 0:
2674			val = kvm_read_cr0(vcpu);
2675			break;
2676		case 2:
2677			val = vcpu->arch.cr2;
2678			break;
2679		case 3:
2680			val = kvm_read_cr3(vcpu);
2681			break;
2682		case 4:
2683			val = kvm_read_cr4(vcpu);
2684			break;
2685		case 8:
2686			val = kvm_get_cr8(vcpu);
2687			break;
2688		default:
2689			WARN(1, "unhandled read from CR%d", cr);
2690			kvm_queue_exception(vcpu, UD_VECTOR);
2691			return 1;
2692		}
2693		kvm_register_write(vcpu, reg, val);
2694		trace_kvm_cr_read(cr, val);
2695	}
2696	return kvm_complete_insn_gp(vcpu, err);
2697}
2698
2699static int cr_trap(struct kvm_vcpu *vcpu)
2700{
2701	struct vcpu_svm *svm = to_svm(vcpu);
2702	unsigned long old_value, new_value;
2703	unsigned int cr;
2704	int ret = 0;
2705
2706	new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2707
2708	cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2709	switch (cr) {
2710	case 0:
2711		old_value = kvm_read_cr0(vcpu);
2712		svm_set_cr0(vcpu, new_value);
2713
2714		kvm_post_set_cr0(vcpu, old_value, new_value);
2715		break;
2716	case 4:
2717		old_value = kvm_read_cr4(vcpu);
2718		svm_set_cr4(vcpu, new_value);
2719
2720		kvm_post_set_cr4(vcpu, old_value, new_value);
2721		break;
2722	case 8:
2723		ret = kvm_set_cr8(vcpu, new_value);
2724		break;
2725	default:
2726		WARN(1, "unhandled CR%d write trap", cr);
2727		kvm_queue_exception(vcpu, UD_VECTOR);
2728		return 1;
2729	}
2730
2731	return kvm_complete_insn_gp(vcpu, ret);
2732}
2733
2734static int dr_interception(struct kvm_vcpu *vcpu)
2735{
2736	struct vcpu_svm *svm = to_svm(vcpu);
2737	int reg, dr;
2738	unsigned long val;
2739	int err = 0;
2740
2741	/*
2742	 * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
2743	 * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
2744	 */
2745	if (sev_es_guest(vcpu->kvm))
2746		return 1;
2747
2748	if (vcpu->guest_debug == 0) {
2749		/*
2750		 * No more DR vmexits; force a reload of the debug registers
2751		 * and reenter on this instruction.  The next vmexit will
2752		 * retrieve the full state of the debug registers.
2753		 */
2754		clr_dr_intercepts(svm);
2755		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2756		return 1;
2757	}
2758
2759	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2760		return emulate_on_interception(vcpu);
2761
2762	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2763	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2764	if (dr >= 16) { /* mov to DRn  */
2765		dr -= 16;
2766		val = kvm_register_read(vcpu, reg);
2767		err = kvm_set_dr(vcpu, dr, val);
2768	} else {
2769		kvm_get_dr(vcpu, dr, &val);
2770		kvm_register_write(vcpu, reg, val);
2771	}
2772
2773	return kvm_complete_insn_gp(vcpu, err);
2774}
2775
2776static int cr8_write_interception(struct kvm_vcpu *vcpu)
2777{
2778	int r;
2779
2780	u8 cr8_prev = kvm_get_cr8(vcpu);
2781	/* instruction emulation calls kvm_set_cr8() */
2782	r = cr_interception(vcpu);
2783	if (lapic_in_kernel(vcpu))
2784		return r;
2785	if (cr8_prev <= kvm_get_cr8(vcpu))
2786		return r;
2787	vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2788	return 0;
2789}
2790
2791static int efer_trap(struct kvm_vcpu *vcpu)
2792{
2793	struct msr_data msr_info;
2794	int ret;
2795
2796	/*
2797	 * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2798	 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2799	 * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2800	 * the guest doesn't have X86_FEATURE_SVM.
2801	 */
2802	msr_info.host_initiated = false;
2803	msr_info.index = MSR_EFER;
2804	msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2805	ret = kvm_set_msr_common(vcpu, &msr_info);
2806
2807	return kvm_complete_insn_gp(vcpu, ret);
2808}
2809
2810static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2811{
2812	msr->data = 0;
2813
2814	switch (msr->index) {
2815	case MSR_AMD64_DE_CFG:
2816		if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
2817			msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
2818		break;
2819	default:
2820		return KVM_MSR_RET_INVALID;
2821	}
2822
2823	return 0;
2824}
2825
2826static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2827{
2828	struct vcpu_svm *svm = to_svm(vcpu);
2829
2830	switch (msr_info->index) {
2831	case MSR_AMD64_TSC_RATIO:
2832		if (!msr_info->host_initiated &&
2833		    !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
2834			return 1;
2835		msr_info->data = svm->tsc_ratio_msr;
2836		break;
2837	case MSR_STAR:
2838		msr_info->data = svm->vmcb01.ptr->save.star;
2839		break;
2840#ifdef CONFIG_X86_64
2841	case MSR_LSTAR:
2842		msr_info->data = svm->vmcb01.ptr->save.lstar;
2843		break;
2844	case MSR_CSTAR:
2845		msr_info->data = svm->vmcb01.ptr->save.cstar;
2846		break;
2847	case MSR_KERNEL_GS_BASE:
2848		msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2849		break;
2850	case MSR_SYSCALL_MASK:
2851		msr_info->data = svm->vmcb01.ptr->save.sfmask;
2852		break;
2853#endif
2854	case MSR_IA32_SYSENTER_CS:
2855		msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2856		break;
2857	case MSR_IA32_SYSENTER_EIP:
2858		msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2859		if (guest_cpuid_is_intel(vcpu))
2860			msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2861		break;
2862	case MSR_IA32_SYSENTER_ESP:
2863		msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2864		if (guest_cpuid_is_intel(vcpu))
2865			msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2866		break;
2867	case MSR_TSC_AUX:
2868		msr_info->data = svm->tsc_aux;
2869		break;
2870	case MSR_IA32_DEBUGCTLMSR:
2871		msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
2872		break;
2873	case MSR_IA32_LASTBRANCHFROMIP:
2874		msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
2875		break;
2876	case MSR_IA32_LASTBRANCHTOIP:
2877		msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
2878		break;
2879	case MSR_IA32_LASTINTFROMIP:
2880		msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
2881		break;
2882	case MSR_IA32_LASTINTTOIP:
2883		msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
2884		break;
2885	case MSR_VM_HSAVE_PA:
2886		msr_info->data = svm->nested.hsave_msr;
2887		break;
2888	case MSR_VM_CR:
2889		msr_info->data = svm->nested.vm_cr_msr;
2890		break;
2891	case MSR_IA32_SPEC_CTRL:
2892		if (!msr_info->host_initiated &&
2893		    !guest_has_spec_ctrl_msr(vcpu))
2894			return 1;
2895
2896		if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2897			msr_info->data = svm->vmcb->save.spec_ctrl;
2898		else
2899			msr_info->data = svm->spec_ctrl;
2900		break;
2901	case MSR_AMD64_VIRT_SPEC_CTRL:
2902		if (!msr_info->host_initiated &&
2903		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2904			return 1;
2905
2906		msr_info->data = svm->virt_spec_ctrl;
2907		break;
2908	case MSR_F15H_IC_CFG: {
2909
2910		int family, model;
2911
2912		family = guest_cpuid_family(vcpu);
2913		model  = guest_cpuid_model(vcpu);
2914
2915		if (family < 0 || model < 0)
2916			return kvm_get_msr_common(vcpu, msr_info);
2917
2918		msr_info->data = 0;
2919
2920		if (family == 0x15 &&
2921		    (model >= 0x2 && model < 0x20))
2922			msr_info->data = 0x1E;
2923		}
2924		break;
2925	case MSR_AMD64_DE_CFG:
2926		msr_info->data = svm->msr_decfg;
2927		break;
2928	default:
2929		return kvm_get_msr_common(vcpu, msr_info);
2930	}
2931	return 0;
2932}
2933
2934static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2935{
2936	struct vcpu_svm *svm = to_svm(vcpu);
2937	if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2938		return kvm_complete_insn_gp(vcpu, err);
2939
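	/*
	 * For an SEV-ES guest, report the failed MSR access back through the
	 * GHCB as a synthetic #GP instead of injecting the fault directly.
	 */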
2940	ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2941	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2942				X86_TRAP_GP |
2943				SVM_EVTINJ_TYPE_EXEPT |
2944				SVM_EVTINJ_VALID);
2945	return 1;
2946}
2947
2948static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2949{
2950	struct vcpu_svm *svm = to_svm(vcpu);
2951	int svm_dis, chg_mask;
2952
2953	if (data & ~SVM_VM_CR_VALID_MASK)
2954		return 1;
2955
2956	chg_mask = SVM_VM_CR_VALID_MASK;
2957
2958	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2959		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2960
2961	svm->nested.vm_cr_msr &= ~chg_mask;
2962	svm->nested.vm_cr_msr |= (data & chg_mask);
2963
2964	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2965
2966	/* check for svm_disable while efer.svme is set */
2967	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2968		return 1;
2969
2970	return 0;
2971}
2972
2973static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2974{
2975	struct vcpu_svm *svm = to_svm(vcpu);
2976	int ret = 0;
2977
2978	u32 ecx = msr->index;
2979	u64 data = msr->data;
2980	switch (ecx) {
2981	case MSR_AMD64_TSC_RATIO:
2982
2983		if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
2984
2985			if (!msr->host_initiated)
2986				return 1;
2987			/*
2988			 * In case TSC scaling is not enabled, always
2989			 * leave this MSR at the default value.
2990			 *
2991			 * Due to a bug in QEMU 6.2.0, it tries to set this
2992			 * MSR to 0 if TSC scaling is not enabled.  Ignore
2993			 * that value as well.
2994			 */
2995			if (data != 0 && data != svm->tsc_ratio_msr)
2996				return 1;
2997			break;
2998		}
2999
3000		if (data & SVM_TSC_RATIO_RSVD)
3001			return 1;
3002
3003		svm->tsc_ratio_msr = data;
3004
3005		if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
3006		    is_guest_mode(vcpu))
3007			nested_svm_update_tsc_ratio_msr(vcpu);
3008
3009		break;
3010	case MSR_IA32_CR_PAT:
3011		ret = kvm_set_msr_common(vcpu, msr);
3012		if (ret)
3013			break;
3014
3015		svm->vmcb01.ptr->save.g_pat = data;
3016		if (is_guest_mode(vcpu))
3017			nested_vmcb02_compute_g_pat(svm);
3018		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
3019		break;
3020	case MSR_IA32_SPEC_CTRL:
3021		if (!msr->host_initiated &&
3022		    !guest_has_spec_ctrl_msr(vcpu))
3023			return 1;
3024
3025		if (kvm_spec_ctrl_test_value(data))
3026			return 1;
3027
3028		if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3029			svm->vmcb->save.spec_ctrl = data;
3030		else
3031			svm->spec_ctrl = data;
3032		if (!data)
3033			break;
3034
3035		/*
3036		 * For non-nested:
3037		 * When it's written (to non-zero) for the first time, pass
3038		 * it through.
3039		 *
3040		 * For nested:
3041		 * The handling of the MSR bitmap for L2 guests is done in
3042		 * nested_svm_vmrun_msrpm.
3043		 * We update the L1 MSR bit as well since it will end up
3044		 * touching the MSR anyway now.
3045		 */
3046		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
3047		break;
3048	case MSR_AMD64_VIRT_SPEC_CTRL:
3049		if (!msr->host_initiated &&
3050		    !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
3051			return 1;
3052
3053		if (data & ~SPEC_CTRL_SSBD)
3054			return 1;
3055
3056		svm->virt_spec_ctrl = data;
3057		break;
3058	case MSR_STAR:
3059		svm->vmcb01.ptr->save.star = data;
3060		break;
3061#ifdef CONFIG_X86_64
3062	case MSR_LSTAR:
3063		svm->vmcb01.ptr->save.lstar = data;
3064		break;
3065	case MSR_CSTAR:
3066		svm->vmcb01.ptr->save.cstar = data;
3067		break;
3068	case MSR_KERNEL_GS_BASE:
3069		svm->vmcb01.ptr->save.kernel_gs_base = data;
3070		break;
3071	case MSR_SYSCALL_MASK:
3072		svm->vmcb01.ptr->save.sfmask = data;
3073		break;
3074#endif
3075	case MSR_IA32_SYSENTER_CS:
3076		svm->vmcb01.ptr->save.sysenter_cs = data;
3077		break;
3078	case MSR_IA32_SYSENTER_EIP:
3079		svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
3080		/*
3081		 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs
3082		 * when we spoof an Intel vendor ID (for cross vendor migration).
3083		 * In this case we use this intercept to track the high
3084		 * 32 bit part of these msrs to support Intel's
3085		 * implementation of SYSENTER/SYSEXIT.
3086		 */
3087		svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3088		break;
3089	case MSR_IA32_SYSENTER_ESP:
3090		svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
3091		svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
3092		break;
3093	case MSR_TSC_AUX:
3094		/*
3095		 * TSC_AUX is always virtualized for SEV-ES guests when the
3096		 * feature is available. The user return MSR support is not
3097		 * required in this case because TSC_AUX is restored on #VMEXIT
3098		 * from the host save area (which has been initialized in
3099		 * svm_hardware_enable()).
3100		 */
3101		if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
3102			break;
3103
3104		/*
3105		 * TSC_AUX is usually changed only during boot and never read
3106		 * directly.  Intercept TSC_AUX instead of exposing it to the
3107		 * guest via direct_access_msrs, and switch it via user return.
3108		 */
3109		preempt_disable();
3110		ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3111		preempt_enable();
3112		if (ret)
3113			break;
3114
3115		svm->tsc_aux = data;
3116		break;
3117	case MSR_IA32_DEBUGCTLMSR:
3118		if (!lbrv) {
3119			kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3120			break;
3121		}
3122		if (data & DEBUGCTL_RESERVED_BITS)
3123			return 1;
3124
3125		svm_get_lbr_vmcb(svm)->save.dbgctl = data;
3126		svm_update_lbrv(vcpu);
3127		break;
3128	case MSR_VM_HSAVE_PA:
3129		/*
3130		 * Old kernels did not validate the value written to
3131		 * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
3132		 * value to allow live migrating buggy or malicious guests
3133		 * originating from those kernels.
3134		 */
3135		if (!msr->host_initiated && !page_address_valid(vcpu, data))
3136			return 1;
3137
3138		svm->nested.hsave_msr = data & PAGE_MASK;
3139		break;
3140	case MSR_VM_CR:
3141		return svm_set_vm_cr(vcpu, data);
3142	case MSR_VM_IGNNE:
3143		kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
3144		break;
3145	case MSR_AMD64_DE_CFG: {
3146		struct kvm_msr_entry msr_entry;
3147
3148		msr_entry.index = msr->index;
3149		if (svm_get_msr_feature(&msr_entry))
3150			return 1;
3151
3152		/* Check the supported bits */
3153		if (data & ~msr_entry.data)
3154			return 1;
3155
3156		/* Don't allow the guest to change a bit, #GP */
3157		if (!msr->host_initiated && (data ^ msr_entry.data))
3158			return 1;
3159
3160		svm->msr_decfg = data;
3161		break;
3162	}
3163	default:
3164		return kvm_set_msr_common(vcpu, msr);
3165	}
3166	return ret;
3167}
3168
3169static int msr_interception(struct kvm_vcpu *vcpu)
3170{
3171	if (to_svm(vcpu)->vmcb->control.exit_info_1)
3172		return kvm_emulate_wrmsr(vcpu);
3173	else
3174		return kvm_emulate_rdmsr(vcpu);
3175}
3176
3177static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3178{
3179	kvm_make_request(KVM_REQ_EVENT, vcpu);
3180	svm_clear_vintr(to_svm(vcpu));
3181
3182	/*
3183	 * If not running nested, then for AVIC the only reason to end up here
3184	 * is ExtINTs: AVIC was temporarily disabled in order to request the
3185	 * IRQ window, so it has to be re-enabled now.
3186	 *
3187	 * If running nested, still remove the VM wide AVIC inhibit to support
3188	 * the case in which the interrupt window was requested while the vCPU
3189	 * was not running nested.
3190	 *
3191	 * Any vCPU that is still running nested will keep its AVIC inhibited
3192	 * via the per-vCPU AVIC inhibition.
3193	 */
3194	kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3195
3196	++vcpu->stat.irq_window_exits;
3197	return 1;
3198}
3199
3200static int pause_interception(struct kvm_vcpu *vcpu)
3201{
3202	bool in_kernel;
3203	/*
3204	 * CPL is not made available for an SEV-ES guest, therefore
3205	 * vcpu->arch.preempted_in_kernel can never be true.  Just
3206	 * set in_kernel to false as well.
3207	 */
3208	in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3209
3210	grow_ple_window(vcpu);
3211
3212	kvm_vcpu_on_spin(vcpu, in_kernel);
3213	return kvm_skip_emulated_instruction(vcpu);
3214}
3215
3216static int invpcid_interception(struct kvm_vcpu *vcpu)
3217{
3218	struct vcpu_svm *svm = to_svm(vcpu);
3219	unsigned long type;
3220	gva_t gva;
3221
3222	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3223		kvm_queue_exception(vcpu, UD_VECTOR);
3224		return 1;
3225	}
3226
3227	/*
3228	 * For an INVPCID intercept:
3229	 * EXITINFO1 provides the linear address of the memory operand.
3230	 * EXITINFO2 provides the contents of the register operand.
3231	 */
3232	type = svm->vmcb->control.exit_info_2;
3233	gva = svm->vmcb->control.exit_info_1;
3234
3235	return kvm_handle_invpcid(vcpu, type, gva);
3236}
3237
3238static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3239	[SVM_EXIT_READ_CR0]			= cr_interception,
3240	[SVM_EXIT_READ_CR3]			= cr_interception,
3241	[SVM_EXIT_READ_CR4]			= cr_interception,
3242	[SVM_EXIT_READ_CR8]			= cr_interception,
3243	[SVM_EXIT_CR0_SEL_WRITE]		= cr_interception,
3244	[SVM_EXIT_WRITE_CR0]			= cr_interception,
3245	[SVM_EXIT_WRITE_CR3]			= cr_interception,
3246	[SVM_EXIT_WRITE_CR4]			= cr_interception,
3247	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
3248	[SVM_EXIT_READ_DR0]			= dr_interception,
3249	[SVM_EXIT_READ_DR1]			= dr_interception,
3250	[SVM_EXIT_READ_DR2]			= dr_interception,
3251	[SVM_EXIT_READ_DR3]			= dr_interception,
3252	[SVM_EXIT_READ_DR4]			= dr_interception,
3253	[SVM_EXIT_READ_DR5]			= dr_interception,
3254	[SVM_EXIT_READ_DR6]			= dr_interception,
3255	[SVM_EXIT_READ_DR7]			= dr_interception,
3256	[SVM_EXIT_WRITE_DR0]			= dr_interception,
3257	[SVM_EXIT_WRITE_DR1]			= dr_interception,
3258	[SVM_EXIT_WRITE_DR2]			= dr_interception,
3259	[SVM_EXIT_WRITE_DR3]			= dr_interception,
3260	[SVM_EXIT_WRITE_DR4]			= dr_interception,
3261	[SVM_EXIT_WRITE_DR5]			= dr_interception,
3262	[SVM_EXIT_WRITE_DR6]			= dr_interception,
3263	[SVM_EXIT_WRITE_DR7]			= dr_interception,
3264	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
3265	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
3266	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
3267	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
3268	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
3269	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
3270	[SVM_EXIT_EXCP_BASE + GP_VECTOR]	= gp_interception,
3271	[SVM_EXIT_INTR]				= intr_interception,
3272	[SVM_EXIT_NMI]				= nmi_interception,
3273	[SVM_EXIT_SMI]				= smi_interception,
3274	[SVM_EXIT_VINTR]			= interrupt_window_interception,
3275	[SVM_EXIT_RDPMC]			= kvm_emulate_rdpmc,
3276	[SVM_EXIT_CPUID]			= kvm_emulate_cpuid,
3277	[SVM_EXIT_IRET]                         = iret_interception,
3278	[SVM_EXIT_INVD]                         = kvm_emulate_invd,
3279	[SVM_EXIT_PAUSE]			= pause_interception,
3280	[SVM_EXIT_HLT]				= kvm_emulate_halt,
3281	[SVM_EXIT_INVLPG]			= invlpg_interception,
3282	[SVM_EXIT_INVLPGA]			= invlpga_interception,
3283	[SVM_EXIT_IOIO]				= io_interception,
3284	[SVM_EXIT_MSR]				= msr_interception,
3285	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
3286	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
3287	[SVM_EXIT_VMRUN]			= vmrun_interception,
3288	[SVM_EXIT_VMMCALL]			= kvm_emulate_hypercall,
3289	[SVM_EXIT_VMLOAD]			= vmload_interception,
3290	[SVM_EXIT_VMSAVE]			= vmsave_interception,
3291	[SVM_EXIT_STGI]				= stgi_interception,
3292	[SVM_EXIT_CLGI]				= clgi_interception,
3293	[SVM_EXIT_SKINIT]			= skinit_interception,
3294	[SVM_EXIT_RDTSCP]			= kvm_handle_invalid_op,
3295	[SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3296	[SVM_EXIT_MONITOR]			= kvm_emulate_monitor,
3297	[SVM_EXIT_MWAIT]			= kvm_emulate_mwait,
3298	[SVM_EXIT_XSETBV]			= kvm_emulate_xsetbv,
3299	[SVM_EXIT_RDPRU]			= kvm_handle_invalid_op,
3300	[SVM_EXIT_EFER_WRITE_TRAP]		= efer_trap,
3301	[SVM_EXIT_CR0_WRITE_TRAP]		= cr_trap,
3302	[SVM_EXIT_CR4_WRITE_TRAP]		= cr_trap,
3303	[SVM_EXIT_CR8_WRITE_TRAP]		= cr_trap,
3304	[SVM_EXIT_INVPCID]                      = invpcid_interception,
3305	[SVM_EXIT_NPF]				= npf_interception,
3306	[SVM_EXIT_RSM]                          = rsm_interception,
3307	[SVM_EXIT_AVIC_INCOMPLETE_IPI]		= avic_incomplete_ipi_interception,
3308	[SVM_EXIT_AVIC_UNACCELERATED_ACCESS]	= avic_unaccelerated_access_interception,
3309	[SVM_EXIT_VMGEXIT]			= sev_handle_vmgexit,
3310};
3311
3312static void dump_vmcb(struct kvm_vcpu *vcpu)
3313{
3314	struct vcpu_svm *svm = to_svm(vcpu);
3315	struct vmcb_control_area *control = &svm->vmcb->control;
3316	struct vmcb_save_area *save = &svm->vmcb->save;
3317	struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3318
3319	if (!dump_invalid_vmcb) {
3320		pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3321		return;
3322	}
3323
3324	pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3325	       svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3326	pr_err("VMCB Control Area:\n");
3327	pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3328	pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3329	pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3330	pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3331	pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3332	pr_err("%-20s%08x %08x\n", "intercepts:",
3333	       control->intercepts[INTERCEPT_WORD3],
3334	       control->intercepts[INTERCEPT_WORD4]);
3335	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3336	pr_err("%-20s%d\n", "pause filter threshold:",
3337	       control->pause_filter_thresh);
3338	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3339	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3340	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3341	pr_err("%-20s%d\n", "asid:", control->asid);
3342	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3343	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3344	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3345	pr_err("%-20s%08x\n", "int_state:", control->int_state);
3346	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3347	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3348	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3349	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3350	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3351	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3352	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3353	pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3354	pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3355	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3356	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3357	pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3358	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3359	pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3360	pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3361	pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3362	pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3363	pr_err("VMCB State Save Area:\n");
3364	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3365	       "es:",
3366	       save->es.selector, save->es.attrib,
3367	       save->es.limit, save->es.base);
3368	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3369	       "cs:",
3370	       save->cs.selector, save->cs.attrib,
3371	       save->cs.limit, save->cs.base);
3372	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3373	       "ss:",
3374	       save->ss.selector, save->ss.attrib,
3375	       save->ss.limit, save->ss.base);
3376	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3377	       "ds:",
3378	       save->ds.selector, save->ds.attrib,
3379	       save->ds.limit, save->ds.base);
3380	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3381	       "fs:",
3382	       save01->fs.selector, save01->fs.attrib,
3383	       save01->fs.limit, save01->fs.base);
3384	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3385	       "gs:",
3386	       save01->gs.selector, save01->gs.attrib,
3387	       save01->gs.limit, save01->gs.base);
3388	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3389	       "gdtr:",
3390	       save->gdtr.selector, save->gdtr.attrib,
3391	       save->gdtr.limit, save->gdtr.base);
3392	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3393	       "ldtr:",
3394	       save01->ldtr.selector, save01->ldtr.attrib,
3395	       save01->ldtr.limit, save01->ldtr.base);
3396	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3397	       "idtr:",
3398	       save->idtr.selector, save->idtr.attrib,
3399	       save->idtr.limit, save->idtr.base);
3400	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3401	       "tr:",
3402	       save01->tr.selector, save01->tr.attrib,
3403	       save01->tr.limit, save01->tr.base);
3404	pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3405	       save->vmpl, save->cpl, save->efer);
3406	pr_err("%-15s %016llx %-13s %016llx\n",
3407	       "cr0:", save->cr0, "cr2:", save->cr2);
3408	pr_err("%-15s %016llx %-13s %016llx\n",
3409	       "cr3:", save->cr3, "cr4:", save->cr4);
3410	pr_err("%-15s %016llx %-13s %016llx\n",
3411	       "dr6:", save->dr6, "dr7:", save->dr7);
3412	pr_err("%-15s %016llx %-13s %016llx\n",
3413	       "rip:", save->rip, "rflags:", save->rflags);
3414	pr_err("%-15s %016llx %-13s %016llx\n",
3415	       "rsp:", save->rsp, "rax:", save->rax);
3416	pr_err("%-15s %016llx %-13s %016llx\n",
3417	       "star:", save01->star, "lstar:", save01->lstar);
3418	pr_err("%-15s %016llx %-13s %016llx\n",
3419	       "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3420	pr_err("%-15s %016llx %-13s %016llx\n",
3421	       "kernel_gs_base:", save01->kernel_gs_base,
3422	       "sysenter_cs:", save01->sysenter_cs);
3423	pr_err("%-15s %016llx %-13s %016llx\n",
3424	       "sysenter_esp:", save01->sysenter_esp,
3425	       "sysenter_eip:", save01->sysenter_eip);
3426	pr_err("%-15s %016llx %-13s %016llx\n",
3427	       "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3428	pr_err("%-15s %016llx %-13s %016llx\n",
3429	       "br_from:", save->br_from, "br_to:", save->br_to);
3430	pr_err("%-15s %016llx %-13s %016llx\n",
3431	       "excp_from:", save->last_excp_from,
3432	       "excp_to:", save->last_excp_to);
3433}
3434
3435static bool svm_check_exit_valid(u64 exit_code)
3436{
3437	return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3438		svm_exit_handlers[exit_code]);
3439}
3440
3441static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3442{
3443	vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3444	dump_vmcb(vcpu);
3445	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3446	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3447	vcpu->run->internal.ndata = 2;
3448	vcpu->run->internal.data[0] = exit_code;
3449	vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3450	return 0;
3451}
3452
3453int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3454{
3455	if (!svm_check_exit_valid(exit_code))
3456		return svm_handle_invalid_exit(vcpu, exit_code);
3457
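	/*
	 * With retpolines enabled, open-code the most common exit reasons to
	 * avoid the overhead of an indirect call through the handler table.
	 */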
3458#ifdef CONFIG_RETPOLINE
3459	if (exit_code == SVM_EXIT_MSR)
3460		return msr_interception(vcpu);
3461	else if (exit_code == SVM_EXIT_VINTR)
3462		return interrupt_window_interception(vcpu);
3463	else if (exit_code == SVM_EXIT_INTR)
3464		return intr_interception(vcpu);
3465	else if (exit_code == SVM_EXIT_HLT)
3466		return kvm_emulate_halt(vcpu);
3467	else if (exit_code == SVM_EXIT_NPF)
3468		return npf_interception(vcpu);
3469#endif
3470	return svm_exit_handlers[exit_code](vcpu);
3471}
3472
3473static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3474			      u64 *info1, u64 *info2,
3475			      u32 *intr_info, u32 *error_code)
3476{
3477	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3478
3479	*reason = control->exit_code;
3480	*info1 = control->exit_info_1;
3481	*info2 = control->exit_info_2;
3482	*intr_info = control->exit_int_info;
3483	if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3484	    (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3485		*error_code = control->exit_int_info_err;
3486	else
3487		*error_code = 0;
3488}
3489
3490static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3491{
3492	struct vcpu_svm *svm = to_svm(vcpu);
3493	struct kvm_run *kvm_run = vcpu->run;
3494	u32 exit_code = svm->vmcb->control.exit_code;
3495
3496	/* SEV-ES guests must use the CR write traps to track CR registers. */
3497	if (!sev_es_guest(vcpu->kvm)) {
3498		if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3499			vcpu->arch.cr0 = svm->vmcb->save.cr0;
3500		if (npt_enabled)
3501			vcpu->arch.cr3 = svm->vmcb->save.cr3;
3502	}
3503
3504	if (is_guest_mode(vcpu)) {
3505		int vmexit;
3506
3507		trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3508
3509		vmexit = nested_svm_exit_special(svm);
3510
3511		if (vmexit == NESTED_EXIT_CONTINUE)
3512			vmexit = nested_svm_exit_handled(svm);
3513
3514		if (vmexit == NESTED_EXIT_DONE)
3515			return 1;
3516	}
3517
3518	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3519		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3520		kvm_run->fail_entry.hardware_entry_failure_reason
3521			= svm->vmcb->control.exit_code;
3522		kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3523		dump_vmcb(vcpu);
3524		return 0;
3525	}
3526
3527	if (exit_fastpath != EXIT_FASTPATH_NONE)
3528		return 1;
3529
3530	return svm_invoke_exit_handler(vcpu, exit_code);
3531}
3532
3533static void pre_svm_run(struct kvm_vcpu *vcpu)
3534{
3535	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
3536	struct vcpu_svm *svm = to_svm(vcpu);
3537
3538	/*
3539	 * If the previous vmrun of the vmcb occurred on a different physical
3540	 * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3541	 * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3542	 */
3543	if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3544		svm->current_vmcb->asid_generation = 0;
3545		vmcb_mark_all_dirty(svm->vmcb);
3546		svm->current_vmcb->cpu = vcpu->cpu;
3547	}
3548
3549	if (sev_guest(vcpu->kvm))
3550		return pre_sev_run(svm, vcpu->cpu);
3551
3552	/* FIXME: handle wraparound of asid_generation */
3553	if (svm->current_vmcb->asid_generation != sd->asid_generation)
3554		new_asid(svm, sd);
3555}
3556
3557static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3558{
3559	struct vcpu_svm *svm = to_svm(vcpu);
3560
3561	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3562
3563	if (svm->nmi_l1_to_l2)
3564		return;
3565
3566	/*
3567	 * No need to manually track NMI masking when vNMI is enabled, hardware
3568	 * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
3569	 * case where software directly injects an NMI.
3570	 */
3571	if (!is_vnmi_enabled(svm)) {
3572		svm->nmi_masked = true;
3573		svm_set_iret_intercept(svm);
3574	}
3575	++vcpu->stat.nmi_injections;
3576}
3577
3578static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
3579{
3580	struct vcpu_svm *svm = to_svm(vcpu);
3581
3582	if (!is_vnmi_enabled(svm))
3583		return false;
3584
3585	return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
3586}
3587
3588static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
3589{
3590	struct vcpu_svm *svm = to_svm(vcpu);
3591
3592	if (!is_vnmi_enabled(svm))
3593		return false;
3594
3595	if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
3596		return false;
3597
3598	svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
3599	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
3600
3601	/*
3602	 * Because the pending NMI is serviced by hardware, KVM can't know when
3603	 * the NMI is "injected", but for all intents and purposes, passing the
3604	 * NMI off to hardware counts as injection.
3605	 */
3606	++vcpu->stat.nmi_injections;
3607
3608	return true;
3609}
3610
3611static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3612{
3613	struct vcpu_svm *svm = to_svm(vcpu);
3614	u32 type;
3615
3616	if (vcpu->arch.interrupt.soft) {
3617		if (svm_update_soft_interrupt_rip(vcpu))
3618			return;
3619
3620		type = SVM_EVTINJ_TYPE_SOFT;
3621	} else {
3622		type = SVM_EVTINJ_TYPE_INTR;
3623	}
3624
3625	trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3626			   vcpu->arch.interrupt.soft, reinjected);
3627	++vcpu->stat.irq_injections;
3628
3629	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3630				       SVM_EVTINJ_VALID | type;
3631}
3632
3633void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3634				     int trig_mode, int vector)
3635{
3636	/*
3637	 * apic->apicv_active must be read after vcpu->mode.
3638	 * Pairs with smp_store_release in vcpu_enter_guest.
3639	 */
3640	bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3641
3642	/* Note, this is called iff the local APIC is in-kernel. */
3643	if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3644		/* Process the interrupt via kvm_check_and_inject_events(). */
3645		kvm_make_request(KVM_REQ_EVENT, vcpu);
3646		kvm_vcpu_kick(vcpu);
3647		return;
3648	}
3649
3650	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3651	if (in_guest_mode) {
3652		/*
3653		 * Signal the doorbell to tell hardware to inject the IRQ.  If
3654		 * the vCPU exits the guest before the doorbell chimes, hardware
3655		 * will automatically process AVIC interrupts at the next VMRUN.
3656		 */
3657		avic_ring_doorbell(vcpu);
3658	} else {
3659		/*
3660		 * Wake the vCPU if it was blocking.  KVM will then detect the
3661		 * pending IRQ when checking if the vCPU has a wake event.
3662		 */
3663		kvm_vcpu_wake_up(vcpu);
3664	}
3665}
3666
3667static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3668				  int trig_mode, int vector)
3669{
3670	kvm_lapic_set_irr(vector, apic);
3671
3672	/*
3673	 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3674	 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3675	 * the read of guest_mode.  This guarantees that either VMRUN will see
3676	 * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3677	 * will signal the doorbell if the CPU has already entered the guest.
3678	 */
3679	smp_mb__after_atomic();
3680	svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3681}
3682
3683static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3684{
3685	struct vcpu_svm *svm = to_svm(vcpu);
3686
3687	/*
3688	 * SEV-ES guests must always keep the CR intercepts cleared. CR
3689	 * tracking is done using the CR write traps.
3690	 */
3691	if (sev_es_guest(vcpu->kvm))
3692		return;
3693
3694	if (nested_svm_virtualize_tpr(vcpu))
3695		return;
3696
3697	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3698
3699	if (irr == -1)
3700		return;
3701
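	/*
	 * Re-arm the CR8 write intercept only while the TPR blocks the
	 * highest-priority pending interrupt, so that KVM sees the write that
	 * lowers the TPR and unblocks delivery.
	 */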
3702	if (tpr >= irr)
3703		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3704}
3705
3706static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3707{
3708	struct vcpu_svm *svm = to_svm(vcpu);
3709
3710	if (is_vnmi_enabled(svm))
3711		return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
3712	else
3713		return svm->nmi_masked;
3714}
3715
3716static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3717{
3718	struct vcpu_svm *svm = to_svm(vcpu);
3719
3720	if (is_vnmi_enabled(svm)) {
3721		if (masked)
3722			svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
3723		else
3724			svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
3725
3726	} else {
3727		svm->nmi_masked = masked;
3728		if (masked)
3729			svm_set_iret_intercept(svm);
3730		else
3731			svm_clr_iret_intercept(svm);
3732	}
3733}
3734
3735bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3736{
3737	struct vcpu_svm *svm = to_svm(vcpu);
3738	struct vmcb *vmcb = svm->vmcb;
3739
3740	if (!gif_set(svm))
3741		return true;
3742
3743	if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3744		return false;
3745
3746	if (svm_get_nmi_mask(vcpu))
3747		return true;
3748
3749	return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
3750}
3751
3752static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3753{
3754	struct vcpu_svm *svm = to_svm(vcpu);
3755	if (svm->nested.nested_run_pending)
3756		return -EBUSY;
3757
3758	if (svm_nmi_blocked(vcpu))
3759		return 0;
3760
3761	/* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3762	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3763		return -EBUSY;
3764	return 1;
3765}
3766
3767bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3768{
3769	struct vcpu_svm *svm = to_svm(vcpu);
3770	struct vmcb *vmcb = svm->vmcb;
3771
3772	if (!gif_set(svm))
3773		return true;
3774
3775	if (is_guest_mode(vcpu)) {
3776		/* As long as interrupts are being delivered...  */
3777		if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3778		    ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3779		    : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3780			return true;
3781
3782		/* ... vmexits aren't blocked by the interrupt shadow  */
3783		if (nested_exit_on_intr(svm))
3784			return false;
3785	} else {
3786		if (!svm_get_if_flag(vcpu))
3787			return true;
3788	}
3789
3790	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3791}
3792
3793static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3794{
3795	struct vcpu_svm *svm = to_svm(vcpu);
3796
3797	if (svm->nested.nested_run_pending)
3798		return -EBUSY;
3799
3800	if (svm_interrupt_blocked(vcpu))
3801		return 0;
3802
3803	/*
3804	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3805	 * e.g. if the IRQ arrived asynchronously after checking nested events.
3806	 */
3807	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3808		return -EBUSY;
3809
3810	return 1;
3811}
3812
3813static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3814{
3815	struct vcpu_svm *svm = to_svm(vcpu);
3816
3817	/*
3818	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3819	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3820	 * get that intercept, this function will be called again though and
3821	 * we'll get the vintr intercept. However, if the vGIF feature is
3822	 * enabled, the STGI interception will not occur. Enable the irq
3823	 * window under the assumption that the hardware will set the GIF.
3824	 */
3825	if (vgif || gif_set(svm)) {
3826		/*
3827		 * IRQ window is not needed when AVIC is enabled,
3828		 * unless we have pending ExtINT since it cannot be injected
3829		 * via AVIC. In such case, KVM needs to temporarily disable AVIC,
3830		 * and fallback to injecting IRQ via V_IRQ.
3831		 *
3832		 * If running nested, AVIC is already locally inhibited
3833		 * on this vCPU, therefore there is no need to request
3834		 * the VM wide AVIC inhibition.
3835		 */
3836		if (!is_guest_mode(vcpu))
3837			kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3838
3839		svm_set_vintr(svm);
3840	}
3841}
3842
3843static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3844{
3845	struct vcpu_svm *svm = to_svm(vcpu);
3846
3847	/*
3848	 * KVM should never request an NMI window when vNMI is enabled, as KVM
3849	 * allows at most one to-be-injected NMI and one pending NMI, i.e. if
3850	 * two NMIs arrive simultaneously, KVM will inject one and set
3851	 * V_NMI_PENDING for the other.  WARN, but continue with the standard
3852	 * single-step approach to try and salvage the pending NMI.
3853	 */
3854	WARN_ON_ONCE(is_vnmi_enabled(svm));
3855
3856	if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
3857		return; /* IRET will cause a vm exit */
3858
3859	/*
3860	 * SEV-ES guests are responsible for signaling when a vCPU is ready to
3861	 * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
3862	 * KVM can't intercept and single-step IRET to detect when NMIs are
3863	 * unblocked (architecturally speaking).  See SVM_VMGEXIT_NMI_COMPLETE.
3864	 *
3865	 * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
3866	 * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
3867	 * supported NAEs in the GHCB protocol.
3868	 */
3869	if (sev_es_guest(vcpu->kvm))
3870		return;
3871
3872	if (!gif_set(svm)) {
3873		if (vgif)
3874			svm_set_intercept(svm, INTERCEPT_STGI);
3875		return; /* STGI will cause a vm exit */
3876	}
3877
3878	/*
3879	 * Something prevents the NMI from being injected.  Single-step over the
3880	 * possible problem (IRET, exception injection, or interrupt shadow).
3881	 */
3882	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3883	svm->nmi_singlestep = true;
3884	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3885}
3886
3887static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
3888{
3889	struct vcpu_svm *svm = to_svm(vcpu);
3890
3891	/*
3892	 * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
3893	 * A TLB flush for the current ASID flushes both "host" and "guest" TLB
3894	 * entries, and thus is a superset of Hyper-V's fine grained flushing.
3895	 */
3896	kvm_hv_vcpu_purge_flush_tlb(vcpu);
3897
3898	/*
3899	 * Flush only the current ASID even if the TLB flush was invoked via
3900	 * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3901	 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3902	 * unconditionally does a TLB flush on both nested VM-Enter and nested
3903	 * VM-Exit (via kvm_mmu_reset_context()).
3904	 */
3905	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3906		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3907	else
3908		svm->current_vmcb->asid_generation--;
3909}
3910
3911static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3912{
3913	hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
3914
3915	/*
3916	 * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
3917	 * flush the NPT mappings via hypercall as flushing the ASID only
3918	 * affects virtual to physical mappings, it does not invalidate guest
3919	 * physical to host physical mappings.
3920	 */
3921	if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
3922		hyperv_flush_guest_mapping(root_tdp);
3923
3924	svm_flush_tlb_asid(vcpu);
3925}
3926
3927static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
3928{
3929	/*
3930	 * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
3931	 * flushes should be routed to hv_flush_remote_tlbs() without requesting
3932	 * a "regular" remote flush.  Reaching this point means either there's
3933	 * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
3934	 * which might be fatal to the guest.  Yell, but try to recover.
3935	 */
3936	if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
3937		hv_flush_remote_tlbs(vcpu->kvm);
3938
3939	svm_flush_tlb_asid(vcpu);
3940}
3941
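/* Flush a single guest virtual address from the current ASID via INVLPGA. */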
3942static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3943{
3944	struct vcpu_svm *svm = to_svm(vcpu);
3945
3946	invlpga(gva, svm->vmcb->control.asid);
3947}
3948
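/*
 * Propagate the hardware-tracked V_TPR back into the emulated APIC's TPR
 * (CR8), but only if CR8 writes aren't being intercepted, i.e. if the guest
 * has been updating the TPR without KVM's involvement.
 */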
3949static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3950{
3951	struct vcpu_svm *svm = to_svm(vcpu);
3952
3953	if (nested_svm_virtualize_tpr(vcpu))
3954		return;
3955
3956	if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3957		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3958		kvm_set_cr8(vcpu, cr8);
3959	}
3960}
3961
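/*
 * Mirror the emulated APIC's TPR (CR8) into V_TPR prior to VMRUN.  Skipped
 * when L1 virtualizes the TPR for L2 or when APICv/AVIC is handling the TPR.
 */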
3962static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3963{
3964	struct vcpu_svm *svm = to_svm(vcpu);
3965	u64 cr8;
3966
3967	if (nested_svm_virtualize_tpr(vcpu) ||
3968	    kvm_vcpu_apicv_active(vcpu))
3969		return;
3970
3971	cr8 = kvm_get_cr8(vcpu);
3972	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3973	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3974}
3975
3976static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3977					int type)
3978{
3979	bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3980	bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3981	struct vcpu_svm *svm = to_svm(vcpu);
3982
3983	/*
3984	 * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3985	 * associated with the original soft exception/interrupt.  next_rip is
3986	 * cleared on all exits that can occur while vectoring an event, so KVM
3987	 * needs to manually set next_rip for re-injection.  Unlike the !nrips
3988	 * case below, this needs to be done if and only if KVM is re-injecting
3989	 * the same event, i.e. if the event is a soft exception/interrupt,
3990	 * otherwise next_rip is unused on VMRUN.
3991	 */
3992	if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3993	    kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3994		svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3995	/*
3996	 * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3997	 * injecting the soft exception/interrupt.  That advancement needs to
3998	 * be unwound if vectoring didn't complete.  Note, the new event may
3999	 * not be the injected event, e.g. if KVM injected an INTn, the INTn
4000	 * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
4001	 * be the reported vectored event, but RIP still needs to be unwound.
4002	 */
4003	else if (!nrips && (is_soft || is_exception) &&
4004		 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
4005		kvm_rip_write(vcpu, svm->soft_int_old_rip);
4006}
4007
4008static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
4009{
4010	struct vcpu_svm *svm = to_svm(vcpu);
4011	u8 vector;
4012	int type;
4013	u32 exitintinfo = svm->vmcb->control.exit_int_info;
4014	bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
4015	bool soft_int_injected = svm->soft_int_injected;
4016
4017	svm->nmi_l1_to_l2 = false;
4018	svm->soft_int_injected = false;
4019
4020	/*
4021	 * If we've made progress since setting awaiting_iret_completion, we've
4022	 * executed an IRET and can allow NMI injection.
4023	 */
4024	if (svm->awaiting_iret_completion &&
4025	    kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
4026		svm->awaiting_iret_completion = false;
4027		svm->nmi_masked = false;
4028		kvm_make_request(KVM_REQ_EVENT, vcpu);
4029	}
4030
4031	vcpu->arch.nmi_injected = false;
4032	kvm_clear_exception_queue(vcpu);
4033	kvm_clear_interrupt_queue(vcpu);
4034
4035	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
4036		return;
4037
4038	kvm_make_request(KVM_REQ_EVENT, vcpu);
4039
4040	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
4041	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
4042
4043	if (soft_int_injected)
4044		svm_complete_soft_interrupt(vcpu, vector, type);
4045
4046	switch (type) {
4047	case SVM_EXITINTINFO_TYPE_NMI:
4048		vcpu->arch.nmi_injected = true;
4049		svm->nmi_l1_to_l2 = nmi_l1_to_l2;
4050		break;
4051	case SVM_EXITINTINFO_TYPE_EXEPT:
4052		/*
4053		 * Never re-inject a #VC exception.
4054		 */
4055		if (vector == X86_TRAP_VC)
4056			break;
4057
4058		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
4059			u32 err = svm->vmcb->control.exit_int_info_err;
4060
4061			kvm_requeue_exception_e(vcpu, vector, err);
4062		} else
4063			kvm_requeue_exception(vcpu, vector);
4064		break;
4065	case SVM_EXITINTINFO_TYPE_INTR:
4066		kvm_queue_interrupt(vcpu, vector, false);
4067		break;
4068	case SVM_EXITINTINFO_TYPE_SOFT:
4069		kvm_queue_interrupt(vcpu, vector, true);
4070		break;
4071	default:
4072		break;
4073	}
4075}
4076
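/*
 * Cancel a not-yet-delivered event injection by transferring EVENTINJ back
 * into EXITINTINFO so that svm_complete_interrupts() re-queues the event.
 */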
4077static void svm_cancel_injection(struct kvm_vcpu *vcpu)
4078{
4079	struct vcpu_svm *svm = to_svm(vcpu);
4080	struct vmcb_control_area *control = &svm->vmcb->control;
4081
4082	control->exit_int_info = control->event_inj;
4083	control->exit_int_info_err = control->event_inj_err;
4084	control->event_inj = 0;
4085	svm_complete_interrupts(vcpu);
4086}
4087
4088static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
4089{
4090	return 1;
4091}
4092
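/*
 * Only WRMSR exits are handled in the fastpath; for SVM_EXIT_MSR, a non-zero
 * exit_info_1 indicates a write.
 */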
4093static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
4094{
4095	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
4096	    to_svm(vcpu)->vmcb->control.exit_info_1)
4097		return handle_fastpath_set_msr_irqoff(vcpu);
4098
4099	return EXIT_FASTPATH_NONE;
4100}
4101
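/*
 * Perform the actual world switch with IRQs disabled, using the dedicated
 * SEV-ES entry path when KVM doesn't manage the guest's register state.
 */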
4102static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
4103{
4104	struct vcpu_svm *svm = to_svm(vcpu);
4105
4106	guest_state_enter_irqoff();
4107
4108	amd_clear_divider();
4109
4110	if (sev_es_guest(vcpu->kvm))
4111		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
4112	else
4113		__svm_vcpu_run(svm, spec_ctrl_intercepted);
4114
4115	guest_state_exit_irqoff();
4116}
4117
4118static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
4119{
4120	struct vcpu_svm *svm = to_svm(vcpu);
4121	bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
4122
4123	trace_kvm_entry(vcpu);
4124
4125	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4126	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4127	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4128
4129	/*
4130	 * Disable singlestep if we're injecting an interrupt/exception.
4131	 * We don't want our modified rflags to be pushed on the stack where
4132	 * we might not be able to easily reset them if we disabled NMI
4133	 * singlestep later.
4134	 */
4135	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
4136		/*
4137		 * Event injection happens before external interrupts cause a
4138		 * vmexit and interrupts are disabled here, so smp_send_reschedule
4139		 * is enough to force an immediate vmexit.
4140		 */
4141		disable_nmi_singlestep(svm);
4142		smp_send_reschedule(vcpu->cpu);
4143	}
4144
4145	pre_svm_run(vcpu);
4146
4147	sync_lapic_to_cr8(vcpu);
4148
4149	if (unlikely(svm->asid != svm->vmcb->control.asid)) {
4150		svm->vmcb->control.asid = svm->asid;
4151		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
4152	}
4153	svm->vmcb->save.cr2 = vcpu->arch.cr2;
4154
4155	svm_hv_update_vp_id(svm->vmcb, vcpu);
4156
4157	/*
4158	 * Run with all-zero DR6 unless needed, so that we can get the exact cause
4159	 * of a #DB.
4160	 */
4161	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
4162		svm_set_dr6(svm, vcpu->arch.dr6);
4163	else
4164		svm_set_dr6(svm, DR6_ACTIVE_LOW);
4165
4166	clgi();
4167	kvm_load_guest_xsave_state(vcpu);
4168
4169	kvm_wait_lapic_expire(vcpu);
4170
4171	/*
4172	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
4173	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
4174	 * is no need to worry about the conditional branch over the wrmsr
4175	 * being speculatively taken.
4176	 */
4177	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4178		x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
4179
4180	svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
4181
4182	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
4183		x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
4184
4185	if (!sev_es_guest(vcpu->kvm)) {
4186		vcpu->arch.cr2 = svm->vmcb->save.cr2;
4187		vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
4188		vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
4189		vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
4190	}
4191	vcpu->arch.regs_dirty = 0;
4192
4193	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4194		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
4195
4196	kvm_load_host_xsave_state(vcpu);
4197	stgi();
4198
4199	/* Any pending NMI will happen here */
4200
4201	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
4202		kvm_after_interrupt(vcpu);
4203
4204	sync_cr8_to_lapic(vcpu);
4205
4206	svm->next_rip = 0;
4207	if (is_guest_mode(vcpu)) {
4208		nested_sync_control_from_vmcb02(svm);
4209
4210		/* Track VMRUNs that have made it past consistency checking */
4211		if (svm->nested.nested_run_pending &&
4212		    svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4213			++vcpu->stat.nested_run;
4214
4215		svm->nested.nested_run_pending = 0;
4216	}
4217
4218	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4219	vmcb_mark_all_clean(svm->vmcb);
4220
4221	/* If the exit was due to a #PF, check for an async #PF. */
4222	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4223		vcpu->arch.apf.host_apf_flags =
4224			kvm_read_and_reset_apf_flags();
4225
4226	vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4227
4228	/*
4229	 * We need to handle MC intercepts here before the vcpu has a chance to
4230	 * change the physical cpu
4231	 */
4232	if (unlikely(svm->vmcb->control.exit_code ==
4233		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
4234		svm_handle_mce(vcpu);
4235
4236	trace_kvm_exit(vcpu, KVM_ISA_SVM);
4237
4238	svm_complete_interrupts(vcpu);
4239
4240	if (is_guest_mode(vcpu))
4241		return EXIT_FASTPATH_NONE;
4242
4243	return svm_exit_handlers_fastpath(vcpu);
4244}
4245
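/*
 * Load the new MMU root: with NPT the root goes into nCR3 and the guest's
 * CR3 is left untouched; with shadow paging the root (plus the active PCID
 * for 64-bit roots) becomes the effective CR3.
 */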
4246static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4247			     int root_level)
4248{
4249	struct vcpu_svm *svm = to_svm(vcpu);
4250	unsigned long cr3;
4251
4252	if (npt_enabled) {
4253		svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4254		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4255
4256		hv_track_root_tdp(vcpu, root_hpa);
4257
4258		cr3 = vcpu->arch.cr3;
4259	} else if (root_level >= PT64_ROOT_4LEVEL) {
4260		cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4261	} else {
4262		/* PCID in the guest should be impossible with a 32-bit MMU. */
4263		WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4264		cr3 = root_hpa;
4265	}
4266
4267	svm->vmcb->save.cr3 = cr3;
4268	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4269}
4270
4271static void
4272svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4273{
4274	/*
4275	 * Patch in the VMMCALL instruction:
4276	 */
4277	hypercall[0] = 0x0f;
4278	hypercall[1] = 0x01;
4279	hypercall[2] = 0xd9;
4280}
4281
4282/*
4283 * The kvm parameter can be NULL (module initialization, or invocation before
4284 * VM creation). Be sure to check the kvm parameter before using it.
4285 */
4286static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4287{
4288	switch (index) {
4289	case MSR_IA32_MCG_EXT_CTL:
4290	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
4291		return false;
4292	case MSR_IA32_SMBASE:
4293		if (!IS_ENABLED(CONFIG_KVM_SMM))
4294			return false;
4295		/* SEV-ES guests do not support SMM, so report false */
4296		if (kvm && sev_es_guest(kvm))
4297			return false;
4298		break;
4299	default:
4300		break;
4301	}
4302
4303	return true;
4304}
4305
4306static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4307{
4308	struct vcpu_svm *svm = to_svm(vcpu);
4309
4310	/*
4311	 * SVM doesn't provide a way to disable just XSAVES in the guest; KVM
4312	 * can only disable all XSAVE variants by disallowing CR4.OSXSAVE from
4313	 * being set.  As a result, if the host has XSAVE and XSAVES, and the
4314	 * guest has XSAVE enabled, the guest can execute XSAVES without
4315	 * faulting.  Treat XSAVES as enabled in this case regardless of
4316	 * whether it's advertised to the guest so that KVM context switches
4317	 * XSS on VM-Enter/VM-Exit.  Failure to do so would effectively give
4318	 * the guest read/write access to the host's XSS.
4319	 */
4320	if (boot_cpu_has(X86_FEATURE_XSAVE) &&
4321	    boot_cpu_has(X86_FEATURE_XSAVES) &&
4322	    guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
4323		kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
4324
4325	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
4326	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
4327	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
4328
4329	/*
4330	 * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
4331	 * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
4332	 * SVM on Intel is bonkers and extremely unlikely to work).
4333	 */
4334	if (!guest_cpuid_is_intel(vcpu))
4335		kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4336
4337	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
4338	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
4339	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
4340	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
4341
4342	svm_recalc_instruction_intercepts(vcpu, svm);
4343
4344	if (boot_cpu_has(X86_FEATURE_IBPB))
4345		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
4346				     !!guest_has_pred_cmd_msr(vcpu));
4347
4348	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
4349		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
4350				     !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
4351
4352	if (sev_guest(vcpu->kvm))
4353		sev_vcpu_after_set_cpuid(svm);
4354
4355	init_vmcb_after_set_cpuid(vcpu);
4356}
4357
4358static bool svm_has_wbinvd_exit(void)
4359{
4360	return true;
4361}
4362
4363#define PRE_EX(exit)  { .exit_code = (exit), \
4364			.stage = X86_ICPT_PRE_EXCEPT, }
4365#define POST_EX(exit) { .exit_code = (exit), \
4366			.stage = X86_ICPT_POST_EXCEPT, }
4367#define POST_MEM(exit) { .exit_code = (exit), \
4368			.stage = X86_ICPT_POST_MEMACCESS, }
4369
4370static const struct __x86_intercept {
4371	u32 exit_code;
4372	enum x86_intercept_stage stage;
4373} x86_intercept_map[] = {
4374	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
4375	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
4376	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
4377	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
4378	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
4379	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
4380	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
4381	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
4382	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
4383	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
4384	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
4385	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
4386	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
4387	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
4388	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
4389	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
4390	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
4391	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
4392	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
4393	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
4394	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
4395	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
4396	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
4397	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
4398	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
4399	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
4400	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
4401	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
4402	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
4403	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
4404	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
4405	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
4406	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
4407	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
4408	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
4409	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
4410	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
4411	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
4412	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
4413	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
4414	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
4415	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
4416	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
4417	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
4418	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
4419	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
4420	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
4421};
4422
4423#undef PRE_EX
4424#undef POST_EX
4425#undef POST_MEM
4426
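/*
 * Map an emulated instruction onto the corresponding SVM exit code and let
 * the nested exit handler decide whether L1 has intercepted it.
 */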
4427static int svm_check_intercept(struct kvm_vcpu *vcpu,
4428			       struct x86_instruction_info *info,
4429			       enum x86_intercept_stage stage,
4430			       struct x86_exception *exception)
4431{
4432	struct vcpu_svm *svm = to_svm(vcpu);
4433	int vmexit, ret = X86EMUL_CONTINUE;
4434	struct __x86_intercept icpt_info;
4435	struct vmcb *vmcb = svm->vmcb;
4436
4437	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4438		goto out;
4439
4440	icpt_info = x86_intercept_map[info->intercept];
4441
4442	if (stage != icpt_info.stage)
4443		goto out;
4444
4445	switch (icpt_info.exit_code) {
4446	case SVM_EXIT_READ_CR0:
4447		if (info->intercept == x86_intercept_cr_read)
4448			icpt_info.exit_code += info->modrm_reg;
4449		break;
4450	case SVM_EXIT_WRITE_CR0: {
4451		unsigned long cr0, val;
4452
4453		if (info->intercept == x86_intercept_cr_write)
4454			icpt_info.exit_code += info->modrm_reg;
4455
4456		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4457		    info->intercept == x86_intercept_clts)
4458			break;
4459
4460		if (!(vmcb12_is_intercept(&svm->nested.ctl,
4461					INTERCEPT_SELECTIVE_CR0)))
4462			break;
4463
4464		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4465		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4466
4467		if (info->intercept == x86_intercept_lmsw) {
4468			cr0 &= 0xfUL;
4469			val &= 0xfUL;
4470			/* lmsw can't clear PE - catch this here */
4471			if (cr0 & X86_CR0_PE)
4472				val |= X86_CR0_PE;
4473		}
4474
4475		if (cr0 ^ val)
4476			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4477
4478		break;
4479	}
4480	case SVM_EXIT_READ_DR0:
4481	case SVM_EXIT_WRITE_DR0:
4482		icpt_info.exit_code += info->modrm_reg;
4483		break;
4484	case SVM_EXIT_MSR:
4485		if (info->intercept == x86_intercept_wrmsr)
4486			vmcb->control.exit_info_1 = 1;
4487		else
4488			vmcb->control.exit_info_1 = 0;
4489		break;
4490	case SVM_EXIT_PAUSE:
4491		/*
4492		 * We get this for NOP only, but PAUSE is really REP NOP, so
4493		 * check for the REP prefix here.
4494		 */
4495		if (info->rep_prefix != REPE_PREFIX)
4496			goto out;
4497		break;
4498	case SVM_EXIT_IOIO: {
4499		u64 exit_info;
4500		u32 bytes;
4501
4502		if (info->intercept == x86_intercept_in ||
4503		    info->intercept == x86_intercept_ins) {
4504			exit_info = ((info->src_val & 0xffff) << 16) |
4505				SVM_IOIO_TYPE_MASK;
4506			bytes = info->dst_bytes;
4507		} else {
4508			exit_info = (info->dst_val & 0xffff) << 16;
4509			bytes = info->src_bytes;
4510		}
4511
4512		if (info->intercept == x86_intercept_outs ||
4513		    info->intercept == x86_intercept_ins)
4514			exit_info |= SVM_IOIO_STR_MASK;
4515
4516		if (info->rep_prefix)
4517			exit_info |= SVM_IOIO_REP_MASK;
4518
4519		bytes = min(bytes, 4u);
4520
4521		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4522
4523		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4524
4525		vmcb->control.exit_info_1 = exit_info;
4526		vmcb->control.exit_info_2 = info->next_rip;
4527
4528		break;
4529	}
4530	default:
4531		break;
4532	}
4533
4534	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4535	if (static_cpu_has(X86_FEATURE_NRIPS))
4536		vmcb->control.next_rip  = info->next_rip;
4537	vmcb->control.exit_code = icpt_info.exit_code;
4538	vmexit = nested_svm_exit_handled(svm);
4539
4540	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4541					   : X86EMUL_CONTINUE;
4542
4543out:
4544	return ret;
4545}
4546
4547static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4548{
4549	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4550		vcpu->arch.at_instruction_boundary = true;
4551}
4552
4553static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4554{
4555	if (!kvm_pause_in_guest(vcpu->kvm))
4556		shrink_ple_window(vcpu);
4557}
4558
4559static void svm_setup_mce(struct kvm_vcpu *vcpu)
4560{
4561	/* [63:9] are reserved. */
4562	vcpu->arch.mcg_cap &= 0x1ff;
4563}
4564
4565#ifdef CONFIG_KVM_SMM
4566bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4567{
4568	struct vcpu_svm *svm = to_svm(vcpu);
4569
4570	/* Per APM Vol.2 15.22.2 "Response to SMI" */
4571	if (!gif_set(svm))
4572		return true;
4573
4574	return is_smm(vcpu);
4575}
4576
4577static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4578{
4579	struct vcpu_svm *svm = to_svm(vcpu);
4580	if (svm->nested.nested_run_pending)
4581		return -EBUSY;
4582
4583	if (svm_smi_blocked(vcpu))
4584		return 0;
4585
4586	/* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4587	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4588		return -EBUSY;
4589
4590	return 1;
4591}
4592
4593static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
4594{
4595	struct vcpu_svm *svm = to_svm(vcpu);
4596	struct kvm_host_map map_save;
4597	int ret;
4598
4599	if (!is_guest_mode(vcpu))
4600		return 0;
4601
4602	/*
4603	 * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
4604	 * responsible for ensuring nested SVM and SMIs are mutually exclusive.
4605	 */
4606
4607	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4608		return 1;
4609
4610	smram->smram64.svm_guest_flag = 1;
4611	smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
4612
4613	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4614	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4615	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4616
4617	ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4618	if (ret)
4619		return ret;
4620
4621	/*
4622	 * KVM uses VMCB01 to store L1 host state while L2 runs, but
4623	 * VMCB01 is going to be used during SMM and thus the state will
4624	 * be lost.  Temporarily save the non-VMLOAD/VMSAVE state to the host
4625	 * save area pointed to by MSR_VM_HSAVE_PA.  The APM guarantees that
4626	 * the format of the area is identical to the guest save area, offset
4627	 * by 0x400 (matches the offset of 'struct vmcb_save_area'
4628	 * within 'struct vmcb'). Note: HSAVE area may also be used by
4629	 * L1 hypervisor to save additional host context (e.g. KVM does
4630	 * that, see svm_prepare_switch_to_guest()) which must be
4631	 * preserved.
4632	 */
4633	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4634		return 1;
4635
4636	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4637
4638	svm_copy_vmrun_state(map_save.hva + 0x400,
4639			     &svm->vmcb01.ptr->save);
4640
4641	kvm_vcpu_unmap(vcpu, &map_save, true);
4642	return 0;
4643}
4644
4645static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
4646{
4647	struct vcpu_svm *svm = to_svm(vcpu);
4648	struct kvm_host_map map, map_save;
4649	struct vmcb *vmcb12;
4650	int ret;
4651
4652	const struct kvm_smram_state_64 *smram64 = &smram->smram64;
4653
4654	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4655		return 0;
4656
4657	/* Non-zero if SMI arrived while vCPU was in guest mode. */
4658	if (!smram64->svm_guest_flag)
4659		return 0;
4660
4661	if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4662		return 1;
4663
4664	if (!(smram64->efer & EFER_SVME))
4665		return 1;
4666
4667	if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4668		return 1;
4669
4670	ret = 1;
4671	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4672		goto unmap_map;
4673
4674	if (svm_allocate_nested(svm))
4675		goto unmap_save;
4676
4677	/*
4678	 * Restore L1 host state from L1 HSAVE area as VMCB01 was
4679	 * used during SMM (see svm_enter_smm())
4680	 */
4681
4682	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4683
4684	/*
4685	 * Enter the nested guest now
4686	 */
4687
4688	vmcb_mark_all_dirty(svm->vmcb01.ptr);
4689
4690	vmcb12 = map.hva;
4691	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4692	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4693	ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
4694
4695	if (ret)
4696		goto unmap_save;
4697
4698	svm->nested.nested_run_pending = 1;
4699
4700unmap_save:
4701	kvm_vcpu_unmap(vcpu, &map_save, true);
4702unmap_map:
4703	kvm_vcpu_unmap(vcpu, &map, true);
4704	return ret;
4705}
4706
4707static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4708{
4709	struct vcpu_svm *svm = to_svm(vcpu);
4710
4711	if (!gif_set(svm)) {
4712		if (vgif)
4713			svm_set_intercept(svm, INTERCEPT_STGI);
4714		/* STGI will cause a vm exit */
4715	} else {
4716		/* We must be in SMM; RSM will cause a vmexit anyway.  */
4717	}
4718}
4719#endif
4720
4721static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4722					 void *insn, int insn_len)
4723{
4724	bool smep, smap, is_user;
4725	u64 error_code;
4726
4727	/* Emulation is always possible when KVM has access to all guest state. */
4728	if (!sev_guest(vcpu->kvm))
4729		return X86EMUL_CONTINUE;
4730
4731	/* #UD and #GP should never be intercepted for SEV guests. */
4732	WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4733				  EMULTYPE_TRAP_UD_FORCED |
4734				  EMULTYPE_VMWARE_GP));
4735
4736	/*
4737	 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4738	 * to guest register state.
4739	 */
4740	if (sev_es_guest(vcpu->kvm))
4741		return X86EMUL_RETRY_INSTR;
4742
4743	/*
4744	 * Emulation is possible if the instruction is already decoded, e.g.
4745	 * when completing I/O after returning from userspace.
4746	 */
4747	if (emul_type & EMULTYPE_NO_DECODE)
4748		return X86EMUL_CONTINUE;
4749
4750	/*
4751	 * Emulation is possible for SEV guests if and only if a prefilled
4752	 * buffer containing the bytes of the intercepted instruction is
4753	 * available. SEV guest memory is encrypted with a guest specific key
4754	 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4755	 * decode garbage.
4756	 *
4757	 * If KVM is NOT trying to simply skip an instruction, inject #UD if
4758	 * KVM reached this point without an instruction buffer.  In practice,
4759	 * this path should never be hit by a well-behaved guest, e.g. KVM
4760	 * doesn't intercept #UD or #GP for SEV guests, but this path is still
4761	 * theoretically reachable, e.g. via unaccelerated fault-like AVIC
4762	 * access, and needs to be handled by KVM to avoid putting the guest
4763	 * into an infinite loop.  Injecting #UD is somewhat arbitrary, but
4764	 * it's the least awful option given the lack of insight into the guest.
4765	 *
4766	 * If KVM is trying to skip an instruction, simply resume the guest.
4767	 * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
4768	 * will attempt to re-inject the INT3/INTO and skip the instruction.
4769	 * In that scenario, retrying the INT3/INTO and hoping the guest will
4770	 * make forward progress is the only option that has a chance of
4771	 * success (and in practice it will work the vast majority of the time).
4772	 */
4773	if (unlikely(!insn)) {
4774		if (emul_type & EMULTYPE_SKIP)
4775			return X86EMUL_UNHANDLEABLE;
4776
4777		kvm_queue_exception(vcpu, UD_VECTOR);
4778		return X86EMUL_PROPAGATE_FAULT;
4779	}
4780
4781	/*
4782	 * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4783	 * will be empty if the DecodeAssist microcode cannot fetch bytes for
4784	 * the faulting instruction because the code fetch itself faulted, e.g.
4785	 * the guest attempted to fetch from emulated MMIO or a guest page
4786	 * table used to translate CS:RIP resides in emulated MMIO.
4787	 */
4788	if (likely(insn_len))
4789		return X86EMUL_CONTINUE;
4790
4791	/*
4792	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4793	 *
4794	 * Errata:
4795	 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4796	 * possible that CPU microcode implementing DecodeAssist will fail to
4797	 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4798	 * be '0'.  This happens because microcode reads CS:RIP using a _data_
4799	 * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
4800	 * gives up and does not fill the instruction bytes buffer.
4801	 *
4802	 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4803	 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4804	 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4805	 * GuestIntrBytes field of the VMCB.
4806	 *
4807	 * This does _not_ mean that the erratum has been encountered, as the
4808	 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4809	 * #PF, e.g. if the guest attempts to execute from emulated MMIO and
4810	 * encountered a reserved/not-present #PF.
4811	 *
4812	 * To hit the erratum, the following conditions must be true:
4813	 *    1. CR4.SMAP=1 (obviously).
4814	 *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4815	 *       have been hit as the guest would have encountered a SMEP
4816	 *       violation #PF, not a #NPF.
4817	 *    3. The #NPF is not due to a code fetch, in which case failure to
4818	 *       retrieve the instruction bytes is legitimate (see above).
4819	 *
4820	 * In addition, don't apply the erratum workaround if the #NPF occurred
4821	 * while translating guest page tables (see below).
4822	 */
4823	error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4824	if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4825		goto resume_guest;
4826
4827	smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
4828	smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
4829	is_user = svm_get_cpl(vcpu) == 3;
4830	if (smap && (!smep || is_user)) {
4831		pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
4832
4833		/*
4834		 * If the fault occurred in userspace, arbitrarily inject #GP
4835		 * to avoid killing the guest and to hopefully avoid confusing
4836		 * the guest kernel too much, e.g. injecting #PF would not be
4837		 * coherent with respect to the guest's page tables.  Request
4838		 * triple fault if the fault occurred in the kernel as there's
4839		 * no fault that KVM can inject without confusing the guest.
4840		 * In practice, the triple fault is moot as no sane SEV kernel
4841		 * will execute from user memory while also running with SMAP=1.
4842		 */
4843		if (is_user)
4844			kvm_inject_gp(vcpu, 0);
4845		else
4846			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4847		return X86EMUL_PROPAGATE_FAULT;
4848	}
4849
4850resume_guest:
4851	/*
4852	 * If the erratum was not hit, simply resume the guest and let it fault
4853	 * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4854	 * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4855	 * userspace will kill the guest, and letting the emulator read garbage
4856	 * will yield random behavior and potentially corrupt the guest.
4857	 *
4858	 * Simply resuming the guest is technically not a violation of the SEV
4859	 * architecture.  AMD's APM states that all code fetches and page table
4860	 * accesses for SEV guests are encrypted, regardless of the C-Bit.  The
4861	 * APM also states that encrypted accesses to MMIO are "ignored", but
4862	 * doesn't explicitly define "ignored", i.e. doing nothing and letting
4863	 * the guest spin is technically "ignoring" the access.
4864	 */
4865	return X86EMUL_RETRY_INSTR;
4866}
4867
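/* INIT is held pending while GIF is clear, so report INIT as blocked then. */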
4868static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4869{
4870	struct vcpu_svm *svm = to_svm(vcpu);
4871
4872	return !gif_set(svm);
4873}
4874
4875static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4876{
4877	if (!sev_es_guest(vcpu->kvm))
4878		return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4879
4880	sev_vcpu_deliver_sipi_vector(vcpu, vector);
4881}
4882
4883static void svm_vm_destroy(struct kvm *kvm)
4884{
4885	avic_vm_destroy(kvm);
4886	sev_vm_destroy(kvm);
4887}
4888
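/*
 * Per-VM setup: disable PAUSE exiting if either filter parameter is zero,
 * and initialize AVIC state when APICv is enabled.
 */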
4889static int svm_vm_init(struct kvm *kvm)
4890{
4891	if (!pause_filter_count || !pause_filter_thresh)
4892		kvm->arch.pause_in_guest = true;
4893
4894	if (enable_apicv) {
4895		int ret = avic_vm_init(kvm);
4896		if (ret)
4897			return ret;
4898	}
4899
4900	return 0;
4901}
4902
4903static struct kvm_x86_ops svm_x86_ops __initdata = {
4904	.name = KBUILD_MODNAME,
4905
4906	.check_processor_compatibility = svm_check_processor_compat,
4907
4908	.hardware_unsetup = svm_hardware_unsetup,
4909	.hardware_enable = svm_hardware_enable,
4910	.hardware_disable = svm_hardware_disable,
4911	.has_emulated_msr = svm_has_emulated_msr,
4912
4913	.vcpu_create = svm_vcpu_create,
4914	.vcpu_free = svm_vcpu_free,
4915	.vcpu_reset = svm_vcpu_reset,
4916
4917	.vm_size = sizeof(struct kvm_svm),
4918	.vm_init = svm_vm_init,
4919	.vm_destroy = svm_vm_destroy,
4920
4921	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
4922	.vcpu_load = svm_vcpu_load,
4923	.vcpu_put = svm_vcpu_put,
4924	.vcpu_blocking = avic_vcpu_blocking,
4925	.vcpu_unblocking = avic_vcpu_unblocking,
4926
4927	.update_exception_bitmap = svm_update_exception_bitmap,
4928	.get_msr_feature = svm_get_msr_feature,
4929	.get_msr = svm_get_msr,
4930	.set_msr = svm_set_msr,
4931	.get_segment_base = svm_get_segment_base,
4932	.get_segment = svm_get_segment,
4933	.set_segment = svm_set_segment,
4934	.get_cpl = svm_get_cpl,
4935	.get_cs_db_l_bits = svm_get_cs_db_l_bits,
4936	.is_valid_cr0 = svm_is_valid_cr0,
4937	.set_cr0 = svm_set_cr0,
4938	.post_set_cr3 = sev_post_set_cr3,
4939	.is_valid_cr4 = svm_is_valid_cr4,
4940	.set_cr4 = svm_set_cr4,
4941	.set_efer = svm_set_efer,
4942	.get_idt = svm_get_idt,
4943	.set_idt = svm_set_idt,
4944	.get_gdt = svm_get_gdt,
4945	.set_gdt = svm_set_gdt,
4946	.set_dr7 = svm_set_dr7,
4947	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4948	.cache_reg = svm_cache_reg,
4949	.get_rflags = svm_get_rflags,
4950	.set_rflags = svm_set_rflags,
4951	.get_if_flag = svm_get_if_flag,
4952
4953	.flush_tlb_all = svm_flush_tlb_all,
4954	.flush_tlb_current = svm_flush_tlb_current,
4955	.flush_tlb_gva = svm_flush_tlb_gva,
4956	.flush_tlb_guest = svm_flush_tlb_asid,
4957
4958	.vcpu_pre_run = svm_vcpu_pre_run,
4959	.vcpu_run = svm_vcpu_run,
4960	.handle_exit = svm_handle_exit,
4961	.skip_emulated_instruction = svm_skip_emulated_instruction,
4962	.update_emulated_instruction = NULL,
4963	.set_interrupt_shadow = svm_set_interrupt_shadow,
4964	.get_interrupt_shadow = svm_get_interrupt_shadow,
4965	.patch_hypercall = svm_patch_hypercall,
4966	.inject_irq = svm_inject_irq,
4967	.inject_nmi = svm_inject_nmi,
4968	.is_vnmi_pending = svm_is_vnmi_pending,
4969	.set_vnmi_pending = svm_set_vnmi_pending,
4970	.inject_exception = svm_inject_exception,
4971	.cancel_injection = svm_cancel_injection,
4972	.interrupt_allowed = svm_interrupt_allowed,
4973	.nmi_allowed = svm_nmi_allowed,
4974	.get_nmi_mask = svm_get_nmi_mask,
4975	.set_nmi_mask = svm_set_nmi_mask,
4976	.enable_nmi_window = svm_enable_nmi_window,
4977	.enable_irq_window = svm_enable_irq_window,
4978	.update_cr8_intercept = svm_update_cr8_intercept,
4979	.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
4980	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4981	.apicv_post_state_restore = avic_apicv_post_state_restore,
4982	.required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
4983
4984	.get_exit_info = svm_get_exit_info,
4985
4986	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4987
4988	.has_wbinvd_exit = svm_has_wbinvd_exit,
4989
4990	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
4991	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4992	.write_tsc_offset = svm_write_tsc_offset,
4993	.write_tsc_multiplier = svm_write_tsc_multiplier,
4994
4995	.load_mmu_pgd = svm_load_mmu_pgd,
4996
4997	.check_intercept = svm_check_intercept,
4998	.handle_exit_irqoff = svm_handle_exit_irqoff,
4999
5000	.request_immediate_exit = __kvm_request_immediate_exit,
5001
5002	.sched_in = svm_sched_in,
5003
5004	.nested_ops = &svm_nested_ops,
5005
5006	.deliver_interrupt = svm_deliver_interrupt,
5007	.pi_update_irte = avic_pi_update_irte,
5008	.setup_mce = svm_setup_mce,
5009
5010#ifdef CONFIG_KVM_SMM
5011	.smi_allowed = svm_smi_allowed,
5012	.enter_smm = svm_enter_smm,
5013	.leave_smm = svm_leave_smm,
5014	.enable_smi_window = svm_enable_smi_window,
5015#endif
5016
5017	.mem_enc_ioctl = sev_mem_enc_ioctl,
5018	.mem_enc_register_region = sev_mem_enc_register_region,
5019	.mem_enc_unregister_region = sev_mem_enc_unregister_region,
5020	.guest_memory_reclaimed = sev_guest_memory_reclaimed,
5021
5022	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
5023	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
5024
5025	.check_emulate_instruction = svm_check_emulate_instruction,
5026
5027	.apic_init_signal_blocked = svm_apic_init_signal_blocked,
5028
5029	.msr_filter_changed = svm_msr_filter_changed,
5030	.complete_emulated_msr = svm_complete_emulated_msr,
5031
5032	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
5033	.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
5034};
5035
5036/*
5037 * The default MMIO mask is a single bit (excluding the present bit),
5038 * which could conflict with the memory encryption bit. Check for
5039 * memory encryption support and override the default MMIO mask if
5040 * memory encryption is enabled.
5041 */
5042static __init void svm_adjust_mmio_mask(void)
5043{
5044	unsigned int enc_bit, mask_bit;
5045	u64 msr, mask;
5046
5047	/* If there is no memory encryption support, use existing mask */
5048	if (cpuid_eax(0x80000000) < 0x8000001f)
5049		return;
5050
5051	/* If memory encryption is not enabled, use existing mask */
5052	rdmsrl(MSR_AMD64_SYSCFG, msr);
5053	if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
5054		return;
5055
5056	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
5057	mask_bit = boot_cpu_data.x86_phys_bits;
5058
5059	/* Increment the mask bit if it is the same as the encryption bit */
5060	if (enc_bit == mask_bit)
5061		mask_bit++;
5062
5063	/*
5064	 * If the mask bit location is below 52, then some bits above the
5065	 * physical addressing limit will always be reserved, so use the
5066	 * rsvd_bits() function to generate the mask. This mask, along with
5067	 * the present bit, will be used to generate a page fault with
5068	 * PFER.RSV = 1.
5069	 * PFERR.RSV = 1.
5070	 * If the mask bit location is 52 (or above), then clear the mask.
5071	 */
5072	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
5073
5074	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
5075}
5076
5077static __init void svm_set_cpu_caps(void)
5078{
5079	kvm_set_cpu_caps();
5080
5081	kvm_caps.supported_perf_cap = 0;
5082	kvm_caps.supported_xss = 0;
5083
5084	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
5085	if (nested) {
5086		kvm_cpu_cap_set(X86_FEATURE_SVM);
5087		kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
5088
5089		/*
5090		 * KVM currently flushes TLBs on *every* nested SVM transition,
5091		 * and so for all intents and purposes KVM supports flushing by
5092		 * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
5093		 */
5094		kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);
5095
5096		if (nrips)
5097			kvm_cpu_cap_set(X86_FEATURE_NRIPS);
5098
5099		if (npt_enabled)
5100			kvm_cpu_cap_set(X86_FEATURE_NPT);
5101
5102		if (tsc_scaling)
5103			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
5104
5105		if (vls)
5106			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
5107		if (lbrv)
5108			kvm_cpu_cap_set(X86_FEATURE_LBRV);
5109
5110		if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
5111			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
5112
5113		if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
5114			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
5115
5116		if (vgif)
5117			kvm_cpu_cap_set(X86_FEATURE_VGIF);
5118
5119		if (vnmi)
5120			kvm_cpu_cap_set(X86_FEATURE_VNMI);
5121
5122		/* Nested VM can receive #VMEXIT instead of triggering #GP */
5123		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
5124	}
5125
5126	/* CPUID 0x80000008 */
5127	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
5128	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
5129		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
5130
5131	if (enable_pmu) {
5132		/*
5133		 * Enumerate support for PERFCTR_CORE if and only if KVM has
5134		 * access to enough counters to virtualize "core" support,
5135		 * otherwise limit vPMU support to the legacy number of counters.
5136		 */
5137		if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
5138			kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
5139							  kvm_pmu_cap.num_counters_gp);
5140		else
5141			kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);
5142
5143		if (kvm_pmu_cap.version != 2 ||
5144		    !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
5145			kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
5146	}
5147
5148	/* CPUID 0x8000001F (SME/SEV features) */
5149	sev_set_cpu_caps();
5150}
5151
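/*
 * One-time module setup: allocate the I/O permission bitmap, probe optional
 * SVM features (NPT, NRIPS, pause filtering, vGIF, vNMI, LBRV, ...), and
 * clear the corresponding module params and svm_x86_ops hooks for anything
 * the hardware doesn't support.
 */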
5152static __init int svm_hardware_setup(void)
5153{
5154	int cpu;
5155	struct page *iopm_pages;
5156	void *iopm_va;
5157	int r;
5158	unsigned int order = get_order(IOPM_SIZE);
5159
5160	/*
5161	 * NX is required for shadow paging and for NPT if the NX huge pages
5162	 * mitigation is enabled.
5163	 */
5164	if (!boot_cpu_has(X86_FEATURE_NX)) {
5165		pr_err_ratelimited("NX (Execute Disable) not supported\n");
5166		return -EOPNOTSUPP;
5167	}
5168	kvm_enable_efer_bits(EFER_NX);
5169
5170	iopm_pages = alloc_pages(GFP_KERNEL, order);
5171
5172	if (!iopm_pages)
5173		return -ENOMEM;
5174
5175	iopm_va = page_address(iopm_pages);
5176	memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
5177	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
5178
5179	init_msrpm_offsets();
5180
5181	kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
5182				     XFEATURE_MASK_BNDCSR);
5183
5184	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
5185		kvm_enable_efer_bits(EFER_FFXSR);
5186
5187	if (tsc_scaling) {
5188		if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
5189			tsc_scaling = false;
5190		} else {
5191			pr_info("TSC scaling supported\n");
5192			kvm_caps.has_tsc_control = true;
5193		}
5194	}
5195	kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
5196	kvm_caps.tsc_scaling_ratio_frac_bits = 32;
5197
5198	tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
5199
5200	if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
5201		kvm_enable_efer_bits(EFER_AUTOIBRS);
5202
5203	/* Check for pause filtering support */
5204	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
5205		pause_filter_count = 0;
5206		pause_filter_thresh = 0;
5207	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
5208		pause_filter_thresh = 0;
5209	}
5210
5211	if (nested) {
5212		pr_info("Nested Virtualization enabled\n");
5213		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
5214	}
5215
5216	/*
5217	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
5218	 * NPT isn't supported if the host is using 2-level paging since host
5219	 * CR4 is unchanged on VMRUN.
5220	 */
5221	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
5222		npt_enabled = false;
5223
5224	if (!boot_cpu_has(X86_FEATURE_NPT))
5225		npt_enabled = false;
5226
5227	/* Force VM NPT level equal to the host's paging level */
5228	kvm_configure_mmu(npt_enabled, get_npt_level(),
5229			  get_npt_level(), PG_LEVEL_1G);
5230	pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5231
5232	/* Setup shadow_me_value and shadow_me_mask */
5233	kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
5234
5235	svm_adjust_mmio_mask();
5236
5237	nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
5238
5239	/*
5240	 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
5241	 * may be modified by svm_adjust_mmio_mask()), as well as nrips.
5242	 */
5243	sev_hardware_setup();
5244
5245	svm_hv_hardware_setup();
5246
5247	for_each_possible_cpu(cpu) {
5248		r = svm_cpu_init(cpu);
5249		if (r)
5250			goto err;
5251	}
5252
5253	enable_apicv = avic = avic && avic_hardware_setup();
5254
5255	if (!enable_apicv) {
5256		svm_x86_ops.vcpu_blocking = NULL;
5257		svm_x86_ops.vcpu_unblocking = NULL;
5258		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5259	} else if (!x2avic_enabled) {
5260		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
5261	}
5262
5263	if (vls) {
5264		if (!npt_enabled ||
5265		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5266		    !IS_ENABLED(CONFIG_X86_64)) {
5267			vls = false;
5268		} else {
5269			pr_info("Virtual VMLOAD VMSAVE supported\n");
5270		}
5271	}
5272
5273	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5274		svm_gp_erratum_intercept = false;
5275
5276	if (vgif) {
5277		if (!boot_cpu_has(X86_FEATURE_VGIF))
5278			vgif = false;
5279		else
5280			pr_info("Virtual GIF supported\n");
5281	}
5282
5283	vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
5284	if (vnmi)
5285		pr_info("Virtual NMI enabled\n");
5286
5287	if (!vnmi) {
5288		svm_x86_ops.is_vnmi_pending = NULL;
5289		svm_x86_ops.set_vnmi_pending = NULL;
5290	}
5291
5293	if (lbrv) {
5294		if (!boot_cpu_has(X86_FEATURE_LBRV))
5295			lbrv = false;
5296		else
5297			pr_info("LBR virtualization supported\n");
5298	}
5299
5300	if (!enable_pmu)
5301		pr_info("PMU virtualization is disabled\n");
5302
5303	svm_set_cpu_caps();
5304
5305	/*
5306	 * It seems that on AMD processors PTE's accessed bit is
5307	 * being set by the CPU hardware before the NPF vmexit.
5308	 * This is not expected behaviour and our tests fail because
5309	 * of it.
5310	 * A workaround here is to disable support for
5311	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
5312	 * In this case userspace can know if there is support using
5313	 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
5314	 * it
5315	 * If future AMD CPU models change the behaviour described above,
5316	 * this variable can be changed accordingly
5317	 */
5318	allow_smaller_maxphyaddr = !npt_enabled;
5319
5320	return 0;
5321
5322err:
5323	svm_hardware_unsetup();
5324	return r;
5325}
5326
5328static struct kvm_x86_init_ops svm_init_ops __initdata = {
5329	.hardware_setup = svm_hardware_setup,
5330
5331	.runtime_ops = &svm_x86_ops,
5332	.pmu_ops = &amd_pmu_ops,
5333};
5334
5335static void __svm_exit(void)
5336{
5337	kvm_x86_vendor_exit();
5338
5339	cpu_emergency_unregister_virt_callback(svm_emergency_disable);
5340}
5341
5342static int __init svm_init(void)
5343{
5344	int r;
5345
5346	__unused_size_checks();
5347
5348	if (!kvm_is_svm_supported())
5349		return -EOPNOTSUPP;
5350
5351	r = kvm_x86_vendor_init(&svm_init_ops);
5352	if (r)
5353		return r;
5354
5355	cpu_emergency_register_virt_callback(svm_emergency_disable);
5356
5357	/*
5358	 * Common KVM initialization _must_ come last, after this, /dev/kvm is
5359	 * exposed to userspace!
5360	 */
5361	r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
5362		     THIS_MODULE);
5363	if (r)
5364		goto err_kvm_init;
5365
5366	return 0;
5367
5368err_kvm_init:
5369	__svm_exit();
5370	return r;
5371}
5372
5373static void __exit svm_exit(void)
5374{
5375	kvm_exit();
5376	__svm_exit();
5377}
5378
5379module_init(svm_init)
5380module_exit(svm_exit)