   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * AMD SVM support
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   8 *
   9 * Authors:
  10 *   Yaniv Kamay  <yaniv@qumranet.com>
  11 *   Avi Kivity   <avi@qumranet.com>
  12 *
  13 * This work is licensed under the terms of the GNU GPL, version 2.  See
  14 * the COPYING file in the top-level directory.
  15 *
  16 */
  17#include <linux/kvm_host.h>
  18
  19#include "irq.h"
  20#include "mmu.h"
  21#include "kvm_cache_regs.h"
  22#include "x86.h"
  23#include "cpuid.h"
  24#include "pmu.h"
  25
  26#include <linux/module.h>
  27#include <linux/mod_devicetable.h>
  28#include <linux/kernel.h>
  29#include <linux/vmalloc.h>
  30#include <linux/highmem.h>
  31#include <linux/sched.h>
  32#include <linux/trace_events.h>
  33#include <linux/slab.h>
  34
  35#include <asm/perf_event.h>
  36#include <asm/tlbflush.h>
  37#include <asm/desc.h>
  38#include <asm/debugreg.h>
  39#include <asm/kvm_para.h>
  40
  41#include <asm/virtext.h>
  42#include "trace.h"
  43
  44#define __ex(x) __kvm_handle_fault_on_reboot(x)
  45
  46MODULE_AUTHOR("Qumranet");
  47MODULE_LICENSE("GPL");
  48
  49static const struct x86_cpu_id svm_cpu_id[] = {
  50	X86_FEATURE_MATCH(X86_FEATURE_SVM),
  51	{}
  52};
  53MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
  54
  55#define IOPM_ALLOC_ORDER 2
  56#define MSRPM_ALLOC_ORDER 1
  57
  58#define SEG_TYPE_LDT 2
  59#define SEG_TYPE_BUSY_TSS16 3
  60
  61#define SVM_FEATURE_NPT            (1 <<  0)
  62#define SVM_FEATURE_LBRV           (1 <<  1)
  63#define SVM_FEATURE_SVML           (1 <<  2)
  64#define SVM_FEATURE_NRIP           (1 <<  3)
  65#define SVM_FEATURE_TSC_RATE       (1 <<  4)
  66#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
  67#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
  68#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
  69#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
  70
  71#define NESTED_EXIT_HOST	0	/* Exit handled on host level */
  72#define NESTED_EXIT_DONE	1	/* Exit caused nested vmexit  */
  73#define NESTED_EXIT_CONTINUE	2	/* Further checks needed      */
  74
  75#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
  76
  77#define TSC_RATIO_RSVD          0xffffff0000000000ULL
  78#define TSC_RATIO_MIN		0x0000000000000001ULL
  79#define TSC_RATIO_MAX		0x000000ffffffffffULL
  80
  81static bool erratum_383_found __read_mostly;
  82
  83static const u32 host_save_user_msrs[] = {
  84#ifdef CONFIG_X86_64
  85	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
  86	MSR_FS_BASE,
  87#endif
  88	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
  89	MSR_TSC_AUX,
  90};
  91
  92#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
  93
  94struct kvm_vcpu;
  95
  96struct nested_state {
  97	struct vmcb *hsave;
  98	u64 hsave_msr;
  99	u64 vm_cr_msr;
 100	u64 vmcb;
 101
 102	/* These are the merged vectors */
 103	u32 *msrpm;
 104
 105	/* gpa pointers to the real vectors */
 106	u64 vmcb_msrpm;
 107	u64 vmcb_iopm;
 108
 109	/* A VMEXIT is required but not yet emulated */
 110	bool exit_required;
 111
 112	/* cache for intercepts of the guest */
 113	u32 intercept_cr;
 114	u32 intercept_dr;
 115	u32 intercept_exceptions;
 116	u64 intercept;
 117
 118	/* Nested Paging related state */
 119	u64 nested_cr3;
 120};
 121
 122#define MSRPM_OFFSETS	16
 123static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 124
 125/*
 126 * Set osvw_len to a higher value when updated Revision Guides
 127 * are published and we know what the new status bits are.
 128 */
 129static uint64_t osvw_len = 4, osvw_status;
 130
 131struct vcpu_svm {
 132	struct kvm_vcpu vcpu;
 133	struct vmcb *vmcb;
 134	unsigned long vmcb_pa;
 135	struct svm_cpu_data *svm_data;
 136	uint64_t asid_generation;
 137	uint64_t sysenter_esp;
 138	uint64_t sysenter_eip;
 139	uint64_t tsc_aux;
 140
 141	u64 next_rip;
 142
 143	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
 144	struct {
 145		u16 fs;
 146		u16 gs;
 147		u16 ldt;
 148		u64 gs_base;
 149	} host;
 150
 151	u32 *msrpm;
 152
 153	ulong nmi_iret_rip;
 154
 155	struct nested_state nested;
 156
 157	bool nmi_singlestep;
 158
 159	unsigned int3_injected;
 160	unsigned long int3_rip;
 161	u32 apf_reason;
 162
 163	/* cached guest cpuid flags for faster access */
 164	bool nrips_enabled	: 1;
 165};
 166
 167static DEFINE_PER_CPU(u64, current_tsc_ratio);
 168#define TSC_RATIO_DEFAULT	0x0100000000ULL
 169
 170#define MSR_INVALID			0xffffffffU
 171
 172static const struct svm_direct_access_msrs {
 173	u32 index;   /* Index of the MSR */
 174	bool always; /* True if intercept is always on */
 175} direct_access_msrs[] = {
 176	{ .index = MSR_STAR,				.always = true  },
 177	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
 178#ifdef CONFIG_X86_64
 179	{ .index = MSR_GS_BASE,				.always = true  },
 180	{ .index = MSR_FS_BASE,				.always = true  },
 181	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
 182	{ .index = MSR_LSTAR,				.always = true  },
 183	{ .index = MSR_CSTAR,				.always = true  },
 184	{ .index = MSR_SYSCALL_MASK,			.always = true  },
 185#endif
 186	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
 187	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
 188	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
 189	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
 190	{ .index = MSR_INVALID,				.always = false },
 191};
 192
 193/* enable NPT for AMD64 and X86 with PAE */
 194#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 195static bool npt_enabled = true;
 196#else
 197static bool npt_enabled;
 198#endif
 199
 200/* allow nested paging (virtualized MMU) for all guests */
 201static int npt = true;
 202module_param(npt, int, S_IRUGO);
 203
 204/* allow nested virtualization in KVM/SVM */
 205static int nested = true;
 206module_param(nested, int, S_IRUGO);
 207
 208static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 209static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 210static void svm_complete_interrupts(struct vcpu_svm *svm);
 211
 212static int nested_svm_exit_handled(struct vcpu_svm *svm);
 213static int nested_svm_intercept(struct vcpu_svm *svm);
 214static int nested_svm_vmexit(struct vcpu_svm *svm);
 215static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 216				      bool has_error_code, u32 error_code);
 217
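     /*
      * VMCB "clean" bits: each bit tells the hardware that the corresponding
      * group of VMCB fields has not been modified since the last VMRUN and
      * may be used from its internal cache.  mark_dirty() clears a bit to
      * force the CPU to re-read that group on the next VMRUN.
      */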
 218enum {
 219	VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
 220			    pause filter count */
 221	VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
 222	VMCB_ASID,	 /* ASID */
 223	VMCB_INTR,	 /* int_ctl, int_vector */
 224	VMCB_NPT,        /* npt_en, nCR3, gPAT */
 225	VMCB_CR,	 /* CR0, CR3, CR4, EFER */
 226	VMCB_DR,         /* DR6, DR7 */
 227	VMCB_DT,         /* GDT, IDT */
 228	VMCB_SEG,        /* CS, DS, SS, ES, CPL */
 229	VMCB_CR2,        /* CR2 only */
 230	VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
 231	VMCB_DIRTY_MAX,
 232};
 233
 234/* TPR and CR2 are always written before VMRUN */
 235#define VMCB_ALWAYS_DIRTY_MASK	((1U << VMCB_INTR) | (1U << VMCB_CR2))
 236
 237static inline void mark_all_dirty(struct vmcb *vmcb)
 238{
 239	vmcb->control.clean = 0;
 240}
 241
 242static inline void mark_all_clean(struct vmcb *vmcb)
 243{
 244	vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
 245			       & ~VMCB_ALWAYS_DIRTY_MASK;
 246}
 247
 248static inline void mark_dirty(struct vmcb *vmcb, int bit)
 249{
 250	vmcb->control.clean &= ~(1 << bit);
 251}
 252
 253static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 254{
 255	return container_of(vcpu, struct vcpu_svm, vcpu);
 256}
 257
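     /*
      * While a nested guest runs, the active VMCB must intercept everything
      * that either the host (state saved in nested.hsave) or the L1
      * hypervisor (cached in svm->nested) wants intercepted, so the
      * effective masks are the OR of both sets.
      */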
 258static void recalc_intercepts(struct vcpu_svm *svm)
 259{
 260	struct vmcb_control_area *c, *h;
 261	struct nested_state *g;
 262
 263	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 264
 265	if (!is_guest_mode(&svm->vcpu))
 266		return;
 267
 268	c = &svm->vmcb->control;
 269	h = &svm->nested.hsave->control;
 270	g = &svm->nested;
 271
 272	c->intercept_cr = h->intercept_cr | g->intercept_cr;
 273	c->intercept_dr = h->intercept_dr | g->intercept_dr;
 274	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 275	c->intercept = h->intercept | g->intercept;
 276}
 277
 278static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
 279{
 280	if (is_guest_mode(&svm->vcpu))
 281		return svm->nested.hsave;
 282	else
 283		return svm->vmcb;
 284}
 285
 286static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
 287{
 288	struct vmcb *vmcb = get_host_vmcb(svm);
 289
 290	vmcb->control.intercept_cr |= (1U << bit);
 291
 292	recalc_intercepts(svm);
 293}
 294
 295static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
 296{
 297	struct vmcb *vmcb = get_host_vmcb(svm);
 298
 299	vmcb->control.intercept_cr &= ~(1U << bit);
 300
 301	recalc_intercepts(svm);
 302}
 303
 304static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
 305{
 306	struct vmcb *vmcb = get_host_vmcb(svm);
 307
 308	return vmcb->control.intercept_cr & (1U << bit);
 309}
 310
 311static inline void set_dr_intercepts(struct vcpu_svm *svm)
 312{
 313	struct vmcb *vmcb = get_host_vmcb(svm);
 314
 315	vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
 316		| (1 << INTERCEPT_DR1_READ)
 317		| (1 << INTERCEPT_DR2_READ)
 318		| (1 << INTERCEPT_DR3_READ)
 319		| (1 << INTERCEPT_DR4_READ)
 320		| (1 << INTERCEPT_DR5_READ)
 321		| (1 << INTERCEPT_DR6_READ)
 322		| (1 << INTERCEPT_DR7_READ)
 323		| (1 << INTERCEPT_DR0_WRITE)
 324		| (1 << INTERCEPT_DR1_WRITE)
 325		| (1 << INTERCEPT_DR2_WRITE)
 326		| (1 << INTERCEPT_DR3_WRITE)
 327		| (1 << INTERCEPT_DR4_WRITE)
 328		| (1 << INTERCEPT_DR5_WRITE)
 329		| (1 << INTERCEPT_DR6_WRITE)
 330		| (1 << INTERCEPT_DR7_WRITE);
 331
 332	recalc_intercepts(svm);
 333}
 334
 335static inline void clr_dr_intercepts(struct vcpu_svm *svm)
 336{
 337	struct vmcb *vmcb = get_host_vmcb(svm);
 338
 339	vmcb->control.intercept_dr = 0;
 340
 341	recalc_intercepts(svm);
 342}
 343
 344static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
 345{
 346	struct vmcb *vmcb = get_host_vmcb(svm);
 347
 348	vmcb->control.intercept_exceptions |= (1U << bit);
 349
 350	recalc_intercepts(svm);
 351}
 352
 353static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
 354{
 355	struct vmcb *vmcb = get_host_vmcb(svm);
 356
 357	vmcb->control.intercept_exceptions &= ~(1U << bit);
 358
 359	recalc_intercepts(svm);
 360}
 361
 362static inline void set_intercept(struct vcpu_svm *svm, int bit)
 363{
 364	struct vmcb *vmcb = get_host_vmcb(svm);
 365
 366	vmcb->control.intercept |= (1ULL << bit);
 367
 368	recalc_intercepts(svm);
 369}
 370
 371static inline void clr_intercept(struct vcpu_svm *svm, int bit)
 372{
 373	struct vmcb *vmcb = get_host_vmcb(svm);
 374
 375	vmcb->control.intercept &= ~(1ULL << bit);
 376
 377	recalc_intercepts(svm);
 378}
 379
 380static inline void enable_gif(struct vcpu_svm *svm)
 381{
 382	svm->vcpu.arch.hflags |= HF_GIF_MASK;
 383}
 384
 385static inline void disable_gif(struct vcpu_svm *svm)
 386{
 387	svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
 388}
 389
 390static inline bool gif_set(struct vcpu_svm *svm)
 391{
 392	return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 393}
 394
 395static unsigned long iopm_base;
 396
 397struct kvm_ldttss_desc {
 398	u16 limit0;
 399	u16 base0;
 400	unsigned base1:8, type:5, dpl:2, p:1;
 401	unsigned limit1:4, zero0:3, g:1, base2:8;
 402	u32 base3;
 403	u32 zero1;
 404} __attribute__((packed));
 405
 406struct svm_cpu_data {
 407	int cpu;
 408
 409	u64 asid_generation;
 410	u32 max_asid;
 411	u32 next_asid;
 412	struct kvm_ldttss_desc *tss_desc;
 413
 414	struct page *save_area;
 415};
 416
 417static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 418
 419struct svm_init_data {
 420	int cpu;
 421	int r;
 422};
 423
 424static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 425
 426#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
 427#define MSRS_RANGE_SIZE 2048
 428#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 429
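     /*
      * The MSR permission map covers three ranges of 8192 MSRs starting at
      * 0, 0xc0000000 and 0xc0010000.  Each MSR takes two bits (read and
      * write intercept), so a 2048-byte range holds 2048 * 8 / 2 = 8192
      * MSRs.  The byte offset is converted to an index into a u32 array
      * before it is returned.
      */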
 430static u32 svm_msrpm_offset(u32 msr)
 431{
 432	u32 offset;
 433	int i;
 434
 435	for (i = 0; i < NUM_MSR_MAPS; i++) {
 436		if (msr < msrpm_ranges[i] ||
 437		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
 438			continue;
 439
 440		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
 441		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
 442
  443		/* Now we have the u8 offset - but we need the u32 offset */
 444		return offset / 4;
 445	}
 446
 447	/* MSR not in any range */
 448	return MSR_INVALID;
 449}
 450
 451#define MAX_INST_SIZE 15
 452
 453static inline void clgi(void)
 454{
 455	asm volatile (__ex(SVM_CLGI));
 456}
 457
 458static inline void stgi(void)
 459{
 460	asm volatile (__ex(SVM_STGI));
 461}
 462
 463static inline void invlpga(unsigned long addr, u32 asid)
 464{
 465	asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 466}
 467
 468static int get_npt_level(void)
 469{
 470#ifdef CONFIG_X86_64
 471	return PT64_ROOT_LEVEL;
 472#else
 473	return PT32E_ROOT_LEVEL;
 474#endif
 475}
 476
 477static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 478{
 479	vcpu->arch.efer = efer;
 480	if (!npt_enabled && !(efer & EFER_LMA))
 481		efer &= ~EFER_LME;
 482
 483	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
 484	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 485}
 486
 487static int is_external_interrupt(u32 info)
 488{
 489	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
 490	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 491}
 492
 493static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 494{
 495	struct vcpu_svm *svm = to_svm(vcpu);
 496	u32 ret = 0;
 497
 498	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 499		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 500	return ret;
 501}
 502
 503static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 504{
 505	struct vcpu_svm *svm = to_svm(vcpu);
 506
 507	if (mask == 0)
 508		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 509	else
 510		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
 511
 512}
 513
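     /*
      * With the NRIPS feature the VMCB already provides the address of the
      * instruction following the intercepted one; without it, RIP has to be
      * advanced by running the instruction emulator with EMULTYPE_SKIP.
      */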
 514static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 515{
 516	struct vcpu_svm *svm = to_svm(vcpu);
 517
 518	if (svm->vmcb->control.next_rip != 0) {
 519		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
 520		svm->next_rip = svm->vmcb->control.next_rip;
 521	}
 522
 523	if (!svm->next_rip) {
 524		if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
 525				EMULATE_DONE)
 526			printk(KERN_DEBUG "%s: NOP\n", __func__);
 527		return;
 528	}
 529	if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
 530		printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
 531		       __func__, kvm_rip_read(vcpu), svm->next_rip);
 532
 533	kvm_rip_write(vcpu, svm->next_rip);
 534	svm_set_interrupt_shadow(vcpu, 0);
 535}
 536
 537static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 538				bool has_error_code, u32 error_code,
 539				bool reinject)
 540{
 541	struct vcpu_svm *svm = to_svm(vcpu);
 542
 543	/*
 544	 * If we are within a nested VM we'd better #VMEXIT and let the guest
 545	 * handle the exception
 546	 */
 547	if (!reinject &&
 548	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
 549		return;
 550
 551	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
 552		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
 553
 554		/*
 555		 * For guest debugging where we have to reinject #BP if some
 556		 * INT3 is guest-owned:
 557		 * Emulate nRIP by moving RIP forward. Will fail if injection
 558		 * raises a fault that is not intercepted. Still better than
 559		 * failing in all cases.
 560		 */
 561		skip_emulated_instruction(&svm->vcpu);
 562		rip = kvm_rip_read(&svm->vcpu);
 563		svm->int3_rip = rip + svm->vmcb->save.cs.base;
 564		svm->int3_injected = rip - old_rip;
 565	}
 566
 567	svm->vmcb->control.event_inj = nr
 568		| SVM_EVTINJ_VALID
 569		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
 570		| SVM_EVTINJ_TYPE_EXEPT;
 571	svm->vmcb->control.event_inj_err = error_code;
 572}
 573
 574static void svm_init_erratum_383(void)
 575{
 576	u32 low, high;
 577	int err;
 578	u64 val;
 579
 580	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
 581		return;
 582
 583	/* Use _safe variants to not break nested virtualization */
 584	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
 585	if (err)
 586		return;
 587
 588	val |= (1ULL << 47);
 589
 590	low  = lower_32_bits(val);
 591	high = upper_32_bits(val);
 592
 593	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
 594
 595	erratum_383_found = true;
 596}
 597
 598static void svm_init_osvw(struct kvm_vcpu *vcpu)
 599{
 600	/*
 601	 * Guests should see errata 400 and 415 as fixed (assuming that
 602	 * HLT and IO instructions are intercepted).
 603	 */
 604	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
 605	vcpu->arch.osvw.status = osvw_status & ~(6ULL);
 606
 607	/*
 608	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
 609	 * all osvw.status bits inside that length, including bit 0 (which is
 610	 * reserved for erratum 298), are valid. However, if host processor's
 611	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
 612	 * be conservative here and therefore we tell the guest that erratum 298
 613	 * is present (because we really don't know).
 614	 */
 615	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
 616		vcpu->arch.osvw.status |= 1;
 617}
 618
 619static int has_svm(void)
 620{
 621	const char *msg;
 622
 623	if (!cpu_has_svm(&msg)) {
 624		printk(KERN_INFO "has_svm: %s\n", msg);
 625		return 0;
 626	}
 627
 628	return 1;
 629}
 630
 631static void svm_hardware_disable(void)
 632{
 633	/* Make sure we clean up behind us */
 634	if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
 635		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 636
 637	cpu_svm_disable();
 638
 639	amd_pmu_disable_virt();
 640}
 641
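     /*
      * Per-CPU SVM enable: set EFER.SVME, point MSR_VM_HSAVE_PA at this
      * CPU's host save area, reset the TSC ratio MSR to its default and
      * initialize the local ASID allocator.  The OSVW erratum bits that
      * will be exposed to guests are collected here as well.
      */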
 642static int svm_hardware_enable(void)
 643{
 644
 645	struct svm_cpu_data *sd;
 646	uint64_t efer;
 647	struct desc_ptr gdt_descr;
 648	struct desc_struct *gdt;
 649	int me = raw_smp_processor_id();
 650
 651	rdmsrl(MSR_EFER, efer);
 652	if (efer & EFER_SVME)
 653		return -EBUSY;
 654
 655	if (!has_svm()) {
 656		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
 657		return -EINVAL;
 658	}
 659	sd = per_cpu(svm_data, me);
 660	if (!sd) {
 661		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
 662		return -EINVAL;
 663	}
 664
 665	sd->asid_generation = 1;
 666	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 667	sd->next_asid = sd->max_asid + 1;
 668
 669	native_store_gdt(&gdt_descr);
 670	gdt = (struct desc_struct *)gdt_descr.address;
 671	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 672
 673	wrmsrl(MSR_EFER, efer | EFER_SVME);
 674
 675	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
 676
 677	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 678		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
 679		__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
 680	}
 681
 682
 683	/*
 684	 * Get OSVW bits.
 685	 *
 686	 * Note that it is possible to have a system with mixed processor
 687	 * revisions and therefore different OSVW bits. If bits are not the same
 688	 * on different processors then choose the worst case (i.e. if erratum
 689	 * is present on one processor and not on another then assume that the
 690	 * erratum is present everywhere).
 691	 */
 692	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
 693		uint64_t len, status = 0;
 694		int err;
 695
 696		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
 697		if (!err)
 698			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
 699						      &err);
 700
 701		if (err)
 702			osvw_status = osvw_len = 0;
 703		else {
 704			if (len < osvw_len)
 705				osvw_len = len;
 706			osvw_status |= status;
 707			osvw_status &= (1ULL << osvw_len) - 1;
 708		}
 709	} else
 710		osvw_status = osvw_len = 0;
 711
 712	svm_init_erratum_383();
 713
 714	amd_pmu_enable_virt();
 715
 716	return 0;
 717}
 718
 719static void svm_cpu_uninit(int cpu)
 720{
 721	struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
 722
 723	if (!sd)
 724		return;
 725
 726	per_cpu(svm_data, raw_smp_processor_id()) = NULL;
 727	__free_page(sd->save_area);
 728	kfree(sd);
 729}
 730
 731static int svm_cpu_init(int cpu)
 732{
 733	struct svm_cpu_data *sd;
 734	int r;
 735
 736	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
 737	if (!sd)
 738		return -ENOMEM;
 739	sd->cpu = cpu;
 740	sd->save_area = alloc_page(GFP_KERNEL);
 741	r = -ENOMEM;
 742	if (!sd->save_area)
 743		goto err_1;
 744
 745	per_cpu(svm_data, cpu) = sd;
 746
 747	return 0;
 748
 749err_1:
 750	kfree(sd);
 751	return r;
 752
 753}
 754
 755static bool valid_msr_intercept(u32 index)
 756{
 757	int i;
 758
 759	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
 760		if (direct_access_msrs[i].index == index)
 761			return true;
 762
 763	return false;
 764}
 765
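     /*
      * Each MSR owns two consecutive bits in the permission map: the even
      * bit intercepts reads, the odd bit intercepts writes.  A set bit
      * means "intercept", so passing read/write == 1 clears the bit and
      * gives the guest direct access.
      */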
 766static void set_msr_interception(u32 *msrpm, unsigned msr,
 767				 int read, int write)
 768{
 769	u8 bit_read, bit_write;
 770	unsigned long tmp;
 771	u32 offset;
 772
 773	/*
 774	 * If this warning triggers, extend the direct_access_msrs list at the
 775	 * beginning of the file.
 776	 */
 777	WARN_ON(!valid_msr_intercept(msr));
 778
 779	offset    = svm_msrpm_offset(msr);
 780	bit_read  = 2 * (msr & 0x0f);
 781	bit_write = 2 * (msr & 0x0f) + 1;
 782	tmp       = msrpm[offset];
 783
 784	BUG_ON(offset == MSR_INVALID);
 785
 786	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
 787	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 788
 789	msrpm[offset] = tmp;
 790}
 791
 792static void svm_vcpu_init_msrpm(u32 *msrpm)
 793{
 794	int i;
 795
 796	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
 797
 798	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 799		if (!direct_access_msrs[i].always)
 800			continue;
 801
 802		set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
 803	}
 804}
 805
 806static void add_msr_offset(u32 offset)
 807{
 808	int i;
 809
 810	for (i = 0; i < MSRPM_OFFSETS; ++i) {
 811
 812		/* Offset already in list? */
 813		if (msrpm_offsets[i] == offset)
 814			return;
 815
 816		/* Slot used by another offset? */
 817		if (msrpm_offsets[i] != MSR_INVALID)
 818			continue;
 819
 820		/* Add offset to list */
 821		msrpm_offsets[i] = offset;
 822
 823		return;
 824	}
 825
 826	/*
 827	 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
 828	 * increase MSRPM_OFFSETS in this case.
 829	 */
 830	BUG();
 831}
 832
 833static void init_msrpm_offsets(void)
 834{
 835	int i;
 836
 837	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
 838
 839	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
 840		u32 offset;
 841
 842		offset = svm_msrpm_offset(direct_access_msrs[i].index);
 843		BUG_ON(offset == MSR_INVALID);
 844
 845		add_msr_offset(offset);
 846	}
 847}
 848
 849static void svm_enable_lbrv(struct vcpu_svm *svm)
 850{
 851	u32 *msrpm = svm->msrpm;
 852
 853	svm->vmcb->control.lbr_ctl = 1;
 854	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
 855	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
 856	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
 857	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 858}
 859
 860static void svm_disable_lbrv(struct vcpu_svm *svm)
 861{
 862	u32 *msrpm = svm->msrpm;
 863
 864	svm->vmcb->control.lbr_ctl = 0;
 865	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
 866	set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
 867	set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
 868	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 869}
 870
 871static __init int svm_hardware_setup(void)
 872{
 873	int cpu;
 874	struct page *iopm_pages;
 875	void *iopm_va;
 876	int r;
 877
 878	iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
 879
 880	if (!iopm_pages)
 881		return -ENOMEM;
 882
 883	iopm_va = page_address(iopm_pages);
 884	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
 885	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 886
 887	init_msrpm_offsets();
 888
 889	if (boot_cpu_has(X86_FEATURE_NX))
 890		kvm_enable_efer_bits(EFER_NX);
 891
 892	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
 893		kvm_enable_efer_bits(EFER_FFXSR);
 894
 895	if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
 896		kvm_has_tsc_control = true;
 897		kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
 898		kvm_tsc_scaling_ratio_frac_bits = 32;
 899	}
 900
 901	if (nested) {
 902		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
 903		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
 904	}
 905
 906	for_each_possible_cpu(cpu) {
 907		r = svm_cpu_init(cpu);
 908		if (r)
 909			goto err;
 910	}
 911
 912	if (!boot_cpu_has(X86_FEATURE_NPT))
 913		npt_enabled = false;
 914
 915	if (npt_enabled && !npt) {
 916		printk(KERN_INFO "kvm: Nested Paging disabled\n");
 917		npt_enabled = false;
 918	}
 919
 920	if (npt_enabled) {
 921		printk(KERN_INFO "kvm: Nested Paging enabled\n");
 922		kvm_enable_tdp();
 923	} else
 924		kvm_disable_tdp();
 925
 926	return 0;
 927
 928err:
 929	__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
 930	iopm_base = 0;
 931	return r;
 932}
 933
 934static __exit void svm_hardware_unsetup(void)
 935{
 936	int cpu;
 937
 938	for_each_possible_cpu(cpu)
 939		svm_cpu_uninit(cpu);
 940
 941	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
 942	iopm_base = 0;
 943}
 944
 945static void init_seg(struct vmcb_seg *seg)
 946{
 947	seg->selector = 0;
 948	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
 949		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
 950	seg->limit = 0xffff;
 951	seg->base = 0;
 952}
 953
 954static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 955{
 956	seg->selector = 0;
 957	seg->attrib = SVM_SELECTOR_P_MASK | type;
 958	seg->limit = 0xffff;
 959	seg->base = 0;
 960}
 961
 962static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
 963{
 964	struct vcpu_svm *svm = to_svm(vcpu);
 965
 966	return svm->vmcb->control.tsc_offset;
 967}
 968
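     /*
      * When a nested guest is running, 'offset' is L1's TSC offset; the
      * extra offset that L1 programmed for L2 (the difference between the
      * active VMCB and the host save area) is preserved on top of it.
      */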
 969static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 970{
 971	struct vcpu_svm *svm = to_svm(vcpu);
 972	u64 g_tsc_offset = 0;
 973
 974	if (is_guest_mode(vcpu)) {
 975		g_tsc_offset = svm->vmcb->control.tsc_offset -
 976			       svm->nested.hsave->control.tsc_offset;
 977		svm->nested.hsave->control.tsc_offset = offset;
 978	} else
 979		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
 980					   svm->vmcb->control.tsc_offset,
 981					   offset);
 982
 983	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
 984
 985	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 986}
 987
 988static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
 989{
 990	struct vcpu_svm *svm = to_svm(vcpu);
 991
 992	svm->vmcb->control.tsc_offset += adjustment;
 993	if (is_guest_mode(vcpu))
 994		svm->nested.hsave->control.tsc_offset += adjustment;
 995	else
 996		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
 997				     svm->vmcb->control.tsc_offset - adjustment,
 998				     svm->vmcb->control.tsc_offset);
 999
1000	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1001}
1002
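     /*
      * Build the initial VMCB: install the default intercepts, the I/O and
      * MSR permission map addresses and the architectural reset state
      * (CS:RIP = 0xf000:0xfff0).
      */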
1003static void init_vmcb(struct vcpu_svm *svm)
1004{
1005	struct vmcb_control_area *control = &svm->vmcb->control;
1006	struct vmcb_save_area *save = &svm->vmcb->save;
1007
1008	svm->vcpu.fpu_active = 1;
1009	svm->vcpu.arch.hflags = 0;
1010
1011	set_cr_intercept(svm, INTERCEPT_CR0_READ);
1012	set_cr_intercept(svm, INTERCEPT_CR3_READ);
1013	set_cr_intercept(svm, INTERCEPT_CR4_READ);
1014	set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1015	set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1016	set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1017	set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1018
1019	set_dr_intercepts(svm);
1020
1021	set_exception_intercept(svm, PF_VECTOR);
1022	set_exception_intercept(svm, UD_VECTOR);
1023	set_exception_intercept(svm, MC_VECTOR);
1024	set_exception_intercept(svm, AC_VECTOR);
1025	set_exception_intercept(svm, DB_VECTOR);
1026
1027	set_intercept(svm, INTERCEPT_INTR);
1028	set_intercept(svm, INTERCEPT_NMI);
1029	set_intercept(svm, INTERCEPT_SMI);
1030	set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1031	set_intercept(svm, INTERCEPT_RDPMC);
1032	set_intercept(svm, INTERCEPT_CPUID);
1033	set_intercept(svm, INTERCEPT_INVD);
1034	set_intercept(svm, INTERCEPT_HLT);
1035	set_intercept(svm, INTERCEPT_INVLPG);
1036	set_intercept(svm, INTERCEPT_INVLPGA);
1037	set_intercept(svm, INTERCEPT_IOIO_PROT);
1038	set_intercept(svm, INTERCEPT_MSR_PROT);
1039	set_intercept(svm, INTERCEPT_TASK_SWITCH);
1040	set_intercept(svm, INTERCEPT_SHUTDOWN);
1041	set_intercept(svm, INTERCEPT_VMRUN);
1042	set_intercept(svm, INTERCEPT_VMMCALL);
1043	set_intercept(svm, INTERCEPT_VMLOAD);
1044	set_intercept(svm, INTERCEPT_VMSAVE);
1045	set_intercept(svm, INTERCEPT_STGI);
1046	set_intercept(svm, INTERCEPT_CLGI);
1047	set_intercept(svm, INTERCEPT_SKINIT);
1048	set_intercept(svm, INTERCEPT_WBINVD);
1049	set_intercept(svm, INTERCEPT_MONITOR);
1050	set_intercept(svm, INTERCEPT_MWAIT);
1051	set_intercept(svm, INTERCEPT_XSETBV);
1052
1053	control->iopm_base_pa = iopm_base;
1054	control->msrpm_base_pa = __pa(svm->msrpm);
1055	control->int_ctl = V_INTR_MASKING_MASK;
1056
1057	init_seg(&save->es);
1058	init_seg(&save->ss);
1059	init_seg(&save->ds);
1060	init_seg(&save->fs);
1061	init_seg(&save->gs);
1062
1063	save->cs.selector = 0xf000;
1064	save->cs.base = 0xffff0000;
1065	/* Executable/Readable Code Segment */
1066	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1067		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1068	save->cs.limit = 0xffff;
1069
1070	save->gdtr.limit = 0xffff;
1071	save->idtr.limit = 0xffff;
1072
1073	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1074	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1075
1076	svm_set_efer(&svm->vcpu, 0);
1077	save->dr6 = 0xffff0ff0;
1078	kvm_set_rflags(&svm->vcpu, 2);
1079	save->rip = 0x0000fff0;
1080	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1081
1082	/*
1083	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1084	 * It also updates the guest-visible cr0 value.
1085	 */
1086	svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1087	kvm_mmu_reset_context(&svm->vcpu);
1088
1089	save->cr4 = X86_CR4_PAE;
1090	/* rdx = ?? */
1091
1092	if (npt_enabled) {
1093		/* Setup VMCB for Nested Paging */
1094		control->nested_ctl = 1;
1095		clr_intercept(svm, INTERCEPT_INVLPG);
1096		clr_exception_intercept(svm, PF_VECTOR);
1097		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1098		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1099		save->g_pat = svm->vcpu.arch.pat;
1100		save->cr3 = 0;
1101		save->cr4 = 0;
1102	}
1103	svm->asid_generation = 0;
1104
1105	svm->nested.vmcb = 0;
1106	svm->vcpu.arch.hflags = 0;
1107
1108	if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1109		control->pause_filter_count = 3000;
1110		set_intercept(svm, INTERCEPT_PAUSE);
1111	}
1112
1113	mark_all_dirty(svm->vmcb);
1114
1115	enable_gif(svm);
1116}
1117
1118static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1119{
1120	struct vcpu_svm *svm = to_svm(vcpu);
1121	u32 dummy;
1122	u32 eax = 1;
1123
1124	if (!init_event) {
1125		svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1126					   MSR_IA32_APICBASE_ENABLE;
1127		if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
1128			svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
1129	}
1130	init_vmcb(svm);
1131
1132	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1133	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1134}
1135
1136static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1137{
1138	struct vcpu_svm *svm;
1139	struct page *page;
1140	struct page *msrpm_pages;
1141	struct page *hsave_page;
1142	struct page *nested_msrpm_pages;
1143	int err;
1144
1145	svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1146	if (!svm) {
1147		err = -ENOMEM;
1148		goto out;
1149	}
1150
1151	err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1152	if (err)
1153		goto free_svm;
1154
1155	err = -ENOMEM;
1156	page = alloc_page(GFP_KERNEL);
1157	if (!page)
1158		goto uninit;
1159
1160	msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1161	if (!msrpm_pages)
1162		goto free_page1;
1163
1164	nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
1165	if (!nested_msrpm_pages)
1166		goto free_page2;
1167
1168	hsave_page = alloc_page(GFP_KERNEL);
1169	if (!hsave_page)
1170		goto free_page3;
1171
1172	svm->nested.hsave = page_address(hsave_page);
1173
1174	svm->msrpm = page_address(msrpm_pages);
1175	svm_vcpu_init_msrpm(svm->msrpm);
1176
1177	svm->nested.msrpm = page_address(nested_msrpm_pages);
1178	svm_vcpu_init_msrpm(svm->nested.msrpm);
1179
1180	svm->vmcb = page_address(page);
1181	clear_page(svm->vmcb);
1182	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1183	svm->asid_generation = 0;
1184	init_vmcb(svm);
1185
1186	svm_init_osvw(&svm->vcpu);
1187
1188	return &svm->vcpu;
1189
1190free_page3:
1191	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1192free_page2:
1193	__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
1194free_page1:
1195	__free_page(page);
1196uninit:
1197	kvm_vcpu_uninit(&svm->vcpu);
1198free_svm:
1199	kmem_cache_free(kvm_vcpu_cache, svm);
1200out:
1201	return ERR_PTR(err);
1202}
1203
1204static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1205{
1206	struct vcpu_svm *svm = to_svm(vcpu);
1207
1208	__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
1209	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
1210	__free_page(virt_to_page(svm->nested.hsave));
1211	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1212	kvm_vcpu_uninit(vcpu);
1213	kmem_cache_free(kvm_vcpu_cache, svm);
1214}
1215
1216static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1217{
1218	struct vcpu_svm *svm = to_svm(vcpu);
1219	int i;
1220
1221	if (unlikely(cpu != vcpu->cpu)) {
1222		svm->asid_generation = 0;
1223		mark_all_dirty(svm->vmcb);
1224	}
1225
1226#ifdef CONFIG_X86_64
1227	rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1228#endif
1229	savesegment(fs, svm->host.fs);
1230	savesegment(gs, svm->host.gs);
1231	svm->host.ldt = kvm_read_ldt();
1232
1233	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1234		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1235
1236	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1237		u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
1238		if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
1239			__this_cpu_write(current_tsc_ratio, tsc_ratio);
1240			wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
1241		}
1242	}
1243	/* This assumes that the kernel never uses MSR_TSC_AUX */
1244	if (static_cpu_has(X86_FEATURE_RDTSCP))
1245		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1246}
1247
1248static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1249{
1250	struct vcpu_svm *svm = to_svm(vcpu);
1251	int i;
1252
1253	++vcpu->stat.host_state_reload;
1254	kvm_load_ldt(svm->host.ldt);
1255#ifdef CONFIG_X86_64
1256	loadsegment(fs, svm->host.fs);
1257	wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1258	load_gs_index(svm->host.gs);
1259#else
1260#ifdef CONFIG_X86_32_LAZY_GS
1261	loadsegment(gs, svm->host.gs);
1262#endif
1263#endif
1264	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1265		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1266}
1267
1268static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1269{
1270	return to_svm(vcpu)->vmcb->save.rflags;
1271}
1272
1273static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1274{
 1275	/*
 1276	 * Any change of EFLAGS.VM is accompanied by a reload of SS
 1277	 * (caused by either a task switch or an inter-privilege IRET),
 1278	 * so we do not need to update the CPL here.
 1279	 */
1280	to_svm(vcpu)->vmcb->save.rflags = rflags;
1281}
1282
1283static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
1284{
1285	return 0;
1286}
1287
1288static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1289{
1290	switch (reg) {
1291	case VCPU_EXREG_PDPTR:
1292		BUG_ON(!npt_enabled);
1293		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
1294		break;
1295	default:
1296		BUG();
1297	}
1298}
1299
1300static void svm_set_vintr(struct vcpu_svm *svm)
1301{
1302	set_intercept(svm, INTERCEPT_VINTR);
1303}
1304
1305static void svm_clear_vintr(struct vcpu_svm *svm)
1306{
1307	clr_intercept(svm, INTERCEPT_VINTR);
1308}
1309
1310static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1311{
1312	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1313
1314	switch (seg) {
1315	case VCPU_SREG_CS: return &save->cs;
1316	case VCPU_SREG_DS: return &save->ds;
1317	case VCPU_SREG_ES: return &save->es;
1318	case VCPU_SREG_FS: return &save->fs;
1319	case VCPU_SREG_GS: return &save->gs;
1320	case VCPU_SREG_SS: return &save->ss;
1321	case VCPU_SREG_TR: return &save->tr;
1322	case VCPU_SREG_LDTR: return &save->ldtr;
1323	}
1324	BUG();
1325	return NULL;
1326}
1327
1328static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1329{
1330	struct vmcb_seg *s = svm_seg(vcpu, seg);
1331
1332	return s->base;
1333}
1334
1335static void svm_get_segment(struct kvm_vcpu *vcpu,
1336			    struct kvm_segment *var, int seg)
1337{
1338	struct vmcb_seg *s = svm_seg(vcpu, seg);
1339
1340	var->base = s->base;
1341	var->limit = s->limit;
1342	var->selector = s->selector;
1343	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1344	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1345	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1346	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1347	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1348	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1349	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1350
1351	/*
1352	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
1353	 * However, the SVM spec states that the G bit is not observed by the
1354	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1355	 * So let's synthesize a legal G bit for all segments, this helps
1356	 * running KVM nested. It also helps cross-vendor migration, because
1357	 * Intel's vmentry has a check on the 'G' bit.
1358	 */
1359	var->g = s->limit > 0xfffff;
1360
1361	/*
1362	 * AMD's VMCB does not have an explicit unusable field, so emulate it
 1363	 * for cross-vendor migration purposes by treating "not present" as unusable.
1364	 */
1365	var->unusable = !var->present || (var->type == 0);
1366
1367	switch (seg) {
1368	case VCPU_SREG_TR:
1369		/*
1370		 * Work around a bug where the busy flag in the tr selector
1371		 * isn't exposed
1372		 */
1373		var->type |= 0x2;
1374		break;
1375	case VCPU_SREG_DS:
1376	case VCPU_SREG_ES:
1377	case VCPU_SREG_FS:
1378	case VCPU_SREG_GS:
1379		/*
1380		 * The accessed bit must always be set in the segment
 1381		 * descriptor cache: although it can be cleared in the
 1382		 * descriptor itself, the cached bit always remains 1. Since
 1383		 * Intel checks this on VM entry, set it here to support
1384		 * cross-vendor migration.
1385		 */
1386		if (!var->unusable)
1387			var->type |= 0x1;
1388		break;
1389	case VCPU_SREG_SS:
1390		/*
1391		 * On AMD CPUs sometimes the DB bit in the segment
1392		 * descriptor is left as 1, although the whole segment has
1393		 * been made unusable. Clear it here to pass an Intel VMX
1394		 * entry check when cross vendor migrating.
1395		 */
1396		if (var->unusable)
1397			var->db = 0;
1398		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1399		break;
1400	}
1401}
1402
1403static int svm_get_cpl(struct kvm_vcpu *vcpu)
1404{
1405	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1406
1407	return save->cpl;
1408}
1409
1410static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1411{
1412	struct vcpu_svm *svm = to_svm(vcpu);
1413
1414	dt->size = svm->vmcb->save.idtr.limit;
1415	dt->address = svm->vmcb->save.idtr.base;
1416}
1417
1418static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1419{
1420	struct vcpu_svm *svm = to_svm(vcpu);
1421
1422	svm->vmcb->save.idtr.limit = dt->size;
 1423	svm->vmcb->save.idtr.base = dt->address;
1424	mark_dirty(svm->vmcb, VMCB_DT);
1425}
1426
1427static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1428{
1429	struct vcpu_svm *svm = to_svm(vcpu);
1430
1431	dt->size = svm->vmcb->save.gdtr.limit;
1432	dt->address = svm->vmcb->save.gdtr.base;
1433}
1434
1435static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1436{
1437	struct vcpu_svm *svm = to_svm(vcpu);
1438
1439	svm->vmcb->save.gdtr.limit = dt->size;
 1440	svm->vmcb->save.gdtr.base = dt->address;
1441	mark_dirty(svm->vmcb, VMCB_DT);
1442}
1443
1444static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1445{
1446}
1447
1448static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1449{
1450}
1451
1452static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1453{
1454}
1455
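     /*
      * The full CR0 read/write intercepts are only needed while the CR0
      * value the guest sees differs from the one in the VMCB (for example
      * when TS is forced on because the FPU is deactivated); otherwise the
      * selective CR0 write intercept installed by init_vmcb() is enough.
      */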
1456static void update_cr0_intercept(struct vcpu_svm *svm)
1457{
1458	ulong gcr0 = svm->vcpu.arch.cr0;
1459	u64 *hcr0 = &svm->vmcb->save.cr0;
1460
1461	if (!svm->vcpu.fpu_active)
1462		*hcr0 |= SVM_CR0_SELECTIVE_MASK;
1463	else
1464		*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1465			| (gcr0 & SVM_CR0_SELECTIVE_MASK);
1466
1467	mark_dirty(svm->vmcb, VMCB_CR);
1468
1469	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1470		clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1471		clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1472	} else {
1473		set_cr_intercept(svm, INTERCEPT_CR0_READ);
1474		set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1475	}
1476}
1477
1478static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1479{
1480	struct vcpu_svm *svm = to_svm(vcpu);
1481
1482#ifdef CONFIG_X86_64
1483	if (vcpu->arch.efer & EFER_LME) {
1484		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1485			vcpu->arch.efer |= EFER_LMA;
1486			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1487		}
1488
1489		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1490			vcpu->arch.efer &= ~EFER_LMA;
1491			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1492		}
1493	}
1494#endif
1495	vcpu->arch.cr0 = cr0;
1496
1497	if (!npt_enabled)
1498		cr0 |= X86_CR0_PG | X86_CR0_WP;
1499
1500	if (!vcpu->fpu_active)
1501		cr0 |= X86_CR0_TS;
1502	/*
 1503	 * Re-enable caching here because the QEMU BIOS does not do it;
 1504	 * leaving the cache disabled results in a noticeable delay at
 1505	 * reboot.
1506	 */
1507	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1508		cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1509	svm->vmcb->save.cr0 = cr0;
1510	mark_dirty(svm->vmcb, VMCB_CR);
1511	update_cr0_intercept(svm);
1512}
1513
1514static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1515{
1516	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1517	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1518
1519	if (cr4 & X86_CR4_VMXE)
1520		return 1;
1521
1522	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1523		svm_flush_tlb(vcpu);
1524
1525	vcpu->arch.cr4 = cr4;
1526	if (!npt_enabled)
1527		cr4 |= X86_CR4_PAE;
1528	cr4 |= host_cr4_mce;
1529	to_svm(vcpu)->vmcb->save.cr4 = cr4;
1530	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1531	return 0;
1532}
1533
1534static void svm_set_segment(struct kvm_vcpu *vcpu,
1535			    struct kvm_segment *var, int seg)
1536{
1537	struct vcpu_svm *svm = to_svm(vcpu);
1538	struct vmcb_seg *s = svm_seg(vcpu, seg);
1539
1540	s->base = var->base;
1541	s->limit = var->limit;
1542	s->selector = var->selector;
1543	if (var->unusable)
1544		s->attrib = 0;
1545	else {
1546		s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1547		s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1548		s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1549		s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
1550		s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1551		s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1552		s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1553		s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
1554	}
1555
1556	/*
1557	 * This is always accurate, except if SYSRET returned to a segment
1558	 * with SS.DPL != 3.  Intel does not have this quirk, and always
1559	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1560	 * would entail passing the CPL to userspace and back.
1561	 */
1562	if (seg == VCPU_SREG_SS)
1563		svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1564
1565	mark_dirty(svm->vmcb, VMCB_SEG);
1566}
1567
1568static void update_bp_intercept(struct kvm_vcpu *vcpu)
1569{
1570	struct vcpu_svm *svm = to_svm(vcpu);
1571
1572	clr_exception_intercept(svm, BP_VECTOR);
1573
1574	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1575		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1576			set_exception_intercept(svm, BP_VECTOR);
1577	} else
1578		vcpu->guest_debug = 0;
1579}
1580
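     /*
      * ASIDs are handed out per physical CPU.  When the pool is exhausted,
      * start a new generation, which flushes the whole TLB on the next
      * VMRUN, and begin again at ASID 1.
      */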
1581static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1582{
1583	if (sd->next_asid > sd->max_asid) {
1584		++sd->asid_generation;
1585		sd->next_asid = 1;
1586		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1587	}
1588
1589	svm->asid_generation = sd->asid_generation;
1590	svm->vmcb->control.asid = sd->next_asid++;
1591
1592	mark_dirty(svm->vmcb, VMCB_ASID);
1593}
1594
1595static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
1596{
1597	return to_svm(vcpu)->vmcb->save.dr6;
1598}
1599
1600static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
1601{
1602	struct vcpu_svm *svm = to_svm(vcpu);
1603
1604	svm->vmcb->save.dr6 = value;
1605	mark_dirty(svm->vmcb, VMCB_DR);
1606}
1607
1608static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1609{
1610	struct vcpu_svm *svm = to_svm(vcpu);
1611
1612	get_debugreg(vcpu->arch.db[0], 0);
1613	get_debugreg(vcpu->arch.db[1], 1);
1614	get_debugreg(vcpu->arch.db[2], 2);
1615	get_debugreg(vcpu->arch.db[3], 3);
1616	vcpu->arch.dr6 = svm_get_dr6(vcpu);
1617	vcpu->arch.dr7 = svm->vmcb->save.dr7;
1618
1619	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1620	set_dr_intercepts(svm);
1621}
1622
1623static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1624{
1625	struct vcpu_svm *svm = to_svm(vcpu);
1626
1627	svm->vmcb->save.dr7 = value;
1628	mark_dirty(svm->vmcb, VMCB_DR);
1629}
1630
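     /*
      * #PF intercept: ordinary guest page faults go through the MMU, while
      * the two asynchronous page fault reasons reported through the
      * paravirtual protocol only park or wake the affected task.
      */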
1631static int pf_interception(struct vcpu_svm *svm)
1632{
1633	u64 fault_address = svm->vmcb->control.exit_info_2;
1634	u32 error_code;
1635	int r = 1;
1636
1637	switch (svm->apf_reason) {
1638	default:
1639		error_code = svm->vmcb->control.exit_info_1;
1640
1641		trace_kvm_page_fault(fault_address, error_code);
1642		if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1643			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1644		r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1645			svm->vmcb->control.insn_bytes,
1646			svm->vmcb->control.insn_len);
1647		break;
1648	case KVM_PV_REASON_PAGE_NOT_PRESENT:
1649		svm->apf_reason = 0;
1650		local_irq_disable();
1651		kvm_async_pf_task_wait(fault_address);
1652		local_irq_enable();
1653		break;
1654	case KVM_PV_REASON_PAGE_READY:
1655		svm->apf_reason = 0;
1656		local_irq_disable();
1657		kvm_async_pf_task_wake(fault_address);
1658		local_irq_enable();
1659		break;
1660	}
1661	return r;
1662}
1663
1664static int db_interception(struct vcpu_svm *svm)
1665{
1666	struct kvm_run *kvm_run = svm->vcpu.run;
1667
1668	if (!(svm->vcpu.guest_debug &
1669	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1670		!svm->nmi_singlestep) {
1671		kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1672		return 1;
1673	}
1674
1675	if (svm->nmi_singlestep) {
1676		svm->nmi_singlestep = false;
1677		if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1678			svm->vmcb->save.rflags &=
1679				~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1680	}
1681
1682	if (svm->vcpu.guest_debug &
1683	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1684		kvm_run->exit_reason = KVM_EXIT_DEBUG;
1685		kvm_run->debug.arch.pc =
1686			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1687		kvm_run->debug.arch.exception = DB_VECTOR;
1688		return 0;
1689	}
1690
1691	return 1;
1692}
1693
1694static int bp_interception(struct vcpu_svm *svm)
1695{
1696	struct kvm_run *kvm_run = svm->vcpu.run;
1697
1698	kvm_run->exit_reason = KVM_EXIT_DEBUG;
1699	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1700	kvm_run->debug.arch.exception = BP_VECTOR;
1701	return 0;
1702}
1703
1704static int ud_interception(struct vcpu_svm *svm)
1705{
1706	int er;
1707
1708	er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1709	if (er != EMULATE_DONE)
1710		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1711	return 1;
1712}
1713
1714static int ac_interception(struct vcpu_svm *svm)
1715{
1716	kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
1717	return 1;
1718}
1719
1720static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1721{
1722	struct vcpu_svm *svm = to_svm(vcpu);
1723
1724	clr_exception_intercept(svm, NM_VECTOR);
1725
1726	svm->vcpu.fpu_active = 1;
1727	update_cr0_intercept(svm);
1728}
1729
1730static int nm_interception(struct vcpu_svm *svm)
1731{
1732	svm_fpu_activate(&svm->vcpu);
1733	return 1;
1734}
1735
1736static bool is_erratum_383(void)
1737{
1738	int err, i;
1739	u64 value;
1740
1741	if (!erratum_383_found)
1742		return false;
1743
1744	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
1745	if (err)
1746		return false;
1747
1748	/* Bit 62 may or may not be set for this mce */
1749	value &= ~(1ULL << 62);
1750
1751	if (value != 0xb600000000010015ULL)
1752		return false;
1753
1754	/* Clear MCi_STATUS registers */
1755	for (i = 0; i < 6; ++i)
1756		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
1757
1758	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
1759	if (!err) {
1760		u32 low, high;
1761
1762		value &= ~(1ULL << 2);
1763		low    = lower_32_bits(value);
1764		high   = upper_32_bits(value);
1765
1766		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
1767	}
1768
1769	/* Flush tlb to evict multi-match entries */
1770	__flush_tlb_all();
1771
1772	return true;
1773}
1774
1775static void svm_handle_mce(struct vcpu_svm *svm)
1776{
1777	if (is_erratum_383()) {
1778		/*
1779		 * Erratum 383 triggered. Guest state is corrupt so kill the
1780		 * guest.
1781		 */
1782		pr_err("KVM: Guest triggered AMD Erratum 383\n");
1783
1784		kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
1785
1786		return;
1787	}
1788
1789	/*
1790	 * On an #MC intercept the MCE handler is not called automatically in
1791	 * the host. So do it by hand here.
1792	 */
1793	asm volatile (
1794		"int $0x12\n");
1795	/* not sure if we ever come back to this point */
1796
1797	return;
1798}
1799
1800static int mc_interception(struct vcpu_svm *svm)
1801{
1802	return 1;
1803}
1804
1805static int shutdown_interception(struct vcpu_svm *svm)
1806{
1807	struct kvm_run *kvm_run = svm->vcpu.run;
1808
1809	/*
1810	 * VMCB is undefined after a SHUTDOWN intercept
1811	 * so reinitialize it.
1812	 */
1813	clear_page(svm->vmcb);
1814	init_vmcb(svm);
1815
1816	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1817	return 0;
1818}
1819
1820static int io_interception(struct vcpu_svm *svm)
1821{
1822	struct kvm_vcpu *vcpu = &svm->vcpu;
1823	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1824	int size, in, string;
1825	unsigned port;
1826
1827	++svm->vcpu.stat.io_exits;
1828	string = (io_info & SVM_IOIO_STR_MASK) != 0;
1829	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1830	if (string || in)
1831		return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1832
1833	port = io_info >> 16;
1834	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1835	svm->next_rip = svm->vmcb->control.exit_info_2;
1836	skip_emulated_instruction(&svm->vcpu);
1837
1838	return kvm_fast_pio_out(vcpu, size, port);
1839}
1840
1841static int nmi_interception(struct vcpu_svm *svm)
1842{
1843	return 1;
1844}
1845
1846static int intr_interception(struct vcpu_svm *svm)
1847{
1848	++svm->vcpu.stat.irq_exits;
1849	return 1;
1850}
1851
1852static int nop_on_interception(struct vcpu_svm *svm)
1853{
1854	return 1;
1855}
1856
1857static int halt_interception(struct vcpu_svm *svm)
1858{
1859	svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1860	return kvm_emulate_halt(&svm->vcpu);
1861}
1862
1863static int vmmcall_interception(struct vcpu_svm *svm)
1864{
1865	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1866	return kvm_emulate_hypercall(&svm->vcpu);
1867}
1868
1869static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1870{
1871	struct vcpu_svm *svm = to_svm(vcpu);
1872
1873	return svm->nested.nested_cr3;
1874}
1875
1876static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
1877{
1878	struct vcpu_svm *svm = to_svm(vcpu);
1879	u64 cr3 = svm->nested.nested_cr3;
1880	u64 pdpte;
1881	int ret;
1882
1883	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
1884				       offset_in_page(cr3) + index * 8, 8);
1885	if (ret)
1886		return 0;
1887	return pdpte;
1888}
1889
1890static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1891				   unsigned long root)
1892{
1893	struct vcpu_svm *svm = to_svm(vcpu);
1894
1895	svm->vmcb->control.nested_cr3 = root;
1896	mark_dirty(svm->vmcb, VMCB_NPT);
1897	svm_flush_tlb(vcpu);
1898}
1899
1900static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1901				       struct x86_exception *fault)
1902{
1903	struct vcpu_svm *svm = to_svm(vcpu);
1904
1905	if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
1906		/*
1907		 * TODO: track the cause of the nested page fault, and
1908		 * correctly fill in the high bits of exit_info_1.
1909		 */
1910		svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1911		svm->vmcb->control.exit_code_hi = 0;
1912		svm->vmcb->control.exit_info_1 = (1ULL << 32);
1913		svm->vmcb->control.exit_info_2 = fault->address;
1914	}
1915
1916	svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
1917	svm->vmcb->control.exit_info_1 |= fault->error_code;
1918
1919	/*
1920	 * The present bit is always zero for page structure faults on real
1921	 * hardware.
1922	 */
1923	if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
1924		svm->vmcb->control.exit_info_1 &= ~1;
1925
1926	nested_svm_vmexit(svm);
1927}
1928
1929static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1930{
1931	WARN_ON(mmu_is_nested(vcpu));
1932	kvm_init_shadow_mmu(vcpu);
1933	vcpu->arch.mmu.set_cr3           = nested_svm_set_tdp_cr3;
1934	vcpu->arch.mmu.get_cr3           = nested_svm_get_tdp_cr3;
1935	vcpu->arch.mmu.get_pdptr         = nested_svm_get_tdp_pdptr;
1936	vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1937	vcpu->arch.mmu.shadow_root_level = get_npt_level();
1938	reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
1939	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
1940}
1941
1942static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
1943{
1944	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
1945}
1946
1947static int nested_svm_check_permissions(struct vcpu_svm *svm)
1948{
1949	if (!(svm->vcpu.arch.efer & EFER_SVME)
1950	    || !is_paging(&svm->vcpu)) {
1951		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1952		return 1;
1953	}
1954
1955	if (svm->vmcb->save.cpl) {
1956		kvm_inject_gp(&svm->vcpu, 0);
1957		return 1;
1958	}
1959
 1960	return 0;
1961}
1962
1963static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1964				      bool has_error_code, u32 error_code)
1965{
1966	int vmexit;
1967
1968	if (!is_guest_mode(&svm->vcpu))
1969		return 0;
1970
1971	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
1972	svm->vmcb->control.exit_code_hi = 0;
1973	svm->vmcb->control.exit_info_1 = error_code;
1974	svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1975
1976	vmexit = nested_svm_intercept(svm);
1977	if (vmexit == NESTED_EXIT_DONE)
1978		svm->nested.exit_required = true;
1979
1980	return vmexit;
1981}
1982
 1983/* This function returns true if it is safe to enable the IRQ window */
1984static inline bool nested_svm_intr(struct vcpu_svm *svm)
1985{
1986	if (!is_guest_mode(&svm->vcpu))
1987		return true;
1988
1989	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
1990		return true;
1991
1992	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1993		return false;
1994
1995	/*
 1996	 * If a vmexit was already requested (by an intercepted exception,
 1997	 * for instance), do not overwrite it with an "external interrupt"
 1998	 * vmexit.
1999	 */
2000	if (svm->nested.exit_required)
2001		return false;
2002
2003	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
2004	svm->vmcb->control.exit_info_1 = 0;
2005	svm->vmcb->control.exit_info_2 = 0;
2006
2007	if (svm->nested.intercept & 1ULL) {
2008		/*
2009		 * The #vmexit can't be emulated here directly because this
2010		 * code path runs with irqs and preemption disabled. A
2011		 * #vmexit emulation might sleep. Only signal request for
2012		 * the #vmexit here.
2013		 */
2014		svm->nested.exit_required = true;
2015		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
2016		return false;
2017	}
2018
2019	return true;
2020}
2021
2022/* This function returns true if it is safe to enable the nmi window */
2023static inline bool nested_svm_nmi(struct vcpu_svm *svm)
2024{
2025	if (!is_guest_mode(&svm->vcpu))
2026		return true;
2027
2028	if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
2029		return true;
2030
2031	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
2032	svm->nested.exit_required = true;
2033
2034	return false;
2035}
2036
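/*
 * Map the guest page containing the given GPA and return a kernel pointer
 * to it.  On failure a #GP is injected into the guest and NULL is returned;
 * the caller must release the mapping with nested_svm_unmap().
 */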
2037static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
2038{
2039	struct page *page;
2040
2041	might_sleep();
2042
2043	page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
2044	if (is_error_page(page))
2045		goto error;
2046
2047	*_page = page;
2048
2049	return kmap(page);
2050
2051error:
2052	kvm_inject_gp(&svm->vcpu, 0);
2053
2054	return NULL;
2055}
2056
2057static void nested_svm_unmap(struct page *page)
2058{
2059	kunmap(page);
2060	kvm_release_page_dirty(page);
2061}
2062
2063static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
2064{
2065	unsigned port, size, iopm_len;
2066	u16 val, mask;
2067	u8 start_bit;
2068	u64 gpa;
2069
2070	if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
2071		return NESTED_EXIT_HOST;
2072
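	/*
	 * The nested IOPM has one bit per I/O port.  An access of 'size' bytes
	 * tests 'size' consecutive bits starting at the port's bit; read two
	 * bytes from the bitmap if those bits cross a byte boundary.
	 */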
2073	port = svm->vmcb->control.exit_info_1 >> 16;
2074	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
2075		SVM_IOIO_SIZE_SHIFT;
2076	gpa  = svm->nested.vmcb_iopm + (port / 8);
2077	start_bit = port % 8;
2078	iopm_len = (start_bit + size > 8) ? 2 : 1;
2079	mask = (0xf >> (4 - size)) << start_bit;
2080	val = 0;
2081
2082	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
2083		return NESTED_EXIT_DONE;
2084
2085	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2086}
2087
2088static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
2089{
2090	u32 offset, msr, value;
2091	int write, mask;
2092
2093	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2094		return NESTED_EXIT_HOST;
2095
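	/*
	 * Every MSR has two consecutive bits (read, write) in the MSRPM and
	 * each 32-bit word covers 16 MSRs, so bit 2 * (msr % 16) + write
	 * selects this MSR's intercept bit within 'value'.
	 */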
2096	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2097	offset = svm_msrpm_offset(msr);
2098	write  = svm->vmcb->control.exit_info_1 & 1;
2099	mask   = 1 << ((2 * (msr & 0xf)) + write);
2100
2101	if (offset == MSR_INVALID)
2102		return NESTED_EXIT_DONE;
2103
2104	/* Offset is in 32-bit units, but we need it in 8-bit (byte) units */
2105	offset *= 4;
2106
2107	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
2108		return NESTED_EXIT_DONE;
2109
2110	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
2111}
2112
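/*
 * Check for exit reasons that are handled on the host (L0) level no matter
 * what the nested guest's intercepts say.
 */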
2113static int nested_svm_exit_special(struct vcpu_svm *svm)
2114{
2115	u32 exit_code = svm->vmcb->control.exit_code;
2116
2117	switch (exit_code) {
2118	case SVM_EXIT_INTR:
2119	case SVM_EXIT_NMI:
2120	case SVM_EXIT_EXCP_BASE + MC_VECTOR:
2121		return NESTED_EXIT_HOST;
2122	case SVM_EXIT_NPF:
2123		/* For now we are always handling NPFs when using them */
2124		if (npt_enabled)
2125			return NESTED_EXIT_HOST;
2126		break;
2127	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
2128		/* When we're shadowing, trap PFs, but not async PF */
2129		if (!npt_enabled && svm->apf_reason == 0)
2130			return NESTED_EXIT_HOST;
2131		break;
2132	case SVM_EXIT_EXCP_BASE + NM_VECTOR:
2133		nm_interception(svm);
2134		break;
2135	default:
2136		break;
2137	}
2138
2139	return NESTED_EXIT_CONTINUE;
2140}
2141
2142/*
2143 * Return NESTED_EXIT_DONE if this #vmexit must be handled by the nested (L1) guest
2144 */
2145static int nested_svm_intercept(struct vcpu_svm *svm)
2146{
2147	u32 exit_code = svm->vmcb->control.exit_code;
2148	int vmexit = NESTED_EXIT_HOST;
2149
2150	switch (exit_code) {
2151	case SVM_EXIT_MSR:
2152		vmexit = nested_svm_exit_handled_msr(svm);
2153		break;
2154	case SVM_EXIT_IOIO:
2155		vmexit = nested_svm_intercept_ioio(svm);
2156		break;
2157	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
2158		u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
2159		if (svm->nested.intercept_cr & bit)
2160			vmexit = NESTED_EXIT_DONE;
2161		break;
2162	}
2163	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
2164		u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
2165		if (svm->nested.intercept_dr & bit)
2166			vmexit = NESTED_EXIT_DONE;
2167		break;
2168	}
2169	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
2170		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
2171		if (svm->nested.intercept_exceptions & excp_bits)
2172			vmexit = NESTED_EXIT_DONE;
2173		/* an async page fault always causes a vmexit */
2174		else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2175			 svm->apf_reason != 0)
2176			vmexit = NESTED_EXIT_DONE;
2177		break;
2178	}
2179	case SVM_EXIT_ERR: {
2180		vmexit = NESTED_EXIT_DONE;
2181		break;
2182	}
2183	default: {
2184		u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
2185		if (svm->nested.intercept & exit_bits)
2186			vmexit = NESTED_EXIT_DONE;
2187	}
2188	}
2189
2190	return vmexit;
2191}
2192
2193static int nested_svm_exit_handled(struct vcpu_svm *svm)
2194{
2195	int vmexit;
2196
2197	vmexit = nested_svm_intercept(svm);
2198
2199	if (vmexit == NESTED_EXIT_DONE)
2200		nested_svm_vmexit(svm);
2201
2202	return vmexit;
2203}
2204
2205static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
2206{
2207	struct vmcb_control_area *dst  = &dst_vmcb->control;
2208	struct vmcb_control_area *from = &from_vmcb->control;
2209
2210	dst->intercept_cr         = from->intercept_cr;
2211	dst->intercept_dr         = from->intercept_dr;
2212	dst->intercept_exceptions = from->intercept_exceptions;
2213	dst->intercept            = from->intercept;
2214	dst->iopm_base_pa         = from->iopm_base_pa;
2215	dst->msrpm_base_pa        = from->msrpm_base_pa;
2216	dst->tsc_offset           = from->tsc_offset;
2217	dst->asid                 = from->asid;
2218	dst->tlb_ctl              = from->tlb_ctl;
2219	dst->int_ctl              = from->int_ctl;
2220	dst->int_vector           = from->int_vector;
2221	dst->int_state            = from->int_state;
2222	dst->exit_code            = from->exit_code;
2223	dst->exit_code_hi         = from->exit_code_hi;
2224	dst->exit_info_1          = from->exit_info_1;
2225	dst->exit_info_2          = from->exit_info_2;
2226	dst->exit_int_info        = from->exit_int_info;
2227	dst->exit_int_info_err    = from->exit_int_info_err;
2228	dst->nested_ctl           = from->nested_ctl;
2229	dst->event_inj            = from->event_inj;
2230	dst->event_inj_err        = from->event_inj_err;
2231	dst->nested_cr3           = from->nested_cr3;
2232	dst->lbr_ctl              = from->lbr_ctl;
2233}
2234
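/*
 * Emulate a #VMEXIT to the L1 guest: copy the current (L2) state into the
 * nested VMCB in guest memory and restore the L1 state saved in hsave by
 * nested_svm_vmrun().
 */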
2235static int nested_svm_vmexit(struct vcpu_svm *svm)
2236{
2237	struct vmcb *nested_vmcb;
2238	struct vmcb *hsave = svm->nested.hsave;
2239	struct vmcb *vmcb = svm->vmcb;
2240	struct page *page;
2241
2242	trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
2243				       vmcb->control.exit_info_1,
2244				       vmcb->control.exit_info_2,
2245				       vmcb->control.exit_int_info,
2246				       vmcb->control.exit_int_info_err,
2247				       KVM_ISA_SVM);
2248
2249	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
2250	if (!nested_vmcb)
2251		return 1;
2252
2253	/* Exit Guest-Mode */
2254	leave_guest_mode(&svm->vcpu);
2255	svm->nested.vmcb = 0;
2256
2257	/* Give the current vmcb to the guest */
2258	disable_gif(svm);
2259
2260	nested_vmcb->save.es     = vmcb->save.es;
2261	nested_vmcb->save.cs     = vmcb->save.cs;
2262	nested_vmcb->save.ss     = vmcb->save.ss;
2263	nested_vmcb->save.ds     = vmcb->save.ds;
2264	nested_vmcb->save.gdtr   = vmcb->save.gdtr;
2265	nested_vmcb->save.idtr   = vmcb->save.idtr;
2266	nested_vmcb->save.efer   = svm->vcpu.arch.efer;
2267	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
2268	nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
2269	nested_vmcb->save.cr2    = vmcb->save.cr2;
2270	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
2271	nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2272	nested_vmcb->save.rip    = vmcb->save.rip;
2273	nested_vmcb->save.rsp    = vmcb->save.rsp;
2274	nested_vmcb->save.rax    = vmcb->save.rax;
2275	nested_vmcb->save.dr7    = vmcb->save.dr7;
2276	nested_vmcb->save.dr6    = vmcb->save.dr6;
2277	nested_vmcb->save.cpl    = vmcb->save.cpl;
2278
2279	nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
2280	nested_vmcb->control.int_vector        = vmcb->control.int_vector;
2281	nested_vmcb->control.int_state         = vmcb->control.int_state;
2282	nested_vmcb->control.exit_code         = vmcb->control.exit_code;
2283	nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
2284	nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
2285	nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
2286	nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
2287	nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2288
2289	if (svm->nrips_enabled)
2290		nested_vmcb->control.next_rip  = vmcb->control.next_rip;
2291
2292	/*
2293	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
2294	 * to make sure that we do not lose injected events. So check event_inj
2295	 * here and copy it to exit_int_info if it is valid.
2296	 * Exit_int_info and event_inj can't both be valid because the case
2297	 * below only happens on a VMRUN instruction intercept, which has
2298	 * no valid exit_int_info set.
2299	 */
2300	if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
2301		struct vmcb_control_area *nc = &nested_vmcb->control;
2302
2303		nc->exit_int_info     = vmcb->control.event_inj;
2304		nc->exit_int_info_err = vmcb->control.event_inj_err;
2305	}
2306
2307	nested_vmcb->control.tlb_ctl           = 0;
2308	nested_vmcb->control.event_inj         = 0;
2309	nested_vmcb->control.event_inj_err     = 0;
2310
2311	/* We always set V_INTR_MASKING and remember the old value in hflags */
2312	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
2313		nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
2314
2315	/* Restore the original control entries */
2316	copy_vmcb_control_area(vmcb, hsave);
2317
2318	kvm_clear_exception_queue(&svm->vcpu);
2319	kvm_clear_interrupt_queue(&svm->vcpu);
2320
2321	svm->nested.nested_cr3 = 0;
2322
2323	/* Restore selected save entries */
2324	svm->vmcb->save.es = hsave->save.es;
2325	svm->vmcb->save.cs = hsave->save.cs;
2326	svm->vmcb->save.ss = hsave->save.ss;
2327	svm->vmcb->save.ds = hsave->save.ds;
2328	svm->vmcb->save.gdtr = hsave->save.gdtr;
2329	svm->vmcb->save.idtr = hsave->save.idtr;
2330	kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
2331	svm_set_efer(&svm->vcpu, hsave->save.efer);
2332	svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2333	svm_set_cr4(&svm->vcpu, hsave->save.cr4);
2334	if (npt_enabled) {
2335		svm->vmcb->save.cr3 = hsave->save.cr3;
2336		svm->vcpu.arch.cr3 = hsave->save.cr3;
2337	} else {
2338		(void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
2339	}
2340	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
2341	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
2342	kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
2343	svm->vmcb->save.dr7 = 0;
2344	svm->vmcb->save.cpl = 0;
2345	svm->vmcb->control.exit_int_info = 0;
2346
2347	mark_all_dirty(svm->vmcb);
2348
2349	nested_svm_unmap(page);
2350
2351	nested_svm_uninit_mmu_context(&svm->vcpu);
2352	kvm_mmu_reset_context(&svm->vcpu);
2353	kvm_mmu_load(&svm->vcpu);
2354
2355	return 0;
2356}
2357
2358static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2359{
2360	/*
2361	 * This function merges the msr permission bitmaps of kvm and the
2362	 * nested vmcb. It is optimized in that it only merges the parts where
2363	 * the kvm msr permission bitmap may contain zero bits
2364	 */
2365	int i;
2366
2367	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
2368		return true;
2369
2370	for (i = 0; i < MSRPM_OFFSETS; i++) {
2371		u32 value, p;
2372		u64 offset;
2373
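		/* The offsets array is filled contiguously; 0xffffffff marks unused slots. */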
2374		if (msrpm_offsets[i] == 0xffffffff)
2375			break;
2376
2377		p      = msrpm_offsets[i];
2378		offset = svm->nested.vmcb_msrpm + (p * 4);
2379
2380		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
2381			return false;
2382
2383		svm->nested.msrpm[p] = svm->msrpm[p] | value;
2384	}
2385
2386	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
2387
2388	return true;
2389}
2390
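/*
 * Minimal sanity checks on the nested VMCB; a failure makes the emulated
 * VMRUN exit with SVM_EXIT_ERR (see nested_svm_vmrun()).
 */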
2391static bool nested_vmcb_checks(struct vmcb *vmcb)
2392{
2393	if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2394		return false;
2395
2396	if (vmcb->control.asid == 0)
2397		return false;
2398
2399	if (vmcb->control.nested_ctl && !npt_enabled)
2400		return false;
2401
2402	return true;
2403}
2404
2405static bool nested_svm_vmrun(struct vcpu_svm *svm)
2406{
2407	struct vmcb *nested_vmcb;
2408	struct vmcb *hsave = svm->nested.hsave;
2409	struct vmcb *vmcb = svm->vmcb;
2410	struct page *page;
2411	u64 vmcb_gpa;
2412
2413	vmcb_gpa = svm->vmcb->save.rax;
2414
2415	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2416	if (!nested_vmcb)
2417		return false;
2418
2419	if (!nested_vmcb_checks(nested_vmcb)) {
2420		nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
2421		nested_vmcb->control.exit_code_hi = 0;
2422		nested_vmcb->control.exit_info_1  = 0;
2423		nested_vmcb->control.exit_info_2  = 0;
2424
2425		nested_svm_unmap(page);
2426
2427		return false;
2428	}
2429
2430	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2431			       nested_vmcb->save.rip,
2432			       nested_vmcb->control.int_ctl,
2433			       nested_vmcb->control.event_inj,
2434			       nested_vmcb->control.nested_ctl);
2435
2436	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2437				    nested_vmcb->control.intercept_cr >> 16,
2438				    nested_vmcb->control.intercept_exceptions,
2439				    nested_vmcb->control.intercept);
2440
2441	/* Clear internal status */
2442	kvm_clear_exception_queue(&svm->vcpu);
2443	kvm_clear_interrupt_queue(&svm->vcpu);
2444
2445	/*
2446	 * Save the old vmcb so we don't need to pick which fields to save; we
2447	 * can simply restore everything when a #VMEXIT occurs
2448	 */
2449	hsave->save.es     = vmcb->save.es;
2450	hsave->save.cs     = vmcb->save.cs;
2451	hsave->save.ss     = vmcb->save.ss;
2452	hsave->save.ds     = vmcb->save.ds;
2453	hsave->save.gdtr   = vmcb->save.gdtr;
2454	hsave->save.idtr   = vmcb->save.idtr;
2455	hsave->save.efer   = svm->vcpu.arch.efer;
2456	hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
2457	hsave->save.cr4    = svm->vcpu.arch.cr4;
2458	hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2459	hsave->save.rip    = kvm_rip_read(&svm->vcpu);
2460	hsave->save.rsp    = vmcb->save.rsp;
2461	hsave->save.rax    = vmcb->save.rax;
2462	if (npt_enabled)
2463		hsave->save.cr3    = vmcb->save.cr3;
2464	else
2465		hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
2466
2467	copy_vmcb_control_area(hsave, vmcb);
2468
2469	if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2470		svm->vcpu.arch.hflags |= HF_HIF_MASK;
2471	else
2472		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2473
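	/* L1 enabled nested paging for L2, so switch to the nested NPT MMU. */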
2474	if (nested_vmcb->control.nested_ctl) {
2475		kvm_mmu_unload(&svm->vcpu);
2476		svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2477		nested_svm_init_mmu_context(&svm->vcpu);
2478	}
2479
2480	/* Load the nested guest state */
2481	svm->vmcb->save.es = nested_vmcb->save.es;
2482	svm->vmcb->save.cs = nested_vmcb->save.cs;
2483	svm->vmcb->save.ss = nested_vmcb->save.ss;
2484	svm->vmcb->save.ds = nested_vmcb->save.ds;
2485	svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2486	svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2487	kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2488	svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2489	svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2490	svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
2491	if (npt_enabled) {
2492		svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
2493		svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
2494	} else
2495		(void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
2496
2497	/* Guest paging mode is active - reset mmu */
2498	kvm_mmu_reset_context(&svm->vcpu);
2499
2500	svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
2501	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
2502	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
2503	kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
2504
2505	/* Set these directly in case we never reach vcpu_run, where they are normally synced */
2506	svm->vmcb->save.rax = nested_vmcb->save.rax;
2507	svm->vmcb->save.rsp = nested_vmcb->save.rsp;
2508	svm->vmcb->save.rip = nested_vmcb->save.rip;
2509	svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
2510	svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
2511	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
2512
2513	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
2514	svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
2515
2516	/* cache intercepts */
2517	svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
2518	svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
2519	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2520	svm->nested.intercept            = nested_vmcb->control.intercept;
2521
2522	svm_flush_tlb(&svm->vcpu);
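	/* Run L2 with V_INTR_MASKING set and remember L1's value in HF_VINTR_MASK. */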
2523	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2524	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2525		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
2526	else
2527		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
2528
2529	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2530		/* We only want the cr8 intercept bits of the guest */
2531		clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2532		clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2533	}
2534
2535	/* We don't want to see VMMCALLs from a nested guest */
2536	clr_intercept(svm, INTERCEPT_VMMCALL);
2537
2538	svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2539	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
2540	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
2541	svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
2542	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
2543	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
2544
2545	nested_svm_unmap(page);
2546
2547	/* Enter Guest-Mode */
2548	enter_guest_mode(&svm->vcpu);
2549
2550	/*
2551	 * Merge guest and host intercepts - must be called with the vcpu in
2552	 * guest-mode to take effect here
2553	 */
2554	recalc_intercepts(svm);
2555
2556	svm->nested.vmcb = vmcb_gpa;
2557
2558	enable_gif(svm);
2559
2560	mark_all_dirty(svm->vmcb);
2561
2562	return true;
2563}
2564
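/*
 * Copy the state that the VMLOAD/VMSAVE instructions transfer: FS, GS, TR,
 * LDTR, KERNEL_GS_BASE and the SYSCALL/SYSENTER MSR state.
 */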
2565static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
2566{
2567	to_vmcb->save.fs = from_vmcb->save.fs;
2568	to_vmcb->save.gs = from_vmcb->save.gs;
2569	to_vmcb->save.tr = from_vmcb->save.tr;
2570	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
2571	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
2572	to_vmcb->save.star = from_vmcb->save.star;
2573	to_vmcb->save.lstar = from_vmcb->save.lstar;
2574	to_vmcb->save.cstar = from_vmcb->save.cstar;
2575	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
2576	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
2577	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
2578	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
2579}
2580
2581static int vmload_interception(struct vcpu_svm *svm)
2582{
2583	struct vmcb *nested_vmcb;
2584	struct page *page;
2585
2586	if (nested_svm_check_permissions(svm))
2587		return 1;
2588
2589	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2590	if (!nested_vmcb)
2591		return 1;
2592
2593	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2594	skip_emulated_instruction(&svm->vcpu);
2595
2596	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2597	nested_svm_unmap(page);
2598
2599	return 1;
2600}
2601
2602static int vmsave_interception(struct vcpu_svm *svm)
2603{
2604	struct vmcb *nested_vmcb;
2605	struct page *page;
2606
2607	if (nested_svm_check_permissions(svm))
2608		return 1;
2609
2610	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2611	if (!nested_vmcb)
2612		return 1;
2613
2614	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2615	skip_emulated_instruction(&svm->vcpu);
2616
2617	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2618	nested_svm_unmap(page);
2619
2620	return 1;
2621}
2622
2623static int vmrun_interception(struct vcpu_svm *svm)
2624{
2625	if (nested_svm_check_permissions(svm))
2626		return 1;
2627
2628	/* Save rip after vmrun instruction */
2629	kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2630
2631	if (!nested_svm_vmrun(svm))
2632		return 1;
2633
2634	if (!nested_svm_vmrun_msrpm(svm))
2635		goto failed;
2636
2637	return 1;
2638
2639failed:
2640
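	/* Report the failed VMRUN to the L1 guest as an SVM_EXIT_ERR #vmexit. */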
2641	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
2642	svm->vmcb->control.exit_code_hi = 0;
2643	svm->vmcb->control.exit_info_1  = 0;
2644	svm->vmcb->control.exit_info_2  = 0;
2645
2646	nested_svm_vmexit(svm);
2647
2648	return 1;
2649}
2650
2651static int stgi_interception(struct vcpu_svm *svm)
2652{
2653	if (nested_svm_check_permissions(svm))
2654		return 1;
2655
2656	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2657	skip_emulated_instruction(&svm->vcpu);
2658	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2659
2660	enable_gif(svm);
2661
2662	return 1;
2663}
2664
2665static int clgi_interception(struct vcpu_svm *svm)
2666{
2667	if (nested_svm_check_permissions(svm))
2668		return 1;
2669
2670	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2671	skip_emulated_instruction(&svm->vcpu);
2672
2673	disable_gif(svm);
2674
2675	/* After a CLGI no interrupts should be delivered to the guest */
2676	svm_clear_vintr(svm);
2677	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2678
2679	mark_dirty(svm->vmcb, VMCB_INTR);
2680
2681	return 1;
2682}
2683
2684static int invlpga_interception(struct vcpu_svm *svm)
2685{
2686	struct kvm_vcpu *vcpu = &svm->vcpu;
2687
2688	trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
2689			  kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2690
2691	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2692	kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2693
2694	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2695	skip_emulated_instruction(&svm->vcpu);
2696	return 1;
2697}
2698
2699static int skinit_interception(struct vcpu_svm *svm)
2700{
2701	trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
2702
2703	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2704	return 1;
2705}
2706
2707static int wbinvd_interception(struct vcpu_svm *svm)
2708{
2709	kvm_emulate_wbinvd(&svm->vcpu);
2710	return 1;
2711}
2712
2713static int xsetbv_interception(struct vcpu_svm *svm)
2714{
2715	u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2716	u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2717
2718	if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2719		svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2720		skip_emulated_instruction(&svm->vcpu);
2721	}
2722
2723	return 1;
2724}
2725
2726static int task_switch_interception(struct vcpu_svm *svm)
2727{
2728	u16 tss_selector;
2729	int reason;
2730	int int_type = svm->vmcb->control.exit_int_info &
2731		SVM_EXITINTINFO_TYPE_MASK;
2732	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2733	uint32_t type =
2734		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2735	uint32_t idt_v =
2736		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2737	bool has_error_code = false;
2738	u32 error_code = 0;
2739
2740	tss_selector = (u16)svm->vmcb->control.exit_info_1;
2741
2742	if (svm->vmcb->control.exit_info_2 &
2743	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2744		reason = TASK_SWITCH_IRET;
2745	else if (svm->vmcb->control.exit_info_2 &
2746		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2747		reason = TASK_SWITCH_JMP;
2748	else if (idt_v)
2749		reason = TASK_SWITCH_GATE;
2750	else
2751		reason = TASK_SWITCH_CALL;
2752
2753	if (reason == TASK_SWITCH_GATE) {
2754		switch (type) {
2755		case SVM_EXITINTINFO_TYPE_NMI:
2756			svm->vcpu.arch.nmi_injected = false;
2757			break;
2758		case SVM_EXITINTINFO_TYPE_EXEPT:
2759			if (svm->vmcb->control.exit_info_2 &
2760			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2761				has_error_code = true;
2762				error_code =
2763					(u32)svm->vmcb->control.exit_info_2;
2764			}
2765			kvm_clear_exception_queue(&svm->vcpu);
2766			break;
2767		case SVM_EXITINTINFO_TYPE_INTR:
2768			kvm_clear_interrupt_queue(&svm->vcpu);
2769			break;
2770		default:
2771			break;
2772		}
2773	}
2774
2775	if (reason != TASK_SWITCH_GATE ||
2776	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2777	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2778	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
2779		skip_emulated_instruction(&svm->vcpu);
2780
2781	if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2782		int_vec = -1;
2783
2784	if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
2785				has_error_code, error_code) == EMULATE_FAIL) {
2786		svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2787		svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2788		svm->vcpu.run->internal.ndata = 0;
2789		return 0;
2790	}
2791	return 1;
2792}
2793
2794static int cpuid_interception(struct vcpu_svm *svm)
2795{
2796	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2797	kvm_emulate_cpuid(&svm->vcpu);
2798	return 1;
2799}
2800
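/*
 * IRET intercept: the guest is about to leave its NMI handler.  Record the
 * current RIP and keep NMIs masked until the vcpu makes progress past it
 * (see svm_complete_interrupts()).
 */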
2801static int iret_interception(struct vcpu_svm *svm)
2802{
2803	++svm->vcpu.stat.nmi_window_exits;
2804	clr_intercept(svm, INTERCEPT_IRET);
2805	svm->vcpu.arch.hflags |= HF_IRET_MASK;
2806	svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2807	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2808	return 1;
2809}
2810
2811static int invlpg_interception(struct vcpu_svm *svm)
2812{
2813	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2814		return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2815
2816	kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2817	skip_emulated_instruction(&svm->vcpu);
2818	return 1;
2819}
2820
2821static int emulate_on_interception(struct vcpu_svm *svm)
2822{
2823	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2824}
2825
2826static int rdpmc_interception(struct vcpu_svm *svm)
2827{
2828	int err;
2829
2830	if (!static_cpu_has(X86_FEATURE_NRIPS))
2831		return emulate_on_interception(svm);
2832
2833	err = kvm_rdpmc(&svm->vcpu);
2834	kvm_complete_insn_gp(&svm->vcpu, err);
2835
2836	return 1;
2837}
2838
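/*
 * Bits in SVM_CR0_SELECTIVE_MASK are ignored here; if any other CR0 bit
 * changes, emulate an SVM_EXIT_CR0_SEL_WRITE #vmexit for the L1 guest when
 * it has the selective CR0 intercept enabled.
 */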
2839static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
2840					    unsigned long val)
2841{
2842	unsigned long cr0 = svm->vcpu.arch.cr0;
2843	bool ret = false;
2844	u64 intercept;
2845
2846	intercept = svm->nested.intercept;
2847
2848	if (!is_guest_mode(&svm->vcpu) ||
2849	    (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
2850		return false;
2851
2852	cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2853	val &= ~SVM_CR0_SELECTIVE_MASK;
2854
2855	if (cr0 ^ val) {
2856		svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2857		ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2858	}
2859
2860	return ret;
2861}
2862
2863#define CR_VALID (1ULL << 63)
2864
2865static int cr_interception(struct vcpu_svm *svm)
2866{
2867	int reg, cr;
2868	unsigned long val;
2869	int err;
2870
2871	if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2872		return emulate_on_interception(svm);
2873
2874	if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2875		return emulate_on_interception(svm);
2876
2877	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2878	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2879		cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2880	else
2881		cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2882
2883	err = 0;
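	/* SVM_EXIT_WRITE_CRn equals SVM_EXIT_READ_CRn + 16, so cr >= 16 means a write. */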
2884	if (cr >= 16) { /* mov to cr */
2885		cr -= 16;
2886		val = kvm_register_read(&svm->vcpu, reg);
2887		switch (cr) {
2888		case 0:
2889			if (!check_selective_cr0_intercepted(svm, val))
2890				err = kvm_set_cr0(&svm->vcpu, val);
2891			else
2892				return 1;
2893
2894			break;
2895		case 3:
2896			err = kvm_set_cr3(&svm->vcpu, val);
2897			break;
2898		case 4:
2899			err = kvm_set_cr4(&svm->vcpu, val);
2900			break;
2901		case 8:
2902			err = kvm_set_cr8(&svm->vcpu, val);
2903			break;
2904		default:
2905			WARN(1, "unhandled write to CR%d", cr);
2906			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2907			return 1;
2908		}
2909	} else { /* mov from cr */
2910		switch (cr) {
2911		case 0:
2912			val = kvm_read_cr0(&svm->vcpu);
2913			break;
2914		case 2:
2915			val = svm->vcpu.arch.cr2;
2916			break;
2917		case 3:
2918			val = kvm_read_cr3(&svm->vcpu);
2919			break;
2920		case 4:
2921			val = kvm_read_cr4(&svm->vcpu);
2922			break;
2923		case 8:
2924			val = kvm_get_cr8(&svm->vcpu);
2925			break;
2926		default:
2927			WARN(1, "unhandled read from CR%d", cr);
2928			kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2929			return 1;
2930		}
2931		kvm_register_write(&svm->vcpu, reg, val);
2932	}
2933	kvm_complete_insn_gp(&svm->vcpu, err);
2934
2935	return 1;
2936}
2937
2938static int dr_interception(struct vcpu_svm *svm)
2939{
2940	int reg, dr;
2941	unsigned long val;
2942
2943	if (svm->vcpu.guest_debug == 0) {
2944		/*
2945		 * No more DR vmexits; force a reload of the debug registers
2946		 * and reenter on this instruction.  The next vmexit will
2947		 * retrieve the full state of the debug registers.
2948		 */
2949		clr_dr_intercepts(svm);
2950		svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2951		return 1;
2952	}
2953
2954	if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2955		return emulate_on_interception(svm);
2956
2957	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2958	dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2959
2960	if (dr >= 16) { /* mov to DRn */
2961		if (!kvm_require_dr(&svm->vcpu, dr - 16))
2962			return 1;
2963		val = kvm_register_read(&svm->vcpu, reg);
2964		kvm_set_dr(&svm->vcpu, dr - 16, val);
2965	} else {
2966		if (!kvm_require_dr(&svm->vcpu, dr))
2967			return 1;
2968		kvm_get_dr(&svm->vcpu, dr, &val);
2969		kvm_register_write(&svm->vcpu, reg, val);
2970	}
2971
2972	skip_emulated_instruction(&svm->vcpu);
2973
2974	return 1;
2975}
2976
2977static int cr8_write_interception(struct vcpu_svm *svm)
2978{
2979	struct kvm_run *kvm_run = svm->vcpu.run;
2980	int r;
2981
2982	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2983	/* instruction emulation calls kvm_set_cr8() */
2984	r = cr_interception(svm);
2985	if (lapic_in_kernel(&svm->vcpu))
2986		return r;
2987	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
2988		return r;
2989	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2990	return 0;
2991}
2992
2993static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2994{
2995	struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
2996	return vmcb->control.tsc_offset + host_tsc;
2997}
2998
2999static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3000{
3001	struct vcpu_svm *svm = to_svm(vcpu);
3002
3003	switch (msr_info->index) {
3004	case MSR_IA32_TSC: {
3005		msr_info->data = svm->vmcb->control.tsc_offset +
3006			kvm_scale_tsc(vcpu, rdtsc());
3007
3008		break;
3009	}
3010	case MSR_STAR:
3011		msr_info->data = svm->vmcb->save.star;
3012		break;
3013#ifdef CONFIG_X86_64
3014	case MSR_LSTAR:
3015		msr_info->data = svm->vmcb->save.lstar;
3016		break;
3017	case MSR_CSTAR:
3018		msr_info->data = svm->vmcb->save.cstar;
3019		break;
3020	case MSR_KERNEL_GS_BASE:
3021		msr_info->data = svm->vmcb->save.kernel_gs_base;
3022		break;
3023	case MSR_SYSCALL_MASK:
3024		msr_info->data = svm->vmcb->save.sfmask;
3025		break;
3026#endif
3027	case MSR_IA32_SYSENTER_CS:
3028		msr_info->data = svm->vmcb->save.sysenter_cs;
3029		break;
3030	case MSR_IA32_SYSENTER_EIP:
3031		msr_info->data = svm->sysenter_eip;
3032		break;
3033	case MSR_IA32_SYSENTER_ESP:
3034		msr_info->data = svm->sysenter_esp;
3035		break;
3036	case MSR_TSC_AUX:
3037		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3038			return 1;
3039		msr_info->data = svm->tsc_aux;
3040		break;
3041	/*
3042	 * Nobody will change the following 5 values in the VMCB so we can
3043	 * safely return them on rdmsr. They will always be 0 until LBRV is
3044	 * implemented.
3045	 */
3046	case MSR_IA32_DEBUGCTLMSR:
3047		msr_info->data = svm->vmcb->save.dbgctl;
3048		break;
3049	case MSR_IA32_LASTBRANCHFROMIP:
3050		msr_info->data = svm->vmcb->save.br_from;
3051		break;
3052	case MSR_IA32_LASTBRANCHTOIP:
3053		msr_info->data = svm->vmcb->save.br_to;
3054		break;
3055	case MSR_IA32_LASTINTFROMIP:
3056		msr_info->data = svm->vmcb->save.last_excp_from;
3057		break;
3058	case MSR_IA32_LASTINTTOIP:
3059		msr_info->data = svm->vmcb->save.last_excp_to;
3060		break;
3061	case MSR_VM_HSAVE_PA:
3062		msr_info->data = svm->nested.hsave_msr;
3063		break;
3064	case MSR_VM_CR:
3065		msr_info->data = svm->nested.vm_cr_msr;
3066		break;
3067	case MSR_IA32_UCODE_REV:
3068		msr_info->data = 0x01000065;
3069		break;
3070	case MSR_F15H_IC_CFG: {
3071
3072		int family, model;
3073
3074		family = guest_cpuid_family(vcpu);
3075		model  = guest_cpuid_model(vcpu);
3076
3077		if (family < 0 || model < 0)
3078			return kvm_get_msr_common(vcpu, msr_info);
3079
3080		msr_info->data = 0;
3081
3082		if (family == 0x15 &&
3083		    (model >= 0x2 && model < 0x20))
3084			msr_info->data = 0x1E;
3085		}
3086		break;
3087	default:
3088		return kvm_get_msr_common(vcpu, msr_info);
3089	}
3090	return 0;
3091}
3092
3093static int rdmsr_interception(struct vcpu_svm *svm)
3094{
3095	u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3096	struct msr_data msr_info;
3097
3098	msr_info.index = ecx;
3099	msr_info.host_initiated = false;
3100	if (svm_get_msr(&svm->vcpu, &msr_info)) {
3101		trace_kvm_msr_read_ex(ecx);
3102		kvm_inject_gp(&svm->vcpu, 0);
3103	} else {
3104		trace_kvm_msr_read(ecx, msr_info.data);
3105
3106		kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
3107				   msr_info.data & 0xffffffff);
3108		kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
3109				   msr_info.data >> 32);
3110		svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3111		skip_emulated_instruction(&svm->vcpu);
3112	}
3113	return 1;
3114}
3115
3116static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3117{
3118	struct vcpu_svm *svm = to_svm(vcpu);
3119	int svm_dis, chg_mask;
3120
3121	if (data & ~SVM_VM_CR_VALID_MASK)
3122		return 1;
3123
3124	chg_mask = SVM_VM_CR_VALID_MASK;
3125
3126	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
3127		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
3128
3129	svm->nested.vm_cr_msr &= ~chg_mask;
3130	svm->nested.vm_cr_msr |= (data & chg_mask);
3131
3132	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
3133
3134	/* check for svm_disable while efer.svme is set */
3135	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
3136		return 1;
3137
3138	return 0;
3139}
3140
3141static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3142{
3143	struct vcpu_svm *svm = to_svm(vcpu);
3144
3145	u32 ecx = msr->index;
3146	u64 data = msr->data;
3147	switch (ecx) {
3148	case MSR_IA32_TSC:
3149		kvm_write_tsc(vcpu, msr);
3150		break;
3151	case MSR_STAR:
3152		svm->vmcb->save.star = data;
3153		break;
3154#ifdef CONFIG_X86_64
3155	case MSR_LSTAR:
3156		svm->vmcb->save.lstar = data;
3157		break;
3158	case MSR_CSTAR:
3159		svm->vmcb->save.cstar = data;
3160		break;
3161	case MSR_KERNEL_GS_BASE:
3162		svm->vmcb->save.kernel_gs_base = data;
3163		break;
3164	case MSR_SYSCALL_MASK:
3165		svm->vmcb->save.sfmask = data;
3166		break;
3167#endif
3168	case MSR_IA32_SYSENTER_CS:
3169		svm->vmcb->save.sysenter_cs = data;
3170		break;
3171	case MSR_IA32_SYSENTER_EIP:
3172		svm->sysenter_eip = data;
3173		svm->vmcb->save.sysenter_eip = data;
3174		break;
3175	case MSR_IA32_SYSENTER_ESP:
3176		svm->sysenter_esp = data;
3177		svm->vmcb->save.sysenter_esp = data;
3178		break;
3179	case MSR_TSC_AUX:
3180		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
3181			return 1;
3182
3183		/*
3184		 * This is rare, so we update the MSR here instead of using
3185		 * direct_access_msrs.  Doing that would require a rdmsr in
3186		 * svm_vcpu_put.
3187		 */
3188		svm->tsc_aux = data;
3189		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
3190		break;
3191	case MSR_IA32_DEBUGCTLMSR:
3192		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
3193			vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
3194				    __func__, data);
3195			break;
3196		}
3197		if (data & DEBUGCTL_RESERVED_BITS)
3198			return 1;
3199
3200		svm->vmcb->save.dbgctl = data;
3201		mark_dirty(svm->vmcb, VMCB_LBR);
3202		if (data & (1ULL<<0))
3203			svm_enable_lbrv(svm);
3204		else
3205			svm_disable_lbrv(svm);
3206		break;
3207	case MSR_VM_HSAVE_PA:
3208		svm->nested.hsave_msr = data;
3209		break;
3210	case MSR_VM_CR:
3211		return svm_set_vm_cr(vcpu, data);
3212	case MSR_VM_IGNNE:
3213		vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3214		break;
3215	default:
3216		return kvm_set_msr_common(vcpu, msr);
3217	}
3218	return 0;
3219}
3220
3221static int wrmsr_interception(struct vcpu_svm *svm)
3222{
3223	struct msr_data msr;
3224	u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3225	u64 data = kvm_read_edx_eax(&svm->vcpu);
3226
3227	msr.data = data;
3228	msr.index = ecx;
3229	msr.host_initiated = false;
3230
3231	svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3232	if (kvm_set_msr(&svm->vcpu, &msr)) {
3233		trace_kvm_msr_write_ex(ecx, data);
3234		kvm_inject_gp(&svm->vcpu, 0);
3235	} else {
3236		trace_kvm_msr_write(ecx, data);
3237		skip_emulated_instruction(&svm->vcpu);
3238	}
3239	return 1;
3240}
3241
3242static int msr_interception(struct vcpu_svm *svm)
3243{
3244	if (svm->vmcb->control.exit_info_1)
3245		return wrmsr_interception(svm);
3246	else
3247		return rdmsr_interception(svm);
3248}
3249
3250static int interrupt_window_interception(struct vcpu_svm *svm)
3251{
3252	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3253	svm_clear_vintr(svm);
3254	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3255	mark_dirty(svm->vmcb, VMCB_INTR);
3256	++svm->vcpu.stat.irq_window_exits;
3257	return 1;
3258}
3259
3260static int pause_interception(struct vcpu_svm *svm)
3261{
3262	kvm_vcpu_on_spin(&(svm->vcpu));
3263	return 1;
3264}
3265
3266static int nop_interception(struct vcpu_svm *svm)
3267{
3268	skip_emulated_instruction(&(svm->vcpu));
3269	return 1;
3270}
3271
3272static int monitor_interception(struct vcpu_svm *svm)
3273{
3274	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
3275	return nop_interception(svm);
3276}
3277
3278static int mwait_interception(struct vcpu_svm *svm)
3279{
3280	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
3281	return nop_interception(svm);
3282}
3283
3284static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
3285	[SVM_EXIT_READ_CR0]			= cr_interception,
3286	[SVM_EXIT_READ_CR3]			= cr_interception,
3287	[SVM_EXIT_READ_CR4]			= cr_interception,
3288	[SVM_EXIT_READ_CR8]			= cr_interception,
3289	[SVM_EXIT_CR0_SEL_WRITE]		= cr_interception,
3290	[SVM_EXIT_WRITE_CR0]			= cr_interception,
3291	[SVM_EXIT_WRITE_CR3]			= cr_interception,
3292	[SVM_EXIT_WRITE_CR4]			= cr_interception,
3293	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
3294	[SVM_EXIT_READ_DR0]			= dr_interception,
3295	[SVM_EXIT_READ_DR1]			= dr_interception,
3296	[SVM_EXIT_READ_DR2]			= dr_interception,
3297	[SVM_EXIT_READ_DR3]			= dr_interception,
3298	[SVM_EXIT_READ_DR4]			= dr_interception,
3299	[SVM_EXIT_READ_DR5]			= dr_interception,
3300	[SVM_EXIT_READ_DR6]			= dr_interception,
3301	[SVM_EXIT_READ_DR7]			= dr_interception,
3302	[SVM_EXIT_WRITE_DR0]			= dr_interception,
3303	[SVM_EXIT_WRITE_DR1]			= dr_interception,
3304	[SVM_EXIT_WRITE_DR2]			= dr_interception,
3305	[SVM_EXIT_WRITE_DR3]			= dr_interception,
3306	[SVM_EXIT_WRITE_DR4]			= dr_interception,
3307	[SVM_EXIT_WRITE_DR5]			= dr_interception,
3308	[SVM_EXIT_WRITE_DR6]			= dr_interception,
3309	[SVM_EXIT_WRITE_DR7]			= dr_interception,
3310	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
3311	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
3312	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
3313	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
3314	[SVM_EXIT_EXCP_BASE + NM_VECTOR]	= nm_interception,
3315	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
3316	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
3317	[SVM_EXIT_INTR]				= intr_interception,
3318	[SVM_EXIT_NMI]				= nmi_interception,
3319	[SVM_EXIT_SMI]				= nop_on_interception,
3320	[SVM_EXIT_INIT]				= nop_on_interception,
3321	[SVM_EXIT_VINTR]			= interrupt_window_interception,
3322	[SVM_EXIT_RDPMC]			= rdpmc_interception,
3323	[SVM_EXIT_CPUID]			= cpuid_interception,
3324	[SVM_EXIT_IRET]                         = iret_interception,
3325	[SVM_EXIT_INVD]                         = emulate_on_interception,
3326	[SVM_EXIT_PAUSE]			= pause_interception,
3327	[SVM_EXIT_HLT]				= halt_interception,
3328	[SVM_EXIT_INVLPG]			= invlpg_interception,
3329	[SVM_EXIT_INVLPGA]			= invlpga_interception,
3330	[SVM_EXIT_IOIO]				= io_interception,
3331	[SVM_EXIT_MSR]				= msr_interception,
3332	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
3333	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
3334	[SVM_EXIT_VMRUN]			= vmrun_interception,
3335	[SVM_EXIT_VMMCALL]			= vmmcall_interception,
3336	[SVM_EXIT_VMLOAD]			= vmload_interception,
3337	[SVM_EXIT_VMSAVE]			= vmsave_interception,
3338	[SVM_EXIT_STGI]				= stgi_interception,
3339	[SVM_EXIT_CLGI]				= clgi_interception,
3340	[SVM_EXIT_SKINIT]			= skinit_interception,
3341	[SVM_EXIT_WBINVD]                       = wbinvd_interception,
3342	[SVM_EXIT_MONITOR]			= monitor_interception,
3343	[SVM_EXIT_MWAIT]			= mwait_interception,
3344	[SVM_EXIT_XSETBV]			= xsetbv_interception,
3345	[SVM_EXIT_NPF]				= pf_interception,
3346	[SVM_EXIT_RSM]                          = emulate_on_interception,
3347};
3348
3349static void dump_vmcb(struct kvm_vcpu *vcpu)
3350{
3351	struct vcpu_svm *svm = to_svm(vcpu);
3352	struct vmcb_control_area *control = &svm->vmcb->control;
3353	struct vmcb_save_area *save = &svm->vmcb->save;
3354
3355	pr_err("VMCB Control Area:\n");
3356	pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
3357	pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
3358	pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
3359	pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
3360	pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
3361	pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
3362	pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3363	pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3364	pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3365	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3366	pr_err("%-20s%d\n", "asid:", control->asid);
3367	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3368	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3369	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3370	pr_err("%-20s%08x\n", "int_state:", control->int_state);
3371	pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3372	pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3373	pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3374	pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3375	pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3376	pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3377	pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3378	pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3379	pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3380	pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
3381	pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3382	pr_err("VMCB State Save Area:\n");
3383	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3384	       "es:",
3385	       save->es.selector, save->es.attrib,
3386	       save->es.limit, save->es.base);
3387	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3388	       "cs:",
3389	       save->cs.selector, save->cs.attrib,
3390	       save->cs.limit, save->cs.base);
3391	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3392	       "ss:",
3393	       save->ss.selector, save->ss.attrib,
3394	       save->ss.limit, save->ss.base);
3395	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3396	       "ds:",
3397	       save->ds.selector, save->ds.attrib,
3398	       save->ds.limit, save->ds.base);
3399	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3400	       "fs:",
3401	       save->fs.selector, save->fs.attrib,
3402	       save->fs.limit, save->fs.base);
3403	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3404	       "gs:",
3405	       save->gs.selector, save->gs.attrib,
3406	       save->gs.limit, save->gs.base);
3407	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3408	       "gdtr:",
3409	       save->gdtr.selector, save->gdtr.attrib,
3410	       save->gdtr.limit, save->gdtr.base);
3411	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3412	       "ldtr:",
3413	       save->ldtr.selector, save->ldtr.attrib,
3414	       save->ldtr.limit, save->ldtr.base);
3415	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3416	       "idtr:",
3417	       save->idtr.selector, save->idtr.attrib,
3418	       save->idtr.limit, save->idtr.base);
3419	pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3420	       "tr:",
3421	       save->tr.selector, save->tr.attrib,
3422	       save->tr.limit, save->tr.base);
3423	pr_err("cpl:            %d                efer:         %016llx\n",
3424		save->cpl, save->efer);
3425	pr_err("%-15s %016llx %-13s %016llx\n",
3426	       "cr0:", save->cr0, "cr2:", save->cr2);
3427	pr_err("%-15s %016llx %-13s %016llx\n",
3428	       "cr3:", save->cr3, "cr4:", save->cr4);
3429	pr_err("%-15s %016llx %-13s %016llx\n",
3430	       "dr6:", save->dr6, "dr7:", save->dr7);
3431	pr_err("%-15s %016llx %-13s %016llx\n",
3432	       "rip:", save->rip, "rflags:", save->rflags);
3433	pr_err("%-15s %016llx %-13s %016llx\n",
3434	       "rsp:", save->rsp, "rax:", save->rax);
3435	pr_err("%-15s %016llx %-13s %016llx\n",
3436	       "star:", save->star, "lstar:", save->lstar);
3437	pr_err("%-15s %016llx %-13s %016llx\n",
3438	       "cstar:", save->cstar, "sfmask:", save->sfmask);
3439	pr_err("%-15s %016llx %-13s %016llx\n",
3440	       "kernel_gs_base:", save->kernel_gs_base,
3441	       "sysenter_cs:", save->sysenter_cs);
3442	pr_err("%-15s %016llx %-13s %016llx\n",
3443	       "sysenter_esp:", save->sysenter_esp,
3444	       "sysenter_eip:", save->sysenter_eip);
3445	pr_err("%-15s %016llx %-13s %016llx\n",
3446	       "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3447	pr_err("%-15s %016llx %-13s %016llx\n",
3448	       "br_from:", save->br_from, "br_to:", save->br_to);
3449	pr_err("%-15s %016llx %-13s %016llx\n",
3450	       "excp_from:", save->last_excp_from,
3451	       "excp_to:", save->last_excp_to);
3452}
3453
3454static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3455{
3456	struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3457
3458	*info1 = control->exit_info_1;
3459	*info2 = control->exit_info_2;
3460}
3461
3462static int handle_exit(struct kvm_vcpu *vcpu)
3463{
3464	struct vcpu_svm *svm = to_svm(vcpu);
3465	struct kvm_run *kvm_run = vcpu->run;
3466	u32 exit_code = svm->vmcb->control.exit_code;
3467
3468	trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
3469
3470	if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
3471		vcpu->arch.cr0 = svm->vmcb->save.cr0;
3472	if (npt_enabled)
3473		vcpu->arch.cr3 = svm->vmcb->save.cr3;
3474
3475	if (unlikely(svm->nested.exit_required)) {
3476		nested_svm_vmexit(svm);
3477		svm->nested.exit_required = false;
3478
3479		return 1;
3480	}
3481
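	/* While in guest mode, check first whether this exit must be reflected to L1. */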
3482	if (is_guest_mode(vcpu)) {
3483		int vmexit;
3484
3485		trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
3486					svm->vmcb->control.exit_info_1,
3487					svm->vmcb->control.exit_info_2,
3488					svm->vmcb->control.exit_int_info,
3489					svm->vmcb->control.exit_int_info_err,
3490					KVM_ISA_SVM);
3491
3492		vmexit = nested_svm_exit_special(svm);
3493
3494		if (vmexit == NESTED_EXIT_CONTINUE)
3495			vmexit = nested_svm_exit_handled(svm);
3496
3497		if (vmexit == NESTED_EXIT_DONE)
3498			return 1;
3499	}
3500
3501	svm_complete_interrupts(svm);
3502
3503	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3504		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3505		kvm_run->fail_entry.hardware_entry_failure_reason
3506			= svm->vmcb->control.exit_code;
3507		pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
3508		dump_vmcb(vcpu);
3509		return 0;
3510	}
3511
3512	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3513	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3514	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3515	    exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3516		printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3517		       "exit_code 0x%x\n",
3518		       __func__, svm->vmcb->control.exit_int_info,
3519		       exit_code);
3520
3521	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
3522	    || !svm_exit_handlers[exit_code]) {
3523		WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
3524		kvm_queue_exception(vcpu, UD_VECTOR);
3525		return 1;
3526	}
3527
3528	return svm_exit_handlers[exit_code](svm);
3529}
3530
3531static void reload_tss(struct kvm_vcpu *vcpu)
3532{
3533	int cpu = raw_smp_processor_id();
3534
3535	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3536	sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3537	load_TR_desc();
3538}
3539
3540static void pre_svm_run(struct vcpu_svm *svm)
3541{
3542	int cpu = raw_smp_processor_id();
3543
3544	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
3545
3546	/* FIXME: handle wraparound of asid_generation */
3547	if (svm->asid_generation != sd->asid_generation)
3548		new_asid(svm, sd);
3549}
3550
3551static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3552{
3553	struct vcpu_svm *svm = to_svm(vcpu);
3554
3555	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3556	vcpu->arch.hflags |= HF_NMI_MASK;
3557	set_intercept(svm, INTERCEPT_IRET);
3558	++vcpu->stat.nmi_injections;
3559}
3560
3561static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
3562{
3563	struct vmcb_control_area *control;
3564
3565	control = &svm->vmcb->control;
3566	control->int_vector = irq;
3567	control->int_ctl &= ~V_INTR_PRIO_MASK;
3568	control->int_ctl |= V_IRQ_MASK |
3569		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3570	mark_dirty(svm->vmcb, VMCB_INTR);
3571}
3572
3573static void svm_set_irq(struct kvm_vcpu *vcpu)
3574{
3575	struct vcpu_svm *svm = to_svm(vcpu);
3576
3577	BUG_ON(!(gif_set(svm)));
3578
3579	trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
3580	++vcpu->stat.irq_injections;
3581
3582	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3583		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
3584}
3585
3586static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3587{
3588	struct vcpu_svm *svm = to_svm(vcpu);
3589
3590	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3591		return;
3592
3593	clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3594
3595	if (irr == -1)
3596		return;
3597
3598	if (tpr >= irr)
3599		set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3600}
3601
3602static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
3603{
3604	return;
3605}
3606
3607static bool svm_get_enable_apicv(void)
3608{
3609	return false;
3610}
3611
3612static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
3613{
3614}
3615
3616static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
3617{
3618	return;
3619}
3620
3621static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
3622{
3623	return;
3624}
3625
3626static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
3627{
3628	struct vcpu_svm *svm = to_svm(vcpu);
3629	struct vmcb *vmcb = svm->vmcb;
3630	int ret;
3631	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
3632	      !(svm->vcpu.arch.hflags & HF_NMI_MASK);
3633	ret = ret && gif_set(svm) && nested_svm_nmi(svm);
3634
3635	return ret;
3636}
3637
3638static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3639{
3640	struct vcpu_svm *svm = to_svm(vcpu);
3641
3642	return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
3643}
3644
3645static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3646{
3647	struct vcpu_svm *svm = to_svm(vcpu);
3648
3649	if (masked) {
3650		svm->vcpu.arch.hflags |= HF_NMI_MASK;
3651		set_intercept(svm, INTERCEPT_IRET);
3652	} else {
3653		svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
3654		clr_intercept(svm, INTERCEPT_IRET);
3655	}
3656}
3657
3658static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3659{
3660	struct vcpu_svm *svm = to_svm(vcpu);
3661	struct vmcb *vmcb = svm->vmcb;
3662	int ret;
3663
3664	if (!gif_set(svm) ||
3665	     (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
3666		return 0;
3667
3668	ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
3669
3670	if (is_guest_mode(vcpu))
3671		return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3672
3673	return ret;
3674}
3675
3676static void enable_irq_window(struct kvm_vcpu *vcpu)
3677{
3678	struct vcpu_svm *svm = to_svm(vcpu);
3679
3680	/*
3681	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3682	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3683	 * get that intercept, this function will be called again and we will
3684	 * then get the vintr intercept.
3685	 */
3686	if (gif_set(svm) && nested_svm_intr(svm)) {
3687		svm_set_vintr(svm);
3688		svm_inject_irq(svm, 0x0);
3689	}
3690}
3691
3692static void enable_nmi_window(struct kvm_vcpu *vcpu)
3693{
3694	struct vcpu_svm *svm = to_svm(vcpu);
3695
3696	if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
3697	    == HF_NMI_MASK)
3698		return; /* IRET will cause a vm exit */
3699
3700	/*
3701	 * Something prevents the NMI from being injected.  Single-step over the
3702	 * possible blocker (IRET, exception injection or interrupt shadow).
3703	 */
3704	svm->nmi_singlestep = true;
3705	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3706}
3707
3708static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3709{
3710	return 0;
3711}
3712
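/*
 * Flush the guest's TLB entries: use a flush-by-ASID if the CPU supports
 * it, otherwise force a new ASID to be assigned on the next VMRUN.
 */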
3713static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3714{
3715	struct vcpu_svm *svm = to_svm(vcpu);
3716
3717	if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3718		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3719	else
3720		svm->asid_generation--;
3721}
3722
3723static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
3724{
3725}
3726
3727static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3728{
3729	struct vcpu_svm *svm = to_svm(vcpu);
3730
3731	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3732		return;
3733
3734	if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3735		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3736		kvm_set_cr8(vcpu, cr8);
3737	}
3738}
3739
3740static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3741{
3742	struct vcpu_svm *svm = to_svm(vcpu);
3743	u64 cr8;
3744
3745	if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3746		return;
3747
3748	cr8 = kvm_get_cr8(vcpu);
3749	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3750	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3751}
3752
3753static void svm_complete_interrupts(struct vcpu_svm *svm)
3754{
3755	u8 vector;
3756	int type;
3757	u32 exitintinfo = svm->vmcb->control.exit_int_info;
3758	unsigned int3_injected = svm->int3_injected;
3759
3760	svm->int3_injected = 0;
3761
3762	/*
3763	 * If we've made progress since setting HF_IRET_MASK, we've
3764	 * executed an IRET and can allow NMI injection.
3765	 */
3766	if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3767	    && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3768		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3769		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3770	}
3771
3772	svm->vcpu.arch.nmi_injected = false;
3773	kvm_clear_exception_queue(&svm->vcpu);
3774	kvm_clear_interrupt_queue(&svm->vcpu);
3775
3776	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3777		return;
3778
3779	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3780
3781	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3782	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3783
3784	switch (type) {
3785	case SVM_EXITINTINFO_TYPE_NMI:
3786		svm->vcpu.arch.nmi_injected = true;
3787		break;
3788	case SVM_EXITINTINFO_TYPE_EXEPT:
3789		/*
3790		 * In case of software exceptions, do not reinject the vector,
3791		 * but re-execute the instruction instead. Rewind RIP first
3792		 * if we emulated INT3 before.
3793		 */
3794		if (kvm_exception_is_soft(vector)) {
3795			if (vector == BP_VECTOR && int3_injected &&
3796			    kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
3797				kvm_rip_write(&svm->vcpu,
3798					      kvm_rip_read(&svm->vcpu) -
3799					      int3_injected);
3800			break;
3801		}
3802		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3803			u32 err = svm->vmcb->control.exit_int_info_err;
3804			kvm_requeue_exception_e(&svm->vcpu, vector, err);
3805
3806		} else
3807			kvm_requeue_exception(&svm->vcpu, vector);
3808		break;
3809	case SVM_EXITINTINFO_TYPE_INTR:
3810		kvm_queue_interrupt(&svm->vcpu, vector, false);
3811		break;
3812	default:
3813		break;
3814	}
3815}
3816
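/*
 * An event was programmed into EVENTINJ but the VMRUN it was meant for
 * is not going to happen (the entry was aborted).  Pretend the event was
 * found in EXITINTINFO so svm_complete_interrupts() requeues it for the
 * next entry.
 */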
3817static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3818{
3819	struct vcpu_svm *svm = to_svm(vcpu);
3820	struct vmcb_control_area *control = &svm->vmcb->control;
3821
3822	control->exit_int_info = control->event_inj;
3823	control->exit_int_info_err = control->event_inj_err;
3824	control->event_inj = 0;
3825	svm_complete_interrupts(svm);
3826}
3827
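/*
 * The world switch.  Roughly: copy RAX/RSP/RIP into the VMCB, clear GIF,
 * enter the guest with VMLOAD/VMRUN/VMSAVE while the inline asm saves
 * and restores the remaining general-purpose registers, then restore
 * host state, set GIF again and pull the exit information back out of
 * the VMCB.
 */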
3828static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3829{
3830	struct vcpu_svm *svm = to_svm(vcpu);
3831
3832	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3833	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3834	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3835
3836	/*
3837	 * A vmexit emulation is required before the vcpu can be executed
3838	 * again.
3839	 */
3840	if (unlikely(svm->nested.exit_required))
3841		return;
3842
3843	pre_svm_run(svm);
3844
3845	sync_lapic_to_cr8(vcpu);
3846
3847	svm->vmcb->save.cr2 = vcpu->arch.cr2;
3848
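	/*
	 * With GIF clear, physical interrupts and NMIs are held pending, so
	 * it is safe to run with IF set here; nothing is delivered to the
	 * host until stgi() below sets GIF again.
	 */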
3849	clgi();
3850
3851	local_irq_enable();
3852
3853	asm volatile (
3854		"push %%" _ASM_BP "; \n\t"
3855		"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
3856		"mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
3857		"mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
3858		"mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
3859		"mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
3860		"mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
3861#ifdef CONFIG_X86_64
3862		"mov %c[r8](%[svm]),  %%r8  \n\t"
3863		"mov %c[r9](%[svm]),  %%r9  \n\t"
3864		"mov %c[r10](%[svm]), %%r10 \n\t"
3865		"mov %c[r11](%[svm]), %%r11 \n\t"
3866		"mov %c[r12](%[svm]), %%r12 \n\t"
3867		"mov %c[r13](%[svm]), %%r13 \n\t"
3868		"mov %c[r14](%[svm]), %%r14 \n\t"
3869		"mov %c[r15](%[svm]), %%r15 \n\t"
3870#endif
3871
3872		/* Enter guest mode */
3873		"push %%" _ASM_AX " \n\t"
3874		"mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
3875		__ex(SVM_VMLOAD) "\n\t"
3876		__ex(SVM_VMRUN) "\n\t"
3877		__ex(SVM_VMSAVE) "\n\t"
3878		"pop %%" _ASM_AX " \n\t"
3879
3880		/* Save guest registers, load host registers */
3881		"mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
3882		"mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
3883		"mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
3884		"mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
3885		"mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
3886		"mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
3887#ifdef CONFIG_X86_64
3888		"mov %%r8,  %c[r8](%[svm]) \n\t"
3889		"mov %%r9,  %c[r9](%[svm]) \n\t"
3890		"mov %%r10, %c[r10](%[svm]) \n\t"
3891		"mov %%r11, %c[r11](%[svm]) \n\t"
3892		"mov %%r12, %c[r12](%[svm]) \n\t"
3893		"mov %%r13, %c[r13](%[svm]) \n\t"
3894		"mov %%r14, %c[r14](%[svm]) \n\t"
3895		"mov %%r15, %c[r15](%[svm]) \n\t"
3896#endif
3897		"pop %%" _ASM_BP
3898		:
3899		: [svm]"a"(svm),
3900		  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
3901		  [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
3902		  [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
3903		  [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
3904		  [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
3905		  [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
3906		  [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
3907#ifdef CONFIG_X86_64
3908		  , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
3909		  [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
3910		  [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
3911		  [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
3912		  [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
3913		  [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
3914		  [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
3915		  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
3916#endif
3917		: "cc", "memory"
3918#ifdef CONFIG_X86_64
3919		, "rbx", "rcx", "rdx", "rsi", "rdi"
3920		, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
3921#else
3922		, "ebx", "ecx", "edx", "esi", "edi"
3923#endif
3924		);
3925
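	/*
	 * Restore the host's GS base (64-bit) or fs/gs selectors (32-bit)
	 * from the values stashed in svm->host; the guest-state load
	 * replaced them and #VMEXIT does not restore them for us.
	 */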
3926#ifdef CONFIG_X86_64
3927	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3928#else
3929	loadsegment(fs, svm->host.fs);
3930#ifndef CONFIG_X86_32_LAZY_GS
3931	loadsegment(gs, svm->host.gs);
3932#endif
3933#endif
3934
3935	reload_tss(vcpu);
3936
3937	local_irq_disable();
3938
3939	vcpu->arch.cr2 = svm->vmcb->save.cr2;
3940	vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3941	vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3942	vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3943
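	/*
	 * If this exit was caused by an NMI, that NMI is still held pending
	 * while GIF is clear and will fire as soon as stgi() runs.  Bracket
	 * that point with kvm_before/after_handle_nmi() so the host NMI
	 * handler (e.g. perf) can tell it interrupted guest execution.
	 */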
3944	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3945		kvm_before_handle_nmi(&svm->vcpu);
3946
3947	stgi();
3948
3949	/* Any pending NMI will happen here */
3950
3951	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3952		kvm_after_handle_nmi(&svm->vcpu);
3953
3954	sync_cr8_to_lapic(vcpu);
3955
3956	svm->next_rip = 0;
3957
3958	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3959
3960	/* If the exit was due to a #PF, check whether it was an async PF */
3961	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3962		svm->apf_reason = kvm_read_and_reset_pf_reason();
3963
3964	if (npt_enabled) {
3965		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
3966		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
3967	}
3968
3969	/*
3970	 * We need to handle MC intercepts here before the vcpu has a chance to
3971	 * change the physical cpu
3972	 */
3973	if (unlikely(svm->vmcb->control.exit_code ==
3974		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
3975		svm_handle_mce(svm);
3976
3977	mark_all_clean(svm->vmcb);
3978}
3979
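/*
 * Load a new page-table root (shadow paging case) straight into the
 * VMCB's CR3 and flush the TLB, since the guest's address space changed.
 */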
3980static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3981{
3982	struct vcpu_svm *svm = to_svm(vcpu);
3983
3984	svm->vmcb->save.cr3 = root;
3985	mark_dirty(svm->vmcb, VMCB_CR);
3986	svm_flush_tlb(vcpu);
3987}
3988
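/*
 * Nested-paging case: the root passed in is the NPT (host) page table
 * and goes into nested_cr3, while save.cr3 keeps tracking the guest's
 * own CR3.
 */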
3989static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3990{
3991	struct vcpu_svm *svm = to_svm(vcpu);
3992
3993	svm->vmcb->control.nested_cr3 = root;
3994	mark_dirty(svm->vmcb, VMCB_NPT);
3995
3996	/* Also sync guest cr3 here in case we live migrate */
3997	svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
3998	mark_dirty(svm->vmcb, VMCB_CR);
3999
4000	svm_flush_tlb(vcpu);
4001}
4002
4003static int is_disabled(void)
4004{
4005	u64 vm_cr;
4006
4007	rdmsrl(MSR_VM_CR, vm_cr);
4008	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4009		return 1;
4010
4011	return 0;
4012}
4013
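/*
 * The three bytes written below encode VMMCALL (0f 01 d9), the SVM
 * counterpart of Intel's VMCALL, at the hypercall patch site.
 */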
4014static void
4015svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4016{
4017	/*
4018	 * Patch in the VMMCALL instruction:
4019	 */
4020	hypercall[0] = 0x0f;
4021	hypercall[1] = 0x01;
4022	hypercall[2] = 0xd9;
4023}
4024
4025static void svm_check_processor_compat(void *rtn)
4026{
4027	*(int *)rtn = 0;
4028}
4029
4030static bool svm_cpu_has_accelerated_tpr(void)
4031{
4032	return false;
4033}
4034
4035static bool svm_has_high_real_mode_segbase(void)
4036{
4037	return true;
4038}
4039
4040static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4041{
4042	return 0;
4043}
4044
4045static void svm_cpuid_update(struct kvm_vcpu *vcpu)
4046{
4047	struct vcpu_svm *svm = to_svm(vcpu);
4048
4049	/* Update nrips enabled cache */
4050	svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
4051}
4052
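/*
 * Adjust the CPUID bits reported to the guest: expose the SVM bit in
 * leaf 0x80000001 when nesting is enabled, and build a minimal SVM
 * feature leaf (0x8000000A) that only advertises NRIP save and nested
 * paging when the host supports them.
 */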
4053static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4054{
4055	switch (func) {
4056	case 0x80000001:
4057		if (nested)
4058			entry->ecx |= (1 << 2); /* Set SVM bit */
4059		break;
4060	case 0x8000000A:
4061		entry->eax = 1; /* SVM revision 1 */
4062		entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper
4063				   ASID emulation to nested SVM */
4064		entry->ecx = 0; /* Reserved */
4065		entry->edx = 0; /* By default, do not support any
4066				   additional features */
4067
4068		/* Support next_rip if host supports it */
4069		if (boot_cpu_has(X86_FEATURE_NRIPS))
4070			entry->edx |= SVM_FEATURE_NRIP;
4071
4072		/* Support NPT for the guest if enabled */
4073		if (npt_enabled)
4074			entry->edx |= SVM_FEATURE_NPT;
4075
4076		break;
4077	}
4078}
4079
4080static int svm_get_lpage_level(void)
4081{
4082	return PT_PDPE_LEVEL;
4083}
4084
4085static bool svm_rdtscp_supported(void)
4086{
4087	return boot_cpu_has(X86_FEATURE_RDTSCP);
4088}
4089
4090static bool svm_invpcid_supported(void)
4091{
4092	return false;
4093}
4094
4095static bool svm_mpx_supported(void)
4096{
4097	return false;
4098}
4099
4100static bool svm_xsaves_supported(void)
4101{
4102	return false;
4103}
4104
4105static bool svm_has_wbinvd_exit(void)
4106{
4107	return true;
4108}
4109
4110static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
4111{
4112	struct vcpu_svm *svm = to_svm(vcpu);
4113
4114	set_exception_intercept(svm, NM_VECTOR);
4115	update_cr0_intercept(svm);
4116}
4117
4118#define PRE_EX(exit)  { .exit_code = (exit), \
4119			.stage = X86_ICPT_PRE_EXCEPT, }
4120#define POST_EX(exit) { .exit_code = (exit), \
4121			.stage = X86_ICPT_POST_EXCEPT, }
4122#define POST_MEM(exit) { .exit_code = (exit), \
4123			.stage = X86_ICPT_POST_MEMACCESS, }
4124
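/*
 * Translation table from the emulator's x86_intercept_* codes to SVM
 * exit codes, together with the emulation stage at which the intercept
 * check must be performed for the nested guest.
 */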
4125static const struct __x86_intercept {
4126	u32 exit_code;
4127	enum x86_intercept_stage stage;
4128} x86_intercept_map[] = {
4129	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
4130	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
4131	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
4132	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
4133	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
4134	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
4135	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
4136	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
4137	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
4138	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
4139	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
4140	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
4141	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
4142	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
4143	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
4144	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
4145	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
4146	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
4147	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
4148	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
4149	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
4150	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
4151	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
4152	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
4153	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
4154	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
4155	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
4156	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
4157	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
4158	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
4159	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
4160	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
4161	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
4162	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
4163	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
4164	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
4165	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
4166	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
4167	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
4168	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
4169	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
4170	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
4171	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
4172	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
4173	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
4174	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
4175};
4176
4177#undef PRE_EX
4178#undef POST_EX
4179#undef POST_MEM
4180
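/*
 * Called by the instruction emulator when emulating on behalf of a
 * nested guest: map the instruction being emulated onto the SVM exit
 * code and exit_info the hardware would have generated, then let
 * nested_svm_exit_handled() decide whether L1 intercepts it.
 */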
4181static int svm_check_intercept(struct kvm_vcpu *vcpu,
4182			       struct x86_instruction_info *info,
4183			       enum x86_intercept_stage stage)
4184{
4185	struct vcpu_svm *svm = to_svm(vcpu);
4186	int vmexit, ret = X86EMUL_CONTINUE;
4187	struct __x86_intercept icpt_info;
4188	struct vmcb *vmcb = svm->vmcb;
4189
4190	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4191		goto out;
4192
4193	icpt_info = x86_intercept_map[info->intercept];
4194
4195	if (stage != icpt_info.stage)
4196		goto out;
4197
4198	switch (icpt_info.exit_code) {
4199	case SVM_EXIT_READ_CR0:
4200		if (info->intercept == x86_intercept_cr_read)
4201			icpt_info.exit_code += info->modrm_reg;
4202		break;
4203	case SVM_EXIT_WRITE_CR0: {
4204		unsigned long cr0, val;
4205		u64 intercept;
4206
4207		if (info->intercept == x86_intercept_cr_write)
4208			icpt_info.exit_code += info->modrm_reg;
4209
4210		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4211		    info->intercept == x86_intercept_clts)
4212			break;
4213
4214		intercept = svm->nested.intercept;
4215
4216		if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4217			break;
4218
4219		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4220		val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4221
4222		if (info->intercept == x86_intercept_lmsw) {
4223			cr0 &= 0xfUL;
4224			val &= 0xfUL;
4225			/* lmsw can't clear PE - catch this here */
4226			if (cr0 & X86_CR0_PE)
4227				val |= X86_CR0_PE;
4228		}
4229
4230		if (cr0 ^ val)
4231			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4232
4233		break;
4234	}
4235	case SVM_EXIT_READ_DR0:
4236	case SVM_EXIT_WRITE_DR0:
4237		icpt_info.exit_code += info->modrm_reg;
4238		break;
4239	case SVM_EXIT_MSR:
4240		if (info->intercept == x86_intercept_wrmsr)
4241			vmcb->control.exit_info_1 = 1;
4242		else
4243			vmcb->control.exit_info_1 = 0;
4244		break;
4245	case SVM_EXIT_PAUSE:
4246		/*
4247		 * We only get this for NOP, but PAUSE is
4248		 * REP NOP, so check the REP prefix here.
4249		 */
4250		if (info->rep_prefix != REPE_PREFIX)
4251			goto out;
4252	case SVM_EXIT_IOIO: {
4253		u64 exit_info;
4254		u32 bytes;
4255
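		/*
		 * Build EXITINFO1 for an IOIO intercept the way the hardware
		 * would: port number in bits 31:16 plus direction (TYPE set
		 * for IN), string, REP and size bits; EXITINFO2 holds the RIP
		 * of the following instruction.
		 */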
4256		if (info->intercept == x86_intercept_in ||
4257		    info->intercept == x86_intercept_ins) {
4258			exit_info = ((info->src_val & 0xffff) << 16) |
4259				SVM_IOIO_TYPE_MASK;
4260			bytes = info->dst_bytes;
4261		} else {
4262			exit_info = (info->dst_val & 0xffff) << 16;
4263			bytes = info->src_bytes;
4264		}
4265
4266		if (info->intercept == x86_intercept_outs ||
4267		    info->intercept == x86_intercept_ins)
4268			exit_info |= SVM_IOIO_STR_MASK;
4269
4270		if (info->rep_prefix)
4271			exit_info |= SVM_IOIO_REP_MASK;
4272
4273		bytes = min(bytes, 4u);
4274
4275		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4276
4277		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4278
4279		vmcb->control.exit_info_1 = exit_info;
4280		vmcb->control.exit_info_2 = info->next_rip;
4281
4282		break;
4283	}
4284	default:
4285		break;
4286	}
4287
4288	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4289	if (static_cpu_has(X86_FEATURE_NRIPS))
4290		vmcb->control.next_rip  = info->next_rip;
4291	vmcb->control.exit_code = icpt_info.exit_code;
4292	vmexit = nested_svm_exit_handled(svm);
4293
4294	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4295					   : X86EMUL_CONTINUE;
4296
4297out:
4298	return ret;
4299}
4300
4301static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
4302{
4303	local_irq_enable();
4304}
4305
4306static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4307{
4308}
4309
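/*
 * The glue between the generic x86 KVM code and this SVM backend: every
 * vendor-specific operation KVM needs is dispatched through this table,
 * which svm_init() hands to kvm_init() below.
 */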
4310static struct kvm_x86_ops svm_x86_ops = {
4311	.cpu_has_kvm_support = has_svm,
4312	.disabled_by_bios = is_disabled,
4313	.hardware_setup = svm_hardware_setup,
4314	.hardware_unsetup = svm_hardware_unsetup,
4315	.check_processor_compatibility = svm_check_processor_compat,
4316	.hardware_enable = svm_hardware_enable,
4317	.hardware_disable = svm_hardware_disable,
4318	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
4319	.cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
4320
4321	.vcpu_create = svm_create_vcpu,
4322	.vcpu_free = svm_free_vcpu,
4323	.vcpu_reset = svm_vcpu_reset,
4324
4325	.prepare_guest_switch = svm_prepare_guest_switch,
4326	.vcpu_load = svm_vcpu_load,
4327	.vcpu_put = svm_vcpu_put,
4328
4329	.update_bp_intercept = update_bp_intercept,
4330	.get_msr = svm_get_msr,
4331	.set_msr = svm_set_msr,
4332	.get_segment_base = svm_get_segment_base,
4333	.get_segment = svm_get_segment,
4334	.set_segment = svm_set_segment,
4335	.get_cpl = svm_get_cpl,
4336	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
4337	.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
4338	.decache_cr3 = svm_decache_cr3,
4339	.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
4340	.set_cr0 = svm_set_cr0,
4341	.set_cr3 = svm_set_cr3,
4342	.set_cr4 = svm_set_cr4,
4343	.set_efer = svm_set_efer,
4344	.get_idt = svm_get_idt,
4345	.set_idt = svm_set_idt,
4346	.get_gdt = svm_get_gdt,
4347	.set_gdt = svm_set_gdt,
4348	.get_dr6 = svm_get_dr6,
4349	.set_dr6 = svm_set_dr6,
4350	.set_dr7 = svm_set_dr7,
4351	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4352	.cache_reg = svm_cache_reg,
4353	.get_rflags = svm_get_rflags,
4354	.set_rflags = svm_set_rflags,
4355
4356	.get_pkru = svm_get_pkru,
4357
4358	.fpu_activate = svm_fpu_activate,
4359	.fpu_deactivate = svm_fpu_deactivate,
4360
4361	.tlb_flush = svm_flush_tlb,
4362
4363	.run = svm_vcpu_run,
4364	.handle_exit = handle_exit,
4365	.skip_emulated_instruction = skip_emulated_instruction,
4366	.set_interrupt_shadow = svm_set_interrupt_shadow,
4367	.get_interrupt_shadow = svm_get_interrupt_shadow,
4368	.patch_hypercall = svm_patch_hypercall,
4369	.set_irq = svm_set_irq,
4370	.set_nmi = svm_inject_nmi,
4371	.queue_exception = svm_queue_exception,
4372	.cancel_injection = svm_cancel_injection,
4373	.interrupt_allowed = svm_interrupt_allowed,
4374	.nmi_allowed = svm_nmi_allowed,
4375	.get_nmi_mask = svm_get_nmi_mask,
4376	.set_nmi_mask = svm_set_nmi_mask,
4377	.enable_nmi_window = enable_nmi_window,
4378	.enable_irq_window = enable_irq_window,
4379	.update_cr8_intercept = update_cr8_intercept,
4380	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
4381	.get_enable_apicv = svm_get_enable_apicv,
4382	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
4383	.load_eoi_exitmap = svm_load_eoi_exitmap,
4384	.sync_pir_to_irr = svm_sync_pir_to_irr,
4385
4386	.set_tss_addr = svm_set_tss_addr,
4387	.get_tdp_level = get_npt_level,
4388	.get_mt_mask = svm_get_mt_mask,
4389
4390	.get_exit_info = svm_get_exit_info,
4391
4392	.get_lpage_level = svm_get_lpage_level,
4393
4394	.cpuid_update = svm_cpuid_update,
4395
4396	.rdtscp_supported = svm_rdtscp_supported,
4397	.invpcid_supported = svm_invpcid_supported,
4398	.mpx_supported = svm_mpx_supported,
4399	.xsaves_supported = svm_xsaves_supported,
4400
4401	.set_supported_cpuid = svm_set_supported_cpuid,
4402
4403	.has_wbinvd_exit = svm_has_wbinvd_exit,
4404
4405	.read_tsc_offset = svm_read_tsc_offset,
4406	.write_tsc_offset = svm_write_tsc_offset,
4407	.adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
4408	.read_l1_tsc = svm_read_l1_tsc,
4409
4410	.set_tdp_cr3 = set_tdp_cr3,
4411
4412	.check_intercept = svm_check_intercept,
4413	.handle_external_intr = svm_handle_external_intr,
4414
4415	.sched_in = svm_sched_in,
4416
4417	.pmu_ops = &amd_pmu_ops,
4418};
4419
4420static int __init svm_init(void)
4421{
4422	return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
4423			__alignof__(struct vcpu_svm), THIS_MODULE);
4424}
4425
4426static void __exit svm_exit(void)
4427{
4428	kvm_exit();
4429}
4430
4431module_init(svm_init)
4432module_exit(svm_exit)