x86.c - arch/x86/kvm/x86.c - Linux diff v3.1 - Bootlin Elixir Cross Referencer

 
   1/*
   2 * Kernel-based Virtual Machine driver for Linux
   3 *
   4 * derived from drivers/kvm/kvm_main.c
   5 *
   6 * Copyright (C) 2006 Qumranet, Inc.
   7 * Copyright (C) 2008 Qumranet, Inc.
   8 * Copyright IBM Corporation, 2008
   9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  10 *
  11 * Authors:
  12 *   Avi Kivity   <avi@qumranet.com>
  13 *   Yaniv Kamay  <yaniv@qumranet.com>
  14 *   Amit Shah    <amit.shah@qumranet.com>
  15 *   Ben-Ami Yassour <benami@il.ibm.com>
  16 *
  17 * This work is licensed under the terms of the GNU GPL, version 2.  See
  18 * the COPYING file in the top-level directory.
  19 *
  20 */
  21
  22#include <linux/kvm_host.h>
  23#include "irq.h"
  24#include "mmu.h"
  25#include "i8254.h"
  26#include "tss.h"
  27#include "kvm_cache_regs.h"
  28#include "x86.h"
 
 
 
  29
  30#include <linux/clocksource.h>
  31#include <linux/interrupt.h>
  32#include <linux/kvm.h>
  33#include <linux/fs.h>
  34#include <linux/vmalloc.h>
  35#include <linux/module.h>
 
  36#include <linux/mman.h>
  37#include <linux/highmem.h>
  38#include <linux/iommu.h>
  39#include <linux/intel-iommu.h>
  40#include <linux/cpufreq.h>
  41#include <linux/user-return-notifier.h>
  42#include <linux/srcu.h>
  43#include <linux/slab.h>
  44#include <linux/perf_event.h>
  45#include <linux/uaccess.h>
  46#include <linux/hash.h>
  47#include <trace/events/kvm.h>
 
 
 
 
 
 
 
  48
  49#define CREATE_TRACE_POINTS
  50#include "trace.h"
  51
  52#include <asm/debugreg.h>
  53#include <asm/msr.h>
  54#include <asm/desc.h>
  55#include <asm/mtrr.h>
  56#include <asm/mce.h>
  57#include <asm/i387.h>
  58#include <asm/xcr.h>
  59#include <asm/pvclock.h>
  60#include <asm/div64.h>
 
 
 
 
 
 
 
 
  61
  62#define MAX_IO_MSRS 256
  63#define KVM_MAX_MCE_BANKS 32
  64#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
 
  65
  66#define emul_to_vcpu(ctxt) \
  67	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
  68
  69/* EFER defaults:
 * - enable syscall per default because it's emulated by KVM
  71 * - enable LME and LMA per default on 64 bit KVM
  72 */
  73#ifdef CONFIG_X86_64
  74static
  75u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
  76#else
  77static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
  78#endif
  79
  80#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
  81#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 
 
  82
  83static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  84static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  85				    struct kvm_cpuid_entry2 __user *entries);
 
 
 
  86
  87struct kvm_x86_ops *kvm_x86_ops;
  88EXPORT_SYMBOL_GPL(kvm_x86_ops);
  89
  90int ignore_msrs = 0;
  91module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
 
 
 
 
 
  92
  93bool kvm_has_tsc_control;
 
 
 
  94EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
  95u32  kvm_max_guest_tsc_khz;
  96EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  97
  98#define KVM_NR_SHARED_MSRS 16
  99
/*
 * File-wide table of shared MSR numbers.  msrs[slot] is filled in by
 * kvm_define_shared_msr(), and nr is kept at one past the highest slot
 * that has been defined so far.
 */
struct kvm_shared_msrs_global {
	int nr;
	u32 msrs[KVM_NR_SHARED_MSRS];
};
 104
/*
 * Per-cpu shared MSR bookkeeping (instantiated with DEFINE_PER_CPU below).
 * For every slot, "host" is the value captured by shared_msr_update() and
 * "curr" is the value most recently written to the MSR;
 * kvm_on_user_return() writes "host" back whenever the two differ.
 */
struct kvm_shared_msrs {
	struct user_return_notifier urn;
	bool registered;	/* set while urn is registered as a notifier */
	struct kvm_shared_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_NR_SHARED_MSRS];
};
 113
 114static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
 115static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
 116
/*
 * Statistics table.  Each row pairs a name with the offset of the counter
 * inside struct kvm_vcpu (VCPU_STAT) or struct kvm (VM_STAT) and the matching
 * KVM_STAT_* kind.  The table is terminated by an entry with a NULL name.
 */
struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};
 152
 153u64 __read_mostly host_xcr0;
 154
 155int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
 
 
 156
 157static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 158{
 159	int i;
 160	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
 161		vcpu->arch.apf.gfns[i] = ~0;
 162}
 163
 164static void kvm_on_user_return(struct user_return_notifier *urn)
 165{
 166	unsigned slot;
 167	struct kvm_shared_msrs *locals
 168		= container_of(urn, struct kvm_shared_msrs, urn);
 169	struct kvm_shared_msr_values *values;
 
 170
 
 
 
 
 
 
 
 
 
 
 171	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
 172		values = &locals->values[slot];
 173		if (values->host != values->curr) {
 174			wrmsrl(shared_msrs_global.msrs[slot], values->host);
 175			values->curr = values->host;
 176		}
 177	}
 178	locals->registered = false;
 179	user_return_notifier_unregister(urn);
 180}
 181
 182static void shared_msr_update(unsigned slot, u32 msr)
 183{
 184	struct kvm_shared_msrs *smsr;
 185	u64 value;
 
 
 186
 187	smsr = &__get_cpu_var(shared_msrs);
 188	/* only read, and nobody should modify it at this time,
 189	 * so don't need lock */
 190	if (slot >= shared_msrs_global.nr) {
 191		printk(KERN_ERR "kvm: invalid MSR slot!");
 192		return;
 193	}
 194	rdmsrl_safe(msr, &value);
 195	smsr->values[slot].host = value;
 196	smsr->values[slot].curr = value;
 197}
 198
 199void kvm_define_shared_msr(unsigned slot, u32 msr)
 200{
 
 
 201	if (slot >= shared_msrs_global.nr)
 202		shared_msrs_global.nr = slot + 1;
 203	shared_msrs_global.msrs[slot] = msr;
 204	/* we need ensured the shared_msr_global have been updated */
 205	smp_wmb();
 206}
 207EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
 208
 209static void kvm_shared_msr_cpu_online(void)
 210{
 211	unsigned i;
 212
 213	for (i = 0; i < shared_msrs_global.nr; ++i)
 214		shared_msr_update(i, shared_msrs_global.msrs[i]);
 215}
 216
 217void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
 218{
 219	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
 
 
 220
 221	if (((value ^ smsr->values[slot].curr) & mask) == 0)
 222		return;
 223	smsr->values[slot].curr = value;
 224	wrmsrl(shared_msrs_global.msrs[slot], value);
 
 
 
 225	if (!smsr->registered) {
 226		smsr->urn.on_user_return = kvm_on_user_return;
 227		user_return_notifier_register(&smsr->urn);
 228		smsr->registered = true;
 229	}
 
 230}
 231EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
 232
 233static void drop_user_return_notifiers(void *ignore)
 234{
 235	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
 
 236
 237	if (smsr->registered)
 238		kvm_on_user_return(&smsr->urn);
 239}
 240
 241u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 242{
 243	if (irqchip_in_kernel(vcpu->kvm))
 244		return vcpu->arch.apic_base;
 245	else
 246		return vcpu->arch.apic_base;
 247}
 248EXPORT_SYMBOL_GPL(kvm_get_apic_base);
 249
 250void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 251{
 252	/* TODO: reserve bits check */
 253	if (irqchip_in_kernel(vcpu->kvm))
 254		kvm_lapic_set_base(vcpu, data);
 255	else
 256		vcpu->arch.apic_base = data;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 257}
 258EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 259
 
 
 
 
 
 
 
 260#define EXCPT_BENIGN		0
 261#define EXCPT_CONTRIBUTORY	1
 262#define EXCPT_PF		2
 263
 264static int exception_class(int vector)
 265{
 266	switch (vector) {
 267	case PF_VECTOR:
 268		return EXCPT_PF;
 269	case DE_VECTOR:
 270	case TS_VECTOR:
 271	case NP_VECTOR:
 272	case SS_VECTOR:
 273	case GP_VECTOR:
 274		return EXCPT_CONTRIBUTORY;
 275	default:
 276		break;
 277	}
 278	return EXCPT_BENIGN;
 279}
 280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 281static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 282		unsigned nr, bool has_error, u32 error_code,
 283		bool reinject)
 284{
 285	u32 prev_nr;
 286	int class1, class2;
 287
 288	kvm_make_request(KVM_REQ_EVENT, vcpu);
 289
 290	if (!vcpu->arch.exception.pending) {
 291	queue:
 292		vcpu->arch.exception.pending = true;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 293		vcpu->arch.exception.has_error_code = has_error;
 294		vcpu->arch.exception.nr = nr;
 295		vcpu->arch.exception.error_code = error_code;
 296		vcpu->arch.exception.reinject = reinject;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 297		return;
 298	}
 299
 300	/* to check exception */
 301	prev_nr = vcpu->arch.exception.nr;
 302	if (prev_nr == DF_VECTOR) {
 303		/* triple fault -> shutdown */
 304		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 305		return;
 306	}
 307	class1 = exception_class(prev_nr);
 308	class2 = exception_class(nr);
 309	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
 310		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
 311		/* generate double fault per SDM Table 5-5 */
 
 
 
 
 312		vcpu->arch.exception.pending = true;
 
 313		vcpu->arch.exception.has_error_code = true;
 314		vcpu->arch.exception.nr = DF_VECTOR;
 315		vcpu->arch.exception.error_code = 0;
 
 
 316	} else
 317		/* replace previous exception with a new one in a hope
 318		   that instruction re-execution will regenerate lost
 319		   exception */
 320		goto queue;
 321}
 322
 323void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 324{
 325	kvm_multiple_exception(vcpu, nr, false, 0, false);
 326}
 327EXPORT_SYMBOL_GPL(kvm_queue_exception);
 328
 329void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 330{
 331	kvm_multiple_exception(vcpu, nr, false, 0, true);
 332}
 333EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 334
 335void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 
 
 
 
 
 
 
 
 
 
 
 
 
 336{
 337	if (err)
 338		kvm_inject_gp(vcpu, 0);
 339	else
 340		kvm_x86_ops->skip_emulated_instruction(vcpu);
 
 
 341}
 342EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 343
 344void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 345{
 346	++vcpu->stat.pf_guest;
 347	vcpu->arch.cr2 = fault->address;
 348	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 
 
 
 
 
 
 
 349}
 350EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 351
 352void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 353{
 354	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
 355		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
 356	else
 357		vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 
 
 358}
 359
/* Raise KVM_REQ_EVENT on @vcpu and mark an NMI as pending for it. */
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	kvm_make_request(KVM_REQ_EVENT, vcpu);
	vcpu->arch.nmi_pending = 1;
}
 365EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 366
 367void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 368{
 369	kvm_multiple_exception(vcpu, nr, true, error_code, false);
 370}
 371EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 372
 373void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 374{
 375	kvm_multiple_exception(vcpu, nr, true, error_code, true);
 376}
 377EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
 378
 379/*
 380 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 381 * a #GP and return false.
 382 */
 383bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
 384{
 385	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
 386		return true;
 387	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 388	return false;
 389}
 390EXPORT_SYMBOL_GPL(kvm_require_cpl);
 391
 
 
 
 
 
 
 
 
 
 
 392/*
 393 * This function will be used to read from the physical memory of the currently
 394 * running guest. The difference to kvm_read_guest_page is that this function
 395 * can read from guest physical or from the guest's guest physical memory.
 396 */
 397int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 398			    gfn_t ngfn, void *data, int offset, int len,
 399			    u32 access)
 400{
 
 401	gfn_t real_gfn;
 402	gpa_t ngpa;
 403
 404	ngpa     = gfn_to_gpa(ngfn);
 405	real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
 406	if (real_gfn == UNMAPPED_GVA)
 407		return -EFAULT;
 408
 409	real_gfn = gpa_to_gfn(real_gfn);
 410
 411	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
 412}
 413EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 414
/*
 * Convenience wrapper around kvm_read_guest_page_mmu() that translates
 * through the vcpu's walk_mmu.  Returns whatever that function returns.
 */
int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
			       void *data, int offset, int len, u32 access)
{
	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
				       data, offset, len, access);
}
 421
 
 
 
 
 
 
 422/*
 423 * Load the pae pdptrs.  Return true is they are all valid.
 424 */
 425int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 426{
 427	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 428	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
 429	int i;
 430	int ret;
 431	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
 432
 433	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
 434				      offset * sizeof(u64), sizeof(pdpte),
 435				      PFERR_USER_MASK|PFERR_WRITE_MASK);
 436	if (ret < 0) {
 437		ret = 0;
 438		goto out;
 439	}
 440	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
 441		if (is_present_gpte(pdpte[i]) &&
 442		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 443			ret = 0;
 444			goto out;
 445		}
 446	}
 447	ret = 1;
 448
 449	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
 450	__set_bit(VCPU_EXREG_PDPTR,
 451		  (unsigned long *)&vcpu->arch.regs_avail);
 452	__set_bit(VCPU_EXREG_PDPTR,
 453		  (unsigned long *)&vcpu->arch.regs_dirty);
 454out:
 455
 456	return ret;
 457}
 458EXPORT_SYMBOL_GPL(load_pdptrs);
 459
 460static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 461{
 462	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
 463	bool changed = true;
 464	int offset;
 465	gfn_t gfn;
 466	int r;
 467
 468	if (is_long_mode(vcpu) || !is_pae(vcpu))
 469		return false;
 470
 471	if (!test_bit(VCPU_EXREG_PDPTR,
 472		      (unsigned long *)&vcpu->arch.regs_avail))
 473		return true;
 474
 475	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
 476	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
 477	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
 478				       PFERR_USER_MASK | PFERR_WRITE_MASK);
 479	if (r < 0)
 480		goto out;
 481	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
 482out:
 483
 484	return changed;
 485}
 
 486
 487int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 488{
 489	unsigned long old_cr0 = kvm_read_cr0(vcpu);
 490	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
 491				    X86_CR0_CD | X86_CR0_NW;
 492
 493	cr0 |= X86_CR0_ET;
 494
 495#ifdef CONFIG_X86_64
 496	if (cr0 & 0xffffffff00000000UL)
 497		return 1;
 498#endif
 499
 500	cr0 &= ~CR0_RESERVED_BITS;
 501
 502	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
 503		return 1;
 504
 505	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
 506		return 1;
 507
 508	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 509#ifdef CONFIG_X86_64
 510		if ((vcpu->arch.efer & EFER_LME)) {
 511			int cs_db, cs_l;
 512
 513			if (!is_pae(vcpu))
 514				return 1;
 515			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 516			if (cs_l)
 517				return 1;
 518		} else
 519#endif
 520		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
 521						 kvm_read_cr3(vcpu)))
 522			return 1;
 523	}
 524
 
 
 
 525	kvm_x86_ops->set_cr0(vcpu, cr0);
 526
 527	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
 528		kvm_clear_async_pf_completion_queue(vcpu);
 529		kvm_async_pf_hash_reset(vcpu);
 530	}
 531
 532	if ((cr0 ^ old_cr0) & update_bits)
 533		kvm_mmu_reset_context(vcpu);
 
 
 
 
 
 
 534	return 0;
 535}
 536EXPORT_SYMBOL_GPL(kvm_set_cr0);
 537
/*
 * Emulate LMSW: build a new CR0 from the low four bits of @msw and pass it
 * to kvm_set_cr0(), whose result is deliberately discarded.
 * NOTE(review): the ~0x0e read mask keeps the existing bit 0, which is then
 * OR-ed with msw's bit 0; this assumes kvm_read_cr0_bits() returns CR0
 * masked by its argument - confirm against its definition.
 */
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
 542EXPORT_SYMBOL_GPL(kvm_lmsw);
 543
 544int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 545{
 546	u64 xcr0;
 
 
 547
 548	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
 549	if (index != XCR_XFEATURE_ENABLED_MASK)
 550		return 1;
 551	xcr0 = xcr;
 552	if (kvm_x86_ops->get_cpl(vcpu) != 0)
 553		return 1;
 554	if (!(xcr0 & XSTATE_FP))
 555		return 1;
 556	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
 
 
 
 
 
 
 
 557		return 1;
 558	if (xcr0 & ~host_xcr0)
 
 
 559		return 1;
 
 
 
 
 
 
 
 560	vcpu->arch.xcr0 = xcr0;
 561	vcpu->guest_xcr0_loaded = 0;
 
 
 562	return 0;
 563}
 564
 565int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 566{
 567	if (__kvm_set_xcr(vcpu, index, xcr)) {
 
 568		kvm_inject_gp(vcpu, 0);
 569		return 1;
 570	}
 571	return 0;
 572}
 573EXPORT_SYMBOL_GPL(kvm_set_xcr);
 574
 575static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 576{
 577	struct kvm_cpuid_entry2 *best;
 
 578
 579	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 580	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
 581}
 582
 583static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
 584{
 585	struct kvm_cpuid_entry2 *best;
 586
 587	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 588	return best && (best->ebx & bit(X86_FEATURE_SMEP));
 589}
 590
 591static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 592{
 593	struct kvm_cpuid_entry2 *best;
 594
 595	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 596	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 597}
 598
 599static void update_cpuid(struct kvm_vcpu *vcpu)
 600{
 601	struct kvm_cpuid_entry2 *best;
 602
 603	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 604	if (!best)
 605		return;
 606
 607	/* Update OSXSAVE bit */
 608	if (cpu_has_xsave && best->function == 0x1) {
 609		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
 610		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
 611			best->ecx |= bit(X86_FEATURE_OSXSAVE);
 612	}
 613}
 614
 615int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 616{
 617	unsigned long old_cr4 = kvm_read_cr4(vcpu);
 618	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
 619				   X86_CR4_PAE | X86_CR4_SMEP;
 620	if (cr4 & CR4_RESERVED_BITS)
 621		return 1;
 622
 623	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
 624		return 1;
 625
 626	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
 627		return 1;
 628
 629	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
 630		return 1;
 631
 632	if (is_long_mode(vcpu)) {
 633		if (!(cr4 & X86_CR4_PAE))
 634			return 1;
 635	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 636		   && ((cr4 ^ old_cr4) & pdptr_bits)
 637		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
 638				   kvm_read_cr3(vcpu)))
 639		return 1;
 640
 
 
 
 
 
 
 
 
 
 641	if (kvm_x86_ops->set_cr4(vcpu, cr4))
 642		return 1;
 643
 644	if ((cr4 ^ old_cr4) & pdptr_bits)
 
 645		kvm_mmu_reset_context(vcpu);
 646
 647	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
 648		update_cpuid(vcpu);
 649
 650	return 0;
 651}
 652EXPORT_SYMBOL_GPL(kvm_set_cr4);
 653
 654int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 655{
 656	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
 657		kvm_mmu_sync_roots(vcpu);
 658		kvm_mmu_flush_tlb(vcpu);
 659		return 0;
 
 
 
 660	}
 
 661
 662	if (is_long_mode(vcpu)) {
 663		if (cr3 & CR3_L_MODE_RESERVED_BITS)
 664			return 1;
 665	} else {
 666		if (is_pae(vcpu)) {
 667			if (cr3 & CR3_PAE_RESERVED_BITS)
 668				return 1;
 669			if (is_paging(vcpu) &&
 670			    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 671				return 1;
 672		}
 673		/*
 674		 * We don't check reserved bits in nonpae mode, because
 675		 * this isn't enforced, and VMware depends on this.
 676		 */
 677	}
 678
 679	/*
 680	 * Does the new cr3 value map to physical memory? (Note, we
 681	 * catch an invalid cr3 even in real-mode, because it would
 682	 * cause trouble later on when we turn on paging anyway.)
 683	 *
 684	 * A real CPU would silently accept an invalid cr3 and would
 685	 * attempt to use it - with largely undefined (and often hard
 686	 * to debug) behavior on the guest side.
 687	 */
 688	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 689		return 1;
 
 
 
 
 
 690	vcpu->arch.cr3 = cr3;
 691	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 692	vcpu->arch.mmu.new_cr3(vcpu);
 693	return 0;
 694}
 695EXPORT_SYMBOL_GPL(kvm_set_cr3);
 696
 697int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 698{
 699	if (cr8 & CR8_RESERVED_BITS)
 700		return 1;
 701	if (irqchip_in_kernel(vcpu->kvm))
 702		kvm_lapic_set_tpr(vcpu, cr8);
 703	else
 704		vcpu->arch.cr8 = cr8;
 705	return 0;
 706}
 707EXPORT_SYMBOL_GPL(kvm_set_cr8);
 708
 709unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 710{
 711	if (irqchip_in_kernel(vcpu->kvm))
 712		return kvm_lapic_get_cr8(vcpu);
 713	else
 714		return vcpu->arch.cr8;
 715}
 716EXPORT_SYMBOL_GPL(kvm_get_cr8);
 717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 718static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 719{
 720	switch (dr) {
 721	case 0 ... 3:
 722		vcpu->arch.db[dr] = val;
 723		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 724			vcpu->arch.eff_db[dr] = val;
 725		break;
 726	case 4:
 727		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 728			return 1; /* #UD */
 729		/* fall through */
 730	case 6:
 731		if (val & 0xffffffff00000000ULL)
 732			return -1; /* #GP */
 733		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 
 734		break;
 735	case 5:
 736		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 737			return 1; /* #UD */
 738		/* fall through */
 739	default: /* 7 */
 740		if (val & 0xffffffff00000000ULL)
 741			return -1; /* #GP */
 742		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 743		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
 744			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
 745			vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
 746		}
 747		break;
 748	}
 749
 750	return 0;
 751}
 752
 753int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 754{
 755	int res;
 756
 757	res = __kvm_set_dr(vcpu, dr, val);
 758	if (res > 0)
 759		kvm_queue_exception(vcpu, UD_VECTOR);
 760	else if (res < 0)
 761		kvm_inject_gp(vcpu, 0);
 762
 763	return res;
 
 764}
 765EXPORT_SYMBOL_GPL(kvm_set_dr);
 766
 767static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 768{
 769	switch (dr) {
 770	case 0 ... 3:
 771		*val = vcpu->arch.db[dr];
 772		break;
 773	case 4:
 774		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 775			return 1;
 776		/* fall through */
 777	case 6:
 778		*val = vcpu->arch.dr6;
 
 
 
 779		break;
 780	case 5:
 781		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 782			return 1;
 783		/* fall through */
 784	default: /* 7 */
 785		*val = vcpu->arch.dr7;
 786		break;
 787	}
 788
 789	return 0;
 790}
 
 791
 792int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 793{
 794	if (_kvm_get_dr(vcpu, dr, val)) {
 795		kvm_queue_exception(vcpu, UD_VECTOR);
 796		return 1;
 797	}
 798	return 0;
 
 
 
 
 
 799}
 800EXPORT_SYMBOL_GPL(kvm_get_dr);
 801
 802/*
 803 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 804 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 805 *
 806 * This list is modified at module load time to reflect the
 
 
 807 * capabilities of the host cpu. This capabilities test skips MSRs that are
 808 * kvm-specific. Those are put in the beginning of the list.
 
 809 */
 810
/*
 * KVM_SAVE_MSRS_BEGIN equals the number of kvm-specific entries (the
 * MSR_KVM_* and HV_X64_MSR_* ones) placed at the head of msrs_to_save[];
 * keep the two in sync when adding entries.
 */
#define KVM_SAVE_MSRS_BEGIN	9
static u32 msrs_to_save[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
 824
 
 825static unsigned num_msrs_to_save;
 826
/*
 * NOTE(review): presumably the MSRs that are emulated irrespective of host
 * support, kept apart from msrs_to_save[] - confirm against the users of
 * this table.
 */
static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
};
 832
 833static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 
 
 834{
 835	u64 old_efer = vcpu->arch.efer;
 836
 837	if (efer & efer_reserved_bits)
 838		return 1;
 839
 840	if (is_paging(vcpu)
 841	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
 842		return 1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 843
 844	if (efer & EFER_FFXSR) {
 845		struct kvm_cpuid_entry2 *feat;
 846
 847		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 848		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
 
 
 
 
 
 
 
 
 
 849			return 1;
 850	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 851
 852	if (efer & EFER_SVME) {
 853		struct kvm_cpuid_entry2 *feat;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 854
 855		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
 856		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 857			return 1;
 858	}
 859
 860	efer &= ~EFER_LMA;
 861	efer |= vcpu->arch.efer & EFER_LMA;
 862
 863	kvm_x86_ops->set_efer(vcpu, efer);
 864
 865	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 866
 867	/* Update reserved bits */
 868	if ((efer ^ old_efer) & EFER_NX)
 869		kvm_mmu_reset_context(vcpu);
 870
 871	return 0;
 872}
 873
/*
 * Remove the bits in @mask from efer_reserved_bits, so that set_efer() no
 * longer rejects them as reserved.
 */
void kvm_enable_efer_bits(u64 mask)
{
       efer_reserved_bits &= ~mask;
}
 878EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 880
/*
 * Writes msr value into the appropriate "register" by forwarding to the
 * vendor set_msr callback.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 890
 891/*
 892 * Adapt set_msr() to msr_io()'s calling convention
 893 */
 
 
 
 
 
/* Takes the value by pointer and passes *@data on to kvm_set_msr(). */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return kvm_set_msr(vcpu, index, *data);
}
 898
 899static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 900{
 901	int version;
 902	int r;
 903	struct pvclock_wall_clock wc;
 904	struct timespec boot;
 905
 906	if (!wall_clock)
 907		return;
 908
 909	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
 910	if (r)
 911		return;
 912
 913	if (version & 1)
 914		++version;  /* first time write, random junk */
 915
 916	++version;
 917
 918	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 
 919
 920	/*
 921	 * The guest calculates current wall clock time by adding
 922	 * system time (updated by kvm_guest_time_update below) to the
 923	 * wall clock specified here.  guest system time equals host
 924	 * system time for us, thus we must fill in host boot time here.
 925	 */
 926	getboottime(&boot);
 927
 928	wc.sec = boot.tv_sec;
 
 
 
 
 929	wc.nsec = boot.tv_nsec;
 930	wc.version = version;
 931
 932	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 933
 934	version++;
 935	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 936}
 937
 938static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 939{
 940	uint32_t quotient, remainder;
 941
 942	/* Don't try to replace with do_div(), this one calculates
 943	 * "(dividend << 32) / divisor" */
 944	__asm__ ( "divl %4"
 945		  : "=a" (quotient), "=d" (remainder)
 946		  : "0" (0), "1" (dividend), "r" (divisor) );
 947	return quotient;
 948}
 949
 950static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
 951			       s8 *pshift, u32 *pmultiplier)
 952{
 953	uint64_t scaled64;
 954	int32_t  shift = 0;
 955	uint64_t tps64;
 956	uint32_t tps32;
 957
 958	tps64 = base_khz * 1000LL;
 959	scaled64 = scaled_khz * 1000LL;
 960	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
 961		tps64 >>= 1;
 962		shift--;
 963	}
 964
 965	tps32 = (uint32_t)tps64;
 966	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
 967		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
 968			scaled64 >>= 1;
 969		else
 970			tps32 <<= 1;
 971		shift++;
 972	}
 973
 974	*pshift = shift;
 975	*pmultiplier = div_frac(scaled64, tps32);
 
 
 
 
 
 976
 977	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
 978		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
 
 
 
 
 
 
 979}
 980
/*
 * Return the time from ktime_get_ts(), converted by
 * monotonic_to_bootbased(), in nanoseconds.  Warns when called from
 * preemptible context.
 */
static inline u64 get_kernel_ns(void)
{
	struct timespec ts;

	WARN_ON(preemptible());
	ktime_get_ts(&ts);
	monotonic_to_bootbased(&ts);
	return timespec_to_ns(&ts);
}
 990
 991static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 992unsigned long max_tsc_khz;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 993
 994static inline int kvm_tsc_changes_freq(void)
 995{
 996	int cpu = get_cpu();
 997	int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
 998		  cpufreq_quick_get(cpu) != 0;
 999	put_cpu();
1000	return ret;
1001}
1002
1003static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
1004{
1005	if (vcpu->arch.virtual_tsc_khz)
1006		return vcpu->arch.virtual_tsc_khz;
1007	else
1008		return __this_cpu_read(cpu_tsc_khz);
1009}
1010
1011static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1012{
1013	u64 ret;
 
 
 
1014
1015	WARN_ON(preemptible());
1016	if (kvm_tsc_changes_freq())
1017		printk_once(KERN_WARNING
1018		 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
1019	ret = nsec * vcpu_tsc_khz(vcpu);
1020	do_div(ret, USEC_PER_SEC);
1021	return ret;
 
 
 
 
 
 
 
 
 
 
 
 
1022}
1023
/*
 * Fill in the vcpu's tsc_catchup_shift / tsc_catchup_mult for a TSC running
 * at @this_tsc_khz, using NSEC_PER_SEC / 1000 as the base rate.
 */
static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
	/* Compute a scale to convert nanoseconds in TSC cycles */
	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
			   &vcpu->arch.tsc_catchup_shift,
			   &vcpu->arch.tsc_catchup_mult);
}
1031
1032static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 
 
 
 
 
 
 
 
 
 
1033{
1034	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
1035				      vcpu->arch.tsc_catchup_mult,
1036				      vcpu->arch.tsc_catchup_shift);
1037	tsc += vcpu->arch.last_tsc_write;
1038	return tsc;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1039}
1040
1041void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1042{
1043	struct kvm *kvm = vcpu->kvm;
1044	u64 offset, ns, elapsed;
1045	unsigned long flags;
1046	s64 sdiff;
 
 
 
1047
1048	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1049	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1050	ns = get_kernel_ns();
1051	elapsed = ns - kvm->arch.last_tsc_nsec;
1052	sdiff = data - kvm->arch.last_tsc_write;
1053	if (sdiff < 0)
1054		sdiff = -sdiff;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1055
1056	/*
1057	 * Special case: close write to TSC within 5 seconds of
1058	 * another CPU is interpreted as an attempt to synchronize
1059	 * The 5 seconds is to accommodate host load / swapping as
1060	 * well as any reset of TSC during the boot process.
1061	 *
1062	 * In that case, for a reliable TSC, we can match TSC offsets,
1063	 * or make a best guest using elapsed value.
1064	 */
1065	if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
1066	    elapsed < 5ULL * NSEC_PER_SEC) {
1067		if (!check_tsc_unstable()) {
1068			offset = kvm->arch.last_tsc_offset;
1069			pr_debug("kvm: matched tsc offset for %llu\n", data);
1070		} else {
1071			u64 delta = nsec_to_cycles(vcpu, elapsed);
1072			offset += delta;
1073			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1074		}
1075		ns = kvm->arch.last_tsc_nsec;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1076	}
 
 
 
 
 
1077	kvm->arch.last_tsc_nsec = ns;
1078	kvm->arch.last_tsc_write = data;
1079	kvm->arch.last_tsc_offset = offset;
1080	kvm_x86_ops->write_tsc_offset(vcpu, offset);
 
 
 
 
 
 
 
 
 
 
 
1081	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1082
1083	/* Reset of TSC must disable overshoot protection below */
1084	vcpu->arch.hv_clock.tsc_timestamp = 0;
1085	vcpu->arch.last_tsc_write = data;
1086	vcpu->arch.last_tsc_nsec = ns;
 
 
 
 
 
1087}
 
1088EXPORT_SYMBOL_GPL(kvm_write_tsc);
1089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1090static int kvm_guest_time_update(struct kvm_vcpu *v)
1091{
1092	unsigned long flags;
1093	struct kvm_vcpu_arch *vcpu = &v->arch;
1094	void *shared_kaddr;
1095	unsigned long this_tsc_khz;
1096	s64 kernel_ns, max_kernel_ns;
1097	u64 tsc_timestamp;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1098
1099	/* Keep irq disabled to prevent changes to the clock */
1100	local_irq_save(flags);
1101	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1102	kernel_ns = get_kernel_ns();
1103	this_tsc_khz = vcpu_tsc_khz(v);
1104	if (unlikely(this_tsc_khz == 0)) {
1105		local_irq_restore(flags);
1106		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1107		return 1;
1108	}
 
 
 
 
 
 
1109
1110	/*
1111	 * We may have to catch up the TSC to match elapsed wall clock
1112	 * time for two reasons, even if kvmclock is used.
1113	 *   1) CPU could have been running below the maximum TSC rate
1114	 *   2) Broken TSC compensation resets the base at each VCPU
1115	 *      entry to avoid unknown leaps of TSC even when running
1116	 *      again on the same CPU.  This may cause apparent elapsed
1117	 *      time to disappear, and the guest to stand still or run
1118	 *	very slowly.
1119	 */
1120	if (vcpu->tsc_catchup) {
1121		u64 tsc = compute_guest_tsc(v, kernel_ns);
1122		if (tsc > tsc_timestamp) {
1123			kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
1124			tsc_timestamp = tsc;
1125		}
1126	}
1127
1128	local_irq_restore(flags);
1129
1130	if (!vcpu->time_page)
1131		return 0;
1132
1133	/*
1134	 * Time as measured by the TSC may go backwards when resetting the base
1135	 * tsc_timestamp.  The reason for this is that the TSC resolution is
1136	 * higher than the resolution of the other clock scales.  Thus, many
1137	 * possible measurments of the TSC correspond to one measurement of any
1138	 * other clock, and so a spread of values is possible.  This is not a
1139	 * problem for the computation of the nanosecond clock; with TSC rates
1140	 * around 1GHZ, there can only be a few cycles which correspond to one
1141	 * nanosecond value, and any path through this code will inevitably
1142	 * take longer than that.  However, with the kernel_ns value itself,
1143	 * the precision may be much lower, down to HZ granularity.  If the
1144	 * first sampling of TSC against kernel_ns ends in the low part of the
1145	 * range, and the second in the high end of the range, we can get:
1146	 *
1147	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1148	 *
1149	 * As the sampling errors potentially range in the thousands of cycles,
1150	 * it is possible such a time value has already been observed by the
1151	 * guest.  To protect against this, we must compute the system time as
1152	 * observed by the guest and ensure the new system time is greater.
1153	 */
1154	max_kernel_ns = 0;
1155	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
1156		max_kernel_ns = vcpu->last_guest_tsc -
1157				vcpu->hv_clock.tsc_timestamp;
1158		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1159				    vcpu->hv_clock.tsc_to_system_mul,
1160				    vcpu->hv_clock.tsc_shift);
1161		max_kernel_ns += vcpu->last_kernel_ns;
1162	}
1163
1164	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1165		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1166				   &vcpu->hv_clock.tsc_shift,
1167				   &vcpu->hv_clock.tsc_to_system_mul);
1168		vcpu->hw_tsc_khz = this_tsc_khz;
1169	}
1170
1171	if (max_kernel_ns > kernel_ns)
1172		kernel_ns = max_kernel_ns;
1173
1174	/* With all the info we got, fill in the values */
1175	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1176	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1177	vcpu->last_kernel_ns = kernel_ns;
1178	vcpu->last_guest_tsc = tsc_timestamp;
1179	vcpu->hv_clock.flags = 0;
1180
1181	/*
1182	 * The interface expects us to write an even number signaling that the
1183	 * update is finished. Since the guest won't see the intermediate
1184	 * state, we just increase by 2 at the end.
1185	 */
1186	vcpu->hv_clock.version += 2;
1187
1188	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
1189
1190	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
1191	       sizeof(vcpu->hv_clock));
1192
1193	kunmap_atomic(shared_kaddr, KM_USER0);
1194
1195	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
 
 
 
 
 
1196	return 0;
1197}
1198
1199static bool msr_mtrr_valid(unsigned msr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1200{
1201	switch (msr) {
1202	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
1203	case MSR_MTRRfix64K_00000:
1204	case MSR_MTRRfix16K_80000:
1205	case MSR_MTRRfix16K_A0000:
1206	case MSR_MTRRfix4K_C0000:
1207	case MSR_MTRRfix4K_C8000:
1208	case MSR_MTRRfix4K_D0000:
1209	case MSR_MTRRfix4K_D8000:
1210	case MSR_MTRRfix4K_E0000:
1211	case MSR_MTRRfix4K_E8000:
1212	case MSR_MTRRfix4K_F0000:
1213	case MSR_MTRRfix4K_F8000:
1214	case MSR_MTRRdefType:
1215	case MSR_IA32_CR_PAT:
1216		return true;
1217	case 0x2f8:
1218		return true;
1219	}
1220	return false;
1221}
1222
/* Return true when @t is an accepted PAT memory type: 0, 1, 4, 5, 6 or 7. */
static bool valid_pat_type(unsigned t)
{
	if (t >= 8)
		return false;

	/* 0xf3 has one bit set for every accepted type. */
	return ((1 << t) & 0xf3) != 0;
}
1227
/* Return true when @t is an accepted MTRR memory type: 0, 1, 4, 5 or 6. */
static bool valid_mtrr_type(unsigned t)
{
	if (t >= 8)
		return false;

	/* 0x73 has one bit set for every accepted type. */
	return ((1 << t) & 0x73) != 0;
}
1232
1233static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1234{
1235	int i;
1236
1237	if (!msr_mtrr_valid(msr))
1238		return false;
 
 
 
 
1239
1240	if (msr == MSR_IA32_CR_PAT) {
1241		for (i = 0; i < 8; i++)
1242			if (!valid_pat_type((data >> (i * 8)) & 0xff))
1243				return false;
1244		return true;
1245	} else if (msr == MSR_MTRRdefType) {
1246		if (data & ~0xcff)
1247			return false;
1248		return valid_mtrr_type(data & 0xff);
1249	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
1250		for (i = 0; i < 8 ; i++)
1251			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
1252				return false;
1253		return true;
1254	}
1255
1256	/* variable MTRRs */
1257	return valid_mtrr_type(data & 0xff);
 
1258}
1259
1260static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 
 
 
1261{
1262	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1263
1264	if (!mtrr_valid(vcpu, msr, data))
1265		return 1;
1266
1267	if (msr == MSR_MTRRdefType) {
1268		vcpu->arch.mtrr_state.def_type = data;
1269		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
1270	} else if (msr == MSR_MTRRfix64K_00000)
1271		p[0] = data;
1272	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1273		p[1 + msr - MSR_MTRRfix16K_80000] = data;
1274	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1275		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
1276	else if (msr == MSR_IA32_CR_PAT)
1277		vcpu->arch.pat = data;
1278	else {	/* Variable MTRRs */
1279		int idx, is_mtrr_mask;
1280		u64 *pt;
1281
1282		idx = (msr - 0x200) / 2;
1283		is_mtrr_mask = msr - 0x200 - 2 * idx;
1284		if (!is_mtrr_mask)
1285			pt =
1286			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1287		else
1288			pt =
1289			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1290		*pt = data;
1291	}
1292
1293	kvm_mmu_reset_context(vcpu);
1294	return 0;
1295}
1296
1297static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1298{
1299	u64 mcg_cap = vcpu->arch.mcg_cap;
1300	unsigned bank_num = mcg_cap & 0xff;
 
 
1301
1302	switch (msr) {
1303	case MSR_IA32_MCG_STATUS:
1304		vcpu->arch.mcg_status = data;
1305		break;
1306	case MSR_IA32_MCG_CTL:
1307		if (!(mcg_cap & MCG_CTL_P))
 
1308			return 1;
1309		if (data != 0 && data != ~(u64)0)
1310			return -1;
1311		vcpu->arch.mcg_ctl = data;
1312		break;
1313	default:
1314		if (msr >= MSR_IA32_MC0_CTL &&
1315		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1316			u32 offset = msr - MSR_IA32_MC0_CTL;
1317			/* only 0 or all 1s can be written to IA32_MCi_CTL
1318			 * some Linux kernels though clear bit 10 in bank 4 to
1319			 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
1320			 * this to avoid an uncatched #GP in the guest
1321			 */
1322			if ((offset & 0x3) == 0 &&
1323			    data != 0 && (data | (1 << 10)) != ~(u64)0)
1324				return -1;
 
 
 
 
 
 
 
 
1325			vcpu->arch.mce_banks[offset] = data;
1326			break;
1327		}
1328		return 1;
1329	}
1330	return 0;
1331}
1332
1333static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1334{
1335	struct kvm *kvm = vcpu->kvm;
1336	int lm = is_long_mode(vcpu);
1337	u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
1338		: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
1339	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1340		: kvm->arch.xen_hvm_config.blob_size_32;
1341	u32 page_num = data & ~PAGE_MASK;
1342	u64 page_addr = data & PAGE_MASK;
1343	u8 *page;
1344	int r;
1345
1346	r = -E2BIG;
1347	if (page_num >= blob_size)
1348		goto out;
1349	r = -ENOMEM;
1350	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
1351	if (!page)
 
1352		goto out;
1353	r = -EFAULT;
1354	if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
1355		goto out_free;
1356	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
1357		goto out_free;
1358	r = 0;
1359out_free:
1360	kfree(page);
1361out:
1362	return r;
1363}
1364
1365static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1366{
1367	return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1368}
1369
1370static bool kvm_hv_msr_partition_wide(u32 msr)
1371{
1372	bool r = false;
1373	switch (msr) {
1374	case HV_X64_MSR_GUEST_OS_ID:
1375	case HV_X64_MSR_HYPERCALL:
1376		r = true;
1377		break;
1378	}
1379
1380	return r;
1381}
1382
1383static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1384{
1385	struct kvm *kvm = vcpu->kvm;
1386
1387	switch (msr) {
1388	case HV_X64_MSR_GUEST_OS_ID:
1389		kvm->arch.hv_guest_os_id = data;
1390		/* setting guest os id to zero disables hypercall page */
1391		if (!kvm->arch.hv_guest_os_id)
1392			kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1393		break;
1394	case HV_X64_MSR_HYPERCALL: {
1395		u64 gfn;
1396		unsigned long addr;
1397		u8 instructions[4];
1398
1399		/* if guest os id is not set hypercall should remain disabled */
1400		if (!kvm->arch.hv_guest_os_id)
1401			break;
1402		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1403			kvm->arch.hv_hypercall = data;
1404			break;
1405		}
1406		gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1407		addr = gfn_to_hva(kvm, gfn);
1408		if (kvm_is_error_hva(addr))
1409			return 1;
1410		kvm_x86_ops->patch_hypercall(vcpu, instructions);
1411		((unsigned char *)instructions)[3] = 0xc3; /* ret */
1412		if (__copy_to_user((void __user *)addr, instructions, 4))
1413			return 1;
1414		kvm->arch.hv_hypercall = data;
1415		break;
1416	}
1417	default:
1418		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1419			  "data 0x%llx\n", msr, data);
1420		return 1;
1421	}
1422	return 0;
1423}
1424
1425static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1426{
1427	switch (msr) {
1428	case HV_X64_MSR_APIC_ASSIST_PAGE: {
1429		unsigned long addr;
1430
1431		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1432			vcpu->arch.hv_vapic = data;
1433			break;
1434		}
1435		addr = gfn_to_hva(vcpu->kvm, data >>
1436				  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1437		if (kvm_is_error_hva(addr))
1438			return 1;
1439		if (__clear_user((void __user *)addr, PAGE_SIZE))
1440			return 1;
1441		vcpu->arch.hv_vapic = data;
1442		break;
1443	}
1444	case HV_X64_MSR_EOI:
1445		return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1446	case HV_X64_MSR_ICR:
1447		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1448	case HV_X64_MSR_TPR:
1449		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1450	default:
1451		pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1452			  "data 0x%llx\n", msr, data);
1453		return 1;
1454	}
1455
1456	return 0;
1457}
1458
1459static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1460{
1461	gpa_t gpa = data & ~0x3f;
1462
1463	/* Bits 2:5 are resrved, Should be zero */
1464	if (data & 0x3c)
1465		return 1;
1466
1467	vcpu->arch.apf.msr_val = data;
1468
1469	if (!(data & KVM_ASYNC_PF_ENABLED)) {
1470		kvm_clear_async_pf_completion_queue(vcpu);
1471		kvm_async_pf_hash_reset(vcpu);
1472		return 0;
1473	}
1474
1475	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
 
1476		return 1;
1477
1478	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
 
1479	kvm_async_pf_wakeup_all(vcpu);
1480	return 0;
1481}
1482
1483static void kvmclock_reset(struct kvm_vcpu *vcpu)
1484{
1485	if (vcpu->arch.time_page) {
1486		kvm_release_page_dirty(vcpu->arch.time_page);
1487		vcpu->arch.time_page = NULL;
1488	}
1489}
1490
1491static void accumulate_steal_time(struct kvm_vcpu *vcpu)
1492{
1493	u64 delta;
1494
1495	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1496		return;
1497
1498	delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
1499	vcpu->arch.st.last_steal = current->sched_info.run_delay;
1500	vcpu->arch.st.accum_steal = delta;
1501}
1502
1503static void record_steal_time(struct kvm_vcpu *vcpu)
1504{
1505	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
1506		return;
1507
1508	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1509		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
1510		return;
1511
1512	vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
1513	vcpu->arch.st.steal.version += 2;
1514	vcpu->arch.st.accum_steal = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1515
1516	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
1517		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1518}
1519
1520int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1521{
 
 
 
 
1522	switch (msr) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1523	case MSR_EFER:
1524		return set_efer(vcpu, data);
1525	case MSR_K7_HWCR:
1526		data &= ~(u64)0x40;	/* ignore flush filter disable */
1527		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
1528		if (data != 0) {
1529			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
1530				data);
 
 
 
 
 
1531			return 1;
1532		}
1533		break;
1534	case MSR_FAM10H_MMIO_CONF_BASE:
1535		if (data != 0) {
1536			pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
1537				"0x%llx\n", data);
1538			return 1;
1539		}
1540		break;
1541	case MSR_AMD64_NB_CFG:
1542		break;
1543	case MSR_IA32_DEBUGCTLMSR:
1544		if (!data) {
1545			/* We support the non-activated case already */
1546			break;
1547		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
1548			/* Values other than LBR and BTF are vendor-specific,
1549			   thus reserved and should throw a #GP */
1550			return 1;
1551		}
1552		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
1553			__func__, data);
1554		break;
1555	case MSR_IA32_UCODE_REV:
1556	case MSR_IA32_UCODE_WRITE:
1557	case MSR_VM_HSAVE_PA:
1558	case MSR_AMD64_PATCH_LOADER:
1559		break;
1560	case 0x200 ... 0x2ff:
1561		return set_msr_mtrr(vcpu, msr, data);
1562	case MSR_IA32_APICBASE:
1563		kvm_set_apic_base(vcpu, data);
1564		break;
1565	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1566		return kvm_x2apic_msr_write(vcpu, msr, data);
 
 
 
 
 
 
 
 
 
 
 
 
1567	case MSR_IA32_MISC_ENABLE:
1568		vcpu->arch.ia32_misc_enable_msr = data;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1569		break;
1570	case MSR_KVM_WALL_CLOCK_NEW:
1571	case MSR_KVM_WALL_CLOCK:
1572		vcpu->kvm->arch.wall_clock = data;
1573		kvm_write_wall_clock(vcpu->kvm, data);
1574		break;
1575	case MSR_KVM_SYSTEM_TIME_NEW:
1576	case MSR_KVM_SYSTEM_TIME: {
1577		kvmclock_reset(vcpu);
 
 
 
 
 
 
 
 
 
1578
1579		vcpu->arch.time = data;
1580		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1581
1582		/* we verify if the enable bit is set... */
 
1583		if (!(data & 1))
1584			break;
1585
1586		/* ...but clean it before doing the actual write */
1587		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
1588
1589		vcpu->arch.time_page =
1590				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
1591
1592		if (is_error_page(vcpu->arch.time_page)) {
1593			kvm_release_page_clean(vcpu->arch.time_page);
1594			vcpu->arch.time_page = NULL;
1595		}
1596		break;
1597	}
1598	case MSR_KVM_ASYNC_PF_EN:
1599		if (kvm_pv_enable_async_pf(vcpu, data))
1600			return 1;
1601		break;
1602	case MSR_KVM_STEAL_TIME:
1603
1604		if (unlikely(!sched_info_on()))
1605			return 1;
1606
1607		if (data & KVM_STEAL_RESERVED_MASK)
1608			return 1;
1609
1610		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
1611							data & KVM_STEAL_VALID_BITS))
 
1612			return 1;
1613
1614		vcpu->arch.st.msr_val = data;
1615
1616		if (!(data & KVM_MSR_ENABLED))
1617			break;
1618
1619		vcpu->arch.st.last_steal = current->sched_info.run_delay;
1620
1621		preempt_disable();
1622		accumulate_steal_time(vcpu);
1623		preempt_enable();
 
 
1624
1625		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 
 
 
1626
 
1627		break;
1628
1629	case MSR_IA32_MCG_CTL:
1630	case MSR_IA32_MCG_STATUS:
1631	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1632		return set_msr_mce(vcpu, msr, data);
1633
1634	/* Performance counters are not protected by a CPUID bit,
1635	 * so we should check all of them in the generic path for the sake of
1636	 * cross vendor migration.
1637	 * Writing a zero into the event select MSRs disables them,
1638	 * which we perfectly emulate ;-). Any other value should be at least
1639	 * reported, some guests depend on them.
1640	 */
1641	case MSR_P6_EVNTSEL0:
1642	case MSR_P6_EVNTSEL1:
1643	case MSR_K7_EVNTSEL0:
1644	case MSR_K7_EVNTSEL1:
1645	case MSR_K7_EVNTSEL2:
1646	case MSR_K7_EVNTSEL3:
1647		if (data != 0)
1648			pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1649				"0x%x data 0x%llx\n", msr, data);
1650		break;
1651	/* at least RHEL 4 unconditionally writes to the perfctr registers,
1652	 * so we ignore writes to make it happy.
1653	 */
1654	case MSR_P6_PERFCTR0:
1655	case MSR_P6_PERFCTR1:
1656	case MSR_K7_PERFCTR0:
1657	case MSR_K7_PERFCTR1:
1658	case MSR_K7_PERFCTR2:
1659	case MSR_K7_PERFCTR3:
1660		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1661			"0x%x data 0x%llx\n", msr, data);
1662		break;
1663	case MSR_K7_CLK_CTL:
1664		/*
1665		 * Ignore all writes to this no longer documented MSR.
1666		 * Writes are only relevant for old K7 processors,
1667		 * all pre-dating SVM, but a recommended workaround from
1668		 * AMD for these chips. It is possible to speicify the
1669		 * affected processor models on the command line, hence
1670		 * the need to ignore the workaround.
1671		 */
1672		break;
1673	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1674		if (kvm_hv_msr_partition_wide(msr)) {
1675			int r;
1676			mutex_lock(&vcpu->kvm->lock);
1677			r = set_msr_hyperv_pw(vcpu, msr, data);
1678			mutex_unlock(&vcpu->kvm->lock);
1679			return r;
1680		} else
1681			return set_msr_hyperv(vcpu, msr, data);
1682		break;
1683	case MSR_IA32_BBL_CR_CTL3:
1684		/* Drop writes to this legacy MSR -- see rdmsr
1685		 * counterpart for further detail.
1686		 */
1687		pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1688		break;
1689	default:
1690		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1691			return xen_hvm_config(vcpu, data);
 
 
1692		if (!ignore_msrs) {
1693			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
1694				msr, data);
1695			return 1;
1696		} else {
1697			pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
1698				msr, data);
 
 
1699			break;
1700		}
1701	}
1702	return 0;
1703}
1704EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1705
1706
1707/*
1708 * Reads an msr value (of 'msr_index') into 'pdata'.
1709 * Returns 0 on success, non-0 otherwise.
1710 * Assumes vcpu_load() was already called.
1711 */
1712int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1713{
1714	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1715}
1716
1717static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1718{
1719	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1720
1721	if (!msr_mtrr_valid(msr))
1722		return 1;
1723
1724	if (msr == MSR_MTRRdefType)
1725		*pdata = vcpu->arch.mtrr_state.def_type +
1726			 (vcpu->arch.mtrr_state.enabled << 10);
1727	else if (msr == MSR_MTRRfix64K_00000)
1728		*pdata = p[0];
1729	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1730		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
1731	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1732		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
1733	else if (msr == MSR_IA32_CR_PAT)
1734		*pdata = vcpu->arch.pat;
1735	else {	/* Variable MTRRs */
1736		int idx, is_mtrr_mask;
1737		u64 *pt;
1738
1739		idx = (msr - 0x200) / 2;
1740		is_mtrr_mask = msr - 0x200 - 2 * idx;
1741		if (!is_mtrr_mask)
1742			pt =
1743			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1744		else
1745			pt =
1746			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1747		*pdata = *pt;
1748	}
1749
1750	return 0;
1751}
1752
1753static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1754{
1755	u64 data;
1756	u64 mcg_cap = vcpu->arch.mcg_cap;
1757	unsigned bank_num = mcg_cap & 0xff;
1758
1759	switch (msr) {
1760	case MSR_IA32_P5_MC_ADDR:
1761	case MSR_IA32_P5_MC_TYPE:
1762		data = 0;
1763		break;
1764	case MSR_IA32_MCG_CAP:
1765		data = vcpu->arch.mcg_cap;
1766		break;
1767	case MSR_IA32_MCG_CTL:
1768		if (!(mcg_cap & MCG_CTL_P))
1769			return 1;
1770		data = vcpu->arch.mcg_ctl;
1771		break;
1772	case MSR_IA32_MCG_STATUS:
1773		data = vcpu->arch.mcg_status;
1774		break;
1775	default:
1776		if (msr >= MSR_IA32_MC0_CTL &&
1777		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1778			u32 offset = msr - MSR_IA32_MC0_CTL;
1779			data = vcpu->arch.mce_banks[offset];
1780			break;
1781		}
1782		return 1;
1783	}
1784	*pdata = data;
1785	return 0;
1786}
1787
1788static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1789{
1790	u64 data = 0;
1791	struct kvm *kvm = vcpu->kvm;
1792
1793	switch (msr) {
1794	case HV_X64_MSR_GUEST_OS_ID:
1795		data = kvm->arch.hv_guest_os_id;
1796		break;
1797	case HV_X64_MSR_HYPERCALL:
1798		data = kvm->arch.hv_hypercall;
1799		break;
1800	default:
1801		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1802		return 1;
1803	}
1804
1805	*pdata = data;
1806	return 0;
1807}
1808
1809static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1810{
1811	u64 data = 0;
1812
1813	switch (msr) {
1814	case HV_X64_MSR_VP_INDEX: {
1815		int r;
1816		struct kvm_vcpu *v;
1817		kvm_for_each_vcpu(r, v, vcpu->kvm)
1818			if (v == vcpu)
1819				data = r;
1820		break;
1821	}
1822	case HV_X64_MSR_EOI:
1823		return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
1824	case HV_X64_MSR_ICR:
1825		return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1826	case HV_X64_MSR_TPR:
1827		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1828	default:
1829		pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1830		return 1;
1831	}
1832	*pdata = data;
1833	return 0;
1834}
1835
1836int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1837{
1838	u64 data;
1839
1840	switch (msr) {
1841	case MSR_IA32_PLATFORM_ID:
1842	case MSR_IA32_UCODE_REV:
1843	case MSR_IA32_EBL_CR_POWERON:
1844	case MSR_IA32_DEBUGCTLMSR:
1845	case MSR_IA32_LASTBRANCHFROMIP:
1846	case MSR_IA32_LASTBRANCHTOIP:
1847	case MSR_IA32_LASTINTFROMIP:
1848	case MSR_IA32_LASTINTTOIP:
1849	case MSR_K8_SYSCFG:
1850	case MSR_K7_HWCR:
 
1851	case MSR_VM_HSAVE_PA:
1852	case MSR_P6_PERFCTR0:
1853	case MSR_P6_PERFCTR1:
1854	case MSR_P6_EVNTSEL0:
1855	case MSR_P6_EVNTSEL1:
1856	case MSR_K7_EVNTSEL0:
1857	case MSR_K7_PERFCTR0:
1858	case MSR_K8_INT_PENDING_MSG:
1859	case MSR_AMD64_NB_CFG:
1860	case MSR_FAM10H_MMIO_CONF_BASE:
1861		data = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
1862		break;
1863	case MSR_MTRRcap:
1864		data = 0x500 | KVM_NR_VAR_MTRR;
 
 
 
 
 
 
1865		break;
 
 
 
 
 
 
 
1866	case 0x200 ... 0x2ff:
1867		return get_msr_mtrr(vcpu, msr, pdata);
1868	case 0xcd: /* fsb frequency */
1869		data = 3;
1870		break;
1871		/*
1872		 * MSR_EBC_FREQUENCY_ID
1873		 * Conservative value valid for even the basic CPU models.
1874		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
1875		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
1876		 * and 266MHz for model 3, or 4. Set Core Clock
1877		 * Frequency to System Bus Frequency Ratio to 1 (bits
1878		 * 31:24) even though these are only valid for CPU
1879		 * models > 2, however guests may end up dividing or
1880		 * multiplying by zero otherwise.
1881		 */
1882	case MSR_EBC_FREQUENCY_ID:
1883		data = 1 << 24;
1884		break;
1885	case MSR_IA32_APICBASE:
1886		data = kvm_get_apic_base(vcpu);
1887		break;
1888	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1889		return kvm_x2apic_msr_read(vcpu, msr, pdata);
 
 
 
 
 
 
1890		break;
1891	case MSR_IA32_MISC_ENABLE:
1892		data = vcpu->arch.ia32_misc_enable_msr;
 
 
 
 
 
 
 
 
1893		break;
1894	case MSR_IA32_PERF_STATUS:
1895		/* TSC increment by tick */
1896		data = 1000ULL;
1897		/* CPU multiplier */
1898		data |= (((uint64_t)4ULL) << 40);
1899		break;
1900	case MSR_EFER:
1901		data = vcpu->arch.efer;
1902		break;
1903	case MSR_KVM_WALL_CLOCK:
1904	case MSR_KVM_WALL_CLOCK_NEW:
1905		data = vcpu->kvm->arch.wall_clock;
1906		break;
1907	case MSR_KVM_SYSTEM_TIME:
1908	case MSR_KVM_SYSTEM_TIME_NEW:
1909		data = vcpu->arch.time;
1910		break;
1911	case MSR_KVM_ASYNC_PF_EN:
1912		data = vcpu->arch.apf.msr_val;
1913		break;
1914	case MSR_KVM_STEAL_TIME:
1915		data = vcpu->arch.st.msr_val;
 
 
 
 
 
 
1916		break;
1917	case MSR_IA32_P5_MC_ADDR:
1918	case MSR_IA32_P5_MC_TYPE:
1919	case MSR_IA32_MCG_CAP:
1920	case MSR_IA32_MCG_CTL:
1921	case MSR_IA32_MCG_STATUS:
1922	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1923		return get_msr_mce(vcpu, msr, pdata);
 
1924	case MSR_K7_CLK_CTL:
1925		/*
1926		 * Provide expected ramp-up count for K7. All other
1927		 * are set to zero, indicating minimum divisors for
1928		 * every field.
1929		 *
1930		 * This prevents guest kernels on AMD host with CPU
1931		 * type 6, model 8 and higher from exploding due to
1932		 * the rdmsr failing.
1933		 */
1934		data = 0x20000000;
1935		break;
1936	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1937		if (kvm_hv_msr_partition_wide(msr)) {
1938			int r;
1939			mutex_lock(&vcpu->kvm->lock);
1940			r = get_msr_hyperv_pw(vcpu, msr, pdata);
1941			mutex_unlock(&vcpu->kvm->lock);
1942			return r;
1943		} else
1944			return get_msr_hyperv(vcpu, msr, pdata);
 
1945		break;
1946	case MSR_IA32_BBL_CR_CTL3:
1947		/* This legacy MSR exists but isn't fully documented in current
1948		 * silicon.  It is however accessed by winxp in very narrow
1949		 * scenarios where it sets bit #19, itself documented as
1950		 * a "reserved" bit.  Best effort attempt to source coherent
1951		 * read data here should the balance of the register be
1952		 * interpreted by the guest:
1953		 *
1954		 * L2 cache control register 3: 64GB range, 256KB size,
1955		 * enabled, latency 0x1, configured
1956		 */
1957		data = 0xbe702111;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1958		break;
1959	default:
 
 
1960		if (!ignore_msrs) {
1961			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 
1962			return 1;
1963		} else {
1964			pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
1965			data = 0;
 
 
1966		}
1967		break;
1968	}
1969	*pdata = data;
1970	return 0;
1971}
1972EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1973
1974/*
1975 * Read or write a bunch of msrs. All parameters are kernel addresses.
1976 *
1977 * @return number of msrs set successfully.
1978 */
1979static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1980		    struct kvm_msr_entry *entries,
1981		    int (*do_msr)(struct kvm_vcpu *vcpu,
1982				  unsigned index, u64 *data))
1983{
1984	int i, idx;
1985
1986	idx = srcu_read_lock(&vcpu->kvm->srcu);
1987	for (i = 0; i < msrs->nmsrs; ++i)
1988		if (do_msr(vcpu, entries[i].index, &entries[i].data))
1989			break;
1990	srcu_read_unlock(&vcpu->kvm->srcu, idx);
1991
1992	return i;
1993}
1994
1995/*
1996 * Read or write a bunch of msrs. Parameters are user addresses.
1997 *
1998 * @return number of msrs set successfully.
1999 */
2000static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2001		  int (*do_msr)(struct kvm_vcpu *vcpu,
2002				unsigned index, u64 *data),
2003		  int writeback)
2004{
2005	struct kvm_msrs msrs;
2006	struct kvm_msr_entry *entries;
2007	int r, n;
2008	unsigned size;
2009
2010	r = -EFAULT;
2011	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2012		goto out;
2013
2014	r = -E2BIG;
2015	if (msrs.nmsrs >= MAX_IO_MSRS)
2016		goto out;
2017
2018	r = -ENOMEM;
2019	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2020	entries = kmalloc(size, GFP_KERNEL);
2021	if (!entries)
 
2022		goto out;
2023
2024	r = -EFAULT;
2025	if (copy_from_user(entries, user_msrs->entries, size))
2026		goto out_free;
2027
2028	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2029	if (r < 0)
2030		goto out_free;
2031
2032	r = -EFAULT;
2033	if (writeback && copy_to_user(user_msrs->entries, entries, size))
2034		goto out_free;
2035
2036	r = n;
2037
2038out_free:
2039	kfree(entries);
2040out:
2041	return r;
2042}
2043
2044int kvm_dev_ioctl_check_extension(long ext)
2045{
2046	int r;
 
 
 
 
 
 
 
2047
2048	switch (ext) {
2049	case KVM_CAP_IRQCHIP:
2050	case KVM_CAP_HLT:
2051	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2052	case KVM_CAP_SET_TSS_ADDR:
2053	case KVM_CAP_EXT_CPUID:
 
2054	case KVM_CAP_CLOCKSOURCE:
2055	case KVM_CAP_PIT:
2056	case KVM_CAP_NOP_IO_DELAY:
2057	case KVM_CAP_MP_STATE:
2058	case KVM_CAP_SYNC_MMU:
2059	case KVM_CAP_USER_NMI:
2060	case KVM_CAP_REINJECT_CONTROL:
2061	case KVM_CAP_IRQ_INJECT_STATUS:
2062	case KVM_CAP_ASSIGN_DEV_IRQ:
2063	case KVM_CAP_IRQFD:
2064	case KVM_CAP_IOEVENTFD:
 
2065	case KVM_CAP_PIT2:
2066	case KVM_CAP_PIT_STATE2:
2067	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2068	case KVM_CAP_XEN_HVM:
2069	case KVM_CAP_ADJUST_CLOCK:
2070	case KVM_CAP_VCPU_EVENTS:
2071	case KVM_CAP_HYPERV:
2072	case KVM_CAP_HYPERV_VAPIC:
2073	case KVM_CAP_HYPERV_SPIN:
 
 
 
 
 
 
 
2074	case KVM_CAP_PCI_SEGMENT:
2075	case KVM_CAP_DEBUGREGS:
2076	case KVM_CAP_X86_ROBUST_SINGLESTEP:
2077	case KVM_CAP_XSAVE:
2078	case KVM_CAP_ASYNC_PF:
2079	case KVM_CAP_GET_TSC_KHZ:
 
 
 
 
 
 
 
 
 
 
 
 
 
2080		r = 1;
2081		break;
2082	case KVM_CAP_COALESCED_MMIO:
2083		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2084		break;
2085	case KVM_CAP_VAPIC:
2086		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2087		break;
2088	case KVM_CAP_NR_VCPUS:
 
 
 
2089		r = KVM_MAX_VCPUS;
2090		break;
2091	case KVM_CAP_NR_MEMSLOTS:
2092		r = KVM_MEMORY_SLOTS;
2093		break;
2094	case KVM_CAP_PV_MMU:	/* obsolete */
2095		r = 0;
2096		break;
2097	case KVM_CAP_IOMMU:
2098		r = iommu_found();
2099		break;
2100	case KVM_CAP_MCE:
2101		r = KVM_MAX_MCE_BANKS;
2102		break;
2103	case KVM_CAP_XCRS:
2104		r = cpu_has_xsave;
2105		break;
2106	case KVM_CAP_TSC_CONTROL:
2107		r = kvm_has_tsc_control;
2108		break;
 
 
 
 
 
 
 
 
 
 
 
 
 
2109	default:
2110		r = 0;
2111		break;
2112	}
2113	return r;
2114
2115}
2116
2117long kvm_arch_dev_ioctl(struct file *filp,
2118			unsigned int ioctl, unsigned long arg)
2119{
2120	void __user *argp = (void __user *)arg;
2121	long r;
2122
2123	switch (ioctl) {
2124	case KVM_GET_MSR_INDEX_LIST: {
2125		struct kvm_msr_list __user *user_msr_list = argp;
2126		struct kvm_msr_list msr_list;
2127		unsigned n;
2128
2129		r = -EFAULT;
2130		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2131			goto out;
2132		n = msr_list.nmsrs;
2133		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2134		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2135			goto out;
2136		r = -E2BIG;
2137		if (n < msr_list.nmsrs)
2138			goto out;
2139		r = -EFAULT;
2140		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2141				 num_msrs_to_save * sizeof(u32)))
2142			goto out;
2143		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2144				 &emulated_msrs,
2145				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2146			goto out;
2147		r = 0;
2148		break;
2149	}
2150	case KVM_GET_SUPPORTED_CPUID: {
 
2151		struct kvm_cpuid2 __user *cpuid_arg = argp;
2152		struct kvm_cpuid2 cpuid;
2153
2154		r = -EFAULT;
2155		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2156			goto out;
2157		r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
2158						      cpuid_arg->entries);
 
2159		if (r)
2160			goto out;
2161
2162		r = -EFAULT;
2163		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
2164			goto out;
2165		r = 0;
2166		break;
2167	}
2168	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
2169		u64 mce_cap;
 
 
 
 
 
 
 
 
 
2170
2171		mce_cap = KVM_MCE_CAP_SUPPORTED;
2172		r = -EFAULT;
2173		if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
 
 
 
 
 
 
 
 
 
 
 
2174			goto out;
2175		r = 0;
2176		break;
2177	}
 
 
 
 
2178	default:
2179		r = -EINVAL;
2180	}
2181out:
2182	return r;
2183}
2184
/*
 * Cross-CPU callback (see smp_call_function_single() in
 * kvm_arch_vcpu_load()): execute wbinvd() on the CPU running it.
 * The argument is not used.
 */
static void wbinvd_ipi(void *garbage)
{
	wbinvd();
}
2189
2190static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
2191{
2192	return vcpu->kvm->arch.iommu_domain &&
2193		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
2194}
2195
2196void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2197{
2198	/* Address WBINVD may be executed by guest */
2199	if (need_emulate_wbinvd(vcpu)) {
2200		if (kvm_x86_ops->has_wbinvd_exit())
2201			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
2202		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
2203			smp_call_function_single(vcpu->cpu,
2204					wbinvd_ipi, NULL, 1);
2205	}
2206
2207	kvm_x86_ops->vcpu_load(vcpu, cpu);
2208	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2209		/* Make sure TSC doesn't go backwards */
2210		s64 tsc_delta;
2211		u64 tsc;
2212
2213		kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
2214		tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
2215			     tsc - vcpu->arch.last_guest_tsc;
2216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2217		if (tsc_delta < 0)
2218			mark_tsc_unstable("KVM discovered backwards TSC");
2219		if (check_tsc_unstable()) {
2220			kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
 
 
 
2221			vcpu->arch.tsc_catchup = 1;
2222		}
2223		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 
 
 
 
 
 
 
 
 
2224		if (vcpu->cpu != cpu)
2225			kvm_migrate_timers(vcpu);
2226		vcpu->cpu = cpu;
2227	}
2228
2229	accumulate_steal_time(vcpu);
2230	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2231}
2232
2233void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2234{
2235	kvm_x86_ops->vcpu_put(vcpu);
2236	kvm_put_guest_fpu(vcpu);
2237	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
2238}
2239
2240static int is_efer_nx(void)
2241{
2242	unsigned long long efer = 0;
2243
2244	rdmsrl_safe(MSR_EFER, &efer);
2245	return efer & EFER_NX;
2246}
2247
2248static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
2249{
2250	int i;
2251	struct kvm_cpuid_entry2 *e, *entry;
2252
2253	entry = NULL;
2254	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2255		e = &vcpu->arch.cpuid_entries[i];
2256		if (e->function == 0x80000001) {
2257			entry = e;
2258			break;
2259		}
2260	}
2261	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
2262		entry->edx &= ~(1 << 20);
2263		printk(KERN_INFO "kvm: guest NX capability removed\n");
2264	}
2265}
2266
2267/* when an old userspace process fills a new kernel module */
2268static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
2269				    struct kvm_cpuid *cpuid,
2270				    struct kvm_cpuid_entry __user *entries)
2271{
2272	int r, i;
2273	struct kvm_cpuid_entry *cpuid_entries;
2274
2275	r = -E2BIG;
2276	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2277		goto out;
2278	r = -ENOMEM;
2279	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
2280	if (!cpuid_entries)
2281		goto out;
2282	r = -EFAULT;
2283	if (copy_from_user(cpuid_entries, entries,
2284			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
2285		goto out_free;
2286	for (i = 0; i < cpuid->nent; i++) {
2287		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
2288		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
2289		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
2290		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
2291		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
2292		vcpu->arch.cpuid_entries[i].index = 0;
2293		vcpu->arch.cpuid_entries[i].flags = 0;
2294		vcpu->arch.cpuid_entries[i].padding[0] = 0;
2295		vcpu->arch.cpuid_entries[i].padding[1] = 0;
2296		vcpu->arch.cpuid_entries[i].padding[2] = 0;
2297	}
2298	vcpu->arch.cpuid_nent = cpuid->nent;
2299	cpuid_fix_nx_cap(vcpu);
2300	r = 0;
2301	kvm_apic_set_version(vcpu);
2302	kvm_x86_ops->cpuid_update(vcpu);
2303	update_cpuid(vcpu);
2304
2305out_free:
2306	vfree(cpuid_entries);
2307out:
2308	return r;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2309}
2310
2311static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
2312				     struct kvm_cpuid2 *cpuid,
2313				     struct kvm_cpuid_entry2 __user *entries)
2314{
2315	int r;
2316
2317	r = -E2BIG;
2318	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2319		goto out;
2320	r = -EFAULT;
2321	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
2322			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
2323		goto out;
2324	vcpu->arch.cpuid_nent = cpuid->nent;
2325	kvm_apic_set_version(vcpu);
2326	kvm_x86_ops->cpuid_update(vcpu);
2327	update_cpuid(vcpu);
2328	return 0;
2329
2330out:
2331	return r;
2332}
2333
2334static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
2335				     struct kvm_cpuid2 *cpuid,
2336				     struct kvm_cpuid_entry2 __user *entries)
2337{
2338	int r;
2339
2340	r = -E2BIG;
2341	if (cpuid->nent < vcpu->arch.cpuid_nent)
2342		goto out;
2343	r = -EFAULT;
2344	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
2345			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
2346		goto out;
2347	return 0;
2348
2349out:
2350	cpuid->nent = vcpu->arch.cpuid_nent;
2351	return r;
2352}
2353
2354static void cpuid_mask(u32 *word, int wordnum)
2355{
2356	*word &= boot_cpu_data.x86_capability[wordnum];
2357}
2358
2359static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2360			   u32 index)
2361{
2362	entry->function = function;
2363	entry->index = index;
2364	cpuid_count(entry->function, entry->index,
2365		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
2366	entry->flags = 0;
2367}
2368
2369static bool supported_xcr0_bit(unsigned bit)
2370{
2371	u64 mask = ((u64)1 << bit);
2372
2373	return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
2374}
2375
2376#define F(x) bit(X86_FEATURE_##x)
2377
2378static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2379			 u32 index, int *nent, int maxnent)
 
 
 
2380{
2381	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
2382#ifdef CONFIG_X86_64
2383	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
2384				? F(GBPAGES) : 0;
2385	unsigned f_lm = F(LM);
2386#else
2387	unsigned f_gbpages = 0;
2388	unsigned f_lm = 0;
2389#endif
2390	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
2391
2392	/* cpuid 1.edx */
2393	const u32 kvm_supported_word0_x86_features =
2394		F(FPU) | F(VME) | F(DE) | F(PSE) |
2395		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
2396		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
2397		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
2398		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
2399		0 /* Reserved, DS, ACPI */ | F(MMX) |
2400		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
2401		0 /* HTT, TM, Reserved, PBE */;
2402	/* cpuid 0x80000001.edx */
2403	const u32 kvm_supported_word1_x86_features =
2404		F(FPU) | F(VME) | F(DE) | F(PSE) |
2405		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
2406		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
2407		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
2408		F(PAT) | F(PSE36) | 0 /* Reserved */ |
2409		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
2410		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
2411		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
2412	/* cpuid 1.ecx */
2413	const u32 kvm_supported_word4_x86_features =
2414		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
2415		0 /* DS-CPL, VMX, SMX, EST */ |
2416		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
2417		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
2418		0 /* Reserved, DCA */ | F(XMM4_1) |
2419		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
2420		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
2421		F(F16C) | F(RDRAND);
2422	/* cpuid 0x80000001.ecx */
2423	const u32 kvm_supported_word6_x86_features =
2424		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
2425		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
2426		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2427		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2428
2429	/* cpuid 0xC0000001.edx */
2430	const u32 kvm_supported_word5_x86_features =
2431		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
2432		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2433		F(PMM) | F(PMM_EN);
2434
2435	/* cpuid 7.0.ebx */
2436	const u32 kvm_supported_word9_x86_features =
2437		F(SMEP) | F(FSGSBASE) | F(ERMS);
2438
2439	/* all calls to cpuid_count() should be made on the same cpu */
2440	get_cpu();
2441	do_cpuid_1_ent(entry, function, index);
2442	++*nent;
2443
2444	switch (function) {
2445	case 0:
2446		entry->eax = min(entry->eax, (u32)0xd);
2447		break;
2448	case 1:
2449		entry->edx &= kvm_supported_word0_x86_features;
2450		cpuid_mask(&entry->edx, 0);
2451		entry->ecx &= kvm_supported_word4_x86_features;
2452		cpuid_mask(&entry->ecx, 4);
2453		/* we support x2apic emulation even if host does not support
2454		 * it since we emulate x2apic in software */
2455		entry->ecx |= F(X2APIC);
2456		break;
2457	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
2458	 * may return different values. This forces us to get_cpu() before
2459	 * issuing the first command, and also to emulate this annoying behavior
2460	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
2461	case 2: {
2462		int t, times = entry->eax & 0xff;
2463
2464		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
2465		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
2466		for (t = 1; t < times && *nent < maxnent; ++t) {
2467			do_cpuid_1_ent(&entry[t], function, 0);
2468			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
2469			++*nent;
2470		}
2471		break;
2472	}
2473	/* function 4 has additional index. */
2474	case 4: {
2475		int i, cache_type;
2476
2477		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2478		/* read more entries until cache_type is zero */
2479		for (i = 1; *nent < maxnent; ++i) {
2480			cache_type = entry[i - 1].eax & 0x1f;
2481			if (!cache_type)
2482				break;
2483			do_cpuid_1_ent(&entry[i], function, i);
2484			entry[i].flags |=
2485			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2486			++*nent;
2487		}
2488		break;
2489	}
2490	case 7: {
2491		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2492		/* Mask ebx against host capbability word 9 */
2493		if (index == 0) {
2494			entry->ebx &= kvm_supported_word9_x86_features;
2495			cpuid_mask(&entry->ebx, 9);
2496		} else
2497			entry->ebx = 0;
2498		entry->eax = 0;
2499		entry->ecx = 0;
2500		entry->edx = 0;
2501		break;
2502	}
2503	case 9:
2504		break;
2505	/* function 0xb has additional index. */
2506	case 0xb: {
2507		int i, level_type;
2508
2509		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2510		/* read more entries until level_type is zero */
2511		for (i = 1; *nent < maxnent; ++i) {
2512			level_type = entry[i - 1].ecx & 0xff00;
2513			if (!level_type)
2514				break;
2515			do_cpuid_1_ent(&entry[i], function, i);
2516			entry[i].flags |=
2517			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2518			++*nent;
2519		}
2520		break;
2521	}
2522	case 0xd: {
2523		int idx, i;
2524
2525		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2526		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
2527			do_cpuid_1_ent(&entry[i], function, idx);
2528			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
2529				continue;
2530			entry[i].flags |=
2531			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2532			++*nent;
2533			++i;
2534		}
2535		break;
2536	}
2537	case KVM_CPUID_SIGNATURE: {
2538		char signature[12] = "KVMKVMKVM\0\0";
2539		u32 *sigptr = (u32 *)signature;
2540		entry->eax = 0;
2541		entry->ebx = sigptr[0];
2542		entry->ecx = sigptr[1];
2543		entry->edx = sigptr[2];
2544		break;
2545	}
2546	case KVM_CPUID_FEATURES:
2547		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
2548			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
2549			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
2550			     (1 << KVM_FEATURE_ASYNC_PF) |
2551			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2552
2553		if (sched_info_on())
2554			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
2555
2556		entry->ebx = 0;
2557		entry->ecx = 0;
2558		entry->edx = 0;
2559		break;
2560	case 0x80000000:
2561		entry->eax = min(entry->eax, 0x8000001a);
2562		break;
2563	case 0x80000001:
2564		entry->edx &= kvm_supported_word1_x86_features;
2565		cpuid_mask(&entry->edx, 1);
2566		entry->ecx &= kvm_supported_word6_x86_features;
2567		cpuid_mask(&entry->ecx, 6);
2568		break;
2569	case 0x80000008: {
2570		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
2571		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
2572		unsigned phys_as = entry->eax & 0xff;
2573
2574		if (!g_phys_as)
2575			g_phys_as = phys_as;
2576		entry->eax = g_phys_as | (virt_as << 8);
2577		entry->ebx = entry->edx = 0;
2578		break;
2579	}
2580	case 0x80000019:
2581		entry->ecx = entry->edx = 0;
2582		break;
2583	case 0x8000001a:
2584		break;
2585	case 0x8000001d:
2586		break;
2587	/*Add support for Centaur's CPUID instruction*/
2588	case 0xC0000000:
2589		/*Just support up to 0xC0000004 now*/
2590		entry->eax = min(entry->eax, 0xC0000004);
2591		break;
2592	case 0xC0000001:
2593		entry->edx &= kvm_supported_word5_x86_features;
2594		cpuid_mask(&entry->edx, 5);
2595		break;
2596	case 3: /* Processor serial number */
2597	case 5: /* MONITOR/MWAIT */
2598	case 6: /* Thermal management */
2599	case 0xA: /* Architectural Performance Monitoring */
2600	case 0x80000007: /* Advanced power management */
2601	case 0xC0000002:
2602	case 0xC0000003:
2603	case 0xC0000004:
2604	default:
2605		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
2606		break;
2607	}
2608
2609	kvm_x86_ops->set_supported_cpuid(function, entry);
2610
2611	put_cpu();
2612}
2613
2614#undef F
2615
2616static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
2617				     struct kvm_cpuid_entry2 __user *entries)
2618{
2619	struct kvm_cpuid_entry2 *cpuid_entries;
2620	int limit, nent = 0, r = -E2BIG;
2621	u32 func;
2622
2623	if (cpuid->nent < 1)
2624		goto out;
2625	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
2626		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
2627	r = -ENOMEM;
2628	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
2629	if (!cpuid_entries)
2630		goto out;
2631
2632	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
2633	limit = cpuid_entries[0].eax;
2634	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
2635		do_cpuid_ent(&cpuid_entries[nent], func, 0,
2636			     &nent, cpuid->nent);
2637	r = -E2BIG;
2638	if (nent >= cpuid->nent)
2639		goto out_free;
2640
2641	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
2642	limit = cpuid_entries[nent - 1].eax;
2643	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
2644		do_cpuid_ent(&cpuid_entries[nent], func, 0,
2645			     &nent, cpuid->nent);
2646
2647
2648
2649	r = -E2BIG;
2650	if (nent >= cpuid->nent)
2651		goto out_free;
2652
2653	/* Add support for Centaur's CPUID instruction. */
2654	if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
2655		do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
2656				&nent, cpuid->nent);
2657
2658		r = -E2BIG;
2659		if (nent >= cpuid->nent)
2660			goto out_free;
2661
2662		limit = cpuid_entries[nent - 1].eax;
2663		for (func = 0xC0000001;
2664			func <= limit && nent < cpuid->nent; ++func)
2665			do_cpuid_ent(&cpuid_entries[nent], func, 0,
2666					&nent, cpuid->nent);
2667
2668		r = -E2BIG;
2669		if (nent >= cpuid->nent)
2670			goto out_free;
 
2671	}
2672
2673	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2674		     cpuid->nent);
2675
2676	r = -E2BIG;
2677	if (nent >= cpuid->nent)
2678		goto out_free;
2679
2680	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
2681		     cpuid->nent);
2682
2683	r = -E2BIG;
2684	if (nent >= cpuid->nent)
2685		goto out_free;
2686
2687	r = -EFAULT;
2688	if (copy_to_user(entries, cpuid_entries,
2689			 nent * sizeof(struct kvm_cpuid_entry2)))
2690		goto out_free;
2691	cpuid->nent = nent;
2692	r = 0;
2693
2694out_free:
2695	vfree(cpuid_entries);
2696out:
2697	return r;
2698}
2699
2700static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2701				    struct kvm_lapic_state *s)
2702{
2703	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2704
2705	return 0;
2706}
2707
2708static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2709				    struct kvm_lapic_state *s)
2710{
2711	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
2712	kvm_apic_post_state_restore(vcpu);
2713	update_cr8_intercept(vcpu);
2714
 
 
2715	return 0;
2716}
2717
2718static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2719				    struct kvm_interrupt *irq)
2720{
2721	if (irq->irq < 0 || irq->irq >= 256)
2722		return -EINVAL;
2723	if (irqchip_in_kernel(vcpu->kvm))
2724		return -ENXIO;
2725
2726	kvm_queue_interrupt(vcpu, irq->irq, false);
2727	kvm_make_request(KVM_REQ_EVENT, vcpu);
2728
2729	return 0;
2730}
2731
/* Inject an NMI into the vcpu via kvm_inject_nmi().  Always returns 0. */
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
	kvm_inject_nmi(vcpu);
	return 0;
}
2738
2739static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
2740					   struct kvm_tpr_access_ctl *tac)
2741{
2742	if (tac->flags)
2743		return -EINVAL;
2744	vcpu->arch.tpr_access_reporting = !!tac->enabled;
2745	return 0;
2746}
2747
2748static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2749					u64 mcg_cap)
2750{
2751	int r;
2752	unsigned bank_num = mcg_cap & 0xff, bank;
2753
2754	r = -EINVAL;
2755	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2756		goto out;
2757	if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
2758		goto out;
2759	r = 0;
2760	vcpu->arch.mcg_cap = mcg_cap;
2761	/* Init IA32_MCG_CTL to all 1s */
2762	if (mcg_cap & MCG_CTL_P)
2763		vcpu->arch.mcg_ctl = ~(u64)0;
2764	/* Init IA32_MCi_CTL to all 1s */
2765	for (bank = 0; bank < bank_num; bank++)
2766		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 
 
2767out:
2768	return r;
2769}
2770
2771static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2772				      struct kvm_x86_mce *mce)
2773{
2774	u64 mcg_cap = vcpu->arch.mcg_cap;
2775	unsigned bank_num = mcg_cap & 0xff;
2776	u64 *banks = vcpu->arch.mce_banks;
2777
2778	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
2779		return -EINVAL;
2780	/*
2781	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
2782	 * reporting is disabled
2783	 */
2784	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
2785	    vcpu->arch.mcg_ctl != ~(u64)0)
2786		return 0;
2787	banks += 4 * mce->bank;
2788	/*
2789	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
2790	 * reporting is disabled for the bank
2791	 */
2792	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
2793		return 0;
2794	if (mce->status & MCI_STATUS_UC) {
2795		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2796		    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2797			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2798			return 0;
2799		}
2800		if (banks[1] & MCI_STATUS_VAL)
2801			mce->status |= MCI_STATUS_OVER;
2802		banks[2] = mce->addr;
2803		banks[3] = mce->misc;
2804		vcpu->arch.mcg_status = mce->mcg_status;
2805		banks[1] = mce->status;
2806		kvm_queue_exception(vcpu, MC_VECTOR);
2807	} else if (!(banks[1] & MCI_STATUS_VAL)
2808		   || !(banks[1] & MCI_STATUS_UC)) {
2809		if (banks[1] & MCI_STATUS_VAL)
2810			mce->status |= MCI_STATUS_OVER;
2811		banks[2] = mce->addr;
2812		banks[3] = mce->misc;
2813		banks[1] = mce->status;
2814	} else
2815		banks[1] |= MCI_STATUS_OVER;
2816	return 0;
2817}
2818
2819static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2820					       struct kvm_vcpu_events *events)
2821{
2822	events->exception.injected =
2823		vcpu->arch.exception.pending &&
2824		!kvm_exception_is_soft(vcpu->arch.exception.nr);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2825	events->exception.nr = vcpu->arch.exception.nr;
2826	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2827	events->exception.pad = 0;
2828	events->exception.error_code = vcpu->arch.exception.error_code;
 
 
2829
2830	events->interrupt.injected =
2831		vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
2832	events->interrupt.nr = vcpu->arch.interrupt.nr;
2833	events->interrupt.soft = 0;
2834	events->interrupt.shadow =
2835		kvm_x86_ops->get_interrupt_shadow(vcpu,
2836			KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
2837
2838	events->nmi.injected = vcpu->arch.nmi_injected;
2839	events->nmi.pending = vcpu->arch.nmi_pending;
2840	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2841	events->nmi.pad = 0;
2842
2843	events->sipi_vector = vcpu->arch.sipi_vector;
 
 
 
 
 
 
2844
2845	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2846			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2847			 | KVM_VCPUEVENT_VALID_SHADOW);
 
 
 
2848	memset(&events->reserved, 0, sizeof(events->reserved));
2849}
2850
 
 
2851static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2852					      struct kvm_vcpu_events *events)
2853{
2854	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2855			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2856			      | KVM_VCPUEVENT_VALID_SHADOW))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2857		return -EINVAL;
2858
2859	vcpu->arch.exception.pending = events->exception.injected;
 
 
2860	vcpu->arch.exception.nr = events->exception.nr;
2861	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
2862	vcpu->arch.exception.error_code = events->exception.error_code;
 
 
2863
2864	vcpu->arch.interrupt.pending = events->interrupt.injected;
2865	vcpu->arch.interrupt.nr = events->interrupt.nr;
2866	vcpu->arch.interrupt.soft = events->interrupt.soft;
2867	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2868		kvm_x86_ops->set_interrupt_shadow(vcpu,
2869						  events->interrupt.shadow);
2870
2871	vcpu->arch.nmi_injected = events->nmi.injected;
2872	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2873		vcpu->arch.nmi_pending = events->nmi.pending;
2874	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
2875
2876	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2877		vcpu->arch.sipi_vector = events->sipi_vector;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2878
2879	kvm_make_request(KVM_REQ_EVENT, vcpu);
2880
2881	return 0;
2882}
2883
2884static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2885					     struct kvm_debugregs *dbgregs)
2886{
 
 
2887	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
2888	dbgregs->dr6 = vcpu->arch.dr6;
 
2889	dbgregs->dr7 = vcpu->arch.dr7;
2890	dbgregs->flags = 0;
2891	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
2892}
2893
2894static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2895					    struct kvm_debugregs *dbgregs)
2896{
2897	if (dbgregs->flags)
2898		return -EINVAL;
2899
 
 
 
 
 
2900	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
 
2901	vcpu->arch.dr6 = dbgregs->dr6;
 
2902	vcpu->arch.dr7 = dbgregs->dr7;
 
2903
2904	return 0;
2905}
2906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2907static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
2908					 struct kvm_xsave *guest_xsave)
2909{
2910	if (cpu_has_xsave)
2911		memcpy(guest_xsave->region,
2912			&vcpu->arch.guest_fpu.state->xsave,
2913			xstate_size);
2914	else {
2915		memcpy(guest_xsave->region,
2916			&vcpu->arch.guest_fpu.state->fxsave,
2917			sizeof(struct i387_fxsave_struct));
2918		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
2919			XSTATE_FPSSE;
2920	}
2921}
2922
 
 
2923static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
2924					struct kvm_xsave *guest_xsave)
2925{
2926	u64 xstate_bv =
2927		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
 
2928
2929	if (cpu_has_xsave)
2930		memcpy(&vcpu->arch.guest_fpu.state->xsave,
2931			guest_xsave->region, xstate_size);
2932	else {
2933		if (xstate_bv & ~XSTATE_FPSSE)
 
 
 
 
 
 
 
 
2934			return -EINVAL;
2935		memcpy(&vcpu->arch.guest_fpu.state->fxsave,
2936			guest_xsave->region, sizeof(struct i387_fxsave_struct));
2937	}
2938	return 0;
2939}
2940
2941static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
2942					struct kvm_xcrs *guest_xcrs)
2943{
2944	if (!cpu_has_xsave) {
2945		guest_xcrs->nr_xcrs = 0;
2946		return;
2947	}
2948
2949	guest_xcrs->nr_xcrs = 1;
2950	guest_xcrs->flags = 0;
2951	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
2952	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
2953}
2954
2955static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
2956				       struct kvm_xcrs *guest_xcrs)
2957{
2958	int i, r = 0;
2959
2960	if (!cpu_has_xsave)
2961		return -EINVAL;
2962
2963	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
2964		return -EINVAL;
2965
2966	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
2967		/* Only support XCR0 currently */
2968		if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
2969			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
2970				guest_xcrs->xcrs[0].value);
2971			break;
2972		}
2973	if (r)
2974		r = -EINVAL;
2975	return r;
2976}
2977
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2978long kvm_arch_vcpu_ioctl(struct file *filp,
2979			 unsigned int ioctl, unsigned long arg)
2980{
2981	struct kvm_vcpu *vcpu = filp->private_data;
2982	void __user *argp = (void __user *)arg;
2983	int r;
2984	union {
2985		struct kvm_lapic_state *lapic;
2986		struct kvm_xsave *xsave;
2987		struct kvm_xcrs *xcrs;
2988		void *buffer;
2989	} u;
2990
 
 
2991	u.buffer = NULL;
2992	switch (ioctl) {
2993	case KVM_GET_LAPIC: {
2994		r = -EINVAL;
2995		if (!vcpu->arch.apic)
2996			goto out;
2997		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
2998
2999		r = -ENOMEM;
3000		if (!u.lapic)
3001			goto out;
3002		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
3003		if (r)
3004			goto out;
3005		r = -EFAULT;
3006		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
3007			goto out;
3008		r = 0;
3009		break;
3010	}
3011	case KVM_SET_LAPIC: {
3012		r = -EINVAL;
3013		if (!vcpu->arch.apic)
3014			goto out;
3015		u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
3016		r = -ENOMEM;
3017		if (!u.lapic)
3018			goto out;
3019		r = -EFAULT;
3020		if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
3021			goto out;
 
 
 
 
 
 
3022		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3023		if (r)
3024			goto out;
3025		r = 0;
3026		break;
3027	}
3028	case KVM_INTERRUPT: {
3029		struct kvm_interrupt irq;
3030
3031		r = -EFAULT;
3032		if (copy_from_user(&irq, argp, sizeof irq))
3033			goto out;
3034		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
3035		if (r)
3036			goto out;
3037		r = 0;
3038		break;
3039	}
3040	case KVM_NMI: {
3041		r = kvm_vcpu_ioctl_nmi(vcpu);
3042		if (r)
3043			goto out;
3044		r = 0;
 
3045		break;
3046	}
3047	case KVM_SET_CPUID: {
3048		struct kvm_cpuid __user *cpuid_arg = argp;
3049		struct kvm_cpuid cpuid;
3050
3051		r = -EFAULT;
3052		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3053			goto out;
3054		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3055		if (r)
3056			goto out;
3057		break;
3058	}
3059	case KVM_SET_CPUID2: {
3060		struct kvm_cpuid2 __user *cpuid_arg = argp;
3061		struct kvm_cpuid2 cpuid;
3062
3063		r = -EFAULT;
3064		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3065			goto out;
3066		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
3067					      cpuid_arg->entries);
3068		if (r)
3069			goto out;
3070		break;
3071	}
3072	case KVM_GET_CPUID2: {
3073		struct kvm_cpuid2 __user *cpuid_arg = argp;
3074		struct kvm_cpuid2 cpuid;
3075
3076		r = -EFAULT;
3077		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3078			goto out;
3079		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
3080					      cpuid_arg->entries);
3081		if (r)
3082			goto out;
3083		r = -EFAULT;
3084		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3085			goto out;
3086		r = 0;
3087		break;
3088	}
3089	case KVM_GET_MSRS:
3090		r = msr_io(vcpu, argp, kvm_get_msr, 1);
 
 
3091		break;
3092	case KVM_SET_MSRS:
 
 
3093		r = msr_io(vcpu, argp, do_set_msr, 0);
 
3094		break;
 
3095	case KVM_TPR_ACCESS_REPORTING: {
3096		struct kvm_tpr_access_ctl tac;
3097
3098		r = -EFAULT;
3099		if (copy_from_user(&tac, argp, sizeof tac))
3100			goto out;
3101		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
3102		if (r)
3103			goto out;
3104		r = -EFAULT;
3105		if (copy_to_user(argp, &tac, sizeof tac))
3106			goto out;
3107		r = 0;
3108		break;
3109	};
3110	case KVM_SET_VAPIC_ADDR: {
3111		struct kvm_vapic_addr va;
 
3112
3113		r = -EINVAL;
3114		if (!irqchip_in_kernel(vcpu->kvm))
3115			goto out;
3116		r = -EFAULT;
3117		if (copy_from_user(&va, argp, sizeof va))
3118			goto out;
3119		r = 0;
3120		kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
 
3121		break;
3122	}
3123	case KVM_X86_SETUP_MCE: {
3124		u64 mcg_cap;
3125
3126		r = -EFAULT;
3127		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
3128			goto out;
3129		r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
3130		break;
3131	}
3132	case KVM_X86_SET_MCE: {
3133		struct kvm_x86_mce mce;
3134
3135		r = -EFAULT;
3136		if (copy_from_user(&mce, argp, sizeof mce))
3137			goto out;
3138		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
3139		break;
3140	}
3141	case KVM_GET_VCPU_EVENTS: {
3142		struct kvm_vcpu_events events;
3143
3144		kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
3145
3146		r = -EFAULT;
3147		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
3148			break;
3149		r = 0;
3150		break;
3151	}
3152	case KVM_SET_VCPU_EVENTS: {
3153		struct kvm_vcpu_events events;
3154
3155		r = -EFAULT;
3156		if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
3157			break;
3158
3159		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
3160		break;
3161	}
3162	case KVM_GET_DEBUGREGS: {
3163		struct kvm_debugregs dbgregs;
3164
3165		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
3166
3167		r = -EFAULT;
3168		if (copy_to_user(argp, &dbgregs,
3169				 sizeof(struct kvm_debugregs)))
3170			break;
3171		r = 0;
3172		break;
3173	}
3174	case KVM_SET_DEBUGREGS: {
3175		struct kvm_debugregs dbgregs;
3176
3177		r = -EFAULT;
3178		if (copy_from_user(&dbgregs, argp,
3179				   sizeof(struct kvm_debugregs)))
3180			break;
3181
3182		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
3183		break;
3184	}
3185	case KVM_GET_XSAVE: {
3186		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
3187		r = -ENOMEM;
3188		if (!u.xsave)
3189			break;
3190
3191		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
3192
3193		r = -EFAULT;
3194		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
3195			break;
3196		r = 0;
3197		break;
3198	}
3199	case KVM_SET_XSAVE: {
3200		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
3201		r = -ENOMEM;
3202		if (!u.xsave)
3203			break;
3204
3205		r = -EFAULT;
3206		if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
3207			break;
3208
3209		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
3210		break;
3211	}
3212	case KVM_GET_XCRS: {
3213		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
3214		r = -ENOMEM;
3215		if (!u.xcrs)
3216			break;
3217
3218		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
3219
3220		r = -EFAULT;
3221		if (copy_to_user(argp, u.xcrs,
3222				 sizeof(struct kvm_xcrs)))
3223			break;
3224		r = 0;
3225		break;
3226	}
3227	case KVM_SET_XCRS: {
3228		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
3229		r = -ENOMEM;
3230		if (!u.xcrs)
3231			break;
3232
3233		r = -EFAULT;
3234		if (copy_from_user(u.xcrs, argp,
3235				   sizeof(struct kvm_xcrs)))
3236			break;
3237
3238		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3239		break;
3240	}
3241	case KVM_SET_TSC_KHZ: {
3242		u32 user_tsc_khz;
3243
3244		r = -EINVAL;
3245		if (!kvm_has_tsc_control)
3246			break;
3247
3248		user_tsc_khz = (u32)arg;
3249
3250		if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3251			goto out;
3252
3253		kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
 
 
 
 
3254
3255		r = 0;
3256		goto out;
3257	}
3258	case KVM_GET_TSC_KHZ: {
3259		r = -EIO;
3260		if (check_tsc_unstable())
 
 
 
 
 
 
 
 
 
 
3261			goto out;
 
 
 
 
 
 
 
 
 
 
3262
3263		r = vcpu_tsc_khz(vcpu);
 
 
 
3264
3265		goto out;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3266	}
3267	default:
3268		r = -EINVAL;
3269	}
3270out:
3271	kfree(u.buffer);
 
 
3272	return r;
3273}
3274
 
 
 
 
 
3275static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
3276{
3277	int ret;
3278
3279	if (addr > (unsigned int)(-3 * PAGE_SIZE))
3280		return -1;
3281	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
3282	return ret;
3283}
3284
3285static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
3286					      u64 ident_addr)
3287{
3288	kvm->arch.ept_identity_map_addr = ident_addr;
3289	return 0;
3290}
3291
3292static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
3293					  u32 kvm_nr_mmu_pages)
3294{
3295	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
3296		return -EINVAL;
3297
3298	mutex_lock(&kvm->slots_lock);
3299	spin_lock(&kvm->mmu_lock);
3300
3301	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
3302	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
3303
3304	spin_unlock(&kvm->mmu_lock);
3305	mutex_unlock(&kvm->slots_lock);
3306	return 0;
3307}
3308
3309static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
3310{
3311	return kvm->arch.n_max_mmu_pages;
3312}
3313
3314static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3315{
 
3316	int r;
3317
3318	r = 0;
3319	switch (chip->chip_id) {
3320	case KVM_IRQCHIP_PIC_MASTER:
3321		memcpy(&chip->chip.pic,
3322			&pic_irqchip(kvm)->pics[0],
3323			sizeof(struct kvm_pic_state));
3324		break;
3325	case KVM_IRQCHIP_PIC_SLAVE:
3326		memcpy(&chip->chip.pic,
3327			&pic_irqchip(kvm)->pics[1],
3328			sizeof(struct kvm_pic_state));
3329		break;
3330	case KVM_IRQCHIP_IOAPIC:
3331		r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
3332		break;
3333	default:
3334		r = -EINVAL;
3335		break;
3336	}
3337	return r;
3338}
3339
3340static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3341{
 
3342	int r;
3343
3344	r = 0;
3345	switch (chip->chip_id) {
3346	case KVM_IRQCHIP_PIC_MASTER:
3347		spin_lock(&pic_irqchip(kvm)->lock);
3348		memcpy(&pic_irqchip(kvm)->pics[0],
3349			&chip->chip.pic,
3350			sizeof(struct kvm_pic_state));
3351		spin_unlock(&pic_irqchip(kvm)->lock);
3352		break;
3353	case KVM_IRQCHIP_PIC_SLAVE:
3354		spin_lock(&pic_irqchip(kvm)->lock);
3355		memcpy(&pic_irqchip(kvm)->pics[1],
3356			&chip->chip.pic,
3357			sizeof(struct kvm_pic_state));
3358		spin_unlock(&pic_irqchip(kvm)->lock);
3359		break;
3360	case KVM_IRQCHIP_IOAPIC:
3361		r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
3362		break;
3363	default:
3364		r = -EINVAL;
3365		break;
3366	}
3367	kvm_pic_update_irq(pic_irqchip(kvm));
3368	return r;
3369}
3370
3371static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3372{
3373	int r = 0;
3374
3375	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3376	memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
3377	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3378	return r;
 
 
3379}
3380
3381static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3382{
3383	int r = 0;
 
3384
3385	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3386	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
3387	kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
3388	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3389	return r;
 
3390}
3391
3392static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3393{
3394	int r = 0;
3395
3396	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3397	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
3398		sizeof(ps->channels));
3399	ps->flags = kvm->arch.vpit->pit_state.flags;
3400	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3401	memset(&ps->reserved, 0, sizeof(ps->reserved));
3402	return r;
3403}
3404
3405static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3406{
3407	int r = 0, start = 0;
 
3408	u32 prev_legacy, cur_legacy;
3409	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3410	prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
 
 
3411	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
3412	if (!prev_legacy && cur_legacy)
3413		start = 1;
3414	memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
3415	       sizeof(kvm->arch.vpit->pit_state.channels));
3416	kvm->arch.vpit->pit_state.flags = ps->flags;
3417	kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
3418	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3419	return r;
 
 
3420}
3421
3422static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3423				 struct kvm_reinject_control *control)
3424{
3425	if (!kvm->arch.vpit)
 
 
3426		return -ENXIO;
3427	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3428	kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
3429	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 
 
 
 
 
 
3430	return 0;
3431}
3432
3433/*
3434 * Get (and clear) the dirty memory log for a memory slot.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3435 */
3436int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
3437				      struct kvm_dirty_log *log)
3438{
3439	int r, i;
3440	struct kvm_memory_slot *memslot;
3441	unsigned long n;
3442	unsigned long is_dirty = 0;
3443
3444	mutex_lock(&kvm->slots_lock);
3445
3446	r = -EINVAL;
3447	if (log->slot >= KVM_MEMORY_SLOTS)
3448		goto out;
 
 
3449
3450	memslot = &kvm->memslots->memslots[log->slot];
3451	r = -ENOENT;
3452	if (!memslot->dirty_bitmap)
3453		goto out;
3454
3455	n = kvm_dirty_bitmap_bytes(memslot);
 
 
 
 
 
 
3456
3457	for (i = 0; !is_dirty && i < n/sizeof(long); i++)
3458		is_dirty = memslot->dirty_bitmap[i];
 
3459
3460	/* If nothing is dirty, don't bother messing with page tables. */
3461	if (is_dirty) {
3462		struct kvm_memslots *slots, *old_slots;
3463		unsigned long *dirty_bitmap;
3464
3465		dirty_bitmap = memslot->dirty_bitmap_head;
3466		if (memslot->dirty_bitmap == dirty_bitmap)
3467			dirty_bitmap += n / sizeof(long);
3468		memset(dirty_bitmap, 0, n);
3469
3470		r = -ENOMEM;
3471		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
3472		if (!slots)
3473			goto out;
3474		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
3475		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
3476		slots->generation++;
3477
3478		old_slots = kvm->memslots;
3479		rcu_assign_pointer(kvm->memslots, slots);
3480		synchronize_srcu_expedited(&kvm->srcu);
3481		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
3482		kfree(old_slots);
3483
3484		spin_lock(&kvm->mmu_lock);
3485		kvm_mmu_slot_remove_write_access(kvm, log->slot);
3486		spin_unlock(&kvm->mmu_lock);
3487
3488		r = -EFAULT;
3489		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
3490			goto out;
3491	} else {
3492		r = -EFAULT;
3493		if (clear_user(log->dirty_bitmap, n))
3494			goto out;
3495	}
3496
3497	r = 0;
3498out:
3499	mutex_unlock(&kvm->slots_lock);
3500	return r;
3501}
3502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3503long kvm_arch_vm_ioctl(struct file *filp,
3504		       unsigned int ioctl, unsigned long arg)
3505{
3506	struct kvm *kvm = filp->private_data;
3507	void __user *argp = (void __user *)arg;
3508	int r = -ENOTTY;
3509	/*
3510	 * This union makes it completely explicit to gcc-3.x
3511	 * that these two variables' stack usage should be
3512	 * combined, not added together.
3513	 */
3514	union {
3515		struct kvm_pit_state ps;
3516		struct kvm_pit_state2 ps2;
3517		struct kvm_pit_config pit_config;
3518	} u;
3519
3520	switch (ioctl) {
3521	case KVM_SET_TSS_ADDR:
3522		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
3523		if (r < 0)
3524			goto out;
3525		break;
3526	case KVM_SET_IDENTITY_MAP_ADDR: {
3527		u64 ident_addr;
3528
 
 
 
 
3529		r = -EFAULT;
3530		if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
3531			goto out;
3532		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
3533		if (r < 0)
3534			goto out;
3535		break;
3536	}
3537	case KVM_SET_NR_MMU_PAGES:
3538		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
3539		if (r)
3540			goto out;
3541		break;
3542	case KVM_GET_NR_MMU_PAGES:
3543		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
3544		break;
3545	case KVM_CREATE_IRQCHIP: {
3546		struct kvm_pic *vpic;
3547
3548		mutex_lock(&kvm->lock);
 
3549		r = -EEXIST;
3550		if (kvm->arch.vpic)
3551			goto create_irqchip_unlock;
3552		r = -ENOMEM;
3553		vpic = kvm_create_pic(kvm);
3554		if (vpic) {
3555			r = kvm_ioapic_init(kvm);
3556			if (r) {
3557				mutex_lock(&kvm->slots_lock);
3558				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3559							  &vpic->dev);
3560				mutex_unlock(&kvm->slots_lock);
3561				kfree(vpic);
3562				goto create_irqchip_unlock;
3563			}
3564		} else
3565			goto create_irqchip_unlock;
3566		smp_wmb();
3567		kvm->arch.vpic = vpic;
3568		smp_wmb();
 
 
 
 
 
 
 
 
3569		r = kvm_setup_default_irq_routing(kvm);
3570		if (r) {
3571			mutex_lock(&kvm->slots_lock);
3572			mutex_lock(&kvm->irq_lock);
3573			kvm_ioapic_destroy(kvm);
3574			kvm_destroy_pic(kvm);
3575			mutex_unlock(&kvm->irq_lock);
3576			mutex_unlock(&kvm->slots_lock);
3577		}
 
 
 
3578	create_irqchip_unlock:
3579		mutex_unlock(&kvm->lock);
3580		break;
3581	}
3582	case KVM_CREATE_PIT:
3583		u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
3584		goto create_pit;
3585	case KVM_CREATE_PIT2:
3586		r = -EFAULT;
3587		if (copy_from_user(&u.pit_config, argp,
3588				   sizeof(struct kvm_pit_config)))
3589			goto out;
3590	create_pit:
3591		mutex_lock(&kvm->slots_lock);
3592		r = -EEXIST;
3593		if (kvm->arch.vpit)
3594			goto create_pit_unlock;
3595		r = -ENOMEM;
3596		kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
3597		if (kvm->arch.vpit)
3598			r = 0;
3599	create_pit_unlock:
3600		mutex_unlock(&kvm->slots_lock);
3601		break;
3602	case KVM_IRQ_LINE_STATUS:
3603	case KVM_IRQ_LINE: {
3604		struct kvm_irq_level irq_event;
3605
3606		r = -EFAULT;
3607		if (copy_from_user(&irq_event, argp, sizeof irq_event))
3608			goto out;
3609		r = -ENXIO;
3610		if (irqchip_in_kernel(kvm)) {
3611			__s32 status;
3612			status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3613					irq_event.irq, irq_event.level);
3614			if (ioctl == KVM_IRQ_LINE_STATUS) {
3615				r = -EFAULT;
3616				irq_event.status = status;
3617				if (copy_to_user(argp, &irq_event,
3618							sizeof irq_event))
3619					goto out;
3620			}
3621			r = 0;
3622		}
3623		break;
3624	}
3625	case KVM_GET_IRQCHIP: {
3626		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3627		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
3628
3629		r = -ENOMEM;
3630		if (!chip)
 
3631			goto out;
3632		r = -EFAULT;
3633		if (copy_from_user(chip, argp, sizeof *chip))
3634			goto get_irqchip_out;
3635		r = -ENXIO;
3636		if (!irqchip_in_kernel(kvm))
3637			goto get_irqchip_out;
3638		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
3639		if (r)
3640			goto get_irqchip_out;
3641		r = -EFAULT;
3642		if (copy_to_user(argp, chip, sizeof *chip))
3643			goto get_irqchip_out;
3644		r = 0;
3645	get_irqchip_out:
3646		kfree(chip);
3647		if (r)
3648			goto out;
3649		break;
3650	}
3651	case KVM_SET_IRQCHIP: {
3652		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3653		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
3654
3655		r = -ENOMEM;
3656		if (!chip)
 
3657			goto out;
3658		r = -EFAULT;
3659		if (copy_from_user(chip, argp, sizeof *chip))
3660			goto set_irqchip_out;
3661		r = -ENXIO;
3662		if (!irqchip_in_kernel(kvm))
3663			goto set_irqchip_out;
3664		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
3665		if (r)
3666			goto set_irqchip_out;
3667		r = 0;
3668	set_irqchip_out:
3669		kfree(chip);
3670		if (r)
3671			goto out;
3672		break;
3673	}
3674	case KVM_GET_PIT: {
3675		r = -EFAULT;
3676		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
3677			goto out;
3678		r = -ENXIO;
3679		if (!kvm->arch.vpit)
3680			goto out;
3681		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
3682		if (r)
3683			goto out;
3684		r = -EFAULT;
3685		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
3686			goto out;
3687		r = 0;
3688		break;
3689	}
3690	case KVM_SET_PIT: {
3691		r = -EFAULT;
3692		if (copy_from_user(&u.ps, argp, sizeof u.ps))
3693			goto out;
3694		r = -ENXIO;
3695		if (!kvm->arch.vpit)
3696			goto out;
3697		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3698		if (r)
3699			goto out;
3700		r = 0;
3701		break;
3702	}
3703	case KVM_GET_PIT2: {
3704		r = -ENXIO;
3705		if (!kvm->arch.vpit)
3706			goto out;
3707		r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
3708		if (r)
3709			goto out;
3710		r = -EFAULT;
3711		if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
3712			goto out;
3713		r = 0;
3714		break;
3715	}
3716	case KVM_SET_PIT2: {
3717		r = -EFAULT;
3718		if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
3719			goto out;
3720		r = -ENXIO;
3721		if (!kvm->arch.vpit)
3722			goto out;
3723		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
3724		if (r)
3725			goto out;
3726		r = 0;
3727		break;
3728	}
3729	case KVM_REINJECT_CONTROL: {
3730		struct kvm_reinject_control control;
3731		r =  -EFAULT;
3732		if (copy_from_user(&control, argp, sizeof(control)))
3733			goto out;
3734		r = kvm_vm_ioctl_reinject(kvm, &control);
3735		if (r)
3736			goto out;
3737		r = 0;
3738		break;
3739	}
 
 
 
 
 
 
 
 
 
3740	case KVM_XEN_HVM_CONFIG: {
 
3741		r = -EFAULT;
3742		if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
3743				   sizeof(struct kvm_xen_hvm_config)))
3744			goto out;
3745		r = -EINVAL;
3746		if (kvm->arch.xen_hvm_config.flags)
3747			goto out;
 
3748		r = 0;
3749		break;
3750	}
3751	case KVM_SET_CLOCK: {
3752		struct kvm_clock_data user_ns;
3753		u64 now_ns;
3754		s64 delta;
3755
3756		r = -EFAULT;
3757		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
3758			goto out;
3759
3760		r = -EINVAL;
3761		if (user_ns.flags)
3762			goto out;
3763
3764		r = 0;
3765		local_irq_disable();
3766		now_ns = get_kernel_ns();
3767		delta = user_ns.clock - now_ns;
3768		local_irq_enable();
3769		kvm->arch.kvmclock_offset = delta;
 
 
 
 
3770		break;
3771	}
3772	case KVM_GET_CLOCK: {
3773		struct kvm_clock_data user_ns;
3774		u64 now_ns;
3775
3776		local_irq_disable();
3777		now_ns = get_kernel_ns();
3778		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3779		local_irq_enable();
3780		user_ns.flags = 0;
3781		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
3782
3783		r = -EFAULT;
3784		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
3785			goto out;
3786		r = 0;
3787		break;
3788	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3789
 
 
 
 
 
 
 
 
 
3790	default:
3791		;
3792	}
3793out:
3794	return r;
3795}
3796
3797static void kvm_init_msr_list(void)
3798{
 
3799	u32 dummy[2];
3800	unsigned i, j;
 
 
 
3801
3802	/* skip the first msrs in the list. KVM-specific */
3803	for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
3804		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
 
 
 
 
 
3805			continue;
3806		if (j < i)
3807			msrs_to_save[j] = msrs_to_save[i];
3808		j++;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3809	}
3810	num_msrs_to_save = j;
3811}
3812
3813static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3814			   const void *v)
3815{
3816	int handled = 0;
3817	int n;
3818
3819	do {
3820		n = min(len, 8);
3821		if (!(vcpu->arch.apic &&
3822		      !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
3823		    && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3824			break;
3825		handled += n;
3826		addr += n;
3827		len -= n;
3828		v += n;
3829	} while (len);
3830
3831	return handled;
3832}
3833
3834static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3835{
3836	int handled = 0;
3837	int n;
3838
3839	do {
3840		n = min(len, 8);
3841		if (!(vcpu->arch.apic &&
3842		      !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
3843		    && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
 
3844			break;
3845		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
3846		handled += n;
3847		addr += n;
3848		len -= n;
3849		v += n;
3850	} while (len);
3851
3852	return handled;
3853}
3854
3855static void kvm_set_segment(struct kvm_vcpu *vcpu,
3856			struct kvm_segment *var, int seg)
3857{
3858	kvm_x86_ops->set_segment(vcpu, var, seg);
3859}
3860
3861void kvm_get_segment(struct kvm_vcpu *vcpu,
3862		     struct kvm_segment *var, int seg)
3863{
3864	kvm_x86_ops->get_segment(vcpu, var, seg);
3865}
3866
3867static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3868{
3869	return gpa;
3870}
3871
3872static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3873{
3874	gpa_t t_gpa;
3875	struct x86_exception exception;
3876
3877	BUG_ON(!mmu_is_nested(vcpu));
3878
3879	/* NPT walks are always user-walks */
3880	access |= PFERR_USER_MASK;
3881	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3882
3883	return t_gpa;
3884}
3885
3886gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3887			      struct x86_exception *exception)
3888{
3889	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3890	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3891}
3892
3893 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3894				struct x86_exception *exception)
3895{
3896	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3897	access |= PFERR_FETCH_MASK;
3898	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3899}
3900
3901gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3902			       struct x86_exception *exception)
3903{
3904	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3905	access |= PFERR_WRITE_MASK;
3906	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3907}
3908
3909/* uses this to access any guest's mapped memory without checking CPL */
3910gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3911				struct x86_exception *exception)
3912{
3913	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3914}
3915
3916static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3917				      struct kvm_vcpu *vcpu, u32 access,
3918				      struct x86_exception *exception)
3919{
3920	void *data = val;
3921	int r = X86EMUL_CONTINUE;
3922
3923	while (bytes) {
3924		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3925							    exception);
3926		unsigned offset = addr & (PAGE_SIZE-1);
3927		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3928		int ret;
3929
3930		if (gpa == UNMAPPED_GVA)
3931			return X86EMUL_PROPAGATE_FAULT;
3932		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
 
3933		if (ret < 0) {
3934			r = X86EMUL_IO_NEEDED;
3935			goto out;
3936		}
3937
3938		bytes -= toread;
3939		data += toread;
3940		addr += toread;
3941	}
3942out:
3943	return r;
3944}
3945
3946/* used for instruction fetching */
3947static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3948				gva_t addr, void *val, unsigned int bytes,
3949				struct x86_exception *exception)
3950{
3951	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3952	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 
 
3953
3954	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3955					  access | PFERR_FETCH_MASK,
3956					  exception);
 
 
 
 
 
 
 
 
 
 
 
 
3957}
3958
3959int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3960			       gva_t addr, void *val, unsigned int bytes,
3961			       struct x86_exception *exception)
3962{
3963	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3964	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3965
 
 
 
 
 
 
 
3966	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3967					  exception);
3968}
3969EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
3970
3971static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3972				      gva_t addr, void *val, unsigned int bytes,
3973				      struct x86_exception *exception)
3974{
3975	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3976	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 
 
 
 
 
3977}
3978
3979int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3980				       gva_t addr, void *val,
3981				       unsigned int bytes,
3982				       struct x86_exception *exception)
3983{
3984	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
 
 
 
 
 
 
 
 
3985	void *data = val;
3986	int r = X86EMUL_CONTINUE;
3987
3988	while (bytes) {
3989		gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3990							     PFERR_WRITE_MASK,
3991							     exception);
3992		unsigned offset = addr & (PAGE_SIZE-1);
3993		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3994		int ret;
3995
3996		if (gpa == UNMAPPED_GVA)
3997			return X86EMUL_PROPAGATE_FAULT;
3998		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3999		if (ret < 0) {
4000			r = X86EMUL_IO_NEEDED;
4001			goto out;
4002		}
4003
4004		bytes -= towrite;
4005		data += towrite;
4006		addr += towrite;
4007	}
4008out:
4009	return r;
4010}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4011EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4012
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4013static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4014				gpa_t *gpa, struct x86_exception *exception,
4015				bool write)
4016{
4017	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 
4018
4019	if (vcpu_match_mmio_gva(vcpu, gva) &&
4020		  check_write_user_access(vcpu, write, access,
4021		  vcpu->arch.access)) {
 
 
 
 
 
4022		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
4023					(gva & (PAGE_SIZE - 1));
4024		trace_vcpu_match_mmio(gva, *gpa, write, false);
4025		return 1;
4026	}
4027
4028	if (write)
4029		access |= PFERR_WRITE_MASK;
4030
4031	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4032
4033	if (*gpa == UNMAPPED_GVA)
4034		return -1;
4035
4036	/* For APIC access vmexit */
4037	if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4038		return 1;
4039
4040	if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
4041		trace_vcpu_match_mmio(gva, *gpa, write, true);
4042		return 1;
4043	}
4044
4045	return 0;
 
 
 
 
4046}
4047
4048static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
4049				  unsigned long addr,
4050				  void *val,
4051				  unsigned int bytes,
4052				  struct x86_exception *exception)
4053{
4054	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4055	gpa_t gpa;
4056	int handled, ret;
 
 
4057
 
 
4058	if (vcpu->mmio_read_completed) {
4059		memcpy(val, vcpu->mmio_data, bytes);
4060		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
4061			       vcpu->mmio_phys_addr, *(u64 *)val);
4062		vcpu->mmio_read_completed = 0;
4063		return X86EMUL_CONTINUE;
4064	}
4065
4066	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
4067
4068	if (ret < 0)
4069		return X86EMUL_PROPAGATE_FAULT;
4070
4071	if (ret)
4072		goto mmio;
4073
4074	if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
4075	    == X86EMUL_CONTINUE)
4076		return X86EMUL_CONTINUE;
4077
4078mmio:
4079	/*
4080	 * Is this MMIO handled locally?
4081	 */
4082	handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
4083
4084	if (handled == bytes)
4085		return X86EMUL_CONTINUE;
4086
4087	gpa += handled;
4088	bytes -= handled;
4089	val += handled;
 
 
4090
4091	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
 
 
 
 
4092
4093	vcpu->mmio_needed = 1;
4094	vcpu->run->exit_reason = KVM_EXIT_MMIO;
4095	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
4096	vcpu->mmio_size = bytes;
4097	vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
4098	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
4099	vcpu->mmio_index = 0;
4100
 
 
 
 
4101	return X86EMUL_IO_NEEDED;
4102}
4103
4104int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
4105			const void *val, int bytes)
4106{
4107	int ret;
4108
4109	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
4110	if (ret < 0)
4111		return 0;
4112	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
4113	return 1;
4114}
4115
4116static int emulator_write_emulated_onepage(unsigned long addr,
4117					   const void *val,
4118					   unsigned int bytes,
4119					   struct x86_exception *exception,
4120					   struct kvm_vcpu *vcpu)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4121{
4122	gpa_t gpa;
4123	int handled, ret;
 
 
 
4124
4125	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
4126
4127	if (ret < 0)
4128		return X86EMUL_PROPAGATE_FAULT;
4129
4130	/* For APIC access vmexit */
4131	if (ret)
4132		goto mmio;
 
 
 
 
 
 
 
 
 
4133
4134	if (emulator_write_phys(vcpu, gpa, val, bytes))
4135		return X86EMUL_CONTINUE;
4136
4137mmio:
4138	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
4139	/*
4140	 * Is this MMIO handled locally?
4141	 */
4142	handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
4143	if (handled == bytes)
4144		return X86EMUL_CONTINUE;
4145
4146	gpa += handled;
4147	bytes -= handled;
4148	val += handled;
4149
4150	vcpu->mmio_needed = 1;
4151	memcpy(vcpu->mmio_data, val, bytes);
4152	vcpu->run->exit_reason = KVM_EXIT_MMIO;
4153	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
4154	vcpu->mmio_size = bytes;
4155	vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
4156	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
4157	memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
4158	vcpu->mmio_index = 0;
4159
4160	return X86EMUL_CONTINUE;
4161}
4162
4163int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4164			    unsigned long addr,
4165			    const void *val,
4166			    unsigned int bytes,
4167			    struct x86_exception *exception)
4168{
4169	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
 
 
 
 
 
 
 
4170
4171	/* Crossing a page boundary? */
4172	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
4173		int rc, now;
4174
4175		now = -addr & ~PAGE_MASK;
4176		rc = emulator_write_emulated_onepage(addr, val, now, exception,
4177						     vcpu);
 
4178		if (rc != X86EMUL_CONTINUE)
4179			return rc;
4180		addr += now;
 
 
4181		val += now;
4182		bytes -= now;
4183	}
4184	return emulator_write_emulated_onepage(addr, val, bytes, exception,
4185					       vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4186}
4187
/*
 * Compare-and-exchange helpers for emulator_cmpxchg_emulated().
 *
 * Each evaluates to true when the value found at @addr equalled
 * *@expected, i.e. when the exchange with *@desired took place.  The
 * 64-bit form uses cmpxchg64() when not building for CONFIG_X86_64.
 */
#define CMPXCHG_TYPE(type, addr, expected, desired) \
	(cmpxchg((type *)(addr), *(type *)(expected), *(type *)(desired)) == \
	 *(type *)(expected))

#ifdef CONFIG_X86_64
#  define CMPXCHG64(addr, expected, desired) \
	CMPXCHG_TYPE(u64, addr, expected, desired)
#else
#  define CMPXCHG64(addr, expected, desired) \
	(cmpxchg64((u64 *)(addr), *(u64 *)(expected), *(u64 *)(desired)) == \
	 *(u64 *)(expected))
#endif
4197
4198static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4199				     unsigned long addr,
4200				     const void *old,
4201				     const void *new,
4202				     unsigned int bytes,
4203				     struct x86_exception *exception)
4204{
 
4205	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4206	gpa_t gpa;
4207	struct page *page;
4208	char *kaddr;
4209	bool exchanged;
4210
4211	/* guests cmpxchg8b have to be emulated atomically */
4212	if (bytes > 8 || (bytes & (bytes - 1)))
4213		goto emul_write;
4214
4215	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
4216
4217	if (gpa == UNMAPPED_GVA ||
4218	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4219		goto emul_write;
4220
4221	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
4222		goto emul_write;
4223
4224	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4225	if (is_error_page(page)) {
4226		kvm_release_page_clean(page);
4227		goto emul_write;
4228	}
4229
4230	kaddr = kmap_atomic(page, KM_USER0);
4231	kaddr += offset_in_page(gpa);
4232	switch (bytes) {
4233	case 1:
4234		exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
4235		break;
4236	case 2:
4237		exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
4238		break;
4239	case 4:
4240		exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
4241		break;
4242	case 8:
4243		exchanged = CMPXCHG64(kaddr, old, new);
4244		break;
4245	default:
4246		BUG();
4247	}
4248	kunmap_atomic(kaddr, KM_USER0);
4249	kvm_release_page_dirty(page);
4250
4251	if (!exchanged)
4252		return X86EMUL_CMPXCHG_FAILED;
4253
4254	kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
4255
4256	return X86EMUL_CONTINUE;
4257
4258emul_write:
4259	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
4260
4261	return emulator_write_emulated(ctxt, addr, new, bytes, exception);
4262}
4263
4264static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
4265{
4266	/* TODO: String I/O for in kernel device */
4267	int r;
4268
4269	if (vcpu->arch.pio.in)
4270		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
4271				    vcpu->arch.pio.size, pd);
4272	else
4273		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
4274				     vcpu->arch.pio.port, vcpu->arch.pio.size,
4275				     pd);
 
 
 
 
 
4276	return r;
4277}
4278
4279
4280static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4281				    int size, unsigned short port, void *val,
4282				    unsigned int count)
4283{
4284	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4285
4286	if (vcpu->arch.pio.count)
4287		goto data_avail;
4288
4289	trace_kvm_pio(0, port, size, count);
4290
4291	vcpu->arch.pio.port = port;
4292	vcpu->arch.pio.in = 1;
4293	vcpu->arch.pio.count  = count;
4294	vcpu->arch.pio.size = size;
4295
4296	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
4297	data_avail:
4298		memcpy(val, vcpu->arch.pio_data, size * count);
4299		vcpu->arch.pio.count = 0;
4300		return 1;
4301	}
4302
4303	vcpu->run->exit_reason = KVM_EXIT_IO;
4304	vcpu->run->io.direction = KVM_EXIT_IO_IN;
4305	vcpu->run->io.size = size;
4306	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4307	vcpu->run->io.count = count;
4308	vcpu->run->io.port = port;
4309
4310	return 0;
4311}
4312
4313static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
4314				     int size, unsigned short port,
4315				     const void *val, unsigned int count)
4316{
4317	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 
4318
4319	trace_kvm_pio(1, port, size, count);
4320
4321	vcpu->arch.pio.port = port;
4322	vcpu->arch.pio.in = 0;
4323	vcpu->arch.pio.count = count;
4324	vcpu->arch.pio.size = size;
4325
4326	memcpy(vcpu->arch.pio_data, val, size * count);
4327
4328	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
 
 
 
 
4329		vcpu->arch.pio.count = 0;
4330		return 1;
4331	}
4332
4333	vcpu->run->exit_reason = KVM_EXIT_IO;
4334	vcpu->run->io.direction = KVM_EXIT_IO_OUT;
4335	vcpu->run->io.size = size;
4336	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4337	vcpu->run->io.count = count;
4338	vcpu->run->io.port = port;
4339
4340	return 0;
4341}
4342
 
 
 
 
 
 
 
 
 
 
 
4343static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4344{
4345	return kvm_x86_ops->get_segment_base(vcpu, seg);
4346}
4347
4348static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4349{
4350	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4351}
4352
4353int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4354{
4355	if (!need_emulate_wbinvd(vcpu))
4356		return X86EMUL_CONTINUE;
4357
4358	if (kvm_x86_ops->has_wbinvd_exit()) {
4359		int cpu = get_cpu();
4360
4361		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4362		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
4363				wbinvd_ipi, NULL, 1);
4364		put_cpu();
4365		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
4366	} else
4367		wbinvd();
4368	return X86EMUL_CONTINUE;
4369}
 
 
 
 
 
 
4370EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4371
 
 
/* Emulator callback: WBINVD; the status of kvm_emulate_wbinvd() is dropped. */
static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

	kvm_emulate_wbinvd(vcpu);
}
4376
/* Emulator callback: read debug register @dr into *@dest via _kvm_get_dr(). */
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

	return _kvm_get_dr(vcpu, dr, dest);
}
4381
/* Emulator callback: write @value to debug register @dr via __kvm_set_dr(). */
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

	return __kvm_set_dr(vcpu, dr, value);
}
4387
/*
 * Build a 64-bit control register value: bits 63:32 come from @curr_cr,
 * bits 31:0 from @new_val.
 */
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
	const u64 upper_half = curr_cr & 0xffffffff00000000ULL;

	return upper_half | new_val;
}
4392
4393static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4394{
4395	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4396	unsigned long value;
4397
4398	switch (cr) {
4399	case 0:
4400		value = kvm_read_cr0(vcpu);
4401		break;
4402	case 2:
4403		value = vcpu->arch.cr2;
4404		break;
4405	case 3:
4406		value = kvm_read_cr3(vcpu);
4407		break;
4408	case 4:
4409		value = kvm_read_cr4(vcpu);
4410		break;
4411	case 8:
4412		value = kvm_get_cr8(vcpu);
4413		break;
4414	default:
4415		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4416		return 0;
4417	}
4418
4419	return value;
4420}
4421
4422static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4423{
4424	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4425	int res = 0;
4426
4427	switch (cr) {
4428	case 0:
4429		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4430		break;
4431	case 2:
4432		vcpu->arch.cr2 = val;
4433		break;
4434	case 3:
4435		res = kvm_set_cr3(vcpu, val);
4436		break;
4437	case 4:
4438		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4439		break;
4440	case 8:
4441		res = kvm_set_cr8(vcpu, val);
4442		break;
4443	default:
4444		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
4445		res = -1;
4446	}
4447
4448	return res;
4449}
4450
4451static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4452{
4453	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
4454}
4455
4456static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4457{
4458	kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
4459}
4460
4461static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4462{
4463	kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
4464}
4465
4466static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4467{
4468	kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4469}
4470
4471static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4472{
4473	kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4474}
4475
/* Emulator callback: base address of segment @seg (see get_segment_base()). */
static unsigned long emulator_get_cached_segment_base(
	struct x86_emulate_ctxt *ctxt, int seg)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

	return get_segment_base(vcpu, seg);
}
4481
4482static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4483				 struct desc_struct *desc, u32 *base3,
4484				 int seg)
4485{
4486	struct kvm_segment var;
4487
4488	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4489	*selector = var.selector;
4490
4491	if (var.unusable)
 
 
 
4492		return false;
 
4493
4494	if (var.g)
4495		var.limit >>= 12;
4496	set_desc_limit(desc, var.limit);
4497	set_desc_base(desc, (unsigned long)var.base);
4498#ifdef CONFIG_X86_64
4499	if (base3)
4500		*base3 = var.base >> 32;
4501#endif
4502	desc->type = var.type;
4503	desc->s = var.s;
4504	desc->dpl = var.dpl;
4505	desc->p = var.present;
4506	desc->avl = var.avl;
4507	desc->l = var.l;
4508	desc->d = var.db;
4509	desc->g = var.g;
4510
4511	return true;
4512}
4513
4514static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
4515				 struct desc_struct *desc, u32 base3,
4516				 int seg)
4517{
4518	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4519	struct kvm_segment var;
4520
4521	var.selector = selector;
4522	var.base = get_desc_base(desc);
4523#ifdef CONFIG_X86_64
4524	var.base |= ((u64)base3) << 32;
4525#endif
4526	var.limit = get_desc_limit(desc);
4527	if (desc->g)
4528		var.limit = (var.limit << 12) | 0xfff;
4529	var.type = desc->type;
4530	var.present = desc->p;
4531	var.dpl = desc->dpl;
4532	var.db = desc->d;
4533	var.s = desc->s;
4534	var.l = desc->l;
4535	var.g = desc->g;
4536	var.avl = desc->avl;
4537	var.present = desc->p;
4538	var.unusable = !var.present;
4539	var.padding = 0;
4540
4541	kvm_set_segment(vcpu, &var, seg);
4542	return;
4543}
4544
4545static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4546			    u32 msr_index, u64 *pdata)
4547{
4548	return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4549}
4550
4551static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4552			    u32 msr_index, u64 data)
4553{
4554	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
4555}
4556
4557static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4558{
4559	emul_to_vcpu(ctxt)->arch.halt_request = 1;
 
 
4560}
4561
/*
 * Emulator callback: make the guest FPU state usable by the emulator.
 * Disables preemption; emulator_put_fpu() re-enables it.
 */
static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

	preempt_disable();
	kvm_load_guest_fpu(vcpu);

	/*
	 * CR0.TS may reference the host fpu state, not the guest fpu state,
	 * so it may be clear at this point.
	 */
	clts();
}
4572
/*
 * Emulator callback: counterpart of emulator_get_fpu(); re-enables
 * preemption.  @ctxt is not used.
 */
static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
{
	preempt_enable();
}
4577
4578static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4579			      struct x86_instruction_info *info,
4580			      enum x86_intercept_stage stage)
4581{
4582	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4583}
4584
4585static struct x86_emulate_ops emulate_ops = {
4586	.read_std            = kvm_read_guest_virt_system,
4587	.write_std           = kvm_write_guest_virt_system,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4588	.fetch               = kvm_fetch_guest_virt,
4589	.read_emulated       = emulator_read_emulated,
4590	.write_emulated      = emulator_write_emulated,
4591	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
4592	.invlpg              = emulator_invlpg,
4593	.pio_in_emulated     = emulator_pio_in_emulated,
4594	.pio_out_emulated    = emulator_pio_out_emulated,
4595	.get_segment         = emulator_get_segment,
4596	.set_segment         = emulator_set_segment,
4597	.get_cached_segment_base = emulator_get_cached_segment_base,
4598	.get_gdt             = emulator_get_gdt,
4599	.get_idt	     = emulator_get_idt,
4600	.set_gdt             = emulator_set_gdt,
4601	.set_idt	     = emulator_set_idt,
4602	.get_cr              = emulator_get_cr,
4603	.set_cr              = emulator_set_cr,
4604	.cpl                 = emulator_get_cpl,
4605	.get_dr              = emulator_get_dr,
4606	.set_dr              = emulator_set_dr,
 
 
4607	.set_msr             = emulator_set_msr,
4608	.get_msr             = emulator_get_msr,
 
 
4609	.halt                = emulator_halt,
4610	.wbinvd              = emulator_wbinvd,
4611	.fix_hypercall       = emulator_fix_hypercall,
4612	.get_fpu             = emulator_get_fpu,
4613	.put_fpu             = emulator_put_fpu,
4614	.intercept           = emulator_intercept,
 
 
 
 
 
 
 
4615};
4616
4617static void cache_all_regs(struct kvm_vcpu *vcpu)
4618{
4619	kvm_register_read(vcpu, VCPU_REGS_RAX);
4620	kvm_register_read(vcpu, VCPU_REGS_RSP);
4621	kvm_register_read(vcpu, VCPU_REGS_RIP);
4622	vcpu->arch.regs_dirty = ~0;
4623}
4624
4625static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
4626{
4627	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
4628	/*
4629	 * an sti; sti; sequence only disable interrupts for the first
4630	 * instruction. So, if the last instruction, be it emulated or
4631	 * not, left the system with the INT_STI flag enabled, it
4632	 * means that the last instruction is an sti. We should not
4633	 * leave the flag on in this case. The same goes for mov ss
4634	 */
4635	if (!(int_shadow & mask))
 
 
4636		kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
 
 
 
4637}
4638
4639static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4640{
4641	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4642	if (ctxt->exception.vector == PF_VECTOR)
4643		kvm_propagate_fault(vcpu, &ctxt->exception);
4644	else if (ctxt->exception.error_code_valid)
 
4645		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4646				      ctxt->exception.error_code);
4647	else
4648		kvm_queue_exception(vcpu, ctxt->exception.vector);
4649}
4650
4651static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
4652			      const unsigned long *regs)
4653{
4654	memset(&ctxt->twobyte, 0,
4655	       (void *)&ctxt->regs - (void *)&ctxt->twobyte);
4656	memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
4657
4658	ctxt->fetch.start = 0;
4659	ctxt->fetch.end = 0;
4660	ctxt->io_read.pos = 0;
4661	ctxt->io_read.end = 0;
4662	ctxt->mem_read.pos = 0;
4663	ctxt->mem_read.end = 0;
4664}
4665
4666static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4667{
4668	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4669	int cs_db, cs_l;
4670
4671	/*
4672	 * TODO: fix emulate.c to use guest_read/write_register
4673	 * instead of direct ->regs accesses, can save hundred cycles
4674	 * on Intel for instructions that don't read/change RSP, for
4675	 * for example.
4676	 */
4677	cache_all_regs(vcpu);
4678
4679	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4680
4681	ctxt->eflags = kvm_get_rflags(vcpu);
 
 
4682	ctxt->eip = kvm_rip_read(vcpu);
4683	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
4684		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
4685		     cs_l				? X86EMUL_MODE_PROT64 :
4686		     cs_db				? X86EMUL_MODE_PROT32 :
4687							  X86EMUL_MODE_PROT16;
4688	ctxt->guest_mode = is_guest_mode(vcpu);
 
 
4689
4690	init_decode_cache(ctxt, vcpu->arch.regs);
4691	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4692}
4693
4694int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4695{
4696	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4697	int ret;
4698
4699	init_emulate_ctxt(vcpu);
4700
4701	ctxt->op_bytes = 2;
4702	ctxt->ad_bytes = 2;
4703	ctxt->_eip = ctxt->eip + inc_eip;
4704	ret = emulate_int_real(ctxt, irq);
4705
4706	if (ret != X86EMUL_CONTINUE)
4707		return EMULATE_FAIL;
4708
4709	ctxt->eip = ctxt->_eip;
4710	memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4711	kvm_rip_write(vcpu, ctxt->eip);
4712	kvm_set_rflags(vcpu, ctxt->eflags);
4713
4714	if (irq == NMI_VECTOR)
4715		vcpu->arch.nmi_pending = false;
4716	else
4717		vcpu->arch.interrupt.pending = false;
4718
4719	return EMULATE_DONE;
4720}
4721EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
4722
4723static int handle_emulation_failure(struct kvm_vcpu *vcpu)
4724{
4725	int r = EMULATE_DONE;
4726
4727	++vcpu->stat.insn_emulation_fail;
4728	trace_kvm_emulate_insn_failed(vcpu);
4729	if (!is_guest_mode(vcpu)) {
 
 
 
 
 
 
4730		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4731		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4732		vcpu->run->internal.ndata = 0;
4733		r = EMULATE_FAIL;
4734	}
 
4735	kvm_queue_exception(vcpu, UD_VECTOR);
4736
4737	return r;
 
 
 
 
 
 
 
4738}
4739
4740static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 
 
4741{
4742	gpa_t gpa;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4743
4744	if (tdp_enabled)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4745		return false;
4746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4747	/*
4748	 * if emulation was due to access to shadowed page table
4749	 * and it failed try to unshadow page and re-entetr the
4750	 * guest to let CPU execute the instruction.
4751	 */
4752	if (kvm_mmu_unprotect_page_virt(vcpu, gva))
4753		return true;
 
 
 
 
 
 
 
4754
4755	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
 
 
 
 
4756
4757	if (gpa == UNMAPPED_GVA)
4758		return true; /* let cpu generate fault */
4759
4760	if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
4761		return true;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4762
4763	return false;
4764}
4765
4766int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4767			    unsigned long cr2,
4768			    int emulation_type,
4769			    void *insn,
4770			    int insn_len)
4771{
4772	int r;
4773	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
4774	bool writeback = true;
 
 
 
4775
 
 
 
 
 
4776	kvm_clear_exception_queue(vcpu);
4777
4778	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4779		init_emulate_ctxt(vcpu);
 
 
 
 
 
 
 
 
 
 
 
4780		ctxt->interruptibility = 0;
4781		ctxt->have_exception = false;
 
4782		ctxt->perm_ok = false;
4783
4784		ctxt->only_vendor_specific_insn
4785			= emulation_type & EMULTYPE_TRAP_UD;
4786
4787		r = x86_decode_insn(ctxt, insn, insn_len);
4788
4789		trace_kvm_emulate_insn_start(vcpu);
4790		++vcpu->stat.insn_emulation;
4791		if (r)  {
4792			if (emulation_type & EMULTYPE_TRAP_UD)
4793				return EMULATE_FAIL;
4794			if (reexecute_instruction(vcpu, cr2))
4795				return EMULATE_DONE;
4796			if (emulation_type & EMULTYPE_SKIP)
4797				return EMULATE_FAIL;
4798			return handle_emulation_failure(vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
4799		}
4800	}
4801
 
 
 
 
 
 
 
 
 
 
 
4802	if (emulation_type & EMULTYPE_SKIP) {
4803		kvm_rip_write(vcpu, ctxt->_eip);
4804		return EMULATE_DONE;
 
 
4805	}
4806
 
 
 
4807	/* this is needed for vmware backdoor interface to work since it
4808	   changes registers values  during IO operation */
4809	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4810		vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4811		memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
4812	}
4813
4814restart:
 
 
 
4815	r = x86_emulate_insn(ctxt);
4816
4817	if (r == EMULATION_INTERCEPTED)
4818		return EMULATE_DONE;
4819
4820	if (r == EMULATION_FAILED) {
4821		if (reexecute_instruction(vcpu, cr2))
4822			return EMULATE_DONE;
 
4823
4824		return handle_emulation_failure(vcpu);
4825	}
4826
4827	if (ctxt->have_exception) {
4828		inject_emulated_exception(vcpu);
4829		r = EMULATE_DONE;
 
4830	} else if (vcpu->arch.pio.count) {
4831		if (!vcpu->arch.pio.in)
 
4832			vcpu->arch.pio.count = 0;
4833		else
4834			writeback = false;
4835		r = EMULATE_DO_MMIO;
 
 
4836	} else if (vcpu->mmio_needed) {
 
 
4837		if (!vcpu->mmio_is_write)
4838			writeback = false;
4839		r = EMULATE_DO_MMIO;
 
4840	} else if (r == EMULATION_RESTART)
4841		goto restart;
4842	else
4843		r = EMULATE_DONE;
4844
4845	if (writeback) {
 
4846		toggle_interruptibility(vcpu, ctxt->interruptibility);
4847		kvm_set_rflags(vcpu, ctxt->eflags);
4848		kvm_make_request(KVM_REQ_EVENT, vcpu);
4849		memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
4850		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4851		kvm_rip_write(vcpu, ctxt->eip);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4852	} else
4853		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4854
4855	return r;
4856}
4857EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4858
4859int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4860{
4861	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
4862	int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
4863					    size, port, &val, 1);
4864	/* do not return to emulator after return from userspace */
4865	vcpu->arch.pio.count = 0;
4866	return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4867}
4868EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4869
4870static void tsc_bad(void *info)
4871{
4872	__this_cpu_write(cpu_tsc_khz, 0);
 
4873}
4874
4875static void tsc_khz_changed(void *data)
4876{
4877	struct cpufreq_freqs *freq = data;
4878	unsigned long khz = 0;
4879
4880	if (data)
4881		khz = freq->new;
4882	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4883		khz = cpufreq_quick_get(raw_smp_processor_id());
4884	if (!khz)
4885		khz = tsc_khz;
4886	__this_cpu_write(cpu_tsc_khz, khz);
4887}
4888
4889static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
4890				     void *data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4891{
4892	struct cpufreq_freqs *freq = data;
4893	struct kvm *kvm;
4894	struct kvm_vcpu *vcpu;
4895	int i, send_ipi = 0;
4896
4897	/*
4898	 * We allow guests to temporarily run on slowing clocks,
4899	 * provided we notify them after, or to run on accelerating
4900	 * clocks, provided we notify them before.  Thus time never
4901	 * goes backwards.
4902	 *
4903	 * However, we have a problem.  We can't atomically update
4904	 * the frequency of a given CPU from this function; it is
4905	 * merely a notifier, which can be called from any CPU.
4906	 * Changing the TSC frequency at arbitrary points in time
4907	 * requires a recomputation of local variables related to
4908	 * the TSC for each VCPU.  We must flag these local variables
4909	 * to be updated and be sure the update takes place with the
4910	 * new frequency before any guests proceed.
4911	 *
4912	 * Unfortunately, the combination of hotplug CPU and frequency
4913	 * change creates an intractable locking scenario; the order
4914	 * of when these callouts happen is undefined with respect to
4915	 * CPU hotplug, and they can race with each other.  As such,
4916	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
4917	 * undefined; you can actually have a CPU frequency change take
4918	 * place in between the computation of X and the setting of the
4919	 * variable.  To protect against this problem, all updates of
4920	 * the per_cpu tsc_khz variable are done in an interrupt
4921	 * protected IPI, and all callers wishing to update the value
4922	 * must wait for a synchronous IPI to complete (which is trivial
4923	 * if the caller is on the CPU already).  This establishes the
4924	 * necessary total order on variable updates.
4925	 *
4926	 * Note that because a guest time update may take place
4927	 * anytime after the setting of the VCPU's request bit, the
4928	 * correct TSC value must be set before the request.  However,
4929	 * to ensure the update actually makes it to any guest which
4930	 * starts running in hardware virtualization between the set
4931	 * and the acquisition of the spinlock, we must also ping the
4932	 * CPU after setting the request bit.
4933	 *
4934	 */
4935
4936	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
4937		return 0;
4938	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
4939		return 0;
4940
4941	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4942
4943	raw_spin_lock(&kvm_lock);
4944	list_for_each_entry(kvm, &vm_list, vm_list) {
4945		kvm_for_each_vcpu(i, vcpu, kvm) {
4946			if (vcpu->cpu != freq->cpu)
4947				continue;
4948			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4949			if (vcpu->cpu != smp_processor_id())
4950				send_ipi = 1;
4951		}
4952	}
4953	raw_spin_unlock(&kvm_lock);
4954
4955	if (freq->old < freq->new && send_ipi) {
4956		/*
4957		 * We upscale the frequency.  Must make the guest
4958		 * doesn't see old kvmclock values while running with
4959		 * the new frequency, otherwise we risk the guest sees
4960		 * time go backwards.
4961		 *
4962		 * In case we update the frequency for another cpu
4963		 * (which might be in guest context) send an interrupt
4964		 * to kick the cpu out of guest context.  Next time
4965		 * guest context is entered kvmclock will be updated,
4966		 * so the guest will not see stale values.
4967		 */
4968		smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4969	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4970	return 0;
4971}
4972
4973static struct notifier_block kvmclock_cpufreq_notifier_block = {
4974	.notifier_call  = kvmclock_cpufreq_notifier
4975};
4976
4977static int kvmclock_cpu_notifier(struct notifier_block *nfb,
4978					unsigned long action, void *hcpu)
4979{
4980	unsigned int cpu = (unsigned long)hcpu;
4981
4982	switch (action) {
4983		case CPU_ONLINE:
4984		case CPU_DOWN_FAILED:
4985			smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4986			break;
4987		case CPU_DOWN_PREPARE:
4988			smp_call_function_single(cpu, tsc_bad, NULL, 1);
4989			break;
4990	}
4991	return NOTIFY_OK;
4992}
4993
4994static struct notifier_block kvmclock_cpu_notifier_block = {
4995	.notifier_call  = kvmclock_cpu_notifier,
4996	.priority = -INT_MAX
4997};
4998
4999static void kvm_timer_init(void)
5000{
5001	int cpu;
5002
5003	max_tsc_khz = tsc_khz;
5004	register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5005	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5006#ifdef CONFIG_CPU_FREQ
5007		struct cpufreq_policy policy;
 
 
5008		memset(&policy, 0, sizeof(policy));
5009		cpu = get_cpu();
5010		cpufreq_get_policy(&policy, cpu);
5011		if (policy.cpuinfo.max_freq)
5012			max_tsc_khz = policy.cpuinfo.max_freq;
5013		put_cpu();
5014#endif
5015		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
5016					  CPUFREQ_TRANSITION_NOTIFIER);
5017	}
5018	pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
5019	for_each_online_cpu(cpu)
5020		smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5021}
5022
5023static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
 
5024
5025static int kvm_is_in_guest(void)
5026{
5027	return percpu_read(current_vcpu) != NULL;
5028}
5029
5030static int kvm_is_user_mode(void)
5031{
5032	int user_mode = 3;
5033
5034	if (percpu_read(current_vcpu))
5035		user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
5036
5037	return user_mode != 0;
5038}
5039
5040static unsigned long kvm_get_guest_ip(void)
5041{
5042	unsigned long ip = 0;
5043
5044	if (percpu_read(current_vcpu))
5045		ip = kvm_rip_read(percpu_read(current_vcpu));
5046
5047	return ip;
5048}
5049
 
 
 
 
 
 
 
 
 
5050static struct perf_guest_info_callbacks kvm_guest_cbs = {
5051	.is_in_guest		= kvm_is_in_guest,
5052	.is_user_mode		= kvm_is_user_mode,
5053	.get_guest_ip		= kvm_get_guest_ip,
 
5054};
5055
5056void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
 
5057{
5058	percpu_write(current_vcpu, vcpu);
5059}
5060EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
5061
5062void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
5063{
5064	percpu_write(current_vcpu, NULL);
 
 
 
 
 
 
5065}
5066EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
5067
5068static void kvm_set_mmio_spte_mask(void)
 
 
 
 
 
 
5069{
5070	u64 mask;
5071	int maxphyaddr = boot_cpu_data.x86_phys_bits;
5072
5073	/*
5074	 * Set the reserved bits and the present bit of an paging-structure
5075	 * entry to generate page fault with PFER.RSV = 1.
5076	 */
5077	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
5078	mask |= 1ull;
5079
5080#ifdef CONFIG_X86_64
5081	/*
5082	 * If reserved bit is not supported, clear the present bit to disable
5083	 * mmio page fault.
5084	 */
5085	if (maxphyaddr == 52)
5086		mask &= ~1ull;
5087#endif
5088
5089	kvm_mmu_set_mmio_spte_mask(mask);
5090}
5091
 
 
 
 
 
5092int kvm_arch_init(void *opaque)
5093{
5094	int r;
5095	struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
5096
5097	if (kvm_x86_ops) {
5098		printk(KERN_ERR "kvm: already loaded the other module\n");
5099		r = -EEXIST;
5100		goto out;
5101	}
5102
5103	if (!ops->cpu_has_kvm_support()) {
5104		printk(KERN_ERR "kvm: no hardware support\n");
5105		r = -EOPNOTSUPP;
5106		goto out;
5107	}
5108	if (ops->disabled_by_bios()) {
5109		printk(KERN_ERR "kvm: disabled by bios\n");
5110		r = -EOPNOTSUPP;
5111		goto out;
5112	}
5113
5114	r = kvm_mmu_module_init();
5115	if (r)
 
 
 
 
 
 
5116		goto out;
 
5117
5118	kvm_set_mmio_spte_mask();
5119	kvm_init_msr_list();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5120
5121	kvm_x86_ops = ops;
5122	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
5123			PT_DIRTY_MASK, PT64_NX_MASK, 0);
5124
 
 
 
5125	kvm_timer_init();
5126
5127	perf_register_guest_info_callbacks(&kvm_guest_cbs);
5128
5129	if (cpu_has_xsave)
5130		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
5131
 
 
 
 
 
 
 
 
 
 
5132	return 0;
5133
 
 
 
 
5134out:
5135	return r;
5136}
5137
5138void kvm_arch_exit(void)
5139{
 
 
 
 
 
5140	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5141
5142	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
5143		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
5144					    CPUFREQ_TRANSITION_NOTIFIER);
5145	unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
 
 
 
5146	kvm_x86_ops = NULL;
5147	kvm_mmu_module_exit();
 
 
5148}
5149
5150int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5151{
5152	++vcpu->stat.halt_exits;
5153	if (irqchip_in_kernel(vcpu->kvm)) {
5154		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
5155		return 1;
5156	} else {
5157		vcpu->run->exit_reason = KVM_EXIT_HLT;
5158		return 0;
5159	}
5160}
5161EXPORT_SYMBOL_GPL(kvm_emulate_halt);
5162
5163static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
5164			   unsigned long a1)
5165{
5166	if (is_long_mode(vcpu))
5167		return a0;
5168	else
5169		return a0 | ((gpa_t)a1 << 32);
 
 
5170}
 
5171
5172int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 
 
5173{
5174	u64 param, ingpa, outgpa, ret;
5175	uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
5176	bool fast, longmode;
5177	int cs_db, cs_l;
5178
5179	/*
5180	 * hypercall generates UD from non zero cpl and real mode
5181	 * per HYPER-V spec
5182	 */
5183	if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
5184		kvm_queue_exception(vcpu, UD_VECTOR);
5185		return 0;
5186	}
5187
5188	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5189	longmode = is_long_mode(vcpu) && cs_l == 1;
5190
5191	if (!longmode) {
5192		param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
5193			(kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
5194		ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
5195			(kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
5196		outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
5197			(kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
5198	}
5199#ifdef CONFIG_X86_64
5200	else {
5201		param = kvm_register_read(vcpu, VCPU_REGS_RCX);
5202		ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
5203		outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
5204	}
5205#endif
5206
5207	code = param & 0xffff;
5208	fast = (param >> 16) & 0x1;
5209	rep_cnt = (param >> 32) & 0xfff;
5210	rep_idx = (param >> 48) & 0xfff;
5211
5212	trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
5213
5214	switch (code) {
5215	case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
5216		kvm_vcpu_on_spin(vcpu);
5217		break;
5218	default:
5219		res = HV_STATUS_INVALID_HYPERCALL_CODE;
5220		break;
5221	}
5222
5223	ret = res | (((u64)rep_done & 0xfff) << 32);
5224	if (longmode) {
5225		kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5226	} else {
5227		kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
5228		kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
 
 
 
 
 
 
 
 
 
5229	}
 
 
5230
5231	return 1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5232}
5233
5234int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5235{
5236	unsigned long nr, a0, a1, a2, a3, ret;
5237	int r = 1;
5238
5239	if (kvm_hv_hypercall_enabled(vcpu->kvm))
5240		return kvm_hv_hypercall(vcpu);
5241
5242	nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
5243	a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
5244	a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
5245	a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
5246	a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
5247
5248	trace_kvm_hypercall(nr, a0, a1, a2, a3);
5249
5250	if (!is_long_mode(vcpu)) {
 
5251		nr &= 0xFFFFFFFF;
5252		a0 &= 0xFFFFFFFF;
5253		a1 &= 0xFFFFFFFF;
5254		a2 &= 0xFFFFFFFF;
5255		a3 &= 0xFFFFFFFF;
5256	}
5257
5258	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
5259		ret = -KVM_EPERM;
5260		goto out;
5261	}
5262
5263	switch (nr) {
5264	case KVM_HC_VAPIC_POLL_IRQ:
5265		ret = 0;
5266		break;
5267	case KVM_HC_MMU_OP:
5268		r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5269		break;
5270	default:
5271		ret = -KVM_ENOSYS;
5272		break;
5273	}
5274out:
5275	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
 
 
 
5276	++vcpu->stat.hypercalls;
5277	return r;
5278}
5279EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
5280
5281int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5282{
5283	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5284	char instruction[3];
5285	unsigned long rip = kvm_rip_read(vcpu);
5286
5287	/*
5288	 * Blow out the MMU to ensure that no other VCPU has an active mapping
5289	 * to ensure that the updated hypercall appears atomically across all
5290	 * VCPUs.
5291	 */
5292	kvm_mmu_zap_all(vcpu->kvm);
5293
5294	kvm_x86_ops->patch_hypercall(vcpu, instruction);
5295
5296	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
 
5297}
5298
5299static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
5300{
5301	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
5302	int j, nent = vcpu->arch.cpuid_nent;
 
5303
5304	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
5305	/* when no next entry is found, the current entry[i] is reselected */
5306	for (j = i + 1; ; j = (j + 1) % nent) {
5307		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
5308		if (ej->function == e->function) {
5309			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
5310			return j;
5311		}
5312	}
5313	return 0; /* silence gcc, even though control never reaches here */
 
5314}
5315
5316/* find an entry with matching function, matching index (if needed), and that
5317 * should be read next (if it's stateful) */
5318static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
5319	u32 function, u32 index)
5320{
5321	if (e->function != function)
5322		return 0;
5323	if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
5324		return 0;
5325	if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
5326	    !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
5327		return 0;
5328	return 1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5329}
5330
5331struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
5332					      u32 function, u32 index)
5333{
5334	int i;
5335	struct kvm_cpuid_entry2 *best = NULL;
5336
5337	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
5338		struct kvm_cpuid_entry2 *e;
5339
5340		e = &vcpu->arch.cpuid_entries[i];
5341		if (is_matching_cpuid_entry(e, function, index)) {
5342			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
5343				move_to_next_stateful_cpuid_entry(vcpu, i);
5344			best = e;
5345			break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5346		}
 
 
5347	}
5348	return best;
5349}
5350EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
5351
5352int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
5353{
5354	struct kvm_cpuid_entry2 *best;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5355
5356	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
5357	if (!best || best->eax < 0x80000008)
5358		goto not_found;
5359	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
5360	if (best)
5361		return best->eax & 0xff;
5362not_found:
5363	return 36;
5364}
5365
5366/*
5367 * If no match is found, check whether we exceed the vCPU's limit
5368 * and return the content of the highest valid _standard_ leaf instead.
5369 * This is to satisfy the CPUID specification.
5370 */
5371static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
5372                                                  u32 function, u32 index)
5373{
5374	struct kvm_cpuid_entry2 *maxlevel;
 
 
 
 
 
 
 
 
5375
5376	maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
5377	if (!maxlevel || maxlevel->eax >= function)
5378		return NULL;
5379	if (function & 0x80000000) {
5380		maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
5381		if (!maxlevel)
5382			return NULL;
5383	}
5384	return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
5385}
5386
5387void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
5388{
5389	u32 function, index;
5390	struct kvm_cpuid_entry2 *best;
5391
5392	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
5393	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
5394	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
5395	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
5396	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
5397	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
5398	best = kvm_find_cpuid_entry(vcpu, function, index);
5399
5400	if (!best)
5401		best = check_cpuid_limit(vcpu, function, index);
5402
5403	if (best) {
5404		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
5405		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
5406		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
5407		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
5408	}
5409	kvm_x86_ops->skip_emulated_instruction(vcpu);
5410	trace_kvm_cpuid(function,
5411			kvm_register_read(vcpu, VCPU_REGS_RAX),
5412			kvm_register_read(vcpu, VCPU_REGS_RBX),
5413			kvm_register_read(vcpu, VCPU_REGS_RCX),
5414			kvm_register_read(vcpu, VCPU_REGS_RDX));
5415}
5416EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
5417
5418/*
5419 * Check if userspace requested an interrupt window, and that the
5420 * interrupt window is open.
5421 *
5422 * No need to exit to userspace if we already have an interrupt queued.
5423 */
5424static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
5425{
5426	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
5427		vcpu->run->request_interrupt_window &&
5428		kvm_arch_interrupt_allowed(vcpu));
 
 
 
 
 
 
 
5429}
5430
5431static void post_kvm_run_save(struct kvm_vcpu *vcpu)
5432{
5433	struct kvm_run *kvm_run = vcpu->run;
 
5434
5435	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
5436	kvm_run->cr8 = kvm_get_cr8(vcpu);
5437	kvm_run->apic_base = kvm_get_apic_base(vcpu);
5438	if (irqchip_in_kernel(vcpu->kvm))
5439		kvm_run->ready_for_interrupt_injection = 1;
5440	else
5441		kvm_run->ready_for_interrupt_injection =
5442			kvm_arch_interrupt_allowed(vcpu) &&
5443			!kvm_cpu_has_interrupt(vcpu) &&
5444			!kvm_event_needs_reinjection(vcpu);
 
5445}
5446
5447static void vapic_enter(struct kvm_vcpu *vcpu)
 
5448{
5449	struct kvm_lapic *apic = vcpu->arch.apic;
5450	struct page *page;
 
5451
5452	if (!apic || !apic->vapic_addr)
5453		return;
5454
5455	page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5456
5457	vcpu->arch.apic->vapic_page = page;
 
 
 
 
5458}
5459
5460static void vapic_exit(struct kvm_vcpu *vcpu)
 
5461{
5462	struct kvm_lapic *apic = vcpu->arch.apic;
5463	int idx;
 
 
5464
5465	if (!apic || !apic->vapic_addr)
5466		return;
5467
5468	idx = srcu_read_lock(&vcpu->kvm->srcu);
5469	kvm_release_page_dirty(apic->vapic_page);
5470	mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
5471	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5472}
 
5473
5474static void update_cr8_intercept(struct kvm_vcpu *vcpu)
5475{
5476	int max_irr, tpr;
 
 
 
5477
5478	if (!kvm_x86_ops->update_cr8_intercept)
5479		return;
 
 
 
 
 
 
5480
5481	if (!vcpu->arch.apic)
5482		return;
 
 
 
 
5483
5484	if (!vcpu->arch.apic->vapic_addr)
5485		max_irr = kvm_lapic_find_highest_irr(vcpu);
 
 
 
5486	else
5487		max_irr = -1;
5488
5489	if (max_irr != -1)
5490		max_irr >>= 4;
5491
5492	tpr = kvm_lapic_get_cr8(vcpu);
 
 
5493
5494	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5495}
5496
5497static void inject_pending_event(struct kvm_vcpu *vcpu)
5498{
5499	/* try to reinject previous events if any */
5500	if (vcpu->arch.exception.pending) {
5501		trace_kvm_inj_exception(vcpu->arch.exception.nr,
5502					vcpu->arch.exception.has_error_code,
5503					vcpu->arch.exception.error_code);
5504		kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
5505					  vcpu->arch.exception.has_error_code,
5506					  vcpu->arch.exception.error_code,
5507					  vcpu->arch.exception.reinject);
5508		return;
5509	}
5510
5511	if (vcpu->arch.nmi_injected) {
5512		kvm_x86_ops->set_nmi(vcpu);
 
 
 
 
 
 
5513		return;
 
 
 
 
 
 
 
 
 
 
5514	}
5515
5516	if (vcpu->arch.interrupt.pending) {
5517		kvm_x86_ops->set_irq(vcpu);
 
 
 
 
 
 
 
 
 
5518		return;
5519	}
5520
5521	/* try to inject new event if pending */
5522	if (vcpu->arch.nmi_pending) {
5523		if (kvm_x86_ops->nmi_allowed(vcpu)) {
5524			vcpu->arch.nmi_pending = false;
5525			vcpu->arch.nmi_injected = true;
5526			kvm_x86_ops->set_nmi(vcpu);
5527		}
5528	} else if (kvm_cpu_has_interrupt(vcpu)) {
5529		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
5530			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
5531					    false);
5532			kvm_x86_ops->set_irq(vcpu);
5533		}
5534	}
5535}
5536
5537static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
 
 
5538{
5539	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
5540			!vcpu->guest_xcr0_loaded) {
5541		/* kvm_set_xcr() also depends on this */
5542		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
5543		vcpu->guest_xcr0_loaded = 1;
5544	}
 
 
 
 
 
5545}
5546
5547static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
5548{
5549	if (vcpu->guest_xcr0_loaded) {
5550		if (vcpu->arch.xcr0 != host_xcr0)
5551			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
5552		vcpu->guest_xcr0_loaded = 0;
5553	}
 
 
 
 
 
 
 
 
 
 
 
 
 
5554}
 
5555
 
 
 
 
 
 
 
 
 
 
 
/*
 * Run one guest entry/exit cycle for @vcpu.
 *
 * The sequence is: service pending requests, reload the MMU, inject pending
 * events, switch into guest mode with preemption and interrupts disabled,
 * run the guest, then restore host state and let the vendor module handle
 * the exit.
 *
 * Return value, as consumed by the __vcpu_run() loop (which continues while
 * the value is > 0):
 *   - the result of kvm_x86_ops->handle_exit() after a real guest run;
 *   - 1 when entry was aborted (request/resched/signal pending) or a
 *     synthetic async-page-fault halt was requested;
 *   - 0 after setting run->exit_reason for a TPR access report or a triple
 *     fault;
 *   - a non-zero error from kvm_guest_time_update() or kvm_mmu_reload().
 */
5556static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5557{
5558	int r;
5559	bool nmi_pending;
	/* Userspace wants an interrupt window; only with a userspace irqchip. */
5560	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5561		vcpu->run->request_interrupt_window;
5562
	/* Service every request bit that was posted for this vcpu. */
5563	if (vcpu->requests) {
5564		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
5565			kvm_mmu_unload(vcpu);
5566		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
5567			__kvm_migrate_timers(vcpu);
5568		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5569			r = kvm_guest_time_update(vcpu);
5570			if (unlikely(r))
5571				goto out;
5572		}
5573		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
5574			kvm_mmu_sync_roots(vcpu);
5575		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
5576			kvm_x86_ops->tlb_flush(vcpu);
		/* The next two requests end the run loop (r = 0) with an exit reason. */
5577		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
5578			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
5579			r = 0;
5580			goto out;
5581		}
5582		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
5583			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5584			r = 0;
5585			goto out;
5586		}
5587		if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
5588			vcpu->fpu_active = 0;
5589			kvm_x86_ops->fpu_deactivate(vcpu);
5590		}
5591		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5592			/* Page is swapped out. Do synthetic halt */
5593			vcpu->arch.apf.halted = true;
5594			r = 1;
5595			goto out;
5596		}
5597		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
5598			record_steal_time(vcpu);
5599
5600	}
5601
5602	r = kvm_mmu_reload(vcpu);
5603	if (unlikely(r))
5604		goto out;
5605
5606	/*
5607	 * An NMI can be injected between local nmi_pending read and
5608	 * vcpu->arch.nmi_pending read inside inject_pending_event().
5609	 * But in that case, KVM_REQ_EVENT will be set, which makes
5610	 * the race described above benign.
5611	 */
5612	nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
5613
5614	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5615		inject_pending_event(vcpu);
5616
5617		/* enable NMI/IRQ window open exits if needed */
5618		if (nmi_pending)
5619			kvm_x86_ops->enable_nmi_window(vcpu);
5620		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5621			kvm_x86_ops->enable_irq_window(vcpu);
5622
5623		if (kvm_lapic_enabled(vcpu)) {
5624			update_cr8_intercept(vcpu);
5625			kvm_lapic_sync_to_vapic(vcpu);
5626		}
5627	}
5628
	/* From here until preempt_enable() the task must not be preempted. */
5629	preempt_disable();
5630
5631	kvm_x86_ops->prepare_guest_switch(vcpu);
5632	if (vcpu->fpu_active)
5633		kvm_load_guest_fpu(vcpu);
5634	kvm_load_guest_xcr0(vcpu);
5635
5636	vcpu->mode = IN_GUEST_MODE;
5637
5638	/* We should set ->mode before check ->requests,
5639	 * see the comment in make_all_cpus_request.
5640	 */
5641	smp_mb();
5642
5643	local_irq_disable();
5644
	/*
	 * Last check before entry: if anything needs attention, undo the mode
	 * change and the irq/preempt disabling, cancel the injection and
	 * return 1.
	 */
5645	if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
5646	    || need_resched() || signal_pending(current)) {
5647		vcpu->mode = OUTSIDE_GUEST_MODE;
5648		smp_wmb();
5649		local_irq_enable();
5650		preempt_enable();
5651		kvm_x86_ops->cancel_injection(vcpu);
5652		r = 1;
5653		goto out;
5654	}
5655
	/* The SRCU read side is dropped for the duration of the guest run. */
5656	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5657
5658	kvm_guest_enter();
5659
	/* Load the effective debug registers; DR7 is written as 0 first. */
5660	if (unlikely(vcpu->arch.switch_db_regs)) {
5661		set_debugreg(0, 7);
5662		set_debugreg(vcpu->arch.eff_db[0], 0);
5663		set_debugreg(vcpu->arch.eff_db[1], 1);
5664		set_debugreg(vcpu->arch.eff_db[2], 2);
5665		set_debugreg(vcpu->arch.eff_db[3], 3);
5666	}
5667
5668	trace_kvm_entry(vcpu->vcpu_id);
5669	kvm_x86_ops->run(vcpu);
5670
5671	/*
5672	 * If the guest has used debug registers, at least dr7
5673	 * will be disabled while returning to the host.
5674	 * If we don't have active breakpoints in the host, we don't
5675	 * care about the messed up debug address registers. But if
5676	 * we have some of them active, restore the old state.
5677	 */
5678	if (hw_breakpoint_active())
5679		hw_breakpoint_restore();
5680
	/* Record the guest's TSC MSR value in last_guest_tsc. */
5681	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5682
5683	vcpu->mode = OUTSIDE_GUEST_MODE;
5684	smp_wmb();
5685	local_irq_enable();
5686
5687	++vcpu->stat.exits;
5688
5689	/*
5690	 * We must have an instruction between local_irq_enable() and
5691	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
5692	 * the interrupt shadow.  The stat.exits increment will do nicely.
5693	 * But we need to prevent reordering, hence this barrier():
5694	 */
5695	barrier();
5696
5697	kvm_guest_exit();
5698
5699	preempt_enable();
5700
	/* Re-take the SRCU read side that was dropped before entry. */
5701	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5702
5703	/*
5704	 * Profile KVM exit RIPs:
5705	 */
5706	if (unlikely(prof_on == KVM_PROFILING)) {
5707		unsigned long rip = kvm_rip_read(vcpu);
5708		profile_hit(KVM_PROFILING, (void *)rip);
5709	}
5710
5711
5712	kvm_lapic_sync_from_vapic(vcpu);
5713
	/* The vendor module decides what the exit means; its result is returned. */
5714	r = kvm_x86_ops->handle_exit(vcpu);
5715out:
5716	return r;
5717}
5718
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5719
5720static int __vcpu_run(struct kvm_vcpu *vcpu)
5721{
5722	int r;
5723	struct kvm *kvm = vcpu->kvm;
5724
5725	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
5726		pr_debug("vcpu %d received sipi with vector # %x\n",
5727			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
5728		kvm_lapic_reset(vcpu);
5729		r = kvm_arch_vcpu_reset(vcpu);
5730		if (r)
5731			return r;
5732		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5733	}
5734
5735	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
5736	vapic_enter(vcpu);
5737
5738	r = 1;
5739	while (r > 0) {
5740		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5741		    !vcpu->arch.apf.halted)
5742			r = vcpu_enter_guest(vcpu);
5743		else {
5744			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5745			kvm_vcpu_block(vcpu);
5746			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
5747			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
5748			{
5749				switch(vcpu->arch.mp_state) {
5750				case KVM_MP_STATE_HALTED:
5751					vcpu->arch.mp_state =
5752						KVM_MP_STATE_RUNNABLE;
5753				case KVM_MP_STATE_RUNNABLE:
5754					vcpu->arch.apf.halted = false;
5755					break;
5756				case KVM_MP_STATE_SIPI_RECEIVED:
5757				default:
5758					r = -EINTR;
5759					break;
5760				}
5761			}
5762		}
5763
5764		if (r <= 0)
5765			break;
5766
5767		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
5768		if (kvm_cpu_has_pending_timer(vcpu))
5769			kvm_inject_pending_timer_irqs(vcpu);
5770
5771		if (dm_request_for_irq_injection(vcpu)) {
5772			r = -EINTR;
5773			vcpu->run->exit_reason = KVM_EXIT_INTR;
 
5774			++vcpu->stat.request_irq_exits;
 
5775		}
5776
5777		kvm_check_async_pf_completion(vcpu);
5778
5779		if (signal_pending(current)) {
5780			r = -EINTR;
5781			vcpu->run->exit_reason = KVM_EXIT_INTR;
5782			++vcpu->stat.signal_exits;
 
5783		}
5784		if (need_resched()) {
5785			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5786			kvm_resched(vcpu);
5787			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
5788		}
5789	}
5790
5791	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
5792
5793	vapic_exit(vcpu);
 
 
 
 
 
5794
 
 
 
5795	return r;
5796}
5797
5798static int complete_mmio(struct kvm_vcpu *vcpu)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5799{
5800	struct kvm_run *run = vcpu->run;
5801	int r;
 
5802
5803	if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
5804		return 1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5805
5806	if (vcpu->mmio_needed) {
5807		vcpu->mmio_needed = 0;
5808		if (!vcpu->mmio_is_write)
5809			memcpy(vcpu->mmio_data + vcpu->mmio_index,
5810			       run->mmio.data, 8);
5811		vcpu->mmio_index += 8;
5812		if (vcpu->mmio_index < vcpu->mmio_size) {
5813			run->exit_reason = KVM_EXIT_MMIO;
5814			run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
5815			memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
5816			run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
5817			run->mmio.is_write = vcpu->mmio_is_write;
5818			vcpu->mmio_needed = 1;
5819			return 0;
5820		}
5821		if (vcpu->mmio_is_write)
5822			return 1;
5823		vcpu->mmio_read_completed = 1;
 
5824	}
5825	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5826	r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5827	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5828	if (r != EMULATE_DONE)
5829		return 0;
5830	return 1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5831}
5832
5833int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5834{
5835	int r;
5836	sigset_t sigsaved;
5837
5838	if (!tsk_used_math(current) && init_fpu(current))
5839		return -ENOMEM;
5840
5841	if (vcpu->sigset_active)
5842		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
5843
5844	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 
 
 
 
5845		kvm_vcpu_block(vcpu);
5846		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 
5847		r = -EAGAIN;
 
 
 
 
 
 
 
 
 
 
5848		goto out;
5849	}
5850
 
 
 
 
 
 
5851	/* re-sync apic's tpr */
5852	if (!irqchip_in_kernel(vcpu->kvm)) {
5853		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
5854			r = -EINVAL;
5855			goto out;
5856		}
5857	}
5858
5859	r = complete_mmio(vcpu);
5860	if (r <= 0)
5861		goto out;
5862
5863	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
5864		kvm_register_write(vcpu, VCPU_REGS_RAX,
5865				     kvm_run->hypercall.ret);
 
5866
5867	r = __vcpu_run(vcpu);
 
 
 
5868
5869out:
 
 
 
5870	post_kvm_run_save(vcpu);
5871	if (vcpu->sigset_active)
5872		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
5873
 
5874	return r;
5875}
5876
5877int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5878{
5879	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
5880		/*
5881		 * We are here if userspace calls get_regs() in the middle of
5882		 * instruction emulation. Registers state needs to be copied
5883		 * back from emulation context to vcpu. Usrapace shouldn't do
5884		 * that usually, but some bad designed PV devices (vmware
5885		 * backdoor interface) need this to work
5886		 */
5887		struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5888		memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
5889		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5890	}
5891	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
5892	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
5893	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
5894	regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
5895	regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
5896	regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
5897	regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
5898	regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
5899#ifdef CONFIG_X86_64
5900	regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
5901	regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
5902	regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
5903	regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
5904	regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
5905	regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
5906	regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
5907	regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
5908#endif
5909
5910	regs->rip = kvm_rip_read(vcpu);
5911	regs->rflags = kvm_get_rflags(vcpu);
 
5912
 
 
 
 
 
5913	return 0;
5914}
5915
5916int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5917{
5918	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
5919	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5920
5921	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
5922	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
5923	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
5924	kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
5925	kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
5926	kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
5927	kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
5928	kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
5929#ifdef CONFIG_X86_64
5930	kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
5931	kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
5932	kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
5933	kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
5934	kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
5935	kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
5936	kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
5937	kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
5938#endif
5939
5940	kvm_rip_write(vcpu, regs->rip);
5941	kvm_set_rflags(vcpu, regs->rflags);
5942
5943	vcpu->arch.exception.pending = false;
5944
5945	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
5946
 
 
 
 
 
5947	return 0;
5948}
5949
5950void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
5951{
5952	struct kvm_segment cs;
5953
5954	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
5955	*db = cs.db;
5956	*l = cs.l;
5957}
5958EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
5959
5960int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
5961				  struct kvm_sregs *sregs)
5962{
5963	struct desc_ptr dt;
5964
5965	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
5966	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
5967	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
5968	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
5969	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
5970	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
5971
5972	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
5973	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
5974
5975	kvm_x86_ops->get_idt(vcpu, &dt);
5976	sregs->idt.limit = dt.size;
5977	sregs->idt.base = dt.address;
5978	kvm_x86_ops->get_gdt(vcpu, &dt);
5979	sregs->gdt.limit = dt.size;
5980	sregs->gdt.base = dt.address;
5981
5982	sregs->cr0 = kvm_read_cr0(vcpu);
5983	sregs->cr2 = vcpu->arch.cr2;
5984	sregs->cr3 = kvm_read_cr3(vcpu);
5985	sregs->cr4 = kvm_read_cr4(vcpu);
5986	sregs->cr8 = kvm_get_cr8(vcpu);
5987	sregs->efer = vcpu->arch.efer;
5988	sregs->apic_base = kvm_get_apic_base(vcpu);
5989
5990	memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
5991
5992	if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
5993		set_bit(vcpu->arch.interrupt.nr,
5994			(unsigned long *)sregs->interrupt_bitmap);
 
5995
 
 
 
 
 
 
5996	return 0;
5997}
5998
5999int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6000				    struct kvm_mp_state *mp_state)
6001{
6002	mp_state->mp_state = vcpu->arch.mp_state;
 
 
 
 
 
 
 
 
 
6003	return 0;
6004}
6005
6006int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
6007				    struct kvm_mp_state *mp_state)
6008{
6009	vcpu->arch.mp_state = mp_state->mp_state;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6010	kvm_make_request(KVM_REQ_EVENT, vcpu);
6011	return 0;
 
 
 
 
6012}
6013
6014int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
6015		    bool has_error_code, u32 error_code)
6016{
6017	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6018	int ret;
6019
6020	init_emulate_ctxt(vcpu);
6021
6022	ret = emulator_task_switch(ctxt, tss_selector, reason,
6023				   has_error_code, error_code);
 
 
 
 
 
 
6024
6025	if (ret)
6026		return EMULATE_FAIL;
6027
6028	memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
6029	kvm_rip_write(vcpu, ctxt->eip);
6030	kvm_set_rflags(vcpu, ctxt->eflags);
6031	kvm_make_request(KVM_REQ_EVENT, vcpu);
6032	return EMULATE_DONE;
6033}
6034EXPORT_SYMBOL_GPL(kvm_task_switch);
6035
6036int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6037				  struct kvm_sregs *sregs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6038{
 
6039	int mmu_reset_needed = 0;
 
6040	int pending_vec, max_bits, idx;
6041	struct desc_ptr dt;
 
 
 
 
 
 
 
 
 
6042
6043	dt.size = sregs->idt.limit;
6044	dt.address = sregs->idt.base;
6045	kvm_x86_ops->set_idt(vcpu, &dt);
6046	dt.size = sregs->gdt.limit;
6047	dt.address = sregs->gdt.base;
6048	kvm_x86_ops->set_gdt(vcpu, &dt);
6049
6050	vcpu->arch.cr2 = sregs->cr2;
6051	mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
6052	vcpu->arch.cr3 = sregs->cr3;
6053	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
6054
6055	kvm_set_cr8(vcpu, sregs->cr8);
6056
6057	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
6058	kvm_x86_ops->set_efer(vcpu, sregs->efer);
6059	kvm_set_apic_base(vcpu, sregs->apic_base);
6060
6061	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
6062	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
6063	vcpu->arch.cr0 = sregs->cr0;
6064
6065	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 
 
6066	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
6067	if (sregs->cr4 & X86_CR4_OSXSAVE)
6068		update_cpuid(vcpu);
6069
6070	idx = srcu_read_lock(&vcpu->kvm->srcu);
6071	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
6072		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
6073		mmu_reset_needed = 1;
6074	}
6075	srcu_read_unlock(&vcpu->kvm->srcu, idx);
6076
6077	if (mmu_reset_needed)
6078		kvm_mmu_reset_context(vcpu);
6079
6080	max_bits = (sizeof sregs->interrupt_bitmap) << 3;
6081	pending_vec = find_first_bit(
6082		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
6083	if (pending_vec < max_bits) {
6084		kvm_queue_interrupt(vcpu, pending_vec, false);
6085		pr_debug("Set back pending irq %d\n", pending_vec);
6086	}
6087
6088	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6089	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6090	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6091	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6092	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6093	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6094
6095	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6096	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6097
6098	update_cr8_intercept(vcpu);
6099
6100	/* Older userspace won't unhalt the vcpu on reset. */
6101	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
6102	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
6103	    !is_protmode(vcpu))
6104		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6105
6106	kvm_make_request(KVM_REQ_EVENT, vcpu);
6107
6108	return 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
6109}
6110
6111int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
6112					struct kvm_guest_debug *dbg)
6113{
6114	unsigned long rflags;
6115	int i, r;
6116
 
 
6117	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
6118		r = -EBUSY;
6119		if (vcpu->arch.exception.pending)
6120			goto out;
6121		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
6122			kvm_queue_exception(vcpu, DB_VECTOR);
6123		else
6124			kvm_queue_exception(vcpu, BP_VECTOR);
6125	}
6126
6127	/*
6128	 * Read rflags as long as potentially injected trace flags are still
6129	 * filtered out.
6130	 */
6131	rflags = kvm_get_rflags(vcpu);
6132
6133	vcpu->guest_debug = dbg->control;
6134	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
6135		vcpu->guest_debug = 0;
6136
6137	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
6138		for (i = 0; i < KVM_NR_DB_REGS; ++i)
6139			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
6140		vcpu->arch.switch_db_regs =
6141			(dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
6142	} else {
6143		for (i = 0; i < KVM_NR_DB_REGS; i++)
6144			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
6145		vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
6146	}
 
6147
6148	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6149		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
6150			get_segment_base(vcpu, VCPU_SREG_CS);
6151
6152	/*
6153	 * Trigger an rflags update that will inject or remove the trace
6154	 * flags.
6155	 */
6156	kvm_set_rflags(vcpu, rflags);
6157
6158	kvm_x86_ops->set_guest_debug(vcpu, dbg);
6159
6160	r = 0;
6161
6162out:
6163
6164	return r;
6165}
6166
6167/*
6168 * Translate a guest virtual address to a guest physical address.
6169 */
6170int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
6171				    struct kvm_translation *tr)
6172{
6173	unsigned long vaddr = tr->linear_address;
6174	gpa_t gpa;
6175	int idx;
6176
 
 
6177	idx = srcu_read_lock(&vcpu->kvm->srcu);
6178	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
6179	srcu_read_unlock(&vcpu->kvm->srcu, idx);
6180	tr->physical_address = gpa;
6181	tr->valid = gpa != UNMAPPED_GVA;
6182	tr->writeable = 1;
6183	tr->usermode = 0;
6184
 
6185	return 0;
6186}
6187
6188int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6189{
6190	struct i387_fxsave_struct *fxsave =
6191			&vcpu->arch.guest_fpu.state->fxsave;
6192
 
 
 
6193	memcpy(fpu->fpr, fxsave->st_space, 128);
6194	fpu->fcw = fxsave->cwd;
6195	fpu->fsw = fxsave->swd;
6196	fpu->ftwx = fxsave->twd;
6197	fpu->last_opcode = fxsave->fop;
6198	fpu->last_ip = fxsave->rip;
6199	fpu->last_dp = fxsave->rdp;
6200	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
6201
 
6202	return 0;
6203}
6204
6205int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6206{
6207	struct i387_fxsave_struct *fxsave =
6208			&vcpu->arch.guest_fpu.state->fxsave;
 
 
 
6209
6210	memcpy(fxsave->st_space, fpu->fpr, 128);
6211	fxsave->cwd = fpu->fcw;
6212	fxsave->swd = fpu->fsw;
6213	fxsave->twd = fpu->ftwx;
6214	fxsave->fop = fpu->last_opcode;
6215	fxsave->rip = fpu->last_ip;
6216	fxsave->rdp = fpu->last_dp;
6217	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
6218
 
6219	return 0;
6220}
6221
6222int fx_init(struct kvm_vcpu *vcpu)
6223{
6224	int err;
6225
6226	err = fpu_alloc(&vcpu->arch.guest_fpu);
6227	if (err)
6228		return err;
6229
6230	fpu_finit(&vcpu->arch.guest_fpu);
6231
6232	/*
6233	 * Ensure guest xcr0 is valid for loading
6234	 */
6235	vcpu->arch.xcr0 = XSTATE_FP;
6236
6237	vcpu->arch.cr0 |= X86_CR0_ET;
 
6238
6239	return 0;
 
 
6240}
6241EXPORT_SYMBOL_GPL(fx_init);
6242
6243static void fx_free(struct kvm_vcpu *vcpu)
6244{
6245	fpu_free(&vcpu->arch.guest_fpu);
6246}
6247
6248void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
6249{
6250	if (vcpu->guest_fpu_loaded)
6251		return;
 
 
 
 
 
 
 
 
 
 
 
6252
6253	/*
6254	 * Restore all possible states in the guest,
6255	 * and assume host would use all available bits.
6256	 * Guest xcr0 would be loaded later.
6257	 */
6258	kvm_put_guest_xcr0(vcpu);
6259	vcpu->guest_fpu_loaded = 1;
6260	unlazy_fpu(current);
6261	fpu_restore_checking(&vcpu->arch.guest_fpu);
6262	trace_kvm_fpu(1);
6263}
6264
6265void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
6266{
6267	kvm_put_guest_xcr0(vcpu);
 
 
 
6268
6269	if (!vcpu->guest_fpu_loaded)
6270		return;
 
 
6271
6272	vcpu->guest_fpu_loaded = 0;
6273	fpu_save_init(&vcpu->arch.guest_fpu);
6274	++vcpu->stat.fpu_reload;
6275	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
6276	trace_kvm_fpu(0);
6277}
6278
6279void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
6280{
 
 
6281	kvmclock_reset(vcpu);
6282
6283	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
6284	fx_free(vcpu);
6285	kvm_x86_ops->vcpu_free(vcpu);
 
6286}
6287
6288struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
6289						unsigned int id)
6290{
6291	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
 
 
6292		printk_once(KERN_WARNING
6293		"kvm: SMP vm created on host with unstable TSC; "
6294		"guest TSC will not be reliable\n");
6295	return kvm_x86_ops->vcpu_create(kvm, id);
 
 
 
6296}
6297
6298int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6299{
6300	int r;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6301
6302	vcpu->arch.mtrr_state.have_fixed = 1;
 
6303	vcpu_load(vcpu);
6304	r = kvm_arch_vcpu_reset(vcpu);
6305	if (r == 0)
6306		r = kvm_mmu_setup(vcpu);
 
6307	vcpu_put(vcpu);
6308
6309	return r;
 
 
 
 
 
 
 
 
 
6310}
6311
6312void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
6313{
6314	vcpu->arch.apf.msr_val = 0;
6315
6316	vcpu_load(vcpu);
6317	kvm_mmu_unload(vcpu);
6318	vcpu_put(vcpu);
6319
6320	fx_free(vcpu);
6321	kvm_x86_ops->vcpu_free(vcpu);
6322}
6323
6324int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
6325{
6326	vcpu->arch.nmi_pending = false;
 
 
 
 
 
 
 
6327	vcpu->arch.nmi_injected = false;
 
 
 
6328
6329	vcpu->arch.switch_db_regs = 0;
6330	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
6331	vcpu->arch.dr6 = DR6_FIXED_1;
 
 
6332	vcpu->arch.dr7 = DR7_FIXED_1;
 
 
 
6333
6334	kvm_make_request(KVM_REQ_EVENT, vcpu);
6335	vcpu->arch.apf.msr_val = 0;
6336	vcpu->arch.st.msr_val = 0;
6337
6338	kvmclock_reset(vcpu);
6339
6340	kvm_clear_async_pf_completion_queue(vcpu);
6341	kvm_async_pf_hash_reset(vcpu);
6342	vcpu->arch.apf.halted = false;
6343
6344	return kvm_x86_ops->vcpu_reset(vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6345}
6346
6347int kvm_arch_hardware_enable(void *garbage)
6348{
6349	struct kvm *kvm;
6350	struct kvm_vcpu *vcpu;
6351	int i;
 
 
 
 
6352
6353	kvm_shared_msr_cpu_online();
6354	list_for_each_entry(kvm, &vm_list, vm_list)
6355		kvm_for_each_vcpu(i, vcpu, kvm)
6356			if (vcpu->cpu == smp_processor_id())
 
 
 
 
 
 
6357				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
6358	return kvm_x86_ops->hardware_enable(garbage);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6359}
6360
/*
 * Per-cpu hook run when virtualization is disabled: call the vendor
 * hardware_disable callback, then drop the user-return notifiers.
 * @garbage is passed through unchanged to both calls.
 */
void kvm_arch_hardware_disable(void *garbage)
{
	kvm_x86_ops->hardware_disable(garbage);
	drop_user_return_notifiers(garbage);
}
6366
/* Forward to the vendor hardware_setup callback and return its result. */
int kvm_arch_hardware_setup(void)
{
	return kvm_x86_ops->hardware_setup();
}
6371
/* Forward to the vendor hardware_unsetup callback. */
void kvm_arch_hardware_unsetup(void)
{
	kvm_x86_ops->hardware_unsetup();
}
6376
/*
 * Forward to the vendor check_processor_compatibility callback.
 * @rtn is handed to the callback unchanged; presumably it receives the
 * result of the check (verify against the vendor implementations).
 */
void kvm_arch_check_processor_compat(void *rtn)
{
	kvm_x86_ops->check_processor_compatibility(rtn);
}
6381
 
 
 
6382int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6383{
6384	struct page *page;
6385	struct kvm *kvm;
6386	int r;
6387
6388	BUG_ON(vcpu->kvm == NULL);
6389	kvm = vcpu->kvm;
6390
6391	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6392	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
6393	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
6394	vcpu->arch.mmu.translate_gpa = translate_gpa;
6395	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
6396	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
6397		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6398	else
6399		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
6400
6401	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
6402	if (!page) {
6403		r = -ENOMEM;
6404		goto fail;
6405	}
6406	vcpu->arch.pio_data = page_address(page);
6407
6408	kvm_init_tsc_catchup(vcpu, max_tsc_khz);
6409
6410	r = kvm_mmu_create(vcpu);
6411	if (r < 0)
6412		goto fail_free_pio_data;
6413
6414	if (irqchip_in_kernel(kvm)) {
6415		r = kvm_create_lapic(vcpu);
 
6416		if (r < 0)
6417			goto fail_mmu_destroy;
6418	}
 
6419
6420	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
6421				       GFP_KERNEL);
6422	if (!vcpu->arch.mce_banks) {
6423		r = -ENOMEM;
6424		goto fail_free_lapic;
6425	}
6426	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
6427
6428	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
 
 
6429		goto fail_free_mce_banks;
 
 
 
 
 
 
 
 
 
6430
6431	kvm_async_pf_hash_reset(vcpu);
 
 
 
 
 
 
6432
6433	return 0;
 
6434fail_free_mce_banks:
6435	kfree(vcpu->arch.mce_banks);
6436fail_free_lapic:
6437	kvm_free_lapic(vcpu);
6438fail_mmu_destroy:
6439	kvm_mmu_destroy(vcpu);
6440fail_free_pio_data:
6441	free_page((unsigned long)vcpu->arch.pio_data);
6442fail:
6443	return r;
6444}
6445
/*
 * Release what kvm_arch_vcpu_init() allocated: the MCE bank array, the
 * local APIC, the MMU and the PIO data page.
 *
 * Note that the wbinvd dirty mask is not released here; in this file it
 * is freed by kvm_arch_vcpu_free().
 */
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	int idx;

	kfree(vcpu->arch.mce_banks);
	kvm_free_lapic(vcpu);
	/* kvm_mmu_destroy() is called inside an SRCU read-side section. */
	idx = srcu_read_lock(&vcpu->kvm->srcu);
	kvm_mmu_destroy(vcpu);
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	free_page((unsigned long)vcpu->arch.pio_data);
}
6457
/*
 * Initialise the x86-specific part of a new VM: the list heads for active
 * MMU pages and assigned devices, the irq source bitmap and the TSC write
 * lock.  Always returns 0.
 */
int kvm_arch_init_vm(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);

	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);

	raw_spin_lock_init(&kvm->arch.tsc_write_lock);

	return 0;
}
6470
/* Unload the MMU context of @vcpu; the vcpu is loaded around the call. */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}
6477
6478static void kvm_free_vcpus(struct kvm *kvm)
6479{
6480	unsigned int i;
6481	struct kvm_vcpu *vcpu;
6482
6483	/*
6484	 * Unpin any mmu pages first.
6485	 */
6486	kvm_for_each_vcpu(i, vcpu, kvm) {
6487		kvm_clear_async_pf_completion_queue(vcpu);
6488		kvm_unload_vcpu_mmu(vcpu);
6489	}
6490	kvm_for_each_vcpu(i, vcpu, kvm)
6491		kvm_arch_vcpu_free(vcpu);
6492
6493	mutex_lock(&kvm->lock);
6494	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
6495		kvm->vcpus[i] = NULL;
6496
6497	atomic_set(&kvm->online_vcpus, 0);
6498	mutex_unlock(&kvm->lock);
6499}
6500
/* Free all assigned devices and the PIT of @kvm. */
void kvm_arch_sync_events(struct kvm *kvm)
{
	kvm_free_all_assigned_devices(kvm);
	kvm_free_pit(kvm);
}
6506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Destroy the x86-specific part of @kvm: unmap the guest from the IOMMU,
 * free the in-kernel PIC and IOAPIC, free all vcpus, and drop the
 * references on the APIC access page and the EPT identity page table if
 * they are set.
 */
void kvm_arch_destroy_vm(struct kvm *kvm)
{
	kvm_iommu_unmap_guest(kvm);
	kfree(kvm->arch.vpic);
	kfree(kvm->arch.vioapic);
	kvm_free_vcpus(kvm);
	if (kvm->arch.apic_access_page)
		put_page(kvm->arch.apic_access_page);
	if (kvm->arch.ept_identity_pagetable)
		put_page(kvm->arch.ept_identity_pagetable);
}
6518
6519int kvm_arch_prepare_memory_region(struct kvm *kvm,
6520				struct kvm_memory_slot *memslot,
6521				struct kvm_memory_slot old,
6522				struct kvm_userspace_memory_region *mem,
6523				int user_alloc)
6524{
6525	int npages = memslot->npages;
6526	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
6527
6528	/* Prevent internal slot pages from being moved by fork()/COW. */
6529	if (memslot->id >= KVM_MEMORY_SLOTS)
6530		map_flags = MAP_SHARED | MAP_ANONYMOUS;
6531
6532	/*To keep backward compatibility with older userspace,
6533	 *x86 needs to hanlde !user_alloc case.
6534	 */
6535	if (!user_alloc) {
6536		if (npages && !old.rmap) {
6537			unsigned long userspace_addr;
6538
6539			down_write(&current->mm->mmap_sem);
6540			userspace_addr = do_mmap(NULL, 0,
6541						 npages * PAGE_SIZE,
6542						 PROT_READ | PROT_WRITE,
6543						 map_flags,
6544						 0);
6545			up_write(&current->mm->mmap_sem);
6546
6547			if (IS_ERR((void *)userspace_addr))
6548				return PTR_ERR((void *)userspace_addr);
 
 
 
 
 
6549
6550			memslot->userspace_addr = userspace_addr;
 
 
 
6551		}
6552	}
6553
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6554
6555	return 0;
 
 
 
 
 
 
 
 
 
 
 
 
6556}
6557
6558void kvm_arch_commit_memory_region(struct kvm *kvm,
6559				struct kvm_userspace_memory_region *mem,
6560				struct kvm_memory_slot old,
6561				int user_alloc)
6562{
 
 
 
 
 
 
6563
6564	int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
 
 
 
 
 
 
6565
6566	if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
6567		int ret;
 
 
 
 
 
 
6568
6569		down_write(&current->mm->mmap_sem);
6570		ret = do_munmap(current->mm, old.userspace_addr,
6571				old.npages * PAGE_SIZE);
6572		up_write(&current->mm->mmap_sem);
6573		if (ret < 0)
6574			printk(KERN_WARNING
6575			       "kvm_vm_ioctl_set_memory_region: "
6576			       "failed to munmap memory\n");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6577	}
 
6578
 
 
 
 
 
 
6579	if (!kvm->arch.n_requested_mmu_pages)
6580		nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6581
6582	spin_lock(&kvm->mmu_lock);
6583	if (nr_mmu_pages)
6584		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
6585	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
6586	spin_unlock(&kvm->mmu_lock);
 
 
 
 
 
 
 
6587}
6588
/* Zap all MMU pages of @kvm and make the remote MMUs reload. */
void kvm_arch_flush_shadow(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
	kvm_reload_remote_mmus(kvm);
}
6594
6595int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
6596{
6597	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6598		!vcpu->arch.apf.halted)
6599		|| !list_empty_careful(&vcpu->async_pf.done)
6600		|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
6601		|| vcpu->arch.nmi_pending ||
6602		(kvm_arch_interrupt_allowed(vcpu) &&
6603		 kvm_cpu_has_interrupt(vcpu));
6604}
6605
6606void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
6607{
6608	int me;
6609	int cpu = vcpu->cpu;
6610
6611	if (waitqueue_active(&vcpu->wq)) {
6612		wake_up_interruptible(&vcpu->wq);
6613		++vcpu->stat.halt_wakeup;
6614	}
6615
6616	me = get_cpu();
6617	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
6618		if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
6619			smp_send_reschedule(cpu);
6620	put_cpu();
 
 
 
 
 
 
 
 
 
6621}
6622
/* Return the vendor callback's verdict on whether interrupts are allowed. */
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	return kvm_x86_ops->interrupt_allowed(vcpu);
}
6627
6628bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
6629{
6630	unsigned long current_rip = kvm_rip_read(vcpu) +
6631		get_segment_base(vcpu, VCPU_SREG_CS);
 
 
 
 
6632
6633	return current_rip == linear_rip;
 
 
6634}
6635EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
6636
6637unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
6638{
6639	unsigned long rflags;
6640
6641	rflags = kvm_x86_ops->get_rflags(vcpu);
6642	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6643		rflags &= ~X86_EFLAGS_TF;
6644	return rflags;
6645}
6646EXPORT_SYMBOL_GPL(kvm_get_rflags);
6647
6648void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6649{
6650	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
6651	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
6652		rflags |= X86_EFLAGS_TF;
6653	kvm_x86_ops->set_rflags(vcpu, rflags);
 
 
 
 
 
6654	kvm_make_request(KVM_REQ_EVENT, vcpu);
6655}
6656EXPORT_SYMBOL_GPL(kvm_set_rflags);
6657
6658void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6659{
6660	int r;
6661
6662	if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6663	      is_error_page(work->page))
6664		return;
6665
6666	r = kvm_mmu_reload(vcpu);
6667	if (unlikely(r))
6668		return;
6669
6670	if (!vcpu->arch.mmu.direct_map &&
6671	      work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6672		return;
6673
6674	vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6675}
6676
/*
 * Hash the low 32 bits of @gfn into order_base_2(ASYNC_PF_PER_VCPU) bits,
 * giving the initial key into the per-vcpu apf.gfns table.
 */
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
}
6681
/*
 * Return the key following @key, wrapping around at
 * roundup_pow_of_two(ASYNC_PF_PER_VCPU).
 */
static inline u32 kvm_async_pf_next_probe(u32 key)
{
	return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
}
6686
6687static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6688{
6689	u32 key = kvm_async_pf_hash_fn(gfn);
6690
6691	while (vcpu->arch.apf.gfns[key] != ~0)
6692		key = kvm_async_pf_next_probe(key);
6693
6694	vcpu->arch.apf.gfns[key] = gfn;
6695}
6696
6697static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
6698{
6699	int i;
6700	u32 key = kvm_async_pf_hash_fn(gfn);
6701
6702	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
6703		     (vcpu->arch.apf.gfns[key] != gfn &&
6704		      vcpu->arch.apf.gfns[key] != ~0); i++)
6705		key = kvm_async_pf_next_probe(key);
6706
6707	return key;
6708}
6709
6710bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6711{
6712	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
6713}
6714
/*
 * Remove @gfn from the vcpu's apf.gfns table.
 *
 * The entry found for @gfn is marked free (~0).  Following entries of the
 * probe run are then examined and, where needed, moved back into the
 * freed entry so that they stay reachable from their hashed key; this
 * repeats until a free entry ends the run.
 * NOTE(review): this looks like the usual deletion scheme for a
 * linear-probing hash table - confirm before relying on that.
 */
static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u32 i, j, k;

	/* i: entry being freed; j: probe cursor; k: home key of entry j */
	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
	while (true) {
		vcpu->arch.apf.gfns[i] = ~0;
		do {
			j = kvm_async_pf_next_probe(j);
			if (vcpu->arch.apf.gfns[j] == ~0)
				return;
			k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
			/*
			 * k lies cyclically in ]i,j]
			 * |    i.k.j |
			 * |....j i.k.| or  |.k..j i...|
			 */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		/* Entry j may not stay behind the hole: move it into i. */
		vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
		i = j;
	}
}
6737
/*
 * Write the 32-bit value @val to the guest through the cached mapping in
 * vcpu->arch.apf.data.  Returns the result of kvm_write_guest_cached();
 * the callers in this file treat 0 as success.
 */
static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
{

	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
				      sizeof(val));
}
6744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6745void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
6746				     struct kvm_async_pf *work)
6747{
6748	struct x86_exception fault;
6749
6750	trace_kvm_async_pf_not_present(work->arch.token, work->gva);
6751	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
6752
6753	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
6754	    (vcpu->arch.apf.send_user_only &&
6755	     kvm_x86_ops->get_cpl(vcpu) == 0))
6756		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
6757	else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
6758		fault.vector = PF_VECTOR;
6759		fault.error_code_valid = true;
6760		fault.error_code = 0;
6761		fault.nested_page_fault = false;
6762		fault.address = work->arch.token;
 
6763		kvm_inject_page_fault(vcpu, &fault);
 
 
 
 
 
 
 
 
 
 
6764	}
6765}
6766
6767void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
6768				 struct kvm_async_pf *work)
6769{
6770	struct x86_exception fault;
 
6771
6772	trace_kvm_async_pf_ready(work->arch.token, work->gva);
6773	if (is_error_page(work->page))
6774		work->arch.token = ~0; /* broadcast wakeup */
6775	else
6776		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
 
6777
6778	if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
6779	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
6780		fault.vector = PF_VECTOR;
6781		fault.error_code_valid = true;
6782		fault.error_code = 0;
6783		fault.nested_page_fault = false;
6784		fault.address = work->arch.token;
6785		kvm_inject_page_fault(vcpu, &fault);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6786	}
6787	vcpu->arch.apf.halted = false;
 
6788}
6789
6790bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
6791{
6792	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
6793		return true;
6794	else
6795		return !kvm_event_needs_reinjection(vcpu) &&
6796			kvm_x86_ops->interrupt_allowed(vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6797}
6798
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6799EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 
6800EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
6801EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
6802EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
6803EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
6804EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
6805EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
6806EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
6807EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
 
6808EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
6809EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
6810EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);

    1// SPDX-License-Identifier: GPL-2.0-only
    2/*
    3 * Kernel-based Virtual Machine driver for Linux
    4 *
    5 * derived from drivers/kvm/kvm_main.c
    6 *
    7 * Copyright (C) 2006 Qumranet, Inc.
    8 * Copyright (C) 2008 Qumranet, Inc.
    9 * Copyright IBM Corporation, 2008
   10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
   11 *
   12 * Authors:
   13 *   Avi Kivity   <avi@qumranet.com>
   14 *   Yaniv Kamay  <yaniv@qumranet.com>
   15 *   Amit Shah    <amit.shah@qumranet.com>
   16 *   Ben-Ami Yassour <benami@il.ibm.com>
 
 
 
 
   17 */
   18
   19#include <linux/kvm_host.h>
   20#include "irq.h"
   21#include "mmu.h"
   22#include "i8254.h"
   23#include "tss.h"
   24#include "kvm_cache_regs.h"
   25#include "x86.h"
   26#include "cpuid.h"
   27#include "pmu.h"
   28#include "hyperv.h"
   29
   30#include <linux/clocksource.h>
   31#include <linux/interrupt.h>
   32#include <linux/kvm.h>
   33#include <linux/fs.h>
   34#include <linux/vmalloc.h>
   35#include <linux/export.h>
   36#include <linux/moduleparam.h>
   37#include <linux/mman.h>
   38#include <linux/highmem.h>
   39#include <linux/iommu.h>
   40#include <linux/intel-iommu.h>
   41#include <linux/cpufreq.h>
   42#include <linux/user-return-notifier.h>
   43#include <linux/srcu.h>
   44#include <linux/slab.h>
   45#include <linux/perf_event.h>
   46#include <linux/uaccess.h>
   47#include <linux/hash.h>
   48#include <linux/pci.h>
   49#include <linux/timekeeper_internal.h>
   50#include <linux/pvclock_gtod.h>
   51#include <linux/kvm_irqfd.h>
   52#include <linux/irqbypass.h>
   53#include <linux/sched/stat.h>
   54#include <linux/sched/isolation.h>
   55#include <linux/mem_encrypt.h>
   56
   57#include <trace/events/kvm.h>
 
   58
   59#include <asm/debugreg.h>
   60#include <asm/msr.h>
   61#include <asm/desc.h>
 
   62#include <asm/mce.h>
   63#include <linux/kernel_stat.h>
   64#include <asm/fpu/internal.h> /* Ugh! */
   65#include <asm/pvclock.h>
   66#include <asm/div64.h>
   67#include <asm/irq_remapping.h>
   68#include <asm/mshyperv.h>
   69#include <asm/hypervisor.h>
   70#include <asm/intel_pt.h>
   71#include <clocksource/hyperv_timer.h>
   72
   73#define CREATE_TRACE_POINTS
   74#include "trace.h"
   75
   76#define MAX_IO_MSRS 256
   77#define KVM_MAX_MCE_BANKS 32
   78u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
   79EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
   80
   81#define emul_to_vcpu(ctxt) \
   82	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
   83
/* EFER defaults:
 * - enable syscall by default because it is emulated by KVM
 * - enable LME and LMA by default on 64 bit KVM
 */
   88#ifdef CONFIG_X86_64
   89static
   90u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
   91#else
   92static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
   93#endif
   94
   95#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
   96#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
   97
   98#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
   99                                    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
  100
  101static void update_cr8_intercept(struct kvm_vcpu *vcpu);
  102static void process_nmi(struct kvm_vcpu *vcpu);
  103static void enter_smm(struct kvm_vcpu *vcpu);
  104static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
  105static void store_regs(struct kvm_vcpu *vcpu);
  106static int sync_regs(struct kvm_vcpu *vcpu);
  107
  108struct kvm_x86_ops *kvm_x86_ops __read_mostly;
  109EXPORT_SYMBOL_GPL(kvm_x86_ops);
  110
  111static bool __read_mostly ignore_msrs = 0;
  112module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
  113
  114static bool __read_mostly report_ignored_msrs = true;
  115module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
  116
  117unsigned int min_timer_period_us = 200;
  118module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
  119
  120static bool __read_mostly kvmclock_periodic_sync = true;
  121module_param(kvmclock_periodic_sync, bool, S_IRUGO);
  122
  123bool __read_mostly kvm_has_tsc_control;
  124EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
  125u32  __read_mostly kvm_max_guest_tsc_khz;
  126EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
  127u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
  128EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
  129u64  __read_mostly kvm_max_tsc_scaling_ratio;
  130EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
  131u64 __read_mostly kvm_default_tsc_scaling_ratio;
  132EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
  133
  134/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
  135static u32 __read_mostly tsc_tolerance_ppm = 250;
  136module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
  137
/*
 * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
 * adaptive tuning starting from default advancement of 1000ns.  '0' disables
 * advancement entirely.  Any other value is used as-is and disables adaptive
 * tuning, i.e. allows privileged userspace to set an exact advancement time.
 */
  144static int __read_mostly lapic_timer_advance_ns = -1;
  145module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
  146
  147static bool __read_mostly vector_hashing = true;
  148module_param(vector_hashing, bool, S_IRUGO);
  149
  150bool __read_mostly enable_vmware_backdoor = false;
  151module_param(enable_vmware_backdoor, bool, S_IRUGO);
  152EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
  153
  154static bool __read_mostly force_emulation_prefix = false;
  155module_param(force_emulation_prefix, bool, S_IRUGO);
  156
  157int __read_mostly pi_inject_timer = -1;
  158module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
  159
  160#define KVM_NR_SHARED_MSRS 16
  161
/*
 * Global list of shared MSRs: @msrs holds the MSR index registered for
 * each slot, @nr is the number of slots in use (highest slot + 1).
 */
struct kvm_shared_msrs_global {
	int nr;
	u32 msrs[KVM_NR_SHARED_MSRS];
};
  166
/*
 * Per-cpu shared MSR state.
 *
 * @urn:        user-return notifier used to restore host values
 * @registered: true while @urn is registered
 * @values:     per slot, the host value and the value last written (curr)
 */
struct kvm_shared_msrs {
	struct user_return_notifier urn;
	bool registered;
	struct kvm_shared_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_NR_SHARED_MSRS];
};
  175
  176static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
  177static struct kvm_shared_msrs __percpu *shared_msrs;
  178
/*
 * Table of exported statistics.  Each entry pairs a name with the offset
 * and kind produced by VCPU_STAT() (field of kvm_vcpu.stat) or VM_STAT()
 * (field of kvm.stat); the table is terminated by a NULL entry.
 * Presumably consumed by the generic KVM debugfs code - confirm there.
 */
struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "req_event", VCPU_STAT(req_event) },
	{ "l1d_flush", VCPU_STAT(l1d_flush) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	/* The large page counters are exposed with mode 0444. */
	{ "largepages", VM_STAT(lpages, .mode = 0444) },
	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
	{ "max_mmu_page_hash_collisions",
		VM_STAT(max_mmu_page_hash_collisions) },
	{ NULL }
};
  221
  222u64 __read_mostly host_xcr0;
  223
  224struct kmem_cache *x86_fpu_cache;
  225EXPORT_SYMBOL_GPL(x86_fpu_cache);
  226
  227static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
  228
  229static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
  230{
  231	int i;
  232	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
  233		vcpu->arch.apf.gfns[i] = ~0;
  234}
  235
  236static void kvm_on_user_return(struct user_return_notifier *urn)
  237{
  238	unsigned slot;
  239	struct kvm_shared_msrs *locals
  240		= container_of(urn, struct kvm_shared_msrs, urn);
  241	struct kvm_shared_msr_values *values;
  242	unsigned long flags;
  243
  244	/*
  245	 * Disabling irqs at this point since the following code could be
  246	 * interrupted and executed through kvm_arch_hardware_disable()
  247	 */
  248	local_irq_save(flags);
  249	if (locals->registered) {
  250		locals->registered = false;
  251		user_return_notifier_unregister(urn);
  252	}
  253	local_irq_restore(flags);
  254	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
  255		values = &locals->values[slot];
  256		if (values->host != values->curr) {
  257			wrmsrl(shared_msrs_global.msrs[slot], values->host);
  258			values->curr = values->host;
  259		}
  260	}
 
 
  261}
  262
  263static void shared_msr_update(unsigned slot, u32 msr)
  264{
 
  265	u64 value;
  266	unsigned int cpu = smp_processor_id();
  267	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
  268
 
  269	/* only read, and nobody should modify it at this time,
  270	 * so don't need lock */
  271	if (slot >= shared_msrs_global.nr) {
  272		printk(KERN_ERR "kvm: invalid MSR slot!");
  273		return;
  274	}
  275	rdmsrl_safe(msr, &value);
  276	smsr->values[slot].host = value;
  277	smsr->values[slot].curr = value;
  278}
  279
  280void kvm_define_shared_msr(unsigned slot, u32 msr)
  281{
  282	BUG_ON(slot >= KVM_NR_SHARED_MSRS);
  283	shared_msrs_global.msrs[slot] = msr;
  284	if (slot >= shared_msrs_global.nr)
  285		shared_msrs_global.nr = slot + 1;
 
 
 
  286}
  287EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
  288
  289static void kvm_shared_msr_cpu_online(void)
  290{
  291	unsigned i;
  292
  293	for (i = 0; i < shared_msrs_global.nr; ++i)
  294		shared_msr_update(i, shared_msrs_global.msrs[i]);
  295}
  296
  297int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
  298{
  299	unsigned int cpu = smp_processor_id();
  300	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
  301	int err;
  302
  303	if (((value ^ smsr->values[slot].curr) & mask) == 0)
  304		return 0;
  305	smsr->values[slot].curr = value;
  306	err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
  307	if (err)
  308		return 1;
  309
  310	if (!smsr->registered) {
  311		smsr->urn.on_user_return = kvm_on_user_return;
  312		user_return_notifier_register(&smsr->urn);
  313		smsr->registered = true;
  314	}
  315	return 0;
  316}
  317EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
  318
  319static void drop_user_return_notifiers(void)
  320{
  321	unsigned int cpu = smp_processor_id();
  322	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
  323
  324	if (smsr->registered)
  325		kvm_on_user_return(&smsr->urn);
  326}
  327
/* Return the cached APIC base value of @vcpu. */
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}
  332EXPORT_SYMBOL_GPL(kvm_get_apic_base);
  333
/* Return the local APIC mode derived from the vcpu's APIC base value. */
enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
{
	return kvm_apic_mode(kvm_get_apic_base(vcpu));
}
  338EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
  339
  340int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  341{
  342	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
  343	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
  344	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
  345		(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
  346
  347	if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
  348		return 1;
  349	if (!msr_info->host_initiated) {
  350		if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
  351			return 1;
  352		if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
  353			return 1;
  354	}
  355
  356	kvm_lapic_set_base(vcpu, msr_info->data);
  357	return 0;
  358}
  359EXPORT_SYMBOL_GPL(kvm_set_apic_base);
  360
/*
 * Handler for a fault that is only expected while KVM is rebooting;
 * triggers BUG() if kvm_rebooting is not set.
 */
asmlinkage __visible void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG_ON(!kvm_rebooting);
}
  366EXPORT_SYMBOL_GPL(kvm_spurious_fault);
  367
  368#define EXCPT_BENIGN		0
  369#define EXCPT_CONTRIBUTORY	1
  370#define EXCPT_PF		2
  371
  372static int exception_class(int vector)
  373{
  374	switch (vector) {
  375	case PF_VECTOR:
  376		return EXCPT_PF;
  377	case DE_VECTOR:
  378	case TS_VECTOR:
  379	case NP_VECTOR:
  380	case SS_VECTOR:
  381	case GP_VECTOR:
  382		return EXCPT_CONTRIBUTORY;
  383	default:
  384		break;
  385	}
  386	return EXCPT_BENIGN;
  387}
  388
  389#define EXCPT_FAULT		0
  390#define EXCPT_TRAP		1
  391#define EXCPT_ABORT		2
  392#define EXCPT_INTERRUPT		3
  393
  394static int exception_type(int vector)
  395{
  396	unsigned int mask;
  397
  398	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
  399		return EXCPT_INTERRUPT;
  400
  401	mask = 1 << vector;
  402
  403	/* #DB is trap, as instruction watchpoints are handled elsewhere */
  404	if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
  405		return EXCPT_TRAP;
  406
  407	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
  408		return EXCPT_ABORT;
  409
  410	/* Reserved exceptions will result in fault */
  411	return EXCPT_FAULT;
  412}
  413
/*
 * Deliver the payload of the vcpu's recorded exception, if it has one:
 * for #DB the payload is merged into DR6, for #PF it is written to CR2.
 * Afterwards the payload is marked as consumed.  Exceptions without a
 * payload are left untouched.
 */
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
{
	unsigned nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (!has_payload)
		return;

	switch (nr) {
	case DB_VECTOR:
		/*
		 * "Certain debug exceptions may clear bit 0-3.  The
		 * remaining contents of the DR6 register are never
		 * cleared by the processor".
		 */
		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
		/*
		 * DR6.RTM is set by all #DB exceptions that don't clear it.
		 */
		vcpu->arch.dr6 |= DR6_RTM;
		vcpu->arch.dr6 |= payload;
		/*
		 * Bit 16 should be set in the payload whenever the #DB
		 * exception should clear DR6.RTM. This makes the payload
		 * compatible with the pending debug exceptions under VMX.
		 * Though not currently documented in the SDM, this also
		 * makes the payload compatible with the exit qualification
		 * for #DB exceptions under VMX.
		 */
		vcpu->arch.dr6 ^= payload & DR6_RTM;
		break;
	case PF_VECTOR:
		vcpu->arch.cr2 = payload;
		break;
	}

	/* The payload has been delivered (or does not apply); drop it. */
	vcpu->arch.exception.has_payload = false;
	vcpu->arch.exception.payload = 0;
}
  454EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
  455
  456static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
  457		unsigned nr, bool has_error, u32 error_code,
  458	        bool has_payload, unsigned long payload, bool reinject)
  459{
  460	u32 prev_nr;
  461	int class1, class2;
  462
  463	kvm_make_request(KVM_REQ_EVENT, vcpu);
  464
  465	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
  466	queue:
  467		if (has_error && !is_protmode(vcpu))
  468			has_error = false;
  469		if (reinject) {
  470			/*
  471			 * On vmentry, vcpu->arch.exception.pending is only
  472			 * true if an event injection was blocked by
  473			 * nested_run_pending.  In that case, however,
  474			 * vcpu_enter_guest requests an immediate exit,
  475			 * and the guest shouldn't proceed far enough to
  476			 * need reinjection.
  477			 */
  478			WARN_ON_ONCE(vcpu->arch.exception.pending);
  479			vcpu->arch.exception.injected = true;
  480			if (WARN_ON_ONCE(has_payload)) {
  481				/*
  482				 * A reinjected event has already
  483				 * delivered its payload.
  484				 */
  485				has_payload = false;
  486				payload = 0;
  487			}
  488		} else {
  489			vcpu->arch.exception.pending = true;
  490			vcpu->arch.exception.injected = false;
  491		}
  492		vcpu->arch.exception.has_error_code = has_error;
  493		vcpu->arch.exception.nr = nr;
  494		vcpu->arch.exception.error_code = error_code;
  495		vcpu->arch.exception.has_payload = has_payload;
  496		vcpu->arch.exception.payload = payload;
  497		/*
  498		 * In guest mode, payload delivery should be deferred,
  499		 * so that the L1 hypervisor can intercept #PF before
  500		 * CR2 is modified (or intercept #DB before DR6 is
  501		 * modified under nVMX).  However, for ABI
  502		 * compatibility with KVM_GET_VCPU_EVENTS and
  503		 * KVM_SET_VCPU_EVENTS, we can't delay payload
  504		 * delivery unless userspace has enabled this
  505		 * functionality via the per-VM capability,
  506		 * KVM_CAP_EXCEPTION_PAYLOAD.
  507		 */
  508		if (!vcpu->kvm->arch.exception_payload_enabled ||
  509		    !is_guest_mode(vcpu))
  510			kvm_deliver_exception_payload(vcpu);
  511		return;
  512	}
  513
  514	/* to check exception */
  515	prev_nr = vcpu->arch.exception.nr;
  516	if (prev_nr == DF_VECTOR) {
  517		/* triple fault -> shutdown */
  518		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
  519		return;
  520	}
  521	class1 = exception_class(prev_nr);
  522	class2 = exception_class(nr);
  523	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
  524		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
  525		/*
  526		 * Generate double fault per SDM Table 5-5.  Set
  527		 * exception.pending = true so that the double fault
  528		 * can trigger a nested vmexit.
  529		 */
  530		vcpu->arch.exception.pending = true;
  531		vcpu->arch.exception.injected = false;
  532		vcpu->arch.exception.has_error_code = true;
  533		vcpu->arch.exception.nr = DF_VECTOR;
  534		vcpu->arch.exception.error_code = 0;
  535		vcpu->arch.exception.has_payload = false;
  536		vcpu->arch.exception.payload = 0;
  537	} else
  538		/* replace previous exception with a new one in a hope
  539		   that instruction re-execution will regenerate lost
  540		   exception */
  541		goto queue;
  542}
  543
  544void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
  545{
  546	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
  547}
  548EXPORT_SYMBOL_GPL(kvm_queue_exception);
  549
  550void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
  551{
  552	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
  553}
  554EXPORT_SYMBOL_GPL(kvm_requeue_exception);
  555
  556static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
  557				  unsigned long payload)
  558{
  559	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
  560}
  561
  562static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
  563				    u32 error_code, unsigned long payload)
  564{
  565	kvm_multiple_exception(vcpu, nr, true, error_code,
  566			       true, payload, false);
  567}
  568
  569int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
  570{
  571	if (err)
  572		kvm_inject_gp(vcpu, 0);
  573	else
  574		return kvm_skip_emulated_instruction(vcpu);
  575
  576	return 1;
  577}
  578EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
  579
  580void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
  581{
  582	++vcpu->stat.pf_guest;
  583	vcpu->arch.exception.nested_apf =
  584		is_guest_mode(vcpu) && fault->async_page_fault;
  585	if (vcpu->arch.exception.nested_apf) {
  586		vcpu->arch.apf.nested_apf_token = fault->address;
  587		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
  588	} else {
  589		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
  590					fault->address);
  591	}
  592}
  593EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
  594
  595static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
  596{
  597	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
  598		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
  599	else
  600		vcpu->arch.mmu->inject_page_fault(vcpu, fault);
  601
  602	return fault->nested_page_fault;
  603}
  604
  605void kvm_inject_nmi(struct kvm_vcpu *vcpu)
  606{
  607	atomic_inc(&vcpu->arch.nmi_queued);
  608	kvm_make_request(KVM_REQ_NMI, vcpu);
  609}
  610EXPORT_SYMBOL_GPL(kvm_inject_nmi);
  611
  612void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
  613{
  614	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
  615}
  616EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
  617
  618void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
  619{
  620	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
  621}
  622EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
  623
  624/*
  625 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
  626 * a #GP and return false.
  627 */
  628bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
  629{
  630	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
  631		return true;
  632	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
  633	return false;
  634}
  635EXPORT_SYMBOL_GPL(kvm_require_cpl);
  636
  637bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
  638{
  639	if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
  640		return true;
  641
  642	kvm_queue_exception(vcpu, UD_VECTOR);
  643	return false;
  644}
  645EXPORT_SYMBOL_GPL(kvm_require_dr);
  646
  647/*
  648 * This function will be used to read from the physical memory of the currently
  649 * running guest. The difference to kvm_vcpu_read_guest_page is that this function
  650 * can read from guest physical or from the guest's guest physical memory.
  651 */
  652int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
  653			    gfn_t ngfn, void *data, int offset, int len,
  654			    u32 access)
  655{
  656	struct x86_exception exception;
  657	gfn_t real_gfn;
  658	gpa_t ngpa;
  659
  660	ngpa     = gfn_to_gpa(ngfn);
  661	real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
  662	if (real_gfn == UNMAPPED_GVA)
  663		return -EFAULT;
  664
  665	real_gfn = gpa_to_gfn(real_gfn);
  666
  667	return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
  668}
  669EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
  670
  671static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
  672			       void *data, int offset, int len, u32 access)
  673{
  674	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
  675				       data, offset, len, access);
  676}
  677
  678static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
  679{
  680	return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
  681	       rsvd_bits(1, 2);
  682}
  683
  684/*
  685 * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
  686 */
  687int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
  688{
  689	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
  690	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
  691	int i;
  692	int ret;
  693	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
  694
  695	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
  696				      offset * sizeof(u64), sizeof(pdpte),
  697				      PFERR_USER_MASK|PFERR_WRITE_MASK);
  698	if (ret < 0) {
  699		ret = 0;
  700		goto out;
  701	}
  702	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
  703		if ((pdpte[i] & PT_PRESENT_MASK) &&
  704		    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
  705			ret = 0;
  706			goto out;
  707		}
  708	}
  709	ret = 1;
  710
  711	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
  712	__set_bit(VCPU_EXREG_PDPTR,
  713		  (unsigned long *)&vcpu->arch.regs_avail);
  714	__set_bit(VCPU_EXREG_PDPTR,
  715		  (unsigned long *)&vcpu->arch.regs_dirty);
  716out:
  717
  718	return ret;
  719}
  720EXPORT_SYMBOL_GPL(load_pdptrs);
  721
  722bool pdptrs_changed(struct kvm_vcpu *vcpu)
  723{
  724	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
  725	bool changed = true;
  726	int offset;
  727	gfn_t gfn;
  728	int r;
  729
  730	if (!is_pae_paging(vcpu))
  731		return false;
  732
  733	if (!test_bit(VCPU_EXREG_PDPTR,
  734		      (unsigned long *)&vcpu->arch.regs_avail))
  735		return true;
  736
  737	gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
  738	offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
  739	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
  740				       PFERR_USER_MASK | PFERR_WRITE_MASK);
  741	if (r < 0)
  742		goto out;
  743	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
  744out:
  745
  746	return changed;
  747}
  748EXPORT_SYMBOL_GPL(pdptrs_changed);
  749
  750int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
  751{
  752	unsigned long old_cr0 = kvm_read_cr0(vcpu);
  753	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
 
  754
  755	cr0 |= X86_CR0_ET;
  756
  757#ifdef CONFIG_X86_64
  758	if (cr0 & 0xffffffff00000000UL)
  759		return 1;
  760#endif
  761
  762	cr0 &= ~CR0_RESERVED_BITS;
  763
  764	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
  765		return 1;
  766
  767	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
  768		return 1;
  769
  770	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
  771#ifdef CONFIG_X86_64
  772		if ((vcpu->arch.efer & EFER_LME)) {
  773			int cs_db, cs_l;
  774
  775			if (!is_pae(vcpu))
  776				return 1;
  777			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
  778			if (cs_l)
  779				return 1;
  780		} else
  781#endif
  782		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
  783						 kvm_read_cr3(vcpu)))
  784			return 1;
  785	}
  786
  787	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
  788		return 1;
  789
  790	kvm_x86_ops->set_cr0(vcpu, cr0);
  791
  792	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
  793		kvm_clear_async_pf_completion_queue(vcpu);
  794		kvm_async_pf_hash_reset(vcpu);
  795	}
  796
  797	if ((cr0 ^ old_cr0) & update_bits)
  798		kvm_mmu_reset_context(vcpu);
  799
  800	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
  801	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
  802	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
  803		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
  804
  805	return 0;
  806}
  807EXPORT_SYMBOL_GPL(kvm_set_cr0);
  808
  809void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
  810{
  811	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
  812}
  813EXPORT_SYMBOL_GPL(kvm_lmsw);
  814
  815void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
  816{
  817	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
  818			!vcpu->guest_xcr0_loaded) {
  819		/* kvm_set_xcr() also depends on this */
  820		if (vcpu->arch.xcr0 != host_xcr0)
  821			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
  822		vcpu->guest_xcr0_loaded = 1;
  823	}
  824}
  825EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
  826
  827void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
  828{
  829	if (vcpu->guest_xcr0_loaded) {
  830		if (vcpu->arch.xcr0 != host_xcr0)
  831			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
  832		vcpu->guest_xcr0_loaded = 0;
  833	}
  834}
  835EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
  836
  837static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
  838{
  839	u64 xcr0 = xcr;
  840	u64 old_xcr0 = vcpu->arch.xcr0;
  841	u64 valid_bits;
  842
  843	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
  844	if (index != XCR_XFEATURE_ENABLED_MASK)
  845		return 1;
  846	if (!(xcr0 & XFEATURE_MASK_FP))
 
  847		return 1;
  848	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
  849		return 1;
  850
  851	/*
  852	 * Do not allow the guest to set bits that we do not support
  853	 * saving.  However, xcr0 bit 0 is always set, even if the
  854	 * emulated CPU does not support XSAVE (see fx_init).
  855	 */
  856	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
  857	if (xcr0 & ~valid_bits)
  858		return 1;
  859
  860	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
  861	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
  862		return 1;
  863
  864	if (xcr0 & XFEATURE_MASK_AVX512) {
  865		if (!(xcr0 & XFEATURE_MASK_YMM))
  866			return 1;
  867		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
  868			return 1;
  869	}
  870	vcpu->arch.xcr0 = xcr0;
  871
  872	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
  873		kvm_update_cpuid(vcpu);
  874	return 0;
  875}
  876
  877int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
  878{
  879	if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
  880	    __kvm_set_xcr(vcpu, index, xcr)) {
  881		kvm_inject_gp(vcpu, 0);
  882		return 1;
  883	}
  884	return 0;
  885}
  886EXPORT_SYMBOL_GPL(kvm_set_xcr);
  887
  888static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  889{
  890	if (cr4 & CR4_RESERVED_BITS)
  891		return -EINVAL;
  892
  893	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
  894		return -EINVAL;
 
  895
  896	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
  897		return -EINVAL;
 
  898
  899	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
  900		return -EINVAL;
 
  901
  902	if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
  903		return -EINVAL;
 
  904
  905	if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
  906		return -EINVAL;
 
  907
  908	if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
  909		return -EINVAL;
 
  910
  911	if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
  912		return -EINVAL;
 
  913
  914	return 0;
 
 
 
 
 
  915}
  916
  917int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  918{
  919	unsigned long old_cr4 = kvm_read_cr4(vcpu);
  920	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
  921				   X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
 
 
  922
  923	if (kvm_valid_cr4(vcpu, cr4))
 
 
 
 
 
 
  924		return 1;
  925
  926	if (is_long_mode(vcpu)) {
  927		if (!(cr4 & X86_CR4_PAE))
  928			return 1;
  929	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
  930		   && ((cr4 ^ old_cr4) & pdptr_bits)
  931		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
  932				   kvm_read_cr3(vcpu)))
  933		return 1;
  934
  935	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
  936		if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
  937			return 1;
  938
  939		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
  940		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
  941			return 1;
  942	}
  943
  944	if (kvm_x86_ops->set_cr4(vcpu, cr4))
  945		return 1;
  946
  947	if (((cr4 ^ old_cr4) & pdptr_bits) ||
  948	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
  949		kvm_mmu_reset_context(vcpu);
  950
  951	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
  952		kvm_update_cpuid(vcpu);
  953
  954	return 0;
  955}
  956EXPORT_SYMBOL_GPL(kvm_set_cr4);
  957
  958int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  959{
  960	bool skip_tlb_flush = false;
  961#ifdef CONFIG_X86_64
  962	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
  963
  964	if (pcid_enabled) {
  965		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
  966		cr3 &= ~X86_CR3_PCID_NOFLUSH;
  967	}
  968#endif
  969
  970	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
  971		if (!skip_tlb_flush) {
  972			kvm_mmu_sync_roots(vcpu);
  973			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 
 
 
 
 
 
  974		}
  975		return 0;
 
 
 
  976	}
  977
  978	if (is_long_mode(vcpu) &&
  979	    (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
 
 
 
 
 
 
 
 
  980		return 1;
  981	else if (is_pae_paging(vcpu) &&
  982		 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
  983		return 1;
  984
  985	kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
  986	vcpu->arch.cr3 = cr3;
  987	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
  988
  989	return 0;
  990}
  991EXPORT_SYMBOL_GPL(kvm_set_cr3);
  992
  993int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
  994{
  995	if (cr8 & CR8_RESERVED_BITS)
  996		return 1;
  997	if (lapic_in_kernel(vcpu))
  998		kvm_lapic_set_tpr(vcpu, cr8);
  999	else
 1000		vcpu->arch.cr8 = cr8;
 1001	return 0;
 1002}
 1003EXPORT_SYMBOL_GPL(kvm_set_cr8);
 1004
 1005unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 1006{
 1007	if (lapic_in_kernel(vcpu))
 1008		return kvm_lapic_get_cr8(vcpu);
 1009	else
 1010		return vcpu->arch.cr8;
 1011}
 1012EXPORT_SYMBOL_GPL(kvm_get_cr8);
 1013
 1014static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
 1015{
 1016	int i;
 1017
 1018	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
 1019		for (i = 0; i < KVM_NR_DB_REGS; i++)
 1020			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
 1021		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
 1022	}
 1023}
 1024
 1025static void kvm_update_dr6(struct kvm_vcpu *vcpu)
 1026{
 1027	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 1028		kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
 1029}
 1030
 1031static void kvm_update_dr7(struct kvm_vcpu *vcpu)
 1032{
 1033	unsigned long dr7;
 1034
 1035	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
 1036		dr7 = vcpu->arch.guest_debug_dr7;
 1037	else
 1038		dr7 = vcpu->arch.dr7;
 1039	kvm_x86_ops->set_dr7(vcpu, dr7);
 1040	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
 1041	if (dr7 & DR7_BP_EN_MASK)
 1042		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
 1043}
 1044
 1045static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
 1046{
 1047	u64 fixed = DR6_FIXED_1;
 1048
 1049	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
 1050		fixed |= DR6_RTM;
 1051	return fixed;
 1052}
 1053
 1054static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 1055{
 1056	switch (dr) {
 1057	case 0 ... 3:
 1058		vcpu->arch.db[dr] = val;
 1059		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
 1060			vcpu->arch.eff_db[dr] = val;
 1061		break;
 1062	case 4:
 
 
 1063		/* fall through */
 1064	case 6:
 1065		if (val & 0xffffffff00000000ULL)
 1066			return -1; /* #GP */
 1067		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
 1068		kvm_update_dr6(vcpu);
 1069		break;
 1070	case 5:
 
 
 1071		/* fall through */
 1072	default: /* 7 */
 1073		if (val & 0xffffffff00000000ULL)
 1074			return -1; /* #GP */
 1075		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 1076		kvm_update_dr7(vcpu);
 
 
 
 1077		break;
 1078	}
 1079
 1080	return 0;
 1081}
 1082
 1083int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 1084{
 1085	if (__kvm_set_dr(vcpu, dr, val)) {
 
 
 
 
 
 1086		kvm_inject_gp(vcpu, 0);
 1087		return 1;
 1088	}
 1089	return 0;
 1090}
 1091EXPORT_SYMBOL_GPL(kvm_set_dr);
 1092
 1093int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 1094{
 1095	switch (dr) {
 1096	case 0 ... 3:
 1097		*val = vcpu->arch.db[dr];
 1098		break;
 1099	case 4:
 
 
 1100		/* fall through */
 1101	case 6:
 1102		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
 1103			*val = vcpu->arch.dr6;
 1104		else
 1105			*val = kvm_x86_ops->get_dr6(vcpu);
 1106		break;
 1107	case 5:
 
 
 1108		/* fall through */
 1109	default: /* 7 */
 1110		*val = vcpu->arch.dr7;
 1111		break;
 1112	}
 
 1113	return 0;
 1114}
 1115EXPORT_SYMBOL_GPL(kvm_get_dr);
 1116
 1117bool kvm_rdpmc(struct kvm_vcpu *vcpu)
 1118{
 1119	u32 ecx = kvm_rcx_read(vcpu);
 1120	u64 data;
 1121	int err;
 1122
 1123	err = kvm_pmu_rdpmc(vcpu, ecx, &data);
 1124	if (err)
 1125		return err;
 1126	kvm_rax_write(vcpu, (u32)data);
 1127	kvm_rdx_write(vcpu, data >> 32);
 1128	return err;
 1129}
 1130EXPORT_SYMBOL_GPL(kvm_rdpmc);
 1131
 1132/*
 1133 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 1134 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 1135 *
 1136 * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
 1137 * extract the supported MSRs from the related const lists.
 1138 * msrs_to_save is selected from the msrs_to_save_all to reflect the
 1139 * capabilities of the host cpu. This capabilities test skips MSRs that are
 1140 * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
 1141 * may depend on host virtualization features rather than host cpu features.
 1142 */
 1143
 1144static const u32 msrs_to_save_all[] = {
 
 
 
 
 
 1145	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 1146	MSR_STAR,
 1147#ifdef CONFIG_X86_64
 1148	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 1149#endif
 1150	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 1151	MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
 1152	MSR_IA32_SPEC_CTRL,
 1153	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
 1154	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
 1155	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
 1156	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
 1157	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
 1158	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
 1159	MSR_IA32_UMWAIT_CONTROL,
 1160
 1161	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
 1162	MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
 1163	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
 1164	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
 1165	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
 1166	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
 1167	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
 1168	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
 1169	MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
 1170	MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
 1171	MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
 1172	MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
 1173	MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
 1174	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
 1175	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
 1176	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
 1177	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
 1178	MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
 1179	MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
 1180	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
 1181	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
 1182	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
 1183};
 1184
 1185static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
 1186static unsigned num_msrs_to_save;
 1187
 1188static const u32 emulated_msrs_all[] = {
 1189	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 1190	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 1191	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 1192	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 1193	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
 1194	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
 1195	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
 1196	HV_X64_MSR_RESET,
 1197	HV_X64_MSR_VP_INDEX,
 1198	HV_X64_MSR_VP_RUNTIME,
 1199	HV_X64_MSR_SCONTROL,
 1200	HV_X64_MSR_STIMER0_CONFIG,
 1201	HV_X64_MSR_VP_ASSIST_PAGE,
 1202	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
 1203	HV_X64_MSR_TSC_EMULATION_STATUS,
 1204
 1205	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 1206	MSR_KVM_PV_EOI_EN,
 1207
 1208	MSR_IA32_TSC_ADJUST,
 1209	MSR_IA32_TSCDEADLINE,
 1210	MSR_IA32_ARCH_CAPABILITIES,
 1211	MSR_IA32_MISC_ENABLE,
 1212	MSR_IA32_MCG_STATUS,
 1213	MSR_IA32_MCG_CTL,
 1214	MSR_IA32_MCG_EXT_CTL,
 1215	MSR_IA32_SMBASE,
 1216	MSR_SMI_COUNT,
 1217	MSR_PLATFORM_INFO,
 1218	MSR_MISC_FEATURES_ENABLES,
 1219	MSR_AMD64_VIRT_SPEC_CTRL,
 1220	MSR_IA32_POWER_CTL,
 1221
 1222	/*
 1223	 * The following list leaves out MSRs whose values are determined
 1224	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
 1225	 * We always support the "true" VMX control MSRs, even if the host
 1226	 * processor does not, so I am putting these registers here rather
 1227	 * than in msrs_to_save_all.
 1228	 */
 1229	MSR_IA32_VMX_BASIC,
 1230	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 1231	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
 1232	MSR_IA32_VMX_TRUE_EXIT_CTLS,
 1233	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
 1234	MSR_IA32_VMX_MISC,
 1235	MSR_IA32_VMX_CR0_FIXED0,
 1236	MSR_IA32_VMX_CR4_FIXED0,
 1237	MSR_IA32_VMX_VMCS_ENUM,
 1238	MSR_IA32_VMX_PROCBASED_CTLS2,
 1239	MSR_IA32_VMX_EPT_VPID_CAP,
 1240	MSR_IA32_VMX_VMFUNC,
 1241
 1242	MSR_K7_HWCR,
 1243	MSR_KVM_POLL_CONTROL,
 1244};
 1245
 1246static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
 1247static unsigned num_emulated_msrs;
 1248
 1249/*
 1250 * List of msr numbers which are used to expose MSR-based features that
 1251 * can be used by a hypervisor to validate requested CPU features.
 1252 */
 1253static const u32 msr_based_features_all[] = {
 1254	MSR_IA32_VMX_BASIC,
 1255	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 1256	MSR_IA32_VMX_PINBASED_CTLS,
 1257	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
 1258	MSR_IA32_VMX_PROCBASED_CTLS,
 1259	MSR_IA32_VMX_TRUE_EXIT_CTLS,
 1260	MSR_IA32_VMX_EXIT_CTLS,
 1261	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
 1262	MSR_IA32_VMX_ENTRY_CTLS,
 1263	MSR_IA32_VMX_MISC,
 1264	MSR_IA32_VMX_CR0_FIXED0,
 1265	MSR_IA32_VMX_CR0_FIXED1,
 1266	MSR_IA32_VMX_CR4_FIXED0,
 1267	MSR_IA32_VMX_CR4_FIXED1,
 1268	MSR_IA32_VMX_VMCS_ENUM,
 1269	MSR_IA32_VMX_PROCBASED_CTLS2,
 1270	MSR_IA32_VMX_EPT_VPID_CAP,
 1271	MSR_IA32_VMX_VMFUNC,
 1272
 1273	MSR_F10H_DECFG,
 1274	MSR_IA32_UCODE_REV,
 1275	MSR_IA32_ARCH_CAPABILITIES,
 1276};
 1277
 1278static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
 1279static unsigned int num_msr_based_features;
 1280
 1281static u64 kvm_get_arch_capabilities(void)
 1282{
 1283	u64 data = 0;
 1284
 1285	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
 1286		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
 1287
 1288	/*
 1289	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
 1290	 * the nested hypervisor runs with NX huge pages.  If it is not,
 1291	 * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
 1292	 * L1 guests, so it need not worry about its own (L2) guests.
 1293	 */
 1294	data |= ARCH_CAP_PSCHANGE_MC_NO;
 1295
 1296	/*
 1297	 * If we're doing cache flushes (either "always" or "cond")
 1298	 * we will do one whenever the guest does a vmlaunch/vmresume.
 1299	 * If an outer hypervisor is doing the cache flush for us
 1300	 * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
 1301	 * capability to the guest too, and if EPT is disabled we're not
 1302	 * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
 1303	 * require a nested hypervisor to do a flush of its own.
 1304	 */
 1305	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
 1306		data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
 1307
 1308	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
 1309		data |= ARCH_CAP_RDCL_NO;
 1310	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
 1311		data |= ARCH_CAP_SSB_NO;
 1312	if (!boot_cpu_has_bug(X86_BUG_MDS))
 1313		data |= ARCH_CAP_MDS_NO;
 1314
 1315	/*
 1316	 * On TAA affected systems, export MDS_NO=0 when:
 1317	 *	- TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
 1318	 *	- Updated microcode is present. This is detected by
 1319	 *	  the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
 1320	 *	  that VERW clears CPU buffers.
 1321	 *
 1322	 * When MDS_NO=0 is exported, guests deploy clear CPU buffer
 1323	 * mitigation and don't complain:
 1324	 *
 1325	 *	"Vulnerable: Clear CPU buffers attempted, no microcode"
 1326	 *
 1327	 * If TSX is disabled on the system, guests are also mitigated against
 1328	 * TAA and clear CPU buffer mitigation is not required for guests.
 1329	 */
 1330	if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
 1331	    (data & ARCH_CAP_TSX_CTRL_MSR))
 1332		data &= ~ARCH_CAP_MDS_NO;
 1333
 1334	return data;
 1335}
 1336
 1337static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
 1338{
 1339	switch (msr->index) {
 1340	case MSR_IA32_ARCH_CAPABILITIES:
 1341		msr->data = kvm_get_arch_capabilities();
 1342		break;
 1343	case MSR_IA32_UCODE_REV:
 1344		rdmsrl_safe(msr->index, &msr->data);
 1345		break;
 1346	default:
 1347		if (kvm_x86_ops->get_msr_feature(msr))
 1348			return 1;
 1349	}
 1350	return 0;
 1351}
 1352
 1353static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 1354{
 1355	struct kvm_msr_entry msr;
 1356	int r;
 1357
 1358	msr.index = index;
 1359	r = kvm_get_msr_feature(&msr);
 1360	if (r)
 1361		return r;
 1362
 1363	*data = msr.data;
 1364
 1365	return 0;
 1366}
 1367
 1368static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 1369{
 1370	if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
 1371		return false;
 1372
 1373	if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
 1374		return false;
 1375
 1376	if (efer & (EFER_LME | EFER_LMA) &&
 1377	    !guest_cpuid_has(vcpu, X86_FEATURE_LM))
 1378		return false;
 1379
 1380	if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
 1381		return false;
 1382
 1383	return true;
 1384
 1385}
 1386bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 1387{
 1388	if (efer & efer_reserved_bits)
 1389		return false;
 1390
 1391	return __kvm_valid_efer(vcpu, efer);
 1392}
 1393EXPORT_SYMBOL_GPL(kvm_valid_efer);
 1394
 1395static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 1396{
 1397	u64 old_efer = vcpu->arch.efer;
 1398	u64 efer = msr_info->data;
 1399
 1400	if (efer & efer_reserved_bits)
 1401		return 1;
 1402
 1403	if (!msr_info->host_initiated) {
 1404		if (!__kvm_valid_efer(vcpu, efer))
 1405			return 1;
 1406
 1407		if (is_paging(vcpu) &&
 1408		    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
 1409			return 1;
 1410	}
 1411
 1412	efer &= ~EFER_LMA;
 1413	efer |= vcpu->arch.efer & EFER_LMA;
 1414
 1415	kvm_x86_ops->set_efer(vcpu, efer);
 1416
 
 
 1417	/* Update reserved bits */
 1418	if ((efer ^ old_efer) & EFER_NX)
 1419		kvm_mmu_reset_context(vcpu);
 1420
 1421	return 0;
 1422}
 1423
 1424void kvm_enable_efer_bits(u64 mask)
 1425{
 1426       efer_reserved_bits &= ~mask;
 1427}
 1428EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
 1429
 1430/*
 1431 * Write @data into the MSR specified by @index.  Select MSR specific fault
 1432 * checks are bypassed if @host_initiated is %true.
 1433 * Returns 0 on success, non-0 otherwise.
 1434 * Assumes vcpu_load() was already called.
 1435 */
 1436static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
 1437			 bool host_initiated)
 1438{
 1439	struct msr_data msr;
 1440
 1441	switch (index) {
 1442	case MSR_FS_BASE:
 1443	case MSR_GS_BASE:
 1444	case MSR_KERNEL_GS_BASE:
 1445	case MSR_CSTAR:
 1446	case MSR_LSTAR:
 1447		if (is_noncanonical_address(data, vcpu))
 1448			return 1;
 1449		break;
 1450	case MSR_IA32_SYSENTER_EIP:
 1451	case MSR_IA32_SYSENTER_ESP:
 1452		/*
 1453		 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
 1454		 * non-canonical address is written on Intel but not on
 1455		 * AMD (which ignores the top 32-bits, because it does
 1456		 * not implement 64-bit SYSENTER).
 1457		 *
 1458		 * 64-bit code should hence be able to write a non-canonical
 1459		 * value on AMD.  Making the address canonical ensures that
 1460		 * vmentry does not fail on Intel after writing a non-canonical
 1461		 * value, and that something deterministic happens if the guest
 1462		 * invokes 64-bit SYSENTER.
 1463		 */
 1464		data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
 1465	}
 1466
 1467	msr.data = data;
 1468	msr.index = index;
 1469	msr.host_initiated = host_initiated;
 1470
 1471	return kvm_x86_ops->set_msr(vcpu, &msr);
 1472}
 1473
 1474/*
 1475 * Read the MSR specified by @index into @data.  Select MSR specific fault
 1476 * checks are bypassed if @host_initiated is %true.
 1477 * Returns 0 on success, non-0 otherwise.
 1478 * Assumes vcpu_load() was already called.
 1479 */
 1480static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
 1481			 bool host_initiated)
 1482{
 1483	struct msr_data msr;
 1484	int ret;
 1485
 1486	msr.index = index;
 1487	msr.host_initiated = host_initiated;
 1488
 1489	ret = kvm_x86_ops->get_msr(vcpu, &msr);
 1490	if (!ret)
 1491		*data = msr.data;
 1492	return ret;
 1493}
 1494
 1495int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
 1496{
 1497	return __kvm_get_msr(vcpu, index, data, false);
 1498}
 1499EXPORT_SYMBOL_GPL(kvm_get_msr);
 1500
 1501int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 1502{
 1503	return __kvm_set_msr(vcpu, index, data, false);
 1504}
 1505EXPORT_SYMBOL_GPL(kvm_set_msr);
 1506
 1507int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
 1508{
 1509	u32 ecx = kvm_rcx_read(vcpu);
 1510	u64 data;
 1511
 1512	if (kvm_get_msr(vcpu, ecx, &data)) {
 1513		trace_kvm_msr_read_ex(ecx);
 1514		kvm_inject_gp(vcpu, 0);
 1515		return 1;
 1516	}
 1517
 1518	trace_kvm_msr_read(ecx, data);
 1519
 1520	kvm_rax_write(vcpu, data & -1u);
 1521	kvm_rdx_write(vcpu, (data >> 32) & -1u);
 1522	return kvm_skip_emulated_instruction(vcpu);
 1523}
 1524EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
 1525
 1526int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 1527{
 1528	u32 ecx = kvm_rcx_read(vcpu);
 1529	u64 data = kvm_read_edx_eax(vcpu);
 1530
 1531	if (kvm_set_msr(vcpu, ecx, data)) {
 1532		trace_kvm_msr_write_ex(ecx, data);
 1533		kvm_inject_gp(vcpu, 0);
 1534		return 1;
 1535	}
 1536
 1537	trace_kvm_msr_write(ecx, data);
 1538	return kvm_skip_emulated_instruction(vcpu);
 1539}
 1540EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 1541
 1542/*
 1543 * Adapt set_msr() to msr_io()'s calling convention
 1544 */
 1545static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 1546{
 1547	return __kvm_get_msr(vcpu, index, data, true);
 1548}
 1549
 1550static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 1551{
 1552	return __kvm_set_msr(vcpu, index, *data, true);
 1553}
 1554
 1555#ifdef CONFIG_X86_64
 1556struct pvclock_gtod_data {
 1557	seqcount_t	seq;
 1558
 1559	struct { /* extract of a clocksource struct */
 1560		int vclock_mode;
 1561		u64	cycle_last;
 1562		u64	mask;
 1563		u32	mult;
 1564		u32	shift;
 1565	} clock;
 1566
 1567	u64		boot_ns;
 1568	u64		nsec_base;
 1569	u64		wall_time_sec;
 1570};
 1571
 1572static struct pvclock_gtod_data pvclock_gtod_data;
 1573
 1574static void update_pvclock_gtod(struct timekeeper *tk)
 1575{
 1576	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
 1577	u64 boot_ns;
 1578
 1579	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
 1580
 1581	write_seqcount_begin(&vdata->seq);
 1582
 1583	/* copy pvclock gtod data */
 1584	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;
 1585	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;
 1586	vdata->clock.mask		= tk->tkr_mono.mask;
 1587	vdata->clock.mult		= tk->tkr_mono.mult;
 1588	vdata->clock.shift		= tk->tkr_mono.shift;
 1589
 1590	vdata->boot_ns			= boot_ns;
 1591	vdata->nsec_base		= tk->tkr_mono.xtime_nsec;
 1592
 1593	vdata->wall_time_sec            = tk->xtime_sec;
 1594
 1595	write_seqcount_end(&vdata->seq);
 1596}
 1597#endif
 1598
 1599void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
 1600{
 1601	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
 1602	kvm_vcpu_kick(vcpu);
 1603}
 1604
 1605static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 1606{
 1607	int version;
 1608	int r;
 1609	struct pvclock_wall_clock wc;
 1610	struct timespec64 boot;
 1611
 1612	if (!wall_clock)
 1613		return;
 1614
 1615	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
 1616	if (r)
 1617		return;
 1618
 1619	if (version & 1)
 1620		++version;  /* first time write, random junk */
 1621
 1622	++version;
 1623
 1624	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
 1625		return;
 1626
 1627	/*
 1628	 * The guest calculates current wall clock time by adding
 1629	 * system time (updated by kvm_guest_time_update below) to the
 1630	 * wall clock specified here.  guest system time equals host
 1631	 * system time for us, thus we must fill in host boot time here.
 1632	 */
 1633	getboottime64(&boot);
 1634
 1635	if (kvm->arch.kvmclock_offset) {
 1636		struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
 1637		boot = timespec64_sub(boot, ts);
 1638	}
 1639	wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
 1640	wc.nsec = boot.tv_nsec;
 1641	wc.version = version;
 1642
 1643	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
 1644
 1645	version++;
 1646	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 1647}
 1648
 1649static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 1650{
 1651	do_shl32_div32(dividend, divisor);
 1652	return dividend;
 
 
 
 
 
 
 1653}
 1654
 1655static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
 1656			       s8 *pshift, u32 *pmultiplier)
 1657{
 1658	uint64_t scaled64;
 1659	int32_t  shift = 0;
 1660	uint64_t tps64;
 1661	uint32_t tps32;
 1662
 1663	tps64 = base_hz;
 1664	scaled64 = scaled_hz;
 1665	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
 1666		tps64 >>= 1;
 1667		shift--;
 1668	}
 1669
 1670	tps32 = (uint32_t)tps64;
 1671	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
 1672		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
 1673			scaled64 >>= 1;
 1674		else
 1675			tps32 <<= 1;
 1676		shift++;
 1677	}
 1678
 1679	*pshift = shift;
 1680	*pmultiplier = div_frac(scaled64, tps32);
 1681}
 1682
 1683#ifdef CONFIG_X86_64
 1684static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
 1685#endif
 1686
 1687static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 1688static unsigned long max_tsc_khz;
 1689
 1690static u32 adjust_tsc_khz(u32 khz, s32 ppm)
 1691{
 1692	u64 v = (u64)khz * (1000000 + ppm);
 1693	do_div(v, 1000000);
 1694	return v;
 1695}
 1696
 1697static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 1698{
 1699	u64 ratio;
 1700
 1701	/* Guest TSC same frequency as host TSC? */
 1702	if (!scale) {
 1703		vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
 1704		return 0;
 1705	}
 1706
 1707	/* TSC scaling supported? */
 1708	if (!kvm_has_tsc_control) {
 1709		if (user_tsc_khz > tsc_khz) {
 1710			vcpu->arch.tsc_catchup = 1;
 1711			vcpu->arch.tsc_always_catchup = 1;
 1712			return 0;
 1713		} else {
 1714			pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
 1715			return -1;
 1716		}
 1717	}
 1718
 1719	/* TSC scaling required  - calculate ratio */
 1720	ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
 1721				user_tsc_khz, tsc_khz);
 1722
 1723	if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
 1724		pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
 1725			            user_tsc_khz);
 1726		return -1;
 1727	}
 1728
 1729	vcpu->arch.tsc_scaling_ratio = ratio;
 1730	return 0;
 1731}
 1732
 1733static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 1734{
 1735	u32 thresh_lo, thresh_hi;
 1736	int use_scaling = 0;
 1737
 1738	/* tsc_khz can be zero if TSC calibration fails */
 1739	if (user_tsc_khz == 0) {
 1740		/* set tsc_scaling_ratio to a safe value */
 1741		vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
 1742		return -1;
 1743	}
 1744
 1745	/* Compute a scale to convert nanoseconds in TSC cycles */
 1746	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
 1747			   &vcpu->arch.virtual_tsc_shift,
 1748			   &vcpu->arch.virtual_tsc_mult);
 1749	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 1750
 1751	/*
 1752	 * Compute the variation in TSC rate which is acceptable
 1753	 * within the range of tolerance and decide if the
 1754	 * rate being applied is within that bounds of the hardware
 1755	 * rate.  If so, no scaling or compensation need be done.
 1756	 */
 1757	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
 1758	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
 1759	if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
 1760		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
 1761		use_scaling = 1;
 1762	}
 1763	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
 1764}
 1765
 1766static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 1767{
 1768	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
 1769				      vcpu->arch.virtual_tsc_mult,
 1770				      vcpu->arch.virtual_tsc_shift);
 1771	tsc += vcpu->arch.this_tsc_write;
 1772	return tsc;
 1773}
 1774
 1775static inline int gtod_is_based_on_tsc(int mode)
 1776{
 1777	return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
 
 
 
 1778}
 1779
 1780static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 1781{
 1782#ifdef CONFIG_X86_64
 1783	bool vcpus_matched;
 1784	struct kvm_arch *ka = &vcpu->kvm->arch;
 1785	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 1786
 1787	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
 1788			 atomic_read(&vcpu->kvm->online_vcpus));
 1789
 1790	/*
 1791	 * Once the masterclock is enabled, always perform request in
 1792	 * order to update it.
 1793	 *
 1794	 * In order to enable masterclock, the host clocksource must be TSC
 1795	 * and the vcpus need to have matched TSCs.  When that happens,
 1796	 * perform request to enable masterclock.
 1797	 */
 1798	if (ka->use_master_clock ||
 1799	    (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
 1800		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 1801
 1802	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
 1803			    atomic_read(&vcpu->kvm->online_vcpus),
 1804		            ka->use_master_clock, gtod->clock.vclock_mode);
 1805#endif
 1806}
 1807
 1808static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 1809{
 1810	u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
 1811	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 
 
 1812}
 1813
 1814/*
 1815 * Multiply tsc by a fixed point number represented by ratio.
 1816 *
 1817 * The most significant 64-N bits (mult) of ratio represent the
 1818 * integral part of the fixed point number; the remaining N bits
 1819 * (frac) represent the fractional part, ie. ratio represents a fixed
 1820 * point number (mult + frac * 2^(-N)).
 1821 *
 1822 * N equals to kvm_tsc_scaling_ratio_frac_bits.
 1823 */
 1824static inline u64 __scale_tsc(u64 ratio, u64 tsc)
 1825{
 1826	return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
 1827}
 1828
 1829u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
 1830{
 1831	u64 _tsc = tsc;
 1832	u64 ratio = vcpu->arch.tsc_scaling_ratio;
 1833
 1834	if (ratio != kvm_default_tsc_scaling_ratio)
 1835		_tsc = __scale_tsc(ratio, tsc);
 1836
 1837	return _tsc;
 1838}
 1839EXPORT_SYMBOL_GPL(kvm_scale_tsc);
 1840
 1841static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 1842{
 1843	u64 tsc;
 1844
 1845	tsc = kvm_scale_tsc(vcpu, rdtsc());
 1846
 1847	return target_tsc - tsc;
 1848}
 1849
 1850u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 1851{
 1852	u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
 1853
 1854	return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 1855}
 1856EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 1857
 1858static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 1859{
 1860	vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
 1861}
 1862
 1863static inline bool kvm_check_tsc_unstable(void)
 1864{
 1865#ifdef CONFIG_X86_64
 1866	/*
 1867	 * TSC is marked unstable when we're running on Hyper-V,
 1868	 * 'TSC page' clocksource is good.
 1869	 */
 1870	if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
 1871		return false;
 1872#endif
 1873	return check_tsc_unstable();
 1874}
 1875
 1876void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 1877{
 1878	struct kvm *kvm = vcpu->kvm;
 1879	u64 offset, ns, elapsed;
 1880	unsigned long flags;
 1881	bool matched;
 1882	bool already_matched;
 1883	u64 data = msr->data;
 1884	bool synchronizing = false;
 1885
 1886	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 1887	offset = kvm_compute_tsc_offset(vcpu, data);
 1888	ns = ktime_get_boottime_ns();
 1889	elapsed = ns - kvm->arch.last_tsc_nsec;
 1890
 1891	if (vcpu->arch.virtual_tsc_khz) {
 1892		if (data == 0 && msr->host_initiated) {
 1893			/*
 1894			 * detection of vcpu initialization -- need to sync
 1895			 * with other vCPUs. This particularly helps to keep
 1896			 * kvm_clock stable after CPU hotplug
 1897			 */
 1898			synchronizing = true;
 1899		} else {
 1900			u64 tsc_exp = kvm->arch.last_tsc_write +
 1901						nsec_to_cycles(vcpu, elapsed);
 1902			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
 1903			/*
 1904			 * Special case: TSC write with a small delta (1 second)
 1905			 * of virtual cycle time against real time is
 1906			 * interpreted as an attempt to synchronize the CPU.
 1907			 */
 1908			synchronizing = data < tsc_exp + tsc_hz &&
 1909					data + tsc_hz > tsc_exp;
 1910		}
 1911	}
 1912
 1913	/*
 1914	 * For a reliable TSC, we can match TSC offsets, and for an unstable
 1915	 * TSC, we add elapsed time in this computation.  We could let the
 1916	 * compensation code attempt to catch up if we fall behind, but
 1917	 * it's better to try to match offsets from the beginning.
 1918         */
 1919	if (synchronizing &&
 1920	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 1921		if (!kvm_check_tsc_unstable()) {
 1922			offset = kvm->arch.cur_tsc_offset;
 
 
 
 
 1923		} else {
 1924			u64 delta = nsec_to_cycles(vcpu, elapsed);
 1925			data += delta;
 1926			offset = kvm_compute_tsc_offset(vcpu, data);
 1927		}
 1928		matched = true;
 1929		already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
 1930	} else {
 1931		/*
 1932		 * We split periods of matched TSC writes into generations.
 1933		 * For each generation, we track the original measured
 1934		 * nanosecond time, offset, and write, so if TSCs are in
 1935		 * sync, we can match exact offset, and if not, we can match
 1936		 * exact software computation in compute_guest_tsc()
 1937		 *
 1938		 * These values are tracked in kvm->arch.cur_xxx variables.
 1939		 */
 1940		kvm->arch.cur_tsc_generation++;
 1941		kvm->arch.cur_tsc_nsec = ns;
 1942		kvm->arch.cur_tsc_write = data;
 1943		kvm->arch.cur_tsc_offset = offset;
 1944		matched = false;
 1945	}
 1946
 1947	/*
 1948	 * We also track th most recent recorded KHZ, write and time to
 1949	 * allow the matching interval to be extended at each write.
 1950	 */
 1951	kvm->arch.last_tsc_nsec = ns;
 1952	kvm->arch.last_tsc_write = data;
 1953	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
 1954
 1955	vcpu->arch.last_guest_tsc = data;
 1956
 1957	/* Keep track of which generation this VCPU has synchronized to */
 1958	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
 1959	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
 1960	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
 1961
 1962	if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
 1963		update_ia32_tsc_adjust_msr(vcpu, offset);
 1964
 1965	kvm_vcpu_write_tsc_offset(vcpu, offset);
 1966	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 1967
 1968	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
 1969	if (!matched) {
 1970		kvm->arch.nr_vcpus_matched_tsc = 0;
 1971	} else if (!already_matched) {
 1972		kvm->arch.nr_vcpus_matched_tsc++;
 1973	}
 1974
 1975	kvm_track_tsc_matching(vcpu);
 1976	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
 1977}
 1978
 1979EXPORT_SYMBOL_GPL(kvm_write_tsc);
 1980
 1981static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
 1982					   s64 adjustment)
 1983{
 1984	u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
 1985	kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
 1986}
 1987
 1988static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 1989{
 1990	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
 1991		WARN_ON(adjustment < 0);
 1992	adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
 1993	adjust_tsc_offset_guest(vcpu, adjustment);
 1994}
 1995
 1996#ifdef CONFIG_X86_64
 1997
 1998static u64 read_tsc(void)
 1999{
 2000	u64 ret = (u64)rdtsc_ordered();
 2001	u64 last = pvclock_gtod_data.clock.cycle_last;
 2002
 2003	if (likely(ret >= last))
 2004		return ret;
 2005
 2006	/*
 2007	 * GCC likes to generate cmov here, but this branch is extremely
 2008	 * predictable (it's just a function of time and the likely is
 2009	 * very likely) and there's a data dependence, so force GCC
 2010	 * to generate a branch instead.  I don't barrier() because
 2011	 * we don't actually need a barrier, and if this function
 2012	 * ever gets inlined it will generate worse code.
 2013	 */
 2014	asm volatile ("");
 2015	return last;
 2016}
 2017
 2018static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
 2019{
 2020	long v;
 2021	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 2022	u64 tsc_pg_val;
 2023
 2024	switch (gtod->clock.vclock_mode) {
 2025	case VCLOCK_HVCLOCK:
 2026		tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
 2027						  tsc_timestamp);
 2028		if (tsc_pg_val != U64_MAX) {
 2029			/* TSC page valid */
 2030			*mode = VCLOCK_HVCLOCK;
 2031			v = (tsc_pg_val - gtod->clock.cycle_last) &
 2032				gtod->clock.mask;
 2033		} else {
 2034			/* TSC page invalid */
 2035			*mode = VCLOCK_NONE;
 2036		}
 2037		break;
 2038	case VCLOCK_TSC:
 2039		*mode = VCLOCK_TSC;
 2040		*tsc_timestamp = read_tsc();
 2041		v = (*tsc_timestamp - gtod->clock.cycle_last) &
 2042			gtod->clock.mask;
 2043		break;
 2044	default:
 2045		*mode = VCLOCK_NONE;
 2046	}
 2047
 2048	if (*mode == VCLOCK_NONE)
 2049		*tsc_timestamp = v = 0;
 2050
 2051	return v * gtod->clock.mult;
 2052}
 2053
 2054static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
 2055{
 2056	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 2057	unsigned long seq;
 2058	int mode;
 2059	u64 ns;
 2060
 2061	do {
 2062		seq = read_seqcount_begin(&gtod->seq);
 2063		ns = gtod->nsec_base;
 2064		ns += vgettsc(tsc_timestamp, &mode);
 2065		ns >>= gtod->clock.shift;
 2066		ns += gtod->boot_ns;
 2067	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 2068	*t = ns;
 2069
 2070	return mode;
 2071}
 2072
 2073static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
 2074{
 2075	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 2076	unsigned long seq;
 2077	int mode;
 2078	u64 ns;
 2079
 2080	do {
 2081		seq = read_seqcount_begin(&gtod->seq);
 2082		ts->tv_sec = gtod->wall_time_sec;
 2083		ns = gtod->nsec_base;
 2084		ns += vgettsc(tsc_timestamp, &mode);
 2085		ns >>= gtod->clock.shift;
 2086	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 2087
 2088	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
 2089	ts->tv_nsec = ns;
 2090
 2091	return mode;
 2092}
 2093
 2094/* returns true if host is using TSC based clocksource */
 2095static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
 2096{
 2097	/* checked again under seqlock below */
 2098	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
 2099		return false;
 2100
 2101	return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
 2102						      tsc_timestamp));
 2103}
 2104
 2105/* returns true if host is using TSC based clocksource */
 2106static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
 2107					   u64 *tsc_timestamp)
 2108{
 2109	/* checked again under seqlock below */
 2110	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
 2111		return false;
 2112
 2113	return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
 2114}
 2115#endif
 2116
 2117/*
 2118 *
 2119 * Assuming a stable TSC across physical CPUS, and a stable TSC
 2120 * across virtual CPUs, the following condition is possible.
 2121 * Each numbered line represents an event visible to both
 2122 * CPUs at the next numbered event.
 2123 *
 2124 * "timespecX" represents host monotonic time. "tscX" represents
 2125 * RDTSC value.
 2126 *
 2127 * 		VCPU0 on CPU0		|	VCPU1 on CPU1
 2128 *
 2129 * 1.  read timespec0,tsc0
 2130 * 2.					| timespec1 = timespec0 + N
 2131 * 					| tsc1 = tsc0 + M
 2132 * 3. transition to guest		| transition to guest
 2133 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
 2134 * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
 2135 * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
 2136 *
 2137 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
 2138 *
 2139 * 	- ret0 < ret1
 2140 *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
 2141 *		...
 2142 *	- 0 < N - M => M < N
 2143 *
 2144 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
 2145 * always the case (the difference between two distinct xtime instances
 2146 * might be smaller than the difference between corresponding TSC reads,
 2147 * when updating guest vcpus pvclock areas).
 2148 *
 2149 * To avoid that problem, do not allow visibility of distinct
 2150 * system_timestamp/tsc_timestamp values simultaneously: use a master
 2151 * copy of host monotonic time values. Update that master copy
 2152 * in lockstep.
 2153 *
 2154 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
 2155 *
 2156 */
 2157
 2158static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
 2159{
 2160#ifdef CONFIG_X86_64
 2161	struct kvm_arch *ka = &kvm->arch;
 2162	int vclock_mode;
 2163	bool host_tsc_clocksource, vcpus_matched;
 2164
 2165	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
 2166			atomic_read(&kvm->online_vcpus));
 2167
 2168	/*
 2169	 * If the host uses TSC clock, then passthrough TSC as stable
 2170	 * to the guest.
 2171	 */
 2172	host_tsc_clocksource = kvm_get_time_and_clockread(
 2173					&ka->master_kernel_ns,
 2174					&ka->master_cycle_now);
 2175
 2176	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
 2177				&& !ka->backwards_tsc_observed
 2178				&& !ka->boot_vcpu_runs_old_kvmclock;
 2179
 2180	if (ka->use_master_clock)
 2181		atomic_set(&kvm_guest_has_master_clock, 1);
 2182
 2183	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
 2184	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
 2185					vcpus_matched);
 2186#endif
 2187}
 2188
 2189void kvm_make_mclock_inprogress_request(struct kvm *kvm)
 2190{
 2191	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
 2192}
 2193
 2194static void kvm_gen_update_masterclock(struct kvm *kvm)
 2195{
 2196#ifdef CONFIG_X86_64
 2197	int i;
 2198	struct kvm_vcpu *vcpu;
 2199	struct kvm_arch *ka = &kvm->arch;
 2200
 2201	spin_lock(&ka->pvclock_gtod_sync_lock);
 2202	kvm_make_mclock_inprogress_request(kvm);
 2203	/* no guest entries from this point */
 2204	pvclock_update_vm_gtod_copy(kvm);
 2205
 2206	kvm_for_each_vcpu(i, vcpu, kvm)
 2207		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 2208
 2209	/* guest entries allowed */
 2210	kvm_for_each_vcpu(i, vcpu, kvm)
 2211		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
 2212
 2213	spin_unlock(&ka->pvclock_gtod_sync_lock);
 2214#endif
 2215}
 2216
 2217u64 get_kvmclock_ns(struct kvm *kvm)
 2218{
 2219	struct kvm_arch *ka = &kvm->arch;
 2220	struct pvclock_vcpu_time_info hv_clock;
 2221	u64 ret;
 2222
 2223	spin_lock(&ka->pvclock_gtod_sync_lock);
 2224	if (!ka->use_master_clock) {
 2225		spin_unlock(&ka->pvclock_gtod_sync_lock);
 2226		return ktime_get_boottime_ns() + ka->kvmclock_offset;
 2227	}
 2228
 2229	hv_clock.tsc_timestamp = ka->master_cycle_now;
 2230	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
 2231	spin_unlock(&ka->pvclock_gtod_sync_lock);
 2232
 2233	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
 2234	get_cpu();
 2235
 2236	if (__this_cpu_read(cpu_tsc_khz)) {
 2237		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
 2238				   &hv_clock.tsc_shift,
 2239				   &hv_clock.tsc_to_system_mul);
 2240		ret = __pvclock_read_cycles(&hv_clock, rdtsc());
 2241	} else
 2242		ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
 2243
 2244	put_cpu();
 2245
 2246	return ret;
 2247}
 2248
 2249static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
 2250{
 2251	struct kvm_vcpu_arch *vcpu = &v->arch;
 2252	struct pvclock_vcpu_time_info guest_hv_clock;
 2253
 2254	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
 2255		&guest_hv_clock, sizeof(guest_hv_clock))))
 2256		return;
 2257
 2258	/* This VCPU is paused, but it's legal for a guest to read another
 2259	 * VCPU's kvmclock, so we really have to follow the specification where
 2260	 * it says that version is odd if data is being modified, and even after
 2261	 * it is consistent.
 2262	 *
 2263	 * Version field updates must be kept separate.  This is because
 2264	 * kvm_write_guest_cached might use a "rep movs" instruction, and
 2265	 * writes within a string instruction are weakly ordered.  So there
 2266	 * are three writes overall.
 2267	 *
 2268	 * As a small optimization, only write the version field in the first
 2269	 * and third write.  The vcpu->pv_time cache is still valid, because the
 2270	 * version field is the first in the struct.
 2271	 */
 2272	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
 2273
 2274	if (guest_hv_clock.version & 1)
 2275		++guest_hv_clock.version;  /* first time write, random junk */
 2276
 2277	vcpu->hv_clock.version = guest_hv_clock.version + 1;
 2278	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
 2279				&vcpu->hv_clock,
 2280				sizeof(vcpu->hv_clock.version));
 2281
 2282	smp_wmb();
 2283
 2284	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
 2285	vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
 2286
 2287	if (vcpu->pvclock_set_guest_stopped_request) {
 2288		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
 2289		vcpu->pvclock_set_guest_stopped_request = false;
 2290	}
 2291
 2292	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
 2293
 2294	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
 2295				&vcpu->hv_clock,
 2296				sizeof(vcpu->hv_clock));
 2297
 2298	smp_wmb();
 2299
 2300	vcpu->hv_clock.version++;
 2301	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
 2302				&vcpu->hv_clock,
 2303				sizeof(vcpu->hv_clock.version));
 2304}
 2305
 2306static int kvm_guest_time_update(struct kvm_vcpu *v)
 2307{
 2308	unsigned long flags, tgt_tsc_khz;
 2309	struct kvm_vcpu_arch *vcpu = &v->arch;
 2310	struct kvm_arch *ka = &v->kvm->arch;
 2311	s64 kernel_ns;
 2312	u64 tsc_timestamp, host_tsc;
 2313	u8 pvclock_flags;
 2314	bool use_master_clock;
 2315
 2316	kernel_ns = 0;
 2317	host_tsc = 0;
 2318
 2319	/*
 2320	 * If the host uses TSC clock, then passthrough TSC as stable
 2321	 * to the guest.
 2322	 */
 2323	spin_lock(&ka->pvclock_gtod_sync_lock);
 2324	use_master_clock = ka->use_master_clock;
 2325	if (use_master_clock) {
 2326		host_tsc = ka->master_cycle_now;
 2327		kernel_ns = ka->master_kernel_ns;
 2328	}
 2329	spin_unlock(&ka->pvclock_gtod_sync_lock);
 2330
 2331	/* Keep irq disabled to prevent changes to the clock */
 2332	local_irq_save(flags);
 2333	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
 2334	if (unlikely(tgt_tsc_khz == 0)) {
 
 
 2335		local_irq_restore(flags);
 2336		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
 2337		return 1;
 2338	}
 2339	if (!use_master_clock) {
 2340		host_tsc = rdtsc();
 2341		kernel_ns = ktime_get_boottime_ns();
 2342	}
 2343
 2344	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
 2345
 2346	/*
 2347	 * We may have to catch up the TSC to match elapsed wall clock
 2348	 * time for two reasons, even if kvmclock is used.
 2349	 *   1) CPU could have been running below the maximum TSC rate
 2350	 *   2) Broken TSC compensation resets the base at each VCPU
 2351	 *      entry to avoid unknown leaps of TSC even when running
 2352	 *      again on the same CPU.  This may cause apparent elapsed
 2353	 *      time to disappear, and the guest to stand still or run
 2354	 *	very slowly.
 2355	 */
 2356	if (vcpu->tsc_catchup) {
 2357		u64 tsc = compute_guest_tsc(v, kernel_ns);
 2358		if (tsc > tsc_timestamp) {
 2359			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
 2360			tsc_timestamp = tsc;
 2361		}
 2362	}
 2363
 2364	local_irq_restore(flags);
 2365
 2366	/* With all the info we got, fill in the values */
 
 2367
 2368	if (kvm_has_tsc_control)
 2369		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2370
 2371	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
 2372		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
 2373				   &vcpu->hv_clock.tsc_shift,
 2374				   &vcpu->hv_clock.tsc_to_system_mul);
 2375		vcpu->hw_tsc_khz = tgt_tsc_khz;
 2376	}
 2377
 
 
 
 
 2378	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 2379	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 
 2380	vcpu->last_guest_tsc = tsc_timestamp;
 
 
 
 
 
 
 
 
 
 
 2381
 2382	/* If the host uses TSC clocksource, then it is stable */
 2383	pvclock_flags = 0;
 2384	if (use_master_clock)
 2385		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
 2386
 2387	vcpu->hv_clock.flags = pvclock_flags;
 2388
 2389	if (vcpu->pv_time_enabled)
 2390		kvm_setup_pvclock_page(v);
 2391	if (v == kvm_get_vcpu(v->kvm, 0))
 2392		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 2393	return 0;
 2394}
 2395
 2396/*
 2397 * kvmclock updates which are isolated to a given vcpu, such as
 2398 * vcpu->cpu migration, should not allow system_timestamp from
 2399 * the rest of the vcpus to remain static. Otherwise ntp frequency
 2400 * correction applies to one vcpu's system_timestamp but not
 2401 * the others.
 2402 *
 2403 * So in those cases, request a kvmclock update for all vcpus.
 2404 * We need to rate-limit these requests though, as they can
 2405 * considerably slow guests that have a large number of vcpus.
 2406 * The time for a remote vcpu to update its kvmclock is bound
 2407 * by the delay we use to rate-limit the updates.
 2408 */
 2409
 2410#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
 2411
 2412static void kvmclock_update_fn(struct work_struct *work)
 2413{
 2414	int i;
 2415	struct delayed_work *dwork = to_delayed_work(work);
 2416	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
 2417					   kvmclock_update_work);
 2418	struct kvm *kvm = container_of(ka, struct kvm, arch);
 2419	struct kvm_vcpu *vcpu;
 2420
 2421	kvm_for_each_vcpu(i, vcpu, kvm) {
 2422		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 2423		kvm_vcpu_kick(vcpu);
 
 
 
 
 
 
 
 
 2424	}
 
 2425}
 2426
 2427static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
 2428{
 2429	struct kvm *kvm = v->kvm;
 
 2430
 2431	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
 2432	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
 2433					KVMCLOCK_UPDATE_DELAY);
 2434}
 2435
 2436#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
 
 
 2437
 2438static void kvmclock_sync_fn(struct work_struct *work)
 2439{
 2440	struct delayed_work *dwork = to_delayed_work(work);
 2441	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
 2442					   kvmclock_sync_work);
 2443	struct kvm *kvm = container_of(ka, struct kvm, arch);
 2444
 2445	if (!kvmclock_periodic_sync)
 2446		return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 2447
 2448	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
 2449	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 2450					KVMCLOCK_SYNC_PERIOD);
 2451}
 2452
 2453/*
 2454 * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
 2455 */
 2456static bool can_set_mci_status(struct kvm_vcpu *vcpu)
 2457{
 2458	/* McStatusWrEn enabled? */
 2459	if (guest_cpuid_is_amd(vcpu))
 2460		return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2461
 2462	return false;
 
 2463}
 2464
 2465static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 2466{
 2467	u64 mcg_cap = vcpu->arch.mcg_cap;
 2468	unsigned bank_num = mcg_cap & 0xff;
 2469	u32 msr = msr_info->index;
 2470	u64 data = msr_info->data;
 2471
 2472	switch (msr) {
 2473	case MSR_IA32_MCG_STATUS:
 2474		vcpu->arch.mcg_status = data;
 2475		break;
 2476	case MSR_IA32_MCG_CTL:
 2477		if (!(mcg_cap & MCG_CTL_P) &&
 2478		    (data || !msr_info->host_initiated))
 2479			return 1;
 2480		if (data != 0 && data != ~(u64)0)
 2481			return 1;
 2482		vcpu->arch.mcg_ctl = data;
 2483		break;
 2484	default:
 2485		if (msr >= MSR_IA32_MC0_CTL &&
 2486		    msr < MSR_IA32_MCx_CTL(bank_num)) {
 2487			u32 offset = msr - MSR_IA32_MC0_CTL;
 2488			/* only 0 or all 1s can be written to IA32_MCi_CTL
 2489			 * some Linux kernels though clear bit 10 in bank 4 to
 2490			 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
 2491			 * this to avoid an uncatched #GP in the guest
 2492			 */
 2493			if ((offset & 0x3) == 0 &&
 2494			    data != 0 && (data | (1 << 10)) != ~(u64)0)
 2495				return -1;
 2496
 2497			/* MCi_STATUS */
 2498			if (!msr_info->host_initiated &&
 2499			    (offset & 0x3) == 1 && data != 0) {
 2500				if (!can_set_mci_status(vcpu))
 2501					return -1;
 2502			}
 2503
 2504			vcpu->arch.mce_banks[offset] = data;
 2505			break;
 2506		}
 2507		return 1;
 2508	}
 2509	return 0;
 2510}
 2511
 2512static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
 2513{
 2514	struct kvm *kvm = vcpu->kvm;
 2515	int lm = is_long_mode(vcpu);
 2516	u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
 2517		: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
 2518	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
 2519		: kvm->arch.xen_hvm_config.blob_size_32;
 2520	u32 page_num = data & ~PAGE_MASK;
 2521	u64 page_addr = data & PAGE_MASK;
 2522	u8 *page;
 2523	int r;
 2524
 2525	r = -E2BIG;
 2526	if (page_num >= blob_size)
 2527		goto out;
 2528	r = -ENOMEM;
 2529	page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
 2530	if (IS_ERR(page)) {
 2531		r = PTR_ERR(page);
 2532		goto out;
 2533	}
 2534	if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
 
 
 2535		goto out_free;
 2536	r = 0;
 2537out_free:
 2538	kfree(page);
 2539out:
 2540	return r;
 2541}
 2542
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2543static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 2544{
 2545	gpa_t gpa = data & ~0x3f;
 2546
 2547	/* Bits 3:5 are reserved, Should be zero */
 2548	if (data & 0x38)
 2549		return 1;
 2550
 2551	vcpu->arch.apf.msr_val = data;
 2552
 2553	if (!(data & KVM_ASYNC_PF_ENABLED)) {
 2554		kvm_clear_async_pf_completion_queue(vcpu);
 2555		kvm_async_pf_hash_reset(vcpu);
 2556		return 0;
 2557	}
 2558
 2559	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
 2560					sizeof(u32)))
 2561		return 1;
 2562
 2563	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
 2564	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
 2565	kvm_async_pf_wakeup_all(vcpu);
 2566	return 0;
 2567}
 2568
 2569static void kvmclock_reset(struct kvm_vcpu *vcpu)
 2570{
 2571	vcpu->arch.pv_time_enabled = false;
 2572	vcpu->arch.time = 0;
 
 
 2573}
 2574
 2575static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 2576{
 2577	++vcpu->stat.tlb_flush;
 2578	kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
 
 
 
 
 
 
 2579}
 2580
 2581static void record_steal_time(struct kvm_vcpu *vcpu)
 2582{
 2583	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 2584		return;
 2585
 2586	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
 2587		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
 2588		return;
 2589
 2590	/*
 2591	 * Doing a TLB flush here, on the guest's behalf, can avoid
 2592	 * expensive IPIs.
 2593	 */
 2594	trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
 2595		vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB);
 2596	if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
 2597		kvm_vcpu_flush_tlb(vcpu, false);
 2598
 2599	if (vcpu->arch.st.steal.version & 1)
 2600		vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
 2601
 2602	vcpu->arch.st.steal.version += 1;
 2603
 2604	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
 2605		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 2606
 2607	smp_wmb();
 2608
 2609	vcpu->arch.st.steal.steal += current->sched_info.run_delay -
 2610		vcpu->arch.st.last_steal;
 2611	vcpu->arch.st.last_steal = current->sched_info.run_delay;
 2612
 2613	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
 2614		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 2615
 2616	smp_wmb();
 2617
 2618	vcpu->arch.st.steal.version += 1;
 2619
 2620	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
 2621		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 2622}
 2623
/*
 * Handle a write to an MSR that is emulated in common x86 code.
 *
 * @vcpu:     vcpu the write targets
 * @msr_info: MSR index, value, and whether the write is host-initiated
 *
 * Returns 0 when the write was accepted (or deliberately ignored) and 1
 * when it is rejected.  Several cases hand the write to a helper and
 * return that helper's result unchanged.
 */
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	bool pr = false;
	u32 msr = msr_info->index;
	u64 data = msr_info->data;

	switch (msr) {
	/* Writes to these MSRs are accepted and discarded. */
	case MSR_AMD64_NB_CFG:
	case MSR_IA32_UCODE_WRITE:
	case MSR_VM_HSAVE_PA:
	case MSR_AMD64_PATCH_LOADER:
	case MSR_AMD64_BU_CFG2:
	case MSR_AMD64_DC_CFG:
	case MSR_F15H_EX_CFG:
		break;

	case MSR_IA32_UCODE_REV:
		/* Only host-initiated writes take effect; others are dropped. */
		if (msr_info->host_initiated)
			vcpu->arch.microcode_version = data;
		break;
	case MSR_IA32_ARCH_CAPABILITIES:
		if (!msr_info->host_initiated)
			return 1;
		vcpu->arch.arch_capabilities = data;
		break;
	case MSR_EFER:
		return set_efer(vcpu, msr_info);
	case MSR_K7_HWCR:
		data &= ~(u64)0x40;	/* ignore flush filter disable */
		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
		data &= ~(u64)0x8;	/* ignore TLB cache disable */

		/*
		 * Handle McStatusWrEn: after masking, only bit 18 alone is
		 * stored; zero is accepted but leaves msr_hwcr untouched.
		 */
		if (data == BIT_ULL(18)) {
			vcpu->arch.msr_hwcr = data;
		} else if (data != 0) {
			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
				    data);
			return 1;
		}
		break;
	case MSR_FAM10H_MMIO_CONF_BASE:
		if (data != 0) {
			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
				    "0x%llx\n", data);
			return 1;
		}
		break;
	case MSR_IA32_DEBUGCTLMSR:
		if (!data) {
			/* We support the non-activated case already */
			break;
		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
			/* Values other than LBR and BTF are vendor-specific,
			   thus reserved and should throw a #GP */
			return 1;
		}
		vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
			    __func__, data);
		break;
	case 0x200 ... 0x2ff:
		return kvm_mtrr_set_msr(vcpu, msr, data);
	case MSR_IA32_APICBASE:
		return kvm_set_apic_base(vcpu, msr_info);
	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
		return kvm_x2apic_msr_write(vcpu, msr, data);
	case MSR_IA32_TSCDEADLINE:
		kvm_set_lapic_tscdeadline_msr(vcpu, data);
		break;
	case MSR_IA32_TSC_ADJUST:
		/* Silently ignored when the guest CPUID lacks TSC_ADJUST. */
		if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
			/*
			 * Guest writes also move the TSC offset by the
			 * change in value; host writes only record it.
			 */
			if (!msr_info->host_initiated) {
				s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
				adjust_tsc_offset_guest(vcpu, adj);
			}
			vcpu->arch.ia32_tsc_adjust_msr = data;
		}
		break;
	case MSR_IA32_MISC_ENABLE:
		/*
		 * With the NO_MWAIT quirk off, toggling the MWAIT bit needs
		 * XMM3 in the guest CPUID and triggers a CPUID update.
		 */
		if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
		    ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
			if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
				return 1;
			vcpu->arch.ia32_misc_enable_msr = data;
			kvm_update_cpuid(vcpu);
		} else {
			vcpu->arch.ia32_misc_enable_msr = data;
		}
		break;
	case MSR_IA32_SMBASE:
		if (!msr_info->host_initiated)
			return 1;
		vcpu->arch.smbase = data;
		break;
	case MSR_IA32_POWER_CTL:
		vcpu->arch.msr_ia32_power_ctl = data;
		break;
	case MSR_IA32_TSC:
		kvm_write_tsc(vcpu, msr_info);
		break;
	case MSR_SMI_COUNT:
		if (!msr_info->host_initiated)
			return 1;
		vcpu->arch.smi_count = data;
		break;
	case MSR_KVM_WALL_CLOCK_NEW:
	case MSR_KVM_WALL_CLOCK:
		vcpu->kvm->arch.wall_clock = data;
		kvm_write_wall_clock(vcpu->kvm, data);
		break;
	case MSR_KVM_SYSTEM_TIME_NEW:
	case MSR_KVM_SYSTEM_TIME: {
		struct kvm_arch *ka = &vcpu->kvm->arch;

		/*
		 * Track which of the two MSRs vcpu 0 uses for guest writes;
		 * a change requests a masterclock update.
		 */
		if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
			bool tmp = (msr == MSR_KVM_SYSTEM_TIME);

			if (ka->boot_vcpu_runs_old_kvmclock != tmp)
				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

			ka->boot_vcpu_runs_old_kvmclock = tmp;
		}

		vcpu->arch.time = data;
		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);

		/* we verify if the enable bit is set... */
		vcpu->arch.pv_time_enabled = false;
		if (!(data & 1))
			break;

		/* pv time stays disabled if the guest area cannot be cached. */
		if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
		     &vcpu->arch.pv_time, data & ~1ULL,
		     sizeof(struct pvclock_vcpu_time_info)))
			vcpu->arch.pv_time_enabled = true;

		break;
	}
	case MSR_KVM_ASYNC_PF_EN:
		if (kvm_pv_enable_async_pf(vcpu, data))
			return 1;
		break;
	case MSR_KVM_STEAL_TIME:

		if (unlikely(!sched_info_on()))
			return 1;

		if (data & KVM_STEAL_RESERVED_MASK)
			return 1;

		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
						data & KVM_STEAL_VALID_BITS,
						sizeof(struct kvm_steal_time)))
			return 1;

		vcpu->arch.st.msr_val = data;

		if (!(data & KVM_MSR_ENABLED))
			break;

		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);

		break;
	case MSR_KVM_PV_EOI_EN:
		if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
			return 1;
		break;

	case MSR_KVM_POLL_CONTROL:
		/* only enable bit supported */
		if (data & (-1ULL << 1))
			return 1;

		vcpu->arch.msr_kvm_poll_control = data;
		break;

	case MSR_IA32_MCG_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
		return set_msr_mce(vcpu, msr_info);

	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
		/* pr: counter MSR, so an ignored write is always reported. */
		pr = true; /* fall through */
	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
		if (kvm_pmu_is_valid_msr(vcpu, msr))
			return kvm_pmu_set_msr(vcpu, msr_info);

		/* Not a valid PMU MSR: accept the write, reporting it if notable. */
		if (pr || data != 0)
			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
				    "0x%x data 0x%llx\n", msr, data);
		break;
	case MSR_K7_CLK_CTL:
		/*
		 * Ignore all writes to this no longer documented MSR.
		 * Writes are only relevant for old K7 processors,
		 * all pre-dating SVM, but a recommended workaround from
		 * AMD for these chips. It is possible to specify the
		 * affected processor models on the command line, hence
		 * the need to ignore the workaround.
		 */
		break;
	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
	case HV_X64_MSR_CRASH_CTL:
	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
	case HV_X64_MSR_TSC_EMULATION_CONTROL:
	case HV_X64_MSR_TSC_EMULATION_STATUS:
		return kvm_hv_set_msr_common(vcpu, msr, data,
					     msr_info->host_initiated);
	case MSR_IA32_BBL_CR_CTL3:
		/* Drop writes to this legacy MSR -- see rdmsr
		 * counterpart for further detail.
		 */
		if (report_ignored_msrs)
			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
				msr, data);
		break;
	case MSR_AMD64_OSVW_ID_LENGTH:
		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
			return 1;
		vcpu->arch.osvw.length = data;
		break;
	case MSR_AMD64_OSVW_STATUS:
		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
			return 1;
		vcpu->arch.osvw.status = data;
		break;
	case MSR_PLATFORM_INFO:
		/*
		 * Host-only.  The CPUID-fault capability bit may not be
		 * cleared while CPUID faulting is enabled for the vcpu.
		 */
		if (!msr_info->host_initiated ||
		    (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
		     cpuid_fault_enabled(vcpu)))
			return 1;
		vcpu->arch.msr_platform_info = data;
		break;
	case MSR_MISC_FEATURES_ENABLES:
		/* Only the CPUID-fault bit is valid, and only if supported. */
		if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
		    (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
		     !supports_cpuid_fault(vcpu)))
			return 1;
		vcpu->arch.msr_misc_features_enables = data;
		break;
	default:
		/*
		 * Try the configured Xen HVM MSR, then the PMU; anything
		 * left is rejected unless ignore_msrs is set.
		 */
		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
			return xen_hvm_config(vcpu, data);
		if (kvm_pmu_is_valid_msr(vcpu, msr))
			return kvm_pmu_set_msr(vcpu, msr_info);
		if (!ignore_msrs) {
			vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
				    msr, data);
			return 1;
		} else {
			if (report_ignored_msrs)
				vcpu_unimpl(vcpu,
					"ignored wrmsr: 0x%x data 0x%llx\n",
					msr, data);
			break;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);
 2888
 2889static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2890{
 2891	u64 data;
 2892	u64 mcg_cap = vcpu->arch.mcg_cap;
 2893	unsigned bank_num = mcg_cap & 0xff;
 2894
 2895	switch (msr) {
 2896	case MSR_IA32_P5_MC_ADDR:
 2897	case MSR_IA32_P5_MC_TYPE:
 2898		data = 0;
 2899		break;
 2900	case MSR_IA32_MCG_CAP:
 2901		data = vcpu->arch.mcg_cap;
 2902		break;
 2903	case MSR_IA32_MCG_CTL:
 2904		if (!(mcg_cap & MCG_CTL_P) && !host)
 2905			return 1;
 2906		data = vcpu->arch.mcg_ctl;
 2907		break;
 2908	case MSR_IA32_MCG_STATUS:
 2909		data = vcpu->arch.mcg_status;
 2910		break;
 2911	default:
 2912		if (msr >= MSR_IA32_MC0_CTL &&
 2913		    msr < MSR_IA32_MCx_CTL(bank_num)) {
 2914			u32 offset = msr - MSR_IA32_MC0_CTL;
 2915			data = vcpu->arch.mce_banks[offset];
 2916			break;
 2917		}
 2918		return 1;
 2919	}
 2920	*pdata = data;
 2921	return 0;
 2922}
 2923
 2924int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 2925{
 2926	switch (msr_info->index) {
 
 
 2927	case MSR_IA32_PLATFORM_ID:
 
 2928	case MSR_IA32_EBL_CR_POWERON:
 2929	case MSR_IA32_DEBUGCTLMSR:
 2930	case MSR_IA32_LASTBRANCHFROMIP:
 2931	case MSR_IA32_LASTBRANCHTOIP:
 2932	case MSR_IA32_LASTINTFROMIP:
 2933	case MSR_IA32_LASTINTTOIP:
 2934	case MSR_K8_SYSCFG:
 2935	case MSR_K8_TSEG_ADDR:
 2936	case MSR_K8_TSEG_MASK:
 2937	case MSR_VM_HSAVE_PA:
 
 
 
 
 
 
 2938	case MSR_K8_INT_PENDING_MSG:
 2939	case MSR_AMD64_NB_CFG:
 2940	case MSR_FAM10H_MMIO_CONF_BASE:
 2941	case MSR_AMD64_BU_CFG2:
 2942	case MSR_IA32_PERF_CTL:
 2943	case MSR_AMD64_DC_CFG:
 2944	case MSR_F15H_EX_CFG:
 2945		msr_info->data = 0;
 2946		break;
 2947	case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
 2948	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
 2949	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
 2950	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
 2951	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
 2952		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
 2953			return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
 2954		msr_info->data = 0;
 2955		break;
 2956	case MSR_IA32_UCODE_REV:
 2957		msr_info->data = vcpu->arch.microcode_version;
 2958		break;
 2959	case MSR_IA32_ARCH_CAPABILITIES:
 2960		if (!msr_info->host_initiated &&
 2961		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
 2962			return 1;
 2963		msr_info->data = vcpu->arch.arch_capabilities;
 2964		break;
 2965	case MSR_IA32_POWER_CTL:
 2966		msr_info->data = vcpu->arch.msr_ia32_power_ctl;
 2967		break;
 2968	case MSR_IA32_TSC:
 2969		msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
 2970		break;
 2971	case MSR_MTRRcap:
 2972	case 0x200 ... 0x2ff:
 2973		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
 2974	case 0xcd: /* fsb frequency */
 2975		msr_info->data = 3;
 2976		break;
 2977		/*
 2978		 * MSR_EBC_FREQUENCY_ID
 2979		 * Conservative value valid for even the basic CPU models.
 2980		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
 2981		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
 2982		 * and 266MHz for model 3, or 4. Set Core Clock
 2983		 * Frequency to System Bus Frequency Ratio to 1 (bits
 2984		 * 31:24) even though these are only valid for CPU
 2985		 * models > 2, however guests may end up dividing or
 2986		 * multiplying by zero otherwise.
 2987		 */
 2988	case MSR_EBC_FREQUENCY_ID:
 2989		msr_info->data = 1 << 24;
 2990		break;
 2991	case MSR_IA32_APICBASE:
 2992		msr_info->data = kvm_get_apic_base(vcpu);
 2993		break;
 2994	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 2995		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
 2996		break;
 2997	case MSR_IA32_TSCDEADLINE:
 2998		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
 2999		break;
 3000	case MSR_IA32_TSC_ADJUST:
 3001		msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
 3002		break;
 3003	case MSR_IA32_MISC_ENABLE:
 3004		msr_info->data = vcpu->arch.ia32_misc_enable_msr;
 3005		break;
 3006	case MSR_IA32_SMBASE:
 3007		if (!msr_info->host_initiated)
 3008			return 1;
 3009		msr_info->data = vcpu->arch.smbase;
 3010		break;
 3011	case MSR_SMI_COUNT:
 3012		msr_info->data = vcpu->arch.smi_count;
 3013		break;
 3014	case MSR_IA32_PERF_STATUS:
 3015		/* TSC increment by tick */
 3016		msr_info->data = 1000ULL;
 3017		/* CPU multiplier */
 3018		msr_info->data |= (((uint64_t)4ULL) << 40);
 3019		break;
 3020	case MSR_EFER:
 3021		msr_info->data = vcpu->arch.efer;
 3022		break;
 3023	case MSR_KVM_WALL_CLOCK:
 3024	case MSR_KVM_WALL_CLOCK_NEW:
 3025		msr_info->data = vcpu->kvm->arch.wall_clock;
 3026		break;
 3027	case MSR_KVM_SYSTEM_TIME:
 3028	case MSR_KVM_SYSTEM_TIME_NEW:
 3029		msr_info->data = vcpu->arch.time;
 3030		break;
 3031	case MSR_KVM_ASYNC_PF_EN:
 3032		msr_info->data = vcpu->arch.apf.msr_val;
 3033		break;
 3034	case MSR_KVM_STEAL_TIME:
 3035		msr_info->data = vcpu->arch.st.msr_val;
 3036		break;
 3037	case MSR_KVM_PV_EOI_EN:
 3038		msr_info->data = vcpu->arch.pv_eoi.msr_val;
 3039		break;
 3040	case MSR_KVM_POLL_CONTROL:
 3041		msr_info->data = vcpu->arch.msr_kvm_poll_control;
 3042		break;
 3043	case MSR_IA32_P5_MC_ADDR:
 3044	case MSR_IA32_P5_MC_TYPE:
 3045	case MSR_IA32_MCG_CAP:
 3046	case MSR_IA32_MCG_CTL:
 3047	case MSR_IA32_MCG_STATUS:
 3048	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
 3049		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
 3050				   msr_info->host_initiated);
 3051	case MSR_K7_CLK_CTL:
 3052		/*
 3053		 * Provide expected ramp-up count for K7. All other
 3054		 * are set to zero, indicating minimum divisors for
 3055		 * every field.
 3056		 *
 3057		 * This prevents guest kernels on AMD host with CPU
 3058		 * type 6, model 8 and higher from exploding due to
 3059		 * the rdmsr failing.
 3060		 */
 3061		msr_info->data = 0x20000000;
 3062		break;
 3063	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
 3064	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 3065	case HV_X64_MSR_CRASH_CTL:
 3066	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
 3067	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
 3068	case HV_X64_MSR_TSC_EMULATION_CONTROL:
 3069	case HV_X64_MSR_TSC_EMULATION_STATUS:
 3070		return kvm_hv_get_msr_common(vcpu,
 3071					     msr_info->index, &msr_info->data,
 3072					     msr_info->host_initiated);
 3073		break;
 3074	case MSR_IA32_BBL_CR_CTL3:
 3075		/* This legacy MSR exists but isn't fully documented in current
 3076		 * silicon.  It is however accessed by winxp in very narrow
 3077		 * scenarios where it sets bit #19, itself documented as
 3078		 * a "reserved" bit.  Best effort attempt to source coherent
 3079		 * read data here should the balance of the register be
 3080		 * interpreted by the guest:
 3081		 *
 3082		 * L2 cache control register 3: 64GB range, 256KB size,
 3083		 * enabled, latency 0x1, configured
 3084		 */
 3085		msr_info->data = 0xbe702111;
 3086		break;
 3087	case MSR_AMD64_OSVW_ID_LENGTH:
 3088		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
 3089			return 1;
 3090		msr_info->data = vcpu->arch.osvw.length;
 3091		break;
 3092	case MSR_AMD64_OSVW_STATUS:
 3093		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
 3094			return 1;
 3095		msr_info->data = vcpu->arch.osvw.status;
 3096		break;
 3097	case MSR_PLATFORM_INFO:
 3098		if (!msr_info->host_initiated &&
 3099		    !vcpu->kvm->arch.guest_can_read_msr_platform_info)
 3100			return 1;
 3101		msr_info->data = vcpu->arch.msr_platform_info;
 3102		break;
 3103	case MSR_MISC_FEATURES_ENABLES:
 3104		msr_info->data = vcpu->arch.msr_misc_features_enables;
 3105		break;
 3106	case MSR_K7_HWCR:
 3107		msr_info->data = vcpu->arch.msr_hwcr;
 3108		break;
 3109	default:
 3110		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
 3111			return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
 3112		if (!ignore_msrs) {
 3113			vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
 3114					       msr_info->index);
 3115			return 1;
 3116		} else {
 3117			if (report_ignored_msrs)
 3118				vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
 3119					msr_info->index);
 3120			msr_info->data = 0;
 3121		}
 3122		break;
 3123	}
 
 3124	return 0;
 3125}
 3126EXPORT_SYMBOL_GPL(kvm_get_msr_common);
 3127
 3128/*
 3129 * Read or write a bunch of msrs. All parameters are kernel addresses.
 3130 *
 3131 * @return number of msrs set successfully.
 3132 */
 3133static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 3134		    struct kvm_msr_entry *entries,
 3135		    int (*do_msr)(struct kvm_vcpu *vcpu,
 3136				  unsigned index, u64 *data))
 3137{
 3138	int i;
 3139
 
 3140	for (i = 0; i < msrs->nmsrs; ++i)
 3141		if (do_msr(vcpu, entries[i].index, &entries[i].data))
 3142			break;
 
 3143
 3144	return i;
 3145}
 3146
 3147/*
 3148 * Read or write a bunch of msrs. Parameters are user addresses.
 3149 *
 3150 * @return number of msrs set successfully.
 3151 */
 3152static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 3153		  int (*do_msr)(struct kvm_vcpu *vcpu,
 3154				unsigned index, u64 *data),
 3155		  int writeback)
 3156{
 3157	struct kvm_msrs msrs;
 3158	struct kvm_msr_entry *entries;
 3159	int r, n;
 3160	unsigned size;
 3161
 3162	r = -EFAULT;
 3163	if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
 3164		goto out;
 3165
 3166	r = -E2BIG;
 3167	if (msrs.nmsrs >= MAX_IO_MSRS)
 3168		goto out;
 3169
 
 3170	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
 3171	entries = memdup_user(user_msrs->entries, size);
 3172	if (IS_ERR(entries)) {
 3173		r = PTR_ERR(entries);
 3174		goto out;
 3175	}
 
 
 
 3176
 3177	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
 3178	if (r < 0)
 3179		goto out_free;
 3180
 3181	r = -EFAULT;
 3182	if (writeback && copy_to_user(user_msrs->entries, entries, size))
 3183		goto out_free;
 3184
 3185	r = n;
 3186
 3187out_free:
 3188	kfree(entries);
 3189out:
 3190	return r;
 3191}
 3192
 3193static inline bool kvm_can_mwait_in_guest(void)
 3194{
 3195	return boot_cpu_has(X86_FEATURE_MWAIT) &&
 3196		!boot_cpu_has_bug(X86_BUG_MONITOR) &&
 3197		boot_cpu_has(X86_FEATURE_ARAT);
 3198}
 3199
 3200int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 3201{
 3202	int r = 0;
 3203
 3204	switch (ext) {
 3205	case KVM_CAP_IRQCHIP:
 3206	case KVM_CAP_HLT:
 3207	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
 3208	case KVM_CAP_SET_TSS_ADDR:
 3209	case KVM_CAP_EXT_CPUID:
 3210	case KVM_CAP_EXT_EMUL_CPUID:
 3211	case KVM_CAP_CLOCKSOURCE:
 3212	case KVM_CAP_PIT:
 3213	case KVM_CAP_NOP_IO_DELAY:
 3214	case KVM_CAP_MP_STATE:
 3215	case KVM_CAP_SYNC_MMU:
 3216	case KVM_CAP_USER_NMI:
 3217	case KVM_CAP_REINJECT_CONTROL:
 3218	case KVM_CAP_IRQ_INJECT_STATUS:
 
 
 3219	case KVM_CAP_IOEVENTFD:
 3220	case KVM_CAP_IOEVENTFD_NO_LENGTH:
 3221	case KVM_CAP_PIT2:
 3222	case KVM_CAP_PIT_STATE2:
 3223	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
 3224	case KVM_CAP_XEN_HVM:
 
 3225	case KVM_CAP_VCPU_EVENTS:
 3226	case KVM_CAP_HYPERV:
 3227	case KVM_CAP_HYPERV_VAPIC:
 3228	case KVM_CAP_HYPERV_SPIN:
 3229	case KVM_CAP_HYPERV_SYNIC:
 3230	case KVM_CAP_HYPERV_SYNIC2:
 3231	case KVM_CAP_HYPERV_VP_INDEX:
 3232	case KVM_CAP_HYPERV_EVENTFD:
 3233	case KVM_CAP_HYPERV_TLBFLUSH:
 3234	case KVM_CAP_HYPERV_SEND_IPI:
 3235	case KVM_CAP_HYPERV_CPUID:
 3236	case KVM_CAP_PCI_SEGMENT:
 3237	case KVM_CAP_DEBUGREGS:
 3238	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 3239	case KVM_CAP_XSAVE:
 3240	case KVM_CAP_ASYNC_PF:
 3241	case KVM_CAP_GET_TSC_KHZ:
 3242	case KVM_CAP_KVMCLOCK_CTRL:
 3243	case KVM_CAP_READONLY_MEM:
 3244	case KVM_CAP_HYPERV_TIME:
 3245	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
 3246	case KVM_CAP_TSC_DEADLINE_TIMER:
 3247	case KVM_CAP_DISABLE_QUIRKS:
 3248	case KVM_CAP_SET_BOOT_CPU_ID:
 3249 	case KVM_CAP_SPLIT_IRQCHIP:
 3250	case KVM_CAP_IMMEDIATE_EXIT:
 3251	case KVM_CAP_PMU_EVENT_FILTER:
 3252	case KVM_CAP_GET_MSR_FEATURES:
 3253	case KVM_CAP_MSR_PLATFORM_INFO:
 3254	case KVM_CAP_EXCEPTION_PAYLOAD:
 3255		r = 1;
 3256		break;
 3257	case KVM_CAP_SYNC_REGS:
 3258		r = KVM_SYNC_X86_VALID_FIELDS;
 3259		break;
 3260	case KVM_CAP_ADJUST_CLOCK:
 3261		r = KVM_CLOCK_TSC_STABLE;
 3262		break;
 3263	case KVM_CAP_X86_DISABLE_EXITS:
 3264		r |=  KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
 3265		      KVM_X86_DISABLE_EXITS_CSTATE;
 3266		if(kvm_can_mwait_in_guest())
 3267			r |= KVM_X86_DISABLE_EXITS_MWAIT;
 3268		break;
 3269	case KVM_CAP_X86_SMM:
 3270		/* SMBASE is usually relocated above 1M on modern chipsets,
 3271		 * and SMM handlers might indeed rely on 4G segment limits,
 3272		 * so do not report SMM to be available if real mode is
 3273		 * emulated via vm86 mode.  Still, do not go to great lengths
 3274		 * to avoid userspace's usage of the feature, because it is a
 3275		 * fringe case that is not enabled except via specific settings
 3276		 * of the module parameters.
 3277		 */
 3278		r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
 3279		break;
 3280	case KVM_CAP_VAPIC:
 3281		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
 3282		break;
 3283	case KVM_CAP_NR_VCPUS:
 3284		r = KVM_SOFT_MAX_VCPUS;
 3285		break;
 3286	case KVM_CAP_MAX_VCPUS:
 3287		r = KVM_MAX_VCPUS;
 3288		break;
 3289	case KVM_CAP_MAX_VCPU_ID:
 3290		r = KVM_MAX_VCPU_ID;
 3291		break;
 3292	case KVM_CAP_PV_MMU:	/* obsolete */
 3293		r = 0;
 3294		break;
 
 
 
 3295	case KVM_CAP_MCE:
 3296		r = KVM_MAX_MCE_BANKS;
 3297		break;
 3298	case KVM_CAP_XCRS:
 3299		r = boot_cpu_has(X86_FEATURE_XSAVE);
 3300		break;
 3301	case KVM_CAP_TSC_CONTROL:
 3302		r = kvm_has_tsc_control;
 3303		break;
 3304	case KVM_CAP_X2APIC_API:
 3305		r = KVM_X2APIC_API_VALID_FLAGS;
 3306		break;
 3307	case KVM_CAP_NESTED_STATE:
 3308		r = kvm_x86_ops->get_nested_state ?
 3309			kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
 3310		break;
 3311	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
 3312		r = kvm_x86_ops->enable_direct_tlbflush != NULL;
 3313		break;
 3314	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
 3315		r = kvm_x86_ops->nested_enable_evmcs != NULL;
 3316		break;
 3317	default:
 
 3318		break;
 3319	}
 3320	return r;
 3321
 3322}
 3323
 3324long kvm_arch_dev_ioctl(struct file *filp,
 3325			unsigned int ioctl, unsigned long arg)
 3326{
 3327	void __user *argp = (void __user *)arg;
 3328	long r;
 3329
 3330	switch (ioctl) {
 3331	case KVM_GET_MSR_INDEX_LIST: {
 3332		struct kvm_msr_list __user *user_msr_list = argp;
 3333		struct kvm_msr_list msr_list;
 3334		unsigned n;
 3335
 3336		r = -EFAULT;
 3337		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
 3338			goto out;
 3339		n = msr_list.nmsrs;
 3340		msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
 3341		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
 3342			goto out;
 3343		r = -E2BIG;
 3344		if (n < msr_list.nmsrs)
 3345			goto out;
 3346		r = -EFAULT;
 3347		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
 3348				 num_msrs_to_save * sizeof(u32)))
 3349			goto out;
 3350		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
 3351				 &emulated_msrs,
 3352				 num_emulated_msrs * sizeof(u32)))
 3353			goto out;
 3354		r = 0;
 3355		break;
 3356	}
 3357	case KVM_GET_SUPPORTED_CPUID:
 3358	case KVM_GET_EMULATED_CPUID: {
 3359		struct kvm_cpuid2 __user *cpuid_arg = argp;
 3360		struct kvm_cpuid2 cpuid;
 3361
 3362		r = -EFAULT;
 3363		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
 3364			goto out;
 3365
 3366		r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
 3367					    ioctl);
 3368		if (r)
 3369			goto out;
 3370
 3371		r = -EFAULT;
 3372		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
 3373			goto out;
 3374		r = 0;
 3375		break;
 3376	}
 3377	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
 3378		r = -EFAULT;
 3379		if (copy_to_user(argp, &kvm_mce_cap_supported,
 3380				 sizeof(kvm_mce_cap_supported)))
 3381			goto out;
 3382		r = 0;
 3383		break;
 3384	case KVM_GET_MSR_FEATURE_INDEX_LIST: {
 3385		struct kvm_msr_list __user *user_msr_list = argp;
 3386		struct kvm_msr_list msr_list;
 3387		unsigned int n;
 3388
 
 3389		r = -EFAULT;
 3390		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
 3391			goto out;
 3392		n = msr_list.nmsrs;
 3393		msr_list.nmsrs = num_msr_based_features;
 3394		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
 3395			goto out;
 3396		r = -E2BIG;
 3397		if (n < msr_list.nmsrs)
 3398			goto out;
 3399		r = -EFAULT;
 3400		if (copy_to_user(user_msr_list->indices, &msr_based_features,
 3401				 num_msr_based_features * sizeof(u32)))
 3402			goto out;
 3403		r = 0;
 3404		break;
 3405	}
 3406	case KVM_GET_MSRS:
 3407		r = msr_io(NULL, argp, do_get_msr_feature, 1);
 3408		break;
 3409	}
 3410	default:
 3411		r = -EINVAL;
 3412	}
 3413out:
 3414	return r;
 3415}
 3416
/*
 * Cross-CPU callback that executes WBINVD on the CPU it runs on; the
 * argument is unused.
 */
static void wbinvd_ipi(void *garbage)
{
	wbinvd();
}
 3421
 3422static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
 3423{
 3424	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
 
 3425}
 3426
/*
 * Arch hook run when @vcpu is loaded onto physical CPU @cpu.
 *
 * Besides calling the vendor vcpu_load hook, this deals with WBINVD
 * bookkeeping, pending host FPU state, TSC/kvmclock fix-ups when the vcpu
 * changed CPU or the TSC is unstable, and requests a steal-time update.
 */
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	/* Address WBINVD may be executed by guest */
	if (need_emulate_wbinvd(vcpu)) {
		/*
		 * With a WBINVD exit, just mark this CPU dirty; otherwise
		 * run WBINVD on the previous CPU if the vcpu has moved.
		 */
		if (kvm_x86_ops->has_wbinvd_exit())
			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
			smp_call_function_single(vcpu->cpu,
					wbinvd_ipi, NULL, 1);
	}

	kvm_x86_ops->vcpu_load(vcpu, cpu);

	fpregs_assert_state_consistent();
	if (test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_return();

	/* Apply any externally detected TSC adjustments (due to suspend) */
	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
		vcpu->arch.tsc_offset_adjustment = 0;
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
	}

	if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
		/* No delta on the first load (last_host_tsc still zero). */
		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
				rdtsc() - vcpu->arch.last_host_tsc;
		if (tsc_delta < 0)
			mark_tsc_unstable("KVM discovered backwards TSC");

		/*
		 * Re-checked on purpose: the call above may have just
		 * marked the TSC unstable.
		 */
		if (kvm_check_tsc_unstable()) {
			u64 offset = kvm_compute_tsc_offset(vcpu,
						vcpu->arch.last_guest_tsc);
			kvm_vcpu_write_tsc_offset(vcpu, offset);
			vcpu->arch.tsc_catchup = 1;
		}

		if (kvm_lapic_hv_timer_in_use(vcpu))
			kvm_lapic_restart_hv_timer(vcpu);

		/*
		 * On a host with synchronized TSC, there is no need to update
		 * kvmclock on vcpu->cpu migration
		 */
		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
		if (vcpu->cpu != cpu)
			kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
		/* Record the new CPU only after the checks that compare against the old one. */
		vcpu->cpu = cpu;
	}

	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
 3480
 3481static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 
 
 
 
 
 
 
 3482{
 3483	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 3484		return;
 
 
 
 3485
 3486	vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
 
 
 
 3487
 3488	kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
 3489			&vcpu->arch.st.steal.preempted,
 3490			offsetof(struct kvm_steal_time, preempted),
 3491			sizeof(vcpu->arch.st.steal.preempted));
 
 
 
 
 
 
 
 
 3492}
 3493
 3494void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 
 
 
 3495{
 3496	int idx;
 
 3497
 3498	if (vcpu->preempted)
 3499		vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3500
 3501	/*
 3502	 * Disable page faults because we're in atomic context here.
 3503	 * kvm_write_guest_offset_cached() would call might_fault()
 3504	 * that relies on pagefault_disable() to tell if there's a
 3505	 * bug. NOTE: the write to guest memory may not go through if
 3506	 * during postcopy live migration or if there's heavy guest
 3507	 * paging.
 3508	 */
 3509	pagefault_disable();
 3510	/*
 3511	 * kvm_memslots() will be called by
 3512	 * kvm_write_guest_offset_cached() so take the srcu lock.
 3513	 */
 3514	idx = srcu_read_lock(&vcpu->kvm->srcu);
 3515	kvm_steal_time_set_preempted(vcpu);
 3516	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 3517	pagefault_enable();
 3518	kvm_x86_ops->vcpu_put(vcpu);
 3519	vcpu->arch.last_host_tsc = rdtsc();
 3520	/*
 3521	 * If userspace has set any breakpoints or watchpoints, dr6 is restored
 3522	 * on every vmexit, but if not, we might have a stale dr6 from the
 3523	 * guest. do_debug expects dr6 to be cleared after it runs, do the same.
 3524	 */
 3525	set_debugreg(0, 6);
 3526}
 3527
 3528static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 3529				    struct kvm_lapic_state *s)
 
 3530{
 3531	if (vcpu->arch.apicv_active)
 3532		kvm_x86_ops->sync_pir_to_irr(vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
 3533
 3534	return kvm_apic_get_state(vcpu, s);
 
 3535}
 3536
 3537static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 3538				    struct kvm_lapic_state *s)
 
 3539{
 3540	int r;
 3541
 3542	r = kvm_apic_set_state(vcpu, s);
 3543	if (r)
 3544		return r;
 3545	update_cr8_intercept(vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3546
 3547	return 0;
 
 
 
 
 
 
 
 3548}
 3549
 3550static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
 3551{
 3552	return (!lapic_in_kernel(vcpu) ||
 3553		kvm_apic_accept_pic_intr(vcpu));
 
 3554}
 3555
 3556/*
 3557 * if userspace requested an interrupt window, check that the
 3558 * interrupt window is open.
 3559 *
 3560 * No need to exit to userspace if we already have an interrupt queued.
 3561 */
 3562static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
 3563{
 3564	return kvm_arch_interrupt_allowed(vcpu) &&
 3565		!kvm_cpu_has_interrupt(vcpu) &&
 3566		!kvm_event_needs_reinjection(vcpu) &&
 3567		kvm_cpu_accept_dm_intr(vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3568}
 3569
 3570static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 3571				    struct kvm_interrupt *irq)
 
 
 3572{
 3573	if (irq->irq >= KVM_NR_INTERRUPTS)
 3574		return -EINVAL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3575
 3576	if (!irqchip_in_kernel(vcpu->kvm)) {
 3577		kvm_queue_interrupt(vcpu, irq->irq, false);
 3578		kvm_make_request(KVM_REQ_EVENT, vcpu);
 3579		return 0;
 3580	}
 3581
 3582	/*
 3583	 * With in-kernel LAPIC, we only use this to inject EXTINT, so
 3584	 * fail for in-kernel 8259.
 3585	 */
 3586	if (pic_in_kernel(vcpu->kvm))
 3587		return -ENXIO;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3588
 3589	if (vcpu->arch.pending_external_vector != -1)
 3590		return -EEXIST;
 
 
 
 
 3591
 3592	vcpu->arch.pending_external_vector = irq->irq;
 3593	kvm_make_request(KVM_REQ_EVENT, vcpu);
 3594	return 0;
 3595}
 3596
 3597static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 
 3598{
 3599	kvm_inject_nmi(vcpu);
 
 
 
 
 
 
 3600
 3601	return 0;
 3602}
 3603
 3604static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
 3605{
 3606	kvm_make_request(KVM_REQ_SMI, vcpu);
 3607
 3608	return 0;
 3609}
 3610
 3611static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
 3612					   struct kvm_tpr_access_ctl *tac)
 3613{
 3614	if (tac->flags)
 3615		return -EINVAL;
 3616	vcpu->arch.tpr_access_reporting = !!tac->enabled;
 3617	return 0;
 3618}
 3619
 3620static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 3621					u64 mcg_cap)
 3622{
 3623	int r;
 3624	unsigned bank_num = mcg_cap & 0xff, bank;
 3625
 3626	r = -EINVAL;
 3627	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 3628		goto out;
 3629	if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
 3630		goto out;
 3631	r = 0;
 3632	vcpu->arch.mcg_cap = mcg_cap;
 3633	/* Init IA32_MCG_CTL to all 1s */
 3634	if (mcg_cap & MCG_CTL_P)
 3635		vcpu->arch.mcg_ctl = ~(u64)0;
 3636	/* Init IA32_MCi_CTL to all 1s */
 3637	for (bank = 0; bank < bank_num; bank++)
 3638		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 3639
 3640	kvm_x86_ops->setup_mce(vcpu);
 3641out:
 3642	return r;
 3643}
 3644
 3645static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 3646				      struct kvm_x86_mce *mce)
 3647{
 3648	u64 mcg_cap = vcpu->arch.mcg_cap;
 3649	unsigned bank_num = mcg_cap & 0xff;
 3650	u64 *banks = vcpu->arch.mce_banks;
 3651
 3652	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
 3653		return -EINVAL;
 3654	/*
 3655	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
 3656	 * reporting is disabled
 3657	 */
 3658	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
 3659	    vcpu->arch.mcg_ctl != ~(u64)0)
 3660		return 0;
 3661	banks += 4 * mce->bank;
 3662	/*
 3663	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
 3664	 * reporting is disabled for the bank
 3665	 */
 3666	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
 3667		return 0;
 3668	if (mce->status & MCI_STATUS_UC) {
 3669		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
 3670		    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
 3671			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 3672			return 0;
 3673		}
 3674		if (banks[1] & MCI_STATUS_VAL)
 3675			mce->status |= MCI_STATUS_OVER;
 3676		banks[2] = mce->addr;
 3677		banks[3] = mce->misc;
 3678		vcpu->arch.mcg_status = mce->mcg_status;
 3679		banks[1] = mce->status;
 3680		kvm_queue_exception(vcpu, MC_VECTOR);
 3681	} else if (!(banks[1] & MCI_STATUS_VAL)
 3682		   || !(banks[1] & MCI_STATUS_UC)) {
 3683		if (banks[1] & MCI_STATUS_VAL)
 3684			mce->status |= MCI_STATUS_OVER;
 3685		banks[2] = mce->addr;
 3686		banks[3] = mce->misc;
 3687		banks[1] = mce->status;
 3688	} else
 3689		banks[1] |= MCI_STATUS_OVER;
 3690	return 0;
 3691}
 3692
 3693static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 3694					       struct kvm_vcpu_events *events)
 3695{
 3696	process_nmi(vcpu);
 3697
 3698	/*
 3699	 * The API doesn't provide the instruction length for software
 3700	 * exceptions, so don't report them. As long as the guest RIP
 3701	 * isn't advanced, we should expect to encounter the exception
 3702	 * again.
 3703	 */
 3704	if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
 3705		events->exception.injected = 0;
 3706		events->exception.pending = 0;
 3707	} else {
 3708		events->exception.injected = vcpu->arch.exception.injected;
 3709		events->exception.pending = vcpu->arch.exception.pending;
 3710		/*
 3711		 * For ABI compatibility, deliberately conflate
 3712		 * pending and injected exceptions when
 3713		 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
 3714		 */
 3715		if (!vcpu->kvm->arch.exception_payload_enabled)
 3716			events->exception.injected |=
 3717				vcpu->arch.exception.pending;
 3718	}
 3719	events->exception.nr = vcpu->arch.exception.nr;
 3720	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
 
 3721	events->exception.error_code = vcpu->arch.exception.error_code;
 3722	events->exception_has_payload = vcpu->arch.exception.has_payload;
 3723	events->exception_payload = vcpu->arch.exception.payload;
 3724
 3725	events->interrupt.injected =
 3726		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
 3727	events->interrupt.nr = vcpu->arch.interrupt.nr;
 3728	events->interrupt.soft = 0;
 3729	events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
 
 
 3730
 3731	events->nmi.injected = vcpu->arch.nmi_injected;
 3732	events->nmi.pending = vcpu->arch.nmi_pending != 0;
 3733	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
 3734	events->nmi.pad = 0;
 3735
 3736	events->sipi_vector = 0; /* never valid when reporting to user space */
 3737
 3738	events->smi.smm = is_smm(vcpu);
 3739	events->smi.pending = vcpu->arch.smi_pending;
 3740	events->smi.smm_inside_nmi =
 3741		!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
 3742	events->smi.latched_init = kvm_lapic_latched_init(vcpu);
 3743
 3744	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
 3745			 | KVM_VCPUEVENT_VALID_SHADOW
 3746			 | KVM_VCPUEVENT_VALID_SMM);
 3747	if (vcpu->kvm->arch.exception_payload_enabled)
 3748		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
 3749
 3750	memset(&events->reserved, 0, sizeof(events->reserved));
 3751}
 3752
 3753static void kvm_smm_changed(struct kvm_vcpu *vcpu);
 3754
 3755static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 3756					      struct kvm_vcpu_events *events)
 3757{
 3758	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
 3759			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 3760			      | KVM_VCPUEVENT_VALID_SHADOW
 3761			      | KVM_VCPUEVENT_VALID_SMM
 3762			      | KVM_VCPUEVENT_VALID_PAYLOAD))
 3763		return -EINVAL;
 3764
 3765	if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
 3766		if (!vcpu->kvm->arch.exception_payload_enabled)
 3767			return -EINVAL;
 3768		if (events->exception.pending)
 3769			events->exception.injected = 0;
 3770		else
 3771			events->exception_has_payload = 0;
 3772	} else {
 3773		events->exception.pending = 0;
 3774		events->exception_has_payload = 0;
 3775	}
 3776
 3777	if ((events->exception.injected || events->exception.pending) &&
 3778	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
 3779		return -EINVAL;
 3780
 3781	/* INITs are latched while in SMM */
 3782	if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
 3783	    (events->smi.smm || events->smi.pending) &&
 3784	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
 3785		return -EINVAL;
 3786
 3787	process_nmi(vcpu);
 3788	vcpu->arch.exception.injected = events->exception.injected;
 3789	vcpu->arch.exception.pending = events->exception.pending;
 3790	vcpu->arch.exception.nr = events->exception.nr;
 3791	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
 3792	vcpu->arch.exception.error_code = events->exception.error_code;
 3793	vcpu->arch.exception.has_payload = events->exception_has_payload;
 3794	vcpu->arch.exception.payload = events->exception_payload;
 3795
 3796	vcpu->arch.interrupt.injected = events->interrupt.injected;
 3797	vcpu->arch.interrupt.nr = events->interrupt.nr;
 3798	vcpu->arch.interrupt.soft = events->interrupt.soft;
 3799	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
 3800		kvm_x86_ops->set_interrupt_shadow(vcpu,
 3801						  events->interrupt.shadow);
 3802
 3803	vcpu->arch.nmi_injected = events->nmi.injected;
 3804	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
 3805		vcpu->arch.nmi_pending = events->nmi.pending;
 3806	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
 3807
 3808	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
 3809	    lapic_in_kernel(vcpu))
 3810		vcpu->arch.apic->sipi_vector = events->sipi_vector;
 3811
 3812	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
 3813		if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
 3814			if (events->smi.smm)
 3815				vcpu->arch.hflags |= HF_SMM_MASK;
 3816			else
 3817				vcpu->arch.hflags &= ~HF_SMM_MASK;
 3818			kvm_smm_changed(vcpu);
 3819		}
 3820
 3821		vcpu->arch.smi_pending = events->smi.pending;
 3822
 3823		if (events->smi.smm) {
 3824			if (events->smi.smm_inside_nmi)
 3825				vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
 3826			else
 3827				vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
 3828			if (lapic_in_kernel(vcpu)) {
 3829				if (events->smi.latched_init)
 3830					set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 3831				else
 3832					clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 3833			}
 3834		}
 3835	}
 3836
 3837	kvm_make_request(KVM_REQ_EVENT, vcpu);
 3838
 3839	return 0;
 3840}
 3841
 3842static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 3843					     struct kvm_debugregs *dbgregs)
 3844{
 3845	unsigned long val;
 3846
 3847	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
 3848	kvm_get_dr(vcpu, 6, &val);
 3849	dbgregs->dr6 = val;
 3850	dbgregs->dr7 = vcpu->arch.dr7;
 3851	dbgregs->flags = 0;
 3852	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
 3853}
 3854
 3855static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 3856					    struct kvm_debugregs *dbgregs)
 3857{
 3858	if (dbgregs->flags)
 3859		return -EINVAL;
 3860
 3861	if (dbgregs->dr6 & ~0xffffffffull)
 3862		return -EINVAL;
 3863	if (dbgregs->dr7 & ~0xffffffffull)
 3864		return -EINVAL;
 3865
 3866	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
 3867	kvm_update_dr0123(vcpu);
 3868	vcpu->arch.dr6 = dbgregs->dr6;
 3869	kvm_update_dr6(vcpu);
 3870	vcpu->arch.dr7 = dbgregs->dr7;
 3871	kvm_update_dr7(vcpu);
 3872
 3873	return 0;
 3874}
 3875
 3876#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
 3877
 3878static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 3879{
 3880	struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
 3881	u64 xstate_bv = xsave->header.xfeatures;
 3882	u64 valid;
 3883
 3884	/*
 3885	 * Copy legacy XSAVE area, to avoid complications with CPUID
 3886	 * leaves 0 and 1 in the loop below.
 3887	 */
 3888	memcpy(dest, xsave, XSAVE_HDR_OFFSET);
 3889
 3890	/* Set XSTATE_BV */
 3891	xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
 3892	*(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
 3893
 3894	/*
 3895	 * Copy each region from the possibly compacted offset to the
 3896	 * non-compacted offset.
 3897	 */
 3898	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
 3899	while (valid) {
 3900		u64 xfeature_mask = valid & -valid;
 3901		int xfeature_nr = fls64(xfeature_mask) - 1;
 3902		void *src = get_xsave_addr(xsave, xfeature_nr);
 3903
 3904		if (src) {
 3905			u32 size, offset, ecx, edx;
 3906			cpuid_count(XSTATE_CPUID, xfeature_nr,
 3907				    &size, &offset, &ecx, &edx);
 3908			if (xfeature_nr == XFEATURE_PKRU)
 3909				memcpy(dest + offset, &vcpu->arch.pkru,
 3910				       sizeof(vcpu->arch.pkru));
 3911			else
 3912				memcpy(dest + offset, src, size);
 3913
 3914		}
 3915
 3916		valid -= xfeature_mask;
 3917	}
 3918}
 3919
 3920static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
 3921{
 3922	struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
 3923	u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
 3924	u64 valid;
 3925
 3926	/*
 3927	 * Copy legacy XSAVE area, to avoid complications with CPUID
 3928	 * leaves 0 and 1 in the loop below.
 3929	 */
 3930	memcpy(xsave, src, XSAVE_HDR_OFFSET);
 3931
 3932	/* Set XSTATE_BV and possibly XCOMP_BV.  */
 3933	xsave->header.xfeatures = xstate_bv;
 3934	if (boot_cpu_has(X86_FEATURE_XSAVES))
 3935		xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
 3936
 3937	/*
 3938	 * Copy each region from the non-compacted offset to the
 3939	 * possibly compacted offset.
 3940	 */
 3941	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
 3942	while (valid) {
 3943		u64 xfeature_mask = valid & -valid;
 3944		int xfeature_nr = fls64(xfeature_mask) - 1;
 3945		void *dest = get_xsave_addr(xsave, xfeature_nr);
 3946
 3947		if (dest) {
 3948			u32 size, offset, ecx, edx;
 3949			cpuid_count(XSTATE_CPUID, xfeature_nr,
 3950				    &size, &offset, &ecx, &edx);
 3951			if (xfeature_nr == XFEATURE_PKRU)
 3952				memcpy(&vcpu->arch.pkru, src + offset,
 3953				       sizeof(vcpu->arch.pkru));
 3954			else
 3955				memcpy(dest, src + offset, size);
 3956		}
 3957
 3958		valid -= xfeature_mask;
 3959	}
 3960}
 3961
 3962static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 3963					 struct kvm_xsave *guest_xsave)
 3964{
 3965	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
 3966		memset(guest_xsave, 0, sizeof(struct kvm_xsave));
 3967		fill_xsave((u8 *) guest_xsave->region, vcpu);
 3968	} else {
 
 3969		memcpy(guest_xsave->region,
 3970			&vcpu->arch.guest_fpu->state.fxsave,
 3971			sizeof(struct fxregs_state));
 3972		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
 3973			XFEATURE_MASK_FPSSE;
 3974	}
 3975}
 3976
 3977#define XSAVE_MXCSR_OFFSET 24
 3978
 3979static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 3980					struct kvm_xsave *guest_xsave)
 3981{
 3982	u64 xstate_bv =
 3983		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
 3984	u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
 3985
 3986	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
 3987		/*
 3988		 * Here we allow setting states that are not present in
 3989		 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
 3990		 * with old userspace.
 3991		 */
 3992		if (xstate_bv & ~kvm_supported_xcr0() ||
 3993			mxcsr & ~mxcsr_feature_mask)
 3994			return -EINVAL;
 3995		load_xsave(vcpu, (u8 *)guest_xsave->region);
 3996	} else {
 3997		if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
 3998			mxcsr & ~mxcsr_feature_mask)
 3999			return -EINVAL;
 4000		memcpy(&vcpu->arch.guest_fpu->state.fxsave,
 4001			guest_xsave->region, sizeof(struct fxregs_state));
 4002	}
 4003	return 0;
 4004}
 4005
 4006static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
 4007					struct kvm_xcrs *guest_xcrs)
 4008{
 4009	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
 4010		guest_xcrs->nr_xcrs = 0;
 4011		return;
 4012	}
 4013
 4014	guest_xcrs->nr_xcrs = 1;
 4015	guest_xcrs->flags = 0;
 4016	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
 4017	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
 4018}
 4019
 4020static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
 4021				       struct kvm_xcrs *guest_xcrs)
 4022{
 4023	int i, r = 0;
 4024
 4025	if (!boot_cpu_has(X86_FEATURE_XSAVE))
 4026		return -EINVAL;
 4027
 4028	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
 4029		return -EINVAL;
 4030
 4031	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
 4032		/* Only support XCR0 currently */
 4033		if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
 4034			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
 4035				guest_xcrs->xcrs[i].value);
 4036			break;
 4037		}
 4038	if (r)
 4039		r = -EINVAL;
 4040	return r;
 4041}
 4042
 4043/*
 4044 * kvm_set_guest_paused() indicates to the guest kernel that it has been
 4045 * stopped by the hypervisor.  This function will be called from the host only.
 4046 * EINVAL is returned when the host attempts to set the flag for a guest that
 4047 * does not support pv clocks.
 4048 */
 4049static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 4050{
 4051	if (!vcpu->arch.pv_time_enabled)
 4052		return -EINVAL;
 4053	vcpu->arch.pvclock_set_guest_stopped_request = true;
 4054	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 4055	return 0;
 4056}
 4057
 4058static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 4059				     struct kvm_enable_cap *cap)
 4060{
 4061	int r;
 4062	uint16_t vmcs_version;
 4063	void __user *user_ptr;
 4064
 4065	if (cap->flags)
 4066		return -EINVAL;
 4067
 4068	switch (cap->cap) {
 4069	case KVM_CAP_HYPERV_SYNIC2:
 4070		if (cap->args[0])
 4071			return -EINVAL;
 4072		/* fall through */
 4073
 4074	case KVM_CAP_HYPERV_SYNIC:
 4075		if (!irqchip_in_kernel(vcpu->kvm))
 4076			return -EINVAL;
 4077		return kvm_hv_activate_synic(vcpu, cap->cap ==
 4078					     KVM_CAP_HYPERV_SYNIC2);
 4079	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
 4080		if (!kvm_x86_ops->nested_enable_evmcs)
 4081			return -ENOTTY;
 4082		r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version);
 4083		if (!r) {
 4084			user_ptr = (void __user *)(uintptr_t)cap->args[0];
 4085			if (copy_to_user(user_ptr, &vmcs_version,
 4086					 sizeof(vmcs_version)))
 4087				r = -EFAULT;
 4088		}
 4089		return r;
 4090	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
 4091		if (!kvm_x86_ops->enable_direct_tlbflush)
 4092			return -ENOTTY;
 4093
 4094		return kvm_x86_ops->enable_direct_tlbflush(vcpu);
 4095
 4096	default:
 4097		return -EINVAL;
 4098	}
 4099}
 4100
 4101long kvm_arch_vcpu_ioctl(struct file *filp,
 4102			 unsigned int ioctl, unsigned long arg)
 4103{
 4104	struct kvm_vcpu *vcpu = filp->private_data;
 4105	void __user *argp = (void __user *)arg;
 4106	int r;
 4107	union {
 4108		struct kvm_lapic_state *lapic;
 4109		struct kvm_xsave *xsave;
 4110		struct kvm_xcrs *xcrs;
 4111		void *buffer;
 4112	} u;
 4113
 4114	vcpu_load(vcpu);
 4115
 4116	u.buffer = NULL;
 4117	switch (ioctl) {
 4118	case KVM_GET_LAPIC: {
 4119		r = -EINVAL;
 4120		if (!lapic_in_kernel(vcpu))
 4121			goto out;
 4122		u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
 4123				GFP_KERNEL_ACCOUNT);
 4124
 4125		r = -ENOMEM;
 4126		if (!u.lapic)
 4127			goto out;
 4128		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
 4129		if (r)
 4130			goto out;
 4131		r = -EFAULT;
 4132		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
 4133			goto out;
 4134		r = 0;
 4135		break;
 4136	}
 4137	case KVM_SET_LAPIC: {
 4138		r = -EINVAL;
 4139		if (!lapic_in_kernel(vcpu))
 
 
 
 
 
 
 
 4140			goto out;
 4141		u.lapic = memdup_user(argp, sizeof(*u.lapic));
 4142		if (IS_ERR(u.lapic)) {
 4143			r = PTR_ERR(u.lapic);
 4144			goto out_nofree;
 4145		}
 4146
 4147		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
 
 
 
 4148		break;
 4149	}
 4150	case KVM_INTERRUPT: {
 4151		struct kvm_interrupt irq;
 4152
 4153		r = -EFAULT;
 4154		if (copy_from_user(&irq, argp, sizeof(irq)))
 4155			goto out;
 4156		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
 
 
 
 4157		break;
 4158	}
 4159	case KVM_NMI: {
 4160		r = kvm_vcpu_ioctl_nmi(vcpu);
 4161		break;
 4162	}
 4163	case KVM_SMI: {
 4164		r = kvm_vcpu_ioctl_smi(vcpu);
 4165		break;
 4166	}
 4167	case KVM_SET_CPUID: {
 4168		struct kvm_cpuid __user *cpuid_arg = argp;
 4169		struct kvm_cpuid cpuid;
 4170
 4171		r = -EFAULT;
 4172		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
 4173			goto out;
 4174		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
 
 
 4175		break;
 4176	}
 4177	case KVM_SET_CPUID2: {
 4178		struct kvm_cpuid2 __user *cpuid_arg = argp;
 4179		struct kvm_cpuid2 cpuid;
 4180
 4181		r = -EFAULT;
 4182		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
 4183			goto out;
 4184		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
 4185					      cpuid_arg->entries);
 
 
 4186		break;
 4187	}
 4188	case KVM_GET_CPUID2: {
 4189		struct kvm_cpuid2 __user *cpuid_arg = argp;
 4190		struct kvm_cpuid2 cpuid;
 4191
 4192		r = -EFAULT;
 4193		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
 4194			goto out;
 4195		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
 4196					      cpuid_arg->entries);
 4197		if (r)
 4198			goto out;
 4199		r = -EFAULT;
 4200		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
 4201			goto out;
 4202		r = 0;
 4203		break;
 4204	}
 4205	case KVM_GET_MSRS: {
 4206		int idx = srcu_read_lock(&vcpu->kvm->srcu);
 4207		r = msr_io(vcpu, argp, do_get_msr, 1);
 4208		srcu_read_unlock(&vcpu->kvm->srcu, idx);
 4209		break;
 4210	}
 4211	case KVM_SET_MSRS: {
 4212		int idx = srcu_read_lock(&vcpu->kvm->srcu);
 4213		r = msr_io(vcpu, argp, do_set_msr, 0);
 4214		srcu_read_unlock(&vcpu->kvm->srcu, idx);
 4215		break;
 4216	}
 4217	case KVM_TPR_ACCESS_REPORTING: {
 4218		struct kvm_tpr_access_ctl tac;
 4219
 4220		r = -EFAULT;
 4221		if (copy_from_user(&tac, argp, sizeof(tac)))
 4222			goto out;
 4223		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
 4224		if (r)
 4225			goto out;
 4226		r = -EFAULT;
 4227		if (copy_to_user(argp, &tac, sizeof(tac)))
 4228			goto out;
 4229		r = 0;
 4230		break;
 4231	};
 4232	case KVM_SET_VAPIC_ADDR: {
 4233		struct kvm_vapic_addr va;
 4234		int idx;
 4235
 4236		r = -EINVAL;
 4237		if (!lapic_in_kernel(vcpu))
 4238			goto out;
 4239		r = -EFAULT;
 4240		if (copy_from_user(&va, argp, sizeof(va)))
 4241			goto out;
 4242		idx = srcu_read_lock(&vcpu->kvm->srcu);
 4243		r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
 4244		srcu_read_unlock(&vcpu->kvm->srcu, idx);
 4245		break;
 4246	}
 4247	case KVM_X86_SETUP_MCE: {
 4248		u64 mcg_cap;
 4249
 4250		r = -EFAULT;
 4251		if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
 4252			goto out;
 4253		r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
 4254		break;
 4255	}
 4256	case KVM_X86_SET_MCE: {
 4257		struct kvm_x86_mce mce;
 4258
 4259		r = -EFAULT;
 4260		if (copy_from_user(&mce, argp, sizeof(mce)))
 4261			goto out;
 4262		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
 4263		break;
 4264	}
 4265	case KVM_GET_VCPU_EVENTS: {
 4266		struct kvm_vcpu_events events;
 4267
 4268		kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
 4269
 4270		r = -EFAULT;
 4271		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
 4272			break;
 4273		r = 0;
 4274		break;
 4275	}
 4276	case KVM_SET_VCPU_EVENTS: {
 4277		struct kvm_vcpu_events events;
 4278
 4279		r = -EFAULT;
 4280		if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
 4281			break;
 4282
 4283		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
 4284		break;
 4285	}
 4286	case KVM_GET_DEBUGREGS: {
 4287		struct kvm_debugregs dbgregs;
 4288
 4289		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
 4290
 4291		r = -EFAULT;
 4292		if (copy_to_user(argp, &dbgregs,
 4293				 sizeof(struct kvm_debugregs)))
 4294			break;
 4295		r = 0;
 4296		break;
 4297	}
 4298	case KVM_SET_DEBUGREGS: {
 4299		struct kvm_debugregs dbgregs;
 4300
 4301		r = -EFAULT;
 4302		if (copy_from_user(&dbgregs, argp,
 4303				   sizeof(struct kvm_debugregs)))
 4304			break;
 4305
 4306		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
 4307		break;
 4308	}
 4309	case KVM_GET_XSAVE: {
 4310		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
 4311		r = -ENOMEM;
 4312		if (!u.xsave)
 4313			break;
 4314
 4315		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
 4316
 4317		r = -EFAULT;
 4318		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
 4319			break;
 4320		r = 0;
 4321		break;
 4322	}
 4323	case KVM_SET_XSAVE: {
 4324		u.xsave = memdup_user(argp, sizeof(*u.xsave));
 4325		if (IS_ERR(u.xsave)) {
 4326			r = PTR_ERR(u.xsave);
 4327			goto out_nofree;
 4328		}
 
 
 
 4329
 4330		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 4331		break;
 4332	}
 4333	case KVM_GET_XCRS: {
 4334		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
 4335		r = -ENOMEM;
 4336		if (!u.xcrs)
 4337			break;
 4338
 4339		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
 4340
 4341		r = -EFAULT;
 4342		if (copy_to_user(argp, u.xcrs,
 4343				 sizeof(struct kvm_xcrs)))
 4344			break;
 4345		r = 0;
 4346		break;
 4347	}
 4348	case KVM_SET_XCRS: {
 4349		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
 4350		if (IS_ERR(u.xcrs)) {
 4351			r = PTR_ERR(u.xcrs);
 4352			goto out_nofree;
 4353		}
 
 
 
 
 4354
 4355		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
 4356		break;
 4357	}
 4358	case KVM_SET_TSC_KHZ: {
 4359		u32 user_tsc_khz;
 4360
 4361		r = -EINVAL;
 
 
 
 4362		user_tsc_khz = (u32)arg;
 4363
 4364		if (user_tsc_khz >= kvm_max_guest_tsc_khz)
 4365			goto out;
 4366
 4367		if (user_tsc_khz == 0)
 4368			user_tsc_khz = tsc_khz;
 4369
 4370		if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
 4371			r = 0;
 4372
 
 4373		goto out;
 4374	}
 4375	case KVM_GET_TSC_KHZ: {
 4376		r = vcpu->arch.virtual_tsc_khz;
 4377		goto out;
 4378	}
 4379	case KVM_KVMCLOCK_CTRL: {
 4380		r = kvm_set_guest_paused(vcpu);
 4381		goto out;
 4382	}
 4383	case KVM_ENABLE_CAP: {
 4384		struct kvm_enable_cap cap;
 4385
 4386		r = -EFAULT;
 4387		if (copy_from_user(&cap, argp, sizeof(cap)))
 4388			goto out;
 4389		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 4390		break;
 4391	}
 4392	case KVM_GET_NESTED_STATE: {
 4393		struct kvm_nested_state __user *user_kvm_nested_state = argp;
 4394		u32 user_data_size;
 4395
 4396		r = -EINVAL;
 4397		if (!kvm_x86_ops->get_nested_state)
 4398			break;
 4399
 4400		BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
 4401		r = -EFAULT;
 4402		if (get_user(user_data_size, &user_kvm_nested_state->size))
 4403			break;
 4404
 4405		r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
 4406						  user_data_size);
 4407		if (r < 0)
 4408			break;
 4409
 4410		if (r > user_data_size) {
 4411			if (put_user(r, &user_kvm_nested_state->size))
 4412				r = -EFAULT;
 4413			else
 4414				r = -E2BIG;
 4415			break;
 4416		}
 4417
 4418		r = 0;
 4419		break;
 4420	}
 4421	case KVM_SET_NESTED_STATE: {
 4422		struct kvm_nested_state __user *user_kvm_nested_state = argp;
 4423		struct kvm_nested_state kvm_state;
 4424
 4425		r = -EINVAL;
 4426		if (!kvm_x86_ops->set_nested_state)
 4427			break;
 4428
 4429		r = -EFAULT;
 4430		if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
 4431			break;
 4432
 4433		r = -EINVAL;
 4434		if (kvm_state.size < sizeof(kvm_state))
 4435			break;
 4436
 4437		if (kvm_state.flags &
 4438		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
 4439		      | KVM_STATE_NESTED_EVMCS))
 4440			break;
 4441
 4442		/* nested_run_pending implies guest_mode.  */
 4443		if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
 4444		    && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
 4445			break;
 4446
 4447		r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
 4448		break;
 4449	}
 4450	case KVM_GET_SUPPORTED_HV_CPUID: {
 4451		struct kvm_cpuid2 __user *cpuid_arg = argp;
 4452		struct kvm_cpuid2 cpuid;
 4453
 4454		r = -EFAULT;
 4455		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
 4456			goto out;
 4457
 4458		r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
 4459						cpuid_arg->entries);
 4460		if (r)
 4461			goto out;
 4462
 4463		r = -EFAULT;
 4464		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
 4465			goto out;
 4466		r = 0;
 4467		break;
 4468	}
 4469	default:
 4470		r = -EINVAL;
 4471	}
 4472out:
 4473	kfree(u.buffer);
 4474out_nofree:
 4475	vcpu_put(vcpu);
 4476	return r;
 4477}
 4478
/*
 * Fault hook for a vcpu.  Neither @vcpu nor @vmf is inspected: the
 * function unconditionally reports VM_FAULT_SIGBUS.
 */
vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
 4483
 4484static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 4485{
 4486	int ret;
 4487
 4488	if (addr > (unsigned int)(-3 * PAGE_SIZE))
 4489		return -EINVAL;
 4490	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
 4491	return ret;
 4492}
 4493
/*
 * Forward @ident_addr to the vendor module's set_identity_map_addr hook
 * and return whatever the hook returns.  No validation is done here.
 */
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
					      u64 ident_addr)
{
	return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
}
 4499
 4500static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 4501					 unsigned long kvm_nr_mmu_pages)
 4502{
 4503	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
 4504		return -EINVAL;
 4505
 4506	mutex_lock(&kvm->slots_lock);
 
 4507
 4508	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
 4509	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 4510
 
 4511	mutex_unlock(&kvm->slots_lock);
 4512	return 0;
 4513}
 4514
/*
 * Return kvm->arch.n_max_mmu_pages.  The field is read without taking
 * any lock in this function.
 */
static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
	return kvm->arch.n_max_mmu_pages;
}
 4519
 4520static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 4521{
 4522	struct kvm_pic *pic = kvm->arch.vpic;
 4523	int r;
 4524
 4525	r = 0;
 4526	switch (chip->chip_id) {
 4527	case KVM_IRQCHIP_PIC_MASTER:
 4528		memcpy(&chip->chip.pic, &pic->pics[0],
 
 4529			sizeof(struct kvm_pic_state));
 4530		break;
 4531	case KVM_IRQCHIP_PIC_SLAVE:
 4532		memcpy(&chip->chip.pic, &pic->pics[1],
 
 4533			sizeof(struct kvm_pic_state));
 4534		break;
 4535	case KVM_IRQCHIP_IOAPIC:
 4536		kvm_get_ioapic(kvm, &chip->chip.ioapic);
 4537		break;
 4538	default:
 4539		r = -EINVAL;
 4540		break;
 4541	}
 4542	return r;
 4543}
 4544
 4545static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 4546{
 4547	struct kvm_pic *pic = kvm->arch.vpic;
 4548	int r;
 4549
 4550	r = 0;
 4551	switch (chip->chip_id) {
 4552	case KVM_IRQCHIP_PIC_MASTER:
 4553		spin_lock(&pic->lock);
 4554		memcpy(&pic->pics[0], &chip->chip.pic,
 
 4555			sizeof(struct kvm_pic_state));
 4556		spin_unlock(&pic->lock);
 4557		break;
 4558	case KVM_IRQCHIP_PIC_SLAVE:
 4559		spin_lock(&pic->lock);
 4560		memcpy(&pic->pics[1], &chip->chip.pic,
 
 4561			sizeof(struct kvm_pic_state));
 4562		spin_unlock(&pic->lock);
 4563		break;
 4564	case KVM_IRQCHIP_IOAPIC:
 4565		kvm_set_ioapic(kvm, &chip->chip.ioapic);
 4566		break;
 4567	default:
 4568		r = -EINVAL;
 4569		break;
 4570	}
 4571	kvm_pic_update_irq(pic);
 4572	return r;
 4573}
 4574
 4575static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 4576{
 4577	struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
 4578
 4579	BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
 4580
 4581	mutex_lock(&kps->lock);
 4582	memcpy(ps, &kps->channels, sizeof(*ps));
 4583	mutex_unlock(&kps->lock);
 4584	return 0;
 4585}
 4586
 4587static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 4588{
 4589	int i;
 4590	struct kvm_pit *pit = kvm->arch.vpit;
 4591
 4592	mutex_lock(&pit->pit_state.lock);
 4593	memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
 4594	for (i = 0; i < 3; i++)
 4595		kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
 4596	mutex_unlock(&pit->pit_state.lock);
 4597	return 0;
 4598}
 4599
 4600static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 4601{
 
 
 4602	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 4603	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
 4604		sizeof(ps->channels));
 4605	ps->flags = kvm->arch.vpit->pit_state.flags;
 4606	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 4607	memset(&ps->reserved, 0, sizeof(ps->reserved));
 4608	return 0;
 4609}
 4610
 4611static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 4612{
 4613	int start = 0;
 4614	int i;
 4615	u32 prev_legacy, cur_legacy;
 4616	struct kvm_pit *pit = kvm->arch.vpit;
 4617
 4618	mutex_lock(&pit->pit_state.lock);
 4619	prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
 4620	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
 4621	if (!prev_legacy && cur_legacy)
 4622		start = 1;
 4623	memcpy(&pit->pit_state.channels, &ps->channels,
 4624	       sizeof(pit->pit_state.channels));
 4625	pit->pit_state.flags = ps->flags;
 4626	for (i = 0; i < 3; i++)
 4627		kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
 4628				   start && i == 0);
 4629	mutex_unlock(&pit->pit_state.lock);
 4630	return 0;
 4631}
 4632
 4633static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 4634				 struct kvm_reinject_control *control)
 4635{
 4636	struct kvm_pit *pit = kvm->arch.vpit;
 4637
 4638	if (!pit)
 4639		return -ENXIO;
 4640
 4641	/* pit->pit_state.lock was overloaded to prevent userspace from getting
 4642	 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
 4643	 * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
 4644	 */
 4645	mutex_lock(&pit->pit_state.lock);
 4646	kvm_pit_set_reinject(pit, control->pit_reinject);
 4647	mutex_unlock(&pit->pit_state.lock);
 4648
 4649	return 0;
 4650}
 4651
 4652/**
 4653 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 4654 * @kvm: kvm instance
 4655 * @log: slot id and address to which we copy the log
 4656 *
 4657 * Steps 1-4 below provide general overview of dirty page logging. See
 4658 * kvm_get_dirty_log_protect() function description for additional details.
 4659 *
 4660 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 4661 * always flush the TLB (step 4) even if previous step failed  and the dirty
 4662 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
 4663 * does not preclude user space subsequent dirty log read. Flushing TLB ensures
 4664 * writes will be marked dirty for next log read.
 4665 *
 4666 *   1. Take a snapshot of the bit and clear it if needed.
 4667 *   2. Write protect the corresponding page.
 4668 *   3. Copy the snapshot to the userspace.
 4669 *   4. Flush TLB's if needed.
 4670 */
 4671int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 
 4672{
 4673	bool flush = false;
 4674	int r;
 
 
 4675
 4676	mutex_lock(&kvm->slots_lock);
 4677
 4678	/*
 4679	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
 4680	 */
 4681	if (kvm_x86_ops->flush_log_dirty)
 4682		kvm_x86_ops->flush_log_dirty(kvm);
 4683
 4684	r = kvm_get_dirty_log_protect(kvm, log, &flush);
 
 
 
 4685
 4686	/*
 4687	 * All the TLBs can be flushed out of mmu lock, see the comments in
 4688	 * kvm_mmu_slot_remove_write_access().
 4689	 */
 4690	lockdep_assert_held(&kvm->slots_lock);
 4691	if (flush)
 4692		kvm_flush_remote_tlbs(kvm);
 4693
 4694	mutex_unlock(&kvm->slots_lock);
 4695	return r;
 4696}
 4697
 4698int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
 4699{
 4700	bool flush = false;
 4701	int r;
 
 
 
 
 
 4702
 4703	mutex_lock(&kvm->slots_lock);
 
 
 
 
 
 
 4704
 4705	/*
 4706	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
 4707	 */
 4708	if (kvm_x86_ops->flush_log_dirty)
 4709		kvm_x86_ops->flush_log_dirty(kvm);
 4710
 4711	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
 
 
 4712
 4713	/*
 4714	 * All the TLBs can be flushed out of mmu lock, see the comments in
 4715	 * kvm_mmu_slot_remove_write_access().
 4716	 */
 4717	lockdep_assert_held(&kvm->slots_lock);
 4718	if (flush)
 4719		kvm_flush_remote_tlbs(kvm);
 
 4720
 
 
 4721	mutex_unlock(&kvm->slots_lock);
 4722	return r;
 4723}
 4724
 4725int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 4726			bool line_status)
 4727{
 4728	if (!irqchip_in_kernel(kvm))
 4729		return -ENXIO;
 4730
 4731	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
 4732					irq_event->irq, irq_event->level,
 4733					line_status);
 4734	return 0;
 4735}
 4736
/*
 * Enable a VM-scoped capability described by @cap.
 *
 * cap->flags must be zero.  cap->cap selects the capability and
 * cap->args[0] carries its argument.
 *
 * Returns 0 on success, -EINVAL for non-zero flags, an unknown capability
 * or an invalid argument, -EEXIST when KVM_CAP_SPLIT_IRQCHIP is requested
 * while an in-kernel irqchip already exists or vcpus have been created,
 * or the error returned by kvm_setup_empty_irq_routing().
 */
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
			    struct kvm_enable_cap *cap)
{
	int r;

	if (cap->flags)
		return -EINVAL;

	switch (cap->cap) {
	case KVM_CAP_DISABLE_QUIRKS:
		/* args[0] is stored as-is; no bits are validated here. */
		kvm->arch.disabled_quirks = cap->args[0];
		r = 0;
		break;
	case KVM_CAP_SPLIT_IRQCHIP: {
		mutex_lock(&kvm->lock);
		r = -EINVAL;
		if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
			goto split_irqchip_unlock;
		/* Both checks below fail with -EEXIST. */
		r = -EEXIST;
		if (irqchip_in_kernel(kvm))
			goto split_irqchip_unlock;
		if (kvm->created_vcpus)
			goto split_irqchip_unlock;
		r = kvm_setup_empty_irq_routing(kvm);
		if (r)
			goto split_irqchip_unlock;
		/* Pairs with irqchip_in_kernel. */
		smp_wmb();
		kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
		kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
		r = 0;
split_irqchip_unlock:
		mutex_unlock(&kvm->lock);
		break;
	}
	case KVM_CAP_X2APIC_API:
		r = -EINVAL;
		if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
			break;

		/* Each flag can only be turned on here, never back off. */
		if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
			kvm->arch.x2apic_format = true;
		if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
			kvm->arch.x2apic_broadcast_quirk_disabled = true;

		r = 0;
		break;
	case KVM_CAP_X86_DISABLE_EXITS:
		r = -EINVAL;
		if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
			break;

		/*
		 * A MWAIT request is silently ignored (and still reports
		 * success) when kvm_can_mwait_in_guest() is false.
		 */
		if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
			kvm_can_mwait_in_guest())
			kvm->arch.mwait_in_guest = true;
		if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
			kvm->arch.hlt_in_guest = true;
		if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
			kvm->arch.pause_in_guest = true;
		if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
			kvm->arch.cstate_in_guest = true;
		r = 0;
		break;
	case KVM_CAP_MSR_PLATFORM_INFO:
		kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
		r = 0;
		break;
	case KVM_CAP_EXCEPTION_PAYLOAD:
		kvm->arch.exception_payload_enabled = cap->args[0];
		r = 0;
		break;
	default:
		r = -EINVAL;
		break;
	}
	return r;
}
 4814
/*
 * Dispatch the VM ioctls handled in this switch.
 *
 * @filp:  file whose private_data is the struct kvm.
 * @ioctl: ioctl number.
 * @arg:   ioctl argument; used either as a plain integer or, through
 *         @argp, as a userspace pointer, depending on the ioctl.
 *
 * Returns the result of the selected handler, -EFAULT when copying to or
 * from userspace fails, and -ENOTTY for an ioctl number not listed here.
 */
long kvm_arch_vm_ioctl(struct file *filp,
		       unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r = -ENOTTY;
	/*
	 * This union makes it completely explicit to gcc-3.x
	 * that these two variables' stack usage should be
	 * combined, not added together.
	 */
	union {
		struct kvm_pit_state ps;
		struct kvm_pit_state2 ps2;
		struct kvm_pit_config pit_config;
	} u;

	switch (ioctl) {
	case KVM_SET_TSS_ADDR:
		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
		break;
	case KVM_SET_IDENTITY_MAP_ADDR: {
		u64 ident_addr;

		mutex_lock(&kvm->lock);
		/* Refused with -EINVAL once any vcpu has been created. */
		r = -EINVAL;
		if (kvm->created_vcpus)
			goto set_identity_unlock;
		r = -EFAULT;
		if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
			goto set_identity_unlock;
		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
set_identity_unlock:
		mutex_unlock(&kvm->lock);
		break;
	}
	case KVM_SET_NR_MMU_PAGES:
		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
		break;
	case KVM_GET_NR_MMU_PAGES:
		/* The page count itself is the ioctl's return value. */
		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
		break;
	case KVM_CREATE_IRQCHIP: {
		mutex_lock(&kvm->lock);

		r = -EEXIST;
		if (irqchip_in_kernel(kvm))
			goto create_irqchip_unlock;

		r = -EINVAL;
		if (kvm->created_vcpus)
			goto create_irqchip_unlock;

		r = kvm_pic_init(kvm);
		if (r)
			goto create_irqchip_unlock;

		/* On failure, tear down what was already set up, newest first. */
		r = kvm_ioapic_init(kvm);
		if (r) {
			kvm_pic_destroy(kvm);
			goto create_irqchip_unlock;
		}

		r = kvm_setup_default_irq_routing(kvm);
		if (r) {
			kvm_ioapic_destroy(kvm);
			kvm_pic_destroy(kvm);
			goto create_irqchip_unlock;
		}
		/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
		smp_wmb();
		kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
	create_irqchip_unlock:
		mutex_unlock(&kvm->lock);
		break;
	}
	case KVM_CREATE_PIT:
		/* The flag-less variant uses a fixed flag set and shares the path below. */
		u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
		goto create_pit;
	case KVM_CREATE_PIT2:
		r = -EFAULT;
		if (copy_from_user(&u.pit_config, argp,
				   sizeof(struct kvm_pit_config)))
			goto out;
	create_pit:
		mutex_lock(&kvm->lock);
		r = -EEXIST;
		if (kvm->arch.vpit)
			goto create_pit_unlock;
		/* r stays -ENOMEM if kvm_create_pit() returns NULL. */
		r = -ENOMEM;
		kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
		if (kvm->arch.vpit)
			r = 0;
	create_pit_unlock:
		mutex_unlock(&kvm->lock);
		break;
	case KVM_GET_IRQCHIP: {
		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
		struct kvm_irqchip *chip;

		/* Heap copy of the user struct; freed at get_irqchip_out. */
		chip = memdup_user(argp, sizeof(*chip));
		if (IS_ERR(chip)) {
			r = PTR_ERR(chip);
			goto out;
		}

		r = -ENXIO;
		if (!irqchip_kernel(kvm))
			goto get_irqchip_out;
		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
		if (r)
			goto get_irqchip_out;
		r = -EFAULT;
		if (copy_to_user(argp, chip, sizeof(*chip)))
			goto get_irqchip_out;
		r = 0;
	get_irqchip_out:
		kfree(chip);
		break;
	}
	case KVM_SET_IRQCHIP: {
		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
		struct kvm_irqchip *chip;

		/* Heap copy of the user struct; freed at set_irqchip_out. */
		chip = memdup_user(argp, sizeof(*chip));
		if (IS_ERR(chip)) {
			r = PTR_ERR(chip);
			goto out;
		}

		r = -ENXIO;
		if (!irqchip_kernel(kvm))
			goto set_irqchip_out;
		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
		if (r)
			goto set_irqchip_out;
		r = 0;
	set_irqchip_out:
		kfree(chip);
		break;
	}
	case KVM_GET_PIT: {
		r = -EFAULT;
		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
			goto out;
		r = -ENXIO;
		if (!kvm->arch.vpit)
			goto out;
		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_PIT: {
		r = -EFAULT;
		if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
			goto out;
		r = -ENXIO;
		if (!kvm->arch.vpit)
			goto out;
		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
		break;
	}
	case KVM_GET_PIT2: {
		/*
		 * u.ps2 is not read from userspace first;
		 * kvm_vm_ioctl_get_pit2() writes channels, flags and the
		 * reserved area before it is copied out.
		 */
		r = -ENXIO;
		if (!kvm->arch.vpit)
			goto out;
		r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_PIT2: {
		r = -EFAULT;
		if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
			goto out;
		r = -ENXIO;
		if (!kvm->arch.vpit)
			goto out;
		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
		break;
	}
	case KVM_REINJECT_CONTROL: {
		struct kvm_reinject_control control;
		r =  -EFAULT;
		if (copy_from_user(&control, argp, sizeof(control)))
			goto out;
		r = kvm_vm_ioctl_reinject(kvm, &control);
		break;
	}
	case KVM_SET_BOOT_CPU_ID:
		/* Only allowed before any vcpu exists; -EBUSY afterwards. */
		r = 0;
		mutex_lock(&kvm->lock);
		if (kvm->created_vcpus)
			r = -EBUSY;
		else
			kvm->arch.bsp_vcpu_id = arg;
		mutex_unlock(&kvm->lock);
		break;
	case KVM_XEN_HVM_CONFIG: {
		struct kvm_xen_hvm_config xhc;
		r = -EFAULT;
		if (copy_from_user(&xhc, argp, sizeof(xhc)))
			goto out;
		/* No flags are accepted. */
		r = -EINVAL;
		if (xhc.flags)
			goto out;
		memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
		r = 0;
		break;
	}
	case KVM_SET_CLOCK: {
		struct kvm_clock_data user_ns;
		u64 now_ns;

		r = -EFAULT;
		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
			goto out;

		r = -EINVAL;
		if (user_ns.flags)
			goto out;

		r = 0;
		/*
		 * TODO: userspace has to take care of races with VCPU_RUN, so
		 * kvm_gen_update_masterclock() can be cut down to locked
		 * pvclock_update_vm_gtod_copy().
		 */
		kvm_gen_update_masterclock(kvm);
		now_ns = get_kvmclock_ns(kvm);
		/* Shift the offset by the difference between requested and current time. */
		kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
		break;
	}
	case KVM_GET_CLOCK: {
		struct kvm_clock_data user_ns;
		u64 now_ns;

		now_ns = get_kvmclock_ns(kvm);
		user_ns.clock = now_ns;
		user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
		/* Zero the padding so no stack bytes are copied to userspace. */
		memset(&user_ns.pad, 0, sizeof(user_ns.pad));

		r = -EFAULT;
		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
			goto out;
		r = 0;
		break;
	}
	case KVM_MEMORY_ENCRYPT_OP: {
		/* -ENOTTY when the vendor module provides no mem_enc_op hook. */
		r = -ENOTTY;
		if (kvm_x86_ops->mem_enc_op)
			r = kvm_x86_ops->mem_enc_op(kvm, argp);
		break;
	}
	case KVM_MEMORY_ENCRYPT_REG_REGION: {
		struct kvm_enc_region region;

		r = -EFAULT;
		if (copy_from_user(&region, argp, sizeof(region)))
			goto out;

		r = -ENOTTY;
		if (kvm_x86_ops->mem_enc_reg_region)
			r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
		break;
	}
	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
		struct kvm_enc_region region;

		r = -EFAULT;
		if (copy_from_user(&region, argp, sizeof(region)))
			goto out;

		r = -ENOTTY;
		if (kvm_x86_ops->mem_enc_unreg_region)
			r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
		break;
	}
	case KVM_HYPERV_EVENTFD: {
		struct kvm_hyperv_eventfd hvevfd;

		r = -EFAULT;
		if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
			goto out;
		r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
		break;
	}
	case KVM_SET_PMU_EVENT_FILTER:
		r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
		break;
	default:
		r = -ENOTTY;
	}
out:
	return r;
}
 5121
 5122static void kvm_init_msr_list(void)
 5123{
 5124	struct x86_pmu_capability x86_pmu;
 5125	u32 dummy[2];
 5126	unsigned i;
 5127
 5128	BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
 5129			 "Please update the fixed PMCs in msrs_to_saved_all[]");
 5130
 5131	perf_get_x86_pmu_capability(&x86_pmu);
 5132
 5133	num_msrs_to_save = 0;
 5134	num_emulated_msrs = 0;
 5135	num_msr_based_features = 0;
 5136
 5137	for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
 5138		if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
 5139			continue;
 5140
 5141		/*
 5142		 * Even MSRs that are valid in the host may not be exposed
 5143		 * to the guests in some cases.
 5144		 */
 5145		switch (msrs_to_save_all[i]) {
 5146		case MSR_IA32_BNDCFGS:
 5147			if (!kvm_mpx_supported())
 5148				continue;
 5149			break;
 5150		case MSR_TSC_AUX:
 5151			if (!kvm_x86_ops->rdtscp_supported())
 5152				continue;
 5153			break;
 5154		case MSR_IA32_RTIT_CTL:
 5155		case MSR_IA32_RTIT_STATUS:
 5156			if (!kvm_x86_ops->pt_supported())
 5157				continue;
 5158			break;
 5159		case MSR_IA32_RTIT_CR3_MATCH:
 5160			if (!kvm_x86_ops->pt_supported() ||
 5161			    !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
 5162				continue;
 5163			break;
 5164		case MSR_IA32_RTIT_OUTPUT_BASE:
 5165		case MSR_IA32_RTIT_OUTPUT_MASK:
 5166			if (!kvm_x86_ops->pt_supported() ||
 5167				(!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
 5168				 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
 5169				continue;
 5170			break;
 5171		case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
 5172			if (!kvm_x86_ops->pt_supported() ||
 5173				msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
 5174				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
 5175				continue;
 5176			break;
 5177		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
 5178			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
 5179			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
 5180				continue;
 5181			break;
 5182		case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
 5183			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
 5184			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
 5185				continue;
 5186		}
 5187		default:
 5188			break;
 5189		}
 5190
 5191		msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
 5192	}
 5193
 5194	for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
 5195		if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i]))
 5196			continue;
 5197
 5198		emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
 5199	}
 5200
 5201	for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
 5202		struct kvm_msr_entry msr;
 5203
 5204		msr.index = msr_based_features_all[i];
 5205		if (kvm_get_msr_feature(&msr))
 5206			continue;
 5207
 5208		msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
 5209	}
 
 5210}
 5211
 5212static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
 5213			   const void *v)
 5214{
 5215	int handled = 0;
 5216	int n;
 5217
 5218	do {
 5219		n = min(len, 8);
 5220		if (!(lapic_in_kernel(vcpu) &&
 5221		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
 5222		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
 5223			break;
 5224		handled += n;
 5225		addr += n;
 5226		len -= n;
 5227		v += n;
 5228	} while (len);
 5229
 5230	return handled;
 5231}
 5232
 5233static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 5234{
 5235	int handled = 0;
 5236	int n;
 5237
 5238	do {
 5239		n = min(len, 8);
 5240		if (!(lapic_in_kernel(vcpu) &&
 5241		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
 5242					 addr, n, v))
 5243		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
 5244			break;
 5245		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
 5246		handled += n;
 5247		addr += n;
 5248		len -= n;
 5249		v += n;
 5250	} while (len);
 5251
 5252	return handled;
 5253}
 5254
/* Forward a segment write for register @seg to the vendor module's set_segment hook. */
static void kvm_set_segment(struct kvm_vcpu *vcpu,
			struct kvm_segment *var, int seg)
{
	kvm_x86_ops->set_segment(vcpu, var, seg);
}
 5260
/* Forward a segment read for register @seg into @var to the vendor module's get_segment hook. */
void kvm_get_segment(struct kvm_vcpu *vcpu,
		     struct kvm_segment *var, int seg)
{
	kvm_x86_ops->get_segment(vcpu, var, seg);
}
 5266
 5267gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
 5268			   struct x86_exception *exception)
 
 
 
 
 5269{
 5270	gpa_t t_gpa;
 
 5271
 5272	BUG_ON(!mmu_is_nested(vcpu));
 5273
 5274	/* NPT walks are always user-walks */
 5275	access |= PFERR_USER_MASK;
 5276	t_gpa  = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
 5277
 5278	return t_gpa;
 5279}
 5280
 5281gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 5282			      struct x86_exception *exception)
 5283{
 5284	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 5285	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 5286}
 5287
 5288 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
 5289				struct x86_exception *exception)
 5290{
 5291	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 5292	access |= PFERR_FETCH_MASK;
 5293	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 5294}
 5295
 5296gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
 5297			       struct x86_exception *exception)
 5298{
 5299	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 5300	access |= PFERR_WRITE_MASK;
 5301	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 5302}
 5303
/*
 * Translate @gva with an access mask of 0, i.e. without the CPL-based
 * PFERR_USER_MASK that the read/fetch/write variants add.  Use this to
 * access any guest-mapped memory without checking CPL.
 */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
				struct x86_exception *exception)
{
	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
}
 5310
 5311static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 5312				      struct kvm_vcpu *vcpu, u32 access,
 5313				      struct x86_exception *exception)
 5314{
 5315	void *data = val;
 5316	int r = X86EMUL_CONTINUE;
 5317
 5318	while (bytes) {
 5319		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
 5320							    exception);
 5321		unsigned offset = addr & (PAGE_SIZE-1);
 5322		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
 5323		int ret;
 5324
 5325		if (gpa == UNMAPPED_GVA)
 5326			return X86EMUL_PROPAGATE_FAULT;
 5327		ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
 5328					       offset, toread);
 5329		if (ret < 0) {
 5330			r = X86EMUL_IO_NEEDED;
 5331			goto out;
 5332		}
 5333
 5334		bytes -= toread;
 5335		data += toread;
 5336		addr += toread;
 5337	}
 5338out:
 5339	return r;
 5340}
 5341
 5342/* used for instruction fetching */
 5343static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
 5344				gva_t addr, void *val, unsigned int bytes,
 5345				struct x86_exception *exception)
 5346{
 5347	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 5348	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 5349	unsigned offset;
 5350	int ret;
 5351
 5352	/* Inline kvm_read_guest_virt_helper for speed.  */
 5353	gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
 5354						    exception);
 5355	if (unlikely(gpa == UNMAPPED_GVA))
 5356		return X86EMUL_PROPAGATE_FAULT;
 5357
 5358	offset = addr & (PAGE_SIZE-1);
 5359	if (WARN_ON(offset + bytes > PAGE_SIZE))
 5360		bytes = (unsigned)PAGE_SIZE - offset;
 5361	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
 5362				       offset, bytes);
 5363	if (unlikely(ret < 0))
 5364		return X86EMUL_IO_NEEDED;
 5365
 5366	return X86EMUL_CONTINUE;
 5367}
 5368
 5369int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
 5370			       gva_t addr, void *val, unsigned int bytes,
 5371			       struct x86_exception *exception)
 5372{
 
 5373	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 5374
 5375	/*
 5376	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
 5377	 * is returned, but our callers are not ready for that and they blindly
 5378	 * call kvm_inject_page_fault.  Ensure that they at least do not leak
 5379	 * uninitialized kernel stack memory into cr2 and error code.
 5380	 */
 5381	memset(exception, 0, sizeof(*exception));
 5382	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
 5383					  exception);
 5384}
 5385EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 5386
 5387static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
 5388			     gva_t addr, void *val, unsigned int bytes,
 5389			     struct x86_exception *exception, bool system)
 5390{
 5391	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 5392	u32 access = 0;
 5393
 5394	if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
 5395		access |= PFERR_USER_MASK;
 5396
 5397	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
 5398}
 5399
 5400static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
 5401		unsigned long addr, void *val, unsigned int bytes)
 
 
 5402{
 5403	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 5404	int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
 5405
 5406	return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
 5407}
 5408
 5409static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 5410				      struct kvm_vcpu *vcpu, u32 access,
 5411				      struct x86_exception *exception)
 5412{
 5413	void *data = val;
 5414	int r = X86EMUL_CONTINUE;
 5415
 5416	while (bytes) {
 5417		gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
 5418							     access,
 5419							     exception);
 5420		unsigned offset = addr & (PAGE_SIZE-1);
 5421		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 5422		int ret;
 5423
 5424		if (gpa == UNMAPPED_GVA)
 5425			return X86EMUL_PROPAGATE_FAULT;
 5426		ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
 5427		if (ret < 0) {
 5428			r = X86EMUL_IO_NEEDED;
 5429			goto out;
 5430		}
 5431
 5432		bytes -= towrite;
 5433		data += towrite;
 5434		addr += towrite;
 5435	}
 5436out:
 5437	return r;
 5438}
 5439
 5440static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
 5441			      unsigned int bytes, struct x86_exception *exception,
 5442			      bool system)
 5443{
 5444	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 5445	u32 access = PFERR_WRITE_MASK;
 5446
 5447	if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
 5448		access |= PFERR_USER_MASK;
 5449
 5450	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 5451					   access, exception);
 5452}
 5453
 5454int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 5455				unsigned int bytes, struct x86_exception *exception)
 5456{
 5457	/* kvm_write_guest_virt_system can pull in tons of pages. */
 5458	vcpu->arch.l1tf_flush_l1d = true;
 5459
 5460	/*
 5461	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
 5462	 * is returned, but our callers are not ready for that and they blindly
 5463	 * call kvm_inject_page_fault.  Ensure that they at least do not leak
 5464	 * uninitialized kernel stack memory into cr2 and error code.
 5465	 */
 5466	memset(exception, 0, sizeof(*exception));
 5467	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 5468					   PFERR_WRITE_MASK, exception);
 5469}
 5470EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 5471
 5472int handle_ud(struct kvm_vcpu *vcpu)
 5473{
 5474	int emul_type = EMULTYPE_TRAP_UD;
 5475	char sig[5]; /* ud2; .ascii "kvm" */
 5476	struct x86_exception e;
 5477
 5478	if (force_emulation_prefix &&
 5479	    kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
 5480				sig, sizeof(sig), &e) == 0 &&
 5481	    memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
 5482		kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
 5483		emul_type = EMULTYPE_TRAP_UD_FORCED;
 5484	}
 5485
 5486	return kvm_emulate_instruction(vcpu, emul_type);
 5487}
 5488EXPORT_SYMBOL_GPL(handle_ud);
 5489
 5490static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 5491			    gpa_t gpa, bool write)
 5492{
 5493	/* For APIC access vmexit */
 5494	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 5495		return 1;
 5496
 5497	if (vcpu_match_mmio_gpa(vcpu, gpa)) {
 5498		trace_vcpu_match_mmio(gva, gpa, write, true);
 5499		return 1;
 5500	}
 5501
 5502	return 0;
 5503}
 5504
 5505static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 5506				gpa_t *gpa, struct x86_exception *exception,
 5507				bool write)
 5508{
 5509	u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
 5510		| (write ? PFERR_WRITE_MASK : 0);
 5511
 5512	/*
 5513	 * currently PKRU is only applied to ept enabled guest so
 5514	 * there is no pkey in EPT page table for L1 guest or EPT
 5515	 * shadow page table for L2 guest.
 5516	 */
 5517	if (vcpu_match_mmio_gva(vcpu, gva)
 5518	    && !permission_fault(vcpu, vcpu->arch.walk_mmu,
 5519				 vcpu->arch.mmio_access, 0, access)) {
 5520		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
 5521					(gva & (PAGE_SIZE - 1));
 5522		trace_vcpu_match_mmio(gva, *gpa, write, false);
 5523		return 1;
 5524	}
 5525
 
 
 
 5526	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 5527
 5528	if (*gpa == UNMAPPED_GVA)
 5529		return -1;
 5530
 5531	return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
 5532}
 
 5533
 5534int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 5535			const void *val, int bytes)
 5536{
 5537	int ret;
 5538
 5539	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
 5540	if (ret < 0)
 5541		return 0;
 5542	kvm_page_track_write(vcpu, gpa, val, bytes);
 5543	return 1;
 5544}
 5545
/*
 * Callback table that lets emulator_read_write() and
 * emulator_read_write_onepage() share one code path for emulated reads
 * and emulated writes.
 */
struct read_write_emulator_ops {
	/*
	 * Optional.  When set it is called first; a non-zero return makes
	 * emulator_read_write() return X86EMUL_CONTINUE immediately.
	 */
	int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
				  int bytes);
	/*
	 * Access to a GPA that was not classified as MMIO.  A non-zero
	 * return means the access has been completed.
	 */
	int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
				  void *val, int bytes);
	/*
	 * MMIO access attempt; the return value is used by the caller as
	 * the number of bytes that were handled.
	 */
	int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
			       int bytes, void *val);
	/*
	 * Called after the MMIO exit has been set up in vcpu->run; its
	 * return value becomes the result of emulator_read_write().
	 */
	int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
				    void *val, int bytes);
	/* True for the write table; left unset (false) in the read table. */
	bool write;
};
 5557
 5558static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
 5559{
 5560	if (vcpu->mmio_read_completed) {
 
 5561		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
 5562			       vcpu->mmio_fragments[0].gpa, val);
 5563		vcpu->mmio_read_completed = 0;
 5564		return 1;
 5565	}
 5566
 5567	return 0;
 5568}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 5569
 5570static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
 5571			void *val, int bytes)
 5572{
 5573	return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
 5574}
 5575
 5576static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
 5577			 void *val, int bytes)
 5578{
 5579	return emulator_write_phys(vcpu, gpa, val, bytes);
 5580}
 5581
 5582static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
 5583{
 5584	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
 5585	return vcpu_mmio_write(vcpu, gpa, bytes, val);
 5586}
 
 
 5587
 5588static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
 5589			  void *val, int bytes)
 5590{
 5591	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
 5592	return X86EMUL_IO_NEEDED;
 5593}
 5594
 5595static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
 5596			   void *val, int bytes)
 5597{
 5598	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
 5599
 5600	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
 5601	return X86EMUL_CONTINUE;
 
 
 
 5602}
 5603
/*
 * Callback set used by emulator_read_emulated().  .write is not set, so it
 * is false.
 */
static const struct read_write_emulator_ops read_emultor = {
	.read_write_prepare = read_prepare,
	.read_write_emulate = read_emulate,
	.read_write_mmio = vcpu_mmio_read,
	.read_write_exit_mmio = read_exit_mmio,
};
 5610
/*
 * Callback set used by emulator_write_emulated().  There is no prepare
 * hook for writes.
 */
static const struct read_write_emulator_ops write_emultor = {
	.read_write_emulate = write_emulate,
	.read_write_mmio = write_mmio,
	.read_write_exit_mmio = write_exit_mmio,
	.write = true,
};
 5617
/*
 * Emulate a read or write of @bytes at guest virtual address @addr.  The
 * caller, emulator_read_write(), splits accesses at page boundaries first.
 *
 * Returns X86EMUL_PROPAGATE_FAULT if the address cannot be translated and
 * X86EMUL_CONTINUE otherwise.  When the access is MMIO that is not fully
 * handled by ops->read_write_mmio(), the unhandled remainder is recorded as
 * a new entry in vcpu->mmio_fragments for the caller to act on.
 */
static int emulator_read_write_onepage(unsigned long addr, void *val,
				       unsigned int bytes,
				       struct x86_exception *exception,
				       struct kvm_vcpu *vcpu,
				       const struct read_write_emulator_ops *ops)
{
	gpa_t gpa;
	int handled, ret;
	bool write = ops->write;
	struct kvm_mmio_fragment *frag;
	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;

	/*
	 * If the exit was due to a NPF we may already have a GPA.
	 * If the GPA is present, use it to avoid the GVA to GPA table walk.
	 * Note, this cannot be used on string operations since string
	 * operation using rep will only have the initial GPA from the NPF
	 * occurred.
	 */
	if (vcpu->arch.gpa_available &&
	    emulator_can_use_gpa(ctxt) &&
	    (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
		gpa = vcpu->arch.gpa_val;
		ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
	} else {
		ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
		if (ret < 0)
			return X86EMUL_PROPAGATE_FAULT;
	}

	/* ret == 0: not MMIO, try the plain guest-memory access first. */
	if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
		return X86EMUL_CONTINUE;

	/*
	 * Is this MMIO handled locally?
	 */
	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
	if (handled == bytes)
		return X86EMUL_CONTINUE;

	/* Skip past the part that was handled. */
	gpa += handled;
	bytes -= handled;
	val += handled;

	/* Queue the remainder as the next MMIO fragment. */
	WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
	frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
	frag->gpa = gpa;
	frag->data = val;
	frag->len = bytes;
	return X86EMUL_CONTINUE;
}
 5669
/*
 * Common implementation of the emulator's read_emulated and write_emulated
 * callbacks; @ops selects the direction.
 *
 * An access that crosses a page boundary is split into two per-page
 * accesses.  If any part was queued as an MMIO fragment, vcpu->run is
 * filled in for a KVM_EXIT_MMIO exit describing the first fragment and the
 * result of ops->read_write_exit_mmio() is returned.  Otherwise the result
 * of the per-page emulation is returned.
 */
static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
			unsigned long addr,
			void *val, unsigned int bytes,
			struct x86_exception *exception,
			const struct read_write_emulator_ops *ops)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
	gpa_t gpa;
	int rc;

	/* The prepare hook can short-circuit the whole access. */
	if (ops->read_write_prepare &&
		  ops->read_write_prepare(vcpu, val, bytes))
		return X86EMUL_CONTINUE;

	vcpu->mmio_nr_fragments = 0;

	/* Crossing a page boundary? */
	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
		int now;

		/* Number of bytes from addr up to the next page boundary. */
		now = -addr & ~PAGE_MASK;
		rc = emulator_read_write_onepage(addr, val, now, exception,
						 vcpu, ops);

		if (rc != X86EMUL_CONTINUE)
			return rc;
		addr += now;
		/* Outside 64-bit mode the address is truncated to 32 bits. */
		if (ctxt->mode != X86EMUL_MODE_PROT64)
			addr = (u32)addr;
		val += now;
		bytes -= now;
	}

	rc = emulator_read_write_onepage(addr, val, bytes, exception,
					 vcpu, ops);
	if (rc != X86EMUL_CONTINUE)
		return rc;

	/* No fragment was queued, so nothing is left to do. */
	if (!vcpu->mmio_nr_fragments)
		return rc;

	gpa = vcpu->mmio_fragments[0].gpa;

	vcpu->mmio_needed = 1;
	vcpu->mmio_cur_fragment = 0;

	/* Describe the first fragment (at most 8 bytes) in vcpu->run. */
	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
	vcpu->run->exit_reason = KVM_EXIT_MMIO;
	vcpu->run->mmio.phys_addr = gpa;

	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
}
 5723
 5724static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
 5725				  unsigned long addr,
 5726				  void *val,
 5727				  unsigned int bytes,
 5728				  struct x86_exception *exception)
 5729{
 5730	return emulator_read_write(ctxt, addr, val, bytes,
 5731				   exception, &read_emultor);
 5732}
 5733
 5734static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
 5735			    unsigned long addr,
 5736			    const void *val,
 5737			    unsigned int bytes,
 5738			    struct x86_exception *exception)
 5739{
 5740	return emulator_read_write(ctxt, addr, (void *)val, bytes,
 5741				   exception, &write_emultor);
 5742}
 5743
/*
 * Compare-and-exchange an object of type @t at @ptr, taking the expected
 * and replacement values from the buffers @old and @new.  Evaluates to true
 * when the value returned by cmpxchg() equals *old, i.e. the exchange
 * happened.  Note that @old is evaluated twice.
 */
#define CMPXCHG_TYPE(t, ptr, old, new) \
	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))

/*
 * 64-bit variant: uses the generic macro when CONFIG_X86_64 is defined and
 * cmpxchg64() otherwise.
 */
#ifdef CONFIG_X86_64
#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
#else
#  define CMPXCHG64(ptr, old, new) \
	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
#endif
 5753
/*
 * Emulator cmpxchg_emulated callback.
 *
 * For sizes of 1, 2, 4 or 8 bytes that translate to a mappable guest page,
 * do not hit the APIC page and do not cross a page boundary, the guest page
 * is mapped and the exchange is done with cmpxchg on the host mapping.
 * Returns X86EMUL_CMPXCHG_FAILED if the old value did not match and
 * X86EMUL_CONTINUE on success.
 *
 * In every other case the operation falls back to a plain emulated write of
 * @new (with a one-time warning), which is not an atomic compare-exchange.
 */
static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
				     unsigned long addr,
				     const void *old,
				     const void *new,
				     unsigned int bytes,
				     struct x86_exception *exception)
{
	struct kvm_host_map map;
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
	gpa_t gpa;
	char *kaddr;
	bool exchanged;

	/* guests cmpxchg8b have to be emulated atomically */
	/* Only power-of-two sizes up to 8 bytes take the atomic path. */
	if (bytes > 8 || (bytes & (bytes - 1)))
		goto emul_write;

	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);

	if (gpa == UNMAPPED_GVA ||
	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
		goto emul_write;

	/* The operand must not straddle two pages. */
	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
		goto emul_write;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
		goto emul_write;

	kaddr = map.hva + offset_in_page(gpa);

	switch (bytes) {
	case 1:
		exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
		break;
	case 2:
		exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
		break;
	case 4:
		exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
		break;
	case 8:
		exchanged = CMPXCHG64(kaddr, old, new);
		break;
	default:
		/* Not reachable: the size check above admits only 1/2/4/8. */
		BUG();
	}

	kvm_vcpu_unmap(vcpu, &map, true);

	if (!exchanged)
		return X86EMUL_CMPXCHG_FAILED;

	kvm_page_track_write(vcpu, gpa, new, bytes);

	return X86EMUL_CONTINUE;

emul_write:
	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");

	return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
 5816
 5817static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 5818{
 5819	int r = 0, i;
 
 5820
 5821	for (i = 0; i < vcpu->arch.pio.count; i++) {
 5822		if (vcpu->arch.pio.in)
 5823			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
 5824					    vcpu->arch.pio.size, pd);
 5825		else
 5826			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
 5827					     vcpu->arch.pio.port, vcpu->arch.pio.size,
 5828					     pd);
 5829		if (r)
 5830			break;
 5831		pd += vcpu->arch.pio.size;
 5832	}
 5833	return r;
 5834}
 5835
 5836static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
 5837			       unsigned short port, void *val,
 5838			       unsigned int count, bool in)
 
 5839{
 
 
 
 
 
 
 
 5840	vcpu->arch.pio.port = port;
 5841	vcpu->arch.pio.in = in;
 5842	vcpu->arch.pio.count  = count;
 5843	vcpu->arch.pio.size = size;
 5844
 5845	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
 
 
 5846		vcpu->arch.pio.count = 0;
 5847		return 1;
 5848	}
 5849
 5850	vcpu->run->exit_reason = KVM_EXIT_IO;
 5851	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
 5852	vcpu->run->io.size = size;
 5853	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
 5854	vcpu->run->io.count = count;
 5855	vcpu->run->io.port = port;
 5856
 5857	return 0;
 5858}
 5859
 5860static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 5861				    int size, unsigned short port, void *val,
 5862				    unsigned int count)
 5863{
 5864	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 5865	int ret;
 5866
 5867	if (vcpu->arch.pio.count)
 5868		goto data_avail;
 
 
 
 
 5869
 5870	memset(vcpu->arch.pio_data, 0, size * count);
 5871
 5872	ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
 5873	if (ret) {
 5874data_avail:
 5875		memcpy(val, vcpu->arch.pio_data, size * count);
 5876		trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
 5877		vcpu->arch.pio.count = 0;
 5878		return 1;
 5879	}
 5880
 
 
 
 
 
 
 
 5881	return 0;
 5882}
 5883
 5884static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
 5885				     int size, unsigned short port,
 5886				     const void *val, unsigned int count)
 5887{
 5888	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 5889
 5890	memcpy(vcpu->arch.pio_data, val, size * count);
 5891	trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
 5892	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
 5893}
 5894
 5895static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 5896{
 5897	return kvm_x86_ops->get_segment_base(vcpu, seg);
 5898}
 5899
 5900static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
 5901{
 5902	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
 5903}
 5904
 5905static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
 5906{
 5907	if (!need_emulate_wbinvd(vcpu))
 5908		return X86EMUL_CONTINUE;
 5909
 5910	if (kvm_x86_ops->has_wbinvd_exit()) {
 5911		int cpu = get_cpu();
 5912
 5913		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
 5914		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
 5915				wbinvd_ipi, NULL, 1);
 5916		put_cpu();
 5917		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
 5918	} else
 5919		wbinvd();
 5920	return X86EMUL_CONTINUE;
 5921}
 5922
 5923int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
 5924{
 5925	kvm_emulate_wbinvd_noskip(vcpu);
 5926	return kvm_skip_emulated_instruction(vcpu);
 5927}
 5928EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
 5929
 5930
 5931
 5932static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
 5933{
 5934	kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
 5935}
 5936
 5937static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
 5938			   unsigned long *dest)
 5939{
 5940	return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
 5941}
 5942
 5943static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
 5944			   unsigned long value)
 5945{
 5946
 5947	return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
 5948}
 5949
 5950static u64 mk_cr_64(u64 curr_cr, u32 new_val)
 5951{
 5952	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
 5953}
 5954
 5955static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
 5956{
 5957	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 5958	unsigned long value;
 5959
 5960	switch (cr) {
 5961	case 0:
 5962		value = kvm_read_cr0(vcpu);
 5963		break;
 5964	case 2:
 5965		value = vcpu->arch.cr2;
 5966		break;
 5967	case 3:
 5968		value = kvm_read_cr3(vcpu);
 5969		break;
 5970	case 4:
 5971		value = kvm_read_cr4(vcpu);
 5972		break;
 5973	case 8:
 5974		value = kvm_get_cr8(vcpu);
 5975		break;
 5976	default:
 5977		kvm_err("%s: unexpected cr %u\n", __func__, cr);
 5978		return 0;
 5979	}
 5980
 5981	return value;
 5982}
 5983
 5984static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
 5985{
 5986	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 5987	int res = 0;
 5988
 5989	switch (cr) {
 5990	case 0:
 5991		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
 5992		break;
 5993	case 2:
 5994		vcpu->arch.cr2 = val;
 5995		break;
 5996	case 3:
 5997		res = kvm_set_cr3(vcpu, val);
 5998		break;
 5999	case 4:
 6000		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
 6001		break;
 6002	case 8:
 6003		res = kvm_set_cr8(vcpu, val);
 6004		break;
 6005	default:
 6006		kvm_err("%s: unexpected cr %u\n", __func__, cr);
 6007		res = -1;
 6008	}
 6009
 6010	return res;
 6011}
 6012
 6013static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
 6014{
 6015	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
 6016}
 6017
 6018static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
 6019{
 6020	kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
 6021}
 6022
 6023static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
 6024{
 6025	kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
 6026}
 6027
 6028static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
 6029{
 6030	kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
 6031}
 6032
 6033static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
 6034{
 6035	kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
 6036}
 6037
 6038static unsigned long emulator_get_cached_segment_base(
 6039	struct x86_emulate_ctxt *ctxt, int seg)
 6040{
 6041	return get_segment_base(emul_to_vcpu(ctxt), seg);
 6042}
 6043
 6044static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
 6045				 struct desc_struct *desc, u32 *base3,
 6046				 int seg)
 6047{
 6048	struct kvm_segment var;
 6049
 6050	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
 6051	*selector = var.selector;
 6052
 6053	if (var.unusable) {
 6054		memset(desc, 0, sizeof(*desc));
 6055		if (base3)
 6056			*base3 = 0;
 6057		return false;
 6058	}
 6059
 6060	if (var.g)
 6061		var.limit >>= 12;
 6062	set_desc_limit(desc, var.limit);
 6063	set_desc_base(desc, (unsigned long)var.base);
 6064#ifdef CONFIG_X86_64
 6065	if (base3)
 6066		*base3 = var.base >> 32;
 6067#endif
 6068	desc->type = var.type;
 6069	desc->s = var.s;
 6070	desc->dpl = var.dpl;
 6071	desc->p = var.present;
 6072	desc->avl = var.avl;
 6073	desc->l = var.l;
 6074	desc->d = var.db;
 6075	desc->g = var.g;
 6076
 6077	return true;
 6078}
 6079
 6080static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
 6081				 struct desc_struct *desc, u32 base3,
 6082				 int seg)
 6083{
 6084	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 6085	struct kvm_segment var;
 6086
 6087	var.selector = selector;
 6088	var.base = get_desc_base(desc);
 6089#ifdef CONFIG_X86_64
 6090	var.base |= ((u64)base3) << 32;
 6091#endif
 6092	var.limit = get_desc_limit(desc);
 6093	if (desc->g)
 6094		var.limit = (var.limit << 12) | 0xfff;
 6095	var.type = desc->type;
 
 6096	var.dpl = desc->dpl;
 6097	var.db = desc->d;
 6098	var.s = desc->s;
 6099	var.l = desc->l;
 6100	var.g = desc->g;
 6101	var.avl = desc->avl;
 6102	var.present = desc->p;
 6103	var.unusable = !var.present;
 6104	var.padding = 0;
 6105
 6106	kvm_set_segment(vcpu, &var, seg);
 6107	return;
 6108}
 6109
 6110static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
 6111			    u32 msr_index, u64 *pdata)
 6112{
 6113	return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
 6114}
 6115
 6116static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 6117			    u32 msr_index, u64 data)
 6118{
 6119	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
 6120}
 6121
 6122static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
 6123{
 6124	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 6125
 6126	return vcpu->arch.smbase;
 6127}
 6128
 6129static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
 6130{
 6131	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 6132
 6133	vcpu->arch.smbase = smbase;
 
 
 
 
 6134}
 6135
 6136static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
 6137			      u32 pmc)
 6138{
 6139	return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
 6140}
 6141
 6142static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
 6143			     u32 pmc, u64 *pdata)
 6144{
 6145	return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
 6146}
 6147
 6148static void emulator_halt(struct x86_emulate_ctxt *ctxt)
 6149{
 6150	emul_to_vcpu(ctxt)->arch.halt_request = 1;
 6151}
 6152
 6153static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
 6154			      struct x86_instruction_info *info,
 6155			      enum x86_intercept_stage stage)
 6156{
 6157	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
 6158}
 6159
 6160static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
 6161			u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
 6162{
 6163	return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
 6164}
 6165
 6166static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
 6167{
 6168	return kvm_register_read(emul_to_vcpu(ctxt), reg);
 6169}
 6170
 6171static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
 6172{
 6173	kvm_register_write(emul_to_vcpu(ctxt), reg, val);
 6174}
 6175
 6176static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
 6177{
 6178	kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
 6179}
 6180
 6181static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
 6182{
 6183	return emul_to_vcpu(ctxt)->arch.hflags;
 6184}
 6185
 6186static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
 6187{
 6188	emul_to_vcpu(ctxt)->arch.hflags = emul_flags;
 6189}
 6190
 6191static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
 6192				  const char *smstate)
 6193{
 6194	return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate);
 6195}
 6196
 6197static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
 6198{
 6199	kvm_smm_changed(emul_to_vcpu(ctxt));
 6200}
 6201
 6202static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
 6203{
 6204	return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
 6205}
 6206
/*
 * Callback table of type struct x86_emulate_ops.  Most entries are the
 * emulator_* wrappers defined in this file, which recover the vCPU from
 * the emulation context with emul_to_vcpu().
 */
static const struct x86_emulate_ops emulate_ops = {
	/* General-purpose register access. */
	.read_gpr            = emulator_read_gpr,
	.write_gpr           = emulator_write_gpr,
	/* Guest memory access. */
	.read_std            = emulator_read_std,
	.write_std           = emulator_write_std,
	.read_phys           = kvm_read_guest_phys_system,
	.fetch               = kvm_fetch_guest_virt,
	.read_emulated       = emulator_read_emulated,
	.write_emulated      = emulator_write_emulated,
	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
	.invlpg              = emulator_invlpg,
	/* Port I/O. */
	.pio_in_emulated     = emulator_pio_in_emulated,
	.pio_out_emulated    = emulator_pio_out_emulated,
	/* Segment and descriptor-table state. */
	.get_segment         = emulator_get_segment,
	.set_segment         = emulator_set_segment,
	.get_cached_segment_base = emulator_get_cached_segment_base,
	.get_gdt             = emulator_get_gdt,
	.get_idt	     = emulator_get_idt,
	.set_gdt             = emulator_set_gdt,
	.set_idt	     = emulator_set_idt,
	/* Control registers, debug registers, SMBASE, MSRs and PMCs. */
	.get_cr              = emulator_get_cr,
	.set_cr              = emulator_set_cr,
	.cpl                 = emulator_get_cpl,
	.get_dr              = emulator_get_dr,
	.set_dr              = emulator_set_dr,
	.get_smbase          = emulator_get_smbase,
	.set_smbase          = emulator_set_smbase,
	.set_msr             = emulator_set_msr,
	.get_msr             = emulator_get_msr,
	.check_pmc	     = emulator_check_pmc,
	.read_pmc            = emulator_read_pmc,
	/* Remaining callbacks. */
	.halt                = emulator_halt,
	.wbinvd              = emulator_wbinvd,
	.fix_hypercall       = emulator_fix_hypercall,
	.intercept           = emulator_intercept,
	.get_cpuid           = emulator_get_cpuid,
	.set_nmi_mask        = emulator_set_nmi_mask,
	.get_hflags          = emulator_get_hflags,
	.set_hflags          = emulator_set_hflags,
	.pre_leave_smm       = emulator_pre_leave_smm,
	.post_leave_smm      = emulator_post_leave_smm,
	.set_xcr             = emulator_set_xcr,
};
 6250
 
 
 
 
 
 
 
 
 6251static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 6252{
 6253	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
 6254	/*
 6255	 * an sti; sti; sequence only disable interrupts for the first
 6256	 * instruction. So, if the last instruction, be it emulated or
 6257	 * not, left the system with the INT_STI flag enabled, it
 6258	 * means that the last instruction is an sti. We should not
 6259	 * leave the flag on in this case. The same goes for mov ss
 6260	 */
 6261	if (int_shadow & mask)
 6262		mask = 0;
 6263	if (unlikely(int_shadow || mask)) {
 6264		kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
 6265		if (!mask)
 6266			kvm_make_request(KVM_REQ_EVENT, vcpu);
 6267	}
 6268}
 6269
 6270static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
 6271{
 6272	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 6273	if (ctxt->exception.vector == PF_VECTOR)
 6274		return kvm_propagate_fault(vcpu, &ctxt->exception);
 6275
 6276	if (ctxt->exception.error_code_valid)
 6277		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
 6278				      ctxt->exception.error_code);
 6279	else
 6280		kvm_queue_exception(vcpu, ctxt->exception.vector);
 6281	return false;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 6282}
 6283
 6284static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 6285{
 6286	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 6287	int cs_db, cs_l;
 6288
 
 
 
 
 
 
 
 
 6289	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 6290
 6291	ctxt->eflags = kvm_get_rflags(vcpu);
 6292	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
 6293
 6294	ctxt->eip = kvm_rip_read(vcpu);
 6295	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
 6296		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
 6297		     (cs_l && is_long_mode(vcpu))	? X86EMUL_MODE_PROT64 :
 6298		     cs_db				? X86EMUL_MODE_PROT32 :
 6299							  X86EMUL_MODE_PROT16;
 6300	BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
 6301	BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
 6302	BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
 6303
 6304	init_decode_cache(ctxt);
 6305	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 6306}
 6307
 6308void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 6309{
 6310	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 6311	int ret;
 6312
 6313	init_emulate_ctxt(vcpu);
 6314
 6315	ctxt->op_bytes = 2;
 6316	ctxt->ad_bytes = 2;
 6317	ctxt->_eip = ctxt->eip + inc_eip;
 6318	ret = emulate_int_real(ctxt, irq);
 6319
 6320	if (ret != X86EMUL_CONTINUE) {
 6321		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 6322	} else {
 6323		ctxt->eip = ctxt->_eip;
 6324		kvm_rip_write(vcpu, ctxt->eip);
 6325		kvm_set_rflags(vcpu, ctxt->eflags);
 6326	}
 
 
 
 
 
 
 
 6327}
 6328EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
 6329
 6330static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 6331{
 
 
 6332	++vcpu->stat.insn_emulation_fail;
 6333	trace_kvm_emulate_insn_failed(vcpu);
 6334
 6335	if (emulation_type & EMULTYPE_VMWARE_GP) {
 6336		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 6337		return 1;
 6338	}
 6339
 6340	if (emulation_type & EMULTYPE_SKIP) {
 6341		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 6342		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 6343		vcpu->run->internal.ndata = 0;
 6344		return 0;
 6345	}
 6346
 6347	kvm_queue_exception(vcpu, UD_VECTOR);
 6348
 6349	if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
 6350		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 6351		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 6352		vcpu->run->internal.ndata = 0;
 6353		return 0;
 6354	}
 6355
 6356	return 1;
 6357}
 6358
/*
 * Decide, after an emulation failure, whether the instruction should be
 * retried by the guest CPU instead of reporting the failure.
 *
 * @cr2 is the faulting address: it is used directly as a GPA when the MMU
 * is direct-mapped, and translated as a GVA otherwise.
 * @write_fault_to_shadow_pgtable is true when the access faulted on its
 * own page table.
 *
 * Returns true to retry and false to report the failure.  Retrying is only
 * considered when @emulation_type has EMULTYPE_ALLOW_RETRY and the vCPU is
 * not in guest mode.
 */
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
				  bool write_fault_to_shadow_pgtable,
				  int emulation_type)
{
	gpa_t gpa = cr2;
	kvm_pfn_t pfn;

	if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
		return false;

	if (WARN_ON_ONCE(is_guest_mode(vcpu)))
		return false;

	if (!vcpu->arch.mmu->direct_map) {
		/*
		 * Write permission should be allowed since only
		 * write access need to be emulated.
		 */
		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);

		/*
		 * If the mapping is invalid in guest, let cpu retry
		 * it to generate fault.
		 */
		if (gpa == UNMAPPED_GVA)
			return true;
	}

	/*
	 * Do not retry the unhandleable instruction if it faults on the
	 * readonly host memory, otherwise it will goto a infinite loop:
	 * retry instruction -> write #PF -> emulation fail -> retry
	 * instruction -> ...
	 */
	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));

	/*
	 * If the instruction failed on the error pfn, it can not be fixed,
	 * report the error to userspace.
	 */
	if (is_error_noslot_pfn(pfn))
		return false;

	/* The pfn was only needed for the check above. */
	kvm_release_pfn_clean(pfn);

	/* The instructions are well-emulated on direct mmu. */
	if (vcpu->arch.mmu->direct_map) {
		unsigned int indirect_shadow_pages;

		/* Sample the counter under mmu_lock. */
		spin_lock(&vcpu->kvm->mmu_lock);
		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
		spin_unlock(&vcpu->kvm->mmu_lock);

		if (indirect_shadow_pages)
			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));

		return true;
	}

	/*
	 * if emulation was due to access to shadowed page table
	 * and it failed try to unshadow page and re-enter the
	 * guest to let CPU execute the instruction.
	 */
	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));

	/*
	 * If the access faults on its page table, it can not
	 * be fixed by unprotecting shadow page and it should
	 * be reported to userspace.
	 */
	return !write_fault_to_shadow_pgtable;
}
 6432
/*
 * Decide whether the faulting instruction can be retried after
 * unprotecting the page at @cr2, instead of being emulated.
 *
 * Returns true if the page was unprotected and the instruction should be
 * retried, false if emulation should proceed.  The last retried EIP and
 * fault address are remembered in the vCPU so that an immediate repeat of
 * the same pair is not retried again.
 */
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
			      unsigned long cr2,  int emulation_type)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
	unsigned long last_retry_eip, last_retry_addr, gpa = cr2;

	/* Snapshot the previous retry state before it is reset below. */
	last_retry_eip = vcpu->arch.last_retry_eip;
	last_retry_addr = vcpu->arch.last_retry_addr;

	/*
	 * If the emulation is caused by #PF and it is non-page_table
	 * writing instruction, it means the VM-EXIT is caused by shadow
	 * page protected, we can zap the shadow page and retry this
	 * instruction directly.
	 *
	 * Note: if the guest uses a non-page-table modifying instruction
	 * on the PDE that points to the instruction, then we will unmap
	 * the instruction and go to an infinite loop. So, we cache the
	 * last retried eip and the last fault address, if we meet the eip
	 * and the address again, we can break out of the potential infinite
	 * loop.
	 */
	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;

	if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
		return false;

	if (WARN_ON_ONCE(is_guest_mode(vcpu)))
		return false;

	if (x86_page_table_writing_insn(ctxt))
		return false;

	/* Same instruction and address as the last retry: do not loop. */
	if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
		return false;

	vcpu->arch.last_retry_eip = ctxt->eip;
	vcpu->arch.last_retry_addr = cr2;

	/* Without a direct-mapped MMU, cr2 is a GVA and must be translated. */
	if (!vcpu->arch.mmu->direct_map)
		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);

	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));

	return true;
}
 6479
/* Forward declarations; both functions are defined later in this file. */
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
 6482
 6483static void kvm_smm_changed(struct kvm_vcpu *vcpu)
 6484{
 6485	if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
 6486		/* This is a good place to trace that we are exiting SMM.  */
 6487		trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
 6488
 6489		/* Process a latched INIT or SMI, if any.  */
 6490		kvm_make_request(KVM_REQ_EVENT, vcpu);
 6491	}
 6492
 6493	kvm_mmu_reset_context(vcpu);
 6494}
 6495
 6496static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
 6497				unsigned long *db)
 6498{
 6499	u32 dr6 = 0;
 6500	int i;
 6501	u32 enable, rwlen;
 6502
 6503	enable = dr7;
 6504	rwlen = dr7 >> 16;
 6505	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
 6506		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
 6507			dr6 |= (1 << i);
 6508	return dr6;
 6509}
 6510
 6511static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
 6512{
 6513	struct kvm_run *kvm_run = vcpu->run;
 6514
 6515	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
 6516		kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
 6517		kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
 6518		kvm_run->debug.arch.exception = DB_VECTOR;
 6519		kvm_run->exit_reason = KVM_EXIT_DEBUG;
 6520		return 0;
 6521	}
 6522	kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
 6523	return 1;
 6524}
 6525
 6526int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
 6527{
 6528	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
 6529	int r;
 6530
 6531	r = kvm_x86_ops->skip_emulated_instruction(vcpu);
 6532	if (unlikely(!r))
 6533		return 0;
 6534
 6535	/*
 6536	 * rflags is the old, "raw" value of the flags.  The new value has
 6537	 * not been saved yet.
 6538	 *
 6539	 * This is correct even for TF set by the guest, because "the
 6540	 * processor will not generate this exception after the instruction
 6541	 * that sets the TF flag".
 6542	 */
 6543	if (unlikely(rflags & X86_EFLAGS_TF))
 6544		r = kvm_vcpu_do_singlestep(vcpu);
 6545	return r;
 6546}
 6547EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
 6548
 6549static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
 6550{
 6551	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
 6552	    (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
 6553		struct kvm_run *kvm_run = vcpu->run;
 6554		unsigned long eip = kvm_get_linear_rip(vcpu);
 6555		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
 6556					   vcpu->arch.guest_debug_dr7,
 6557					   vcpu->arch.eff_db);
 6558
 6559		if (dr6 != 0) {
 6560			kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
 6561			kvm_run->debug.arch.pc = eip;
 6562			kvm_run->debug.arch.exception = DB_VECTOR;
 6563			kvm_run->exit_reason = KVM_EXIT_DEBUG;
 6564			*r = 0;
 6565			return true;
 6566		}
 6567	}
 6568
 6569	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
 6570	    !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
 6571		unsigned long eip = kvm_get_linear_rip(vcpu);
 6572		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
 6573					   vcpu->arch.dr7,
 6574					   vcpu->arch.db);
 6575
 6576		if (dr6 != 0) {
 6577			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
 6578			vcpu->arch.dr6 |= dr6 | DR6_RTM;
 6579			kvm_queue_exception(vcpu, DB_VECTOR);
 6580			*r = 1;
 6581			return true;
 6582		}
 6583	}
 6584
 6585	return false;
 6586}
 6587
 6588static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
 6589{
 6590	switch (ctxt->opcode_len) {
 6591	case 1:
 6592		switch (ctxt->b) {
 6593		case 0xe4:	/* IN */
 6594		case 0xe5:
 6595		case 0xec:
 6596		case 0xed:
 6597		case 0xe6:	/* OUT */
 6598		case 0xe7:
 6599		case 0xee:
 6600		case 0xef:
 6601		case 0x6c:	/* INS */
 6602		case 0x6d:
 6603		case 0x6e:	/* OUTS */
 6604		case 0x6f:
 6605			return true;
 6606		}
 6607		break;
 6608	case 2:
 6609		switch (ctxt->b) {
 6610		case 0x33:	/* RDPMC */
 6611			return true;
 6612		}
 6613		break;
 6614	}
 6615
 6616	return false;
 6617}
 6618
/*
 * Decode (unless EMULTYPE_NO_DECODE is set) and emulate one guest
 * instruction.
 *
 * @vcpu:           vCPU whose emulation context is used.
 * @cr2:            address associated with the fault that triggered
 *                  emulation; passed to the re-execute/retry helpers and
 *                  stored in ctxt->exception.address before emulating.
 * @emulation_type: bitmask of EMULTYPE_* flags selecting the variants
 *                  handled below (NO_DECODE, SKIP, TRAP_UD,
 *                  TRAP_UD_FORCED, VMWARE_GP, ...).
 * @insn, @insn_len: handed unchanged to x86_decode_insn().
 *
 * Return value, as far as this function itself determines it:
 *   1 - an exception was queued, the instruction was skipped, retried or
 *       re-executed, or emulation completed with nothing pending;
 *   0 - a KVM_EXIT_DEBUG exit was prepared, or the instruction left PIO
 *       or MMIO pending (a completion callback is installed except for
 *       PIO writes);
 *   otherwise the result of handle_emulation_failure().
 */
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
			    unsigned long cr2,
			    int emulation_type,
			    void *insn,
			    int insn_len)
{
	int r;
	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
	bool writeback = true;
	/* Snapshot the flag before it is cleared just below. */
	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;

	vcpu->arch.l1tf_flush_l1d = true;

	/*
	 * Clear write_fault_to_shadow_pgtable here to ensure it is
	 * never reused.
	 */
	vcpu->arch.write_fault_to_shadow_pgtable = false;
	kvm_clear_exception_queue(vcpu);

	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
		init_emulate_ctxt(vcpu);

		/*
		 * We will reenter on the same instruction since
		 * we do not set complete_userspace_io.  This does not
		 * handle watchpoints yet, those would be handled in
		 * the emulate_ops.
		 */
		if (!(emulation_type & EMULTYPE_SKIP) &&
		    kvm_vcpu_check_breakpoint(vcpu, &r))
			return r;

		/* Reset the per-instruction emulator state before decoding. */
		ctxt->interruptibility = 0;
		ctxt->have_exception = false;
		ctxt->exception.vector = -1;
		ctxt->perm_ok = false;

		ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;

		r = x86_decode_insn(ctxt, insn, insn_len);

		trace_kvm_emulate_insn_start(vcpu);
		++vcpu->stat.insn_emulation;
		if (r != EMULATION_OK)  {
			/* Decode failed while handling a #UD: reflect #UD. */
			if ((emulation_type & EMULTYPE_TRAP_UD) ||
			    (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
				kvm_queue_exception(vcpu, UD_VECTOR);
				return 1;
			}
			if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
						emulation_type))
				return 1;
			if (ctxt->have_exception) {
				/*
				 * #UD should result in just EMULATION_FAILED, and trap-like
				 * exception should not be encountered during decode.
				 */
				WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
					     exception_type(ctxt->exception.vector) == EXCPT_TRAP);
				inject_emulated_exception(vcpu);
				return 1;
			}
			return handle_emulation_failure(vcpu, emulation_type);
		}
	}

	/*
	 * With EMULTYPE_VMWARE_GP only the VMware backdoor opcodes are
	 * emulated; anything else gets a #GP(0) queued.
	 */
	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
	    !is_vmware_backdoor_opcode(ctxt)) {
		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
		return 1;
	}

	/*
	 * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
	 * for kvm_skip_emulated_instruction().  The caller is responsible for
	 * updating interruptibility state and injecting single-step #DBs.
	 */
	if (emulation_type & EMULTYPE_SKIP) {
		/* Advance RIP past the decoded instruction and clear RF. */
		kvm_rip_write(vcpu, ctxt->_eip);
		if (ctxt->eflags & X86_EFLAGS_RF)
			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
		return 1;
	}

	if (retry_instruction(ctxt, cr2, emulation_type))
		return 1;

	/*
	 * This is needed for the VMware backdoor interface to work, since
	 * it changes register values during the I/O operation.
	 */
	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
		vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
		emulator_invalidate_register_cache(ctxt);
	}

restart:
	/* Save the faulting GPA (cr2) in the address field */
	ctxt->exception.address = cr2;

	r = x86_emulate_insn(ctxt);

	if (r == EMULATION_INTERCEPTED)
		return 1;

	if (r == EMULATION_FAILED) {
		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
					emulation_type))
			return 1;

		return handle_emulation_failure(vcpu, emulation_type);
	}

	/*
	 * From here on r is rewritten from an EMULATION_* code into this
	 * function's 0/1 return value.
	 */
	if (ctxt->have_exception) {
		r = 1;
		if (inject_emulated_exception(vcpu))
			return r;
	} else if (vcpu->arch.pio.count) {
		if (!vcpu->arch.pio.in) {
			/* FIXME: return into emulator if single-stepping.  */
			vcpu->arch.pio.count = 0;
		} else {
			/* PIO read: defer writeback until the data arrives. */
			writeback = false;
			vcpu->arch.complete_userspace_io = complete_emulated_pio;
		}
		r = 0;
	} else if (vcpu->mmio_needed) {
		++vcpu->stat.mmio_exits;

		/* MMIO read: defer writeback until the data arrives. */
		if (!vcpu->mmio_is_write)
			writeback = false;
		r = 0;
		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
	} else if (r == EMULATION_RESTART)
		goto restart;
	else
		r = 1;

	if (writeback) {
		/* Old RFLAGS, used below to detect IF being newly set. */
		unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
		toggle_interruptibility(vcpu, ctxt->interruptibility);
		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
		/*
		 * Commit RIP/RFLAGS only if no exception is pending or the
		 * pending exception is trap-like.
		 */
		if (!ctxt->have_exception ||
		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
			kvm_rip_write(vcpu, ctxt->eip);
			if (r && ctxt->tf)
				r = kvm_vcpu_do_singlestep(vcpu);
			__kvm_set_rflags(vcpu, ctxt->eflags);
		}

		/*
		 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
		 * do nothing, and it will be requested again as soon as
		 * the shadow expires.  But we still need to check here,
		 * because POPF has no interrupt shadow.
		 */
		if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
			kvm_make_request(KVM_REQ_EVENT, vcpu);
	} else
		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;

	return r;
}
 
 6781
 6782int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
 6783{
 6784	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
 6785}
 6786EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
 6787
 6788int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
 6789					void *insn, int insn_len)
 6790{
 6791	return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
 6792}
 6793EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
 6794
/*
 * complete_userspace_io callback installed by kvm_fast_pio_out() for the
 * "OUT 0x7e" quirk.  RIP was already advanced before the exit, so only
 * the pending PIO count is cleared.  Always returns 1.
 */
static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
{
	vcpu->arch.pio.count = 0;
	return 1;
}
 6800
 6801static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
 6802{
 6803	vcpu->arch.pio.count = 0;
 6804
 6805	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
 6806		return 1;
 6807
 6808	return kvm_skip_emulated_instruction(vcpu);
 6809}
 6810
 6811static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
 6812			    unsigned short port)
 6813{
 6814	unsigned long val = kvm_rax_read(vcpu);
 6815	int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
 6816					    size, port, &val, 1);
 6817	if (ret)
 6818		return ret;
 6819
 6820	/*
 6821	 * Workaround userspace that relies on old KVM behavior of %rip being
 6822	 * incremented prior to exiting to userspace to handle "OUT 0x7e".
 6823	 */
 6824	if (port == 0x7e &&
 6825	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
 6826		vcpu->arch.complete_userspace_io =
 6827			complete_fast_pio_out_port_0x7e;
 6828		kvm_skip_emulated_instruction(vcpu);
 6829	} else {
 6830		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
 6831		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
 6832	}
 6833	return 0;
 6834}
 6835
 6836static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
 6837{
 6838	unsigned long val;
 6839
 6840	/* We should only ever be called with arch.pio.count equal to 1 */
 6841	BUG_ON(vcpu->arch.pio.count != 1);
 6842
 6843	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
 6844		vcpu->arch.pio.count = 0;
 6845		return 1;
 6846	}
 6847
 6848	/* For size less than 4 we merge, else we zero extend */
 6849	val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
 6850
 6851	/*
 6852	 * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
 6853	 * the copy and tracing
 6854	 */
 6855	emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
 6856				 vcpu->arch.pio.port, &val, 1);
 6857	kvm_rax_write(vcpu, val);
 6858
 6859	return kvm_skip_emulated_instruction(vcpu);
 6860}
 6861
 6862static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
 6863			   unsigned short port)
 6864{
 6865	unsigned long val;
 6866	int ret;
 6867
 6868	/* For size less than 4 we merge, else we zero extend */
 6869	val = (size < 4) ? kvm_rax_read(vcpu) : 0;
 6870
 6871	ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
 6872				       &val, 1);
 6873	if (ret) {
 6874		kvm_rax_write(vcpu, val);
 6875		return ret;
 6876	}
 6877
 6878	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
 6879	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
 6880
 6881	return 0;
 6882}
 6883
 6884int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
 6885{
 6886	int ret;
 6887
 6888	if (in)
 6889		ret = kvm_fast_pio_in(vcpu, size, port);
 6890	else
 6891		ret = kvm_fast_pio_out(vcpu, size, port);
 6892	return ret && kvm_skip_emulated_instruction(vcpu);
 6893}
 6894EXPORT_SYMBOL_GPL(kvm_fast_pio);
 6895
/*
 * CPU hotplug teardown callback (registered by kvm_timer_init() through
 * cpuhp_setup_state()): zero this CPU's cached cpu_tsc_khz.
 * @cpu is unused.  Always returns 0.
 */
static int kvmclock_cpu_down_prep(unsigned int cpu)
{
	__this_cpu_write(cpu_tsc_khz, 0);
	return 0;
}
 6901
 6902static void tsc_khz_changed(void *data)
 6903{
 6904	struct cpufreq_freqs *freq = data;
 6905	unsigned long khz = 0;
 6906
 6907	if (data)
 6908		khz = freq->new;
 6909	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
 6910		khz = cpufreq_quick_get(raw_smp_processor_id());
 6911	if (!khz)
 6912		khz = tsc_khz;
 6913	__this_cpu_write(cpu_tsc_khz, khz);
 6914}
 6915
 6916#ifdef CONFIG_X86_64
 6917static void kvm_hyperv_tsc_notifier(void)
 6918{
 6919	struct kvm *kvm;
 6920	struct kvm_vcpu *vcpu;
 6921	int cpu;
 6922
 6923	mutex_lock(&kvm_lock);
 6924	list_for_each_entry(kvm, &vm_list, vm_list)
 6925		kvm_make_mclock_inprogress_request(kvm);
 6926
 6927	hyperv_stop_tsc_emulation();
 6928
 6929	/* TSC frequency always matches when on Hyper-V */
 6930	for_each_present_cpu(cpu)
 6931		per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
 6932	kvm_max_guest_tsc_khz = tsc_khz;
 6933
 6934	list_for_each_entry(kvm, &vm_list, vm_list) {
 6935		struct kvm_arch *ka = &kvm->arch;
 6936
 6937		spin_lock(&ka->pvclock_gtod_sync_lock);
 6938
 6939		pvclock_update_vm_gtod_copy(kvm);
 6940
 6941		kvm_for_each_vcpu(cpu, vcpu, kvm)
 6942			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 6943
 6944		kvm_for_each_vcpu(cpu, vcpu, kvm)
 6945			kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
 6946
 6947		spin_unlock(&ka->pvclock_gtod_sync_lock);
 6948	}
 6949	mutex_unlock(&kvm_lock);
 6950}
 6951#endif
 6952
 6953static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 6954{
 
 6955	struct kvm *kvm;
 6956	struct kvm_vcpu *vcpu;
 6957	int i, send_ipi = 0;
 6958
 6959	/*
 6960	 * We allow guests to temporarily run on slowing clocks,
 6961	 * provided we notify them after, or to run on accelerating
 6962	 * clocks, provided we notify them before.  Thus time never
 6963	 * goes backwards.
 6964	 *
 6965	 * However, we have a problem.  We can't atomically update
 6966	 * the frequency of a given CPU from this function; it is
 6967	 * merely a notifier, which can be called from any CPU.
 6968	 * Changing the TSC frequency at arbitrary points in time
 6969	 * requires a recomputation of local variables related to
 6970	 * the TSC for each VCPU.  We must flag these local variables
 6971	 * to be updated and be sure the update takes place with the
 6972	 * new frequency before any guests proceed.
 6973	 *
 6974	 * Unfortunately, the combination of hotplug CPU and frequency
 6975	 * change creates an intractable locking scenario; the order
 6976	 * of when these callouts happen is undefined with respect to
 6977	 * CPU hotplug, and they can race with each other.  As such,
 6978	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
 6979	 * undefined; you can actually have a CPU frequency change take
 6980	 * place in between the computation of X and the setting of the
 6981	 * variable.  To protect against this problem, all updates of
 6982	 * the per_cpu tsc_khz variable are done in an interrupt
 6983	 * protected IPI, and all callers wishing to update the value
 6984	 * must wait for a synchronous IPI to complete (which is trivial
 6985	 * if the caller is on the CPU already).  This establishes the
 6986	 * necessary total order on variable updates.
 6987	 *
 6988	 * Note that because a guest time update may take place
 6989	 * anytime after the setting of the VCPU's request bit, the
 6990	 * correct TSC value must be set before the request.  However,
 6991	 * to ensure the update actually makes it to any guest which
 6992	 * starts running in hardware virtualization between the set
 6993	 * and the acquisition of the spinlock, we must also ping the
 6994	 * CPU after setting the request bit.
 6995	 *
 6996	 */
 6997
 6998	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 
 
 
 
 
 6999
 7000	mutex_lock(&kvm_lock);
 7001	list_for_each_entry(kvm, &vm_list, vm_list) {
 7002		kvm_for_each_vcpu(i, vcpu, kvm) {
 7003			if (vcpu->cpu != cpu)
 7004				continue;
 7005			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 7006			if (vcpu->cpu != raw_smp_processor_id())
 7007				send_ipi = 1;
 7008		}
 7009	}
 7010	mutex_unlock(&kvm_lock);
 7011
 7012	if (freq->old < freq->new && send_ipi) {
 7013		/*
 7014		 * We upscale the frequency.  Must make the guest
 7015		 * doesn't see old kvmclock values while running with
 7016		 * the new frequency, otherwise we risk the guest sees
 7017		 * time go backwards.
 7018		 *
 7019		 * In case we update the frequency for another cpu
 7020		 * (which might be in guest context) send an interrupt
 7021		 * to kick the cpu out of guest context.  Next time
 7022		 * guest context is entered kvmclock will be updated,
 7023		 * so the guest will not see stale values.
 7024		 */
 7025		smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 7026	}
 7027}
 7028
 7029static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 7030				     void *data)
 7031{
 7032	struct cpufreq_freqs *freq = data;
 7033	int cpu;
 7034
 7035	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
 7036		return 0;
 7037	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
 7038		return 0;
 7039
 7040	for_each_cpu(cpu, freq->policy->cpus)
 7041		__kvmclock_cpufreq_notifier(freq, cpu);
 7042
 7043	return 0;
 7044}
 7045
 7046static struct notifier_block kvmclock_cpufreq_notifier_block = {
 7047	.notifier_call  = kvmclock_cpufreq_notifier
 7048};
 7049
/*
 * CPU hotplug startup callback (registered by kvm_timer_init() through
 * cpuhp_setup_state()): recompute this CPU's cpu_tsc_khz with no cpufreq
 * transition data.  @cpu is unused.  Always returns 0.
 */
static int kvmclock_cpu_online(unsigned int cpu)
{
	tsc_khz_changed(NULL);
	return 0;
}
 7055
 
 
 
 
 
 7056static void kvm_timer_init(void)
 7057{
 
 
 7058	max_tsc_khz = tsc_khz;
 7059
 7060	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
 7061#ifdef CONFIG_CPU_FREQ
 7062		struct cpufreq_policy policy;
 7063		int cpu;
 7064
 7065		memset(&policy, 0, sizeof(policy));
 7066		cpu = get_cpu();
 7067		cpufreq_get_policy(&policy, cpu);
 7068		if (policy.cpuinfo.max_freq)
 7069			max_tsc_khz = policy.cpuinfo.max_freq;
 7070		put_cpu();
 7071#endif
 7072		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
 7073					  CPUFREQ_TRANSITION_NOTIFIER);
 7074	}
 7075
 7076	cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
 7077			  kvmclock_cpu_online, kvmclock_cpu_down_prep);
 7078}
 7079
/*
 * Per-CPU pointer to a vCPU.  The perf guest-info callbacks below treat a
 * non-NULL value as "this CPU is in guest context" and read guest state
 * through it.
 */
DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
 7082
 7083int kvm_is_in_guest(void)
 7084{
 7085	return __this_cpu_read(current_vcpu) != NULL;
 7086}
 7087
 7088static int kvm_is_user_mode(void)
 7089{
 7090	int user_mode = 3;
 7091
 7092	if (__this_cpu_read(current_vcpu))
 7093		user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
 7094
 7095	return user_mode != 0;
 7096}
 7097
 7098static unsigned long kvm_get_guest_ip(void)
 7099{
 7100	unsigned long ip = 0;
 7101
 7102	if (__this_cpu_read(current_vcpu))
 7103		ip = kvm_rip_read(__this_cpu_read(current_vcpu));
 7104
 7105	return ip;
 7106}
 7107
 7108static void kvm_handle_intel_pt_intr(void)
 7109{
 7110	struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
 7111
 7112	kvm_make_request(KVM_REQ_PMI, vcpu);
 7113	__set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
 7114			(unsigned long *)&vcpu->arch.pmu.global_status);
 7115}
 7116
/*
 * Guest-info callbacks handed to perf: registered in kvm_arch_init() and
 * unregistered in kvm_arch_exit().
 */
static struct perf_guest_info_callbacks kvm_guest_cbs = {
	.is_in_guest		= kvm_is_in_guest,
	.is_user_mode		= kvm_is_user_mode,
	.get_guest_ip		= kvm_get_guest_ip,
	.handle_intel_pt_intr	= kvm_handle_intel_pt_intr,
};
 7123
 7124#ifdef CONFIG_X86_64
 7125static void pvclock_gtod_update_fn(struct work_struct *work)
 7126{
 7127	struct kvm *kvm;
 
 
 7128
 7129	struct kvm_vcpu *vcpu;
 7130	int i;
 7131
 7132	mutex_lock(&kvm_lock);
 7133	list_for_each_entry(kvm, &vm_list, vm_list)
 7134		kvm_for_each_vcpu(i, vcpu, kvm)
 7135			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 7136	atomic_set(&kvm_guest_has_master_clock, 0);
 7137	mutex_unlock(&kvm_lock);
 7138}
 
 7139
 7140static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
 7141
 7142/*
 7143 * Notification about pvclock gtod data update.
 7144 */
 7145static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
 7146			       void *priv)
 7147{
 7148	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 7149	struct timekeeper *tk = priv;
 7150
 7151	update_pvclock_gtod(tk);
 
 
 
 
 
 7152
 7153	/* disable master clock if host does not trust, or does not
 7154	 * use, TSC based clocksource.
 
 
 7155	 */
 7156	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
 7157	    atomic_read(&kvm_guest_has_master_clock) != 0)
 7158		queue_work(system_long_wq, &pvclock_gtod_work);
 7159
 7160	return 0;
 7161}
 7162
 7163static struct notifier_block pvclock_gtod_notifier = {
 7164	.notifier_call = pvclock_gtod_notify,
 7165};
 7166#endif
 7167
/*
 * Architecture-level module initialisation.
 *
 * @opaque: the vendor module's struct kvm_x86_ops.
 *
 * Returns 0 on success.  On failure returns -EEXIST (an ops table is
 * already registered), -EOPNOTSUPP (no hardware support, disabled by the
 * BIOS, or no FPU/FXSR), -ENOMEM (cache or per-CPU allocation failed) or
 * the error from kvm_mmu_module_init(); everything allocated so far is
 * released through the labels at the end.
 */
int kvm_arch_init(void *opaque)
{
	int r;
	struct kvm_x86_ops *ops = opaque;

	/* A non-NULL kvm_x86_ops means a vendor module is already loaded. */
	if (kvm_x86_ops) {
		printk(KERN_ERR "kvm: already loaded the other module\n");
		r = -EEXIST;
		goto out;
	}

	if (!ops->cpu_has_kvm_support()) {
		printk(KERN_ERR "kvm: no hardware support\n");
		r = -EOPNOTSUPP;
		goto out;
	}
	if (ops->disabled_by_bios()) {
		printk(KERN_ERR "kvm: disabled by bios\n");
		r = -EOPNOTSUPP;
		goto out;
	}

	/*
	 * KVM explicitly assumes that the guest has an FPU and
	 * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
	 * vCPU's FPU state as a fxregs_state struct.
	 */
	if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
		printk(KERN_ERR "kvm: inadequate fpu\n");
		r = -EOPNOTSUPP;
		goto out;
	}

	/* Both allocations below report failure as -ENOMEM. */
	r = -ENOMEM;
	x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
					  __alignof__(struct fpu), SLAB_ACCOUNT,
					  NULL);
	if (!x86_fpu_cache) {
		printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
		goto out;
	}

	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
	if (!shared_msrs) {
		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
		goto out_free_x86_fpu_cache;
	}

	r = kvm_mmu_module_init();
	if (r)
		goto out_free_percpu;

	/* Nothing below this point can fail. */
	kvm_x86_ops = ops;

	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
			PT_DIRTY_MASK, PT64_NX_MASK, 0,
			PT_PRESENT_MASK, 0, sme_me_mask);
	kvm_timer_init();

	perf_register_guest_info_callbacks(&kvm_guest_cbs);

	if (boot_cpu_has(X86_FEATURE_XSAVE))
		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);

	kvm_lapic_init();
	/* -1 means "not set": pick a default from the housekeeping setup. */
	if (pi_inject_timer == -1)
		pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
#ifdef CONFIG_X86_64
	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);

	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
		set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
#endif

	return 0;

	/* Error unwinding, in reverse order of allocation. */
out_free_percpu:
	free_percpu(shared_msrs);
out_free_x86_fpu_cache:
	kmem_cache_destroy(x86_fpu_cache);
out:
	return r;
}
 7251
/*
 * Architecture-level module teardown: unregisters the callbacks and
 * notifiers and frees the allocations that kvm_arch_init() and
 * kvm_timer_init() set up.
 */
void kvm_arch_exit(void)
{
#ifdef CONFIG_X86_64
	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
		clear_hv_tscchange_cb();
#endif
	kvm_lapic_exit();
	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);

	/* Same condition under which kvm_timer_init() registered it. */
	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
					    CPUFREQ_TRANSITION_NOTIFIER);
	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
#endif
	/* Clearing the ops pointer lets kvm_arch_init() succeed again. */
	kvm_x86_ops = NULL;
	kvm_mmu_module_exit();
	free_percpu(shared_msrs);
	kmem_cache_destroy(x86_fpu_cache);
}
 7273
 7274int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 7275{
 7276	++vcpu->stat.halt_exits;
 7277	if (lapic_in_kernel(vcpu)) {
 7278		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
 7279		return 1;
 7280	} else {
 7281		vcpu->run->exit_reason = KVM_EXIT_HLT;
 7282		return 0;
 7283	}
 7284}
 7285EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
 7286
 7287int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 
 7288{
 7289	int ret = kvm_skip_emulated_instruction(vcpu);
 7290	/*
 7291	 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
 7292	 * KVM_EXIT_DEBUG here.
 7293	 */
 7294	return kvm_vcpu_halt(vcpu) && ret;
 7295}
 7296EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 7297
 7298#ifdef CONFIG_X86_64
 7299static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
 7300			        unsigned long clock_type)
 7301{
 7302	struct kvm_clock_pairing clock_pairing;
 7303	struct timespec64 ts;
 7304	u64 cycle;
 7305	int ret;
 7306
 7307	if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
 7308		return -KVM_EOPNOTSUPP;
 
 
 
 
 
 
 7309
 7310	if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
 7311		return -KVM_EOPNOTSUPP;
 7312
 7313	clock_pairing.sec = ts.tv_sec;
 7314	clock_pairing.nsec = ts.tv_nsec;
 7315	clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
 7316	clock_pairing.flags = 0;
 7317	memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
 7318
 7319	ret = 0;
 7320	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
 7321			    sizeof(struct kvm_clock_pairing)))
 7322		ret = -KVM_EFAULT;
 7323
 7324	return ret;
 7325}
 
 7326#endif
 7327
 7328/*
 7329 * kvm_pv_kick_cpu_op:  Kick a vcpu.
 7330 *
 7331 * @apicid - apicid of vcpu to be kicked.
 7332 */
 7333static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
 7334{
 7335	struct kvm_lapic_irq lapic_irq;
 
 
 
 
 
 
 
 7336
 7337	lapic_irq.shorthand = 0;
 7338	lapic_irq.dest_mode = 0;
 7339	lapic_irq.level = 0;
 7340	lapic_irq.dest_id = apicid;
 7341	lapic_irq.msi_redir_hint = false;
 7342
 7343	lapic_irq.delivery_mode = APIC_DM_REMRD;
 7344	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
 7345}
 7346
 7347void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
 7348{
 7349	if (!lapic_in_kernel(vcpu)) {
 7350		WARN_ON_ONCE(vcpu->arch.apicv_active);
 7351		return;
 7352	}
 7353	if (!vcpu->arch.apicv_active)
 7354		return;
 7355
 7356	vcpu->arch.apicv_active = false;
 7357	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
 7358}
 7359
 7360static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
 7361{
 7362	struct kvm_vcpu *target = NULL;
 7363	struct kvm_apic_map *map;
 7364
 7365	rcu_read_lock();
 7366	map = rcu_dereference(kvm->arch.apic_map);
 7367
 7368	if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
 7369		target = map->phys_map[dest_id]->vcpu;
 7370
 7371	rcu_read_unlock();
 7372
 7373	if (target && READ_ONCE(target->ready))
 7374		kvm_vcpu_yield_to(target);
 7375}
 7376
 7377int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 7378{
 7379	unsigned long nr, a0, a1, a2, a3, ret;
 7380	int op_64_bit;
 7381
 7382	if (kvm_hv_hypercall_enabled(vcpu->kvm))
 7383		return kvm_hv_hypercall(vcpu);
 7384
 7385	nr = kvm_rax_read(vcpu);
 7386	a0 = kvm_rbx_read(vcpu);
 7387	a1 = kvm_rcx_read(vcpu);
 7388	a2 = kvm_rdx_read(vcpu);
 7389	a3 = kvm_rsi_read(vcpu);
 7390
 7391	trace_kvm_hypercall(nr, a0, a1, a2, a3);
 7392
 7393	op_64_bit = is_64_bit_mode(vcpu);
 7394	if (!op_64_bit) {
 7395		nr &= 0xFFFFFFFF;
 7396		a0 &= 0xFFFFFFFF;
 7397		a1 &= 0xFFFFFFFF;
 7398		a2 &= 0xFFFFFFFF;
 7399		a3 &= 0xFFFFFFFF;
 7400	}
 7401
 7402	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
 7403		ret = -KVM_EPERM;
 7404		goto out;
 7405	}
 7406
 7407	switch (nr) {
 7408	case KVM_HC_VAPIC_POLL_IRQ:
 7409		ret = 0;
 7410		break;
 7411	case KVM_HC_KICK_CPU:
 7412		kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
 7413		kvm_sched_yield(vcpu->kvm, a1);
 7414		ret = 0;
 7415		break;
 7416#ifdef CONFIG_X86_64
 7417	case KVM_HC_CLOCK_PAIRING:
 7418		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
 7419		break;
 7420#endif
 7421	case KVM_HC_SEND_IPI:
 7422		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
 7423		break;
 7424	case KVM_HC_SCHED_YIELD:
 7425		kvm_sched_yield(vcpu->kvm, a0);
 7426		ret = 0;
 7427		break;
 7428	default:
 7429		ret = -KVM_ENOSYS;
 7430		break;
 7431	}
 7432out:
 7433	if (!op_64_bit)
 7434		ret = (u32)ret;
 7435	kvm_rax_write(vcpu, ret);
 7436
 7437	++vcpu->stat.hypercalls;
 7438	return kvm_skip_emulated_instruction(vcpu);
 7439}
 7440EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 7441
 7442static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 7443{
 7444	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 7445	char instruction[3];
 7446	unsigned long rip = kvm_rip_read(vcpu);
 7447
 
 
 
 
 
 
 
 7448	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 7449
 7450	return emulator_write_emulated(ctxt, rip, instruction, 3,
 7451		&ctxt->exception);
 7452}
 7453
 7454static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
 7455{
 7456	return vcpu->run->request_interrupt_window &&
 7457		likely(!pic_in_kernel(vcpu->kvm));
 7458}
 7459
 7460static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 7461{
 7462	struct kvm_run *kvm_run = vcpu->run;
 7463
 7464	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
 7465	kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
 7466	kvm_run->cr8 = kvm_get_cr8(vcpu);
 7467	kvm_run->apic_base = kvm_get_apic_base(vcpu);
 7468	kvm_run->ready_for_interrupt_injection =
 7469		pic_in_kernel(vcpu->kvm) ||
 7470		kvm_vcpu_ready_for_interrupt_injection(vcpu);
 7471}
 7472
 7473static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 
 
 
 7474{
 7475	int max_irr, tpr;
 7476
 7477	if (!kvm_x86_ops->update_cr8_intercept)
 7478		return;
 7479
 7480	if (!lapic_in_kernel(vcpu))
 7481		return;
 7482
 7483	if (vcpu->arch.apicv_active)
 7484		return;
 7485
 7486	if (!vcpu->arch.apic->vapic_addr)
 7487		max_irr = kvm_lapic_find_highest_irr(vcpu);
 7488	else
 7489		max_irr = -1;
 7490
 7491	if (max_irr != -1)
 7492		max_irr >>= 4;
 7493
 7494	tpr = kvm_lapic_get_cr8(vcpu);
 7495
 7496	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 7497}
 7498
/*
 * Inject at most one event into the guest.
 *
 * An event that was already injected earlier (exception, NMI or
 * interrupt) is re-injected first.  Then a pending exception is turned
 * into an injected one.  If nothing needs re-injection after that, one
 * new event is chosen in this order: SMI, NMI, external interrupt.
 *
 * @req_int_win is passed through to check_nested_events().
 *
 * Returns the non-zero result of check_nested_events() if it reports
 * one, otherwise 0.
 */
static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
{
	int r;

	/* try to reinject previous events if any */

	if (vcpu->arch.exception.injected)
		kvm_x86_ops->queue_exception(vcpu);
	/*
	 * Do not inject an NMI or interrupt if there is a pending
	 * exception.  Exceptions and interrupts are recognized at
	 * instruction boundaries, i.e. the start of an instruction.
	 * Trap-like exceptions, e.g. #DB, have higher priority than
	 * NMIs and interrupts, i.e. traps are recognized before an
	 * NMI/interrupt that's pending on the same instruction.
	 * Fault-like exceptions, e.g. #GP and #PF, are the lowest
	 * priority, but are only generated (pended) during instruction
	 * execution, i.e. a pending fault-like exception means the
	 * fault occurred on the *previous* instruction and must be
	 * serviced prior to recognizing any new events in order to
	 * fully complete the previous instruction.
	 */
	else if (!vcpu->arch.exception.pending) {
		if (vcpu->arch.nmi_injected)
			kvm_x86_ops->set_nmi(vcpu);
		else if (vcpu->arch.interrupt.injected)
			kvm_x86_ops->set_irq(vcpu);
	}

	/*
	 * Call check_nested_events() even if we reinjected a previous event
	 * in order for caller to determine if it should require immediate-exit
	 * from L2 to L1 due to pending L1 events which require exit
	 * from L2 to L1.
	 */
	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
		r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
		if (r != 0)
			return r;
	}

	/* try to inject new event if pending */
	if (vcpu->arch.exception.pending) {
		trace_kvm_inj_exception(vcpu->arch.exception.nr,
					vcpu->arch.exception.has_error_code,
					vcpu->arch.exception.error_code);

		/* Move the exception from "pending" to "injected". */
		WARN_ON_ONCE(vcpu->arch.exception.injected);
		vcpu->arch.exception.pending = false;
		vcpu->arch.exception.injected = true;

		/* Fault-like exceptions are delivered with RFLAGS.RF set. */
		if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
					     X86_EFLAGS_RF);

		if (vcpu->arch.exception.nr == DB_VECTOR) {
			/*
			 * This code assumes that nSVM doesn't use
			 * check_nested_events(). If it does, the
			 * DR6/DR7 changes should happen before L1
			 * gets a #VMEXIT for an intercepted #DB in
			 * L2.  (Under VMX, on the other hand, the
			 * DR6/DR7 changes should not happen in the
			 * event of a VM-exit to L1 for an intercepted
			 * #DB in L2.)
			 */
			kvm_deliver_exception_payload(vcpu);
			/* Delivering #DB clears DR7.GD. */
			if (vcpu->arch.dr7 & DR7_GD) {
				vcpu->arch.dr7 &= ~DR7_GD;
				kvm_update_dr7(vcpu);
			}
		}

		kvm_x86_ops->queue_exception(vcpu);
	}

	/* Don't consider new event if we re-injected an event */
	if (kvm_event_needs_reinjection(vcpu))
		return 0;

	if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
	    kvm_x86_ops->smi_allowed(vcpu)) {
		vcpu->arch.smi_pending = false;
		++vcpu->arch.smi_count;
		enter_smm(vcpu);
	} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
		/* Consume one pending NMI and mark it injected. */
		--vcpu->arch.nmi_pending;
		vcpu->arch.nmi_injected = true;
		kvm_x86_ops->set_nmi(vcpu);
	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
		/*
		 * Because interrupts can be injected asynchronously, we are
		 * calling check_nested_events again here to avoid a race condition.
		 * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
		 * proposal and current concerns.  Perhaps we should be setting
		 * KVM_REQ_EVENT only on certain events and not unconditionally?
		 */
		if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
			r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
			if (r != 0)
				return r;
		}
		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
					    false);
			kvm_x86_ops->set_irq(vcpu);
		}
	}

	return 0;
}
 7610
 7611static void process_nmi(struct kvm_vcpu *vcpu)
 
 
 
 
 
 
 7612{
 7613	unsigned limit = 2;
 7614
 7615	/*
 7616	 * x86 is limited to one NMI running, and one NMI pending after it.
 7617	 * If an NMI is already in progress, limit further NMIs to just one.
 7618	 * Otherwise, allow two (and we'll inject the first one immediately).
 7619	 */
 7620	if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
 7621		limit = 1;
 7622
 7623	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
 7624	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
 7625	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 7626}
 
 7627
 7628static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
 
 
 
 
 
 
 7629{
 7630	u32 flags = 0;
 7631	flags |= seg->g       << 23;
 7632	flags |= seg->db      << 22;
 7633	flags |= seg->l       << 21;
 7634	flags |= seg->avl     << 20;
 7635	flags |= seg->present << 15;
 7636	flags |= seg->dpl     << 13;
 7637	flags |= seg->s       << 12;
 7638	flags |= seg->type    << 8;
 7639	return flags;
 7640}
 7641
 7642static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
 7643{
 7644	struct kvm_segment seg;
 7645	int offset;
 7646
 7647	kvm_get_segment(vcpu, &seg, n);
 7648	put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
 7649
 7650	if (n < 3)
 7651		offset = 0x7f84 + n * 12;
 7652	else
 7653		offset = 0x7f2c + (n - 3) * 12;
 7654
 7655	put_smstate(u32, buf, offset + 8, seg.base);
 7656	put_smstate(u32, buf, offset + 4, seg.limit);
 7657	put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
 7658}
 7659
 #ifdef CONFIG_X86_64
 /*
  * Store segment register @n into the 64-bit state-save image in @buf.
  *
  * Each segment owns a 16-byte record at 0x7e00 + n * 16 holding a 16-bit
  * selector, 16 bits of attributes (the packed flags shifted right by 8),
  * a 32-bit limit and a 64-bit base.
  */
 static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 {
 	struct kvm_segment sreg;
 	int rec = 0x7e00 + n * 16;
 	u16 attrs;

 	kvm_get_segment(vcpu, &sreg, n);
 	attrs = enter_smm_get_segment_flags(&sreg) >> 8;

 	put_smstate(u16, buf, rec, sreg.selector);
 	put_smstate(u16, buf, rec + 2, attrs);
 	put_smstate(u32, buf, rec + 4, sreg.limit);
 	put_smstate(u64, buf, rec + 8, sreg.base);
 }
 #endif
 7677
 7678static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 7679{
 7680	struct desc_ptr dt;
 7681	struct kvm_segment seg;
 7682	unsigned long val;
 7683	int i;
 7684
 7685	put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
 7686	put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
 7687	put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
 7688	put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
 7689
 7690	for (i = 0; i < 8; i++)
 7691		put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
 7692
 7693	kvm_get_dr(vcpu, 6, &val);
 7694	put_smstate(u32, buf, 0x7fcc, (u32)val);
 7695	kvm_get_dr(vcpu, 7, &val);
 7696	put_smstate(u32, buf, 0x7fc8, (u32)val);
 7697
 7698	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
 7699	put_smstate(u32, buf, 0x7fc4, seg.selector);
 7700	put_smstate(u32, buf, 0x7f64, seg.base);
 7701	put_smstate(u32, buf, 0x7f60, seg.limit);
 7702	put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
 7703
 7704	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
 7705	put_smstate(u32, buf, 0x7fc0, seg.selector);
 7706	put_smstate(u32, buf, 0x7f80, seg.base);
 7707	put_smstate(u32, buf, 0x7f7c, seg.limit);
 7708	put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
 7709
 7710	kvm_x86_ops->get_gdt(vcpu, &dt);
 7711	put_smstate(u32, buf, 0x7f74, dt.address);
 7712	put_smstate(u32, buf, 0x7f70, dt.size);
 7713
 7714	kvm_x86_ops->get_idt(vcpu, &dt);
 7715	put_smstate(u32, buf, 0x7f58, dt.address);
 7716	put_smstate(u32, buf, 0x7f54, dt.size);
 7717
 7718	for (i = 0; i < 6; i++)
 7719		enter_smm_save_seg_32(vcpu, buf, i);
 7720
 7721	put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
 7722
 7723	/* revision id */
 7724	put_smstate(u32, buf, 0x7efc, 0x00020000);
 7725	put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
 7726}
 7727
 #ifdef CONFIG_X86_64
 /*
  * Fill @buf with the 64-bit flavour of the SMM state-save image (revision
  * id 0x00020064) from the current state of @vcpu.
  */
 static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 {
 	struct kvm_segment sreg;
 	struct desc_ptr table;
 	unsigned long dr;
 	int idx;

 	/* Registers 0..15, eight bytes each, descending from 0x7ff8. */
 	for (idx = 0; idx < 16; idx++)
 		put_smstate(u64, buf, 0x7ff8 - idx * 8,
 			    kvm_register_read(vcpu, idx));

 	put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
 	put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));

 	/* DR6 and DR7. */
 	kvm_get_dr(vcpu, 6, &dr);
 	put_smstate(u64, buf, 0x7f68, dr);
 	kvm_get_dr(vcpu, 7, &dr);
 	put_smstate(u64, buf, 0x7f60, dr);

 	/* CR0, CR3 and CR4. */
 	put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
 	put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
 	put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));

 	put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);

 	/* revision id */
 	put_smstate(u32, buf, 0x7efc, 0x00020064);

 	put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);

 	/* Task register. */
 	kvm_get_segment(vcpu, &sreg, VCPU_SREG_TR);
 	put_smstate(u16, buf, 0x7e90, sreg.selector);
 	put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&sreg) >> 8);
 	put_smstate(u32, buf, 0x7e94, sreg.limit);
 	put_smstate(u64, buf, 0x7e98, sreg.base);

 	/* IDT. */
 	kvm_x86_ops->get_idt(vcpu, &table);
 	put_smstate(u32, buf, 0x7e84, table.size);
 	put_smstate(u64, buf, 0x7e88, table.address);

 	/* Local descriptor table register. */
 	kvm_get_segment(vcpu, &sreg, VCPU_SREG_LDTR);
 	put_smstate(u16, buf, 0x7e70, sreg.selector);
 	put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&sreg) >> 8);
 	put_smstate(u32, buf, 0x7e74, sreg.limit);
 	put_smstate(u64, buf, 0x7e78, sreg.base);

 	/* GDT. */
 	kvm_x86_ops->get_gdt(vcpu, &table);
 	put_smstate(u32, buf, 0x7e64, table.size);
 	put_smstate(u64, buf, 0x7e68, table.address);

 	/* The six segment registers with indices 0..5. */
 	for (idx = 0; idx < 6; idx++)
 		enter_smm_save_seg_64(vcpu, buf, idx);
 }
 #endif
 7782
 7783static void enter_smm(struct kvm_vcpu *vcpu)
 7784{
 7785	struct kvm_segment cs, ds;
 7786	struct desc_ptr dt;
 7787	char buf[512];
 7788	u32 cr0;
 7789
 7790	trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
 7791	memset(buf, 0, 512);
 7792#ifdef CONFIG_X86_64
 7793	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
 7794		enter_smm_save_state_64(vcpu, buf);
 7795	else
 7796#endif
 7797		enter_smm_save_state_32(vcpu, buf);
 7798
 7799	/*
 7800	 * Give pre_enter_smm() a chance to make ISA-specific changes to the
 7801	 * vCPU state (e.g. leave guest mode) after we've saved the state into
 7802	 * the SMM state-save area.
 7803	 */
 7804	kvm_x86_ops->pre_enter_smm(vcpu, buf);
 7805
 7806	vcpu->arch.hflags |= HF_SMM_MASK;
 7807	kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
 7808
 7809	if (kvm_x86_ops->get_nmi_mask(vcpu))
 7810		vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
 7811	else
 7812		kvm_x86_ops->set_nmi_mask(vcpu, true);
 7813
 7814	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
 7815	kvm_rip_write(vcpu, 0x8000);
 7816
 7817	cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
 7818	kvm_x86_ops->set_cr0(vcpu, cr0);
 7819	vcpu->arch.cr0 = cr0;
 7820
 7821	kvm_x86_ops->set_cr4(vcpu, 0);
 7822
 7823	/* Undocumented: IDT limit is set to zero on entry to SMM.  */
 7824	dt.address = dt.size = 0;
 7825	kvm_x86_ops->set_idt(vcpu, &dt);
 7826
 7827	__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
 7828
 7829	cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
 7830	cs.base = vcpu->arch.smbase;
 7831
 7832	ds.selector = 0;
 7833	ds.base = 0;
 7834
 7835	cs.limit    = ds.limit = 0xffffffff;
 7836	cs.type     = ds.type = 0x3;
 7837	cs.dpl      = ds.dpl = 0;
 7838	cs.db       = ds.db = 0;
 7839	cs.s        = ds.s = 1;
 7840	cs.l        = ds.l = 0;
 7841	cs.g        = ds.g = 1;
 7842	cs.avl      = ds.avl = 0;
 7843	cs.present  = ds.present = 1;
 7844	cs.unusable = ds.unusable = 0;
 7845	cs.padding  = ds.padding = 0;
 7846
 7847	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
 7848	kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
 7849	kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
 7850	kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
 7851	kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
 7852	kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
 7853
 7854#ifdef CONFIG_X86_64
 7855	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
 7856		kvm_x86_ops->set_efer(vcpu, 0);
 7857#endif
 7858
 7859	kvm_update_cpuid(vcpu);
 7860	kvm_mmu_reset_context(vcpu);
 7861}
 7862
 7863static void process_smi(struct kvm_vcpu *vcpu)
 7864{
 7865	vcpu->arch.smi_pending = true;
 7866	kvm_make_request(KVM_REQ_EVENT, vcpu);
 7867}
 
 
 
 
 
 
 
 
 7868
 7869void kvm_make_scan_ioapic_request(struct kvm *kvm)
 7870{
 7871	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
 7872}
 7873
 7874static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 7875{
 7876	if (!kvm_apic_present(vcpu))
 7877		return;
 7878
 7879	bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
 7880
 7881	if (irqchip_split(vcpu->kvm))
 7882		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
 7883	else {
 7884		if (vcpu->arch.apicv_active)
 7885			kvm_x86_ops->sync_pir_to_irr(vcpu);
 7886		if (ioapic_in_kernel(vcpu->kvm))
 7887			kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
 7888	}
 7889
 7890	if (is_guest_mode(vcpu))
 7891		vcpu->arch.load_eoi_exitmap_pending = true;
 7892	else
 7893		kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
 7894}
 7895
 7896static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 7897{
 7898	u64 eoi_exit_bitmap[4];
 7899
 7900	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
 7901		return;
 
 7902
 7903	bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
 7904		  vcpu_to_synic(vcpu)->vec_bitmap, 256);
 7905	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 
 
 
 
 
 
 
 
 
 
 
 7906}
 7907
 7908int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 7909		unsigned long start, unsigned long end,
 7910		bool blockable)
 7911{
 7912	unsigned long apic_address;
 7913
 7914	/*
 7915	 * The physical address of apic access page is stored in the VMCS.
 7916	 * Update it when it becomes invalid.
 7917	 */
 7918	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 7919	if (start <= apic_address && apic_address < end)
 7920		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
 7921
 7922	return 0;
 7923}
 7924
 7925void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 7926{
 7927	struct page *page = NULL;
 7928
 7929	if (!lapic_in_kernel(vcpu))
 7930		return;
 7931
 7932	if (!kvm_x86_ops->set_apic_access_page_addr)
 7933		return;
 7934
 7935	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 7936	if (is_error_page(page))
 7937		return;
 7938	kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
 7939
 7940	/*
 7941	 * Do not pin apic access page in memory, the MMU notifier
 7942	 * will call us again if it is migrated or swapped out.
 7943	 */
 7944	put_page(page);
 7945}
 7946EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
 7947
 7948void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
 7949{
 7950	smp_send_reschedule(vcpu->cpu);
 7951}
 7952EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
 7953
 7954/*
 7955 * Returns 1 to let vcpu_run() continue the guest execution loop without
 7956 * exiting to the userspace.  Otherwise, the value will be returned to the
 7957 * userspace.
      *
      * Overall flow: service the pending request bits, inject pending events,
      * reload the MMU, switch to IN_GUEST_MODE with interrupts and preemption
      * disabled, run the guest through kvm_x86_ops->run(), then undo the
      * switch and hand the exit to kvm_x86_ops->handle_exit().  Bail-outs
      * taken before the guest was entered leave through the "out" label, or
      * through "cancel_injection" once event injection may have been set up.
 7958 */
 7959static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 7960{
 7961	int r;
 7962	bool req_int_win =
 7963		dm_request_for_irq_injection(vcpu) &&
 7964		kvm_cpu_accept_dm_intr(vcpu);
 7965
 7966	bool req_immediate_exit = false;
 7967
     	/*
     	 * Service the request bits.  kvm_check_request() is used as a
     	 * test for each bit; several handlers end the function early with
     	 * r already set (0 together with an exit_reason, or 1).
     	 */
 7968	if (kvm_request_pending(vcpu)) {
 7969		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
 7970			if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
 7971				r = 0;
 7972				goto out;
 7973			}
 7974		}
 7975		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 7976			kvm_mmu_unload(vcpu);
 7977		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 7978			__kvm_migrate_timers(vcpu);
 7979		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
 7980			kvm_gen_update_masterclock(vcpu->kvm);
 7981		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
 7982			kvm_gen_kvmclock_update(vcpu);
 7983		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 7984			r = kvm_guest_time_update(vcpu);
 7985			if (unlikely(r))
 7986				goto out;
 7987		}
 7988		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 7989			kvm_mmu_sync_roots(vcpu);
 7990		if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
 7991			kvm_mmu_load_cr3(vcpu);
 7992		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 7993			kvm_vcpu_flush_tlb(vcpu, true);
 7994		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
 7995			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
 7996			r = 0;
 7997			goto out;
 7998		}
 7999		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
 8000			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
 8001			vcpu->mmio_needed = 0;
 8002			r = 0;
 8003			goto out;
 8004		}
 8005		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
 8006			/* Page is swapped out. Do synthetic halt */
 8007			vcpu->arch.apf.halted = true;
 8008			r = 1;
 8009			goto out;
 8010		}
 8011		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 8012			record_steal_time(vcpu);
 8013		if (kvm_check_request(KVM_REQ_SMI, vcpu))
 8014			process_smi(vcpu);
 8015		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 8016			process_nmi(vcpu);
 8017		if (kvm_check_request(KVM_REQ_PMU, vcpu))
 8018			kvm_pmu_handle_event(vcpu);
 8019		if (kvm_check_request(KVM_REQ_PMI, vcpu))
 8020			kvm_pmu_deliver_pmi(vcpu);
 8021		if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
 8022			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
 8023			if (test_bit(vcpu->arch.pending_ioapic_eoi,
 8024				     vcpu->arch.ioapic_handled_vectors)) {
 8025				vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
 8026				vcpu->run->eoi.vector =
 8027						vcpu->arch.pending_ioapic_eoi;
 8028				r = 0;
 8029				goto out;
 8030			}
 8031		}
 8032		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
 8033			vcpu_scan_ioapic(vcpu);
 8034		if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
 8035			vcpu_load_eoi_exitmap(vcpu);
 8036		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
 8037			kvm_vcpu_reload_apic_access_page(vcpu);
 8038		if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
 8039			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 8040			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
 8041			r = 0;
 8042			goto out;
 8043		}
 8044		if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
 8045			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 8046			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
 8047			r = 0;
 8048			goto out;
 8049		}
 8050		if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
 8051			vcpu->run->exit_reason = KVM_EXIT_HYPERV;
 8052			vcpu->run->hyperv = vcpu->arch.hyperv.exit;
 8053			r = 0;
 8054			goto out;
 8055		}
 8056
 8057		/*
 8058		 * KVM_REQ_HV_STIMER has to be processed after
 8059		 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
 8060		 * depend on the guest clock being up-to-date
 8061		 */
 8062		if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
 8063			kvm_hv_process_stimers(vcpu);
 8064	}
 8065
     	/*
     	 * Event injection.  Runs when KVM_REQ_EVENT was raised or when
     	 * req_int_win (computed at the top) is set.  A non-zero result from
     	 * inject_pending_event() turns into a request for an immediate exit
     	 * after entry; otherwise the SMI/NMI/IRQ windows are opened as needed.
     	 */
 8066	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
 8067		++vcpu->stat.req_event;
 8068		kvm_apic_accept_events(vcpu);
 8069		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 8070			r = 1;
 8071			goto out;
 8072		}
 8073
 8074		if (inject_pending_event(vcpu, req_int_win) != 0)
 8075			req_immediate_exit = true;
 8076		else {
 8077			/* Enable SMI/NMI/IRQ window open exits if needed.
 8078			 *
 8079			 * SMIs have three cases:
 8080			 * 1) They can be nested, and then there is nothing to
 8081			 *    do here because RSM will cause a vmexit anyway.
 8082			 * 2) There is an ISA-specific reason why SMI cannot be
 8083			 *    injected, and the moment when this changes can be
 8084			 *    intercepted.
 8085			 * 3) Or the SMI can be pending because
 8086			 *    inject_pending_event has completed the injection
 8087			 *    of an IRQ or NMI from the previous vmexit, and
 8088			 *    then we request an immediate exit to inject the
 8089			 *    SMI.
 8090			 */
 8091			if (vcpu->arch.smi_pending && !is_smm(vcpu))
 8092				if (!kvm_x86_ops->enable_smi_window(vcpu))
 8093					req_immediate_exit = true;
 8094			if (vcpu->arch.nmi_pending)
 8095				kvm_x86_ops->enable_nmi_window(vcpu);
 8096			if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
 8097				kvm_x86_ops->enable_irq_window(vcpu);
 8098			WARN_ON(vcpu->arch.exception.pending);
 8099		}
 8100
 8101		if (kvm_lapic_enabled(vcpu)) {
 8102			update_cr8_intercept(vcpu);
 8103			kvm_lapic_sync_to_vapic(vcpu);
 8104		}
 8105	}
 8106
     	/* From here on a failure has to go through cancel_injection. */
 8107	r = kvm_mmu_reload(vcpu);
 8108	if (unlikely(r)) {
 8109		goto cancel_injection;
 8110	}
 8111
 8112	preempt_disable();
 8113
 8114	kvm_x86_ops->prepare_guest_switch(vcpu);
 8115
 8116	/*
 8117	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
 8118	 * IPI are then delayed after guest entry, which ensures that they
 8119	 * result in virtual interrupt delivery.
 8120	 */
 8121	local_irq_disable();
 8122	vcpu->mode = IN_GUEST_MODE;
 8123
 8124	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 8125
 8126	/*
 8127	 * 1) We should set ->mode before checking ->requests.  Please see
 8128	 * the comment in kvm_vcpu_exiting_guest_mode().
 8129	 *
 8130	 * 2) For APICv, we should set ->mode before checking PID.ON. This
 8131	 * pairs with the memory barrier implicit in pi_test_and_set_on
 8132	 * (see vmx_deliver_posted_interrupt).
 8133	 *
 8134	 * 3) This also orders the write to mode from any reads to the page
 8135	 * tables done while the VCPU is running.  Please see the comment
 8136	 * in kvm_flush_remote_tlbs.
 8137	 */
 8138	smp_mb__after_srcu_read_unlock();
 8139
 8140	/*
 8141	 * This handles the case where a posted interrupt was
 8142	 * notified with kvm_vcpu_kick.
 8143	 */
 8144	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
 8145		kvm_x86_ops->sync_pir_to_irr(vcpu);
 8146
     	/*
     	 * Last check before entry: if an exit was requested, a request or
     	 * signal is pending, or a reschedule is needed, undo the mode, IRQ,
     	 * preemption and SRCU changes made above and return 1.
     	 */
 8147	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
 8148	    || need_resched() || signal_pending(current)) {
 8149		vcpu->mode = OUTSIDE_GUEST_MODE;
 8150		smp_wmb();
 8151		local_irq_enable();
 8152		preempt_enable();
 8153		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 8154		r = 1;
 8155		goto cancel_injection;
 8156	}
 8157
     	/*
     	 * Re-raise KVM_REQ_EVENT so that event injection runs again on the
     	 * next pass, and invoke the vendor request_immediate_exit() hook.
     	 */
 8158	if (req_immediate_exit) {
 8159		kvm_make_request(KVM_REQ_EVENT, vcpu);
 8160		kvm_x86_ops->request_immediate_exit(vcpu);
 8161	}
 8162
 8163	trace_kvm_entry(vcpu->vcpu_id);
 8164	guest_enter_irqoff();
 8165
 8166	/* The preempt notifier should have taken care of the FPU already.  */
 8167	WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD));
 8168
     	/* Load the effective guest debug registers; DR7 is zeroed first. */
 8169	if (unlikely(vcpu->arch.switch_db_regs)) {
 8170		set_debugreg(0, 7);
 8171		set_debugreg(vcpu->arch.eff_db[0], 0);
 8172		set_debugreg(vcpu->arch.eff_db[1], 1);
 8173		set_debugreg(vcpu->arch.eff_db[2], 2);
 8174		set_debugreg(vcpu->arch.eff_db[3], 3);
 8175		set_debugreg(vcpu->arch.dr6, 6);
 8176		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
 8177	}
 8178
 8179	kvm_x86_ops->run(vcpu);
 8180
 8181	/*
 8182	 * Do this here before restoring debug registers on the host.  And
 8183	 * since we do this before handling the vmexit, a DR access vmexit
 8184	 * can (a) read the correct value of the debug registers, (b) set
 8185	 * KVM_DEBUGREG_WONT_EXIT again.
 8186	 */
 8187	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
 8188		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
 8189		kvm_x86_ops->sync_dirty_debug_regs(vcpu);
 8190		kvm_update_dr0123(vcpu);
 8191		kvm_update_dr6(vcpu);
 8192		kvm_update_dr7(vcpu);
 8193		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
 8194	}
 8195
 8196	/*
 8197	 * If the guest has used debug registers, at least dr7
 8198	 * will be disabled while returning to the host.
 8199	 * If we don't have active breakpoints in the host, we don't
 8200	 * care about the messed up debug address registers. But if
 8201	 * we have some of them active, restore the old state.
 8202	 */
 8203	if (hw_breakpoint_active())
 8204		hw_breakpoint_restore();
 8205
 8206	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
 8207
 8208	vcpu->mode = OUTSIDE_GUEST_MODE;
 8209	smp_wmb();
 8210
 8211	kvm_x86_ops->handle_exit_irqoff(vcpu);
 8212
 8213	/*
 8214	 * Consume any pending interrupts, including the possible source of
 8215	 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
 8216	 * An instruction is required after local_irq_enable() to fully unblock
 8217	 * interrupts on processors that implement an interrupt shadow, the
 8218	 * stat.exits increment will do nicely.
 8219	 */
 8220	kvm_before_interrupt(vcpu);
 8221	local_irq_enable();
 8222	++vcpu->stat.exits;
 8223	local_irq_disable();
 8224	kvm_after_interrupt(vcpu);
 8225
 8226	guest_exit_irqoff();
     	/*
     	 * Trace a recorded lapic timer advance delta once, then reset it
     	 * to the S64_MIN "nothing recorded" marker.
     	 */
 8227	if (lapic_in_kernel(vcpu)) {
 8228		s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
 8229		if (delta != S64_MIN) {
 8230			trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
 8231			vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
 8232		}
 8233	}
 8234
 8235	local_irq_enable();
 8236	preempt_enable();
 8237
 8238	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 8239
 8240	/*
 8241	 * Profile KVM exit RIPs:
 8242	 */
 8243	if (unlikely(prof_on == KVM_PROFILING)) {
 8244		unsigned long rip = kvm_rip_read(vcpu);
 8245		profile_hit(KVM_PROFILING, (void *)rip);
 8246	}
 8247
 8248	if (unlikely(vcpu->arch.tsc_always_catchup))
 8249		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 8250
 8251	if (vcpu->arch.apic_attention)
 8252		kvm_lapic_sync_from_vapic(vcpu);
 8253
 8254	vcpu->arch.gpa_available = false;
 8255	r = kvm_x86_ops->handle_exit(vcpu);
 8256	return r;
 8257
     /*
      * Taken when the guest is not entered after event injection may already
      * have been set up: let the vendor code cancel the injection and sync
      * the lapic state back from the vapic if it needs attention.
      */
 8258cancel_injection:
 8259	kvm_x86_ops->cancel_injection(vcpu);
 8260	if (unlikely(vcpu->arch.apic_attention))
 8261		kvm_lapic_sync_from_vapic(vcpu);
 8262out:
 8263	return r;
 8264}
 8265
 8266static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 8267{
 8268	if (!kvm_arch_vcpu_runnable(vcpu) &&
 8269	    (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
 8270		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 8271		kvm_vcpu_block(vcpu);
 8272		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 8273
 8274		if (kvm_x86_ops->post_block)
 8275			kvm_x86_ops->post_block(vcpu);
 8276
 8277		if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
 8278			return 1;
 8279	}
 8280
 8281	kvm_apic_accept_events(vcpu);
 8282	switch(vcpu->arch.mp_state) {
 8283	case KVM_MP_STATE_HALTED:
 8284		vcpu->arch.pv.pv_unhalted = false;
 8285		vcpu->arch.mp_state =
 8286			KVM_MP_STATE_RUNNABLE;
 8287		/* fall through */
 8288	case KVM_MP_STATE_RUNNABLE:
 8289		vcpu->arch.apf.halted = false;
 8290		break;
 8291	case KVM_MP_STATE_INIT_RECEIVED:
 8292		break;
 8293	default:
 8294		return -EINTR;
 8295		break;
 8296	}
 8297	return 1;
 8298}
 8299
 8300static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 8301{
 8302	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
 8303		kvm_x86_ops->check_nested_events(vcpu, false);
 8304
 8305	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 8306		!vcpu->arch.apf.halted);
 8307}
 8308
 8309static int vcpu_run(struct kvm_vcpu *vcpu)
 8310{
 8311	int r;
 8312	struct kvm *kvm = vcpu->kvm;
 8313
 
 
 
 
 
 
 
 
 
 
 8314	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 8315	vcpu->arch.l1tf_flush_l1d = true;
 8316
 8317	for (;;) {
 8318		if (kvm_vcpu_running(vcpu)) {
 
 
 8319			r = vcpu_enter_guest(vcpu);
 8320		} else {
 8321			r = vcpu_block(kvm, vcpu);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 8322		}
 8323
 8324		if (r <= 0)
 8325			break;
 8326
 8327		kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
 8328		if (kvm_cpu_has_pending_timer(vcpu))
 8329			kvm_inject_pending_timer_irqs(vcpu);
 8330
 8331		if (dm_request_for_irq_injection(vcpu) &&
 8332			kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
 8333			r = 0;
 8334			vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 8335			++vcpu->stat.request_irq_exits;
 8336			break;
 8337		}
 8338
 8339		kvm_check_async_pf_completion(vcpu);
 8340
 8341		if (signal_pending(current)) {
 8342			r = -EINTR;
 8343			vcpu->run->exit_reason = KVM_EXIT_INTR;
 8344			++vcpu->stat.signal_exits;
 8345			break;
 8346		}
 8347		if (need_resched()) {
 8348			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 8349			cond_resched();
 8350			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 8351		}
 8352	}
 8353
 8354	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 8355
 8356	return r;
 8357}
 8358
 8359static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
 8360{
 8361	int r;
 8362
 8363	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 8364	r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 8365	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 8366	return r;
 8367}
 8368
 8369static int complete_emulated_pio(struct kvm_vcpu *vcpu)
 8370{
 8371	BUG_ON(!vcpu->arch.pio.count);
 8372
 8373	return complete_emulated_io(vcpu);
 8374}
 8375
 8376/*
 8377 * Implements the following, as a state machine:
 8378 *
 8379 * read:
 8380 *   for each fragment
 8381 *     for each mmio piece in the fragment
 8382 *       write gpa, len
 8383 *       exit
 8384 *       copy data
 8385 *   execute insn
 8386 *
 8387 * write:
 8388 *   for each fragment
 8389 *     for each mmio piece in the fragment
 8390 *       write gpa, len
 8391 *       copy data
 8392 *       exit
 8393 */
 8394static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 8395{
 8396	struct kvm_run *run = vcpu->run;
 8397	struct kvm_mmio_fragment *frag;
 8398	unsigned len;
 8399
 8400	BUG_ON(!vcpu->mmio_needed);
 8401
 8402	/* Complete previous fragment */
 8403	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
 8404	len = min(8u, frag->len);
 8405	if (!vcpu->mmio_is_write)
 8406		memcpy(frag->data, run->mmio.data, len);
 8407
 8408	if (frag->len <= 8) {
 8409		/* Switch to the next fragment. */
 8410		frag++;
 8411		vcpu->mmio_cur_fragment++;
 8412	} else {
 8413		/* Go forward to the next mmio piece. */
 8414		frag->data += len;
 8415		frag->gpa += len;
 8416		frag->len -= len;
 8417	}
 8418
 8419	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
 8420		vcpu->mmio_needed = 0;
 8421
 8422		/* FIXME: return into emulator if single-stepping.  */
 
 
 
 
 
 
 
 
 
 
 
 8423		if (vcpu->mmio_is_write)
 8424			return 1;
 8425		vcpu->mmio_read_completed = 1;
 8426		return complete_emulated_io(vcpu);
 8427	}
 8428
 8429	run->exit_reason = KVM_EXIT_MMIO;
 8430	run->mmio.phys_addr = frag->gpa;
 8431	if (vcpu->mmio_is_write)
 8432		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
 8433	run->mmio.len = min(8u, frag->len);
 8434	run->mmio.is_write = vcpu->mmio_is_write;
 8435	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
 8436	return 0;
 8437}
 8438
 8439/* Swap (qemu) user FPU context for the guest FPU context. */
 8440static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 8441{
 8442	fpregs_lock();
 8443
 8444	copy_fpregs_to_fpstate(vcpu->arch.user_fpu);
 8445	/* PKRU is separately restored in kvm_x86_ops->run.  */
 8446	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
 8447				~XFEATURE_MASK_PKRU);
 8448
 8449	fpregs_mark_activate();
 8450	fpregs_unlock();
 8451
 8452	trace_kvm_fpu(1);
 8453}
 8454
 8455/* When vcpu_run ends, restore user space FPU context. */
 8456static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 8457{
 8458	fpregs_lock();
 8459
 8460	copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
 8461	copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
 8462
 8463	fpregs_mark_activate();
 8464	fpregs_unlock();
 8465
 8466	++vcpu->stat.fpu_reload;
 8467	trace_kvm_fpu(0);
 8468}
 8469
 /*
  * Arch entry point for running a vCPU (NOTE(review): presumably the
  * KVM_RUN ioctl, going by the name - confirm against the caller).
  *
  * Loads the vCPU and the guest FPU state, validates the register-sync
  * fields of the shared kvm_run area, finishes any I/O that userspace was
  * asked to complete, and then enters vcpu_run() unless an immediate exit
  * was requested.  Every path leaves through "out", which undoes the setup.
  *
  * Returns the result of vcpu_run() or of the pending completion callback,
  * or a negative errno: -EINTR (immediate exit or signal), -EAGAIN (vCPU
  * was still uninitialized), -EINVAL (bad kvm_valid_regs or cr8), or the
  * error returned by sync_regs().
  */
 8470int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 8471{
 8472	int r;
 8473
 8474	vcpu_load(vcpu);
 8475	kvm_sigset_activate(vcpu);
 8476	kvm_load_guest_fpu(vcpu);
 8477
     	/*
     	 * An uninitialized vCPU is not run: block, accept APIC events, and
     	 * return -EAGAIN (or -EINTR when a signal is pending).
     	 */
 8478	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
 8479		if (kvm_run->immediate_exit) {
 8480			r = -EINTR;
 8481			goto out;
 8482		}
 8483		kvm_vcpu_block(vcpu);
 8484		kvm_apic_accept_events(vcpu);
 8485		kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 8486		r = -EAGAIN;
 8487		if (signal_pending(current)) {
 8488			r = -EINTR;
 8489			vcpu->run->exit_reason = KVM_EXIT_INTR;
 8490			++vcpu->stat.signal_exits;
 8491		}
 8492		goto out;
 8493	}
 8494
     	/* Reject kvm_valid_regs bits outside KVM_SYNC_X86_VALID_FIELDS. */
 8495	if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
 8496		r = -EINVAL;
 8497		goto out;
 8498	}
 8499
 8500	if (vcpu->run->kvm_dirty_regs) {
 8501		r = sync_regs(vcpu);
 8502		if (r != 0)
 8503			goto out;
 8504	}
 8505
 8506	/* re-sync apic's tpr */
 8507	if (!lapic_in_kernel(vcpu)) {
 8508		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
 8509			r = -EINVAL;
 8510			goto out;
 8511		}
 8512	}
 8513
     	/*
     	 * Run the completion callback left by a previous exit.  The pointer
     	 * is cleared before the call, so the callback may install itself
     	 * again.  A result <= 0 is returned to the caller as is.
     	 */
 8514	if (unlikely(vcpu->arch.complete_userspace_io)) {
 8515		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
 8516		vcpu->arch.complete_userspace_io = NULL;
 8517		r = cui(vcpu);
 8518		if (r <= 0)
 8519			goto out;
 8520	} else
 8521		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 8522
 8523	if (kvm_run->immediate_exit)
 8524		r = -EINTR;
 8525	else
 8526		r = vcpu_run(vcpu);
 8527
     /* Common exit: undo the setup done at the top, in reverse order. */
 8528out:
 8529	kvm_put_guest_fpu(vcpu);
 8530	if (vcpu->run->kvm_valid_regs)
 8531		store_regs(vcpu);
 8532	post_kvm_run_save(vcpu);
 8533	kvm_sigset_deactivate(vcpu);
 8534
 8535	vcpu_put(vcpu);
 8536	return r;
 8537}
 8538
 8539static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 8540{
 8541	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
 8542		/*
 8543		 * We are here if userspace calls get_regs() in the middle of
 8544		 * instruction emulation. Registers state needs to be copied
 8545		 * back from emulation context to vcpu. Userspace shouldn't do
 8546		 * that usually, but some bad designed PV devices (vmware
 8547		 * backdoor interface) need this to work
 8548		 */
 8549		emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
 
 8550		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 8551	}
 8552	regs->rax = kvm_rax_read(vcpu);
 8553	regs->rbx = kvm_rbx_read(vcpu);
 8554	regs->rcx = kvm_rcx_read(vcpu);
 8555	regs->rdx = kvm_rdx_read(vcpu);
 8556	regs->rsi = kvm_rsi_read(vcpu);
 8557	regs->rdi = kvm_rdi_read(vcpu);
 8558	regs->rsp = kvm_rsp_read(vcpu);
 8559	regs->rbp = kvm_rbp_read(vcpu);
 8560#ifdef CONFIG_X86_64
 8561	regs->r8 = kvm_r8_read(vcpu);
 8562	regs->r9 = kvm_r9_read(vcpu);
 8563	regs->r10 = kvm_r10_read(vcpu);
 8564	regs->r11 = kvm_r11_read(vcpu);
 8565	regs->r12 = kvm_r12_read(vcpu);
 8566	regs->r13 = kvm_r13_read(vcpu);
 8567	regs->r14 = kvm_r14_read(vcpu);
 8568	regs->r15 = kvm_r15_read(vcpu);
 8569#endif
 8570
 8571	regs->rip = kvm_rip_read(vcpu);
 8572	regs->rflags = kvm_get_rflags(vcpu);
 8573}
 8574
 8575int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 8576{
 8577	vcpu_load(vcpu);
 8578	__get_regs(vcpu, regs);
 8579	vcpu_put(vcpu);
 8580	return 0;
 8581}
 8582
 8583static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 8584{
 8585	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
 8586	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 8587
 8588	kvm_rax_write(vcpu, regs->rax);
 8589	kvm_rbx_write(vcpu, regs->rbx);
 8590	kvm_rcx_write(vcpu, regs->rcx);
 8591	kvm_rdx_write(vcpu, regs->rdx);
 8592	kvm_rsi_write(vcpu, regs->rsi);
 8593	kvm_rdi_write(vcpu, regs->rdi);
 8594	kvm_rsp_write(vcpu, regs->rsp);
 8595	kvm_rbp_write(vcpu, regs->rbp);
 8596#ifdef CONFIG_X86_64
 8597	kvm_r8_write(vcpu, regs->r8);
 8598	kvm_r9_write(vcpu, regs->r9);
 8599	kvm_r10_write(vcpu, regs->r10);
 8600	kvm_r11_write(vcpu, regs->r11);
 8601	kvm_r12_write(vcpu, regs->r12);
 8602	kvm_r13_write(vcpu, regs->r13);
 8603	kvm_r14_write(vcpu, regs->r14);
 8604	kvm_r15_write(vcpu, regs->r15);
 8605#endif
 8606
 8607	kvm_rip_write(vcpu, regs->rip);
 8608	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
 8609
 8610	vcpu->arch.exception.pending = false;
 8611
 8612	kvm_make_request(KVM_REQ_EVENT, vcpu);
 8613}
 8614
 8615int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 8616{
 8617	vcpu_load(vcpu);
 8618	__set_regs(vcpu, regs);
 8619	vcpu_put(vcpu);
 8620	return 0;
 8621}
 8622
 8623void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 8624{
 8625	struct kvm_segment cs;
 8626
 8627	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
 8628	*db = cs.db;
 8629	*l = cs.l;
 8630}
 8631EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
 8632
 8633static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 
 8634{
 8635	struct desc_ptr dt;
 8636
 8637	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
 8638	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
 8639	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
 8640	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
 8641	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
 8642	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
 8643
 8644	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
 8645	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 8646
 8647	kvm_x86_ops->get_idt(vcpu, &dt);
 8648	sregs->idt.limit = dt.size;
 8649	sregs->idt.base = dt.address;
 8650	kvm_x86_ops->get_gdt(vcpu, &dt);
 8651	sregs->gdt.limit = dt.size;
 8652	sregs->gdt.base = dt.address;
 8653
 8654	sregs->cr0 = kvm_read_cr0(vcpu);
 8655	sregs->cr2 = vcpu->arch.cr2;
 8656	sregs->cr3 = kvm_read_cr3(vcpu);
 8657	sregs->cr4 = kvm_read_cr4(vcpu);
 8658	sregs->cr8 = kvm_get_cr8(vcpu);
 8659	sregs->efer = vcpu->arch.efer;
 8660	sregs->apic_base = kvm_get_apic_base(vcpu);
 8661
 8662	memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
 8663
 8664	if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
 8665		set_bit(vcpu->arch.interrupt.nr,
 8666			(unsigned long *)sregs->interrupt_bitmap);
 8667}
 8668
 8669int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 8670				  struct kvm_sregs *sregs)
 8671{
 8672	vcpu_load(vcpu);
 8673	__get_sregs(vcpu, sregs);
 8674	vcpu_put(vcpu);
 8675	return 0;
 8676}
 8677
 8678int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 8679				    struct kvm_mp_state *mp_state)
 8680{
 8681	vcpu_load(vcpu);
 8682
 8683	kvm_apic_accept_events(vcpu);
 8684	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
 8685					vcpu->arch.pv.pv_unhalted)
 8686		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
 8687	else
 8688		mp_state->mp_state = vcpu->arch.mp_state;
 8689
 8690	vcpu_put(vcpu);
 8691	return 0;
 8692}
 8693
 8694int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 8695				    struct kvm_mp_state *mp_state)
 8696{
 8697	int ret = -EINVAL;
 8698
 8699	vcpu_load(vcpu);
 8700
 8701	if (!lapic_in_kernel(vcpu) &&
 8702	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
 8703		goto out;
 8704
 8705	/* INITs are latched while in SMM */
 8706	if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
 8707	    (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
 8708	     mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
 8709		goto out;
 8710
 8711	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
 8712		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
 8713		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
 8714	} else
 8715		vcpu->arch.mp_state = mp_state->mp_state;
 8716	kvm_make_request(KVM_REQ_EVENT, vcpu);
 8717
 8718	ret = 0;
 8719out:
 8720	vcpu_put(vcpu);
 8721	return ret;
 8722}
 8723
 8724int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 8725		    int reason, bool has_error_code, u32 error_code)
 8726{
 8727	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 8728	int ret;
 8729
 8730	init_emulate_ctxt(vcpu);
 8731
 8732	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
 8733				   has_error_code, error_code);
 8734	if (ret) {
 8735		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 8736		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 8737		vcpu->run->internal.ndata = 0;
 8738		return 0;
 8739	}
 8740
 
 
 
 
 8741	kvm_rip_write(vcpu, ctxt->eip);
 8742	kvm_set_rflags(vcpu, ctxt->eflags);
 8743	kvm_make_request(KVM_REQ_EVENT, vcpu);
 8744	return 1;
 8745}
 8746EXPORT_SYMBOL_GPL(kvm_task_switch);
 8747
 8748static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 8749{
 8750	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
 8751		/*
 8752		 * When EFER.LME and CR0.PG are set, the processor is in
 8753		 * 64-bit mode (though maybe in a 32-bit code segment).
 8754		 * CR4.PAE and EFER.LMA must be set.
 8755		 */
 8756		if (!(sregs->cr4 & X86_CR4_PAE)
 8757		    || !(sregs->efer & EFER_LMA))
 8758			return -EINVAL;
 8759	} else {
 8760		/*
 8761		 * Not in 64-bit mode: EFER.LMA is clear and the code
 8762		 * segment cannot be 64-bit.
 8763		 */
 8764		if (sregs->efer & EFER_LMA || sregs->cs.l)
 8765			return -EINVAL;
 8766	}
 8767
 8768	return kvm_valid_cr4(vcpu, sregs->cr4);
 8769}
 8770
  /*
   * Load the special registers described by @sregs into @vcpu.
   *
   * @sregs is first validated with kvm_valid_sregs() and the APIC base is
   * applied with kvm_set_apic_base(); if either fails, -EINVAL is returned
   * before any of the remaining state below is written.  Returns 0 on
   * success.
   */
  8771static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
  8772{
  8773	struct msr_data apic_base_msr;
  8774	int mmu_reset_needed = 0;
  8775	int cpuid_update_needed = 0;
  8776	int pending_vec, max_bits, idx;
  8777	struct desc_ptr dt;
  8778	int ret = -EINVAL;
  8779
  8780	if (kvm_valid_sregs(vcpu, sregs))
  8781		goto out;
  8782
  8783	apic_base_msr.data = sregs->apic_base;
  8784	apic_base_msr.host_initiated = true;
  8785	if (kvm_set_apic_base(vcpu, &apic_base_msr))
  8786		goto out;
  8787
  	/* Descriptor tables: the same desc_ptr is reused for IDT and GDT. */
  8788	dt.size = sregs->idt.limit;
  8789	dt.address = sregs->idt.base;
  8790	kvm_x86_ops->set_idt(vcpu, &dt);
  8791	dt.size = sregs->gdt.limit;
  8792	dt.address = sregs->gdt.base;
  8793	kvm_x86_ops->set_gdt(vcpu, &dt);
  8794
  	/*
  	 * From here on, mmu_reset_needed accumulates whether CR3, EFER, CR0
  	 * or CR4 differ from the current values, so that the MMU context is
  	 * reset at most once further down.
  	 */
  8795	vcpu->arch.cr2 = sregs->cr2;
  8796	mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
  8797	vcpu->arch.cr3 = sregs->cr3;
  8798	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
  8799
  	/* NOTE(review): the return value of kvm_set_cr8() is ignored here. */
  8800	kvm_set_cr8(vcpu, sregs->cr8);
  8801
  8802	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
  8803	kvm_x86_ops->set_efer(vcpu, sregs->efer);
  
  8804
  8805	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
  8806	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
  8807	vcpu->arch.cr0 = sregs->cr0;
  8808
  	/* A change of CR4.OSXSAVE or CR4.PKE additionally triggers kvm_update_cpuid(). */
  8809	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
  8810	cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
  8811				(X86_CR4_OSXSAVE | X86_CR4_PKE));
  8812	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
  8813	if (cpuid_update_needed)
  8814		kvm_update_cpuid(vcpu);
  8815
  	/* With PAE paging the PDPTRs are reloaded and an MMU reset is forced. */
  8816	idx = srcu_read_lock(&vcpu->kvm->srcu);
  8817	if (is_pae_paging(vcpu)) {
  8818		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
  8819		mmu_reset_needed = 1;
  8820	}
  8821	srcu_read_unlock(&vcpu->kvm->srcu, idx);
  8822
  8823	if (mmu_reset_needed)
  8824		kvm_mmu_reset_context(vcpu);
  8825
  	/* Re-queue the lowest vector set in the interrupt bitmap, if there is one. */
  8826	max_bits = KVM_NR_INTERRUPTS;
  8827	pending_vec = find_first_bit(
  8828		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
  8829	if (pending_vec < max_bits) {
  8830		kvm_queue_interrupt(vcpu, pending_vec, false);
  8831		pr_debug("Set back pending irq %d\n", pending_vec);
  8832	}
  8833
  8834	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
  8835	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
  8836	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
  8837	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
  8838	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
  8839	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
  8840
  8841	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
  8842	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
  8843
  8844	update_cr8_intercept(vcpu);
  8845
  8846	/* Older userspace won't unhalt the vcpu on reset. */
  8847	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
  8848	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
  8849	    !is_protmode(vcpu))
  8850		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  8851
  8852	kvm_make_request(KVM_REQ_EVENT, vcpu);
  8853
  8854	ret = 0;
  8855out:
  8856	return ret;
  8857}
 8858
 8859int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 8860				  struct kvm_sregs *sregs)
 8861{
 8862	int ret;
 8863
 8864	vcpu_load(vcpu);
 8865	ret = __set_sregs(vcpu, sregs);
 8866	vcpu_put(vcpu);
 8867	return ret;
 8868}
 8869
  /*
   * Apply the guest debugging configuration in @dbg to @vcpu.
   *
   * Optionally queues a #DB or #BP exception, records the debug control
   * flags, selects the effective hardware breakpoint registers and
   * refreshes DR7, RFLAGS and the breakpoint intercept.
   *
   * Returns 0 on success, or -EBUSY when an exception injection was
   * requested while another exception is already pending.
   */
  8870int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
  8871					struct kvm_guest_debug *dbg)
  8872{
  8873	unsigned long rflags;
  8874	int i, r;
  8875
  8876	vcpu_load(vcpu);
  8877
  	/* KVM_GUESTDBG_INJECT_DB takes precedence when both inject flags are set. */
  8878	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
  8879		r = -EBUSY;
  8880		if (vcpu->arch.exception.pending)
  8881			goto out;
  8882		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
  8883			kvm_queue_exception(vcpu, DB_VECTOR);
  8884		else
  8885			kvm_queue_exception(vcpu, BP_VECTOR);
  8886	}
  8887
  8888	/*
  8889	 * Read rflags as long as potentially injected trace flags are still
  8890	 * filtered out.
  8891	 */
  8892	rflags = kvm_get_rflags(vcpu);
  8893
  	/* Without KVM_GUESTDBG_ENABLE every other control flag is discarded. */
  8894	vcpu->guest_debug = dbg->control;
  8895	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
  8896		vcpu->guest_debug = 0;
  8897
  	/*
  	 * The effective breakpoint registers come from @dbg when hardware
  	 * breakpoints are requested, otherwise from the values in
  	 * vcpu->arch.db.
  	 */
  8898	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
  8899		for (i = 0; i < KVM_NR_DB_REGS; ++i)
  8900			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
  8901		vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
  
  8902	} else {
  8903		for (i = 0; i < KVM_NR_DB_REGS; i++)
  8904			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
  
  8905	}
  8906	kvm_update_dr7(vcpu);
  8907
  	/* The single-step RIP is recorded as RIP plus the CS segment base. */
  8908	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
  8909		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
  8910			get_segment_base(vcpu, VCPU_SREG_CS);
  8911
  8912	/*
  8913	 * Trigger an rflags update that will inject or remove the trace
  8914	 * flags.
  8915	 */
  8916	kvm_set_rflags(vcpu, rflags);
  8917
  8918	kvm_x86_ops->update_bp_intercept(vcpu);
  8919
  8920	r = 0;
  8921
  8922out:
  8923	vcpu_put(vcpu);
  8924	return r;
  8925}
 8926
 8927/*
 8928 * Translate a guest virtual address to a guest physical address.
 8929 */
 8930int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 8931				    struct kvm_translation *tr)
 8932{
 8933	unsigned long vaddr = tr->linear_address;
 8934	gpa_t gpa;
 8935	int idx;
 8936
 8937	vcpu_load(vcpu);
 8938
 8939	idx = srcu_read_lock(&vcpu->kvm->srcu);
 8940	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
 8941	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 8942	tr->physical_address = gpa;
 8943	tr->valid = gpa != UNMAPPED_GVA;
 8944	tr->writeable = 1;
 8945	tr->usermode = 0;
 8946
 8947	vcpu_put(vcpu);
 8948	return 0;
 8949}
 8950
 8951int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 8952{
 8953	struct fxregs_state *fxsave;
 
 8954
 8955	vcpu_load(vcpu);
 8956
 8957	fxsave = &vcpu->arch.guest_fpu->state.fxsave;
 8958	memcpy(fpu->fpr, fxsave->st_space, 128);
 8959	fpu->fcw = fxsave->cwd;
 8960	fpu->fsw = fxsave->swd;
 8961	fpu->ftwx = fxsave->twd;
 8962	fpu->last_opcode = fxsave->fop;
 8963	fpu->last_ip = fxsave->rip;
 8964	fpu->last_dp = fxsave->rdp;
 8965	memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
 8966
 8967	vcpu_put(vcpu);
 8968	return 0;
 8969}
 8970
 8971int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 8972{
 8973	struct fxregs_state *fxsave;
 8974
 8975	vcpu_load(vcpu);
 8976
 8977	fxsave = &vcpu->arch.guest_fpu->state.fxsave;
 8978
 8979	memcpy(fxsave->st_space, fpu->fpr, 128);
 8980	fxsave->cwd = fpu->fcw;
 8981	fxsave->swd = fpu->fsw;
 8982	fxsave->twd = fpu->ftwx;
 8983	fxsave->fop = fpu->last_opcode;
 8984	fxsave->rip = fpu->last_ip;
 8985	fxsave->rdp = fpu->last_dp;
 8986	memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
 8987
 8988	vcpu_put(vcpu);
 8989	return 0;
 8990}
 8991
 8992static void store_regs(struct kvm_vcpu *vcpu)
 8993{
 8994	BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
 
 
 
 
 
 
 8995
 8996	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
 8997		__get_regs(vcpu, &vcpu->run->s.regs.regs);
 
 
 8998
 8999	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
 9000		__get_sregs(vcpu, &vcpu->run->s.regs.sregs);
 9001
 9002	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
 9003		kvm_vcpu_ioctl_x86_get_vcpu_events(
 9004				vcpu, &vcpu->run->s.regs.events);
 9005}
 
 9006
 9007static int sync_regs(struct kvm_vcpu *vcpu)
 9008{
 9009	if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
 9010		return -EINVAL;
 9011
 9012	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
 9013		__set_regs(vcpu, &vcpu->run->s.regs.regs);
 9014		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
 9015	}
 9016	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
 9017		if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
 9018			return -EINVAL;
 9019		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
 9020	}
 9021	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
 9022		if (kvm_vcpu_ioctl_x86_set_vcpu_events(
 9023				vcpu, &vcpu->run->s.regs.events))
 9024			return -EINVAL;
 9025		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
 9026	}
 9027
 9028	return 0;
 
 
 
 
 
 
 
 
 
 9029}
 9030
 9031static void fx_init(struct kvm_vcpu *vcpu)
 9032{
 9033	fpstate_init(&vcpu->arch.guest_fpu->state);
 9034	if (boot_cpu_has(X86_FEATURE_XSAVES))
 9035		vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
 9036			host_xcr0 | XSTATE_COMPACTION_ENABLED;
 9037
 9038	/*
 9039	 * Ensure guest xcr0 is valid for loading
 9040	 */
 9041	vcpu->arch.xcr0 = XFEATURE_MASK_FP;
 9042
 9043	vcpu->arch.cr0 |= X86_CR0_ET;
 
 
 
 
 9044}
 9045
 9046void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 9047{
 9048	void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
 9049
 9050	kvmclock_reset(vcpu);
 9051
 
 
 9052	kvm_x86_ops->vcpu_free(vcpu);
 9053	free_cpumask_var(wbinvd_dirty_mask);
 9054}
 9055
 9056struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 9057						unsigned int id)
 9058{
 9059	struct kvm_vcpu *vcpu;
 9060
 9061	if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
 9062		printk_once(KERN_WARNING
 9063		"kvm: SMP vm created on host with unstable TSC; "
 9064		"guest TSC will not be reliable\n");
 9065
 9066	vcpu = kvm_x86_ops->vcpu_create(kvm, id);
 9067
 9068	return vcpu;
 9069}
 9070
 9071int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 9072{
 9073	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
 9074	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
 9075	kvm_vcpu_mtrr_init(vcpu);
 9076	vcpu_load(vcpu);
 9077	kvm_vcpu_reset(vcpu, false);
 9078	kvm_init_mmu(vcpu, false);
 9079	vcpu_put(vcpu);
 9080	return 0;
 9081}
 9082
 9083void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 9084{
 9085	struct msr_data msr;
 9086	struct kvm *kvm = vcpu->kvm;
 9087
 9088	kvm_hv_vcpu_postcreate(vcpu);
 9089
 9090	if (mutex_lock_killable(&vcpu->mutex))
 9091		return;
 9092	vcpu_load(vcpu);
 9093	msr.data = 0x0;
 9094	msr.index = MSR_IA32_TSC;
 9095	msr.host_initiated = true;
 9096	kvm_write_tsc(vcpu, &msr);
 9097	vcpu_put(vcpu);
 9098
 9099	/* poll control enabled by default */
 9100	vcpu->arch.msr_kvm_poll_control = 1;
 9101
 9102	mutex_unlock(&vcpu->mutex);
 9103
 9104	if (!kvmclock_periodic_sync)
 9105		return;
 9106
 9107	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 9108					KVMCLOCK_SYNC_PERIOD);
 9109}
 9110
 9111void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 9112{
 9113	vcpu->arch.apf.msr_val = 0;
 9114
 9115	vcpu_load(vcpu);
 9116	kvm_mmu_unload(vcpu);
 9117	vcpu_put(vcpu);
 9118
 
 9119	kvm_x86_ops->vcpu_free(vcpu);
 9120}
 9121
  /*
   * Reset the architectural state of @vcpu.
   *
   * @init_event selects the lighter variant: when it is true, the PMU,
   * SMBASE, the misc-features-enables MSR and XCR0 are left untouched (see
   * the !init_event block below).  The flag is also passed on to
   * kvm_lapic_reset() and to the vendor vcpu_reset() hook, which runs last.
   */
  9122void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
  9123{
  9124	kvm_lapic_reset(vcpu, init_event);
  9125
  9126	vcpu->arch.hflags = 0;
  9127
  	/* Drop every pending or injected SMI, NMI, interrupt and exception. */
  9128	vcpu->arch.smi_pending = 0;
  9129	vcpu->arch.smi_count = 0;
  9130	atomic_set(&vcpu->arch.nmi_queued, 0);
  9131	vcpu->arch.nmi_pending = 0;
  9132	vcpu->arch.nmi_injected = false;
  9133	kvm_clear_interrupt_queue(vcpu);
  9134	kvm_clear_exception_queue(vcpu);
  9135	vcpu->arch.exception.pending = false;
  9136
  
  	/* Debug registers: DR0-DR3 cleared, DR6 and DR7 set to their reset constants. */
  9137	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
  9138	kvm_update_dr0123(vcpu);
  9139	vcpu->arch.dr6 = DR6_INIT;
  9140	kvm_update_dr6(vcpu);
  9141	vcpu->arch.dr7 = DR7_FIXED_1;
  9142	kvm_update_dr7(vcpu);
  9143
  9144	vcpu->arch.cr2 = 0;
  9145
  9146	kvm_make_request(KVM_REQ_EVENT, vcpu);
  9147	vcpu->arch.apf.msr_val = 0;
  9148	vcpu->arch.st.msr_val = 0;
  9149
  9150	kvmclock_reset(vcpu);
  9151
  9152	kvm_clear_async_pf_completion_queue(vcpu);
  9153	kvm_async_pf_hash_reset(vcpu);
  9154	vcpu->arch.apf.halted = false;
  9155
  	/* Zero the MPX bounds registers and BNDCSR in the guest xsave area. */
  9156	if (kvm_mpx_supported()) {
  9157		void *mpx_state_buffer;
  9158
  9159		/*
  9160		 * The INIT path from kvm_apic_has_events() may be called with
  9161		 * the guest FPU loaded and does not let userspace fix the state.
  9162		 */
  9163		if (init_event)
  9164			kvm_put_guest_fpu(vcpu);
  9165		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
  9166					XFEATURE_BNDREGS);
  9167		if (mpx_state_buffer)
  9168			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
  9169		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
  9170					XFEATURE_BNDCSR);
  9171		if (mpx_state_buffer)
  9172			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
  9173		if (init_event)
  9174			kvm_load_guest_fpu(vcpu);
  9175	}
  9176
  	/* State that is only reset when this is not an INIT event. */
  9177	if (!init_event) {
  9178		kvm_pmu_reset(vcpu);
  9179		vcpu->arch.smbase = 0x30000;
  9180
  9181		vcpu->arch.msr_misc_features_enables = 0;
  9182
  9183		vcpu->arch.xcr0 = XFEATURE_MASK_FP;
  9184	}
  9185
  	/* Zero the register array and set every bit in the avail/dirty masks. */
  9186	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
  9187	vcpu->arch.regs_avail = ~0;
  9188	vcpu->arch.regs_dirty = ~0;
  9189
  9190	vcpu->arch.ia32_xss = 0;
  9191
  9192	kvm_x86_ops->vcpu_reset(vcpu, init_event);
  9193}
 9194
 9195void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 9196{
 9197	struct kvm_segment cs;
 9198
 9199	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
 9200	cs.selector = vector << 8;
 9201	cs.base = vector << 12;
 9202	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
 9203	kvm_rip_write(vcpu, 0);
 9204}
 9205
  /*
   * Enable virtualization through the vendor hardware_enable() hook and
   * compensate for a host TSC that went backwards.
   *
   * Returns the hook's error code if it fails, otherwise 0.  After a
   * successful enable, every vcpu of every VM on vm_list is inspected: with
   * an unstable TSC, vcpus whose ->cpu matches the current processor get a
   * clock update request; with a stable TSC, a last_host_tsc greater than
   * the TSC read here marks the TSC as having gone backwards and triggers
   * the adjustment described in the large comment below.
   */
  9206int kvm_arch_hardware_enable(void)
  9207{
  9208	struct kvm *kvm;
  9209	struct kvm_vcpu *vcpu;
  9210	int i;
  9211	int ret;
  9212	u64 local_tsc;
  9213	u64 max_tsc = 0;
  9214	bool stable, backwards_tsc = false;
  9215
  9216	kvm_shared_msr_cpu_online();
  9217	ret = kvm_x86_ops->hardware_enable();
  9218	if (ret != 0)
  9219		return ret;
  9220
  	/* max_tsc ends up as the largest last_host_tsc that exceeds local_tsc. */
  9221	local_tsc = rdtsc();
  9222	stable = !kvm_check_tsc_unstable();
  9223	list_for_each_entry(kvm, &vm_list, vm_list) {
  9224		kvm_for_each_vcpu(i, vcpu, kvm) {
  9225			if (!stable && vcpu->cpu == smp_processor_id())
  9226				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
  9227			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
  9228				backwards_tsc = true;
  9229				if (vcpu->arch.last_host_tsc > max_tsc)
  9230					max_tsc = vcpu->arch.last_host_tsc;
  9231			}
  9232		}
  9233	}
  9234
  9235	/*
  9236	 * Sometimes, even reliable TSCs go backwards.  This happens on
  9237	 * platforms that reset TSC during suspend or hibernate actions, but
  9238	 * maintain synchronization.  We must compensate.  Fortunately, we can
  9239	 * detect that condition here, which happens early in CPU bringup,
  9240	 * before any KVM threads can be running.  Unfortunately, we can't
  9241	 * bring the TSCs fully up to date with real time, as we aren't yet far
  9242	 * enough into CPU bringup that we know how much real time has actually
  9243	 * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
  9244	 * variables that haven't been updated yet.
  9245	 *
  9246	 * So we simply find the maximum observed TSC above, then record the
  9247	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
  9248	 * the adjustment will be applied.  Note that we accumulate
  9249	 * adjustments, in case multiple suspend cycles happen before some VCPU
  9250	 * gets a chance to run again.  In the event that no KVM threads get a
  9251	 * chance to run, we will miss the entire elapsed period, as we'll have
  9252	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
  9253	 * lose cycle time.  This isn't too big a deal, since the loss will be
  9254	 * uniform across all VCPUs (not to mention the scenario is extremely
  9255	 * unlikely). It is possible that a second hibernate recovery happens
  9256	 * much faster than a first, causing the observed TSC here to be
  9257	 * smaller; this would require additional padding adjustment, which is
  9258	 * why we set last_host_tsc to the local tsc observed here.
  9259	 *
  9260	 * N.B. - this code below runs only on platforms with reliable TSC,
  9261	 * as that is the only way backwards_tsc is set above.  Also note
  9262	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
  9263	 * have the same delta_cyc adjustment applied if backwards_tsc
  9264	 * is detected.  Note further, this adjustment is only done once,
  9265	 * as we reset last_host_tsc on all VCPUs to stop this from being
  9266	 * called multiple times (one for each physical CPU bringup).
  9267	 *
  9268	 * Platforms with unreliable TSCs don't have to deal with this, they
  9269	 * will be compensated by the logic in vcpu_load, which sets the TSC to
  9270	 * catchup mode.  This will catchup all VCPUs to real time, but cannot
  9271	 * guarantee that they stay in perfect synchronization.
  9272	 */
  9273	if (backwards_tsc) {
  9274		u64 delta_cyc = max_tsc - local_tsc;
  9275		list_for_each_entry(kvm, &vm_list, vm_list) {
  9276			kvm->arch.backwards_tsc_observed = true;
  9277			kvm_for_each_vcpu(i, vcpu, kvm) {
  9278				vcpu->arch.tsc_offset_adjustment += delta_cyc;
  9279				vcpu->arch.last_host_tsc = local_tsc;
  9280				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
  9281			}
  9282
  9283			/*
  9284			 * We have to disable TSC offset matching.. if you were
  9285			 * booting a VM while issuing an S4 host suspend....
  9286			 * you may have some problem.  Solving this issue is
  9287			 * left as an exercise to the reader.
  9288			 */
  9289			kvm->arch.last_tsc_nsec = 0;
  9290			kvm->arch.last_tsc_write = 0;
  9291		}
  9292
  9293	}
  9294	return 0;
  9295}
 9296
 9297void kvm_arch_hardware_disable(void)
 9298{
 9299	kvm_x86_ops->hardware_disable();
 9300	drop_user_return_notifiers();
 9301}
 9302
 9303int kvm_arch_hardware_setup(void)
 9304{
 9305	int r;
 9306
 9307	r = kvm_x86_ops->hardware_setup();
 9308	if (r != 0)
 9309		return r;
 9310
 9311	if (kvm_has_tsc_control) {
 9312		/*
 9313		 * Make sure the user can only configure tsc_khz values that
 9314		 * fit into a signed integer.
 9315		 * A min value is not calculated because it will always
 9316		 * be 1 on all machines.
 9317		 */
 9318		u64 max = min(0x7fffffffULL,
 9319			      __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
 9320		kvm_max_guest_tsc_khz = max;
 9321
 9322		kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
 9323	}
 9324
 9325	kvm_init_msr_list();
 9326	return 0;
 9327}
 9328
 9329void kvm_arch_hardware_unsetup(void)
 9330{
 9331	kvm_x86_ops->hardware_unsetup();
 9332}
 9333
 9334int kvm_arch_check_processor_compat(void)
 9335{
 9336	return kvm_x86_ops->check_processor_compatibility();
 9337}
 9338
 9339bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
 9340{
 9341	return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
 9342}
 9343EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
 9344
 9345bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
 9346{
 9347	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
 9348}
 9349
  /*
   * Static key tracking vcpus without an in-kernel local APIC: it is
   * incremented in kvm_arch_vcpu_init() when the VM has no in-kernel
   * irqchip and decremented again in kvm_arch_vcpu_uninit().
   */
  9350struct static_key kvm_no_apic_vcpu __read_mostly;
  9351EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
 9352
 9353int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 9354{
 9355	struct page *page;
 
 9356	int r;
 9357
 
 
 
 9358	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
 9359	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
 
 
 
 
 9360		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 9361	else
 9362		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
 9363
 9364	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 9365	if (!page) {
 9366		r = -ENOMEM;
 9367		goto fail;
 9368	}
 9369	vcpu->arch.pio_data = page_address(page);
 9370
 9371	kvm_set_tsc_khz(vcpu, max_tsc_khz);
 9372
 9373	r = kvm_mmu_create(vcpu);
 9374	if (r < 0)
 9375		goto fail_free_pio_data;
 9376
 9377	if (irqchip_in_kernel(vcpu->kvm)) {
 9378		vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
 9379		r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
 9380		if (r < 0)
 9381			goto fail_mmu_destroy;
 9382	} else
 9383		static_key_slow_inc(&kvm_no_apic_vcpu);
 9384
 9385	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
 9386				       GFP_KERNEL_ACCOUNT);
 9387	if (!vcpu->arch.mce_banks) {
 9388		r = -ENOMEM;
 9389		goto fail_free_lapic;
 9390	}
 9391	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 9392
 9393	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
 9394				GFP_KERNEL_ACCOUNT)) {
 9395		r = -ENOMEM;
 9396		goto fail_free_mce_banks;
 9397	}
 9398
 9399	fx_init(vcpu);
 9400
 9401	vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
 9402
 9403	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 9404
 9405	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
 9406
 9407	kvm_async_pf_hash_reset(vcpu);
 9408	kvm_pmu_init(vcpu);
 9409
 9410	vcpu->arch.pending_external_vector = -1;
 9411	vcpu->arch.preempted_in_kernel = false;
 9412
 9413	kvm_hv_vcpu_init(vcpu);
 9414
 9415	return 0;
 9416
 9417fail_free_mce_banks:
 9418	kfree(vcpu->arch.mce_banks);
 9419fail_free_lapic:
 9420	kvm_free_lapic(vcpu);
 9421fail_mmu_destroy:
 9422	kvm_mmu_destroy(vcpu);
 9423fail_free_pio_data:
 9424	free_page((unsigned long)vcpu->arch.pio_data);
 9425fail:
 9426	return r;
 9427}
 9428
 9429void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 9430{
 9431	int idx;
 9432
 9433	kvm_hv_vcpu_uninit(vcpu);
 9434	kvm_pmu_destroy(vcpu);
 9435	kfree(vcpu->arch.mce_banks);
 9436	kvm_free_lapic(vcpu);
 9437	idx = srcu_read_lock(&vcpu->kvm->srcu);
 9438	kvm_mmu_destroy(vcpu);
 9439	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 9440	free_page((unsigned long)vcpu->arch.pio_data);
 9441	if (!lapic_in_kernel(vcpu))
 9442		static_key_slow_dec(&kvm_no_apic_vcpu);
 9443}
 9444
 9445void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 9446{
 9447	vcpu->arch.l1tf_flush_l1d = true;
 9448	kvm_x86_ops->sched_in(vcpu, cpu);
 9449}
 9450
  /*
   * Initialise the x86-specific state of a new VM.
   *
   * Only @type 0 is accepted; any other value yields -EINVAL.  Otherwise
   * the lists, locks, clock offset and delayed works of @kvm are set up and
   * the result of the vendor vm_init() hook is returned.
   */
  9451int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  9452{
  9453	if (type)
  9454		return -EINVAL;
  9455
  9456	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
  9457	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
  9458	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
  9459	INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
  9460	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
  9461	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
  9462
  9463	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
  9464	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
  9465	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
  9466	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
  9467		&kvm->arch.irq_sources_bitmap);
  9468
  9469	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
  9470	mutex_init(&kvm->arch.apic_map_lock);
  9471	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
  9472
  	/* The kvmclock offset starts as the negated boot-time clock reading. */
  9473	kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
  9474	pvclock_update_vm_gtod_copy(kvm);
  9475
  9476	kvm->arch.guest_can_read_msr_platform_info = true;
  9477
  9478	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
  9479	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
  9480
  9481	kvm_hv_init_vm(kvm);
  9482	kvm_page_track_init(kvm);
  9483	kvm_mmu_init_vm(kvm);
  9484
  9485	return kvm_x86_ops->vm_init(kvm);
  9486}
 9487
 9488int kvm_arch_post_init_vm(struct kvm *kvm)
 9489{
 9490	return kvm_mmu_post_init_vm(kvm);
 9491}
 9492
 9493static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 9494{
 9495	vcpu_load(vcpu);
 9496	kvm_mmu_unload(vcpu);
 9497	vcpu_put(vcpu);
 9498}
 9499
 9500static void kvm_free_vcpus(struct kvm *kvm)
 9501{
 9502	unsigned int i;
 9503	struct kvm_vcpu *vcpu;
 9504
 9505	/*
 9506	 * Unpin any mmu pages first.
 9507	 */
 9508	kvm_for_each_vcpu(i, vcpu, kvm) {
 9509		kvm_clear_async_pf_completion_queue(vcpu);
 9510		kvm_unload_vcpu_mmu(vcpu);
 9511	}
 9512	kvm_for_each_vcpu(i, vcpu, kvm)
 9513		kvm_arch_vcpu_free(vcpu);
 9514
 9515	mutex_lock(&kvm->lock);
 9516	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
 9517		kvm->vcpus[i] = NULL;
 9518
 9519	atomic_set(&kvm->online_vcpus, 0);
 9520	mutex_unlock(&kvm->lock);
 9521}
 9522
 9523void kvm_arch_sync_events(struct kvm *kvm)
 9524{
 9525	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
 9526	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
 9527	kvm_free_pit(kvm);
 9528}
 9529
  /*
   * Create or delete the internal memslot @id in every address space.
   *
   * A non-zero @size creates the slot at guest physical address @gpa,
   * backed by a fresh anonymous shared mapping; it fails with -EEXIST when
   * the slot already has pages.  A zero @size deletes the slot and unmaps
   * its old backing, and is a no-op when the slot is already empty.
   *
   * Must be called with kvm->slots_lock held.  Returns 0 on success or a
   * negative errno.
   */
  9530int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
  9531{
  9532	int i, r;
  9533	unsigned long hva;
  9534	struct kvm_memslots *slots = kvm_memslots(kvm);
  9535	struct kvm_memory_slot *slot, old;
  9536
  9537	/* Called with kvm->slots_lock held.  */
  9538	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
  9539		return -EINVAL;
  9540
  9541	slot = id_to_memslot(slots, id);
  9542	if (size) {
  9543		if (slot->npages)
  9544			return -EEXIST;
  9545
  9546		/*
  9547		 * MAP_SHARED to prevent internal slot pages from being moved
  9548		 * by fork()/COW.
  9549		 */
  9550		hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
  9551			      MAP_SHARED | MAP_ANONYMOUS, 0);
  9552		if (IS_ERR((void *)hva))
  9553			return PTR_ERR((void *)hva);
  9554	} else {
  9555		if (!slot->npages)
  9556			return 0;
  9557
  9558		hva = 0;
  9559	}
  9560
  	/* Snapshot the slot so its old mapping can still be unmapped on deletion. */
  9561	old = *slot;
  9562	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
  9563		struct kvm_userspace_memory_region m;
  9564
  		/* The address space index is encoded in the upper 16 bits of the slot id. */
  9565		m.slot = id | (i << 16);
  9566		m.flags = 0;
  9567		m.guest_phys_addr = gpa;
  9568		m.userspace_addr = hva;
  9569		m.memory_size = size;
  9570		r = __kvm_set_memory_region(kvm, &m);
  		/*
  		 * NOTE(review): on this error return a mapping created by
  		 * vm_mmap() above is not unmapped here -- confirm whether
  		 * that is intentional.
  		 */
  9571		if (r < 0)
  9572			return r;
  9573	}
  9574
  9575	if (!size)
  9576		vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
  9577
  9578	return 0;
  9579}
  9580EXPORT_SYMBOL_GPL(__x86_set_memory_region);
 9581
 9582int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 9583{
 9584	int r;
 9585
 9586	mutex_lock(&kvm->slots_lock);
 9587	r = __x86_set_memory_region(kvm, id, gpa, size);
 9588	mutex_unlock(&kvm->slots_lock);
 9589
 9590	return r;
 9591}
 9592EXPORT_SYMBOL_GPL(x86_set_memory_region);
 9593
 9594void kvm_arch_pre_destroy_vm(struct kvm *kvm)
 9595{
 9596	kvm_mmu_pre_destroy_vm(kvm);
 9597}
 9598
 9599void kvm_arch_destroy_vm(struct kvm *kvm)
 9600{
 9601	if (current->mm == kvm->mm) {
 9602		/*
 9603		 * Free memory regions allocated on behalf of userspace,
 9604		 * unless the the memory map has changed due to process exit
 9605		 * or fd copying.
 9606		 */
 9607		x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
 9608		x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
 9609		x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
 9610	}
 9611	if (kvm_x86_ops->vm_destroy)
 9612		kvm_x86_ops->vm_destroy(kvm);
 9613	kvm_pic_destroy(kvm);
 9614	kvm_ioapic_destroy(kvm);
 9615	kvm_free_vcpus(kvm);
 9616	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 9617	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
 9618	kvm_mmu_uninit_vm(kvm);
 9619	kvm_page_track_cleanup(kvm);
 9620	kvm_hv_destroy_vm(kvm);
 9621}
 9622
 9623void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 9624			   struct kvm_memory_slot *dont)
 9625{
 9626	int i;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 9627
 9628	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 9629		if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
 9630			kvfree(free->arch.rmap[i]);
 9631			free->arch.rmap[i] = NULL;
 9632		}
 9633		if (i == 0)
 9634			continue;
 9635
 9636		if (!dont || free->arch.lpage_info[i - 1] !=
 9637			     dont->arch.lpage_info[i - 1]) {
 9638			kvfree(free->arch.lpage_info[i - 1]);
 9639			free->arch.lpage_info[i - 1] = NULL;
 9640		}
 9641	}
 9642
 9643	kvm_page_track_free_memslot(free, dont);
 9644}
 9645
 9646int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 9647			    unsigned long npages)
 9648{
 9649	int i;
 9650
 9651	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 9652		struct kvm_lpage_info *linfo;
 9653		unsigned long ugfn;
 9654		int lpages;
 9655		int level = i + 1;
 9656
 9657		lpages = gfn_to_index(slot->base_gfn + npages - 1,
 9658				      slot->base_gfn, level) + 1;
 9659
 9660		slot->arch.rmap[i] =
 9661			kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
 9662				 GFP_KERNEL_ACCOUNT);
 9663		if (!slot->arch.rmap[i])
 9664			goto out_free;
 9665		if (i == 0)
 9666			continue;
 9667
 9668		linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
 9669		if (!linfo)
 9670			goto out_free;
 9671
 9672		slot->arch.lpage_info[i - 1] = linfo;
 9673
 9674		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
 9675			linfo[0].disallow_lpage = 1;
 9676		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
 9677			linfo[lpages - 1].disallow_lpage = 1;
 9678		ugfn = slot->userspace_addr >> PAGE_SHIFT;
 9679		/*
 9680		 * If the gfn and userspace address are not aligned wrt each
 9681		 * other, or if explicitly asked to, disable large page
 9682		 * support for this slot
 9683		 */
 9684		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
 9685		    !kvm_largepages_enabled()) {
 9686			unsigned long j;
 9687
 9688			for (j = 0; j < lpages; ++j)
 9689				linfo[j].disallow_lpage = 1;
 9690		}
 9691	}
 9692
 9693	if (kvm_page_track_create_memslot(slot, npages))
 9694		goto out_free;
 9695
 9696	return 0;
 9697
 9698out_free:
 9699	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 9700		kvfree(slot->arch.rmap[i]);
 9701		slot->arch.rmap[i] = NULL;
 9702		if (i == 0)
 9703			continue;
 9704
 9705		kvfree(slot->arch.lpage_info[i - 1]);
 9706		slot->arch.lpage_info[i - 1] = NULL;
 9707	}
 9708	return -ENOMEM;
 9709}
 9710
 9711void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 
 
 
 9712{
 9713	/*
 9714	 * memslots->generation has been incremented.
 9715	 * mmio generation may have reached its maximum value.
 9716	 */
 9717	kvm_mmu_invalidate_mmio_sptes(kvm, gen);
 9718}
 9719
 9720int kvm_arch_prepare_memory_region(struct kvm *kvm,
 9721				struct kvm_memory_slot *memslot,
 9722				const struct kvm_userspace_memory_region *mem,
 9723				enum kvm_mr_change change)
 9724{
 9725	return 0;
 9726}
 9727
 9728static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 9729				     struct kvm_memory_slot *new)
 9730{
 9731	/* Still write protect RO slot */
 9732	if (new->flags & KVM_MEM_READONLY) {
 9733		kvm_mmu_slot_remove_write_access(kvm, new);
 9734		return;
 9735	}
 9736
 9737	/*
 9738	 * Call kvm_x86_ops dirty logging hooks when they are valid.
 9739	 *
 9740	 * kvm_x86_ops->slot_disable_log_dirty is called when:
 9741	 *
 9742	 *  - KVM_MR_CREATE with dirty logging is disabled
 9743	 *  - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
 9744	 *
 9745	 * The reason is, in case of PML, we need to set D-bit for any slots
 9746	 * with dirty logging disabled in order to eliminate unnecessary GPA
 9747	 * logging in PML buffer (and potential PML buffer full VMEXT). This
 9748	 * guarantees leaving PML enabled during guest's lifetime won't have
 9749	 * any additional overhead from PML when guest is running with dirty
 9750	 * logging disabled for memory slots.
 9751	 *
 9752	 * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
 9753	 * to dirty logging mode.
 9754	 *
 9755	 * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
 9756	 *
 9757	 * In case of write protect:
 9758	 *
 9759	 * Write protect all pages for dirty logging.
 9760	 *
 9761	 * All the sptes including the large sptes which point to this
 9762	 * slot are set to readonly. We can not create any new large
 9763	 * spte on this slot until the end of the logging.
 9764	 *
 9765	 * See the comments in fast_page_fault().
 9766	 */
 9767	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
 9768		if (kvm_x86_ops->slot_enable_log_dirty)
 9769			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
 9770		else
 9771			kvm_mmu_slot_remove_write_access(kvm, new);
 9772	} else {
 9773		if (kvm_x86_ops->slot_disable_log_dirty)
 9774			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
 9775	}
 9776}
 9777
 9778void kvm_arch_commit_memory_region(struct kvm *kvm,
 9779				const struct kvm_userspace_memory_region *mem,
 9780				const struct kvm_memory_slot *old,
 9781				const struct kvm_memory_slot *new,
 9782				enum kvm_mr_change change)
 9783{
 9784	if (!kvm->arch.n_requested_mmu_pages)
 9785		kvm_mmu_change_mmu_pages(kvm,
 9786				kvm_mmu_calculate_default_mmu_pages(kvm));
 9787
 9788	/*
 9789	 * Dirty logging tracks sptes in 4k granularity, meaning that large
 9790	 * sptes have to be split.  If live migration is successful, the guest
 9791	 * in the source machine will be destroyed and large sptes will be
 9792	 * created in the destination. However, if the guest continues to run
 9793	 * in the source machine (for example if live migration fails), small
 9794	 * sptes will remain around and cause bad performance.
 9795	 *
 9796	 * Scan sptes if dirty logging has been stopped, dropping those
 9797	 * which can be collapsed into a single large-page spte.  Later
 9798	 * page faults will create the large-page sptes.
 9799	 *
 9800	 * There is no need to do this in any of the following cases:
 9801	 * CREATE:	No dirty mappings will already exist.
 9802	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
 9803	 *		kvm_arch_flush_shadow_memslot()
 9804	 */
 9805	if (change == KVM_MR_FLAGS_ONLY &&
 9806		(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
 9807		!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
 9808		kvm_mmu_zap_collapsible_sptes(kvm, new);
 9809
 9810	/*
 9811	 * Set up write protection and/or dirty logging for the new slot.
 9812	 *
 9813	 * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
 9814	 * been zapped so no dirty logging staff is needed for old slot. For
 9815	 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
 9816	 * new and it's also covered when dealing with the new slot.
 9817	 *
 9818	 * FIXME: const-ify all uses of struct kvm_memory_slot.
 9819	 */
 9820	if (change != KVM_MR_DELETE)
 9821		kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
 9822}
 9823
 9824void kvm_arch_flush_shadow_all(struct kvm *kvm)
 9825{
 9826	kvm_mmu_zap_all(kvm);
 9827}
 9828
 9829void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 9830				   struct kvm_memory_slot *slot)
 9831{
 9832	kvm_page_track_flush_slot(kvm, slot);
 9833}
 9834
 9835static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
 9836{
 9837	return (is_guest_mode(vcpu) &&
 9838			kvm_x86_ops->guest_apic_has_interrupt &&
 9839			kvm_x86_ops->guest_apic_has_interrupt(vcpu));
 9840}
 9841
 9842static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 9843{
 9844	if (!list_empty_careful(&vcpu->async_pf.done))
 9845		return true;
 9846
 9847	if (kvm_apic_has_events(vcpu))
 9848		return true;
 9849
 9850	if (vcpu->arch.pv.pv_unhalted)
 9851		return true;
 9852
 9853	if (vcpu->arch.exception.pending)
 9854		return true;
 9855
 9856	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 9857	    (vcpu->arch.nmi_pending &&
 9858	     kvm_x86_ops->nmi_allowed(vcpu)))
 9859		return true;
 9860
 9861	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
 9862	    (vcpu->arch.smi_pending && !is_smm(vcpu)))
 9863		return true;
 9864
 9865	if (kvm_arch_interrupt_allowed(vcpu) &&
 9866	    (kvm_cpu_has_interrupt(vcpu) ||
 9867	    kvm_guest_apic_has_interrupt(vcpu)))
 9868		return true;
 9869
 9870	if (kvm_hv_has_stimer_pending(vcpu))
 9871		return true;
 9872
 9873	return false;
 9874}
 9875
 9876int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 9877{
 9878	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
 
 
 
 
 
 
 9879}
 9880
 9881bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
 9882{
 9883	if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
 9884		return true;
 9885
 9886	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 9887		kvm_test_request(KVM_REQ_SMI, vcpu) ||
 9888		 kvm_test_request(KVM_REQ_EVENT, vcpu))
 9889		return true;
 9890
 9891	if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
 9892		return true;
 9893
 9894	return false;
 9895}
 9896
 9897bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 9898{
 9899	return vcpu->arch.preempted_in_kernel;
 9900}
 9901
 9902int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 9903{
 9904	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
 9905}
 9906
 9907int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 9908{
 9909	return kvm_x86_ops->interrupt_allowed(vcpu);
 9910}
 9911
 9912unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
 9913{
 9914	if (is_64_bit_mode(vcpu))
 9915		return kvm_rip_read(vcpu);
 9916	return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
 9917		     kvm_rip_read(vcpu));
 9918}
 9919EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
 9920
 9921bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
 9922{
 9923	return kvm_get_linear_rip(vcpu) == linear_rip;
 9924}
 9925EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
 9926
 9927unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 9928{
 9929	unsigned long rflags;
 9930
 9931	rflags = kvm_x86_ops->get_rflags(vcpu);
 9932	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 9933		rflags &= ~X86_EFLAGS_TF;
 9934	return rflags;
 9935}
 9936EXPORT_SYMBOL_GPL(kvm_get_rflags);
 9937
 9938static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 9939{
 9940	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
 9941	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
 9942		rflags |= X86_EFLAGS_TF;
 9943	kvm_x86_ops->set_rflags(vcpu, rflags);
 9944}
 9945
 9946void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 9947{
 9948	__kvm_set_rflags(vcpu, rflags);
 9949	kvm_make_request(KVM_REQ_EVENT, vcpu);
 9950}
 9951EXPORT_SYMBOL_GPL(kvm_set_rflags);
 9952
 9953void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 9954{
 9955	int r;
 9956
 9957	if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
 9958	      work->wakeup_all)
 9959		return;
 9960
 9961	r = kvm_mmu_reload(vcpu);
 9962	if (unlikely(r))
 9963		return;
 9964
 9965	if (!vcpu->arch.mmu->direct_map &&
 9966	      work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
 9967		return;
 9968
 9969	vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true);
 9970}
 9971
 9972static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
 9973{
 9974	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
 9975}
 9976
 9977static inline u32 kvm_async_pf_next_probe(u32 key)
 9978{
 9979	return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
 9980}
 9981
 9982static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
 9983{
 9984	u32 key = kvm_async_pf_hash_fn(gfn);
 9985
 9986	while (vcpu->arch.apf.gfns[key] != ~0)
 9987		key = kvm_async_pf_next_probe(key);
 9988
 9989	vcpu->arch.apf.gfns[key] = gfn;
 9990}
 9991
 9992static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
 9993{
 9994	int i;
 9995	u32 key = kvm_async_pf_hash_fn(gfn);
 9996
 9997	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
 9998		     (vcpu->arch.apf.gfns[key] != gfn &&
 9999		      vcpu->arch.apf.gfns[key] != ~0); i++)
10000		key = kvm_async_pf_next_probe(key);
10001
10002	return key;
10003}
10004
10005bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
10006{
10007	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
10008}
10009
10010static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
10011{
10012	u32 i, j, k;
10013
10014	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
10015	while (true) {
10016		vcpu->arch.apf.gfns[i] = ~0;
10017		do {
10018			j = kvm_async_pf_next_probe(j);
10019			if (vcpu->arch.apf.gfns[j] == ~0)
10020				return;
10021			k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
10022			/*
10023			 * k lies cyclically in ]i,j]
10024			 * |    i.k.j |
10025			 * |....j i.k.| or  |.k..j i...|
10026			 */
10027		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
10028		vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
10029		i = j;
10030	}
10031}
10032
10033static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
10034{
10035
10036	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
10037				      sizeof(val));
10038}
10039
10040static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
10041{
10042
10043	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
10044				      sizeof(u32));
10045}
10046
10047static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
10048{
10049	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
10050		return false;
10051
10052	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
10053	    (vcpu->arch.apf.send_user_only &&
10054	     kvm_x86_ops->get_cpl(vcpu) == 0))
10055		return false;
10056
10057	return true;
10058}
10059
10060bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
10061{
10062	if (unlikely(!lapic_in_kernel(vcpu) ||
10063		     kvm_event_needs_reinjection(vcpu) ||
10064		     vcpu->arch.exception.pending))
10065		return false;
10066
10067	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
10068		return false;
10069
10070	/*
10071	 * If interrupts are off we cannot even use an artificial
10072	 * halt state.
10073	 */
10074	return kvm_x86_ops->interrupt_allowed(vcpu);
10075}
10076
10077void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
10078				     struct kvm_async_pf *work)
10079{
10080	struct x86_exception fault;
10081
10082	trace_kvm_async_pf_not_present(work->arch.token, work->gva);
10083	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
10084
10085	if (kvm_can_deliver_async_pf(vcpu) &&
10086	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
 
 
 
10087		fault.vector = PF_VECTOR;
10088		fault.error_code_valid = true;
10089		fault.error_code = 0;
10090		fault.nested_page_fault = false;
10091		fault.address = work->arch.token;
10092		fault.async_page_fault = true;
10093		kvm_inject_page_fault(vcpu, &fault);
10094	} else {
10095		/*
10096		 * It is not possible to deliver a paravirtualized asynchronous
10097		 * page fault, but putting the guest in an artificial halt state
10098		 * can be beneficial nevertheless: if an interrupt arrives, we
10099		 * can deliver it timely and perhaps the guest will schedule
10100		 * another process.  When the instruction that triggered a page
10101		 * fault is retried, hopefully the page will be ready in the host.
10102		 */
10103		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
10104	}
10105}
10106
10107void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
10108				 struct kvm_async_pf *work)
10109{
10110	struct x86_exception fault;
10111	u32 val;
10112
10113	if (work->wakeup_all)
 
10114		work->arch.token = ~0; /* broadcast wakeup */
10115	else
10116		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
10117	trace_kvm_async_pf_ready(work->arch.token, work->gva);
10118
10119	if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
10120	    !apf_get_user(vcpu, &val)) {
10121		if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
10122		    vcpu->arch.exception.pending &&
10123		    vcpu->arch.exception.nr == PF_VECTOR &&
10124		    !apf_put_user(vcpu, 0)) {
10125			vcpu->arch.exception.injected = false;
10126			vcpu->arch.exception.pending = false;
10127			vcpu->arch.exception.nr = 0;
10128			vcpu->arch.exception.has_error_code = false;
10129			vcpu->arch.exception.error_code = 0;
10130			vcpu->arch.exception.has_payload = false;
10131			vcpu->arch.exception.payload = 0;
10132		} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
10133			fault.vector = PF_VECTOR;
10134			fault.error_code_valid = true;
10135			fault.error_code = 0;
10136			fault.nested_page_fault = false;
10137			fault.address = work->arch.token;
10138			fault.async_page_fault = true;
10139			kvm_inject_page_fault(vcpu, &fault);
10140		}
10141	}
10142	vcpu->arch.apf.halted = false;
10143	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
10144}
10145
10146bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
10147{
10148	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
10149		return true;
10150	else
10151		return kvm_can_do_async_pf(vcpu);
10152}
10153
10154void kvm_arch_start_assignment(struct kvm *kvm)
10155{
10156	atomic_inc(&kvm->arch.assigned_device_count);
10157}
10158EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
10159
10160void kvm_arch_end_assignment(struct kvm *kvm)
10161{
10162	atomic_dec(&kvm->arch.assigned_device_count);
10163}
10164EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
10165
10166bool kvm_arch_has_assigned_device(struct kvm *kvm)
10167{
10168	return atomic_read(&kvm->arch.assigned_device_count);
10169}
10170EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
10171
10172void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
10173{
10174	atomic_inc(&kvm->arch.noncoherent_dma_count);
10175}
10176EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
10177
10178void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
10179{
10180	atomic_dec(&kvm->arch.noncoherent_dma_count);
10181}
10182EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
10183
10184bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
10185{
10186	return atomic_read(&kvm->arch.noncoherent_dma_count);
10187}
10188EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
10189
/* x86 unconditionally reports support for IRQ bypass. */
bool kvm_arch_has_irq_bypass(void)
{
	const bool supported = true;

	return supported;
}
10194
10195int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
10196				      struct irq_bypass_producer *prod)
10197{
10198	struct kvm_kernel_irqfd *irqfd =
10199		container_of(cons, struct kvm_kernel_irqfd, consumer);
10200
10201	irqfd->producer = prod;
10202
10203	return kvm_x86_ops->update_pi_irte(irqfd->kvm,
10204					   prod->irq, irqfd->gsi, 1);
10205}
10206
10207void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
10208				      struct irq_bypass_producer *prod)
10209{
10210	int ret;
10211	struct kvm_kernel_irqfd *irqfd =
10212		container_of(cons, struct kvm_kernel_irqfd, consumer);
10213
10214	WARN_ON(irqfd->producer != prod);
10215	irqfd->producer = NULL;
10216
10217	/*
10218	 * When producer of consumer is unregistered, we change back to
10219	 * remapped mode, so we can re-use the current implementation
10220	 * when the irq is masked/disabled or the consumer side (KVM
10221	 * int this case doesn't want to receive the interrupts.
10222	*/
10223	ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
10224	if (ret)
10225		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
10226		       " fails: %d\n", irqfd->consumer.token, ret);
10227}
10228
10229int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
10230				   uint32_t guest_irq, bool set)
10231{
10232	return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
10233}
10234
10235bool kvm_vector_hashing_enabled(void)
10236{
10237	return vector_hashing;
10238}
10239EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
10240
10241bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
10242{
10243	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
10244}
10245EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
10246
10247
/*
 * Tracepoints exported (GPL-only) so that code outside this file can emit
 * them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);