   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * AMD Memory Encryption Support
   4 *
   5 * Copyright (C) 2019 SUSE
   6 *
   7 * Author: Joerg Roedel <jroedel@suse.de>
   8 */
   9
  10#define pr_fmt(fmt)	"SEV: " fmt
  11
  12#include <linux/sched/debug.h>	/* For show_regs() */
  13#include <linux/percpu-defs.h>
  14#include <linux/cc_platform.h>
  15#include <linux/printk.h>
  16#include <linux/mm_types.h>
  17#include <linux/set_memory.h>
  18#include <linux/memblock.h>
  19#include <linux/kernel.h>
  20#include <linux/mm.h>
  21#include <linux/cpumask.h>
  22#include <linux/efi.h>
  23#include <linux/platform_device.h>
  24#include <linux/io.h>
  25#include <linux/psp-sev.h>
  26#include <linux/dmi.h>
  27#include <uapi/linux/sev-guest.h>
  28
  29#include <asm/init.h>
  30#include <asm/cpu_entry_area.h>
  31#include <asm/stacktrace.h>
  32#include <asm/sev.h>
  33#include <asm/insn-eval.h>
  34#include <asm/fpu/xcr.h>
  35#include <asm/processor.h>
  36#include <asm/realmode.h>
  37#include <asm/setup.h>
  38#include <asm/traps.h>
  39#include <asm/svm.h>
  40#include <asm/smp.h>
  41#include <asm/cpu.h>
  42#include <asm/apic.h>
  43#include <asm/cpuid.h>
  44#include <asm/cmdline.h>
  45
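/* Architectural reset value of DR7: all breakpoints disabled, only the reserved bit 10 set */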
  46#define DR7_RESET_VALUE        0x400
  47
  48/* AP INIT values as documented in the APM2  section "Processor Initialization State" */
  49#define AP_INIT_CS_LIMIT		0xffff
  50#define AP_INIT_DS_LIMIT		0xffff
  51#define AP_INIT_LDTR_LIMIT		0xffff
  52#define AP_INIT_GDTR_LIMIT		0xffff
  53#define AP_INIT_IDTR_LIMIT		0xffff
  54#define AP_INIT_TR_LIMIT		0xffff
  55#define AP_INIT_RFLAGS_DEFAULT		0x2
  56#define AP_INIT_DR6_DEFAULT		0xffff0ff0
  57#define AP_INIT_GPAT_DEFAULT		0x0007040600070406ULL
  58#define AP_INIT_XCR0_DEFAULT		0x1
  59#define AP_INIT_X87_FTW_DEFAULT		0x5555
  60#define AP_INIT_X87_FCW_DEFAULT		0x0040
  61#define AP_INIT_CR0_DEFAULT		0x60000010
  62#define AP_INIT_MXCSR_DEFAULT		0x1f80
  63
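/* Printable names for the SEV_STATUS MSR feature bits, indexed by bit position */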
  64static const char * const sev_status_feat_names[] = {
  65	[MSR_AMD64_SEV_ENABLED_BIT]		= "SEV",
  66	[MSR_AMD64_SEV_ES_ENABLED_BIT]		= "SEV-ES",
  67	[MSR_AMD64_SEV_SNP_ENABLED_BIT]		= "SEV-SNP",
  68	[MSR_AMD64_SNP_VTOM_BIT]		= "vTom",
  69	[MSR_AMD64_SNP_REFLECT_VC_BIT]		= "ReflectVC",
  70	[MSR_AMD64_SNP_RESTRICTED_INJ_BIT]	= "RI",
  71	[MSR_AMD64_SNP_ALT_INJ_BIT]		= "AI",
  72	[MSR_AMD64_SNP_DEBUG_SWAP_BIT]		= "DebugSwap",
  73	[MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT]	= "NoHostIBS",
  74	[MSR_AMD64_SNP_BTB_ISOLATION_BIT]	= "BTBIsol",
  75	[MSR_AMD64_SNP_VMPL_SSS_BIT]		= "VmplSSS",
  76	[MSR_AMD64_SNP_SECURE_TSC_BIT]		= "SecureTSC",
  77	[MSR_AMD64_SNP_VMGEXIT_PARAM_BIT]	= "VMGExitParam",
  78	[MSR_AMD64_SNP_IBS_VIRT_BIT]		= "IBSVirt",
  79	[MSR_AMD64_SNP_VMSA_REG_PROT_BIT]	= "VMSARegProt",
  80	[MSR_AMD64_SNP_SMT_PROT_BIT]		= "SMTProt",
  81};
  82
  83/* For early boot hypervisor communication in SEV-ES enabled guests */
  84static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
  85
  86/*
   87 * Needs to be in the .data section because we need it to be NULL before
   88 * the bss section is cleared.
  89 */
  90static struct ghcb *boot_ghcb __section(".data");
  91
  92/* Bitmap of SEV features supported by the hypervisor */
  93static u64 sev_hv_features __ro_after_init;
  94
  95/* #VC handler runtime per-CPU data */
  96struct sev_es_runtime_data {
  97	struct ghcb ghcb_page;
  98
  99	/*
 100	 * Reserve one page per CPU as backup storage for the unencrypted GHCB.
 101	 * It is needed when an NMI happens while the #VC handler uses the real
 102	 * GHCB, and the NMI handler itself is causing another #VC exception. In
 103	 * that case the GHCB content of the first handler needs to be backed up
 104	 * and restored.
 105	 */
 106	struct ghcb backup_ghcb;
 107
 108	/*
 109	 * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
 110	 * There is no need for it to be atomic, because nothing is written to
 111	 * the GHCB between the read and the write of ghcb_active. So it is safe
 112	 * to use it when a nested #VC exception happens before the write.
 113	 *
 114	 * This is necessary for example in the #VC->NMI->#VC case when the NMI
 115	 * happens while the first #VC handler uses the GHCB. When the NMI code
  116	 * raises a second #VC exception, its handler might overwrite the contents of the
 117	 * GHCB written by the first handler. To avoid this the content of the
 118	 * GHCB is saved and restored when the GHCB is detected to be in use
 119	 * already.
 120	 */
 121	bool ghcb_active;
 122	bool backup_ghcb_active;
 123
 124	/*
 125	 * Cached DR7 value - write it on DR7 writes and return it on reads.
 126	 * That value will never make it to the real hardware DR7 as debugging
 127	 * is currently unsupported in SEV-ES guests.
 128	 */
 129	unsigned long dr7;
 130};
 131
 132struct ghcb_state {
 133	struct ghcb *ghcb;
 134};
 135
 136static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
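/* Per-CPU pointer to the VMSA page used to start this CPU via the SNP AP Creation protocol */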
 137static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
 138
 139struct sev_config {
 140	__u64 debug		: 1,
 141
 142	      /*
 143	       * A flag used by __set_pages_state() that indicates when the
 144	       * per-CPU GHCB has been created and registered and thus can be
 145	       * used by the BSP instead of the early boot GHCB.
 146	       *
 147	       * For APs, the per-CPU GHCB is created before they are started
 148	       * and registered upon startup, so this flag can be used globally
 149	       * for the BSP and APs.
 150	       */
 151	      ghcbs_initialized	: 1,
 152
 153	      __reserved	: 62;
 154};
 155
 156static struct sev_config sev_cfg __read_mostly;
 157
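/* Check whether the interrupted context was running on this CPU's #VC IST stack */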
 158static __always_inline bool on_vc_stack(struct pt_regs *regs)
 159{
 160	unsigned long sp = regs->sp;
 161
 162	/* User-mode RSP is not trusted */
 163	if (user_mode(regs))
 164		return false;
 165
 166	/* SYSCALL gap still has user-mode RSP */
 167	if (ip_within_syscall_gap(regs))
 168		return false;
 169
 170	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
 171}
 172
 173/*
 174 * This function handles the case when an NMI is raised in the #VC
 175 * exception handler entry code, before the #VC handler has switched off
 176 * its IST stack. In this case, the IST entry for #VC must be adjusted,
 177 * so that any nested #VC exception will not overwrite the stack
 178 * contents of the interrupted #VC handler.
 179 *
  180 * The IST entry is adjusted unconditionally so that it can also be
  181 * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
  182 * nested sev_es_ist_exit() call may adjust the IST entry back too
  183 * early.
 184 *
 185 * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
 186 * on the NMI IST stack, as they are only called from NMI handling code
 187 * right now.
 188 */
 189void noinstr __sev_es_ist_enter(struct pt_regs *regs)
 190{
 191	unsigned long old_ist, new_ist;
 192
 193	/* Read old IST entry */
 194	new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
 195
 196	/*
 197	 * If NMI happened while on the #VC IST stack, set the new IST
 198	 * value below regs->sp, so that the interrupted stack frame is
 199	 * not overwritten by subsequent #VC exceptions.
 200	 */
 201	if (on_vc_stack(regs))
 202		new_ist = regs->sp;
 203
 204	/*
 205	 * Reserve additional 8 bytes and store old IST value so this
 206	 * adjustment can be unrolled in __sev_es_ist_exit().
 207	 */
 208	new_ist -= sizeof(old_ist);
 209	*(unsigned long *)new_ist = old_ist;
 210
 211	/* Set new IST entry */
 212	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
 213}
 214
 215void noinstr __sev_es_ist_exit(void)
 216{
 217	unsigned long ist;
 218
 219	/* Read IST entry */
 220	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
 221
 222	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
 223		return;
 224
 225	/* Read back old IST entry and write it to the TSS */
 226	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
 227}
 228
 229/*
 230 * Nothing shall interrupt this code path while holding the per-CPU
 231 * GHCB. The backup GHCB is only for NMIs interrupting this path.
 232 *
 233 * Callers must disable local interrupts around it.
 234 */
 235static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
 236{
 237	struct sev_es_runtime_data *data;
 238	struct ghcb *ghcb;
 239
 240	WARN_ON(!irqs_disabled());
 241
 242	data = this_cpu_read(runtime_data);
 243	ghcb = &data->ghcb_page;
 244
 245	if (unlikely(data->ghcb_active)) {
 246		/* GHCB is already in use - save its contents */
 247
 248		if (unlikely(data->backup_ghcb_active)) {
 249			/*
 250			 * Backup-GHCB is also already in use. There is no way
 251			 * to continue here so just kill the machine. To make
 252			 * panic() work, mark GHCBs inactive so that messages
 253			 * can be printed out.
 254			 */
 255			data->ghcb_active        = false;
 256			data->backup_ghcb_active = false;
 257
 258			instrumentation_begin();
 259			panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
 260			instrumentation_end();
 261		}
 262
 263		/* Mark backup_ghcb active before writing to it */
 264		data->backup_ghcb_active = true;
 265
 266		state->ghcb = &data->backup_ghcb;
 267
 268		/* Backup GHCB content */
 269		*state->ghcb = *ghcb;
 270	} else {
 271		state->ghcb = NULL;
 272		data->ghcb_active = true;
 273	}
 274
 275	return ghcb;
 276}
 277
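/*
 * Accessors for the GHCB MSR. The guest uses this MSR both to hand the
 * physical address of its GHCB to the hypervisor before a VMGEXIT and,
 * while no GHCB is available yet, to exchange requests and responses via
 * the GHCB MSR protocol (see early_set_pages_state() below).
 */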
 278static inline u64 sev_es_rd_ghcb_msr(void)
 279{
 280	return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
 281}
 282
 283static __always_inline void sev_es_wr_ghcb_msr(u64 val)
 284{
 285	u32 low, high;
 286
 287	low  = (u32)(val);
 288	high = (u32)(val >> 32);
 289
 290	native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
 291}
 292
 293static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
 294				unsigned char *buffer)
 295{
 296	return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
 297}
 298
 299static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
 300{
 301	char buffer[MAX_INSN_SIZE];
 302	int insn_bytes;
 303
 304	insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
 305	if (insn_bytes == 0) {
 306		/* Nothing could be copied */
 307		ctxt->fi.vector     = X86_TRAP_PF;
 308		ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
 309		ctxt->fi.cr2        = ctxt->regs->ip;
 310		return ES_EXCEPTION;
 311	} else if (insn_bytes == -EINVAL) {
 312		/* Effective RIP could not be calculated */
 313		ctxt->fi.vector     = X86_TRAP_GP;
 314		ctxt->fi.error_code = 0;
 315		ctxt->fi.cr2        = 0;
 316		return ES_EXCEPTION;
 317	}
 318
 319	if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
 320		return ES_DECODE_FAILED;
 321
 322	if (ctxt->insn.immediate.got)
 323		return ES_OK;
 324	else
 325		return ES_DECODE_FAILED;
 326}
 327
 328static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
 329{
 330	char buffer[MAX_INSN_SIZE];
 331	int res, ret;
 332
 333	res = vc_fetch_insn_kernel(ctxt, buffer);
 334	if (res) {
 335		ctxt->fi.vector     = X86_TRAP_PF;
 336		ctxt->fi.error_code = X86_PF_INSTR;
 337		ctxt->fi.cr2        = ctxt->regs->ip;
 338		return ES_EXCEPTION;
 339	}
 340
 341	ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
 342	if (ret < 0)
 343		return ES_DECODE_FAILED;
 344	else
 345		return ES_OK;
 346}
 347
 348static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
 349{
 350	if (user_mode(ctxt->regs))
 351		return __vc_decode_user_insn(ctxt);
 352	else
 353		return __vc_decode_kern_insn(ctxt);
 354}
 355
 356static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
 357				   char *dst, char *buf, size_t size)
 358{
 359	unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
 360
 361	/*
  362	 * This function uses __put_user() independently of whether kernel or
  363	 * user memory is accessed. This works fine because __put_user() does no
  364	 * sanity checks on the pointer being accessed. All it does is report
  365	 * when the access fails.
 366	 *
 367	 * Also, this function runs in atomic context, so __put_user() is not
 368	 * allowed to sleep. The page-fault handler detects that it is running
 369	 * in atomic context and will not try to take mmap_sem and handle the
 370	 * fault, so additional pagefault_enable()/disable() calls are not
 371	 * needed.
 372	 *
 373	 * The access can't be done via copy_to_user() here because
 374	 * vc_write_mem() must not use string instructions to access unsafe
 375	 * memory. The reason is that MOVS is emulated by the #VC handler by
 376	 * splitting the move up into a read and a write and taking a nested #VC
  377	 * exception on whichever of them is the MMIO access. Using string
 378	 * instructions here would cause infinite nesting.
 379	 */
 380	switch (size) {
 381	case 1: {
 382		u8 d1;
 383		u8 __user *target = (u8 __user *)dst;
 384
 385		memcpy(&d1, buf, 1);
 386		if (__put_user(d1, target))
 387			goto fault;
 388		break;
 389	}
 390	case 2: {
 391		u16 d2;
 392		u16 __user *target = (u16 __user *)dst;
 393
 394		memcpy(&d2, buf, 2);
 395		if (__put_user(d2, target))
 396			goto fault;
 397		break;
 398	}
 399	case 4: {
 400		u32 d4;
 401		u32 __user *target = (u32 __user *)dst;
 402
 403		memcpy(&d4, buf, 4);
 404		if (__put_user(d4, target))
 405			goto fault;
 406		break;
 407	}
 408	case 8: {
 409		u64 d8;
 410		u64 __user *target = (u64 __user *)dst;
 411
 412		memcpy(&d8, buf, 8);
 413		if (__put_user(d8, target))
 414			goto fault;
 415		break;
 416	}
 417	default:
 418		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
 419		return ES_UNSUPPORTED;
 420	}
 421
 422	return ES_OK;
 423
 424fault:
 425	if (user_mode(ctxt->regs))
 426		error_code |= X86_PF_USER;
 427
 428	ctxt->fi.vector = X86_TRAP_PF;
 429	ctxt->fi.error_code = error_code;
 430	ctxt->fi.cr2 = (unsigned long)dst;
 431
 432	return ES_EXCEPTION;
 433}
 434
 435static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
 436				  char *src, char *buf, size_t size)
 437{
 438	unsigned long error_code = X86_PF_PROT;
 439
 440	/*
  441	 * This function uses __get_user() independently of whether kernel or
  442	 * user memory is accessed. This works fine because __get_user() does no
  443	 * sanity checks on the pointer being accessed. All it does is report
  444	 * when the access fails.
 445	 *
 446	 * Also, this function runs in atomic context, so __get_user() is not
 447	 * allowed to sleep. The page-fault handler detects that it is running
 448	 * in atomic context and will not try to take mmap_sem and handle the
 449	 * fault, so additional pagefault_enable()/disable() calls are not
 450	 * needed.
 451	 *
 452	 * The access can't be done via copy_from_user() here because
 453	 * vc_read_mem() must not use string instructions to access unsafe
 454	 * memory. The reason is that MOVS is emulated by the #VC handler by
 455	 * splitting the move up into a read and a write and taking a nested #VC
  456	 * exception on whichever of them is the MMIO access. Using string
 457	 * instructions here would cause infinite nesting.
 458	 */
 459	switch (size) {
 460	case 1: {
 461		u8 d1;
 462		u8 __user *s = (u8 __user *)src;
 463
 464		if (__get_user(d1, s))
 465			goto fault;
 466		memcpy(buf, &d1, 1);
 467		break;
 468	}
 469	case 2: {
 470		u16 d2;
 471		u16 __user *s = (u16 __user *)src;
 472
 473		if (__get_user(d2, s))
 474			goto fault;
 475		memcpy(buf, &d2, 2);
 476		break;
 477	}
 478	case 4: {
 479		u32 d4;
 480		u32 __user *s = (u32 __user *)src;
 481
 482		if (__get_user(d4, s))
 483			goto fault;
 484		memcpy(buf, &d4, 4);
 485		break;
 486	}
 487	case 8: {
 488		u64 d8;
 489		u64 __user *s = (u64 __user *)src;
 490		if (__get_user(d8, s))
 491			goto fault;
 492		memcpy(buf, &d8, 8);
 493		break;
 494	}
 495	default:
 496		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
 497		return ES_UNSUPPORTED;
 498	}
 499
 500	return ES_OK;
 501
 502fault:
 503	if (user_mode(ctxt->regs))
 504		error_code |= X86_PF_USER;
 505
 506	ctxt->fi.vector = X86_TRAP_PF;
 507	ctxt->fi.error_code = error_code;
 508	ctxt->fi.cr2 = (unsigned long)src;
 509
 510	return ES_EXCEPTION;
 511}
 512
 513static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
 514					   unsigned long vaddr, phys_addr_t *paddr)
 515{
 516	unsigned long va = (unsigned long)vaddr;
 517	unsigned int level;
 518	phys_addr_t pa;
 519	pgd_t *pgd;
 520	pte_t *pte;
 521
 522	pgd = __va(read_cr3_pa());
 523	pgd = &pgd[pgd_index(va)];
 524	pte = lookup_address_in_pgd(pgd, va, &level);
 525	if (!pte) {
 526		ctxt->fi.vector     = X86_TRAP_PF;
 527		ctxt->fi.cr2        = vaddr;
 528		ctxt->fi.error_code = 0;
 529
 530		if (user_mode(ctxt->regs))
 531			ctxt->fi.error_code |= X86_PF_USER;
 532
 533		return ES_EXCEPTION;
 534	}
 535
 536	if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
 537		/* Emulated MMIO to/from encrypted memory not supported */
 538		return ES_UNSUPPORTED;
 539
 540	pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
 541	pa |= va & ~page_level_mask(level);
 542
 543	*paddr = pa;
 544
 545	return ES_OK;
 546}
 547
 548static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
 549{
 550	BUG_ON(size > 4);
 551
 552	if (user_mode(ctxt->regs)) {
 553		struct thread_struct *t = &current->thread;
 554		struct io_bitmap *iobm = t->io_bitmap;
 555		size_t idx;
 556
 557		if (!iobm)
 558			goto fault;
 559
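		/* A set bit in the I/O permission bitmap means access to the port is denied */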
 560		for (idx = port; idx < port + size; ++idx) {
 561			if (test_bit(idx, iobm->bitmap))
 562				goto fault;
 563		}
 564	}
 565
 566	return ES_OK;
 567
 568fault:
 569	ctxt->fi.vector = X86_TRAP_GP;
 570	ctxt->fi.error_code = 0;
 571
 572	return ES_EXCEPTION;
 573}
 574
 575/* Include code shared with pre-decompression boot stage */
 576#include "sev-shared.c"
 577
 578static noinstr void __sev_put_ghcb(struct ghcb_state *state)
 579{
 580	struct sev_es_runtime_data *data;
 581	struct ghcb *ghcb;
 582
 583	WARN_ON(!irqs_disabled());
 584
 585	data = this_cpu_read(runtime_data);
 586	ghcb = &data->ghcb_page;
 587
 588	if (state->ghcb) {
 589		/* Restore GHCB from Backup */
 590		*ghcb = *state->ghcb;
 591		data->backup_ghcb_active = false;
 592		state->ghcb = NULL;
 593	} else {
 594		/*
 595		 * Invalidate the GHCB so a VMGEXIT instruction issued
 596		 * from userspace won't appear to be valid.
 597		 */
 598		vc_ghcb_invalidate(ghcb);
 599		data->ghcb_active = false;
 600	}
 601}
 602
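/*
 * The hypervisor cannot intercept IRET in an SEV-ES guest to detect the end
 * of NMI handling, so the guest signals NMI completion explicitly with the
 * NMI_COMPLETE VMGEXIT, allowing further NMIs to be injected.
 */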
 603void noinstr __sev_es_nmi_complete(void)
 604{
 605	struct ghcb_state state;
 606	struct ghcb *ghcb;
 607
 608	ghcb = __sev_get_ghcb(&state);
 609
 610	vc_ghcb_invalidate(ghcb);
 611	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
 612	ghcb_set_sw_exit_info_1(ghcb, 0);
 613	ghcb_set_sw_exit_info_2(ghcb, 0);
 614
 615	sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
 616	VMGEXIT();
 617
 618	__sev_put_ghcb(&state);
 619}
 620
 621static u64 __init get_secrets_page(void)
 622{
 623	u64 pa_data = boot_params.cc_blob_address;
 624	struct cc_blob_sev_info info;
 625	void *map;
 626
 627	/*
 628	 * The CC blob contains the address of the secrets page, check if the
 629	 * blob is present.
 630	 */
 631	if (!pa_data)
 632		return 0;
 633
 634	map = early_memremap(pa_data, sizeof(info));
 635	if (!map) {
 636		pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n");
 637		return 0;
 638	}
 639	memcpy(&info, map, sizeof(info));
 640	early_memunmap(map, sizeof(info));
 641
 642	/* smoke-test the secrets page passed */
 643	if (!info.secrets_phys || info.secrets_len != PAGE_SIZE)
 644		return 0;
 645
 646	return info.secrets_phys;
 647}
 648
 649static u64 __init get_snp_jump_table_addr(void)
 650{
 651	struct snp_secrets_page_layout *layout;
 652	void __iomem *mem;
 653	u64 pa, addr;
 654
 655	pa = get_secrets_page();
 656	if (!pa)
 657		return 0;
 658
 659	mem = ioremap_encrypted(pa, PAGE_SIZE);
 660	if (!mem) {
 661		pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
 662		return 0;
 663	}
 664
 665	layout = (__force struct snp_secrets_page_layout *)mem;
 666
 667	addr = layout->os_area.ap_jump_table_pa;
 668	iounmap(mem);
 669
 670	return addr;
 671}
 672
 673static u64 __init get_jump_table_addr(void)
 674{
 675	struct ghcb_state state;
 676	unsigned long flags;
 677	struct ghcb *ghcb;
 678	u64 ret = 0;
 679
 680	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
 681		return get_snp_jump_table_addr();
 682
 683	local_irq_save(flags);
 684
 685	ghcb = __sev_get_ghcb(&state);
 686
 687	vc_ghcb_invalidate(ghcb);
 688	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
 689	ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
 690	ghcb_set_sw_exit_info_2(ghcb, 0);
 691
 692	sev_es_wr_ghcb_msr(__pa(ghcb));
 693	VMGEXIT();
 694
 695	if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
 696	    ghcb_sw_exit_info_2_is_valid(ghcb))
 697		ret = ghcb->save.sw_exit_info_2;
 698
 699	__sev_put_ghcb(&state);
 700
 701	local_irq_restore(flags);
 702
 703	return ret;
 704}
 705
 706static void __head
 707early_set_pages_state(unsigned long vaddr, unsigned long paddr,
 708		      unsigned long npages, enum psc_op op)
 709{
 710	unsigned long paddr_end;
 711	u64 val;
 712	int ret;
 713
 714	vaddr = vaddr & PAGE_MASK;
 715
 716	paddr = paddr & PAGE_MASK;
 717	paddr_end = paddr + (npages << PAGE_SHIFT);
 718
 719	while (paddr < paddr_end) {
 720		if (op == SNP_PAGE_STATE_SHARED) {
 721			/* Page validation must be rescinded before changing to shared */
 722			ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false);
 723			if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
 724				goto e_term;
 725		}
 726
 727		/*
 728		 * Use the MSR protocol because this function can be called before
 729		 * the GHCB is established.
 730		 */
 731		sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
 732		VMGEXIT();
 733
 734		val = sev_es_rd_ghcb_msr();
 735
 736		if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
 737			 "Wrong PSC response code: 0x%x\n",
 738			 (unsigned int)GHCB_RESP_CODE(val)))
 739			goto e_term;
 740
 741		if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
 742			 "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
 743			 op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
 744			 paddr, GHCB_MSR_PSC_RESP_VAL(val)))
 745			goto e_term;
 746
 747		if (op == SNP_PAGE_STATE_PRIVATE) {
 748			/* Page validation must be performed after changing to private */
 749			ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true);
 750			if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
 751				goto e_term;
 752		}
 753
 754		vaddr += PAGE_SIZE;
 755		paddr += PAGE_SIZE;
 756	}
 757
 758	return;
 759
 760e_term:
 761	sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
 762}
 763
 764void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
 765					 unsigned long npages)
 766{
 767	/*
 768	 * This can be invoked in early boot while running identity mapped, so
 769	 * use an open coded check for SNP instead of using cc_platform_has().
 770	 * This eliminates worries about jump tables or checking boot_cpu_data
 771	 * in the cc_platform_has() function.
 772	 */
 773	if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
 774		return;
 775
 776	 /*
 777	  * Ask the hypervisor to mark the memory pages as private in the RMP
 778	  * table.
 779	  */
 780	early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
 781}
 782
 783void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
 784					unsigned long npages)
 785{
 786	/*
 787	 * This can be invoked in early boot while running identity mapped, so
 788	 * use an open coded check for SNP instead of using cc_platform_has().
 789	 * This eliminates worries about jump tables or checking boot_cpu_data
 790	 * in the cc_platform_has() function.
 791	 */
 792	if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
 793		return;
 794
 795	 /* Ask hypervisor to mark the memory pages shared in the RMP table. */
 796	early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
 797}
 798
 799static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
 800				       unsigned long vaddr_end, int op)
 801{
 802	struct ghcb_state state;
 803	bool use_large_entry;
 804	struct psc_hdr *hdr;
 805	struct psc_entry *e;
 806	unsigned long flags;
 807	unsigned long pfn;
 808	struct ghcb *ghcb;
 809	int i;
 810
 811	hdr = &data->hdr;
 812	e = data->entries;
 813
 814	memset(data, 0, sizeof(*data));
 815	i = 0;
 816
 817	while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) {
 818		hdr->end_entry = i;
 819
 820		if (is_vmalloc_addr((void *)vaddr)) {
 821			pfn = vmalloc_to_pfn((void *)vaddr);
 822			use_large_entry = false;
 823		} else {
 824			pfn = __pa(vaddr) >> PAGE_SHIFT;
 825			use_large_entry = true;
 826		}
 827
 828		e->gfn = pfn;
 829		e->operation = op;
 830
 831		if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) &&
 832		    (vaddr_end - vaddr) >= PMD_SIZE) {
 833			e->pagesize = RMP_PG_SIZE_2M;
 834			vaddr += PMD_SIZE;
 835		} else {
 836			e->pagesize = RMP_PG_SIZE_4K;
 837			vaddr += PAGE_SIZE;
 838		}
 839
 840		e++;
 841		i++;
 842	}
 843
 844	/* Page validation must be rescinded before changing to shared */
 845	if (op == SNP_PAGE_STATE_SHARED)
 846		pvalidate_pages(data);
 847
 848	local_irq_save(flags);
 849
 850	if (sev_cfg.ghcbs_initialized)
 851		ghcb = __sev_get_ghcb(&state);
 852	else
 853		ghcb = boot_ghcb;
 854
 855	/* Invoke the hypervisor to perform the page state changes */
 856	if (!ghcb || vmgexit_psc(ghcb, data))
 857		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
 858
 859	if (sev_cfg.ghcbs_initialized)
 860		__sev_put_ghcb(&state);
 861
 862	local_irq_restore(flags);
 863
 864	/* Page validation must be performed after changing to private */
 865	if (op == SNP_PAGE_STATE_PRIVATE)
 866		pvalidate_pages(data);
 867
 868	return vaddr;
 869}
 870
 871static void set_pages_state(unsigned long vaddr, unsigned long npages, int op)
 872{
 873	struct snp_psc_desc desc;
 874	unsigned long vaddr_end;
 875
 876	/* Use the MSR protocol when a GHCB is not available. */
 877	if (!boot_ghcb)
 878		return early_set_pages_state(vaddr, __pa(vaddr), npages, op);
 879
 880	vaddr = vaddr & PAGE_MASK;
 881	vaddr_end = vaddr + (npages << PAGE_SHIFT);
 882
 883	while (vaddr < vaddr_end)
 884		vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op);
 885}
 886
 887void snp_set_memory_shared(unsigned long vaddr, unsigned long npages)
 888{
 889	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
 890		return;
 891
 892	set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);
 893}
 894
 895void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
 896{
 897	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
 898		return;
 899
 900	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 901}
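
/*
 * Note: callers normally do not invoke snp_set_memory_shared()/private()
 * directly; the generic set_memory_decrypted()/set_memory_encrypted()
 * helpers perform the RMP page-state change through these functions in
 * addition to updating the C-bit in the page tables.
 */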
 902
 903void snp_accept_memory(phys_addr_t start, phys_addr_t end)
 904{
 905	unsigned long vaddr, npages;
 906
 907	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
 908		return;
 909
 910	vaddr = (unsigned long)__va(start);
 911	npages = (end - start) >> PAGE_SHIFT;
 912
 913	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
 914}
 915
 916static int snp_set_vmsa(void *va, bool vmsa)
 917{
 918	u64 attrs;
 919
 920	/*
 921	 * Running at VMPL0 allows the kernel to change the VMSA bit for a page
 922	 * using the RMPADJUST instruction. However, for the instruction to
 923	 * succeed it must target the permissions of a lesser privileged
 924	 * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
 925	 * instruction in the AMD64 APM Volume 3).
 926	 */
 927	attrs = 1;
 928	if (vmsa)
 929		attrs |= RMPADJUST_VMSA_PAGE_BIT;
 930
 931	return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
 932}
 933
 934#define __ATTR_BASE		(SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
 935#define INIT_CS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
 936#define INIT_DS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)
 937
 938#define INIT_LDTR_ATTRIBS	(SVM_SELECTOR_P_MASK | 2)
 939#define INIT_TR_ATTRIBS		(SVM_SELECTOR_P_MASK | 3)
 940
 941static void *snp_alloc_vmsa_page(void)
 942{
 943	struct page *p;
 944
 945	/*
 946	 * Allocate VMSA page to work around the SNP erratum where the CPU will
 947	 * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB)
  948	 * collides with the RMP entry of a VMSA page. The recommended workaround
 949	 * is to not use a large page.
 950	 *
 951	 * Allocate an 8k page which is also 8k-aligned.
 952	 */
 953	p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
 954	if (!p)
 955		return NULL;
 956
 957	split_page(p, 1);
 958
  959	/* Free the first 4k. It may be 2M/1G aligned and cannot be used; the second page, offset by 4k, never can be. */
 960	__free_page(p);
 961
 962	return page_address(p + 1);
 963}
 964
 965static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
 966{
 967	int err;
 968
 969	err = snp_set_vmsa(vmsa, false);
 970	if (err)
 971		pr_err("clear VMSA page failed (%u), leaking page\n", err);
 972	else
 973		free_page((unsigned long)vmsa);
 974}
 975
 976static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
 977{
 978	struct sev_es_save_area *cur_vmsa, *vmsa;
 979	struct ghcb_state state;
 980	unsigned long flags;
 981	struct ghcb *ghcb;
 982	u8 sipi_vector;
 983	int cpu, ret;
 984	u64 cr4;
 985
 986	/*
  987	 * The hypervisor SNP feature support check has happened earlier; just check
 988	 * the AP_CREATION one here.
 989	 */
 990	if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
 991		return -EOPNOTSUPP;
 992
 993	/*
 994	 * Verify the desired start IP against the known trampoline start IP
 995	 * to catch any future new trampolines that may be introduced that
 996	 * would require a new protected guest entry point.
 997	 */
 998	if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
 999		      "Unsupported SNP start_ip: %lx\n", start_ip))
1000		return -EINVAL;
1001
1002	/* Override start_ip with known protected guest start IP */
1003	start_ip = real_mode_header->sev_es_trampoline_start;
1004
1005	/* Find the logical CPU for the APIC ID */
1006	for_each_present_cpu(cpu) {
1007		if (arch_match_cpu_phys_id(cpu, apic_id))
1008			break;
1009	}
1010	if (cpu >= nr_cpu_ids)
1011		return -EINVAL;
1012
1013	cur_vmsa = per_cpu(sev_vmsa, cpu);
1014
1015	/*
1016	 * A new VMSA is created each time because there is no guarantee that
 1017	 * the current VMSA is the kernel's or that the vCPU is not running. If
 1018	 * an attempt were made to use the current VMSA with a running vCPU, a
1019	 * #VMEXIT of that vCPU would wipe out all of the settings being done
1020	 * here.
1021	 */
1022	vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
1023	if (!vmsa)
1024		return -ENOMEM;
1025
1026	/* CR4 should maintain the MCE value */
1027	cr4 = native_read_cr4() & X86_CR4_MCE;
1028
1029	/* Set the CS value based on the start_ip converted to a SIPI vector */
1030	sipi_vector		= (start_ip >> 12);
1031	vmsa->cs.base		= sipi_vector << 12;
1032	vmsa->cs.limit		= AP_INIT_CS_LIMIT;
1033	vmsa->cs.attrib		= INIT_CS_ATTRIBS;
1034	vmsa->cs.selector	= sipi_vector << 8;
1035
1036	/* Set the RIP value based on start_ip */
1037	vmsa->rip		= start_ip & 0xfff;
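	/*
	 * Illustrative example: for a start_ip of 0x9a000 the SIPI vector is
	 * 0x9a, giving CS.base = 0x9a000, CS.selector = 0x9a00 and RIP = 0x0,
	 * i.e. real-mode execution starts at physical address 0x9a000.
	 */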
1038
1039	/* Set AP INIT defaults as documented in the APM */
1040	vmsa->ds.limit		= AP_INIT_DS_LIMIT;
1041	vmsa->ds.attrib		= INIT_DS_ATTRIBS;
1042	vmsa->es		= vmsa->ds;
1043	vmsa->fs		= vmsa->ds;
1044	vmsa->gs		= vmsa->ds;
1045	vmsa->ss		= vmsa->ds;
1046
1047	vmsa->gdtr.limit	= AP_INIT_GDTR_LIMIT;
1048	vmsa->ldtr.limit	= AP_INIT_LDTR_LIMIT;
1049	vmsa->ldtr.attrib	= INIT_LDTR_ATTRIBS;
1050	vmsa->idtr.limit	= AP_INIT_IDTR_LIMIT;
1051	vmsa->tr.limit		= AP_INIT_TR_LIMIT;
1052	vmsa->tr.attrib		= INIT_TR_ATTRIBS;
1053
1054	vmsa->cr4		= cr4;
1055	vmsa->cr0		= AP_INIT_CR0_DEFAULT;
1056	vmsa->dr7		= DR7_RESET_VALUE;
1057	vmsa->dr6		= AP_INIT_DR6_DEFAULT;
1058	vmsa->rflags		= AP_INIT_RFLAGS_DEFAULT;
1059	vmsa->g_pat		= AP_INIT_GPAT_DEFAULT;
1060	vmsa->xcr0		= AP_INIT_XCR0_DEFAULT;
1061	vmsa->mxcsr		= AP_INIT_MXCSR_DEFAULT;
1062	vmsa->x87_ftw		= AP_INIT_X87_FTW_DEFAULT;
1063	vmsa->x87_fcw		= AP_INIT_X87_FCW_DEFAULT;
1064
1065	/* SVME must be set. */
1066	vmsa->efer		= EFER_SVME;
1067
1068	/*
1069	 * Set the SNP-specific fields for this VMSA:
1070	 *   VMPL level
1071	 *   SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
1072	 */
1073	vmsa->vmpl		= 0;
1074	vmsa->sev_features	= sev_status >> 2;
1075
1076	/* Switch the page over to a VMSA page now that it is initialized */
1077	ret = snp_set_vmsa(vmsa, true);
1078	if (ret) {
1079		pr_err("set VMSA page failed (%u)\n", ret);
1080		free_page((unsigned long)vmsa);
1081
1082		return -EINVAL;
1083	}
1084
1085	/* Issue VMGEXIT AP Creation NAE event */
1086	local_irq_save(flags);
1087
1088	ghcb = __sev_get_ghcb(&state);
1089
1090	vc_ghcb_invalidate(ghcb);
1091	ghcb_set_rax(ghcb, vmsa->sev_features);
1092	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
1093	ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE);
1094	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
1095
1096	sev_es_wr_ghcb_msr(__pa(ghcb));
1097	VMGEXIT();
1098
1099	if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
1100	    lower_32_bits(ghcb->save.sw_exit_info_1)) {
1101		pr_err("SNP AP Creation error\n");
1102		ret = -EINVAL;
1103	}
1104
1105	__sev_put_ghcb(&state);
1106
1107	local_irq_restore(flags);
1108
1109	/* Perform cleanup if there was an error */
1110	if (ret) {
1111		snp_cleanup_vmsa(vmsa);
1112		vmsa = NULL;
1113	}
1114
1115	/* Free up any previous VMSA page */
1116	if (cur_vmsa)
1117		snp_cleanup_vmsa(cur_vmsa);
1118
1119	/* Record the current VMSA page */
1120	per_cpu(sev_vmsa, cpu) = vmsa;
1121
1122	return ret;
1123}
1124
1125void __init snp_set_wakeup_secondary_cpu(void)
1126{
1127	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1128		return;
1129
1130	/*
1131	 * Always set this override if SNP is enabled. This makes it the
1132	 * required method to start APs under SNP. If the hypervisor does
1133	 * not support AP creation, then no APs will be started.
1134	 */
1135	apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit);
1136}
1137
1138int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
1139{
1140	u16 startup_cs, startup_ip;
1141	phys_addr_t jump_table_pa;
1142	u64 jump_table_addr;
1143	u16 __iomem *jump_table;
1144
1145	jump_table_addr = get_jump_table_addr();
1146
1147	/* On UP guests there is no jump table so this is not a failure */
1148	if (!jump_table_addr)
1149		return 0;
1150
1151	/* Check if AP Jump Table is page-aligned */
1152	if (jump_table_addr & ~PAGE_MASK)
1153		return -EINVAL;
1154
1155	jump_table_pa = jump_table_addr & PAGE_MASK;
1156
1157	startup_cs = (u16)(rmh->trampoline_start >> 4);
1158	startup_ip = (u16)(rmh->sev_es_trampoline_start -
1159			   rmh->trampoline_start);
1160
1161	jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
1162	if (!jump_table)
1163		return -EIO;
1164
1165	writew(startup_ip, &jump_table[0]);
1166	writew(startup_cs, &jump_table[1]);
1167
1168	iounmap(jump_table);
1169
1170	return 0;
1171}
1172
1173/*
1174 * This is needed by the OVMF UEFI firmware which will use whatever it finds in
1175 * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
1176 * runtime GHCBs used by the kernel are also mapped in the EFI page-table.
1177 */
1178int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
1179{
1180	struct sev_es_runtime_data *data;
1181	unsigned long address, pflags;
1182	int cpu;
1183	u64 pfn;
1184
1185	if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
1186		return 0;
1187
1188	pflags = _PAGE_NX | _PAGE_RW;
1189
1190	for_each_possible_cpu(cpu) {
1191		data = per_cpu(runtime_data, cpu);
1192
1193		address = __pa(&data->ghcb_page);
1194		pfn = address >> PAGE_SHIFT;
1195
1196		if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
1197			return 1;
1198	}
1199
1200	return 0;
1201}
1202
1203static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
1204{
1205	struct pt_regs *regs = ctxt->regs;
1206	enum es_result ret;
1207	u64 exit_info_1;
1208
 1209	/* Is it a WRMSR? (WRMSR is opcode 0x0F 0x30, RDMSR is 0x0F 0x32) */
1210	exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0;
1211
1212	ghcb_set_rcx(ghcb, regs->cx);
1213	if (exit_info_1) {
1214		ghcb_set_rax(ghcb, regs->ax);
1215		ghcb_set_rdx(ghcb, regs->dx);
1216	}
1217
1218	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0);
1219
1220	if ((ret == ES_OK) && (!exit_info_1)) {
1221		regs->ax = ghcb->save.rax;
1222		regs->dx = ghcb->save.rdx;
1223	}
1224
1225	return ret;
1226}
1227
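/* Register this CPU's runtime GHCB GPA with the hypervisor, as required for SNP guests */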
1228static void snp_register_per_cpu_ghcb(void)
1229{
1230	struct sev_es_runtime_data *data;
1231	struct ghcb *ghcb;
1232
1233	data = this_cpu_read(runtime_data);
1234	ghcb = &data->ghcb_page;
1235
1236	snp_register_ghcb_early(__pa(ghcb));
1237}
1238
1239void setup_ghcb(void)
1240{
1241	if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
1242		return;
1243
1244	/*
1245	 * Check whether the runtime #VC exception handler is active. It uses
1246	 * the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
1247	 *
1248	 * If SNP is active, register the per-CPU GHCB page so that the runtime
1249	 * exception handler can use it.
1250	 */
1251	if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) {
1252		if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1253			snp_register_per_cpu_ghcb();
1254
1255		sev_cfg.ghcbs_initialized = true;
1256
1257		return;
1258	}
1259
1260	/*
1261	 * Make sure the hypervisor talks a supported protocol.
1262	 * This gets called only in the BSP boot phase.
1263	 */
1264	if (!sev_es_negotiate_protocol())
1265		sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
1266
1267	/*
1268	 * Clear the boot_ghcb. The first exception comes in before the bss
1269	 * section is cleared.
1270	 */
1271	memset(&boot_ghcb_page, 0, PAGE_SIZE);
1272
1273	/* Alright - Make the boot-ghcb public */
1274	boot_ghcb = &boot_ghcb_page;
1275
1276	/* SNP guest requires that GHCB GPA must be registered. */
1277	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
1278		snp_register_ghcb_early(__pa(&boot_ghcb_page));
1279}
1280
1281#ifdef CONFIG_HOTPLUG_CPU
1282static void sev_es_ap_hlt_loop(void)
1283{
1284	struct ghcb_state state;
1285	struct ghcb *ghcb;
1286
1287	ghcb = __sev_get_ghcb(&state);
1288
1289	while (true) {
1290		vc_ghcb_invalidate(ghcb);
1291		ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
1292		ghcb_set_sw_exit_info_1(ghcb, 0);
1293		ghcb_set_sw_exit_info_2(ghcb, 0);
1294
1295		sev_es_wr_ghcb_msr(__pa(ghcb));
1296		VMGEXIT();
1297
1298		/* Wakeup signal? */
1299		if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
1300		    ghcb->save.sw_exit_info_2)
1301			break;
1302	}
1303
1304	__sev_put_ghcb(&state);
1305}
1306
1307/*
1308 * Play_dead handler when running under SEV-ES. This is needed because
 1309 * the hypervisor can't deliver a SIPI request to restart the AP.
1310 * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
1311 * hypervisor wakes it up again.
1312 */
1313static void sev_es_play_dead(void)
1314{
1315	play_dead_common();
1316
1317	/* IRQs now disabled */
1318
1319	sev_es_ap_hlt_loop();
1320
1321	/*
1322	 * If we get here, the VCPU was woken up again. Jump to CPU
1323	 * startup code to get it back online.
1324	 */
1325	soft_restart_cpu();
1326}
1327#else  /* CONFIG_HOTPLUG_CPU */
1328#define sev_es_play_dead	native_play_dead
1329#endif /* CONFIG_HOTPLUG_CPU */
1330
1331#ifdef CONFIG_SMP
1332static void __init sev_es_setup_play_dead(void)
1333{
1334	smp_ops.play_dead = sev_es_play_dead;
1335}
1336#else
1337static inline void sev_es_setup_play_dead(void) { }
1338#endif
1339
1340static void __init alloc_runtime_data(int cpu)
1341{
1342	struct sev_es_runtime_data *data;
1343
1344	data = memblock_alloc(sizeof(*data), PAGE_SIZE);
1345	if (!data)
1346		panic("Can't allocate SEV-ES runtime data");
1347
1348	per_cpu(runtime_data, cpu) = data;
1349}
1350
1351static void __init init_ghcb(int cpu)
1352{
1353	struct sev_es_runtime_data *data;
1354	int err;
1355
1356	data = per_cpu(runtime_data, cpu);
1357
1358	err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
1359					 sizeof(data->ghcb_page));
1360	if (err)
1361		panic("Can't map GHCBs unencrypted");
1362
1363	memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
1364
1365	data->ghcb_active = false;
1366	data->backup_ghcb_active = false;
1367}
1368
1369void __init sev_es_init_vc_handling(void)
1370{
1371	int cpu;
1372
1373	BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
1374
1375	if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
1376		return;
1377
1378	if (!sev_es_check_cpu_features())
1379		panic("SEV-ES CPU Features missing");
1380
1381	/*
1382	 * SNP is supported in v2 of the GHCB spec which mandates support for HV
1383	 * features.
1384	 */
1385	if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
1386		sev_hv_features = get_hv_features();
1387
1388		if (!(sev_hv_features & GHCB_HV_FT_SNP))
1389			sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
1390	}
1391
1392	/* Initialize per-cpu GHCB pages */
1393	for_each_possible_cpu(cpu) {
1394		alloc_runtime_data(cpu);
1395		init_ghcb(cpu);
1396	}
1397
1398	sev_es_setup_play_dead();
1399
1400	/* Secondary CPUs use the runtime #VC handler */
1401	initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
1402}
1403
1404static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
1405{
1406	int trapnr = ctxt->fi.vector;
1407
1408	if (trapnr == X86_TRAP_PF)
1409		native_write_cr2(ctxt->fi.cr2);
1410
1411	ctxt->regs->orig_ax = ctxt->fi.error_code;
1412	do_early_exception(ctxt->regs, trapnr);
1413}
1414
1415static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
1416{
1417	long *reg_array;
1418	int offset;
1419
1420	reg_array = (long *)ctxt->regs;
1421	offset    = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
1422
1423	if (offset < 0)
1424		return NULL;
1425
1426	offset /= sizeof(long);
1427
1428	return reg_array + offset;
1429}
1430static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
1431				 unsigned int bytes, bool read)
1432{
1433	u64 exit_code, exit_info_1, exit_info_2;
1434	unsigned long ghcb_pa = __pa(ghcb);
1435	enum es_result res;
1436	phys_addr_t paddr;
1437	void __user *ref;
1438
1439	ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
1440	if (ref == (void __user *)-1L)
1441		return ES_UNSUPPORTED;
1442
1443	exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
1444
1445	res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
1446	if (res != ES_OK) {
1447		if (res == ES_EXCEPTION && !read)
1448			ctxt->fi.error_code |= X86_PF_WRITE;
1449
1450		return res;
1451	}
1452
1453	exit_info_1 = paddr;
1454	/* Can never be greater than 8 */
1455	exit_info_2 = bytes;
1456
1457	ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
1458
1459	return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
1460}
1461
1462/*
1463 * The MOVS instruction has two memory operands, which raises the
1464 * problem that it is not known whether the access to the source or the
1465 * destination caused the #VC exception (and hence whether an MMIO read
1466 * or write operation needs to be emulated).
1467 *
1468 * Instead of playing games with walking page-tables and trying to guess
1469 * whether the source or destination is an MMIO range, split the move
1470 * into two operations, a read and a write with only one memory operand.
1471 * This will cause a nested #VC exception on the MMIO address which can
1472 * then be handled.
1473 *
1474 * This implementation has the benefit that it also supports MOVS where
1475 * source _and_ destination are MMIO regions.
1476 *
1477 * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
1478 * rare operation. If it turns out to be a performance problem the split
1479 * operations can be moved to memcpy_fromio() and memcpy_toio().
1480 */
1481static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
1482					  unsigned int bytes)
1483{
1484	unsigned long ds_base, es_base;
1485	unsigned char *src, *dst;
1486	unsigned char buffer[8];
1487	enum es_result ret;
1488	bool rep;
1489	int off;
1490
1491	ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
1492	es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
1493
1494	if (ds_base == -1L || es_base == -1L) {
1495		ctxt->fi.vector = X86_TRAP_GP;
1496		ctxt->fi.error_code = 0;
1497		return ES_EXCEPTION;
1498	}
1499
1500	src = ds_base + (unsigned char *)ctxt->regs->si;
1501	dst = es_base + (unsigned char *)ctxt->regs->di;
1502
1503	ret = vc_read_mem(ctxt, src, buffer, bytes);
1504	if (ret != ES_OK)
1505		return ret;
1506
1507	ret = vc_write_mem(ctxt, dst, buffer, bytes);
1508	if (ret != ES_OK)
1509		return ret;
1510
1511	if (ctxt->regs->flags & X86_EFLAGS_DF)
1512		off = -bytes;
1513	else
1514		off =  bytes;
1515
1516	ctxt->regs->si += off;
1517	ctxt->regs->di += off;
1518
1519	rep = insn_has_rep_prefix(&ctxt->insn);
1520	if (rep)
1521		ctxt->regs->cx -= 1;
1522
1523	if (!rep || ctxt->regs->cx == 0)
1524		return ES_OK;
1525	else
1526		return ES_RETRY;
1527}
1528
1529static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
1530{
1531	struct insn *insn = &ctxt->insn;
1532	enum insn_mmio_type mmio;
1533	unsigned int bytes = 0;
1534	enum es_result ret;
1535	u8 sign_byte;
1536	long *reg_data;
1537
1538	mmio = insn_decode_mmio(insn, &bytes);
1539	if (mmio == INSN_MMIO_DECODE_FAILED)
1540		return ES_DECODE_FAILED;
1541
1542	if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
1543		reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
1544		if (!reg_data)
1545			return ES_DECODE_FAILED;
1546	}
1547
1548	if (user_mode(ctxt->regs))
1549		return ES_UNSUPPORTED;
1550
1551	switch (mmio) {
1552	case INSN_MMIO_WRITE:
1553		memcpy(ghcb->shared_buffer, reg_data, bytes);
1554		ret = vc_do_mmio(ghcb, ctxt, bytes, false);
1555		break;
1556	case INSN_MMIO_WRITE_IMM:
1557		memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
1558		ret = vc_do_mmio(ghcb, ctxt, bytes, false);
1559		break;
1560	case INSN_MMIO_READ:
1561		ret = vc_do_mmio(ghcb, ctxt, bytes, true);
1562		if (ret)
1563			break;
1564
1565		/* Zero-extend for 32-bit operation */
1566		if (bytes == 4)
1567			*reg_data = 0;
1568
1569		memcpy(reg_data, ghcb->shared_buffer, bytes);
1570		break;
1571	case INSN_MMIO_READ_ZERO_EXTEND:
1572		ret = vc_do_mmio(ghcb, ctxt, bytes, true);
1573		if (ret)
1574			break;
1575
1576		/* Zero extend based on operand size */
1577		memset(reg_data, 0, insn->opnd_bytes);
1578		memcpy(reg_data, ghcb->shared_buffer, bytes);
1579		break;
1580	case INSN_MMIO_READ_SIGN_EXTEND:
1581		ret = vc_do_mmio(ghcb, ctxt, bytes, true);
1582		if (ret)
1583			break;
1584
1585		if (bytes == 1) {
1586			u8 *val = (u8 *)ghcb->shared_buffer;
1587
1588			sign_byte = (*val & 0x80) ? 0xff : 0x00;
1589		} else {
1590			u16 *val = (u16 *)ghcb->shared_buffer;
1591
1592			sign_byte = (*val & 0x8000) ? 0xff : 0x00;
1593		}
1594
1595		/* Sign extend based on operand size */
1596		memset(reg_data, sign_byte, insn->opnd_bytes);
1597		memcpy(reg_data, ghcb->shared_buffer, bytes);
1598		break;
1599	case INSN_MMIO_MOVS:
1600		ret = vc_handle_mmio_movs(ctxt, bytes);
1601		break;
1602	default:
1603		ret = ES_UNSUPPORTED;
1604		break;
1605	}
1606
1607	return ret;
1608}
1609
1610static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
1611					  struct es_em_ctxt *ctxt)
1612{
1613	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
1614	long val, *reg = vc_insn_get_rm(ctxt);
1615	enum es_result ret;
1616
1617	if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
1618		return ES_VMM_ERROR;
1619
1620	if (!reg)
1621		return ES_DECODE_FAILED;
1622
1623	val = *reg;
1624
1625	/* Upper 32 bits must be written as zeroes */
1626	if (val >> 32) {
1627		ctxt->fi.vector = X86_TRAP_GP;
1628		ctxt->fi.error_code = 0;
1629		return ES_EXCEPTION;
1630	}
1631
1632	/* Clear out other reserved bits and set bit 10 */
1633	val = (val & 0xffff23ffL) | BIT(10);
1634
1635	/* Early non-zero writes to DR7 are not supported */
1636	if (!data && (val & ~DR7_RESET_VALUE))
1637		return ES_UNSUPPORTED;
1638
1639	/* Using a value of 0 for ExitInfo1 means RAX holds the value */
1640	ghcb_set_rax(ghcb, val);
1641	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
1642	if (ret != ES_OK)
1643		return ret;
1644
1645	if (data)
1646		data->dr7 = val;
1647
1648	return ES_OK;
1649}
1650
1651static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
1652					 struct es_em_ctxt *ctxt)
1653{
1654	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
1655	long *reg = vc_insn_get_rm(ctxt);
1656
1657	if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
1658		return ES_VMM_ERROR;
1659
1660	if (!reg)
1661		return ES_DECODE_FAILED;
1662
1663	if (data)
1664		*reg = data->dr7;
1665	else
1666		*reg = DR7_RESET_VALUE;
1667
1668	return ES_OK;
1669}
1670
1671static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
1672				       struct es_em_ctxt *ctxt)
1673{
1674	return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
1675}
1676
1677static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
1678{
1679	enum es_result ret;
1680
1681	ghcb_set_rcx(ghcb, ctxt->regs->cx);
1682
1683	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
1684	if (ret != ES_OK)
1685		return ret;
1686
1687	if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
1688		return ES_VMM_ERROR;
1689
1690	ctxt->regs->ax = ghcb->save.rax;
1691	ctxt->regs->dx = ghcb->save.rdx;
1692
1693	return ES_OK;
1694}
1695
1696static enum es_result vc_handle_monitor(struct ghcb *ghcb,
1697					struct es_em_ctxt *ctxt)
1698{
1699	/*
1700	 * Treat it as a NOP and do not leak a physical address to the
1701	 * hypervisor.
1702	 */
1703	return ES_OK;
1704}
1705
1706static enum es_result vc_handle_mwait(struct ghcb *ghcb,
1707				      struct es_em_ctxt *ctxt)
1708{
1709	/* Treat the same as MONITOR/MONITORX */
1710	return ES_OK;
1711}
1712
1713static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
1714					struct es_em_ctxt *ctxt)
1715{
1716	enum es_result ret;
1717
1718	ghcb_set_rax(ghcb, ctxt->regs->ax);
1719	ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
1720
1721	if (x86_platform.hyper.sev_es_hcall_prepare)
1722		x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
1723
1724	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
1725	if (ret != ES_OK)
1726		return ret;
1727
1728	if (!ghcb_rax_is_valid(ghcb))
1729		return ES_VMM_ERROR;
1730
1731	ctxt->regs->ax = ghcb->save.rax;
1732
1733	/*
1734	 * Call sev_es_hcall_finish() after regs->ax is already set.
1735	 * This allows the hypervisor handler to overwrite it again if
1736	 * necessary.
1737	 */
1738	if (x86_platform.hyper.sev_es_hcall_finish &&
1739	    !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
1740		return ES_VMM_ERROR;
1741
1742	return ES_OK;
1743}
1744
1745static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
1746					struct es_em_ctxt *ctxt)
1747{
1748	/*
 1749	 * Calling exc_alignment_check() directly does not work, because it
1750	 * enables IRQs and the GHCB is active. Forward the exception and call
1751	 * it later from vc_forward_exception().
1752	 */
1753	ctxt->fi.vector = X86_TRAP_AC;
1754	ctxt->fi.error_code = 0;
1755	return ES_EXCEPTION;
1756}
1757
1758static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
1759					 struct ghcb *ghcb,
1760					 unsigned long exit_code)
1761{
1762	enum es_result result = vc_check_opcode_bytes(ctxt, exit_code);
1763
1764	if (result != ES_OK)
1765		return result;
1766
1767	switch (exit_code) {
1768	case SVM_EXIT_READ_DR7:
1769		result = vc_handle_dr7_read(ghcb, ctxt);
1770		break;
1771	case SVM_EXIT_WRITE_DR7:
1772		result = vc_handle_dr7_write(ghcb, ctxt);
1773		break;
1774	case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
1775		result = vc_handle_trap_ac(ghcb, ctxt);
1776		break;
1777	case SVM_EXIT_RDTSC:
1778	case SVM_EXIT_RDTSCP:
1779		result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
1780		break;
1781	case SVM_EXIT_RDPMC:
1782		result = vc_handle_rdpmc(ghcb, ctxt);
1783		break;
1784	case SVM_EXIT_INVD:
1785		pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
1786		result = ES_UNSUPPORTED;
1787		break;
1788	case SVM_EXIT_CPUID:
1789		result = vc_handle_cpuid(ghcb, ctxt);
1790		break;
1791	case SVM_EXIT_IOIO:
1792		result = vc_handle_ioio(ghcb, ctxt);
1793		break;
1794	case SVM_EXIT_MSR:
1795		result = vc_handle_msr(ghcb, ctxt);
1796		break;
1797	case SVM_EXIT_VMMCALL:
1798		result = vc_handle_vmmcall(ghcb, ctxt);
1799		break;
1800	case SVM_EXIT_WBINVD:
1801		result = vc_handle_wbinvd(ghcb, ctxt);
1802		break;
1803	case SVM_EXIT_MONITOR:
1804		result = vc_handle_monitor(ghcb, ctxt);
1805		break;
1806	case SVM_EXIT_MWAIT:
1807		result = vc_handle_mwait(ghcb, ctxt);
1808		break;
1809	case SVM_EXIT_NPF:
1810		result = vc_handle_mmio(ghcb, ctxt);
1811		break;
1812	default:
1813		/*
1814		 * Unexpected #VC exception
1815		 */
1816		result = ES_UNSUPPORTED;
1817	}
1818
1819	return result;
1820}
1821
1822static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
1823{
1824	long error_code = ctxt->fi.error_code;
1825	int trapnr = ctxt->fi.vector;
1826
1827	ctxt->regs->orig_ax = ctxt->fi.error_code;
1828
1829	switch (trapnr) {
1830	case X86_TRAP_GP:
1831		exc_general_protection(ctxt->regs, error_code);
1832		break;
1833	case X86_TRAP_UD:
1834		exc_invalid_op(ctxt->regs);
1835		break;
1836	case X86_TRAP_PF:
1837		write_cr2(ctxt->fi.cr2);
1838		exc_page_fault(ctxt->regs, error_code);
1839		break;
1840	case X86_TRAP_AC:
1841		exc_alignment_check(ctxt->regs, error_code);
1842		break;
1843	default:
1844		pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
1845		BUG();
1846	}
1847}
1848
1849static __always_inline bool is_vc2_stack(unsigned long sp)
1850{
1851	return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
1852}
1853
1854static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
1855{
1856	unsigned long sp, prev_sp;
1857
1858	sp      = (unsigned long)regs;
1859	prev_sp = regs->sp;
1860
1861	/*
1862	 * If the code was already executing on the VC2 stack when the #VC
1863	 * happened, let it proceed to the normal handling routine. This way the
1864	 * code executing on the VC2 stack can cause #VC exceptions to get handled.
1865	 */
1866	return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
1867}
1868
1869static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
1870{
1871	struct ghcb_state state;
1872	struct es_em_ctxt ctxt;
1873	enum es_result result;
1874	struct ghcb *ghcb;
1875	bool ret = true;
1876
1877	ghcb = __sev_get_ghcb(&state);
1878
1879	vc_ghcb_invalidate(ghcb);
1880	result = vc_init_em_ctxt(&ctxt, regs, error_code);
1881
1882	if (result == ES_OK)
1883		result = vc_handle_exitcode(&ctxt, ghcb, error_code);
1884
1885	__sev_put_ghcb(&state);
1886
1887	/* Done - now check the result */
1888	switch (result) {
1889	case ES_OK:
1890		vc_finish_insn(&ctxt);
1891		break;
1892	case ES_UNSUPPORTED:
1893		pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
1894				   error_code, regs->ip);
1895		ret = false;
1896		break;
1897	case ES_VMM_ERROR:
1898		pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
1899				   error_code, regs->ip);
1900		ret = false;
1901		break;
1902	case ES_DECODE_FAILED:
1903		pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
1904				   error_code, regs->ip);
1905		ret = false;
1906		break;
1907	case ES_EXCEPTION:
1908		vc_forward_exception(&ctxt);
1909		break;
1910	case ES_RETRY:
1911		/* Nothing to do */
1912		break;
1913	default:
1914		pr_emerg("Unknown result in %s():%d\n", __func__, result);
1915		/*
1916		 * Emulating the instruction which caused the #VC exception
1917		 * failed - can't continue so print debug information
1918		 */
1919		BUG();
1920	}
1921
1922	return ret;
1923}
1924
1925static __always_inline bool vc_is_db(unsigned long error_code)
1926{
1927	return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
1928}
1929
1930/*
1931 * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
1932 * and will panic when an error happens.
1933 */
1934DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
1935{
1936	irqentry_state_t irq_state;
1937
1938	/*
1939	 * With the current implementation it is always possible to switch to a
1940	 * safe stack because #VC exceptions only happen at known places, like
1941	 * intercepted instructions or accesses to MMIO areas/IO ports. They can
1942	 * also happen with code instrumentation when the hypervisor intercepts
1943	 * #DB, but the critical paths are forbidden to be instrumented, so #DB
1944	 * exceptions currently also only happen in safe places.
1945	 *
1946	 * But keep this here in case the noinstr annotations are violated due
1947	 * to a bug elsewhere.
1948	 */
1949	if (unlikely(vc_from_invalid_context(regs))) {
1950		instrumentation_begin();
1951		panic("Can't handle #VC exception from unsupported context\n");
1952		instrumentation_end();
1953	}
1954
1955	/*
1956	 * Handle #DB before calling into !noinstr code to avoid recursive #DB.
1957	 */
1958	if (vc_is_db(error_code)) {
1959		exc_debug(regs);
1960		return;
1961	}
1962
1963	irq_state = irqentry_nmi_enter(regs);
1964
1965	instrumentation_begin();
1966
1967	if (!vc_raw_handle_exception(regs, error_code)) {
1968		/* Show some debug info */
1969		show_regs(regs);
1970
1971		/* Ask the hypervisor to terminate the guest */
1972		sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
1973
1974		/* If that fails and we get here - just panic */
1975		panic("Returned from Terminate-Request to Hypervisor\n");
1976	}
1977
1978	instrumentation_end();
1979	irqentry_nmi_exit(regs, irq_state);
1980}
1981
1982/*
1983 * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
1984 * and will kill the current task with SIGBUS when an error happens.
1985 */
1986DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
1987{
1988	/*
1989	 * Handle #DB before calling into !noinstr code to avoid recursive #DB.
1990	 */
1991	if (vc_is_db(error_code)) {
1992		noist_exc_debug(regs);
1993		return;
1994	}
1995
1996	irqentry_enter_from_user_mode(regs);
1997	instrumentation_begin();
1998
1999	if (!vc_raw_handle_exception(regs, error_code)) {
2000		/*
2001		 * Do not kill the machine if user-space triggered the
2002		 * exception. Send SIGBUS instead and let user-space deal with
2003		 * it.
2004		 */
2005		force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
2006	}
2007
2008	instrumentation_end();
2009	irqentry_exit_to_user_mode(regs);
2010}
2011
2012bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
2013{
2014	unsigned long exit_code = regs->orig_ax;
2015	struct es_em_ctxt ctxt;
2016	enum es_result result;
2017
2018	vc_ghcb_invalidate(boot_ghcb);
2019
2020	result = vc_init_em_ctxt(&ctxt, regs, exit_code);
2021	if (result == ES_OK)
2022		result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
2023
2024	/* Done - now check the result */
2025	switch (result) {
2026	case ES_OK:
2027		vc_finish_insn(&ctxt);
2028		break;
2029	case ES_UNSUPPORTED:
2030		early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
2031				exit_code, regs->ip);
2032		goto fail;
2033	case ES_VMM_ERROR:
2034		early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
2035				exit_code, regs->ip);
2036		goto fail;
2037	case ES_DECODE_FAILED:
2038		early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
2039				exit_code, regs->ip);
2040		goto fail;
2041	case ES_EXCEPTION:
2042		vc_early_forward_exception(&ctxt);
2043		break;
2044	case ES_RETRY:
2045		/* Nothing to do */
2046		break;
2047	default:
2048		BUG();
2049	}
2050
2051	return true;
2052
2053fail:
2054	show_regs(regs);
2055
2056	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
2057}
2058
2059/*
2060 * Initial setup of SNP relies on information provided by the
2061 * Confidential Computing blob, which can be passed to the kernel
2062 * in the following ways, depending on how it is booted:
2063 *
2064 * - when booted via the boot/decompress kernel:
2065 *   - via boot_params
2066 *
2067 * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
2068 *   - via a setup_data entry, as defined by the Linux Boot Protocol
2069 *
2070 * Scan for the blob in that order.
2071 */
2072static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
2073{
2074	struct cc_blob_sev_info *cc_info;
2075
2076	/* Boot kernel would have passed the CC blob via boot_params. */
2077	if (bp->cc_blob_address) {
2078		cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
2079		goto found_cc_info;
2080	}
2081
2082	/*
2083	 * If kernel was booted directly, without the use of the
2084	 * boot/decompression kernel, the CC blob may have been passed via
2085	 * setup_data instead.
2086	 */
2087	cc_info = find_cc_blob_setup_data(bp);
2088	if (!cc_info)
2089		return NULL;
2090
2091found_cc_info:
2092	if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
2093		snp_abort();
2094
2095	return cc_info;
2096}
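
/*
 * Editor's sketch, not part of the kernel source: the setup_data path
 * described above walks the boot_params.hdr.setup_data list looking for a
 * SETUP_CC_BLOB entry. The actual helper, find_cc_blob_setup_data(), is
 * defined elsewhere in the kernel; assuming the struct cc_setup_data
 * layout from <asm/bootparam.h>, such a walk could look roughly like this:
 */
static __maybe_unused struct cc_blob_sev_info *
cc_blob_from_setup_data_sketch(struct boot_params *bp)
{
	struct setup_data *hdr = (struct setup_data *)bp->hdr.setup_data;

	while (hdr) {
		if (hdr->type == SETUP_CC_BLOB) {
			struct cc_setup_data *ccsd = (struct cc_setup_data *)hdr;

			/* The entry carries the physical address of the CC blob */
			return (struct cc_blob_sev_info *)(unsigned long)ccsd->cc_blob_address;
		}
		hdr = (struct setup_data *)hdr->next;
	}

	return NULL;
}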
2097
2098bool __head snp_init(struct boot_params *bp)
2099{
2100	struct cc_blob_sev_info *cc_info;
2101
2102	if (!bp)
2103		return false;
2104
2105	cc_info = find_cc_blob(bp);
2106	if (!cc_info)
2107		return false;
2108
2109	setup_cpuid_table(cc_info);
2110
2111	/*
2112	 * The CC blob will be used later to access the secrets page. Cache
2113	 * it here like the boot kernel does.
2114	 */
2115	bp->cc_blob_address = (u32)(unsigned long)cc_info;
2116
2117	return true;
2118}
2119
2120void __head __noreturn snp_abort(void)
2121{
2122	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
2123}
2124
2125/*
2126 * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are
2127 * enabled, as the alternative (fallback) logic for DMI probing in the legacy
2128 * ROM region can cause a crash since this region is not pre-validated.
2129 */
2130void __init snp_dmi_setup(void)
2131{
2132	if (efi_enabled(EFI_CONFIG_TABLES))
2133		dmi_setup();
2134}
2135
2136static void dump_cpuid_table(void)
2137{
2138	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
2139	int i = 0;
2140
2141	pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
2142		cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);
2143
2144	for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) {
2145		const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
2146
2147		pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
2148			i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
2149			fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
2150	}
2151}
2152
2153/*
2154 * It is useful from an auditing/testing perspective to provide an easy way
2155 * for the guest owner to verify that the CPUID table has been initialized
2156 * as expected. However, that initialization happens too early in boot to
2157 * print any sort of indicator, and there is no better place to do it, so
2158 * report it here.
2159 */
2160static int __init report_cpuid_table(void)
2161{
2162	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
2163
2164	if (!cpuid_table->count)
2165		return 0;
2166
2167	pr_info("Using SNP CPUID table, %d entries present.\n",
2168		cpuid_table->count);
2169
2170	if (sev_cfg.debug)
2171		dump_cpuid_table();
2172
2173	return 0;
2174}
2175arch_initcall(report_cpuid_table);
2176
2177static int __init init_sev_config(char *str)
2178{
2179	char *s;
2180
2181	while ((s = strsep(&str, ","))) {
2182		if (!strcmp(s, "debug")) {
2183			sev_cfg.debug = true;
2184			continue;
2185		}
2186
2187		pr_info("SEV command-line option '%s' was not recognized\n", s);
2188	}
2189
2190	return 1;
2191}
2192__setup("sev=", init_sev_config);
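
/*
 * Editor's note: "sev=" is a kernel command-line parameter. Booting with
 * "sev=debug" sets sev_cfg.debug above, which makes report_cpuid_table()
 * dump the full SNP CPUID table via dump_cpuid_table(); unrecognized
 * options are only reported, not rejected. For example:
 *
 *	vmlinuz ... sev=debug
 */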
2193
2194int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
2195{
2196	struct ghcb_state state;
2197	struct es_em_ctxt ctxt;
2198	unsigned long flags;
2199	struct ghcb *ghcb;
2200	int ret;
2201
2202	rio->exitinfo2 = SEV_RET_NO_FW_CALL;
2203
2204	/*
2205	 * __sev_get_ghcb() needs to run with IRQs disabled because it is using
2206	 * a per-CPU GHCB.
2207	 */
2208	local_irq_save(flags);
2209
2210	ghcb = __sev_get_ghcb(&state);
2211	if (!ghcb) {
2212		ret = -EIO;
2213		goto e_restore_irq;
2214	}
2215
2216	vc_ghcb_invalidate(ghcb);
2217
2218	if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
2219		ghcb_set_rax(ghcb, input->data_gpa);
2220		ghcb_set_rbx(ghcb, input->data_npages);
2221	}
2222
2223	ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa);
2224	if (ret)
2225		goto e_put;
2226
2227	rio->exitinfo2 = ghcb->save.sw_exit_info_2;
2228	switch (rio->exitinfo2) {
2229	case 0:
2230		break;
2231
2232	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
2233		ret = -EAGAIN;
2234		break;
2235
2236	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
2237		/* The number of expected pages is returned in RBX */
2238		if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
2239			input->data_npages = ghcb_get_rbx(ghcb);
2240			ret = -ENOSPC;
2241			break;
2242		}
2243		fallthrough;
2244	default:
2245		ret = -EIO;
2246		break;
2247	}
2248
2249e_put:
2250	__sev_put_ghcb(&state);
2251e_restore_irq:
2252	local_irq_restore(flags);
2253
2254	return ret;
2255}
2256EXPORT_SYMBOL_GPL(snp_issue_guest_request);
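
/*
 * Editor's sketch, not part of the kernel source: a minimal illustration of
 * how a caller might drive snp_issue_guest_request() for the
 * SVM_VMGEXIT_GUEST_REQUEST exit code. It assumes 'req' and 'resp' are
 * page-sized buffers that have already been prepared and made shared with
 * the hypervisor, and that the message contents were encrypted beforehand
 * (as the sev-guest driver does with a VMPCK key). All names below are
 * illustrative only.
 */
static int __maybe_unused snp_guest_request_sketch(void *req, void *resp)
{
	struct snp_guest_request_ioctl rio = {};
	struct snp_req_data input = {};
	int ret;

	input.req_gpa  = __pa(req);
	input.resp_gpa = __pa(resp);

	ret = snp_issue_guest_request(SVM_VMGEXIT_GUEST_REQUEST, &input, &rio);

	/* The hypervisor reported it was busy - the caller should retry */
	if (ret == -EAGAIN)
		return ret;

	if (ret)
		pr_err("SNP guest request failed, exitinfo2 0x%llx\n", rio.exitinfo2);

	return ret;
}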
2257
2258static struct platform_device sev_guest_device = {
2259	.name		= "sev-guest",
2260	.id		= -1,
2261};
2262
2263static int __init snp_init_platform_device(void)
2264{
2265	struct sev_guest_platform_data data;
2266	u64 gpa;
2267
2268	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
2269		return -ENODEV;
2270
2271	gpa = get_secrets_page();
2272	if (!gpa)
2273		return -ENODEV;
2274
2275	data.secrets_gpa = gpa;
2276	if (platform_device_add_data(&sev_guest_device, &data, sizeof(data)))
2277		return -ENODEV;
2278
2279	if (platform_device_register(&sev_guest_device))
2280		return -ENODEV;
2281
2282	pr_info("SNP guest platform device initialized.\n");
2283	return 0;
2284}
2285device_initcall(snp_init_platform_device);
2286
2287void sev_show_status(void)
2288{
2289	int i;
2290
2291	pr_info("Status: ");
2292	for (i = 0; i < MSR_AMD64_SNP_RESV_BIT; i++) {
2293		if (sev_status & BIT_ULL(i)) {
2294			if (!sev_status_feat_names[i])
2295				continue;
2296
2297			pr_cont("%s ", sev_status_feat_names[i]);
2298		}
2299	}
2300	pr_cont("\n");
2301}