   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
   4 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
   5 */
   6
   7#include <linux/bug.h>
   8#include <linux/cpu_pm.h>
   9#include <linux/entry-kvm.h>
  10#include <linux/errno.h>
  11#include <linux/err.h>
  12#include <linux/kvm_host.h>
  13#include <linux/list.h>
  14#include <linux/module.h>
  15#include <linux/vmalloc.h>
  16#include <linux/fs.h>
  17#include <linux/mman.h>
  18#include <linux/sched.h>
  19#include <linux/kmemleak.h>
  20#include <linux/kvm.h>
  21#include <linux/kvm_irqfd.h>
  22#include <linux/irqbypass.h>
  23#include <linux/sched/stat.h>
  24#include <linux/psci.h>
  25#include <trace/events/kvm.h>
  26
  27#define CREATE_TRACE_POINTS
  28#include "trace_arm.h"
  29
  30#include <linux/uaccess.h>
  31#include <asm/ptrace.h>
  32#include <asm/mman.h>
  33#include <asm/tlbflush.h>
  34#include <asm/cacheflush.h>
  35#include <asm/cpufeature.h>
  36#include <asm/virt.h>
  37#include <asm/kvm_arm.h>
  38#include <asm/kvm_asm.h>
  39#include <asm/kvm_mmu.h>
  40#include <asm/kvm_pkvm.h>
  41#include <asm/kvm_emulate.h>
  42#include <asm/sections.h>
  43
  44#include <kvm/arm_hypercalls.h>
  45#include <kvm/arm_pmu.h>
  46#include <kvm/arm_psci.h>
  47
  48static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
  49DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
  50
  51DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
  52
  53DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
  54DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
  55
  56static bool vgic_present;
  57
  58static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
  59DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
  60
  61int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
  62{
  63	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
  64}
  65
  66int kvm_arch_hardware_setup(void *opaque)
  67{
  68	return 0;
  69}
  70
  71int kvm_arch_check_processor_compat(void *opaque)
  72{
  73	return 0;
  74}
  75
  76int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
  77			    struct kvm_enable_cap *cap)
  78{
  79	int r;
  80
  81	if (cap->flags)
  82		return -EINVAL;
  83
  84	switch (cap->cap) {
  85	case KVM_CAP_ARM_NISV_TO_USER:
  86		r = 0;
  87		set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
  88			&kvm->arch.flags);
  89		break;
  90	case KVM_CAP_ARM_MTE:
  91		mutex_lock(&kvm->lock);
  92		if (!system_supports_mte() || kvm->created_vcpus) {
  93			r = -EINVAL;
  94		} else {
  95			r = 0;
  96			set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
  97		}
  98		mutex_unlock(&kvm->lock);
  99		break;
 100	case KVM_CAP_ARM_SYSTEM_SUSPEND:
 101		r = 0;
 102		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
 103		break;
 104	default:
 105		r = -EINVAL;
 106		break;
 107	}
 108
 109	return r;
 110}
 111
 112static int kvm_arm_default_max_vcpus(void)
 113{
 114	return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
 115}
 116
 117static void set_default_spectre(struct kvm *kvm)
 118{
 119	/*
 120	 * The default is to expose CSV2 == 1 if the HW isn't affected.
 121	 * Although this is a per-CPU feature, we make it global because
 122	 * asymmetric systems are just a nuisance.
 123	 *
 124	 * Userspace can override this as long as it doesn't promise
 125	 * the impossible.
 126	 */
 127	if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
 128		kvm->arch.pfr0_csv2 = 1;
 129	if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED)
 130		kvm->arch.pfr0_csv3 = 1;
 131}
 132
 133/**
 134 * kvm_arch_init_vm - initializes a VM data structure
 135 * @kvm:	pointer to the KVM struct
 136 */
 137int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 138{
 139	int ret;
 140
 141	ret = kvm_share_hyp(kvm, kvm + 1);
 142	if (ret)
 143		return ret;
 144
 145	ret = pkvm_init_host_vm(kvm);
 146	if (ret)
 147		goto err_unshare_kvm;
 148
 149	if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
 150		ret = -ENOMEM;
 151		goto err_unshare_kvm;
 152	}
 153	cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
 154
 155	ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
 156	if (ret)
 157		goto err_free_cpumask;
 158
 159	kvm_vgic_early_init(kvm);
 160
 161	/* The maximum number of VCPUs is limited by the host's GIC model */
 162	kvm->max_vcpus = kvm_arm_default_max_vcpus();
 163
 164	set_default_spectre(kvm);
 165	kvm_arm_init_hypercalls(kvm);
 166
 167	/*
 168	 * Initialise the default PMUver before there is a chance to
 169	 * create an actual PMU.
 170	 */
 171	kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit();
 172
 173	return 0;
 174
 175err_free_cpumask:
 176	free_cpumask_var(kvm->arch.supported_cpus);
 177err_unshare_kvm:
 178	kvm_unshare_hyp(kvm, kvm + 1);
 179	return ret;
 180}
 181
 182vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 183{
 184	return VM_FAULT_SIGBUS;
 185}
 186
 187
 188/**
 189 * kvm_arch_destroy_vm - destroy the VM data structure
 190 * @kvm:	pointer to the KVM struct
 191 */
 192void kvm_arch_destroy_vm(struct kvm *kvm)
 193{
 194	bitmap_free(kvm->arch.pmu_filter);
 195	free_cpumask_var(kvm->arch.supported_cpus);
 196
 197	kvm_vgic_destroy(kvm);
 198
 199	if (is_protected_kvm_enabled())
 200		pkvm_destroy_hyp_vm(kvm);
 201
 202	kvm_destroy_vcpus(kvm);
 203
 204	kvm_unshare_hyp(kvm, kvm + 1);
 205}
 206
 207int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 208{
 209	int r;
 210	switch (ext) {
 211	case KVM_CAP_IRQCHIP:
 212		r = vgic_present;
 213		break;
 214	case KVM_CAP_IOEVENTFD:
 215	case KVM_CAP_DEVICE_CTRL:
 216	case KVM_CAP_USER_MEMORY:
 217	case KVM_CAP_SYNC_MMU:
 218	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
 219	case KVM_CAP_ONE_REG:
 220	case KVM_CAP_ARM_PSCI:
 221	case KVM_CAP_ARM_PSCI_0_2:
 222	case KVM_CAP_READONLY_MEM:
 223	case KVM_CAP_MP_STATE:
 224	case KVM_CAP_IMMEDIATE_EXIT:
 225	case KVM_CAP_VCPU_EVENTS:
 226	case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
 227	case KVM_CAP_ARM_NISV_TO_USER:
 228	case KVM_CAP_ARM_INJECT_EXT_DABT:
 229	case KVM_CAP_SET_GUEST_DEBUG:
 230	case KVM_CAP_VCPU_ATTRIBUTES:
 231	case KVM_CAP_PTP_KVM:
 232	case KVM_CAP_ARM_SYSTEM_SUSPEND:
 233		r = 1;
 234		break;
 235	case KVM_CAP_SET_GUEST_DEBUG2:
 236		return KVM_GUESTDBG_VALID_MASK;
 237	case KVM_CAP_ARM_SET_DEVICE_ADDR:
 238		r = 1;
 239		break;
 240	case KVM_CAP_NR_VCPUS:
 241		/*
 242		 * ARM64 treats KVM_CAP_NR_VCPUS differently from all other
 243		 * architectures, as it does not always bound it to
 244		 * KVM_CAP_MAX_VCPUS. It should not matter much because
 245		 * this is just an advisory value.
 246		 */
 247		r = min_t(unsigned int, num_online_cpus(),
 248			  kvm_arm_default_max_vcpus());
 249		break;
 250	case KVM_CAP_MAX_VCPUS:
 251	case KVM_CAP_MAX_VCPU_ID:
 252		if (kvm)
 253			r = kvm->max_vcpus;
 254		else
 255			r = kvm_arm_default_max_vcpus();
 256		break;
 257	case KVM_CAP_MSI_DEVID:
 258		if (!kvm)
 259			r = -EINVAL;
 260		else
 261			r = kvm->arch.vgic.msis_require_devid;
 262		break;
 263	case KVM_CAP_ARM_USER_IRQ:
 264		/*
 265		 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
 266		 * (bump this number if adding more devices)
 267		 */
 268		r = 1;
 269		break;
 270	case KVM_CAP_ARM_MTE:
 271		r = system_supports_mte();
 272		break;
 273	case KVM_CAP_STEAL_TIME:
 274		r = kvm_arm_pvtime_supported();
 275		break;
 276	case KVM_CAP_ARM_EL1_32BIT:
 277		r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1);
 278		break;
 279	case KVM_CAP_GUEST_DEBUG_HW_BPS:
 280		r = get_num_brps();
 281		break;
 282	case KVM_CAP_GUEST_DEBUG_HW_WPS:
 283		r = get_num_wrps();
 284		break;
 285	case KVM_CAP_ARM_PMU_V3:
 286		r = kvm_arm_support_pmu_v3();
 287		break;
 288	case KVM_CAP_ARM_INJECT_SERROR_ESR:
 289		r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
 290		break;
 291	case KVM_CAP_ARM_VM_IPA_SIZE:
 292		r = get_kvm_ipa_limit();
 293		break;
 294	case KVM_CAP_ARM_SVE:
 295		r = system_supports_sve();
 296		break;
 297	case KVM_CAP_ARM_PTRAUTH_ADDRESS:
 298	case KVM_CAP_ARM_PTRAUTH_GENERIC:
 299		r = system_has_full_ptr_auth();
 300		break;
 301	default:
 302		r = 0;
 303	}
 304
 305	return r;
 306}
 307
 308long kvm_arch_dev_ioctl(struct file *filp,
 309			unsigned int ioctl, unsigned long arg)
 310{
 311	return -EINVAL;
 312}
 313
 314struct kvm *kvm_arch_alloc_vm(void)
 315{
 316	size_t sz = sizeof(struct kvm);
 317
 318	if (!has_vhe())
 319		return kzalloc(sz, GFP_KERNEL_ACCOUNT);
 320
 321	return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
 322}
 323
 324int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 325{
 326	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
 327		return -EBUSY;
 328
 329	if (id >= kvm->max_vcpus)
 330		return -EINVAL;
 331
 332	return 0;
 333}
 334
 335int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 336{
 337	int err;
 338
 339	/* Force users to call KVM_ARM_VCPU_INIT */
 340	vcpu->arch.target = -1;
 341	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
 342
 343	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
 344
 345	/*
 346	 * Default value for the FP state, will be overloaded at load
 347	 * time if we support FP (pretty likely)
 348	 */
 349	vcpu->arch.fp_state = FP_STATE_FREE;
 350
 351	/* Set up the timer */
 352	kvm_timer_vcpu_init(vcpu);
 353
 354	kvm_pmu_vcpu_init(vcpu);
 355
 356	kvm_arm_reset_debug_ptr(vcpu);
 357
 358	kvm_arm_pvtime_vcpu_init(&vcpu->arch);
 359
 360	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
 361
 362	err = kvm_vgic_vcpu_init(vcpu);
 363	if (err)
 364		return err;
 365
 366	return kvm_share_hyp(vcpu, vcpu + 1);
 367}
 368
 369void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 370{
 371}
 372
 373void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 374{
 375	if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm)))
 376		static_branch_dec(&userspace_irqchip_in_use);
 377
 378	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
 379	kvm_timer_vcpu_terminate(vcpu);
 380	kvm_pmu_vcpu_destroy(vcpu);
 381
 382	kvm_arm_vcpu_destroy(vcpu);
 383}
 384
 385void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 386{
 387
 388}
 389
 390void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
 391{
 392
 393}
 394
 395void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 396{
 397	struct kvm_s2_mmu *mmu;
 398	int *last_ran;
 399
 400	mmu = vcpu->arch.hw_mmu;
 401	last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
 402
 403	/*
 404	 * We guarantee that both TLBs and I-cache are private to each
 405	 * vcpu. If we detect that a vcpu from the same VM has
 406	 * previously run on the same physical CPU, call into the
 407	 * hypervisor code to nuke the relevant contexts.
 408	 *
 409	 * We might get preempted before the vCPU actually runs, but
 410	 * over-invalidation doesn't affect correctness.
 411	 */
 412	if (*last_ran != vcpu->vcpu_id) {
 413		kvm_call_hyp(__kvm_flush_cpu_context, mmu);
 414		*last_ran = vcpu->vcpu_id;
 415	}
 416
 417	vcpu->cpu = cpu;
 418
 419	kvm_vgic_load(vcpu);
 420	kvm_timer_vcpu_load(vcpu);
 421	if (has_vhe())
 422		kvm_vcpu_load_sysregs_vhe(vcpu);
 423	kvm_arch_vcpu_load_fp(vcpu);
 424	kvm_vcpu_pmu_restore_guest(vcpu);
 425	if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
 426		kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
 427
 428	if (single_task_running())
 429		vcpu_clear_wfx_traps(vcpu);
 430	else
 431		vcpu_set_wfx_traps(vcpu);
 432
 433	if (vcpu_has_ptrauth(vcpu))
 434		vcpu_ptrauth_disable(vcpu);
 435	kvm_arch_vcpu_load_debug_state_flags(vcpu);
 436
 437	if (!cpumask_test_cpu(smp_processor_id(), vcpu->kvm->arch.supported_cpus))
 438		vcpu_set_on_unsupported_cpu(vcpu);
 439}
 440
 441void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 442{
 443	kvm_arch_vcpu_put_debug_state_flags(vcpu);
 444	kvm_arch_vcpu_put_fp(vcpu);
 445	if (has_vhe())
 446		kvm_vcpu_put_sysregs_vhe(vcpu);
 447	kvm_timer_vcpu_put(vcpu);
 448	kvm_vgic_put(vcpu);
 449	kvm_vcpu_pmu_restore_host(vcpu);
 450	kvm_arm_vmid_clear_active();
 451
 452	vcpu_clear_on_unsupported_cpu(vcpu);
 453	vcpu->cpu = -1;
 454}
 455
 456void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
 457{
 458	vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
 459	kvm_make_request(KVM_REQ_SLEEP, vcpu);
 460	kvm_vcpu_kick(vcpu);
 461}
 462
 463bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
 464{
 465	return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED;
 466}
 467
 468static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
 469{
 470	vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED;
 471	kvm_make_request(KVM_REQ_SUSPEND, vcpu);
 472	kvm_vcpu_kick(vcpu);
 473}
 474
 475static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
 476{
 477	return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED;
 478}
 479
 480int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 481				    struct kvm_mp_state *mp_state)
 482{
 483	*mp_state = vcpu->arch.mp_state;
 484
 485	return 0;
 486}
 487
 488int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 489				    struct kvm_mp_state *mp_state)
 490{
 491	int ret = 0;
 492
 493	switch (mp_state->mp_state) {
 494	case KVM_MP_STATE_RUNNABLE:
 495		vcpu->arch.mp_state = *mp_state;
 496		break;
 497	case KVM_MP_STATE_STOPPED:
 498		kvm_arm_vcpu_power_off(vcpu);
 499		break;
 500	case KVM_MP_STATE_SUSPENDED:
 501		kvm_arm_vcpu_suspend(vcpu);
 502		break;
 503	default:
 504		ret = -EINVAL;
 505	}
 506
 507	return ret;
 508}
 509
 510/**
 511 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
 512 * @v:		The VCPU pointer
 513 *
 514 * If the guest CPU is not waiting for interrupts or an interrupt line is
 515 * asserted, the CPU is by definition runnable.
 516 */
 517int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 518{
 519	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
 520	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
 521		&& !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
 522}
 523
 524bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
 525{
 526	return vcpu_mode_priv(vcpu);
 527}
 528
 529#ifdef CONFIG_GUEST_PERF_EVENTS
 530unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
 531{
 532	return *vcpu_pc(vcpu);
 533}
 534#endif
 535
 536static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
 537{
 538	return vcpu->arch.target >= 0;
 539}
 540
 541/*
 542 * Handle both the initialisation that is being done when the vcpu is
 543 * run for the first time, as well as the updates that must be
 544 * performed each time we get a new thread dealing with this vcpu.
 545 */
 546int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 547{
 548	struct kvm *kvm = vcpu->kvm;
 549	int ret;
 550
 551	if (!kvm_vcpu_initialized(vcpu))
 552		return -ENOEXEC;
 553
 554	if (!kvm_arm_vcpu_is_finalized(vcpu))
 555		return -EPERM;
 556
 557	ret = kvm_arch_vcpu_run_map_fp(vcpu);
 558	if (ret)
 559		return ret;
 560
 561	if (likely(vcpu_has_run_once(vcpu)))
 562		return 0;
 563
 564	kvm_arm_vcpu_init_debug(vcpu);
 565
 566	if (likely(irqchip_in_kernel(kvm))) {
 567		/*
 568		 * Map the VGIC hardware resources before running a vcpu the
 569		 * first time on this VM.
 570		 */
 571		ret = kvm_vgic_map_resources(kvm);
 572		if (ret)
 573			return ret;
 574	}
 575
 576	ret = kvm_timer_enable(vcpu);
 577	if (ret)
 578		return ret;
 579
 580	ret = kvm_arm_pmu_v3_enable(vcpu);
 581	if (ret)
 582		return ret;
 583
 584	if (is_protected_kvm_enabled()) {
 585		ret = pkvm_create_hyp_vm(kvm);
 586		if (ret)
 587			return ret;
 588	}
 589
 590	if (!irqchip_in_kernel(kvm)) {
 591		/*
 592		 * Tell the rest of the code that there are userspace irqchip
 593		 * VMs in the wild.
 594		 */
 595		static_branch_inc(&userspace_irqchip_in_use);
 596	}
 597
 598	/*
 599	 * Initialize traps for protected VMs.
 600	 * NOTE: Move to run in EL2 directly, rather than via a hypercall, once
 601	 * the code is in place for first run initialization at EL2.
 602	 */
 603	if (kvm_vm_is_protected(kvm))
 604		kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu);
 605
 606	mutex_lock(&kvm->lock);
 607	set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
 608	mutex_unlock(&kvm->lock);
 609
 610	return ret;
 611}
 612
 613bool kvm_arch_intc_initialized(struct kvm *kvm)
 614{
 615	return vgic_initialized(kvm);
 616}
 617
 618void kvm_arm_halt_guest(struct kvm *kvm)
 619{
 620	unsigned long i;
 621	struct kvm_vcpu *vcpu;
 622
 623	kvm_for_each_vcpu(i, vcpu, kvm)
 624		vcpu->arch.pause = true;
 625	kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
 626}
 627
 628void kvm_arm_resume_guest(struct kvm *kvm)
 629{
 630	unsigned long i;
 631	struct kvm_vcpu *vcpu;
 632
 633	kvm_for_each_vcpu(i, vcpu, kvm) {
 634		vcpu->arch.pause = false;
 635		__kvm_vcpu_wake_up(vcpu);
 636	}
 637}
 638
 639static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
 640{
 641	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
 642
 643	rcuwait_wait_event(wait,
 644			   (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
 645			   TASK_INTERRUPTIBLE);
 646
 647	if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
 648		/* Awaken to handle a signal, request we sleep again later. */
 649		kvm_make_request(KVM_REQ_SLEEP, vcpu);
 650	}
 651
 652	/*
 653	 * Make sure we will observe a potential reset request if we've
 654	 * observed a change to the power state. Pairs with the smp_wmb() in
 655	 * kvm_psci_vcpu_on().
 656	 */
 657	smp_rmb();
 658}
 659
 660/**
 661 * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior
 662 * @vcpu:	The VCPU pointer
 663 *
 664 * Suspend execution of a vCPU until a valid wake event is detected, i.e. until
 665 * the vCPU is runnable.  The vCPU may or may not be scheduled out, depending
 666 * on when a wake event arrives, e.g. there may already be a pending wake event.
 667 */
 668void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
 669{
 670	/*
 671	 * Sync back the state of the GIC CPU interface so that we have
 672	 * the latest PMR and group enables. This ensures that
 673	 * kvm_arch_vcpu_runnable has up-to-date data to decide whether
 674	 * we have pending interrupts, e.g. when determining if the
 675	 * vCPU should block.
 676	 *
 677	 * For the same reason, we want to tell GICv4 that we need
 678	 * doorbells to be signalled, should an interrupt become pending.
 679	 */
 680	preempt_disable();
 681	kvm_vgic_vmcr_sync(vcpu);
 682	vgic_v4_put(vcpu, true);
 683	preempt_enable();
 684
 685	kvm_vcpu_halt(vcpu);
 686	vcpu_clear_flag(vcpu, IN_WFIT);
 687
 688	preempt_disable();
 689	vgic_v4_load(vcpu);
 690	preempt_enable();
 691}
 692
 693static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
 694{
 695	if (!kvm_arm_vcpu_suspended(vcpu))
 696		return 1;
 697
 698	kvm_vcpu_wfi(vcpu);
 699
 700	/*
 701	 * The suspend state is sticky; we do not leave it until userspace
 702	 * explicitly marks the vCPU as runnable. Request that we suspend again
 703	 * later.
 704	 */
 705	kvm_make_request(KVM_REQ_SUSPEND, vcpu);
 706
 707	/*
 708	 * Check to make sure the vCPU is actually runnable. If so, exit to
 709	 * userspace informing it of the wakeup condition.
 710	 */
 711	if (kvm_arch_vcpu_runnable(vcpu)) {
 712		memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
 713		vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
 714		vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
 715		return 0;
 716	}
 717
 718	/*
 719	 * Otherwise, we were unblocked to process a different event, such as a
 720	 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
 721	 * process the event.
 722	 */
 723	return 1;
 724}
 725
 726/**
 727 * check_vcpu_requests - check and handle pending vCPU requests
 728 * @vcpu:	the VCPU pointer
 729 *
 730 * Return: 1 if we should enter the guest
 731 *	   0 if we should exit to userspace
 732 *	   < 0 if we should exit to userspace, where the return value indicates
 733 *	   an error
 734 */
 735static int check_vcpu_requests(struct kvm_vcpu *vcpu)
 736{
 737	if (kvm_request_pending(vcpu)) {
 738		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
 739			kvm_vcpu_sleep(vcpu);
 740
 741		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
 742			kvm_reset_vcpu(vcpu);
 743
 744		/*
 745		 * Clear IRQ_PENDING requests that were made to guarantee
 746		 * that a VCPU sees new virtual interrupts.
 747		 */
 748		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
 749
 750		if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
 751			kvm_update_stolen_time(vcpu);
 752
 753		if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
 754			/* The distributor enable bits were changed */
 755			preempt_disable();
 756			vgic_v4_put(vcpu, false);
 757			vgic_v4_load(vcpu);
 758			preempt_enable();
 759		}
 760
 761		if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
 762			kvm_pmu_handle_pmcr(vcpu,
 763					    __vcpu_sys_reg(vcpu, PMCR_EL0));
 764
 765		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
 766			return kvm_vcpu_suspend(vcpu);
 767
 768		if (kvm_dirty_ring_check_request(vcpu))
 769			return 0;
 770	}
 771
 772	return 1;
 773}
 774
 775static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
 776{
 777	if (likely(!vcpu_mode_is_32bit(vcpu)))
 778		return false;
 779
 780	return !kvm_supports_32bit_el0();
 781}
 782
 783/**
 784 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
 785 * @vcpu:	The VCPU pointer
 786 * @ret:	Pointer to write optional return code
 787 *
 788 * Returns: true if the VCPU needs to return to a preemptible + interruptible
 789 *	    kernel context and skip guest entry.
 790 *
 791 * This function disambiguates between two different types of exits: exits to a
 792 * preemptible + interruptible kernel context and exits to userspace. For an
 793 * exit to userspace, this function will write the return code to ret and return
 794 * true. For an exit to preemptible + interruptible kernel context (i.e. check
 795 * for pending work and re-enter), return true without writing to ret.
 796 */
 797static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
 798{
 799	struct kvm_run *run = vcpu->run;
 800
 801	/*
 802	 * If we're using a userspace irqchip, then check if we need
 803	 * to tell a userspace irqchip about timer or PMU level
 804	 * changes and if so, exit to userspace (the actual level
 805	 * state gets updated in kvm_timer_update_run and
 806	 * kvm_pmu_update_run below).
 807	 */
 808	if (static_branch_unlikely(&userspace_irqchip_in_use)) {
 809		if (kvm_timer_should_notify_user(vcpu) ||
 810		    kvm_pmu_should_notify_user(vcpu)) {
 811			*ret = -EINTR;
 812			run->exit_reason = KVM_EXIT_INTR;
 813			return true;
 814		}
 815	}
 816
 817	if (unlikely(vcpu_on_unsupported_cpu(vcpu))) {
 818		run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 819		run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED;
 820		run->fail_entry.cpu = smp_processor_id();
 821		*ret = 0;
 822		return true;
 823	}
 824
 825	return kvm_request_pending(vcpu) ||
 826			xfer_to_guest_mode_work_pending();
 827}
 828
 829/*
 830 * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
 831 * the vCPU is running.
 832 *
 833 * This must be noinstr as instrumentation may make use of RCU, and this is not
 834 * safe during the EQS.
 835 */
 836static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
 837{
 838	int ret;
 839
 840	guest_state_enter_irqoff();
 841	ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
 842	guest_state_exit_irqoff();
 843
 844	return ret;
 845}
 846
 847/**
 848 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
 849 * @vcpu:	The VCPU pointer
 850 *
 851 * This function is called through the VCPU_RUN ioctl from user space. It
 852 * will execute VM code in a loop until the time slice for the process is used
 853 * up or some emulation is needed from user space, in which case the function
 854 * will return 0 with the kvm_run structure filled in with the required data
 855 * for the requested emulation.
 856 */
 857int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 858{
 859	struct kvm_run *run = vcpu->run;
 860	int ret;
 861
 862	if (run->exit_reason == KVM_EXIT_MMIO) {
 863		ret = kvm_handle_mmio_return(vcpu);
 864		if (ret)
 865			return ret;
 866	}
 867
 868	vcpu_load(vcpu);
 869
 870	if (run->immediate_exit) {
 871		ret = -EINTR;
 872		goto out;
 873	}
 874
 875	kvm_sigset_activate(vcpu);
 876
 877	ret = 1;
 878	run->exit_reason = KVM_EXIT_UNKNOWN;
 879	run->flags = 0;
 880	while (ret > 0) {
 881		/*
 882		 * Check conditions before entering the guest
 883		 */
 884		ret = xfer_to_guest_mode_handle_work(vcpu);
 885		if (!ret)
 886			ret = 1;
 887
 888		if (ret > 0)
 889			ret = check_vcpu_requests(vcpu);
 890
 891		/*
 892		 * Preparing the interrupts to be injected also
 893		 * involves poking the GIC, which must be done in a
 894		 * non-preemptible context.
 895		 */
 896		preempt_disable();
 897
 898		/*
 899		 * The VMID allocator only tracks active VMIDs per
 900		 * physical CPU, and therefore the VMID allocated may not be
 901		 * preserved on VMID roll-over if the task was preempted,
 902		 * making a thread's VMID inactive. So we need to call
 903		 * kvm_arm_vmid_update() in non-preemptible context.
 904		 */
 905		kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid);
 906
 907		kvm_pmu_flush_hwstate(vcpu);
 908
 909		local_irq_disable();
 910
 911		kvm_vgic_flush_hwstate(vcpu);
 912
 913		kvm_pmu_update_vcpu_events(vcpu);
 914
 915		/*
 916		 * Ensure we set mode to IN_GUEST_MODE after we disable
 917		 * interrupts and before the final VCPU requests check.
 918		 * See the comment in kvm_vcpu_exiting_guest_mode() and
 919		 * Documentation/virt/kvm/vcpu-requests.rst
 920		 */
 921		smp_store_mb(vcpu->mode, IN_GUEST_MODE);
 922
 923		if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
 924			vcpu->mode = OUTSIDE_GUEST_MODE;
 925			isb(); /* Ensure work in x_flush_hwstate is committed */
 926			kvm_pmu_sync_hwstate(vcpu);
 927			if (static_branch_unlikely(&userspace_irqchip_in_use))
 928				kvm_timer_sync_user(vcpu);
 929			kvm_vgic_sync_hwstate(vcpu);
 930			local_irq_enable();
 931			preempt_enable();
 932			continue;
 933		}
 934
 935		kvm_arm_setup_debug(vcpu);
 936		kvm_arch_vcpu_ctxflush_fp(vcpu);
 937
 938		/**************************************************************
 939		 * Enter the guest
 940		 */
 941		trace_kvm_entry(*vcpu_pc(vcpu));
 942		guest_timing_enter_irqoff();
 943
 944		ret = kvm_arm_vcpu_enter_exit(vcpu);
 945
 946		vcpu->mode = OUTSIDE_GUEST_MODE;
 947		vcpu->stat.exits++;
 948		/*
 949		 * Back from guest
 950		 *************************************************************/
 951
 952		kvm_arm_clear_debug(vcpu);
 953
 954		/*
 955		 * We must sync the PMU state before the vgic state so
 956		 * that the vgic can properly sample the updated state of the
 957		 * interrupt line.
 958		 */
 959		kvm_pmu_sync_hwstate(vcpu);
 960
 961		/*
 962		 * Sync the vgic state before syncing the timer state because
 963		 * the timer code needs to know if the virtual timer
 964		 * interrupts are active.
 965		 */
 966		kvm_vgic_sync_hwstate(vcpu);
 967
 968		/*
 969		 * Sync the timer hardware state before enabling interrupts as
 970		 * we don't want vtimer interrupts to race with syncing the
 971		 * timer virtual interrupt state.
 972		 */
 973		if (static_branch_unlikely(&userspace_irqchip_in_use))
 974			kvm_timer_sync_user(vcpu);
 975
 976		kvm_arch_vcpu_ctxsync_fp(vcpu);
 977
 978		/*
 979		 * We must ensure that any pending interrupts are taken before
 980		 * we exit guest timing so that timer ticks are accounted as
 981		 * guest time. Transiently unmask interrupts so that any
 982		 * pending interrupts are taken.
 983		 *
 984		 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
 985		 * context synchronization event) is necessary to ensure that
 986		 * pending interrupts are taken.
 987		 */
 988		if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) {
 989			local_irq_enable();
 990			isb();
 991			local_irq_disable();
 992		}
 993
 994		guest_timing_exit_irqoff();
 995
 996		local_irq_enable();
 997
 998		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 999
1000		/* Exit types that need handling before we can be preempted */
1001		handle_exit_early(vcpu, ret);
1002
1003		preempt_enable();
1004
1005		/*
1006		 * The ARMv8 architecture doesn't give the hypervisor
1007		 * a mechanism to prevent a guest from dropping to AArch32 EL0
1008		 * if implemented by the CPU. If we spot the guest in such a
1009		 * state and decide it wasn't supposed to do so (like
1010		 * with the asymmetric AArch32 case), return to userspace with
1011		 * a fatal error.
1012		 */
1013		if (vcpu_mode_is_bad_32bit(vcpu)) {
1014			/*
1015			 * As we have caught the guest red-handed, decide that
1016			 * it isn't fit for purpose anymore by making the vcpu
1017			 * invalid. The VMM can try and fix it by issuing a
1018			 * KVM_ARM_VCPU_INIT if it really wants to.
1019			 */
1020			vcpu->arch.target = -1;
1021			ret = ARM_EXCEPTION_IL;
1022		}
1023
1024		ret = handle_exit(vcpu, ret);
1025	}
1026
1027	/* Tell userspace about in-kernel device output levels */
1028	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
1029		kvm_timer_update_run(vcpu);
1030		kvm_pmu_update_run(vcpu);
1031	}
1032
1033	kvm_sigset_deactivate(vcpu);
1034
1035out:
1036	/*
1037	 * In the unlikely event that we are returning to userspace
1038	 * with pending exceptions or PC adjustment, commit these
1039	 * adjustments in order to give userspace a consistent view of
1040	 * the vcpu state. Note that this relies on __kvm_adjust_pc()
1041	 * being preempt-safe on VHE.
1042	 */
1043	if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) ||
1044		     vcpu_get_flag(vcpu, INCREMENT_PC)))
1045		kvm_call_hyp(__kvm_adjust_pc, vcpu);
1046
1047	vcpu_put(vcpu);
1048	return ret;
1049}
1050
1051static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
1052{
1053	int bit_index;
1054	bool set;
1055	unsigned long *hcr;
1056
1057	if (number == KVM_ARM_IRQ_CPU_IRQ)
1058		bit_index = __ffs(HCR_VI);
1059	else /* KVM_ARM_IRQ_CPU_FIQ */
1060		bit_index = __ffs(HCR_VF);
1061
1062	hcr = vcpu_hcr(vcpu);
1063	if (level)
1064		set = test_and_set_bit(bit_index, hcr);
1065	else
1066		set = test_and_clear_bit(bit_index, hcr);
1067
1068	/*
1069	 * If we didn't change anything, no need to wake up or kick other CPUs
1070	 */
1071	if (set == level)
1072		return 0;
1073
1074	/*
1075	 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
1076	 * trigger a world-switch round on the running physical CPU to set the
1077	 * virtual IRQ/FIQ fields in the HCR appropriately.
1078	 */
1079	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
1080	kvm_vcpu_kick(vcpu);
1081
1082	return 0;
1083}
1084
1085int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
1086			  bool line_status)
1087{
1088	u32 irq = irq_level->irq;
1089	unsigned int irq_type, vcpu_idx, irq_num;
1090	int nrcpus = atomic_read(&kvm->online_vcpus);
1091	struct kvm_vcpu *vcpu = NULL;
1092	bool level = irq_level->level;
1093
1094	irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
1095	vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
1096	vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
1097	irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
1098
1099	trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
1100
1101	switch (irq_type) {
1102	case KVM_ARM_IRQ_TYPE_CPU:
1103		if (irqchip_in_kernel(kvm))
1104			return -ENXIO;
1105
1106		if (vcpu_idx >= nrcpus)
1107			return -EINVAL;
1108
1109		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
1110		if (!vcpu)
1111			return -EINVAL;
1112
1113		if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
1114			return -EINVAL;
1115
1116		return vcpu_interrupt_line(vcpu, irq_num, level);
1117	case KVM_ARM_IRQ_TYPE_PPI:
1118		if (!irqchip_in_kernel(kvm))
1119			return -ENXIO;
1120
1121		if (vcpu_idx >= nrcpus)
1122			return -EINVAL;
1123
1124		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
1125		if (!vcpu)
1126			return -EINVAL;
1127
1128		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
1129			return -EINVAL;
1130
1131		return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
1132	case KVM_ARM_IRQ_TYPE_SPI:
1133		if (!irqchip_in_kernel(kvm))
1134			return -ENXIO;
1135
1136		if (irq_num < VGIC_NR_PRIVATE_IRQS)
1137			return -EINVAL;
1138
1139		return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
1140	}
1141
1142	return -EINVAL;
1143}
1144
1145static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1146			       const struct kvm_vcpu_init *init)
1147{
1148	unsigned int i, ret;
1149	u32 phys_target = kvm_target_cpu();
1150
1151	if (init->target != phys_target)
1152		return -EINVAL;
1153
1154	/*
1155	 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
1156	 * use the same target.
1157	 */
1158	if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
1159		return -EINVAL;
1160
1161	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
1162	for (i = 0; i < sizeof(init->features) * 8; i++) {
1163		bool set = (init->features[i / 32] & (1 << (i % 32)));
1164
1165		if (set && i >= KVM_VCPU_MAX_FEATURES)
1166			return -ENOENT;
1167
1168		/*
1169		 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
1170		 * use the same feature set.
1171		 */
1172		if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
1173		    test_bit(i, vcpu->arch.features) != set)
1174			return -EINVAL;
1175
1176		if (set)
1177			set_bit(i, vcpu->arch.features);
1178	}
1179
1180	vcpu->arch.target = phys_target;
1181
1182	/* Now we know what it is, we can reset it. */
1183	ret = kvm_reset_vcpu(vcpu);
1184	if (ret) {
1185		vcpu->arch.target = -1;
1186		bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
1187	}
1188
1189	return ret;
1190}
1191
1192static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
1193					 struct kvm_vcpu_init *init)
1194{
1195	int ret;
1196
1197	ret = kvm_vcpu_set_target(vcpu, init);
1198	if (ret)
1199		return ret;
1200
1201	/*
1202	 * Ensure a rebooted VM will fault in RAM pages and detect if the
1203	 * guest MMU is turned off and flush the caches as needed.
1204	 *
1205	 * S2FWB enforces all memory accesses to RAM being cacheable,
1206	 * ensuring that the data side is always coherent. We still
1207	 * need to invalidate the I-cache though, as FWB does *not*
1208	 * imply CTR_EL0.DIC.
1209	 */
1210	if (vcpu_has_run_once(vcpu)) {
1211		if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
1212			stage2_unmap_vm(vcpu->kvm);
1213		else
1214			icache_inval_all_pou();
1215	}
1216
1217	vcpu_reset_hcr(vcpu);
1218	vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
1219
1220	/*
1221	 * Handle the "start in power-off" case.
1222	 */
1223	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
1224		kvm_arm_vcpu_power_off(vcpu);
1225	else
1226		vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
1227
1228	return 0;
1229}
1230
1231static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
1232				 struct kvm_device_attr *attr)
1233{
1234	int ret = -ENXIO;
1235
1236	switch (attr->group) {
1237	default:
1238		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
1239		break;
1240	}
1241
1242	return ret;
1243}
1244
1245static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
1246				 struct kvm_device_attr *attr)
1247{
1248	int ret = -ENXIO;
1249
1250	switch (attr->group) {
1251	default:
1252		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
1253		break;
1254	}
1255
1256	return ret;
1257}
1258
1259static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
1260				 struct kvm_device_attr *attr)
1261{
1262	int ret = -ENXIO;
1263
1264	switch (attr->group) {
1265	default:
1266		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
1267		break;
1268	}
1269
1270	return ret;
1271}
1272
1273static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
1274				   struct kvm_vcpu_events *events)
1275{
1276	memset(events, 0, sizeof(*events));
1277
1278	return __kvm_arm_vcpu_get_events(vcpu, events);
1279}
1280
1281static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
1282				   struct kvm_vcpu_events *events)
1283{
1284	int i;
1285
1286	/* check whether the reserved field is zero */
1287	for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
1288		if (events->reserved[i])
1289			return -EINVAL;
1290
1291	/* check whether the pad field is zero */
1292	for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
1293		if (events->exception.pad[i])
1294			return -EINVAL;
1295
1296	return __kvm_arm_vcpu_set_events(vcpu, events);
1297}
1298
1299long kvm_arch_vcpu_ioctl(struct file *filp,
1300			 unsigned int ioctl, unsigned long arg)
1301{
1302	struct kvm_vcpu *vcpu = filp->private_data;
1303	void __user *argp = (void __user *)arg;
1304	struct kvm_device_attr attr;
1305	long r;
1306
1307	switch (ioctl) {
1308	case KVM_ARM_VCPU_INIT: {
1309		struct kvm_vcpu_init init;
1310
1311		r = -EFAULT;
1312		if (copy_from_user(&init, argp, sizeof(init)))
1313			break;
1314
1315		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
1316		break;
1317	}
1318	case KVM_SET_ONE_REG:
1319	case KVM_GET_ONE_REG: {
1320		struct kvm_one_reg reg;
1321
1322		r = -ENOEXEC;
1323		if (unlikely(!kvm_vcpu_initialized(vcpu)))
1324			break;
1325
1326		r = -EFAULT;
1327		if (copy_from_user(&reg, argp, sizeof(reg)))
1328			break;
1329
1330		/*
1331		 * We could owe a reset due to PSCI. Handle the pending reset
1332		 * here to ensure userspace register accesses are ordered after
1333		 * the reset.
1334		 */
1335		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
1336			kvm_reset_vcpu(vcpu);
1337
1338		if (ioctl == KVM_SET_ONE_REG)
1339			r = kvm_arm_set_reg(vcpu, &reg);
1340		else
1341			r = kvm_arm_get_reg(vcpu, &reg);
1342		break;
1343	}
1344	case KVM_GET_REG_LIST: {
1345		struct kvm_reg_list __user *user_list = argp;
1346		struct kvm_reg_list reg_list;
1347		unsigned n;
1348
1349		r = -ENOEXEC;
1350		if (unlikely(!kvm_vcpu_initialized(vcpu)))
1351			break;
1352
1353		r = -EPERM;
1354		if (!kvm_arm_vcpu_is_finalized(vcpu))
1355			break;
1356
1357		r = -EFAULT;
1358		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
1359			break;
1360		n = reg_list.n;
1361		reg_list.n = kvm_arm_num_regs(vcpu);
1362		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
1363			break;
1364		r = -E2BIG;
1365		if (n < reg_list.n)
1366			break;
1367		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
1368		break;
1369	}
1370	case KVM_SET_DEVICE_ATTR: {
1371		r = -EFAULT;
1372		if (copy_from_user(&attr, argp, sizeof(attr)))
1373			break;
1374		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
1375		break;
1376	}
1377	case KVM_GET_DEVICE_ATTR: {
1378		r = -EFAULT;
1379		if (copy_from_user(&attr, argp, sizeof(attr)))
1380			break;
1381		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
1382		break;
1383	}
1384	case KVM_HAS_DEVICE_ATTR: {
1385		r = -EFAULT;
1386		if (copy_from_user(&attr, argp, sizeof(attr)))
1387			break;
1388		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
1389		break;
1390	}
1391	case KVM_GET_VCPU_EVENTS: {
1392		struct kvm_vcpu_events events;
1393
1394		if (kvm_arm_vcpu_get_events(vcpu, &events))
1395			return -EINVAL;
1396
1397		if (copy_to_user(argp, &events, sizeof(events)))
1398			return -EFAULT;
1399
1400		return 0;
1401	}
1402	case KVM_SET_VCPU_EVENTS: {
1403		struct kvm_vcpu_events events;
1404
1405		if (copy_from_user(&events, argp, sizeof(events)))
1406			return -EFAULT;
1407
1408		return kvm_arm_vcpu_set_events(vcpu, &events);
1409	}
1410	case KVM_ARM_VCPU_FINALIZE: {
1411		int what;
1412
1413		if (!kvm_vcpu_initialized(vcpu))
1414			return -ENOEXEC;
1415
1416		if (get_user(what, (const int __user *)argp))
1417			return -EFAULT;
1418
1419		return kvm_arm_vcpu_finalize(vcpu, what);
1420	}
1421	default:
1422		r = -EINVAL;
1423	}
1424
1425	return r;
1426}
1427
1428void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
1429{
1430
1431}
1432
1433void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
1434					const struct kvm_memory_slot *memslot)
1435{
1436	kvm_flush_remote_tlbs(kvm);
1437}
1438
1439static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
1440					struct kvm_arm_device_addr *dev_addr)
1441{
1442	switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) {
1443	case KVM_ARM_DEVICE_VGIC_V2:
1444		if (!vgic_present)
1445			return -ENXIO;
1446		return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr);
1447	default:
1448		return -ENODEV;
1449	}
1450}
1451
1452long kvm_arch_vm_ioctl(struct file *filp,
1453		       unsigned int ioctl, unsigned long arg)
1454{
1455	struct kvm *kvm = filp->private_data;
1456	void __user *argp = (void __user *)arg;
1457
1458	switch (ioctl) {
1459	case KVM_CREATE_IRQCHIP: {
1460		int ret;
1461		if (!vgic_present)
1462			return -ENXIO;
1463		mutex_lock(&kvm->lock);
1464		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
1465		mutex_unlock(&kvm->lock);
1466		return ret;
1467	}
1468	case KVM_ARM_SET_DEVICE_ADDR: {
1469		struct kvm_arm_device_addr dev_addr;
1470
1471		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
1472			return -EFAULT;
1473		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
1474	}
1475	case KVM_ARM_PREFERRED_TARGET: {
1476		struct kvm_vcpu_init init;
1477
1478		kvm_vcpu_preferred_target(&init);
1479
1480		if (copy_to_user(argp, &init, sizeof(init)))
1481			return -EFAULT;
1482
1483		return 0;
1484	}
1485	case KVM_ARM_MTE_COPY_TAGS: {
1486		struct kvm_arm_copy_mte_tags copy_tags;
1487
1488		if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
1489			return -EFAULT;
1490		return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
1491	}
1492	default:
1493		return -EINVAL;
1494	}
1495}
1496
1497static unsigned long nvhe_percpu_size(void)
1498{
1499	return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
1500		(unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
1501}
1502
1503static unsigned long nvhe_percpu_order(void)
1504{
1505	unsigned long size = nvhe_percpu_size();
1506
1507	return size ? get_order(size) : 0;
1508}
1509
1510/* A lookup table holding the hypervisor VA for each vector slot */
1511static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
1512
1513static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
1514{
1515	hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
1516}
1517
1518static int kvm_init_vector_slots(void)
1519{
1520	int err;
1521	void *base;
1522
1523	base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
1524	kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);
1525
1526	base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
1527	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);
1528
1529	if (kvm_system_needs_idmapped_vectors() &&
1530	    !is_protected_kvm_enabled()) {
1531		err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
1532					       __BP_HARDEN_HYP_VECS_SZ, &base);
1533		if (err)
1534			return err;
1535	}
1536
1537	kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
1538	kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
1539	return 0;
1540}
1541
1542static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
1543{
1544	struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
1545	unsigned long tcr;
1546
1547	/*
1548	 * Calculate the raw per-cpu offset without a translation from the
1549	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
1550	 * so that we can use adr_l to access per-cpu variables in EL2.
1551	 * Also drop the KASAN tag which gets in the way...
1552	 */
1553	params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
1554			    (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
1555
1556	params->mair_el2 = read_sysreg(mair_el1);
1557
1558	tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
1559	tcr &= ~TCR_T0SZ_MASK;
1560	tcr |= TCR_T0SZ(hyp_va_bits);
1561	params->tcr_el2 = tcr;
1562
1563	params->pgd_pa = kvm_mmu_get_httbr();
1564	if (is_protected_kvm_enabled())
1565		params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
1566	else
1567		params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
1568	params->vttbr = params->vtcr = 0;
1569
1570	/*
1571	 * Flush the init params from the data cache because the struct will
1572	 * be read while the MMU is off.
1573	 */
1574	kvm_flush_dcache_to_poc(params, sizeof(*params));
1575}
1576
1577static void hyp_install_host_vector(void)
1578{
1579	struct kvm_nvhe_init_params *params;
1580	struct arm_smccc_res res;
1581
1582	/* Switch from the HYP stub to our own HYP init vector */
1583	__hyp_set_vectors(kvm_get_idmap_vector());
1584
1585	/*
1586	 * Call initialization code, and switch to the full blown HYP code.
1587	 * If the cpucaps haven't been finalized yet, something has gone very
1588	 * wrong, and hyp will crash and burn when it uses any
1589	 * cpus_have_const_cap() wrapper.
1590	 */
1591	BUG_ON(!system_capabilities_finalized());
1592	params = this_cpu_ptr_nvhe_sym(kvm_init_params);
1593	arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
1594	WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
1595}
1596
1597static void cpu_init_hyp_mode(void)
1598{
1599	hyp_install_host_vector();
1600
1601	/*
1602	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
1603	 * at EL2.
1604	 */
1605	if (this_cpu_has_cap(ARM64_SSBS) &&
1606	    arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
1607		kvm_call_hyp_nvhe(__kvm_enable_ssbs);
1608	}
1609}
1610
1611static void cpu_hyp_reset(void)
1612{
1613	if (!is_kernel_in_hyp_mode())
1614		__hyp_reset_vectors();
1615}
1616
1617/*
1618 * EL2 vectors can be mapped and rerouted in a number of ways,
1619 * depending on the kernel configuration and CPU present:
1620 *
1621 * - If the CPU is affected by Spectre-v2, the hardening sequence is
1622 *   placed in one of the vector slots, which is executed before jumping
1623 *   to the real vectors.
1624 *
1625 * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot
1626 *   containing the hardening sequence is mapped next to the idmap page,
1627 *   and executed before jumping to the real vectors.
1628 *
1629 * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an
1630 *   empty slot is selected, mapped next to the idmap page, and
1631 *   executed before jumping to the real vectors.
1632 *
1633 * Note that ARM64_SPECTRE_V3A is somewhat incompatible with
1634 * VHE, as we don't have hypervisor-specific mappings. If the system
1635 * is VHE and yet selects this capability, it will be ignored.
1636 */
1637static void cpu_set_hyp_vector(void)
1638{
1639	struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
1640	void *vector = hyp_spectre_vector_selector[data->slot];
1641
1642	if (!is_protected_kvm_enabled())
1643		*this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
1644	else
1645		kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
1646}
1647
1648static void cpu_hyp_init_context(void)
1649{
1650	kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
1651
1652	if (!is_kernel_in_hyp_mode())
1653		cpu_init_hyp_mode();
1654}
1655
1656static void cpu_hyp_init_features(void)
1657{
1658	cpu_set_hyp_vector();
1659	kvm_arm_init_debug();
1660
1661	if (is_kernel_in_hyp_mode())
1662		kvm_timer_init_vhe();
1663
1664	if (vgic_present)
1665		kvm_vgic_init_cpu_hardware();
1666}
1667
1668static void cpu_hyp_reinit(void)
1669{
1670	cpu_hyp_reset();
1671	cpu_hyp_init_context();
1672	cpu_hyp_init_features();
1673}
1674
1675static void _kvm_arch_hardware_enable(void *discard)
1676{
1677	if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
1678		cpu_hyp_reinit();
1679		__this_cpu_write(kvm_arm_hardware_enabled, 1);
1680	}
1681}
1682
1683int kvm_arch_hardware_enable(void)
1684{
1685	_kvm_arch_hardware_enable(NULL);
1686	return 0;
1687}
1688
1689static void _kvm_arch_hardware_disable(void *discard)
1690{
1691	if (__this_cpu_read(kvm_arm_hardware_enabled)) {
1692		cpu_hyp_reset();
1693		__this_cpu_write(kvm_arm_hardware_enabled, 0);
1694	}
1695}
1696
1697void kvm_arch_hardware_disable(void)
1698{
1699	if (!is_protected_kvm_enabled())
1700		_kvm_arch_hardware_disable(NULL);
1701}
1702
1703#ifdef CONFIG_CPU_PM
1704static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
1705				    unsigned long cmd,
1706				    void *v)
1707{
1708	/*
1709	 * kvm_arm_hardware_enabled is left with its old value over
1710	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
1711	 * re-enable hyp.
1712	 */
1713	switch (cmd) {
1714	case CPU_PM_ENTER:
1715		if (__this_cpu_read(kvm_arm_hardware_enabled))
1716			/*
1717			 * don't update kvm_arm_hardware_enabled here
1718			 * so that the hardware will be re-enabled
1719			 * when we resume. See below.
1720			 */
1721			cpu_hyp_reset();
1722
1723		return NOTIFY_OK;
1724	case CPU_PM_ENTER_FAILED:
1725	case CPU_PM_EXIT:
1726		if (__this_cpu_read(kvm_arm_hardware_enabled))
1727			/* The hardware was enabled before suspend. */
1728			cpu_hyp_reinit();
1729
1730		return NOTIFY_OK;
1731
1732	default:
1733		return NOTIFY_DONE;
1734	}
1735}
1736
1737static struct notifier_block hyp_init_cpu_pm_nb = {
1738	.notifier_call = hyp_init_cpu_pm_notifier,
1739};
1740
1741static void hyp_cpu_pm_init(void)
1742{
1743	if (!is_protected_kvm_enabled())
1744		cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
1745}
1746static void hyp_cpu_pm_exit(void)
1747{
1748	if (!is_protected_kvm_enabled())
1749		cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
1750}
1751#else
1752static inline void hyp_cpu_pm_init(void)
1753{
1754}
1755static inline void hyp_cpu_pm_exit(void)
1756{
1757}
1758#endif
1759
1760static void init_cpu_logical_map(void)
1761{
1762	unsigned int cpu;
1763
1764	/*
1765	 * Copy the MPIDR <-> logical CPU ID mapping to hyp.
1766	 * Only copy the set of online CPUs whose features have been checked
1767	 * against the finalized system capabilities. The hypervisor will not
1768	 * allow any other CPUs from the `possible` set to boot.
1769	 */
1770	for_each_online_cpu(cpu)
1771		hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu);
1772}
1773
1774#define init_psci_0_1_impl_state(config, what)	\
1775	config.psci_0_1_ ## what ## _implemented = psci_ops.what
1776
1777static bool init_psci_relay(void)
1778{
1779	/*
1780	 * If PSCI has not been initialized, protected KVM cannot install
1781	 * itself on newly booted CPUs.
1782	 */
1783	if (!psci_ops.get_version) {
1784		kvm_err("Cannot initialize protected mode without PSCI\n");
1785		return false;
1786	}
1787
1788	kvm_host_psci_config.version = psci_ops.get_version();
1789
1790	if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
1791		kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
1792		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend);
1793		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on);
1794		init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off);
1795		init_psci_0_1_impl_state(kvm_host_psci_config, migrate);
1796	}
1797	return true;
1798}
1799
1800static int init_subsystems(void)
1801{
1802	int err = 0;
1803
1804	/*
1805	 * Enable hardware so that subsystem initialisation can access EL2.
1806	 */
1807	on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
1808
1809	/*
1810	 * Register CPU low-power notifier
1811	 */
1812	hyp_cpu_pm_init();
1813
1814	/*
1815	 * Init HYP view of VGIC
1816	 */
1817	err = kvm_vgic_hyp_init();
1818	switch (err) {
1819	case 0:
1820		vgic_present = true;
1821		break;
1822	case -ENODEV:
1823	case -ENXIO:
1824		vgic_present = false;
1825		err = 0;
1826		break;
1827	default:
1828		goto out;
1829	}
1830
1831	/*
1832	 * Init HYP architected timer support
1833	 */
1834	err = kvm_timer_hyp_init(vgic_present);
1835	if (err)
1836		goto out;
1837
1838	kvm_register_perf_callbacks(NULL);
1839
1840out:
1841	if (err || !is_protected_kvm_enabled())
1842		on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
1843
1844	return err;
1845}
1846
1847static void teardown_hyp_mode(void)
1848{
1849	int cpu;
1850
1851	free_hyp_pgds();
1852	for_each_possible_cpu(cpu) {
1853		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
1854		free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
1855	}
1856}
1857
1858static int do_pkvm_init(u32 hyp_va_bits)
1859{
1860	void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
1861	int ret;
1862
1863	preempt_disable();
1864	cpu_hyp_init_context();
1865	ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
1866				num_possible_cpus(), kern_hyp_va(per_cpu_base),
1867				hyp_va_bits);
1868	cpu_hyp_init_features();
1869
1870	/*
1871	 * The stub hypercalls are now disabled, so set our local flag to
1872	 * prevent a later re-init attempt in kvm_arch_hardware_enable().
1873	 */
1874	__this_cpu_write(kvm_arm_hardware_enabled, 1);
1875	preempt_enable();
1876
1877	return ret;
1878}
1879
1880static void kvm_hyp_init_symbols(void)
1881{
1882	kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
1883	kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
1884	kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
1885	kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
1886	kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
1887	kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
1888	kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
1889	kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
1890	kvm_nvhe_sym(__icache_flags) = __icache_flags;
1891	kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
1892}
1893
1894static int kvm_hyp_init_protection(u32 hyp_va_bits)
1895{
1896	void *addr = phys_to_virt(hyp_mem_base);
1897	int ret;
1898
1899	ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
1900	if (ret)
1901		return ret;
1902
1903	ret = do_pkvm_init(hyp_va_bits);
1904	if (ret)
1905		return ret;
1906
1907	free_hyp_pgds();
1908
1909	return 0;
1910}
1911
1912/*
1913 * Initializes Hyp-mode on all online CPUs.
1914 */
1915static int init_hyp_mode(void)
1916{
1917	u32 hyp_va_bits;
1918	int cpu;
1919	int err = -ENOMEM;
1920
1921	/*
1922	 * The protected Hyp-mode cannot be initialized if the memory pool
1923	 * allocation has failed.
1924	 */
1925	if (is_protected_kvm_enabled() && !hyp_mem_base)
1926		goto out_err;
1927
1928	/*
1929	 * Allocate Hyp PGD and setup Hyp identity mapping
1930	 */
1931	err = kvm_mmu_init(&hyp_va_bits);
1932	if (err)
1933		goto out_err;
1934
1935	/*
1936	 * Allocate stack pages for Hypervisor-mode
1937	 */
1938	for_each_possible_cpu(cpu) {
1939		unsigned long stack_page;
1940
1941		stack_page = __get_free_page(GFP_KERNEL);
1942		if (!stack_page) {
1943			err = -ENOMEM;
1944			goto out_err;
1945		}
1946
1947		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
1948	}
1949
1950	/*
1951	 * Allocate and initialize pages for Hypervisor-mode percpu regions.
1952	 */
1953	for_each_possible_cpu(cpu) {
1954		struct page *page;
1955		void *page_addr;
1956
1957		page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
1958		if (!page) {
1959			err = -ENOMEM;
1960			goto out_err;
1961		}
1962
1963		page_addr = page_address(page);
1964		memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
1965		kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
1966	}
1967
1968	/*
1969	 * Map the Hyp-code called directly from the host
1970	 */
1971	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
1972				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
1973	if (err) {
1974		kvm_err("Cannot map world-switch code\n");
1975		goto out_err;
1976	}
1977
1978	err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
1979				  kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
1980	if (err) {
1981		kvm_err("Cannot map .hyp.rodata section\n");
1982		goto out_err;
1983	}
1984
1985	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
1986				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
1987	if (err) {
1988		kvm_err("Cannot map rodata section\n");
1989		goto out_err;
1990	}
1991
1992	/*
1993	 * .hyp.bss is guaranteed to be placed at the beginning of the .bss
1994	 * section thanks to an assertion in the linker script. Map it RW and
1995	 * the rest of .bss RO.
1996	 */
1997	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
1998				  kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
1999	if (err) {
2000		kvm_err("Cannot map hyp bss section: %d\n", err);
2001		goto out_err;
2002	}
2003
2004	err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
2005				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
2006	if (err) {
2007		kvm_err("Cannot map bss section\n");
2008		goto out_err;
2009	}
2010
2011	/*
2012	 * Map the Hyp stack pages
2013	 */
2014	for_each_possible_cpu(cpu) {
2015		struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
2016		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
2017		unsigned long hyp_addr;
2018
2019		/*
2020		 * Allocate a contiguous HYP private VA range for the stack
2021		 * and guard page. The allocation is also aligned based on
2022		 * the order of its size.
2023		 */
2024		err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
2025		if (err) {
2026			kvm_err("Cannot allocate hyp stack guard page\n");
2027			goto out_err;
2028		}
2029
2030		/*
2031		 * Since the stack grows downwards, map the stack to the page
2032		 * at the higher address and leave the lower guard page
2033		 * unbacked.
2034		 *
2035		 * Any valid stack address now has the PAGE_SHIFT bit as 1
2036		 * and addresses corresponding to the guard page have the
2037		 * PAGE_SHIFT bit as 0 - this is used for overflow detection.
2038		 */
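		/*
		 * Illustration (assuming 4KiB pages, so PAGE_SHIFT == 12):
		 * the private VA range is aligned to 2 * PAGE_SIZE, hence
		 * bit 12 of hyp_addr is 0. Guard-page addresses lie in
		 * [hyp_addr, hyp_addr + PAGE_SIZE) with bit 12 clear, while
		 * stack addresses lie in [hyp_addr + PAGE_SIZE,
		 * hyp_addr + 2 * PAGE_SIZE) with bit 12 set, which is what
		 * the overflow check relies on.
		 */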
2039		err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE,
2040					    __pa(stack_page), PAGE_HYP);
2041		if (err) {
2042			kvm_err("Cannot map hyp stack\n");
2043			goto out_err;
2044		}
2045
2046		/*
2047		 * Save the stack PA in nvhe_init_params. This will be needed
2048		 * to recreate the stack mapping in protected nVHE mode.
2049		 * __hyp_pa() won't do the right thing there, since the stack
2050		 * has been mapped in the flexible private VA space.
2051		 */
2052		params->stack_pa = __pa(stack_page);
2053
2054		params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
2055	}
2056
2057	for_each_possible_cpu(cpu) {
2058		char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
2059		char *percpu_end = percpu_begin + nvhe_percpu_size();
2060
2061		/* Map Hyp percpu pages */
2062		err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
2063		if (err) {
2064			kvm_err("Cannot map hyp percpu region\n");
2065			goto out_err;
2066		}
2067
2068		/* Prepare the CPU initialization parameters */
2069		cpu_prepare_hyp_mode(cpu, hyp_va_bits);
2070	}
2071
2072	kvm_hyp_init_symbols();
2073
2074	if (is_protected_kvm_enabled()) {
2075		init_cpu_logical_map();
2076
2077		if (!init_psci_relay()) {
2078			err = -ENODEV;
2079			goto out_err;
2080		}
2081
2082		err = kvm_hyp_init_protection(hyp_va_bits);
2083		if (err) {
2084			kvm_err("Failed to init hyp memory protection\n");
2085			goto out_err;
2086		}
2087	}
2088
2089	return 0;
2090
2091out_err:
2092	teardown_hyp_mode();
2093	kvm_err("error initializing Hyp mode: %d\n", err);
2094	return err;
2095}
2096
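/*
 * Runs on each CPU via on_each_cpu(): ask the hypervisor to finalize the
 * host stage-2 protection on this CPU, recording any failure in the
 * shared error variable passed as @arg.
 */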
2097static void _kvm_host_prot_finalize(void *arg)
2098{
2099	int *err = arg;
2100
2101	if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
2102		WRITE_ONCE(*err, -EINVAL);
2103}
2104
2105static int pkvm_drop_host_privileges(void)
2106{
2107	int ret = 0;
2108
2109	/*
2110	 * Flip the static key upfront as that may no longer be possible
2111	 * once the host stage 2 is installed.
2112	 */
2113	static_branch_enable(&kvm_protected_mode_initialized);
2114	on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
2115	return ret;
2116}
2117
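/*
 * For protected KVM only: hide the hyp sections from kmemleak and then
 * strip the host of the ability to touch hypervisor-owned memory.
 */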
2118static int finalize_hyp_mode(void)
2119{
2120	if (!is_protected_kvm_enabled())
2121		return 0;
2122
2123	/*
2124	 * Exclude HYP sections from kmemleak so that they don't get peeked
2125	 * at, which would end badly once inaccessible.
2126	 */
2127	kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
2128	kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
2129	return pkvm_drop_host_privileges();
2130}
2131
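/*
 * Find the vcpu whose MPIDR affinity fields match @mpidr (the non-affinity
 * bits are masked off), or NULL if no such vcpu exists in @kvm.
 */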
2132struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
2133{
2134	struct kvm_vcpu *vcpu;
2135	unsigned long i;
2136
2137	mpidr &= MPIDR_HWID_BITMASK;
2138	kvm_for_each_vcpu(i, vcpu, kvm) {
2139		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
2140			return vcpu;
2141	}
2142	return NULL;
2143}
2144
2145bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
2146{
2147	return irqchip_in_kernel(kvm);
2148}
2149
2150bool kvm_arch_has_irq_bypass(void)
2151{
2152	return true;
2153}
2154
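/*
 * irqbypass support: when an irqfd gains a producer (typically a VFIO
 * MSI), try to forward the interrupt directly to the guest through the
 * GICv4 vLPI infrastructure rather than injecting it from the host.
 */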
2155int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
2156				      struct irq_bypass_producer *prod)
2157{
2158	struct kvm_kernel_irqfd *irqfd =
2159		container_of(cons, struct kvm_kernel_irqfd, consumer);
2160
2161	return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
2162					  &irqfd->irq_entry);
2163}

2164void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
2165				      struct irq_bypass_producer *prod)
2166{
2167	struct kvm_kernel_irqfd *irqfd =
2168		container_of(cons, struct kvm_kernel_irqfd, consumer);
2169
2170	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
2171				     &irqfd->irq_entry);
2172}
2173
2174void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
2175{
2176	struct kvm_kernel_irqfd *irqfd =
2177		container_of(cons, struct kvm_kernel_irqfd, consumer);
2178
2179	kvm_arm_halt_guest(irqfd->kvm);
2180}
2181
2182void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
2183{
2184	struct kvm_kernel_irqfd *irqfd =
2185		container_of(cons, struct kvm_kernel_irqfd, consumer);
2186
2187	kvm_arm_resume_guest(irqfd->kvm);
2188}
2189
2190/**
2191 * kvm_arch_init - Initialize Hyp-mode and memory mappings on all CPUs.
2192 */
2193int kvm_arch_init(void *opaque)
2194{
2195	int err;
2196	bool in_hyp_mode;
2197
2198	if (!is_hyp_mode_available()) {
2199		kvm_info("HYP mode not available\n");
2200		return -ENODEV;
2201	}
2202
2203	if (kvm_get_mode() == KVM_MODE_NONE) {
2204		kvm_info("KVM disabled from command line\n");
2205		return -ENODEV;
2206	}
2207
2208	err = kvm_sys_reg_table_init();
2209	if (err) {
2210		kvm_info("Error initializing system register tables\n");
2211		return err;
2212	}
2213
2214	in_hyp_mode = is_kernel_in_hyp_mode();
2215
2216	if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
2217	    cpus_have_final_cap(ARM64_WORKAROUND_1508412))
2218		kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n"
2219			 "Only trusted guests should be used on this system.\n");
2220
2221	err = kvm_set_ipa_limit();
2222	if (err)
2223		return err;
2224
2225	err = kvm_arm_init_sve();
2226	if (err)
2227		return err;
2228
2229	err = kvm_arm_vmid_alloc_init();
2230	if (err) {
2231		kvm_err("Failed to initialize VMID allocator.\n");
2232		return err;
2233	}
2234
2235	if (!in_hyp_mode) {
2236		err = init_hyp_mode();
2237		if (err)
2238			goto out_err;
2239	}
2240
2241	err = kvm_init_vector_slots();
2242	if (err) {
2243		kvm_err("Cannot initialise vector slots\n");
2244		goto out_err;
2245	}
2246
2247	err = init_subsystems();
2248	if (err)
2249		goto out_hyp;
2250
2251	if (!in_hyp_mode) {
2252		err = finalize_hyp_mode();
2253		if (err) {
2254			kvm_err("Failed to finalize Hyp protection\n");
2255			goto out_hyp;
2256		}
2257	}
2258
2259	if (is_protected_kvm_enabled()) {
2260		kvm_info("Protected nVHE mode initialized successfully\n");
2261	} else if (in_hyp_mode) {
2262		kvm_info("VHE mode initialized successfully\n");
2263	} else {
2264		kvm_info("Hyp mode initialized successfully\n");
2265	}
2266
2267	return 0;
2268
2269out_hyp:
2270	hyp_cpu_pm_exit();
2271	if (!in_hyp_mode)
2272		teardown_hyp_mode();
2273out_err:
2274	kvm_arm_vmid_alloc_free();
2275	return err;
2276}
2277
2278/* Compiling as a module is not supported; only unregister the perf callbacks. */
2279void kvm_arch_exit(void)
2280{
2281	kvm_unregister_perf_callbacks();
2282}
2283
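/*
 * Parse the "kvm-arm.mode" command line parameter, e.g. booting with
 * kvm-arm.mode=protected: "none" disables KVM entirely, "protected"
 * selects protected nVHE (pKVM) on non-VHE kernels, and "nvhe" explicitly
 * requests the default non-VHE mode (only meaningful when the kernel is
 * not running at EL2).
 */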
2284static int __init early_kvm_mode_cfg(char *arg)
2285{
2286	if (!arg)
2287		return -EINVAL;
2288
2289	if (strcmp(arg, "none") == 0) {
2290		kvm_mode = KVM_MODE_NONE;
2291		return 0;
2292	}
2293
2294	if (!is_hyp_mode_available()) {
2295		pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n");
2296		return 0;
2297	}
2298
2299	if (strcmp(arg, "protected") == 0) {
2300		if (!is_kernel_in_hyp_mode())
2301			kvm_mode = KVM_MODE_PROTECTED;
2302		else
2303			pr_warn_once("Protected KVM not available with VHE\n");
2304
2305		return 0;
2306	}
2307
2308	if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) {
2309		kvm_mode = KVM_MODE_DEFAULT;
2310		return 0;
2311	}
2312
2313	return -EINVAL;
2314}
2315early_param("kvm-arm.mode", early_kvm_mode_cfg);
2316
2317enum kvm_mode kvm_get_mode(void)
2318{
2319	return kvm_mode;
2320}
2321
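/* Module entry point: register with the generic KVM core. */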
2322static int arm_init(void)
2323{
2324	return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
2325}
2327
2328module_init(arm_init);