Loading...
Note: File does not exist in v4.17.
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */
6
7#include <linux/bug.h>
8#include <linux/cpu_pm.h>
9#include <linux/entry-kvm.h>
10#include <linux/errno.h>
11#include <linux/err.h>
12#include <linux/kvm_host.h>
13#include <linux/list.h>
14#include <linux/module.h>
15#include <linux/vmalloc.h>
16#include <linux/fs.h>
17#include <linux/mman.h>
18#include <linux/sched.h>
19#include <linux/kvm.h>
20#include <linux/kvm_irqfd.h>
21#include <linux/irqbypass.h>
22#include <linux/sched/stat.h>
23#include <linux/psci.h>
24#include <trace/events/kvm.h>
25
26#define CREATE_TRACE_POINTS
27#include "trace_arm.h"
28
29#include <linux/uaccess.h>
30#include <asm/ptrace.h>
31#include <asm/mman.h>
32#include <asm/tlbflush.h>
33#include <asm/cacheflush.h>
34#include <asm/cpufeature.h>
35#include <asm/virt.h>
36#include <asm/kvm_arm.h>
37#include <asm/kvm_asm.h>
38#include <asm/kvm_emulate.h>
39#include <asm/kvm_mmu.h>
40#include <asm/kvm_nested.h>
41#include <asm/kvm_pkvm.h>
42#include <asm/kvm_ptrauth.h>
43#include <asm/sections.h>
44
45#include <kvm/arm_hypercalls.h>
46#include <kvm/arm_pmu.h>
47#include <kvm/arm_psci.h>
48
49#include "sys_regs.h"
50
51static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;
52
53enum kvm_wfx_trap_policy {
54 KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */
55 KVM_WFX_NOTRAP,
56 KVM_WFX_TRAP,
57};
58
59static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
60static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
61
62DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
63
64DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
65DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
66
67DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
68
69static bool vgic_present, kvm_arm_initialised;
70
71static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);
72
73bool is_kvm_arm_initialised(void)
74{
75 return kvm_arm_initialised;
76}
77
78int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
79{
80 return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
81}
82
83/*
84 * This functions as an allow-list of protected VM capabilities.
85 * Features not explicitly allowed by this function are denied.
86 */
87static bool pkvm_ext_allowed(struct kvm *kvm, long ext)
88{
89 switch (ext) {
90 case KVM_CAP_IRQCHIP:
91 case KVM_CAP_ARM_PSCI:
92 case KVM_CAP_ARM_PSCI_0_2:
93 case KVM_CAP_NR_VCPUS:
94 case KVM_CAP_MAX_VCPUS:
95 case KVM_CAP_MAX_VCPU_ID:
96 case KVM_CAP_MSI_DEVID:
97 case KVM_CAP_ARM_VM_IPA_SIZE:
98 case KVM_CAP_ARM_PMU_V3:
99 case KVM_CAP_ARM_SVE:
100 case KVM_CAP_ARM_PTRAUTH_ADDRESS:
101 case KVM_CAP_ARM_PTRAUTH_GENERIC:
102 return true;
103 default:
104 return false;
105 }
106}
107
108int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
109 struct kvm_enable_cap *cap)
110{
111 int r = -EINVAL;
112
113 if (cap->flags)
114 return -EINVAL;
115
116 if (kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, cap->cap))
117 return -EINVAL;
118
119 switch (cap->cap) {
120 case KVM_CAP_ARM_NISV_TO_USER:
121 r = 0;
122 set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
123 &kvm->arch.flags);
124 break;
125 case KVM_CAP_ARM_MTE:
126 mutex_lock(&kvm->lock);
127 if (system_supports_mte() && !kvm->created_vcpus) {
128 r = 0;
129 set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
130 }
131 mutex_unlock(&kvm->lock);
132 break;
133 case KVM_CAP_ARM_SYSTEM_SUSPEND:
134 r = 0;
135 set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
136 break;
137 case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
138 mutex_lock(&kvm->slots_lock);
139 /*
140 * To keep things simple, allow changing the chunk
141 * size only when no memory slots have been created.
142 */
143 if (kvm_are_all_memslots_empty(kvm)) {
144 u64 new_cap = cap->args[0];
145
146 if (!new_cap || kvm_is_block_size_supported(new_cap)) {
147 r = 0;
148 kvm->arch.mmu.split_page_chunk_size = new_cap;
149 }
150 }
151 mutex_unlock(&kvm->slots_lock);
152 break;
153 default:
154 break;
155 }
156
157 return r;
158}
159
160static int kvm_arm_default_max_vcpus(void)
161{
162 return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
163}
164
165/**
166 * kvm_arch_init_vm - initializes a VM data structure
167 * @kvm: pointer to the KVM struct
168 * @type: kvm device type
169 */
170int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
171{
172 int ret;
173
174 mutex_init(&kvm->arch.config_lock);
175
176#ifdef CONFIG_LOCKDEP
177 /* Clue in lockdep that the config_lock must be taken inside kvm->lock */
178 mutex_lock(&kvm->lock);
179 mutex_lock(&kvm->arch.config_lock);
180 mutex_unlock(&kvm->arch.config_lock);
181 mutex_unlock(&kvm->lock);
182#endif
183
184 kvm_init_nested(kvm);
185
186 ret = kvm_share_hyp(kvm, kvm + 1);
187 if (ret)
188 return ret;
189
190 ret = pkvm_init_host_vm(kvm);
191 if (ret)
192 goto err_unshare_kvm;
193
194 if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
195 ret = -ENOMEM;
196 goto err_unshare_kvm;
197 }
198 cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
199
200 ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
201 if (ret)
202 goto err_free_cpumask;
203
204 kvm_vgic_early_init(kvm);
205
206 kvm_timer_init_vm(kvm);
207
208 /* The maximum number of VCPUs is limited by the host's GIC model */
209 kvm->max_vcpus = kvm_arm_default_max_vcpus();
210
211 kvm_arm_init_hypercalls(kvm);
212
213 bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
214
215 return 0;
216
217err_free_cpumask:
218 free_cpumask_var(kvm->arch.supported_cpus);
219err_unshare_kvm:
220 kvm_unshare_hyp(kvm, kvm + 1);
221 return ret;
222}
223
224vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
225{
226 return VM_FAULT_SIGBUS;
227}
228
/* Create the per-VM debugfs entries: system registers, then stage-2 ptdump. */
void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
	kvm_sys_regs_create_debugfs(kvm);

	kvm_s2_ptdump_create_debugfs(kvm);
}
234
235static void kvm_destroy_mpidr_data(struct kvm *kvm)
236{
237 struct kvm_mpidr_data *data;
238
239 mutex_lock(&kvm->arch.config_lock);
240
241 data = rcu_dereference_protected(kvm->arch.mpidr_data,
242 lockdep_is_held(&kvm->arch.config_lock));
243 if (data) {
244 rcu_assign_pointer(kvm->arch.mpidr_data, NULL);
245 synchronize_rcu();
246 kfree(data);
247 }
248
249 mutex_unlock(&kvm->arch.config_lock);
250}
251
252/**
253 * kvm_arch_destroy_vm - destroy the VM data structure
254 * @kvm: pointer to the KVM struct
255 */
256void kvm_arch_destroy_vm(struct kvm *kvm)
257{
258 bitmap_free(kvm->arch.pmu_filter);
259 free_cpumask_var(kvm->arch.supported_cpus);
260
261 kvm_vgic_destroy(kvm);
262
263 if (is_protected_kvm_enabled())
264 pkvm_destroy_hyp_vm(kvm);
265
266 kvm_destroy_mpidr_data(kvm);
267
268 kfree(kvm->arch.sysreg_masks);
269 kvm_destroy_vcpus(kvm);
270
271 kvm_unshare_hyp(kvm, kvm + 1);
272
273 kvm_arm_teardown_hypercalls(kvm);
274}
275
276static bool kvm_has_full_ptr_auth(void)
277{
278 bool apa, gpa, api, gpi, apa3, gpa3;
279 u64 isar1, isar2, val;
280
281 /*
282 * Check that:
283 *
284 * - both Address and Generic auth are implemented for a given
285 * algorithm (Q5, IMPDEF or Q3)
286 * - only a single algorithm is implemented.
287 */
288 if (!system_has_full_ptr_auth())
289 return false;
290
291 isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
292 isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
293
294 apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1);
295 val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1);
296 gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP);
297
298 api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1);
299 val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1);
300 gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP);
301
302 apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2);
303 val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2);
304 gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP);
305
306 return (apa == gpa && api == gpi && apa3 == gpa3 &&
307 (apa + api + apa3) == 1);
308}
309
310int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
311{
312 int r;
313
314 if (kvm && kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, ext))
315 return 0;
316
317 switch (ext) {
318 case KVM_CAP_IRQCHIP:
319 r = vgic_present;
320 break;
321 case KVM_CAP_IOEVENTFD:
322 case KVM_CAP_USER_MEMORY:
323 case KVM_CAP_SYNC_MMU:
324 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
325 case KVM_CAP_ONE_REG:
326 case KVM_CAP_ARM_PSCI:
327 case KVM_CAP_ARM_PSCI_0_2:
328 case KVM_CAP_READONLY_MEM:
329 case KVM_CAP_MP_STATE:
330 case KVM_CAP_IMMEDIATE_EXIT:
331 case KVM_CAP_VCPU_EVENTS:
332 case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
333 case KVM_CAP_ARM_NISV_TO_USER:
334 case KVM_CAP_ARM_INJECT_EXT_DABT:
335 case KVM_CAP_SET_GUEST_DEBUG:
336 case KVM_CAP_VCPU_ATTRIBUTES:
337 case KVM_CAP_PTP_KVM:
338 case KVM_CAP_ARM_SYSTEM_SUSPEND:
339 case KVM_CAP_IRQFD_RESAMPLE:
340 case KVM_CAP_COUNTER_OFFSET:
341 r = 1;
342 break;
343 case KVM_CAP_SET_GUEST_DEBUG2:
344 return KVM_GUESTDBG_VALID_MASK;
345 case KVM_CAP_ARM_SET_DEVICE_ADDR:
346 r = 1;
347 break;
348 case KVM_CAP_NR_VCPUS:
349 /*
350 * ARM64 treats KVM_CAP_NR_CPUS differently from all other
351 * architectures, as it does not always bound it to
352 * KVM_CAP_MAX_VCPUS. It should not matter much because
353 * this is just an advisory value.
354 */
355 r = min_t(unsigned int, num_online_cpus(),
356 kvm_arm_default_max_vcpus());
357 break;
358 case KVM_CAP_MAX_VCPUS:
359 case KVM_CAP_MAX_VCPU_ID:
360 if (kvm)
361 r = kvm->max_vcpus;
362 else
363 r = kvm_arm_default_max_vcpus();
364 break;
365 case KVM_CAP_MSI_DEVID:
366 if (!kvm)
367 r = -EINVAL;
368 else
369 r = kvm->arch.vgic.msis_require_devid;
370 break;
371 case KVM_CAP_ARM_USER_IRQ:
372 /*
373 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
374 * (bump this number if adding more devices)
375 */
376 r = 1;
377 break;
378 case KVM_CAP_ARM_MTE:
379 r = system_supports_mte();
380 break;
381 case KVM_CAP_STEAL_TIME:
382 r = kvm_arm_pvtime_supported();
383 break;
384 case KVM_CAP_ARM_EL1_32BIT:
385 r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
386 break;
387 case KVM_CAP_GUEST_DEBUG_HW_BPS:
388 r = get_num_brps();
389 break;
390 case KVM_CAP_GUEST_DEBUG_HW_WPS:
391 r = get_num_wrps();
392 break;
393 case KVM_CAP_ARM_PMU_V3:
394 r = kvm_arm_support_pmu_v3();
395 break;
396 case KVM_CAP_ARM_INJECT_SERROR_ESR:
397 r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
398 break;
399 case KVM_CAP_ARM_VM_IPA_SIZE:
400 r = get_kvm_ipa_limit();
401 break;
402 case KVM_CAP_ARM_SVE:
403 r = system_supports_sve();
404 break;
405 case KVM_CAP_ARM_PTRAUTH_ADDRESS:
406 case KVM_CAP_ARM_PTRAUTH_GENERIC:
407 r = kvm_has_full_ptr_auth();
408 break;
409 case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
410 if (kvm)
411 r = kvm->arch.mmu.split_page_chunk_size;
412 else
413 r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
414 break;
415 case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
416 r = kvm_supported_block_sizes();
417 break;
418 case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
419 r = BIT(0);
420 break;
421 default:
422 r = 0;
423 }
424
425 return r;
426}
427
/* No arch-specific device ioctl is handled here: always returns -EINVAL. */
long kvm_arch_dev_ioctl(struct file *filp,
			unsigned int ioctl, unsigned long arg)
{
	long err = -EINVAL;

	return err;
}
433
434struct kvm *kvm_arch_alloc_vm(void)
435{
436 size_t sz = sizeof(struct kvm);
437
438 if (!has_vhe())
439 return kzalloc(sz, GFP_KERNEL_ACCOUNT);
440
441 return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
442}
443
444int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
445{
446 if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
447 return -EBUSY;
448
449 if (id >= kvm->max_vcpus)
450 return -EINVAL;
451
452 return 0;
453}
454
455int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
456{
457 int err;
458
459 spin_lock_init(&vcpu->arch.mp_state_lock);
460
461#ifdef CONFIG_LOCKDEP
462 /* Inform lockdep that the config_lock is acquired after vcpu->mutex */
463 mutex_lock(&vcpu->mutex);
464 mutex_lock(&vcpu->kvm->arch.config_lock);
465 mutex_unlock(&vcpu->kvm->arch.config_lock);
466 mutex_unlock(&vcpu->mutex);
467#endif
468
469 /* Force users to call KVM_ARM_VCPU_INIT */
470 vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
471
472 vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
473
474 /* Set up the timer */
475 kvm_timer_vcpu_init(vcpu);
476
477 kvm_pmu_vcpu_init(vcpu);
478
479 kvm_arm_reset_debug_ptr(vcpu);
480
481 kvm_arm_pvtime_vcpu_init(&vcpu->arch);
482
483 vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
484
485 /*
486 * This vCPU may have been created after mpidr_data was initialized.
487 * Throw out the pre-computed mappings if that is the case which forces
488 * KVM to fall back to iteratively searching the vCPUs.
489 */
490 kvm_destroy_mpidr_data(vcpu->kvm);
491
492 err = kvm_vgic_vcpu_init(vcpu);
493 if (err)
494 return err;
495
496 return kvm_share_hyp(vcpu, vcpu + 1);
497}
498
/* Post-creation hook; no work is done here. */
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
}
502
503void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
504{
505 kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
506 kvm_timer_vcpu_terminate(vcpu);
507 kvm_pmu_vcpu_destroy(vcpu);
508 kvm_vgic_vcpu_destroy(vcpu);
509 kvm_arm_vcpu_destroy(vcpu);
510}
511
/* Blocking hook; no work is done here. */
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
}
516
/* Unblocking hook; no work is done here. */
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
}
521
522static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
523{
524 if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) {
525 /*
526 * Either we're running an L2 guest, and the API/APK bits come
527 * from L1's HCR_EL2, or API/APK are both set.
528 */
529 if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) {
530 u64 val;
531
532 val = __vcpu_sys_reg(vcpu, HCR_EL2);
533 val &= (HCR_API | HCR_APK);
534 vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK);
535 vcpu->arch.hcr_el2 |= val;
536 } else {
537 vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
538 }
539
540 /*
541 * Save the host keys if there is any chance for the guest
542 * to use pauth, as the entry code will reload the guest
543 * keys in that case.
544 */
545 if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) {
546 struct kvm_cpu_context *ctxt;
547
548 ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt);
549 ptrauth_save_keys(ctxt);
550 }
551 }
552}
553
554static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
555{
556 if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
557 return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;
558
559 return single_task_running() &&
560 (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
561 vcpu->kvm->arch.vgic.nassgireq);
562}
563
564static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu)
565{
566 if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
567 return kvm_wfe_trap_policy == KVM_WFX_NOTRAP;
568
569 return single_task_running();
570}
571
572void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
573{
574 struct kvm_s2_mmu *mmu;
575 int *last_ran;
576
577 if (vcpu_has_nv(vcpu))
578 kvm_vcpu_load_hw_mmu(vcpu);
579
580 mmu = vcpu->arch.hw_mmu;
581 last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
582
583 /*
584 * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2,
585 * which happens eagerly in VHE.
586 *
587 * Also, the VMID allocator only preserves VMIDs that are active at the
588 * time of rollover, so KVM might need to grab a new VMID for the MMU if
589 * this is called from kvm_sched_in().
590 */
591 kvm_arm_vmid_update(&mmu->vmid);
592
593 /*
594 * We guarantee that both TLBs and I-cache are private to each
595 * vcpu. If detecting that a vcpu from the same VM has
596 * previously run on the same physical CPU, call into the
597 * hypervisor code to nuke the relevant contexts.
598 *
599 * We might get preempted before the vCPU actually runs, but
600 * over-invalidation doesn't affect correctness.
601 */
602 if (*last_ran != vcpu->vcpu_idx) {
603 kvm_call_hyp(__kvm_flush_cpu_context, mmu);
604 *last_ran = vcpu->vcpu_idx;
605 }
606
607 vcpu->cpu = cpu;
608
609 kvm_vgic_load(vcpu);
610 kvm_timer_vcpu_load(vcpu);
611 if (has_vhe())
612 kvm_vcpu_load_vhe(vcpu);
613 kvm_arch_vcpu_load_fp(vcpu);
614 kvm_vcpu_pmu_restore_guest(vcpu);
615 if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
616 kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
617
618 if (kvm_vcpu_should_clear_twe(vcpu))
619 vcpu->arch.hcr_el2 &= ~HCR_TWE;
620 else
621 vcpu->arch.hcr_el2 |= HCR_TWE;
622
623 if (kvm_vcpu_should_clear_twi(vcpu))
624 vcpu->arch.hcr_el2 &= ~HCR_TWI;
625 else
626 vcpu->arch.hcr_el2 |= HCR_TWI;
627
628 vcpu_set_pauth_traps(vcpu);
629
630 kvm_arch_vcpu_load_debug_state_flags(vcpu);
631
632 if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
633 vcpu_set_on_unsupported_cpu(vcpu);
634}
635
636void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
637{
638 kvm_arch_vcpu_put_debug_state_flags(vcpu);
639 kvm_arch_vcpu_put_fp(vcpu);
640 if (has_vhe())
641 kvm_vcpu_put_vhe(vcpu);
642 kvm_timer_vcpu_put(vcpu);
643 kvm_vgic_put(vcpu);
644 kvm_vcpu_pmu_restore_host(vcpu);
645 if (vcpu_has_nv(vcpu))
646 kvm_vcpu_put_hw_mmu(vcpu);
647 kvm_arm_vmid_clear_active();
648
649 vcpu_clear_on_unsupported_cpu(vcpu);
650 vcpu->cpu = -1;
651}
652
653static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
654{
655 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
656 kvm_make_request(KVM_REQ_SLEEP, vcpu);
657 kvm_vcpu_kick(vcpu);
658}
659
660void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
661{
662 spin_lock(&vcpu->arch.mp_state_lock);
663 __kvm_arm_vcpu_power_off(vcpu);
664 spin_unlock(&vcpu->arch.mp_state_lock);
665}
666
667bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
668{
669 return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
670}
671
672static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
673{
674 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED);
675 kvm_make_request(KVM_REQ_SUSPEND, vcpu);
676 kvm_vcpu_kick(vcpu);
677}
678
679static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
680{
681 return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED;
682}
683
684int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
685 struct kvm_mp_state *mp_state)
686{
687 *mp_state = READ_ONCE(vcpu->arch.mp_state);
688
689 return 0;
690}
691
692int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
693 struct kvm_mp_state *mp_state)
694{
695 int ret = 0;
696
697 spin_lock(&vcpu->arch.mp_state_lock);
698
699 switch (mp_state->mp_state) {
700 case KVM_MP_STATE_RUNNABLE:
701 WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
702 break;
703 case KVM_MP_STATE_STOPPED:
704 __kvm_arm_vcpu_power_off(vcpu);
705 break;
706 case KVM_MP_STATE_SUSPENDED:
707 kvm_arm_vcpu_suspend(vcpu);
708 break;
709 default:
710 ret = -EINVAL;
711 }
712
713 spin_unlock(&vcpu->arch.mp_state_lock);
714
715 return ret;
716}
717
718/**
719 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
720 * @v: The VCPU pointer
721 *
722 * If the guest CPU is not waiting for interrupts or an interrupt line is
723 * asserted, the CPU is by definition runnable.
724 */
725int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
726{
727 bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
728 return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
729 && !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
730}
731
732bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
733{
734 return vcpu_mode_priv(vcpu);
735}
736
#ifdef CONFIG_GUEST_PERF_EVENTS
/* Return the value currently held at the vCPU's PC slot. */
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
{
	unsigned long pc = *vcpu_pc(vcpu);

	return pc;
}
#endif
743
744static void kvm_init_mpidr_data(struct kvm *kvm)
745{
746 struct kvm_mpidr_data *data = NULL;
747 unsigned long c, mask, nr_entries;
748 u64 aff_set = 0, aff_clr = ~0UL;
749 struct kvm_vcpu *vcpu;
750
751 mutex_lock(&kvm->arch.config_lock);
752
753 if (rcu_access_pointer(kvm->arch.mpidr_data) ||
754 atomic_read(&kvm->online_vcpus) == 1)
755 goto out;
756
757 kvm_for_each_vcpu(c, vcpu, kvm) {
758 u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
759 aff_set |= aff;
760 aff_clr &= aff;
761 }
762
763 /*
764 * A significant bit can be either 0 or 1, and will only appear in
765 * aff_set. Use aff_clr to weed out the useless stuff.
766 */
767 mask = aff_set ^ aff_clr;
768 nr_entries = BIT_ULL(hweight_long(mask));
769
770 /*
771 * Don't let userspace fool us. If we need more than a single page
772 * to describe the compressed MPIDR array, just fall back to the
773 * iterative method. Single vcpu VMs do not need this either.
774 */
775 if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
776 data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
777 GFP_KERNEL_ACCOUNT);
778
779 if (!data)
780 goto out;
781
782 data->mpidr_mask = mask;
783
784 kvm_for_each_vcpu(c, vcpu, kvm) {
785 u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
786 u16 index = kvm_mpidr_index(data, aff);
787
788 data->cmpidr_to_idx[index] = c;
789 }
790
791 rcu_assign_pointer(kvm->arch.mpidr_data, data);
792out:
793 mutex_unlock(&kvm->arch.config_lock);
794}
795
796/*
797 * Handle both the initialisation that is being done when the vcpu is
798 * run for the first time, as well as the updates that must be
799 * performed each time we get a new thread dealing with this vcpu.
800 */
801int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
802{
803 struct kvm *kvm = vcpu->kvm;
804 int ret;
805
806 if (!kvm_vcpu_initialized(vcpu))
807 return -ENOEXEC;
808
809 if (!kvm_arm_vcpu_is_finalized(vcpu))
810 return -EPERM;
811
812 ret = kvm_arch_vcpu_run_map_fp(vcpu);
813 if (ret)
814 return ret;
815
816 if (likely(vcpu_has_run_once(vcpu)))
817 return 0;
818
819 kvm_init_mpidr_data(kvm);
820
821 kvm_arm_vcpu_init_debug(vcpu);
822
823 if (likely(irqchip_in_kernel(kvm))) {
824 /*
825 * Map the VGIC hardware resources before running a vcpu the
826 * first time on this VM.
827 */
828 ret = kvm_vgic_map_resources(kvm);
829 if (ret)
830 return ret;
831 }
832
833 ret = kvm_finalize_sys_regs(vcpu);
834 if (ret)
835 return ret;
836
837 /*
838 * This needs to happen after any restriction has been applied
839 * to the feature set.
840 */
841 kvm_calculate_traps(vcpu);
842
843 ret = kvm_timer_enable(vcpu);
844 if (ret)
845 return ret;
846
847 ret = kvm_arm_pmu_v3_enable(vcpu);
848 if (ret)
849 return ret;
850
851 if (is_protected_kvm_enabled()) {
852 ret = pkvm_create_hyp_vm(kvm);
853 if (ret)
854 return ret;
855 }
856
857 mutex_lock(&kvm->arch.config_lock);
858 set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
859 mutex_unlock(&kvm->arch.config_lock);
860
861 return ret;
862}
863
864bool kvm_arch_intc_initialized(struct kvm *kvm)
865{
866 return vgic_initialized(kvm);
867}
868
869void kvm_arm_halt_guest(struct kvm *kvm)
870{
871 unsigned long i;
872 struct kvm_vcpu *vcpu;
873
874 kvm_for_each_vcpu(i, vcpu, kvm)
875 vcpu->arch.pause = true;
876 kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
877}
878
879void kvm_arm_resume_guest(struct kvm *kvm)
880{
881 unsigned long i;
882 struct kvm_vcpu *vcpu;
883
884 kvm_for_each_vcpu(i, vcpu, kvm) {
885 vcpu->arch.pause = false;
886 __kvm_vcpu_wake_up(vcpu);
887 }
888}
889
890static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
891{
892 struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
893
894 rcuwait_wait_event(wait,
895 (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
896 TASK_INTERRUPTIBLE);
897
898 if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
899 /* Awaken to handle a signal, request we sleep again later. */
900 kvm_make_request(KVM_REQ_SLEEP, vcpu);
901 }
902
903 /*
904 * Make sure we will observe a potential reset request if we've
905 * observed a change to the power state. Pairs with the smp_wmb() in
906 * kvm_psci_vcpu_on().
907 */
908 smp_rmb();
909}
910
911/**
912 * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior
913 * @vcpu: The VCPU pointer
914 *
915 * Suspend execution of a vCPU until a valid wake event is detected, i.e. until
916 * the vCPU is runnable. The vCPU may or may not be scheduled out, depending
917 * on when a wake event arrives, e.g. there may already be a pending wake event.
918 */
919void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
920{
921 /*
922 * Sync back the state of the GIC CPU interface so that we have
923 * the latest PMR and group enables. This ensures that
924 * kvm_arch_vcpu_runnable has up-to-date data to decide whether
925 * we have pending interrupts, e.g. when determining if the
926 * vCPU should block.
927 *
928 * For the same reason, we want to tell GICv4 that we need
929 * doorbells to be signalled, should an interrupt become pending.
930 */
931 preempt_disable();
932 vcpu_set_flag(vcpu, IN_WFI);
933 kvm_vgic_put(vcpu);
934 preempt_enable();
935
936 kvm_vcpu_halt(vcpu);
937 vcpu_clear_flag(vcpu, IN_WFIT);
938
939 preempt_disable();
940 vcpu_clear_flag(vcpu, IN_WFI);
941 kvm_vgic_load(vcpu);
942 preempt_enable();
943}
944
945static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
946{
947 if (!kvm_arm_vcpu_suspended(vcpu))
948 return 1;
949
950 kvm_vcpu_wfi(vcpu);
951
952 /*
953 * The suspend state is sticky; we do not leave it until userspace
954 * explicitly marks the vCPU as runnable. Request that we suspend again
955 * later.
956 */
957 kvm_make_request(KVM_REQ_SUSPEND, vcpu);
958
959 /*
960 * Check to make sure the vCPU is actually runnable. If so, exit to
961 * userspace informing it of the wakeup condition.
962 */
963 if (kvm_arch_vcpu_runnable(vcpu)) {
964 memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
965 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
966 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
967 return 0;
968 }
969
970 /*
971 * Otherwise, we were unblocked to process a different event, such as a
972 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
973 * process the event.
974 */
975 return 1;
976}
977
978/**
979 * check_vcpu_requests - check and handle pending vCPU requests
980 * @vcpu: the VCPU pointer
981 *
982 * Return: 1 if we should enter the guest
983 * 0 if we should exit to userspace
984 * < 0 if we should exit to userspace, where the return value indicates
985 * an error
986 */
987static int check_vcpu_requests(struct kvm_vcpu *vcpu)
988{
989 if (kvm_request_pending(vcpu)) {
990 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
991 return -EIO;
992
993 if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
994 kvm_vcpu_sleep(vcpu);
995
996 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
997 kvm_reset_vcpu(vcpu);
998
999 /*
1000 * Clear IRQ_PENDING requests that were made to guarantee
1001 * that a VCPU sees new virtual interrupts.
1002 */
1003 kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
1004
1005 if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
1006 kvm_update_stolen_time(vcpu);
1007
1008 if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
1009 /* The distributor enable bits were changed */
1010 preempt_disable();
1011 vgic_v4_put(vcpu);
1012 vgic_v4_load(vcpu);
1013 preempt_enable();
1014 }
1015
1016 if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
1017 kvm_vcpu_reload_pmu(vcpu);
1018
1019 if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
1020 kvm_vcpu_pmu_restore_guest(vcpu);
1021
1022 if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
1023 return kvm_vcpu_suspend(vcpu);
1024
1025 if (kvm_dirty_ring_check_request(vcpu))
1026 return 0;
1027
1028 check_nested_vcpu_requests(vcpu);
1029 }
1030
1031 return 1;
1032}
1033
1034static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
1035{
1036 if (likely(!vcpu_mode_is_32bit(vcpu)))
1037 return false;
1038
1039 if (vcpu_has_nv(vcpu))
1040 return true;
1041
1042 return !kvm_supports_32bit_el0();
1043}
1044
1045/**
1046 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
1047 * @vcpu: The VCPU pointer
1048 * @ret: Pointer to write optional return code
1049 *
1050 * Returns: true if the VCPU needs to return to a preemptible + interruptible
1051 * and skip guest entry.
1052 *
1053 * This function disambiguates between two different types of exits: exits to a
1054 * preemptible + interruptible kernel context and exits to userspace. For an
1055 * exit to userspace, this function will write the return code to ret and return
1056 * true. For an exit to preemptible + interruptible kernel context (i.e. check
1057 * for pending work and re-enter), return true without writing to ret.
1058 */
1059static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
1060{
1061 struct kvm_run *run = vcpu->run;
1062
1063 /*
1064 * If we're using a userspace irqchip, then check if we need
1065 * to tell a userspace irqchip about timer or PMU level
1066 * changes and if so, exit to userspace (the actual level
1067 * state gets updated in kvm_timer_update_run and
1068 * kvm_pmu_update_run below).
1069 */
1070 if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
1071 if (kvm_timer_should_notify_user(vcpu) ||
1072 kvm_pmu_should_notify_user(vcpu)) {
1073 *ret = -EINTR;
1074 run->exit_reason = KVM_EXIT_INTR;
1075 return true;
1076 }
1077 }
1078
1079 if (unlikely(vcpu_on_unsupported_cpu(vcpu))) {
1080 run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1081 run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED;
1082 run->fail_entry.cpu = smp_processor_id();
1083 *ret = 0;
1084 return true;
1085 }
1086
1087 return kvm_request_pending(vcpu) ||
1088 xfer_to_guest_mode_work_pending();
1089}
1090
1091/*
1092 * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
1093 * the vCPU is running.
1094 *
1095 * This must be noinstr as instrumentation may make use of RCU, and this is not
1096 * safe during the EQS.
1097 */
1098static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
1099{
1100 int ret;
1101
1102 guest_state_enter_irqoff();
1103 ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
1104 guest_state_exit_irqoff();
1105
1106 return ret;
1107}
1108
1109/**
1110 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
1111 * @vcpu: The VCPU pointer
1112 *
1113 * This function is called through the VCPU_RUN ioctl called from user space. It
1114 * will execute VM code in a loop until the time slice for the process is used
1115 * or some emulation is needed from user space in which case the function will
1116 * return with return value 0 and with the kvm_run structure filled in with the
1117 * required data for the requested emulation.
1118 */
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;
	int ret;

	/*
	 * The previous exit was an MMIO exit: let kvm_handle_mmio_return()
	 * deal with it first. A zero or negative result is handed straight
	 * back to the caller, before the vcpu is loaded.
	 */
	if (run->exit_reason == KVM_EXIT_MMIO) {
		ret = kvm_handle_mmio_return(vcpu);
		if (ret <= 0)
			return ret;
	}

	vcpu_load(vcpu);

	/*
	 * Bail out with -EINTR without entering the run loop. The exit goes
	 * through 'out' so that vcpu_put() balances the vcpu_load() above.
	 */
	if (!vcpu->wants_to_run) {
		ret = -EINTR;
		goto out;
	}

	kvm_sigset_activate(vcpu);

	/*
	 * Loop convention: ret > 0 means "go around again", ret <= 0 ends
	 * the loop and is the value returned by this function.
	 */
	ret = 1;
	run->exit_reason = KVM_EXIT_UNKNOWN;
	run->flags = 0;
	while (ret > 0) {
		/*
		 * Check conditions before entering the guest
		 */
		ret = xfer_to_guest_mode_handle_work(vcpu);
		/* A zero result is mapped to 1 so that the loop carries on */
		if (!ret)
			ret = 1;

		if (ret > 0)
			ret = check_vcpu_requests(vcpu);

		/*
		 * Preparing the interrupts to be injected also
		 * involves poking the GIC, which must be done in a
		 * non-preemptible context.
		 */
		preempt_disable();

		kvm_pmu_flush_hwstate(vcpu);

		local_irq_disable();

		kvm_vgic_flush_hwstate(vcpu);

		kvm_pmu_update_vcpu_events(vcpu);

		/*
		 * Ensure we set mode to IN_GUEST_MODE after we disable
		 * interrupts and before the final VCPU requests check.
		 * See the comment in kvm_vcpu_exiting_guest_mode() and
		 * Documentation/virt/kvm/vcpu-requests.rst
		 */
		smp_store_mb(vcpu->mode, IN_GUEST_MODE);

		/*
		 * Either an earlier check asked us to stop (ret <= 0) or an
		 * exit request showed up: undo the flush work done above,
		 * re-enable interrupts and preemption, and let the loop
		 * condition decide whether to retry or leave.
		 */
		if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
			vcpu->mode = OUTSIDE_GUEST_MODE;
			isb(); /* Ensure work in x_flush_hwstate is committed */
			kvm_pmu_sync_hwstate(vcpu);
			if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
				kvm_timer_sync_user(vcpu);
			kvm_vgic_sync_hwstate(vcpu);
			local_irq_enable();
			preempt_enable();
			continue;
		}

		kvm_arm_setup_debug(vcpu);
		kvm_arch_vcpu_ctxflush_fp(vcpu);

		/**************************************************************
		 * Enter the guest
		 */
		trace_kvm_entry(*vcpu_pc(vcpu));
		guest_timing_enter_irqoff();

		ret = kvm_arm_vcpu_enter_exit(vcpu);

		vcpu->mode = OUTSIDE_GUEST_MODE;
		vcpu->stat.exits++;
		/*
		 * Back from guest
		 *************************************************************/

		kvm_arm_clear_debug(vcpu);

		/*
		 * We must sync the PMU state before the vgic state so
		 * that the vgic can properly sample the updated state of the
		 * interrupt line.
		 */
		kvm_pmu_sync_hwstate(vcpu);

		/*
		 * Sync the vgic state before syncing the timer state because
		 * the timer code needs to know if the virtual timer
		 * interrupts are active.
		 */
		kvm_vgic_sync_hwstate(vcpu);

		/*
		 * Sync the timer hardware state before enabling interrupts as
		 * we don't want vtimer interrupts to race with syncing the
		 * timer virtual interrupt state.
		 */
		if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
			kvm_timer_sync_user(vcpu);

		kvm_arch_vcpu_ctxsync_fp(vcpu);

		/*
		 * We must ensure that any pending interrupts are taken before
		 * we exit guest timing so that timer ticks are accounted as
		 * guest time. Transiently unmask interrupts so that any
		 * pending interrupts are taken.
		 *
		 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
		 * context synchronization event) is necessary to ensure that
		 * pending interrupts are taken.
		 */
		if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) {
			local_irq_enable();
			isb();
			local_irq_disable();
		}

		guest_timing_exit_irqoff();

		local_irq_enable();

		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));

		/* Exit types that need handling before we can be preempted */
		handle_exit_early(vcpu, ret);

		preempt_enable();

		/*
		 * The ARMv8 architecture doesn't give the hypervisor
		 * a mechanism to prevent a guest from dropping to AArch32 EL0
		 * if implemented by the CPU. If we spot the guest in such
		 * state and that we decided it wasn't supposed to do so (like
		 * with the asymmetric AArch32 case), return to userspace with
		 * a fatal error.
		 */
		if (vcpu_mode_is_bad_32bit(vcpu)) {
			/*
			 * As we have caught the guest red-handed, decide that
			 * it isn't fit for purpose anymore by making the vcpu
			 * invalid. The VMM can try and fix it by issuing a
			 * KVM_ARM_VCPU_INIT if it really wants to.
			 */
			vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
			ret = ARM_EXCEPTION_IL;
		}

		/* The result of handle_exit() drives the loop condition */
		ret = handle_exit(vcpu, ret);
	}

	/* Tell userspace about in-kernel device output levels */
	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
		kvm_timer_update_run(vcpu);
		kvm_pmu_update_run(vcpu);
	}

	kvm_sigset_deactivate(vcpu);

out:
	/*
	 * In the unlikely event that we are returning to userspace
	 * with pending exceptions or PC adjustment, commit these
	 * adjustments in order to give userspace a consistent view of
	 * the vcpu state. Note that this relies on __kvm_adjust_pc()
	 * being preempt-safe on VHE.
	 */
	if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) ||
		     vcpu_get_flag(vcpu, INCREMENT_PC)))
		kvm_call_hyp(__kvm_adjust_pc, vcpu);

	vcpu_put(vcpu);
	return ret;
}
1303
1304static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
1305{
1306 int bit_index;
1307 bool set;
1308 unsigned long *hcr;
1309
1310 if (number == KVM_ARM_IRQ_CPU_IRQ)
1311 bit_index = __ffs(HCR_VI);
1312 else /* KVM_ARM_IRQ_CPU_FIQ */
1313 bit_index = __ffs(HCR_VF);
1314
1315 hcr = vcpu_hcr(vcpu);
1316 if (level)
1317 set = test_and_set_bit(bit_index, hcr);
1318 else
1319 set = test_and_clear_bit(bit_index, hcr);
1320
1321 /*
1322 * If we didn't change anything, no need to wake up or kick other CPUs
1323 */
1324 if (set == level)
1325 return 0;
1326
1327 /*
1328 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
1329 * trigger a world-switch round on the running physical CPU to set the
1330 * virtual IRQ/FIQ fields in the HCR appropriately.
1331 */
1332 kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
1333 kvm_vcpu_kick(vcpu);
1334
1335 return 0;
1336}
1337
1338int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
1339 bool line_status)
1340{
1341 u32 irq = irq_level->irq;
1342 unsigned int irq_type, vcpu_id, irq_num;
1343 struct kvm_vcpu *vcpu = NULL;
1344 bool level = irq_level->level;
1345
1346 irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
1347 vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
1348 vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
1349 irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
1350
1351 trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level);
1352
1353 switch (irq_type) {
1354 case KVM_ARM_IRQ_TYPE_CPU:
1355 if (irqchip_in_kernel(kvm))
1356 return -ENXIO;
1357
1358 vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1359 if (!vcpu)
1360 return -EINVAL;
1361
1362 if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
1363 return -EINVAL;
1364
1365 return vcpu_interrupt_line(vcpu, irq_num, level);
1366 case KVM_ARM_IRQ_TYPE_PPI:
1367 if (!irqchip_in_kernel(kvm))
1368 return -ENXIO;
1369
1370 vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
1371 if (!vcpu)
1372 return -EINVAL;
1373
1374 if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
1375 return -EINVAL;
1376
1377 return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL);
1378 case KVM_ARM_IRQ_TYPE_SPI:
1379 if (!irqchip_in_kernel(kvm))
1380 return -ENXIO;
1381
1382 if (irq_num < VGIC_NR_PRIVATE_IRQS)
1383 return -EINVAL;
1384
1385 return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL);
1386 }
1387
1388 return -EINVAL;
1389}
1390
1391static unsigned long system_supported_vcpu_features(void)
1392{
1393 unsigned long features = KVM_VCPU_VALID_FEATURES;
1394
1395 if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
1396 clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features);
1397
1398 if (!kvm_arm_support_pmu_v3())
1399 clear_bit(KVM_ARM_VCPU_PMU_V3, &features);
1400
1401 if (!system_supports_sve())
1402 clear_bit(KVM_ARM_VCPU_SVE, &features);
1403
1404 if (!kvm_has_full_ptr_auth()) {
1405 clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features);
1406 clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features);
1407 }
1408
1409 if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
1410 clear_bit(KVM_ARM_VCPU_HAS_EL2, &features);
1411
1412 return features;
1413}
1414
1415static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
1416 const struct kvm_vcpu_init *init)
1417{
1418 unsigned long features = init->features[0];
1419 int i;
1420
1421 if (features & ~KVM_VCPU_VALID_FEATURES)
1422 return -ENOENT;
1423
1424 for (i = 1; i < ARRAY_SIZE(init->features); i++) {
1425 if (init->features[i])
1426 return -ENOENT;
1427 }
1428
1429 if (features & ~system_supported_vcpu_features())
1430 return -EINVAL;
1431
1432 /*
1433 * For now make sure that both address/generic pointer authentication
1434 * features are requested by the userspace together.
1435 */
1436 if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) !=
1437 test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features))
1438 return -EINVAL;
1439
1440 if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
1441 return 0;
1442
1443 /* MTE is incompatible with AArch32 */
1444 if (kvm_has_mte(vcpu->kvm))
1445 return -EINVAL;
1446
1447 /* NV is incompatible with AArch32 */
1448 if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
1449 return -EINVAL;
1450
1451 return 0;
1452}
1453
1454static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
1455 const struct kvm_vcpu_init *init)
1456{
1457 unsigned long features = init->features[0];
1458
1459 return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features,
1460 KVM_VCPU_MAX_FEATURES);
1461}
1462
1463static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
1464{
1465 struct kvm *kvm = vcpu->kvm;
1466 int ret = 0;
1467
1468 /*
1469 * When the vCPU has a PMU, but no PMU is set for the guest
1470 * yet, set the default one.
1471 */
1472 if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu)
1473 ret = kvm_arm_set_default_pmu(kvm);
1474
1475 /* Prepare for nested if required */
1476 if (!ret && vcpu_has_nv(vcpu))
1477 ret = kvm_vcpu_init_nested(vcpu);
1478
1479 return ret;
1480}
1481
1482static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1483 const struct kvm_vcpu_init *init)
1484{
1485 unsigned long features = init->features[0];
1486 struct kvm *kvm = vcpu->kvm;
1487 int ret = -EINVAL;
1488
1489 mutex_lock(&kvm->arch.config_lock);
1490
1491 if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
1492 kvm_vcpu_init_changed(vcpu, init))
1493 goto out_unlock;
1494
1495 bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES);
1496
1497 ret = kvm_setup_vcpu(vcpu);
1498 if (ret)
1499 goto out_unlock;
1500
1501 /* Now we know what it is, we can reset it. */
1502 kvm_reset_vcpu(vcpu);
1503
1504 set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
1505 vcpu_set_flag(vcpu, VCPU_INITIALIZED);
1506 ret = 0;
1507out_unlock:
1508 mutex_unlock(&kvm->arch.config_lock);
1509 return ret;
1510}
1511
1512static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
1513 const struct kvm_vcpu_init *init)
1514{
1515 int ret;
1516
1517 if (init->target != KVM_ARM_TARGET_GENERIC_V8 &&
1518 init->target != kvm_target_cpu())
1519 return -EINVAL;
1520
1521 ret = kvm_vcpu_init_check_features(vcpu, init);
1522 if (ret)
1523 return ret;
1524
1525 if (!kvm_vcpu_initialized(vcpu))
1526 return __kvm_vcpu_set_target(vcpu, init);
1527
1528 if (kvm_vcpu_init_changed(vcpu, init))
1529 return -EINVAL;
1530
1531 kvm_reset_vcpu(vcpu);
1532 return 0;
1533}
1534
1535static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
1536 struct kvm_vcpu_init *init)
1537{
1538 bool power_off = false;
1539 int ret;
1540
1541 /*
1542 * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid
1543 * reflecting it in the finalized feature set, thus limiting its scope
1544 * to a single KVM_ARM_VCPU_INIT call.
1545 */
1546 if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) {
1547 init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);
1548 power_off = true;
1549 }
1550
1551 ret = kvm_vcpu_set_target(vcpu, init);
1552 if (ret)
1553 return ret;
1554
1555 /*
1556 * Ensure a rebooted VM will fault in RAM pages and detect if the
1557 * guest MMU is turned off and flush the caches as needed.
1558 *
1559 * S2FWB enforces all memory accesses to RAM being cacheable,
1560 * ensuring that the data side is always coherent. We still
1561 * need to invalidate the I-cache though, as FWB does *not*
1562 * imply CTR_EL0.DIC.
1563 */
1564 if (vcpu_has_run_once(vcpu)) {
1565 if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
1566 stage2_unmap_vm(vcpu->kvm);
1567 else
1568 icache_inval_all_pou();
1569 }
1570
1571 vcpu_reset_hcr(vcpu);
1572 vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
1573
1574 /*
1575 * Handle the "start in power-off" case.
1576 */
1577 spin_lock(&vcpu->arch.mp_state_lock);
1578
1579 if (power_off)
1580 __kvm_arm_vcpu_power_off(vcpu);
1581 else
1582 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
1583
1584 spin_unlock(&vcpu->arch.mp_state_lock);
1585
1586 return 0;
1587}
1588
/*
 * Set a vCPU device attribute. Every attribute group is forwarded to
 * kvm_arm_vcpu_arch_set_attr(), whose result is returned.
 */
static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	return kvm_arm_vcpu_arch_set_attr(vcpu, attr);
}
1602
/*
 * Read a vCPU device attribute. Every attribute group is forwarded to
 * kvm_arm_vcpu_arch_get_attr(), whose result is returned.
 */
static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	return kvm_arm_vcpu_arch_get_attr(vcpu, attr);
}
1616
/*
 * Probe for a vCPU device attribute. Every attribute group is forwarded
 * to kvm_arm_vcpu_arch_has_attr(), whose result is returned.
 */
static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
				 struct kvm_device_attr *attr)
{
	return kvm_arm_vcpu_arch_has_attr(vcpu, attr);
}
1630
1631static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
1632 struct kvm_vcpu_events *events)
1633{
1634 memset(events, 0, sizeof(*events));
1635
1636 return __kvm_arm_vcpu_get_events(vcpu, events);
1637}
1638
1639static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
1640 struct kvm_vcpu_events *events)
1641{
1642 int i;
1643
1644 /* check whether the reserved field is zero */
1645 for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
1646 if (events->reserved[i])
1647 return -EINVAL;
1648
1649 /* check whether the pad field is zero */
1650 for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
1651 if (events->exception.pad[i])
1652 return -EINVAL;
1653
1654 return __kvm_arm_vcpu_set_events(vcpu, events);
1655}
1656
1657long kvm_arch_vcpu_ioctl(struct file *filp,
1658 unsigned int ioctl, unsigned long arg)
1659{
1660 struct kvm_vcpu *vcpu = filp->private_data;
1661 void __user *argp = (void __user *)arg;
1662 struct kvm_device_attr attr;
1663 long r;
1664
1665 switch (ioctl) {
1666 case KVM_ARM_VCPU_INIT: {
1667 struct kvm_vcpu_init init;
1668
1669 r = -EFAULT;
1670 if (copy_from_user(&init, argp, sizeof(init)))
1671 break;
1672
1673 r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
1674 break;
1675 }
1676 case KVM_SET_ONE_REG:
1677 case KVM_GET_ONE_REG: {
1678 struct kvm_one_reg reg;
1679
1680 r = -ENOEXEC;
1681 if (unlikely(!kvm_vcpu_initialized(vcpu)))
1682 break;
1683
1684 r = -EFAULT;
1685 if (copy_from_user(®, argp, sizeof(reg)))
1686 break;
1687
1688 /*
1689 * We could owe a reset due to PSCI. Handle the pending reset
1690 * here to ensure userspace register accesses are ordered after
1691 * the reset.
1692 */
1693 if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
1694 kvm_reset_vcpu(vcpu);
1695
1696 if (ioctl == KVM_SET_ONE_REG)
1697 r = kvm_arm_set_reg(vcpu, ®);
1698 else
1699 r = kvm_arm_get_reg(vcpu, ®);
1700 break;
1701 }
1702 case KVM_GET_REG_LIST: {
1703 struct kvm_reg_list __user *user_list = argp;
1704 struct kvm_reg_list reg_list;
1705 unsigned n;
1706
1707 r = -ENOEXEC;
1708 if (unlikely(!kvm_vcpu_initialized(vcpu)))
1709 break;
1710
1711 r = -EPERM;
1712 if (!kvm_arm_vcpu_is_finalized(vcpu))
1713 break;
1714
1715 r = -EFAULT;
1716 if (copy_from_user(®_list, user_list, sizeof(reg_list)))
1717 break;
1718 n = reg_list.n;
1719 reg_list.n = kvm_arm_num_regs(vcpu);
1720 if (copy_to_user(user_list, ®_list, sizeof(reg_list)))
1721 break;
1722 r = -E2BIG;
1723 if (n < reg_list.n)
1724 break;
1725 r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
1726 break;
1727 }
1728 case KVM_SET_DEVICE_ATTR: {
1729 r = -EFAULT;
1730 if (copy_from_user(&attr, argp, sizeof(attr)))
1731 break;
1732 r = kvm_arm_vcpu_set_attr(vcpu, &attr);
1733 break;
1734 }
1735 case KVM_GET_DEVICE_ATTR: {
1736 r = -EFAULT;
1737 if (copy_from_user(&attr, argp, sizeof(attr)))
1738 break;
1739 r = kvm_arm_vcpu_get_attr(vcpu, &attr);
1740 break;
1741 }
1742 case KVM_HAS_DEVICE_ATTR: {
1743 r = -EFAULT;
1744 if (copy_from_user(&attr, argp, sizeof(attr)))
1745 break;
1746 r = kvm_arm_vcpu_has_attr(vcpu, &attr);
1747 break;
1748 }
1749 case KVM_GET_VCPU_EVENTS: {
1750 struct kvm_vcpu_events events;
1751
1752 if (kvm_arm_vcpu_get_events(vcpu, &events))
1753 return -EINVAL;
1754
1755 if (copy_to_user(argp, &events, sizeof(events)))
1756 return -EFAULT;
1757
1758 return 0;
1759 }
1760 case KVM_SET_VCPU_EVENTS: {
1761 struct kvm_vcpu_events events;
1762
1763 if (copy_from_user(&events, argp, sizeof(events)))
1764 return -EFAULT;
1765
1766 return kvm_arm_vcpu_set_events(vcpu, &events);
1767 }
1768 case KVM_ARM_VCPU_FINALIZE: {
1769 int what;
1770
1771 if (!kvm_vcpu_initialized(vcpu))
1772 return -ENOEXEC;
1773
1774 if (get_user(what, (const int __user *)argp))
1775 return -EFAULT;
1776
1777 return kvm_arm_vcpu_finalize(vcpu, what);
1778 }
1779 default:
1780 r = -EINVAL;
1781 }
1782
1783 return r;
1784}
1785
/* Intentionally empty: no work is done here for a dirty-log sync. */
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}
1790
1791static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
1792 struct kvm_arm_device_addr *dev_addr)
1793{
1794 switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) {
1795 case KVM_ARM_DEVICE_VGIC_V2:
1796 if (!vgic_present)
1797 return -ENXIO;
1798 return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr);
1799 default:
1800 return -ENODEV;
1801 }
1802}
1803
1804static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1805{
1806 switch (attr->group) {
1807 case KVM_ARM_VM_SMCCC_CTRL:
1808 return kvm_vm_smccc_has_attr(kvm, attr);
1809 default:
1810 return -ENXIO;
1811 }
1812}
1813
1814static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
1815{
1816 switch (attr->group) {
1817 case KVM_ARM_VM_SMCCC_CTRL:
1818 return kvm_vm_smccc_set_attr(kvm, attr);
1819 default:
1820 return -ENXIO;
1821 }
1822}
1823
1824int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
1825{
1826 struct kvm *kvm = filp->private_data;
1827 void __user *argp = (void __user *)arg;
1828 struct kvm_device_attr attr;
1829
1830 switch (ioctl) {
1831 case KVM_CREATE_IRQCHIP: {
1832 int ret;
1833 if (!vgic_present)
1834 return -ENXIO;
1835 mutex_lock(&kvm->lock);
1836 ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
1837 mutex_unlock(&kvm->lock);
1838 return ret;
1839 }
1840 case KVM_ARM_SET_DEVICE_ADDR: {
1841 struct kvm_arm_device_addr dev_addr;
1842
1843 if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
1844 return -EFAULT;
1845 return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
1846 }
1847 case KVM_ARM_PREFERRED_TARGET: {
1848 struct kvm_vcpu_init init = {
1849 .target = KVM_ARM_TARGET_GENERIC_V8,
1850 };
1851
1852 if (copy_to_user(argp, &init, sizeof(init)))
1853 return -EFAULT;
1854
1855 return 0;
1856 }
1857 case KVM_ARM_MTE_COPY_TAGS: {
1858 struct kvm_arm_copy_mte_tags copy_tags;
1859
1860 if (copy_from_user(©_tags, argp, sizeof(copy_tags)))
1861 return -EFAULT;
1862 return kvm_vm_ioctl_mte_copy_tags(kvm, ©_tags);
1863 }
1864 case KVM_ARM_SET_COUNTER_OFFSET: {
1865 struct kvm_arm_counter_offset offset;
1866
1867 if (copy_from_user(&offset, argp, sizeof(offset)))
1868 return -EFAULT;
1869 return kvm_vm_ioctl_set_counter_offset(kvm, &offset);
1870 }
1871 case KVM_HAS_DEVICE_ATTR: {
1872 if (copy_from_user(&attr, argp, sizeof(attr)))
1873 return -EFAULT;
1874
1875 return kvm_vm_has_attr(kvm, &attr);
1876 }
1877 case KVM_SET_DEVICE_ATTR: {
1878 if (copy_from_user(&attr, argp, sizeof(attr)))
1879 return -EFAULT;
1880
1881 return kvm_vm_set_attr(kvm, &attr);
1882 }
1883 case KVM_ARM_GET_REG_WRITABLE_MASKS: {
1884 struct reg_mask_range range;
1885
1886 if (copy_from_user(&range, argp, sizeof(range)))
1887 return -EFAULT;
1888 return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range);
1889 }
1890 default:
1891 return -EINVAL;
1892 }
1893}
1894
1895/* unlocks vcpus from @vcpu_lock_idx and smaller */
1896static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
1897{
1898 struct kvm_vcpu *tmp_vcpu;
1899
1900 for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
1901 tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
1902 mutex_unlock(&tmp_vcpu->mutex);
1903 }
1904}
1905
1906void unlock_all_vcpus(struct kvm *kvm)
1907{
1908 lockdep_assert_held(&kvm->lock);
1909
1910 unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
1911}
1912
1913/* Returns true if all vcpus were locked, false otherwise */
1914bool lock_all_vcpus(struct kvm *kvm)
1915{
1916 struct kvm_vcpu *tmp_vcpu;
1917 unsigned long c;
1918
1919 lockdep_assert_held(&kvm->lock);
1920
1921 /*
1922 * Any time a vcpu is in an ioctl (including running), the
1923 * core KVM code tries to grab the vcpu->mutex.
1924 *
1925 * By grabbing the vcpu->mutex of all VCPUs we ensure that no
1926 * other VCPUs can fiddle with the state while we access it.
1927 */
1928 kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
1929 if (!mutex_trylock(&tmp_vcpu->mutex)) {
1930 unlock_vcpus(kvm, c - 1);
1931 return false;
1932 }
1933 }
1934
1935 return true;
1936}
1937
1938static unsigned long nvhe_percpu_size(void)
1939{
1940 return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
1941 (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
1942}
1943
/* Page order covering the nVHE per-cpu section; 0 when the section is empty. */
static unsigned long nvhe_percpu_order(void)
{
	unsigned long bytes = nvhe_percpu_size();

	if (!bytes)
		return 0;

	return get_order(bytes);
}
1950
1951static size_t pkvm_host_sve_state_order(void)
1952{
1953 return get_order(pkvm_host_sve_state_size());
1954}
1955
1956/* A lookup table holding the hypervisor VA for each vector slot */
1957static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];
1958
1959static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
1960{
1961 hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot);
1962}
1963
1964static int kvm_init_vector_slots(void)
1965{
1966 int err;
1967 void *base;
1968
1969 base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
1970 kvm_init_vector_slot(base, HYP_VECTOR_DIRECT);
1971
1972 base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
1973 kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT);
1974
1975 if (kvm_system_needs_idmapped_vectors() &&
1976 !is_protected_kvm_enabled()) {
1977 err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
1978 __BP_HARDEN_HYP_VECS_SZ, &base);
1979 if (err)
1980 return err;
1981 }
1982
1983 kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT);
1984 kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT);
1985 return 0;
1986}
1987
1988static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
1989{
1990 struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
1991 unsigned long tcr, ips;
1992
1993 /*
1994 * Calculate the raw per-cpu offset without a translation from the
1995 * kernel's mapping to the linear mapping, and store it in tpidr_el2
1996 * so that we can use adr_l to access per-cpu variables in EL2.
1997 * Also drop the KASAN tag which gets in the way...
1998 */
1999 params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
2000 (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
2001
2002 params->mair_el2 = read_sysreg(mair_el1);
2003
2004 tcr = read_sysreg(tcr_el1);
2005 ips = FIELD_GET(TCR_IPS_MASK, tcr);
2006 if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
2007 tcr |= TCR_EPD1_MASK;
2008 } else {
2009 tcr &= TCR_EL2_MASK;
2010 tcr |= TCR_EL2_RES1;
2011 }
2012 tcr &= ~TCR_T0SZ_MASK;
2013 tcr |= TCR_T0SZ(hyp_va_bits);
2014 tcr &= ~TCR_EL2_PS_MASK;
2015 tcr |= FIELD_PREP(TCR_EL2_PS_MASK, ips);
2016 if (lpa2_is_enabled())
2017 tcr |= TCR_EL2_DS;
2018 params->tcr_el2 = tcr;
2019
2020 params->pgd_pa = kvm_mmu_get_httbr();
2021 if (is_protected_kvm_enabled())
2022 params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
2023 else
2024 params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
2025 if (cpus_have_final_cap(ARM64_KVM_HVHE))
2026 params->hcr_el2 |= HCR_E2H;
2027 params->vttbr = params->vtcr = 0;
2028
2029 /*
2030 * Flush the init params from the data cache because the struct will
2031 * be read while the MMU is off.
2032 */
2033 kvm_flush_dcache_to_poc(params, sizeof(*params));
2034}
2035
2036static void hyp_install_host_vector(void)
2037{
2038 struct kvm_nvhe_init_params *params;
2039 struct arm_smccc_res res;
2040
2041 /* Switch from the HYP stub to our own HYP init vector */
2042 __hyp_set_vectors(kvm_get_idmap_vector());
2043
2044 /*
2045 * Call initialization code, and switch to the full blown HYP code.
2046 * If the cpucaps haven't been finalized yet, something has gone very
2047 * wrong, and hyp will crash and burn when it uses any
2048 * cpus_have_*_cap() wrapper.
2049 */
2050 BUG_ON(!system_capabilities_finalized());
2051 params = this_cpu_ptr_nvhe_sym(kvm_init_params);
2052 arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
2053 WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
2054}
2055
2056static void cpu_init_hyp_mode(void)
2057{
2058 hyp_install_host_vector();
2059
2060 /*
2061 * Disabling SSBD on a non-VHE system requires us to enable SSBS
2062 * at EL2.
2063 */
2064 if (this_cpu_has_cap(ARM64_SSBS) &&
2065 arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
2066 kvm_call_hyp_nvhe(__kvm_enable_ssbs);
2067 }
2068}
2069
/* Reset the HYP vectors, unless the kernel itself runs in hyp mode. */
static void cpu_hyp_reset(void)
{
	if (is_kernel_in_hyp_mode())
		return;

	__hyp_reset_vectors();
}
2075
2076/*
2077 * EL2 vectors can be mapped and rerouted in a number of ways,
2078 * depending on the kernel configuration and CPU present:
2079 *
2080 * - If the CPU is affected by Spectre-v2, the hardening sequence is
2081 * placed in one of the vector slots, which is executed before jumping
2082 * to the real vectors.
2083 *
2084 * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot
2085 * containing the hardening sequence is mapped next to the idmap page,
2086 * and executed before jumping to the real vectors.
2087 *
2088 * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an
2089 * empty slot is selected, mapped next to the idmap page, and
2090 * executed before jumping to the real vectors.
2091 *
2092 * Note that ARM64_SPECTRE_V3A is somewhat incompatible with
2093 * VHE, as we don't have hypervisor-specific mappings. If the system
2094 * is VHE and yet selects this capability, it will be ignored.
2095 */
2096static void cpu_set_hyp_vector(void)
2097{
2098 struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);
2099 void *vector = hyp_spectre_vector_selector[data->slot];
2100
2101 if (!is_protected_kvm_enabled())
2102 *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector;
2103 else
2104 kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot);
2105}
2106
2107static void cpu_hyp_init_context(void)
2108{
2109 kvm_init_host_cpu_context(host_data_ptr(host_ctxt));
2110
2111 if (!is_kernel_in_hyp_mode())
2112 cpu_init_hyp_mode();
2113}
2114
2115static void cpu_hyp_init_features(void)
2116{
2117 cpu_set_hyp_vector();
2118 kvm_arm_init_debug();
2119
2120 if (is_kernel_in_hyp_mode())
2121 kvm_timer_init_vhe();
2122
2123 if (vgic_present)
2124 kvm_vgic_init_cpu_hardware();
2125}
2126
/* Full per-CPU re-initialisation: reset, then context, then features. */
static void cpu_hyp_reinit(void)
{
	/* The order matters: start from a reset state before re-initialising */
	cpu_hyp_reset();

	cpu_hyp_init_context();
	cpu_hyp_init_features();
}
2133
2134static void cpu_hyp_init(void *discard)
2135{
2136 if (!__this_cpu_read(kvm_hyp_initialized)) {
2137 cpu_hyp_reinit();
2138 __this_cpu_write(kvm_hyp_initialized, 1);
2139 }
2140}
2141
2142static void cpu_hyp_uninit(void *discard)
2143{
2144 if (__this_cpu_read(kvm_hyp_initialized)) {
2145 cpu_hyp_reset();
2146 __this_cpu_write(kvm_hyp_initialized, 0);
2147 }
2148}
2149
2150int kvm_arch_enable_virtualization_cpu(void)
2151{
2152 /*
2153 * Most calls to this function are made with migration
2154 * disabled, but not with preemption disabled. The former is
2155 * enough to ensure correctness, but most of the helpers
2156 * expect the later and will throw a tantrum otherwise.
2157 */
2158 preempt_disable();
2159
2160 cpu_hyp_init(NULL);
2161
2162 kvm_vgic_cpu_up();
2163 kvm_timer_cpu_up();
2164
2165 preempt_enable();
2166
2167 return 0;
2168}
2169
2170void kvm_arch_disable_virtualization_cpu(void)
2171{
2172 kvm_timer_cpu_down();
2173 kvm_vgic_cpu_down();
2174
2175 if (!is_protected_kvm_enabled())
2176 cpu_hyp_uninit(NULL);
2177}
2178
2179#ifdef CONFIG_CPU_PM
2180static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
2181 unsigned long cmd,
2182 void *v)
2183{
2184 /*
2185 * kvm_hyp_initialized is left with its old value over
2186 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
2187 * re-enable hyp.
2188 */
2189 switch (cmd) {
2190 case CPU_PM_ENTER:
2191 if (__this_cpu_read(kvm_hyp_initialized))
2192 /*
2193 * don't update kvm_hyp_initialized here
2194 * so that the hyp will be re-enabled
2195 * when we resume. See below.
2196 */
2197 cpu_hyp_reset();
2198
2199 return NOTIFY_OK;
2200 case CPU_PM_ENTER_FAILED:
2201 case CPU_PM_EXIT:
2202 if (__this_cpu_read(kvm_hyp_initialized))
2203 /* The hyp was enabled before suspend. */
2204 cpu_hyp_reinit();
2205
2206 return NOTIFY_OK;
2207
2208 default:
2209 return NOTIFY_DONE;
2210 }
2211}
2212
/* Notifier block hooking hyp_init_cpu_pm_notifier() into CPU PM events */
static struct notifier_block hyp_init_cpu_pm_nb = {
	.notifier_call = hyp_init_cpu_pm_notifier,
};
2216
2217static void __init hyp_cpu_pm_init(void)
2218{
2219 if (!is_protected_kvm_enabled())
2220 cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
2221}
2222static void __init hyp_cpu_pm_exit(void)
2223{
2224 if (!is_protected_kvm_enabled())
2225 cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
2226}
2227#else
2228static inline void __init hyp_cpu_pm_init(void)
2229{
2230}
2231static inline void __init hyp_cpu_pm_exit(void)
2232{
2233}
2234#endif
2235
2236static void __init init_cpu_logical_map(void)
2237{
2238 unsigned int cpu;
2239
2240 /*
2241 * Copy the MPIDR <-> logical CPU ID mapping to hyp.
2242 * Only copy the set of online CPUs whose features have been checked
2243 * against the finalized system capabilities. The hypervisor will not
2244 * allow any other CPUs from the `possible` set to boot.
2245 */
2246 for_each_online_cpu(cpu)
2247 hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu);
2248}
2249
2250#define init_psci_0_1_impl_state(config, what) \
2251 config.psci_0_1_ ## what ## _implemented = psci_ops.what
2252
2253static bool __init init_psci_relay(void)
2254{
2255 /*
2256 * If PSCI has not been initialized, protected KVM cannot install
2257 * itself on newly booted CPUs.
2258 */
2259 if (!psci_ops.get_version) {
2260 kvm_err("Cannot initialize protected mode without PSCI\n");
2261 return false;
2262 }
2263
2264 kvm_host_psci_config.version = psci_ops.get_version();
2265 kvm_host_psci_config.smccc_version = arm_smccc_get_version();
2266
2267 if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
2268 kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
2269 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend);
2270 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on);
2271 init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off);
2272 init_psci_0_1_impl_state(kvm_host_psci_config, migrate);
2273 }
2274 return true;
2275}
2276
2277static int __init init_subsystems(void)
2278{
2279 int err = 0;
2280
2281 /*
2282 * Enable hardware so that subsystem initialisation can access EL2.
2283 */
2284 on_each_cpu(cpu_hyp_init, NULL, 1);
2285
2286 /*
2287 * Register CPU lower-power notifier
2288 */
2289 hyp_cpu_pm_init();
2290
2291 /*
2292 * Init HYP view of VGIC
2293 */
2294 err = kvm_vgic_hyp_init();
2295 switch (err) {
2296 case 0:
2297 vgic_present = true;
2298 break;
2299 case -ENODEV:
2300 case -ENXIO:
2301 vgic_present = false;
2302 err = 0;
2303 break;
2304 default:
2305 goto out;
2306 }
2307
2308 /*
2309 * Init HYP architected timer support
2310 */
2311 err = kvm_timer_hyp_init(vgic_present);
2312 if (err)
2313 goto out;
2314
2315 kvm_register_perf_callbacks(NULL);
2316
2317out:
2318 if (err)
2319 hyp_cpu_pm_exit();
2320
2321 if (err || !is_protected_kvm_enabled())
2322 on_each_cpu(cpu_hyp_uninit, NULL, 1);
2323
2324 return err;
2325}
2326
2327static void __init teardown_subsystems(void)
2328{
2329 kvm_unregister_perf_callbacks();
2330 hyp_cpu_pm_exit();
2331}
2332
2333static void __init teardown_hyp_mode(void)
2334{
2335 bool free_sve = system_supports_sve() && is_protected_kvm_enabled();
2336 int cpu;
2337
2338 free_hyp_pgds();
2339 for_each_possible_cpu(cpu) {
2340 free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
2341 free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order());
2342
2343 if (free_sve) {
2344 struct cpu_sve_state *sve_state;
2345
2346 sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2347 free_pages((unsigned long) sve_state, pkvm_host_sve_state_order());
2348 }
2349 }
2350}
2351
2352static int __init do_pkvm_init(u32 hyp_va_bits)
2353{
2354 void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
2355 int ret;
2356
2357 preempt_disable();
2358 cpu_hyp_init_context();
2359 ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
2360 num_possible_cpus(), kern_hyp_va(per_cpu_base),
2361 hyp_va_bits);
2362 cpu_hyp_init_features();
2363
2364 /*
2365 * The stub hypercalls are now disabled, so set our local flag to
2366 * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
2367 */
2368 __this_cpu_write(kvm_hyp_initialized, 1);
2369 preempt_enable();
2370
2371 return ret;
2372}
2373
2374static u64 get_hyp_id_aa64pfr0_el1(void)
2375{
2376 /*
2377 * Track whether the system isn't affected by spectre/meltdown in the
2378 * hypervisor's view of id_aa64pfr0_el1, used for protected VMs.
2379 * Although this is per-CPU, we make it global for simplicity, e.g., not
2380 * to have to worry about vcpu migration.
2381 *
2382 * Unlike for non-protected VMs, userspace cannot override this for
2383 * protected VMs.
2384 */
2385 u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
2386
2387 val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
2388 ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
2389
2390 val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2),
2391 arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
2392 val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3),
2393 arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);
2394
2395 return val;
2396}
2397
2398static void kvm_hyp_init_symbols(void)
2399{
2400 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1();
2401 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
2402 kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
2403 kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
2404 kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
2405 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
2406 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
2407 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
2408 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
2409 kvm_nvhe_sym(__icache_flags) = __icache_flags;
2410 kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
2411}
2412
2413static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
2414{
2415 void *addr = phys_to_virt(hyp_mem_base);
2416 int ret;
2417
2418 ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP);
2419 if (ret)
2420 return ret;
2421
2422 ret = do_pkvm_init(hyp_va_bits);
2423 if (ret)
2424 return ret;
2425
2426 free_hyp_pgds();
2427
2428 return 0;
2429}
2430
2431static int init_pkvm_host_sve_state(void)
2432{
2433 int cpu;
2434
2435 if (!system_supports_sve())
2436 return 0;
2437
2438 /* Allocate pages for host sve state in protected mode. */
2439 for_each_possible_cpu(cpu) {
2440 struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order());
2441
2442 if (!page)
2443 return -ENOMEM;
2444
2445 per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page);
2446 }
2447
2448 /*
2449 * Don't map the pages in hyp since these are only used in protected
2450 * mode, which will (re)create its own mapping when initialized.
2451 */
2452
2453 return 0;
2454}
2455
2456/*
2457 * Finalizes the initialization of hyp mode, once everything else is initialized
2458 * and the initialziation process cannot fail.
2459 */
2460static void finalize_init_hyp_mode(void)
2461{
2462 int cpu;
2463
2464 if (system_supports_sve() && is_protected_kvm_enabled()) {
2465 for_each_possible_cpu(cpu) {
2466 struct cpu_sve_state *sve_state;
2467
2468 sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
2469 per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
2470 kern_hyp_va(sve_state);
2471 }
2472 } else {
2473 for_each_possible_cpu(cpu) {
2474 struct user_fpsimd_state *fpsimd_state;
2475
2476 fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs;
2477 per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state =
2478 kern_hyp_va(fpsimd_state);
2479 }
2480 }
2481}
2482
2483static void pkvm_hyp_init_ptrauth(void)
2484{
2485 struct kvm_cpu_context *hyp_ctxt;
2486 int cpu;
2487
2488 for_each_possible_cpu(cpu) {
2489 hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
2490 hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
2491 hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
2492 hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
2493 hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
2494 hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
2495 hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
2496 hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
2497 hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
2498 hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
2499 hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
2500 }
2501}
2502
2503/* Inits Hyp-mode on all online CPUs */
2504static int __init init_hyp_mode(void)
2505{
2506 u32 hyp_va_bits;
2507 int cpu;
2508 int err = -ENOMEM;
2509
2510 /*
2511 * The protected Hyp-mode cannot be initialized if the memory pool
2512 * allocation has failed.
2513 */
2514 if (is_protected_kvm_enabled() && !hyp_mem_base)
2515 goto out_err;
2516
2517 /*
2518 * Allocate Hyp PGD and setup Hyp identity mapping
2519 */
2520 err = kvm_mmu_init(&hyp_va_bits);
2521 if (err)
2522 goto out_err;
2523
2524 /*
2525 * Allocate stack pages for Hypervisor-mode
2526 */
2527 for_each_possible_cpu(cpu) {
2528 unsigned long stack_page;
2529
2530 stack_page = __get_free_page(GFP_KERNEL);
2531 if (!stack_page) {
2532 err = -ENOMEM;
2533 goto out_err;
2534 }
2535
2536 per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
2537 }
2538
2539 /*
2540 * Allocate and initialize pages for Hypervisor-mode percpu regions.
2541 */
2542 for_each_possible_cpu(cpu) {
2543 struct page *page;
2544 void *page_addr;
2545
2546 page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
2547 if (!page) {
2548 err = -ENOMEM;
2549 goto out_err;
2550 }
2551
2552 page_addr = page_address(page);
2553 memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
2554 kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
2555 }
2556
2557 /*
2558 * Map the Hyp-code called directly from the host
2559 */
2560 err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
2561 kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
2562 if (err) {
2563 kvm_err("Cannot map world-switch code\n");
2564 goto out_err;
2565 }
2566
2567 err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
2568 kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
2569 if (err) {
2570 kvm_err("Cannot map .hyp.rodata section\n");
2571 goto out_err;
2572 }
2573
2574 err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
2575 kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
2576 if (err) {
2577 kvm_err("Cannot map rodata section\n");
2578 goto out_err;
2579 }
2580
2581 /*
2582 * .hyp.bss is guaranteed to be placed at the beginning of the .bss
2583 * section thanks to an assertion in the linker script. Map it RW and
2584 * the rest of .bss RO.
2585 */
2586 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
2587 kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
2588 if (err) {
2589 kvm_err("Cannot map hyp bss section: %d\n", err);
2590 goto out_err;
2591 }
2592
2593 err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
2594 kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
2595 if (err) {
2596 kvm_err("Cannot map bss section\n");
2597 goto out_err;
2598 }
2599
2600 /*
2601 * Map the Hyp stack pages
2602 */
2603 for_each_possible_cpu(cpu) {
2604 struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
2605 char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
2606
2607 err = create_hyp_stack(__pa(stack_page), ¶ms->stack_hyp_va);
2608 if (err) {
2609 kvm_err("Cannot map hyp stack\n");
2610 goto out_err;
2611 }
2612
2613 /*
2614 * Save the stack PA in nvhe_init_params. This will be needed
2615 * to recreate the stack mapping in protected nVHE mode.
2616 * __hyp_pa() won't do the right thing there, since the stack
2617 * has been mapped in the flexible private VA space.
2618 */
2619 params->stack_pa = __pa(stack_page);
2620 }
2621
2622 for_each_possible_cpu(cpu) {
2623 char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
2624 char *percpu_end = percpu_begin + nvhe_percpu_size();
2625
2626 /* Map Hyp percpu pages */
2627 err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
2628 if (err) {
2629 kvm_err("Cannot map hyp percpu region\n");
2630 goto out_err;
2631 }
2632
2633 /* Prepare the CPU initialization parameters */
2634 cpu_prepare_hyp_mode(cpu, hyp_va_bits);
2635 }
2636
2637 kvm_hyp_init_symbols();
2638
2639 if (is_protected_kvm_enabled()) {
2640 if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
2641 cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH))
2642 pkvm_hyp_init_ptrauth();
2643
2644 init_cpu_logical_map();
2645
2646 if (!init_psci_relay()) {
2647 err = -ENODEV;
2648 goto out_err;
2649 }
2650
2651 err = init_pkvm_host_sve_state();
2652 if (err)
2653 goto out_err;
2654
2655 err = kvm_hyp_init_protection(hyp_va_bits);
2656 if (err) {
2657 kvm_err("Failed to init hyp memory protection\n");
2658 goto out_err;
2659 }
2660 }
2661
2662 return 0;
2663
2664out_err:
2665 teardown_hyp_mode();
2666 kvm_err("error initializing Hyp mode: %d\n", err);
2667 return err;
2668}
2669
2670struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
2671{
2672 struct kvm_vcpu *vcpu = NULL;
2673 struct kvm_mpidr_data *data;
2674 unsigned long i;
2675
2676 mpidr &= MPIDR_HWID_BITMASK;
2677
2678 rcu_read_lock();
2679 data = rcu_dereference(kvm->arch.mpidr_data);
2680
2681 if (data) {
2682 u16 idx = kvm_mpidr_index(data, mpidr);
2683
2684 vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]);
2685 if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu))
2686 vcpu = NULL;
2687 }
2688
2689 rcu_read_unlock();
2690
2691 if (vcpu)
2692 return vcpu;
2693
2694 kvm_for_each_vcpu(i, vcpu, kvm) {
2695 if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
2696 return vcpu;
2697 }
2698 return NULL;
2699}
2700
2701bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
2702{
2703 return irqchip_in_kernel(kvm);
2704}
2705
2706bool kvm_arch_has_irq_bypass(void)
2707{
2708 return true;
2709}
2710
2711int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
2712 struct irq_bypass_producer *prod)
2713{
2714 struct kvm_kernel_irqfd *irqfd =
2715 container_of(cons, struct kvm_kernel_irqfd, consumer);
2716
2717 return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
2718 &irqfd->irq_entry);
2719}
2720void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
2721 struct irq_bypass_producer *prod)
2722{
2723 struct kvm_kernel_irqfd *irqfd =
2724 container_of(cons, struct kvm_kernel_irqfd, consumer);
2725
2726 kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
2727 &irqfd->irq_entry);
2728}
2729
2730void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
2731{
2732 struct kvm_kernel_irqfd *irqfd =
2733 container_of(cons, struct kvm_kernel_irqfd, consumer);
2734
2735 kvm_arm_halt_guest(irqfd->kvm);
2736}
2737
2738void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
2739{
2740 struct kvm_kernel_irqfd *irqfd =
2741 container_of(cons, struct kvm_kernel_irqfd, consumer);
2742
2743 kvm_arm_resume_guest(irqfd->kvm);
2744}
2745
2746/* Initialize Hyp-mode and memory mappings on all CPUs */
2747static __init int kvm_arm_init(void)
2748{
2749 int err;
2750 bool in_hyp_mode;
2751
2752 if (!is_hyp_mode_available()) {
2753 kvm_info("HYP mode not available\n");
2754 return -ENODEV;
2755 }
2756
2757 if (kvm_get_mode() == KVM_MODE_NONE) {
2758 kvm_info("KVM disabled from command line\n");
2759 return -ENODEV;
2760 }
2761
2762 err = kvm_sys_reg_table_init();
2763 if (err) {
2764 kvm_info("Error initializing system register tables");
2765 return err;
2766 }
2767
2768 in_hyp_mode = is_kernel_in_hyp_mode();
2769
2770 if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
2771 cpus_have_final_cap(ARM64_WORKAROUND_1508412))
2772 kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
2773 "Only trusted guests should be used on this system.\n");
2774
2775 err = kvm_set_ipa_limit();
2776 if (err)
2777 return err;
2778
2779 err = kvm_arm_init_sve();
2780 if (err)
2781 return err;
2782
2783 err = kvm_arm_vmid_alloc_init();
2784 if (err) {
2785 kvm_err("Failed to initialize VMID allocator.\n");
2786 return err;
2787 }
2788
2789 if (!in_hyp_mode) {
2790 err = init_hyp_mode();
2791 if (err)
2792 goto out_err;
2793 }
2794
2795 err = kvm_init_vector_slots();
2796 if (err) {
2797 kvm_err("Cannot initialise vector slots\n");
2798 goto out_hyp;
2799 }
2800
2801 err = init_subsystems();
2802 if (err)
2803 goto out_hyp;
2804
2805 kvm_info("%s%sVHE mode initialized successfully\n",
2806 in_hyp_mode ? "" : (is_protected_kvm_enabled() ?
2807 "Protected " : "Hyp "),
2808 in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ?
2809 "h" : "n"));
2810
2811 /*
2812 * FIXME: Do something reasonable if kvm_init() fails after pKVM
2813 * hypervisor protection is finalized.
2814 */
2815 err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
2816 if (err)
2817 goto out_subs;
2818
2819 /*
2820 * This should be called after initialization is done and failure isn't
2821 * possible anymore.
2822 */
2823 if (!in_hyp_mode)
2824 finalize_init_hyp_mode();
2825
2826 kvm_arm_initialised = true;
2827
2828 return 0;
2829
2830out_subs:
2831 teardown_subsystems();
2832out_hyp:
2833 if (!in_hyp_mode)
2834 teardown_hyp_mode();
2835out_err:
2836 kvm_arm_vmid_alloc_free();
2837 return err;
2838}
2839
2840static int __init early_kvm_mode_cfg(char *arg)
2841{
2842 if (!arg)
2843 return -EINVAL;
2844
2845 if (strcmp(arg, "none") == 0) {
2846 kvm_mode = KVM_MODE_NONE;
2847 return 0;
2848 }
2849
2850 if (!is_hyp_mode_available()) {
2851 pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n");
2852 return 0;
2853 }
2854
2855 if (strcmp(arg, "protected") == 0) {
2856 if (!is_kernel_in_hyp_mode())
2857 kvm_mode = KVM_MODE_PROTECTED;
2858 else
2859 pr_warn_once("Protected KVM not available with VHE\n");
2860
2861 return 0;
2862 }
2863
2864 if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) {
2865 kvm_mode = KVM_MODE_DEFAULT;
2866 return 0;
2867 }
2868
2869 if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) {
2870 kvm_mode = KVM_MODE_NV;
2871 return 0;
2872 }
2873
2874 return -EINVAL;
2875}
2876early_param("kvm-arm.mode", early_kvm_mode_cfg);
2877
2878static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p)
2879{
2880 if (!arg)
2881 return -EINVAL;
2882
2883 if (strcmp(arg, "trap") == 0) {
2884 *p = KVM_WFX_TRAP;
2885 return 0;
2886 }
2887
2888 if (strcmp(arg, "notrap") == 0) {
2889 *p = KVM_WFX_NOTRAP;
2890 return 0;
2891 }
2892
2893 return -EINVAL;
2894}
2895
2896static int __init early_kvm_wfi_trap_policy_cfg(char *arg)
2897{
2898 return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy);
2899}
2900early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg);
2901
2902static int __init early_kvm_wfe_trap_policy_cfg(char *arg)
2903{
2904 return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy);
2905}
2906early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg);
2907
2908enum kvm_mode kvm_get_mode(void)
2909{
2910 return kvm_mode;
2911}
2912
/* Register kvm_arm_init() as this module's init function. */
module_init(kvm_arm_init);