v6.2
  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Hosting Protected Virtual Machines
  4 *
  5 * Copyright IBM Corp. 2019, 2020
  6 *    Author(s): Janosch Frank <frankja@linux.ibm.com>
  7 */
  8#include <linux/kvm.h>
  9#include <linux/kvm_host.h>
 10#include <linux/minmax.h>
 11#include <linux/pagemap.h>
 12#include <linux/sched/signal.h>
 13#include <asm/gmap.h>
 14#include <asm/uv.h>
 15#include <asm/mman.h>
 16#include <linux/pagewalk.h>
 17#include <linux/sched/mm.h>
 18#include <linux/mmu_notifier.h>
 19#include "kvm-s390.h"
 20
 21/**
 22 * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to
 23 * be destroyed
 24 *
 25 * @list: list head for the list of leftover VMs
 26 * @old_gmap_table: the gmap table of the leftover protected VM
 27 * @handle: the handle of the leftover protected VM
 28 * @stor_var: pointer to the variable storage of the leftover protected VM
 29 * @stor_base: address of the base storage of the leftover protected VM
 30 *
 31 * Represents a protected VM that is still registered with the Ultravisor,
 32 * but which does not correspond any longer to an active KVM VM. It should
 33 * be destroyed at some point later, either asynchronously or when the
 34 * process terminates.
 35 */
 36struct pv_vm_to_be_destroyed {
 37	struct list_head list;
 38	unsigned long old_gmap_table;
 39	u64 handle;
 40	void *stor_var;
 41	unsigned long stor_base;
 42};
 43
 44static void kvm_s390_clear_pv_state(struct kvm *kvm)
 45{
 46	kvm->arch.pv.handle = 0;
 47	kvm->arch.pv.guest_len = 0;
 48	kvm->arch.pv.stor_base = 0;
 49	kvm->arch.pv.stor_var = NULL;
 50}
 51
 52int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
 53{
 54	int cc;
 55
 56	if (!kvm_s390_pv_cpu_get_handle(vcpu))
 57		return 0;
 58
 59	cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), UVC_CMD_DESTROY_SEC_CPU, rc, rrc);
 60
 61	KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT DESTROY VCPU %d: rc %x rrc %x",
 62		     vcpu->vcpu_id, *rc, *rrc);
 63	WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", *rc, *rrc);
 64
 65	/* Intended memory leak for something that should never happen. */
 66	if (!cc)
 67		free_pages(vcpu->arch.pv.stor_base,
 68			   get_order(uv_info.guest_cpu_stor_len));
 69
 70	free_page((unsigned long)sida_addr(vcpu->arch.sie_block));
 71	vcpu->arch.sie_block->pv_handle_cpu = 0;
 72	vcpu->arch.sie_block->pv_handle_config = 0;
 73	memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
 74	vcpu->arch.sie_block->sdf = 0;
 75	/*
 76	 * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0).
 77	 * Use the reset value of gbea to avoid leaking the kernel pointer of
 78	 * the just freed sida.
 79	 */
 80	vcpu->arch.sie_block->gbea = 1;
 81	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 82
 83	return cc ? EIO : 0;
 84}
 85
 86int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
 87{
 88	struct uv_cb_csc uvcb = {
 89		.header.cmd = UVC_CMD_CREATE_SEC_CPU,
 90		.header.len = sizeof(uvcb),
 91	};
 92	void *sida_addr;
 93	int cc;
 94
 95	if (kvm_s390_pv_cpu_get_handle(vcpu))
 96		return -EINVAL;
 97
 98	vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT,
 99						   get_order(uv_info.guest_cpu_stor_len));
100	if (!vcpu->arch.pv.stor_base)
101		return -ENOMEM;
102
103	/* Input */
104	uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
105	uvcb.num = vcpu->arch.sie_block->icpua;
106	uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block);
107	uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base);
108
109	/* Alloc Secure Instruction Data Area Designation */
110	sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
111	if (!sida_addr) {
112		free_pages(vcpu->arch.pv.stor_base,
113			   get_order(uv_info.guest_cpu_stor_len));
114		return -ENOMEM;
115	}
116	vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr);
117
118	cc = uv_call(0, (u64)&uvcb);
119	*rc = uvcb.header.rc;
120	*rrc = uvcb.header.rrc;
121	KVM_UV_EVENT(vcpu->kvm, 3,
122		     "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x",
123		     vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc,
124		     uvcb.header.rrc);
125
126	if (cc) {
127		u16 dummy;
128
129		kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy);
130		return -EIO;
131	}
132
133	/* Output */
134	vcpu->arch.pv.handle = uvcb.cpu_handle;
135	vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle;
136	vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm);
137	vcpu->arch.sie_block->sdf = 2;
138	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
139	return 0;
140}
141
142/* only free resources when the destroy was successful */
143static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
144{
145	vfree(kvm->arch.pv.stor_var);
146	free_pages(kvm->arch.pv.stor_base,
147		   get_order(uv_info.guest_base_stor_len));
148	kvm_s390_clear_pv_state(kvm);
149}
150
151static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
152{
153	unsigned long base = uv_info.guest_base_stor_len;
154	unsigned long virt = uv_info.guest_virt_var_stor_len;
155	unsigned long npages = 0, vlen = 0;
156
157	kvm->arch.pv.stor_var = NULL;
158	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, get_order(base));
159	if (!kvm->arch.pv.stor_base)
160		return -ENOMEM;
161
162	/*
163	 * Calculate current guest storage for allocation of the
164	 * variable storage, which is based on the length in MB.
165	 *
166	 * Slots are sorted by GFN
167	 */
168	mutex_lock(&kvm->slots_lock);
169	npages = kvm_s390_get_gfn_end(kvm_memslots(kvm));
170	mutex_unlock(&kvm->slots_lock);
171
172	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
173
174	/* Allocate variable storage */
175	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
176	vlen += uv_info.guest_virt_base_stor_len;
177	kvm->arch.pv.stor_var = vzalloc(vlen);
178	if (!kvm->arch.pv.stor_var)
179		goto out_err;
180	return 0;
181
182out_err:
183	kvm_s390_pv_dealloc_vm(kvm);
184	return -ENOMEM;
185}
186
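A worked example of the variable-storage sizing above, as a standalone sketch: the guest_virt_var_stor_len and guest_virt_base_stor_len values below are assumptions for illustration (the real values come from the Ultravisor query); only the formula mirrors kvm_s390_pv_alloc_vm().

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE	4096ULL			/* s390 page size */
#define EX_HPAGE_SIZE	(1ULL << 20)		/* 1 MB segment size on s390 */
#define EX_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* Assumed example values; the real ones come from the UV query UVC */
	uint64_t guest_virt_var_stor_len = 512;		/* bytes per 1 MB of guest storage */
	uint64_t guest_virt_base_stor_len = 64 * 1024;	/* fixed base part */
	uint64_t npages = (4ULL << 30) / EX_PAGE_SIZE;	/* a 4 GiB guest */

	/* Same formula as kvm_s390_pv_alloc_vm() */
	uint64_t vlen = EX_ALIGN(guest_virt_var_stor_len *
				 ((npages * EX_PAGE_SIZE) / EX_HPAGE_SIZE),
				 EX_PAGE_SIZE);
	vlen += guest_virt_base_stor_len;

	/* 4096 MB of guest storage -> 4096 * 512 B = 2 MiB, plus the 64 KiB base */
	printf("stor_var allocation: %llu bytes\n", (unsigned long long)vlen);
	return 0;
}
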
187/**
188 * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM.
189 * @kvm: the KVM that was associated with this leftover protected VM
190 * @leftover: details about the leftover protected VM that needs a clean up
191 * @rc: the RC code of the Destroy Secure Configuration UVC
192 * @rrc: the RRC code of the Destroy Secure Configuration UVC
193 *
194 * Destroy one leftover protected VM.
195 * On success, kvm->mm->context.protected_count will be decremented atomically
196 * and all other resources used by the VM will be freed.
197 *
198 * Return: 0 in case of success, otherwise 1
199 */
200static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm,
201					    struct pv_vm_to_be_destroyed *leftover,
202					    u16 *rc, u16 *rrc)
203{
204	int cc;
205
206	/* It used the destroy-fast UVC, nothing left to do here */
207	if (!leftover->handle)
208		goto done_fast;
209	cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
210	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc);
211	WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc);
212	if (cc)
213		return cc;
214	/*
215	 * Intentionally leak unusable memory. If the UVC fails, the memory
216	 * used for the VM and its metadata is permanently unusable.
217	 * This can only happen in case of a serious KVM or hardware bug; it
218	 * is not expected to happen in normal operation.
219	 */
220	free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len));
221	free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER);
222	vfree(leftover->stor_var);
223done_fast:
224	atomic_dec(&kvm->mm->context.protected_count);
225	return 0;
226}
227
228/**
229 * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory.
230 * @kvm: the VM whose memory is to be cleared.
231 *
232 * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot.
233 * The CPUs of the protected VM need to be destroyed beforehand.
234 */
235static void kvm_s390_destroy_lower_2g(struct kvm *kvm)
236{
237	const unsigned long pages_2g = SZ_2G / PAGE_SIZE;
238	struct kvm_memory_slot *slot;
239	unsigned long len;
240	int srcu_idx;
241
242	srcu_idx = srcu_read_lock(&kvm->srcu);
243
244	/* Take the memslot containing guest absolute address 0 */
245	slot = gfn_to_memslot(kvm, 0);
246	/* Clear all slots or parts thereof that are below 2GB */
247	while (slot && slot->base_gfn < pages_2g) {
248		len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE;
249		s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len);
250		/* Take the next memslot */
251		slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages);
252	}
253
254	srcu_read_unlock(&kvm->srcu, srcu_idx);
255}
256
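A small worked example of the clipping expression above, with a made-up memslot that straddles the 2 GB boundary:

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE	4096ULL
#define EX_PAGES_2G	((2ULL << 30) / EX_PAGE_SIZE)	/* 0x80000 guest frames */

static uint64_t ex_min(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical memslot: starts 1 MB below the 2 GB line, 16 MB long */
	uint64_t base_gfn = EX_PAGES_2G - 256;	/* 256 pages == 1 MB */
	uint64_t npages = 4096;			/* 16 MB */

	/* Same clipping expression as kvm_s390_destroy_lower_2g() */
	uint64_t len = ex_min(npages, EX_PAGES_2G - base_gfn) * EX_PAGE_SIZE;

	/* Only the 1 MB that lies below 2 GB is handed to s390_uv_destroy_range() */
	printf("bytes destroyed from this slot: %llu\n", (unsigned long long)len);
	return 0;
}
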
257static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
258{
259	struct uv_cb_destroy_fast uvcb = {
260		.header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST,
261		.header.len = sizeof(uvcb),
262		.handle = kvm_s390_pv_get_handle(kvm),
263	};
264	int cc;
265
266	cc = uv_call_sched(0, (u64)&uvcb);
267	if (rc)
268		*rc = uvcb.header.rc;
269	if (rrc)
270		*rrc = uvcb.header.rrc;
271	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
272	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
273		     uvcb.header.rc, uvcb.header.rrc);
274	WARN_ONCE(cc, "protvirt destroy vm fast failed handle %llx rc %x rrc %x",
275		  kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc);
276	/* Intended memory leak on "impossible" error */
277	if (!cc)
278		kvm_s390_pv_dealloc_vm(kvm);
279	return cc ? -EIO : 0;
280}
281
282static inline bool is_destroy_fast_available(void)
283{
284	return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list);
285}
286
287/**
288 * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown.
289 * @kvm: the VM
290 * @rc: return value for the RC field of the UVCB
291 * @rrc: return value for the RRC field of the UVCB
292 *
293 * Set aside the protected VM for a subsequent teardown. The VM will be able
294 * to continue immediately as a non-secure VM, and the information needed to
295 * properly tear down the protected VM is set aside. If another protected VM
296 * was already set aside without starting its teardown, this function will
297 * fail.
298 * The CPUs of the protected VM need to be destroyed beforehand.
299 *
300 * Context: kvm->lock needs to be held
301 *
302 * Return: 0 in case of success, -EINVAL if another protected VM was already set
303 * aside, -ENOMEM if the system ran out of memory.
304 */
305int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
306{
307	struct pv_vm_to_be_destroyed *priv;
308	int res = 0;
309
310	lockdep_assert_held(&kvm->lock);
311	/*
312	 * If another protected VM was already prepared for teardown, refuse.
313	 * A normal deinitialization has to be performed instead.
314	 */
315	if (kvm->arch.pv.set_aside)
316		return -EINVAL;
317	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
318	if (!priv)
319		return -ENOMEM;
320
321	if (is_destroy_fast_available()) {
322		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
323	} else {
324		priv->stor_var = kvm->arch.pv.stor_var;
325		priv->stor_base = kvm->arch.pv.stor_base;
326		priv->handle = kvm_s390_pv_get_handle(kvm);
327		priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table;
328		WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
329		if (s390_replace_asce(kvm->arch.gmap))
330			res = -ENOMEM;
331	}
332
333	if (res) {
334		kfree(priv);
335		return res;
336	}
337
338	kvm_s390_destroy_lower_2g(kvm);
339	kvm_s390_clear_pv_state(kvm);
340	kvm->arch.pv.set_aside = priv;
341
342	*rc = UVC_RC_EXECUTED;
343	*rrc = 42;
344	return 0;
345}
346
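The userspace flow this enables, as a hedged sketch: the ioctl, struct and command names below (KVM_S390_PV_COMMAND, struct kvm_pv_cmd, KVM_PV_ASYNC_CLEANUP_PREPARE/PERFORM) are taken from <linux/kvm.h> as I understand the UAPI and should be treated as assumptions; the helper names are made up and this is not a verbatim VMM implementation.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Separate thread: the slow, fatal-signal-interruptible half of the teardown */
static void *async_teardown(void *arg)
{
	int vm_fd = (int)(intptr_t)arg;
	struct kvm_pv_cmd cmd = { .cmd = KVM_PV_ASYNC_CLEANUP_PERFORM };

	/* Ends up in kvm_s390_pv_deinit_aside_vm() */
	if (ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd))
		perror("KVM_PV_ASYNC_CLEANUP_PERFORM");
	return NULL;
}

/*
 * On reboot: set the old protected VM aside, then let the guest continue
 * right away as a non-secure VM. The vCPUs must already have been converted
 * back to non-protected, as the kernel-doc above requires.
 */
static int reboot_protected_guest(int vm_fd, pthread_t *tid)
{
	struct kvm_pv_cmd cmd = { .cmd = KVM_PV_ASYNC_CLEANUP_PREPARE };

	/* Maps to kvm_s390_pv_set_aside(); -EINVAL if a VM is already set aside */
	if (ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd))
		return -1;
	return pthread_create(tid, NULL, async_teardown, (void *)(intptr_t)vm_fd);
}
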
347/**
348 * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM
349 * @kvm: the KVM whose protected VM needs to be deinitialized
350 * @rc: the RC code of the UVC
351 * @rrc: the RRC code of the UVC
352 *
353 * Deinitialize the current protected VM. This function will destroy and
354 * clean up the current protected VM, but it will not clean up the guest
355 * memory. This function should only be called when the protected VM has
356 * just been created and therefore does not have any guest memory, or when
357 * the caller cleans up the guest memory separately.
358 *
359 * This function should not fail, but if it does, the donated memory must
360 * not be freed.
361 *
362 * Context: kvm->lock needs to be held
363 *
364 * Return: 0 in case of success, otherwise -EIO
365 */
366int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
367{
368	int cc;
369
370	cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
371			   UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
372	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
373	if (!cc) {
374		atomic_dec(&kvm->mm->context.protected_count);
375		kvm_s390_pv_dealloc_vm(kvm);
376	} else {
377		/* Intended memory leak on "impossible" error */
378		s390_replace_asce(kvm->arch.gmap);
379	}
380	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
381	WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
382
383	return cc ? -EIO : 0;
384}
385
386/**
387 * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated
388 * with a specific KVM.
389 * @kvm: the KVM to be cleaned up
390 * @rc: the RC code of the first failing UVC
391 * @rrc: the RRC code of the first failing UVC
392 *
393 * This function will clean up all protected VMs associated with a KVM.
394 * This includes the active one, the one prepared for deinitialization with
395 * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list.
396 *
397 * Context: kvm->lock needs to be held unless being called from
398 * kvm_arch_destroy_vm.
399 *
400 * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO
401 */
402int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
403{
404	struct pv_vm_to_be_destroyed *cur;
405	bool need_zap = false;
406	u16 _rc, _rrc;
407	int cc = 0;
408
409	/* Make sure the counter does not reach 0 before calling s390_uv_destroy_range */
410	atomic_inc(&kvm->mm->context.protected_count);
411
412	*rc = 1;
413	/* If the current VM is protected, destroy it */
414	if (kvm_s390_pv_get_handle(kvm)) {
415		cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
416		need_zap = true;
417	}
418
419	/* If a previous protected VM was set aside, put it in the need_cleanup list */
420	if (kvm->arch.pv.set_aside) {
421		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
422		kvm->arch.pv.set_aside = NULL;
423	}
424
425	/* Cleanup all protected VMs in the need_cleanup list */
426	while (!list_empty(&kvm->arch.pv.need_cleanup)) {
427		cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list);
428		need_zap = true;
429		if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) {
430			cc = 1;
431			/*
432			 * Only return the first error rc and rrc, so make
433			 * sure it is not overwritten. All destroys will
434			 * additionally be reported via KVM_UV_EVENT().
435			 */
436			if (*rc == UVC_RC_EXECUTED) {
437				*rc = _rc;
438				*rrc = _rrc;
439			}
440		}
441		list_del(&cur->list);
442		kfree(cur);
443	}
444
445	/*
446	 * If the mm still has a mapping, try to mark all its pages as
447	 * accessible. The counter should not reach zero before this
448	 * cleanup has been performed.
449	 */
450	if (need_zap && mmget_not_zero(kvm->mm)) {
451		s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
452		mmput(kvm->mm);
453	}
454
455	/* Now the counter can safely reach 0 */
456	atomic_dec(&kvm->mm->context.protected_count);
457	return cc ? -EIO : 0;
458}
459
460/**
461 * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM.
462 * @kvm: the VM previously associated with the protected VM
463 * @rc: return value for the RC field of the UVCB
464 * @rrc: return value for the RRC field of the UVCB
465 *
466 * Tear down the protected VM that had been previously prepared for teardown
467 * using kvm_s390_pv_set_aside_vm. Ideally this should be called by
468 * userspace asynchronously from a separate thread.
469 *
470 * Context: kvm->lock must not be held.
471 *
472 * Return: 0 in case of success, -EINVAL if no protected VM had been
473 * prepared for asynchronous teardown, -EIO in case of other errors.
474 */
475int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
476{
477	struct pv_vm_to_be_destroyed *p;
478	int ret = 0;
479
480	lockdep_assert_not_held(&kvm->lock);
481	mutex_lock(&kvm->lock);
482	p = kvm->arch.pv.set_aside;
483	kvm->arch.pv.set_aside = NULL;
484	mutex_unlock(&kvm->lock);
485	if (!p)
486		return -EINVAL;
487
488	/* When a fatal signal is received, stop immediately */
489	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
490		goto done;
491	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
492		ret = -EIO;
493	kfree(p);
494	p = NULL;
495done:
496	/*
497	 * p is not NULL if we aborted because of a fatal signal, in which
498	 * case queue the leftover for later cleanup.
499	 */
500	if (p) {
501		mutex_lock(&kvm->lock);
502		list_add(&p->list, &kvm->arch.pv.need_cleanup);
503		mutex_unlock(&kvm->lock);
504		/* Did not finish, but pretend things went well */
505		*rc = UVC_RC_EXECUTED;
506		*rrc = 42;
507	}
508	return ret;
509}
510
511static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
512					     struct mm_struct *mm)
513{
514	struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier);
515	u16 dummy;
516	int r;
517
518	/*
519	 * No locking is needed since this is the last thread of the last user of this
520	 * struct mm.
521	 * When the struct kvm gets deinitialized, this notifier is also
522	 * unregistered. This means that if this notifier runs, then the
523	 * struct kvm is still valid.
524	 */
525	r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
526	if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm))
527		kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy);
528}
529
530static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
531	.release = kvm_s390_pv_mmu_notifier_release,
532};
533
534int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
535{
536	struct uv_cb_cgc uvcb = {
537		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
538		.header.len = sizeof(uvcb)
539	};
540	int cc, ret;
541	u16 dummy;
542
543	ret = kvm_s390_pv_alloc_vm(kvm);
544	if (ret)
545		return ret;
546
547	/* Inputs */
548	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
549	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
550	uvcb.guest_asce = kvm->arch.gmap->asce;
551	uvcb.guest_sca = virt_to_phys(kvm->arch.sca);
552	uvcb.conf_base_stor_origin =
553		virt_to_phys((void *)kvm->arch.pv.stor_base);
554	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
555
556	cc = uv_call_sched(0, (u64)&uvcb);
557	*rc = uvcb.header.rc;
558	*rrc = uvcb.header.rrc;
559	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
560		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
561
562	/* Outputs */
563	kvm->arch.pv.handle = uvcb.guest_handle;
564
565	atomic_inc(&kvm->mm->context.protected_count);
566	if (cc) {
567		if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
568			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
569		} else {
570			atomic_dec(&kvm->mm->context.protected_count);
571			kvm_s390_pv_dealloc_vm(kvm);
572		}
573		return -EIO;
574	}
575	kvm->arch.gmap->guest_handle = uvcb.guest_handle;
576	/* Add the notifier only once. No races because we hold kvm->lock */
577	if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
578		kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
579		mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
580	}
581	return 0;
582}
583
584int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
585			      u16 *rrc)
586{
587	struct uv_cb_ssc uvcb = {
588		.header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS,
589		.header.len = sizeof(uvcb),
590		.sec_header_origin = (u64)hdr,
591		.sec_header_len = length,
592		.guest_handle = kvm_s390_pv_get_handle(kvm),
593	};
594	int cc = uv_call(0, (u64)&uvcb);
595
596	*rc = uvcb.header.rc;
597	*rrc = uvcb.header.rrc;
598	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
599		     *rc, *rrc);
600	return cc ? -EINVAL : 0;
601}
602
603static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
604		      u64 offset, u16 *rc, u16 *rrc)
605{
606	struct uv_cb_unp uvcb = {
607		.header.cmd = UVC_CMD_UNPACK_IMG,
608		.header.len = sizeof(uvcb),
609		.guest_handle = kvm_s390_pv_get_handle(kvm),
610		.gaddr = addr,
611		.tweak[0] = tweak,
612		.tweak[1] = offset,
613	};
614	int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
615
616	*rc = uvcb.header.rc;
617	*rrc = uvcb.header.rrc;
618
619	if (ret && ret != -EAGAIN)
620		KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x",
621			     uvcb.gaddr, *rc, *rrc);
622	return ret;
623}
624
625int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
626		       unsigned long tweak, u16 *rc, u16 *rrc)
627{
628	u64 offset = 0;
629	int ret = 0;
630
631	if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK)
632		return -EINVAL;
633
634	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx",
635		     addr, size);
636
637	while (offset < size) {
638		ret = unpack_one(kvm, addr, tweak, offset, rc, rrc);
639		if (ret == -EAGAIN) {
640			cond_resched();
641			if (fatal_signal_pending(current))
642				break;
643			continue;
644		}
645		if (ret)
646			break;
647		addr += PAGE_SIZE;
648		offset += PAGE_SIZE;
649	}
650	if (!ret)
651		KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful");
652	return ret;
653}
654
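A hedged sketch of the userspace side of this loop, assuming the KVM_PV_UNPACK command and the struct kvm_s390_pv_unp layout from <linux/kvm.h>; the helper name is made up.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

static int unpack_image(int vm_fd, uint64_t gaddr, uint64_t size, uint64_t tweak)
{
	struct kvm_s390_pv_unp unp = {
		.addr = gaddr,	/* page-aligned guest absolute address */
		.size = size,	/* page-aligned length of the encrypted image */
		.tweak = tweak,	/* tweak prefix taken from the SE header */
	};
	struct kvm_pv_cmd cmd = {
		.cmd = KVM_PV_UNPACK,
		.data = (uint64_t)(uintptr_t)&unp,
	};

	/* Ends up in kvm_s390_pv_unpack(); -EINVAL if addr or size is unaligned */
	return ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd);
}
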
655int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state)
656{
657	struct uv_cb_cpu_set_state uvcb = {
658		.header.cmd	= UVC_CMD_CPU_SET_STATE,
659		.header.len	= sizeof(uvcb),
660		.cpu_handle	= kvm_s390_pv_cpu_get_handle(vcpu),
661		.state		= state,
662	};
663	int cc;
664
665	cc = uv_call(0, (u64)&uvcb);
666	KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x",
667		     vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc);
668	if (cc)
669		return -EINVAL;
670	return 0;
671}
672
673int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc)
674{
675	struct uv_cb_dump_cpu uvcb = {
676		.header.cmd = UVC_CMD_DUMP_CPU,
677		.header.len = sizeof(uvcb),
678		.cpu_handle = vcpu->arch.pv.handle,
679		.dump_area_origin = (u64)buff,
680	};
681	int cc;
682
683	cc = uv_call_sched(0, (u64)&uvcb);
684	*rc = uvcb.header.rc;
685	*rrc = uvcb.header.rrc;
686	return cc;
687}
688
689/* Size of the cache for the storage state dump data. 1MB for now */
690#define DUMP_BUFF_LEN HPAGE_SIZE
691
692/**
693 * kvm_s390_pv_dump_stor_state
694 *
695 * @kvm: pointer to the guest's KVM struct
696 * @buff_user: Userspace pointer where we will write the results to
697 * @gaddr: Starting absolute guest address for which the storage state
698 *	   is requested.
699 * @buff_user_len: Length of the buff_user buffer
700 * @rc: Pointer to where the uvcb return code is stored
701 * @rrc: Pointer to where the uvcb return reason code is stored
702 *
703 * Stores buff_len bytes of tweak component values to buff_user
704 * starting with the 1MB block specified by the absolute guest address
705 * (gaddr). The gaddr pointer will be updated with the last address
706 * for which data was written when returning to userspace. buff_user
707 * might be written to even if an error rc is returned. For instance
708 * if we encounter a fault after writing the first page of data.
709 *
710 * Context: kvm->lock needs to be held
711 *
712 * Return:
713 *  0 on success
714 *  -ENOMEM if allocating the cache fails
715 *  -EINVAL if gaddr is not aligned to 1MB
716 *  -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len
717 *  -EINVAL if the UV call fails, rc and rrc will be set in this case
718 *  -EFAULT if copying the result to buff_user failed
719 */
720int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
721				u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc)
722{
723	struct uv_cb_dump_stor_state uvcb = {
724		.header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE,
725		.header.len = sizeof(uvcb),
726		.config_handle = kvm->arch.pv.handle,
727		.gaddr = *gaddr,
728		.dump_area_origin = 0,
729	};
730	const u64 increment_len = uv_info.conf_dump_storage_state_len;
731	size_t buff_kvm_size;
732	size_t size_done = 0;
733	u8 *buff_kvm = NULL;
734	int cc, ret;
735
736	ret = -EINVAL;
737	/* UV call processes 1MB guest storage chunks at a time */
738	if (!IS_ALIGNED(*gaddr, HPAGE_SIZE))
739		goto out;
740
741	/*
742	 * We provide the storage state for 1MB chunks of guest
743	 * storage. The buffer will need to be aligned to
744	 * conf_dump_storage_state_len so we don't end on a partial
745	 * chunk.
746	 */
747	if (!buff_user_len ||
748	    !IS_ALIGNED(buff_user_len, increment_len))
749		goto out;
750
751	/*
752	 * Allocate a buffer from which we will later copy to the user
753	 * process. We don't want userspace to dictate our buffer size
754	 * so we limit it to DUMP_BUFF_LEN.
755	 */
756	ret = -ENOMEM;
757	buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN);
758	buff_kvm = vzalloc(buff_kvm_size);
759	if (!buff_kvm)
760		goto out;
761
762	ret = 0;
763	uvcb.dump_area_origin = (u64)buff_kvm;
764	/* We will loop until the user buffer is filled or an error occurs */
765	do {
766		/* Get 1MB worth of guest storage state data */
767		cc = uv_call_sched(0, (u64)&uvcb);
768
769		/* All or nothing */
770		if (cc) {
771			ret = -EINVAL;
772			break;
773		}
774
775		size_done += increment_len;
776		uvcb.dump_area_origin += increment_len;
777		buff_user_len -= increment_len;
778		uvcb.gaddr += HPAGE_SIZE;
779
780		/* KVM Buffer full, time to copy to the process */
781		if (!buff_user_len || size_done == DUMP_BUFF_LEN) {
782			if (copy_to_user(buff_user, buff_kvm, size_done)) {
783				ret = -EFAULT;
784				break;
785			}
786
787			buff_user += size_done;
788			size_done = 0;
789			uvcb.dump_area_origin = (u64)buff_kvm;
790		}
791	} while (buff_user_len);
792
793	/* Report back where we ended dumping */
794	*gaddr = uvcb.gaddr;
795
796	/* Let's only log errors, we don't want to spam */
797out:
798	if (ret)
799		KVM_UV_EVENT(kvm, 3,
800			     "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x",
801			     uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc);
802	*rc = uvcb.header.rc;
803	*rrc = uvcb.header.rrc;
804	vfree(buff_kvm);
805
806	return ret;
807}
808
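The bookkeeping of the loop above as standalone arithmetic; conf_dump_storage_state_len is an assumed example value (the real one comes from the UV query).

#include <stdint.h>
#include <stdio.h>

#define EX_HPAGE_SIZE	 (1ULL << 20)	/* 1 MB of guest storage per UV call */
#define EX_DUMP_BUFF_LEN (1ULL << 20)	/* DUMP_BUFF_LEN above */

int main(void)
{
	uint64_t increment_len = 64;		/* assumed bytes of state per 1 MB chunk */
	uint64_t buff_user_len = 4ULL << 20;	/* caller supplied a 4 MiB buffer */

	uint64_t uv_calls = buff_user_len / increment_len;
	uint64_t guest_covered = uv_calls * EX_HPAGE_SIZE;
	uint64_t copies = (buff_user_len + EX_DUMP_BUFF_LEN - 1) / EX_DUMP_BUFF_LEN;

	/* 65536 UV calls covering 64 GiB of guest storage, copied out 1 MiB at a time */
	printf("UV calls: %llu, guest bytes covered: %llu, copy_to_user calls: %llu\n",
	       (unsigned long long)uv_calls,
	       (unsigned long long)guest_covered,
	       (unsigned long long)copies);
	return 0;
}
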
809/**
810 * kvm_s390_pv_dump_complete
811 *
812 * @kvm: pointer to the guest's KVM struct
813 * @buff_user: Userspace pointer where we will write the results to
814 * @rc: Pointer to where the uvcb return code is stored
815 * @rrc: Pointer to where the uvcb return reason code is stored
816 *
817 * Completes the dumping operation and writes the completion data to
818 * user space.
819 *
820 * Context: kvm->lock needs to be held
821 *
822 * Return:
823 *  0 on success
824 *  -ENOMEM if allocating the completion buffer fails
825 *  -EINVAL if the UV call fails, rc and rrc will be set in this case
826 *  -EFAULT if copying the result to buff_user failed
827 */
828int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
829			      u16 *rc, u16 *rrc)
830{
831	struct uv_cb_dump_complete complete = {
832		.header.len = sizeof(complete),
833		.header.cmd = UVC_CMD_DUMP_COMPLETE,
834		.config_handle = kvm_s390_pv_get_handle(kvm),
835	};
836	u64 *compl_data;
837	int ret;
838
839	/* Allocate dump area */
840	compl_data = vzalloc(uv_info.conf_dump_finalize_len);
841	if (!compl_data)
842		return -ENOMEM;
843	complete.dump_area_origin = (u64)compl_data;
844
845	ret = uv_call_sched(0, (u64)&complete);
846	*rc = complete.header.rc;
847	*rrc = complete.header.rrc;
848	KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x",
849		     complete.header.rc, complete.header.rrc);
850
851	if (!ret) {
852		/*
853		 * kvm_s390_pv_dealloc_vm() will also (mem)set
854		 * this to false on a reboot or other destroy
855		 * operation for this vm.
856		 */
857		kvm->arch.pv.dumping = false;
858		kvm_s390_vcpu_unblock_all(kvm);
859		ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len);
860		if (ret)
861			ret = -EFAULT;
862	}
863	vfree(compl_data);
864	/* If the UVC returned an error, translate it to -EINVAL */
865	if (ret > 0)
866		ret = -EINVAL;
867	return ret;
868}
v6.8
  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Hosting Protected Virtual Machines
  4 *
  5 * Copyright IBM Corp. 2019, 2020
  6 *    Author(s): Janosch Frank <frankja@linux.ibm.com>
  7 */
  8#include <linux/kvm.h>
  9#include <linux/kvm_host.h>
 10#include <linux/minmax.h>
 11#include <linux/pagemap.h>
 12#include <linux/sched/signal.h>
 13#include <asm/gmap.h>
 14#include <asm/uv.h>
 15#include <asm/mman.h>
 16#include <linux/pagewalk.h>
 17#include <linux/sched/mm.h>
 18#include <linux/mmu_notifier.h>
 19#include "kvm-s390.h"
 20
 21bool kvm_s390_pv_is_protected(struct kvm *kvm)
 22{
 23	lockdep_assert_held(&kvm->lock);
 24	return !!kvm_s390_pv_get_handle(kvm);
 25}
 26EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected);
 27
 28bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
 29{
 30	lockdep_assert_held(&vcpu->mutex);
 31	return !!kvm_s390_pv_cpu_get_handle(vcpu);
 32}
 33EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
 34
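A hypothetical in-kernel caller of the two exported helpers above, shown only to illustrate the locking context each lockdep assertion expects; the function names are made up and the declaration is assumed to come from "kvm-s390.h".

#include <linux/kvm_host.h>
#include "kvm-s390.h"

/* Called from a VM ioctl path; takes kvm->lock, as the helper's assert expects */
static bool ex_vm_is_secure(struct kvm *kvm)
{
	bool prot;

	mutex_lock(&kvm->lock);
	prot = kvm_s390_pv_is_protected(kvm);
	mutex_unlock(&kvm->lock);

	return prot;
}

/* Called from a vCPU ioctl path, where vcpu->mutex is already held by KVM */
static bool ex_vcpu_is_secure(struct kvm_vcpu *vcpu)
{
	return kvm_s390_pv_cpu_is_protected(vcpu);
}
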
 35/**
 36 * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to
 37 * be destroyed
 38 *
 39 * @list: list head for the list of leftover VMs
 40 * @old_gmap_table: the gmap table of the leftover protected VM
 41 * @handle: the handle of the leftover protected VM
 42 * @stor_var: pointer to the variable storage of the leftover protected VM
 43 * @stor_base: address of the base storage of the leftover protected VM
 44 *
 45 * Represents a protected VM that is still registered with the Ultravisor,
 46 * but which does not correspond any longer to an active KVM VM. It should
 47 * be destroyed at some point later, either asynchronously or when the
 48 * process terminates.
 49 */
 50struct pv_vm_to_be_destroyed {
 51	struct list_head list;
 52	unsigned long old_gmap_table;
 53	u64 handle;
 54	void *stor_var;
 55	unsigned long stor_base;
 56};
 57
 58static void kvm_s390_clear_pv_state(struct kvm *kvm)
 59{
 60	kvm->arch.pv.handle = 0;
 61	kvm->arch.pv.guest_len = 0;
 62	kvm->arch.pv.stor_base = 0;
 63	kvm->arch.pv.stor_var = NULL;
 64}
 65
 66int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
 67{
 68	int cc;
 69
 70	if (!kvm_s390_pv_cpu_get_handle(vcpu))
 71		return 0;
 72
 73	cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), UVC_CMD_DESTROY_SEC_CPU, rc, rrc);
 74
 75	KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT DESTROY VCPU %d: rc %x rrc %x",
 76		     vcpu->vcpu_id, *rc, *rrc);
 77	WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", *rc, *rrc);
 78
 79	/* Intended memory leak for something that should never happen. */
 80	if (!cc)
 81		free_pages(vcpu->arch.pv.stor_base,
 82			   get_order(uv_info.guest_cpu_stor_len));
 83
 84	free_page((unsigned long)sida_addr(vcpu->arch.sie_block));
 85	vcpu->arch.sie_block->pv_handle_cpu = 0;
 86	vcpu->arch.sie_block->pv_handle_config = 0;
 87	memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
 88	vcpu->arch.sie_block->sdf = 0;
 89	/*
 90	 * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0).
 91	 * Use the reset value of gbea to avoid leaking the kernel pointer of
 92	 * the just freed sida.
 93	 */
 94	vcpu->arch.sie_block->gbea = 1;
 95	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 96
 97	return cc ? EIO : 0;
 98}
 99
100int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
101{
102	struct uv_cb_csc uvcb = {
103		.header.cmd = UVC_CMD_CREATE_SEC_CPU,
104		.header.len = sizeof(uvcb),
105	};
106	void *sida_addr;
107	int cc;
108
109	if (kvm_s390_pv_cpu_get_handle(vcpu))
110		return -EINVAL;
111
112	vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT,
113						   get_order(uv_info.guest_cpu_stor_len));
114	if (!vcpu->arch.pv.stor_base)
115		return -ENOMEM;
116
117	/* Input */
118	uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
119	uvcb.num = vcpu->arch.sie_block->icpua;
120	uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block);
121	uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base);
122
123	/* Alloc Secure Instruction Data Area Designation */
124	sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
125	if (!sida_addr) {
126		free_pages(vcpu->arch.pv.stor_base,
127			   get_order(uv_info.guest_cpu_stor_len));
128		return -ENOMEM;
129	}
130	vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr);
131
132	cc = uv_call(0, (u64)&uvcb);
133	*rc = uvcb.header.rc;
134	*rrc = uvcb.header.rrc;
135	KVM_UV_EVENT(vcpu->kvm, 3,
136		     "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x",
137		     vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc,
138		     uvcb.header.rrc);
139
140	if (cc) {
141		u16 dummy;
142
143		kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy);
144		return -EIO;
145	}
146
147	/* Output */
148	vcpu->arch.pv.handle = uvcb.cpu_handle;
149	vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle;
150	vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm);
151	vcpu->arch.sie_block->sdf = 2;
152	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
153	return 0;
154}
155
156/* only free resources when the destroy was successful */
157static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
158{
159	vfree(kvm->arch.pv.stor_var);
160	free_pages(kvm->arch.pv.stor_base,
161		   get_order(uv_info.guest_base_stor_len));
162	kvm_s390_clear_pv_state(kvm);
163}
164
165static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
166{
167	unsigned long base = uv_info.guest_base_stor_len;
168	unsigned long virt = uv_info.guest_virt_var_stor_len;
169	unsigned long npages = 0, vlen = 0;
170
171	kvm->arch.pv.stor_var = NULL;
172	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, get_order(base));
173	if (!kvm->arch.pv.stor_base)
174		return -ENOMEM;
175
176	/*
177	 * Calculate current guest storage for allocation of the
178	 * variable storage, which is based on the length in MB.
179	 *
180	 * Slots are sorted by GFN
181	 */
182	mutex_lock(&kvm->slots_lock);
183	npages = kvm_s390_get_gfn_end(kvm_memslots(kvm));
184	mutex_unlock(&kvm->slots_lock);
185
186	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
187
188	/* Allocate variable storage */
189	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
190	vlen += uv_info.guest_virt_base_stor_len;
191	kvm->arch.pv.stor_var = vzalloc(vlen);
192	if (!kvm->arch.pv.stor_var)
193		goto out_err;
194	return 0;
195
196out_err:
197	kvm_s390_pv_dealloc_vm(kvm);
198	return -ENOMEM;
199}
200
201/**
202 * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM.
203 * @kvm: the KVM that was associated with this leftover protected VM
204 * @leftover: details about the leftover protected VM that needs a clean up
205 * @rc: the RC code of the Destroy Secure Configuration UVC
206 * @rrc: the RRC code of the Destroy Secure Configuration UVC
207 *
208 * Destroy one leftover protected VM.
209 * On success, kvm->mm->context.protected_count will be decremented atomically
210 * and all other resources used by the VM will be freed.
211 *
212 * Return: 0 in case of success, otherwise 1
213 */
214static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm,
215					    struct pv_vm_to_be_destroyed *leftover,
216					    u16 *rc, u16 *rrc)
217{
218	int cc;
219
220	/* It used the destroy-fast UVC, nothing left to do here */
221	if (!leftover->handle)
222		goto done_fast;
223	cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
224	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc);
225	WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc);
226	if (cc)
227		return cc;
228	/*
229	 * Intentionally leak unusable memory. If the UVC fails, the memory
230	 * used for the VM and its metadata is permanently unusable.
231	 * This can only happen in case of a serious KVM or hardware bug; it
232	 * is not expected to happen in normal operation.
233	 */
234	free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len));
235	free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER);
236	vfree(leftover->stor_var);
237done_fast:
238	atomic_dec(&kvm->mm->context.protected_count);
239	return 0;
240}
241
242/**
243 * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory.
244 * @kvm: the VM whose memory is to be cleared.
245 *
246 * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot.
247 * The CPUs of the protected VM need to be destroyed beforehand.
248 */
249static void kvm_s390_destroy_lower_2g(struct kvm *kvm)
250{
251	const unsigned long pages_2g = SZ_2G / PAGE_SIZE;
252	struct kvm_memory_slot *slot;
253	unsigned long len;
254	int srcu_idx;
255
256	srcu_idx = srcu_read_lock(&kvm->srcu);
257
258	/* Take the memslot containing guest absolute address 0 */
259	slot = gfn_to_memslot(kvm, 0);
260	/* Clear all slots or parts thereof that are below 2GB */
261	while (slot && slot->base_gfn < pages_2g) {
262		len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE;
263		s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len);
264		/* Take the next memslot */
265		slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages);
266	}
267
268	srcu_read_unlock(&kvm->srcu, srcu_idx);
269}
270
271static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
272{
273	struct uv_cb_destroy_fast uvcb = {
274		.header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST,
275		.header.len = sizeof(uvcb),
276		.handle = kvm_s390_pv_get_handle(kvm),
277	};
278	int cc;
279
280	cc = uv_call_sched(0, (u64)&uvcb);
281	if (rc)
282		*rc = uvcb.header.rc;
283	if (rrc)
284		*rrc = uvcb.header.rrc;
285	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
286	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
287		     uvcb.header.rc, uvcb.header.rrc);
288	WARN_ONCE(cc && uvcb.header.rc != 0x104,
289		  "protvirt destroy vm fast failed handle %llx rc %x rrc %x",
290		  kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc);
291	/* Intended memory leak on "impossible" error */
292	if (!cc)
293		kvm_s390_pv_dealloc_vm(kvm);
294	return cc ? -EIO : 0;
295}
296
297static inline bool is_destroy_fast_available(void)
298{
299	return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list);
300}
301
302/**
303 * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown.
304 * @kvm: the VM
305 * @rc: return value for the RC field of the UVCB
306 * @rrc: return value for the RRC field of the UVCB
307 *
308 * Set aside the protected VM for a subsequent teardown. The VM will be able
309 * to continue immediately as a non-secure VM, and the information needed to
310 * properly tear down the protected VM is set aside. If another protected VM
311 * was already set aside without starting its teardown, this function will
312 * fail.
313 * The CPUs of the protected VM need to be destroyed beforehand.
314 *
315 * Context: kvm->lock needs to be held
316 *
317 * Return: 0 in case of success, -EINVAL if another protected VM was already set
318 * aside, -ENOMEM if the system ran out of memory.
319 */
320int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
321{
322	struct pv_vm_to_be_destroyed *priv;
323	int res = 0;
324
325	lockdep_assert_held(&kvm->lock);
326	/*
327	 * If another protected VM was already prepared for teardown, refuse.
328	 * A normal deinitialization has to be performed instead.
329	 */
330	if (kvm->arch.pv.set_aside)
331		return -EINVAL;
332
333	/* Guest with segment type ASCE, refuse to destroy asynchronously */
334	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
335		return -EINVAL;
336
337	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
338	if (!priv)
339		return -ENOMEM;
340
341	if (is_destroy_fast_available()) {
342		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
343	} else {
344		priv->stor_var = kvm->arch.pv.stor_var;
345		priv->stor_base = kvm->arch.pv.stor_base;
346		priv->handle = kvm_s390_pv_get_handle(kvm);
347		priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table;
348		WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
349		if (s390_replace_asce(kvm->arch.gmap))
350			res = -ENOMEM;
351	}
352
353	if (res) {
354		kfree(priv);
355		return res;
356	}
357
358	kvm_s390_destroy_lower_2g(kvm);
359	kvm_s390_clear_pv_state(kvm);
360	kvm->arch.pv.set_aside = priv;
361
362	*rc = UVC_RC_EXECUTED;
363	*rrc = 42;
364	return 0;
365}
366
367/**
368 * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM
369 * @kvm: the KVM whose protected VM needs to be deinitialized
370 * @rc: the RC code of the UVC
371 * @rrc: the RRC code of the UVC
372 *
373 * Deinitialize the current protected VM. This function will destroy and
374 * clean up the current protected VM, but it will not clean up the guest
375 * memory. This function should only be called when the protected VM has
376 * just been created and therefore does not have any guest memory, or when
377 * the caller cleans up the guest memory separately.
378 *
379 * This function should not fail, but if it does, the donated memory must
380 * not be freed.
381 *
382 * Context: kvm->lock needs to be held
383 *
384 * Return: 0 in case of success, otherwise -EIO
385 */
386int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
387{
388	int cc;
389
390	cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
391			   UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
392	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
393	if (!cc) {
394		atomic_dec(&kvm->mm->context.protected_count);
395		kvm_s390_pv_dealloc_vm(kvm);
396	} else {
397		/* Intended memory leak on "impossible" error */
398		s390_replace_asce(kvm->arch.gmap);
399	}
400	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
401	WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
402
403	return cc ? -EIO : 0;
404}
405
406/**
407 * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated
408 * with a specific KVM.
409 * @kvm: the KVM to be cleaned up
410 * @rc: the RC code of the first failing UVC
411 * @rrc: the RRC code of the first failing UVC
412 *
413 * This function will clean up all protected VMs associated with a KVM.
414 * This includes the active one, the one prepared for deinitialization with
415 * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list.
416 *
417 * Context: kvm->lock needs to be held unless being called from
418 * kvm_arch_destroy_vm.
419 *
420 * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO
421 */
422int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
423{
424	struct pv_vm_to_be_destroyed *cur;
425	bool need_zap = false;
426	u16 _rc, _rrc;
427	int cc = 0;
428
429	/*
430	 * Nothing to do if the counter was already 0. Otherwise make sure
431	 * the counter does not reach 0 before calling s390_uv_destroy_range.
432	 */
433	if (!atomic_inc_not_zero(&kvm->mm->context.protected_count))
434		return 0;
435
436	*rc = 1;
437	/* If the current VM is protected, destroy it */
438	if (kvm_s390_pv_get_handle(kvm)) {
439		cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
440		need_zap = true;
441	}
442
443	/* If a previous protected VM was set aside, put it in the need_cleanup list */
444	if (kvm->arch.pv.set_aside) {
445		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
446		kvm->arch.pv.set_aside = NULL;
447	}
448
449	/* Cleanup all protected VMs in the need_cleanup list */
450	while (!list_empty(&kvm->arch.pv.need_cleanup)) {
451		cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list);
452		need_zap = true;
453		if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) {
454			cc = 1;
455			/*
456			 * Only return the first error rc and rrc, so make
457			 * sure it is not overwritten. All destroys will
458			 * additionally be reported via KVM_UV_EVENT().
459			 */
460			if (*rc == UVC_RC_EXECUTED) {
461				*rc = _rc;
462				*rrc = _rrc;
463			}
464		}
465		list_del(&cur->list);
466		kfree(cur);
467	}
468
469	/*
470	 * If the mm still has a mapping, try to mark all its pages as
471	 * accessible. The counter should not reach zero before this
472	 * cleanup has been performed.
473	 */
474	if (need_zap && mmget_not_zero(kvm->mm)) {
475		s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
476		mmput(kvm->mm);
477	}
478
479	/* Now the counter can safely reach 0 */
480	atomic_dec(&kvm->mm->context.protected_count);
481	return cc ? -EIO : 0;
482}
483
484/**
485 * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM.
486 * @kvm: the VM previously associated with the protected VM
487 * @rc: return value for the RC field of the UVCB
488 * @rrc: return value for the RRC field of the UVCB
489 *
490 * Tear down the protected VM that had been previously prepared for teardown
491 * using kvm_s390_pv_set_aside_vm. Ideally this should be called by
492 * userspace asynchronously from a separate thread.
493 *
494 * Context: kvm->lock must not be held.
495 *
496 * Return: 0 in case of success, -EINVAL if no protected VM had been
497 * prepared for asynchronous teardown, -EIO in case of other errors.
498 */
499int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
500{
501	struct pv_vm_to_be_destroyed *p;
502	int ret = 0;
503
504	lockdep_assert_not_held(&kvm->lock);
505	mutex_lock(&kvm->lock);
506	p = kvm->arch.pv.set_aside;
507	kvm->arch.pv.set_aside = NULL;
508	mutex_unlock(&kvm->lock);
509	if (!p)
510		return -EINVAL;
511
512	/* When a fatal signal is received, stop immediately */
513	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
514		goto done;
515	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
516		ret = -EIO;
517	kfree(p);
518	p = NULL;
519done:
520	/*
521	 * p is not NULL if we aborted because of a fatal signal, in which
522	 * case queue the leftover for later cleanup.
523	 */
524	if (p) {
525		mutex_lock(&kvm->lock);
526		list_add(&p->list, &kvm->arch.pv.need_cleanup);
527		mutex_unlock(&kvm->lock);
528		/* Did not finish, but pretend things went well */
529		*rc = UVC_RC_EXECUTED;
530		*rrc = 42;
531	}
532	return ret;
533}
534
535static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
536					     struct mm_struct *mm)
537{
538	struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier);
539	u16 dummy;
540	int r;
541
542	/*
543	 * No locking is needed since this is the last thread of the last user of this
544	 * struct mm.
545	 * When the struct kvm gets deinitialized, this notifier is also
546	 * unregistered. This means that if this notifier runs, then the
547	 * struct kvm is still valid.
548	 */
549	r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
550	if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm))
551		kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy);
552}
553
554static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
555	.release = kvm_s390_pv_mmu_notifier_release,
556};
557
558int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
559{
560	struct uv_cb_cgc uvcb = {
561		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
562		.header.len = sizeof(uvcb)
563	};
564	int cc, ret;
565	u16 dummy;
566
567	ret = kvm_s390_pv_alloc_vm(kvm);
568	if (ret)
569		return ret;
570
571	/* Inputs */
572	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
573	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
574	uvcb.guest_asce = kvm->arch.gmap->asce;
575	uvcb.guest_sca = virt_to_phys(kvm->arch.sca);
576	uvcb.conf_base_stor_origin =
577		virt_to_phys((void *)kvm->arch.pv.stor_base);
578	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
579	uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap;
580	uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr;
581
582	cc = uv_call_sched(0, (u64)&uvcb);
583	*rc = uvcb.header.rc;
584	*rrc = uvcb.header.rrc;
585	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x flags %04x",
586		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc, uvcb.flags.raw);
587
588	/* Outputs */
589	kvm->arch.pv.handle = uvcb.guest_handle;
590
591	atomic_inc(&kvm->mm->context.protected_count);
592	if (cc) {
593		if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
594			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
595		} else {
596			atomic_dec(&kvm->mm->context.protected_count);
597			kvm_s390_pv_dealloc_vm(kvm);
598		}
599		return -EIO;
600	}
601	kvm->arch.gmap->guest_handle = uvcb.guest_handle;
602	/* Add the notifier only once. No races because we hold kvm->lock */
603	if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
604		kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
605		mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
606	}
607	return 0;
608}
609
610int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
611			      u16 *rrc)
612{
613	struct uv_cb_ssc uvcb = {
614		.header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS,
615		.header.len = sizeof(uvcb),
616		.sec_header_origin = (u64)hdr,
617		.sec_header_len = length,
618		.guest_handle = kvm_s390_pv_get_handle(kvm),
619	};
620	int cc = uv_call(0, (u64)&uvcb);
621
622	*rc = uvcb.header.rc;
623	*rrc = uvcb.header.rrc;
624	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
625		     *rc, *rrc);
626	return cc ? -EINVAL : 0;
627}
628
629static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
630		      u64 offset, u16 *rc, u16 *rrc)
631{
632	struct uv_cb_unp uvcb = {
633		.header.cmd = UVC_CMD_UNPACK_IMG,
634		.header.len = sizeof(uvcb),
635		.guest_handle = kvm_s390_pv_get_handle(kvm),
636		.gaddr = addr,
637		.tweak[0] = tweak,
638		.tweak[1] = offset,
639	};
640	int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
641
642	*rc = uvcb.header.rc;
643	*rrc = uvcb.header.rrc;
644
645	if (ret && ret != -EAGAIN)
646		KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x",
647			     uvcb.gaddr, *rc, *rrc);
648	return ret;
649}
650
651int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
652		       unsigned long tweak, u16 *rc, u16 *rrc)
653{
654	u64 offset = 0;
655	int ret = 0;
656
657	if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK)
658		return -EINVAL;
659
660	KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx",
661		     addr, size);
662
663	while (offset < size) {
664		ret = unpack_one(kvm, addr, tweak, offset, rc, rrc);
665		if (ret == -EAGAIN) {
666			cond_resched();
667			if (fatal_signal_pending(current))
668				break;
669			continue;
670		}
671		if (ret)
672			break;
673		addr += PAGE_SIZE;
674		offset += PAGE_SIZE;
675	}
676	if (!ret)
677		KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful");
678	return ret;
679}
680
681int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state)
682{
683	struct uv_cb_cpu_set_state uvcb = {
684		.header.cmd	= UVC_CMD_CPU_SET_STATE,
685		.header.len	= sizeof(uvcb),
686		.cpu_handle	= kvm_s390_pv_cpu_get_handle(vcpu),
687		.state		= state,
688	};
689	int cc;
690
691	cc = uv_call(0, (u64)&uvcb);
692	KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x",
693		     vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc);
694	if (cc)
695		return -EINVAL;
696	return 0;
697}
698
699int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc)
700{
701	struct uv_cb_dump_cpu uvcb = {
702		.header.cmd = UVC_CMD_DUMP_CPU,
703		.header.len = sizeof(uvcb),
704		.cpu_handle = vcpu->arch.pv.handle,
705		.dump_area_origin = (u64)buff,
706	};
707	int cc;
708
709	cc = uv_call_sched(0, (u64)&uvcb);
710	*rc = uvcb.header.rc;
711	*rrc = uvcb.header.rrc;
712	return cc;
713}
714
715/* Size of the cache for the storage state dump data. 1MB for now */
716#define DUMP_BUFF_LEN HPAGE_SIZE
717
718/**
719 * kvm_s390_pv_dump_stor_state
720 *
721 * @kvm: pointer to the guest's KVM struct
722 * @buff_user: Userspace pointer where we will write the results to
723 * @gaddr: Starting absolute guest address for which the storage state
724 *	   is requested.
725 * @buff_user_len: Length of the buff_user buffer
726 * @rc: Pointer to where the uvcb return code is stored
727 * @rrc: Pointer to where the uvcb return reason code is stored
728 *
729 * Stores buff_len bytes of tweak component values to buff_user
730 * starting with the 1MB block specified by the absolute guest address
731 * (gaddr). The gaddr pointer will be updated with the last address
732 * for which data was written when returning to userspace. buff_user
733 * might be written to even if an error rc is returned. For instance
734 * if we encounter a fault after writing the first page of data.
735 *
736 * Context: kvm->lock needs to be held
737 *
738 * Return:
739 *  0 on success
740 *  -ENOMEM if allocating the cache fails
741 *  -EINVAL if gaddr is not aligned to 1MB
742 *  -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len
743 *  -EINVAL if the UV call fails, rc and rrc will be set in this case
744 *  -EFAULT if copying the result to buff_user failed
745 */
746int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
747				u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc)
748{
749	struct uv_cb_dump_stor_state uvcb = {
750		.header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE,
751		.header.len = sizeof(uvcb),
752		.config_handle = kvm->arch.pv.handle,
753		.gaddr = *gaddr,
754		.dump_area_origin = 0,
755	};
756	const u64 increment_len = uv_info.conf_dump_storage_state_len;
757	size_t buff_kvm_size;
758	size_t size_done = 0;
759	u8 *buff_kvm = NULL;
760	int cc, ret;
761
762	ret = -EINVAL;
763	/* UV call processes 1MB guest storage chunks at a time */
764	if (!IS_ALIGNED(*gaddr, HPAGE_SIZE))
765		goto out;
766
767	/*
768	 * We provide the storage state for 1MB chunks of guest
769	 * storage. The buffer will need to be aligned to
770	 * conf_dump_storage_state_len so we don't end on a partial
771	 * chunk.
772	 */
773	if (!buff_user_len ||
774	    !IS_ALIGNED(buff_user_len, increment_len))
775		goto out;
776
777	/*
778	 * Allocate a buffer from which we will later copy to the user
779	 * process. We don't want userspace to dictate our buffer size
780	 * so we limit it to DUMP_BUFF_LEN.
781	 */
782	ret = -ENOMEM;
783	buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN);
784	buff_kvm = vzalloc(buff_kvm_size);
785	if (!buff_kvm)
786		goto out;
787
788	ret = 0;
789	uvcb.dump_area_origin = (u64)buff_kvm;
790	/* We will loop until the user buffer is filled or an error occurs */
791	do {
792		/* Get 1MB worth of guest storage state data */
793		cc = uv_call_sched(0, (u64)&uvcb);
794
795		/* All or nothing */
796		if (cc) {
797			ret = -EINVAL;
798			break;
799		}
800
801		size_done += increment_len;
802		uvcb.dump_area_origin += increment_len;
803		buff_user_len -= increment_len;
804		uvcb.gaddr += HPAGE_SIZE;
805
806		/* KVM Buffer full, time to copy to the process */
807		if (!buff_user_len || size_done == DUMP_BUFF_LEN) {
808			if (copy_to_user(buff_user, buff_kvm, size_done)) {
809				ret = -EFAULT;
810				break;
811			}
812
813			buff_user += size_done;
814			size_done = 0;
815			uvcb.dump_area_origin = (u64)buff_kvm;
816		}
817	} while (buff_user_len);
818
819	/* Report back where we ended dumping */
820	*gaddr = uvcb.gaddr;
821
822	/* Let's only log errors, we don't want to spam */
823out:
824	if (ret)
825		KVM_UV_EVENT(kvm, 3,
826			     "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x",
827			     uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc);
828	*rc = uvcb.header.rc;
829	*rrc = uvcb.header.rrc;
830	vfree(buff_kvm);
831
832	return ret;
833}
834
835/**
836 * kvm_s390_pv_dump_complete
837 *
838 * @kvm: pointer to the guest's KVM struct
839 * @buff_user: Userspace pointer where we will write the results to
840 * @rc: Pointer to where the uvcb return code is stored
841 * @rrc: Pointer to where the uvcb return reason code is stored
842 *
843 * Completes the dumping operation and writes the completion data to
844 * user space.
845 *
846 * Context: kvm->lock needs to be held
847 *
848 * Return:
849 *  0 on success
850 *  -ENOMEM if allocating the completion buffer fails
851 *  -EINVAL if the UV call fails, rc and rrc will be set in this case
852 *  -EFAULT if copying the result to buff_user failed
853 */
854int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
855			      u16 *rc, u16 *rrc)
856{
857	struct uv_cb_dump_complete complete = {
858		.header.len = sizeof(complete),
859		.header.cmd = UVC_CMD_DUMP_COMPLETE,
860		.config_handle = kvm_s390_pv_get_handle(kvm),
861	};
862	u64 *compl_data;
863	int ret;
864
865	/* Allocate dump area */
866	compl_data = vzalloc(uv_info.conf_dump_finalize_len);
867	if (!compl_data)
868		return -ENOMEM;
869	complete.dump_area_origin = (u64)compl_data;
870
871	ret = uv_call_sched(0, (u64)&complete);
872	*rc = complete.header.rc;
873	*rrc = complete.header.rrc;
874	KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x",
875		     complete.header.rc, complete.header.rrc);
876
877	if (!ret) {
878		/*
879		 * kvm_s390_pv_dealloc_vm() will also (mem)set
880		 * this to false on a reboot or other destroy
881		 * operation for this vm.
882		 */
883		kvm->arch.pv.dumping = false;
884		kvm_s390_vcpu_unblock_all(kvm);
885		ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len);
886		if (ret)
887			ret = -EFAULT;
888	}
889	vfree(compl_data);
890	/* If the UVC returned an error, translate it to -EINVAL */
891	if (ret > 0)
892		ret = -EINVAL;
893	return ret;
894}