v6.8
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *
   4 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   5 */
   6
   7#include <linux/types.h>
   8#include <linux/string.h>
   9#include <linux/kvm.h>
  10#include <linux/kvm_host.h>
  11#include <linux/anon_inodes.h>
  12#include <linux/file.h>
  13#include <linux/debugfs.h>
  14#include <linux/pgtable.h>
  15
  16#include <asm/kvm_ppc.h>
  17#include <asm/kvm_book3s.h>
  18#include "book3s_hv.h"
  19#include <asm/page.h>
  20#include <asm/mmu.h>
  21#include <asm/pgalloc.h>
  22#include <asm/pte-walk.h>
  23#include <asm/ultravisor.h>
  24#include <asm/kvm_book3s_uvmem.h>
  25#include <asm/plpar_wrappers.h>
  26#include <asm/firmware.h>
  27
  28/*
  29 * Supported radix tree geometry.
  30 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
  31 * for a page size of 64k or 4k.
  32 */
  33static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
  34
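/*
 * Editor's note (informal sketch, not part of the upstream source): the
 * helper below reads or writes guest memory by temporarily switching the
 * host's LPID/PID SPRs and then doing an ordinary copy through a
 * "quadrant" effective address, where EA bits 63:62 select the quadrant.
 * For example, guest EA 0x1000 accessed via quadrant 1 becomes
 * 0x1000 | (1UL << 62) = 0x4000000000001000.  Quadrant 2 is used when
 * pid == 0 (guest real mode); quadrants 1 and 2 are not accessible in
 * non-HV mode, hence the H_COPY_TOFROM_GUEST hcall fallback on pseries.
 */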
  35unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
  36					      gva_t eaddr, void *to, void *from,
  37					      unsigned long n)
  38{
  39	int old_pid, old_lpid;
  40	unsigned long quadrant, ret = n;
  41	bool is_load = !!to;
  42
  43	if (kvmhv_is_nestedv2())
  44		return H_UNSUPPORTED;
  45
  46	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
  47	if (kvmhv_on_pseries())
  48		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
  49					  (to != NULL) ? __pa(to): 0,
  50					  (from != NULL) ? __pa(from): 0, n);
  51
  52	if (eaddr & (0xFFFUL << 52))
  53		return ret;
  54
  55	quadrant = 1;
  56	if (!pid)
  57		quadrant = 2;
  58	if (is_load)
  59		from = (void *) (eaddr | (quadrant << 62));
  60	else
  61		to = (void *) (eaddr | (quadrant << 62));
  62
  63	preempt_disable();
  64
  65	asm volatile("hwsync" ::: "memory");
  66	isync();
  67	/* switch the lpid first to avoid running host with unallocated pid */
  68	old_lpid = mfspr(SPRN_LPID);
  69	if (old_lpid != lpid)
  70		mtspr(SPRN_LPID, lpid);
  71	if (quadrant == 1) {
  72		old_pid = mfspr(SPRN_PID);
  73		if (old_pid != pid)
  74			mtspr(SPRN_PID, pid);
  75	}
  76	isync();
  77
  78	pagefault_disable();
  79	if (is_load)
  80		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
  81	else
  82		ret = __copy_to_user_inatomic((void __user *)to, from, n);
  83	pagefault_enable();
  84
  85	asm volatile("hwsync" ::: "memory");
  86	isync();
  87	/* switch the pid first to avoid running host with unallocated pid */
  88	if (quadrant == 1 && pid != old_pid)
  89		mtspr(SPRN_PID, old_pid);
  90	if (lpid != old_lpid)
  91		mtspr(SPRN_LPID, old_lpid);
  92	isync();
  93
  94	preempt_enable();
  95
  96	return ret;
  97}
  98
  99static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
 100					  void *to, void *from, unsigned long n)
 101{
 102	int lpid = vcpu->kvm->arch.lpid;
 103	int pid;
 104
 105	/* This would cause a data segment intr so don't allow the access */
 106	if (eaddr & (0x3FFUL << 52))
 107		return -EINVAL;
 108
 109	/* Should we be using the nested lpid */
 110	if (vcpu->arch.nested)
 111		lpid = vcpu->arch.nested->shadow_lpid;
 112
 113	/* If accessing quadrant 3 then pid is expected to be 0 */
 114	if (((eaddr >> 62) & 0x3) == 0x3)
 115		pid = 0;
 116	else
 117		pid = kvmppc_get_pid(vcpu);
 118
 119	eaddr &= ~(0xFFFUL << 52);
 120
 121	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
 122}
 123
 124long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
 125				 unsigned long n)
 126{
 127	long ret;
 128
 129	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
 130	if (ret > 0)
 131		memset(to + (n - ret), 0, ret);
 132
 133	return ret;
 134}
 135
 136long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
 137			       unsigned long n)
 138{
 139	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
 140}
 141
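/*
 * Editor's note (worked example, assuming the standard POWER9 geometry):
 * "rts" encodes the radix tree size and offset = rts + 31 is the total
 * number of address bits translated, which must be 52 here.  Each level
 * of the walk then consumes "bits" index bits.  With level sizes
 * {13, 9, 9, 5} the offset goes 52 -> 39 -> 30 -> 21 -> 16, leaving a
 * 16-bit (64k) page offset; a 9-bit lowest level leaves 12 bits (4k).
 */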
 142int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
 143			       struct kvmppc_pte *gpte, u64 root,
 144			       u64 *pte_ret_p)
 145{
 146	struct kvm *kvm = vcpu->kvm;
 147	int ret, level, ps;
 148	unsigned long rts, bits, offset, index;
 149	u64 pte, base, gpa;
 150	__be64 rpte;
 151
 152	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
 153		((root & RTS2_MASK) >> RTS2_SHIFT);
 154	bits = root & RPDS_MASK;
 155	base = root & RPDB_MASK;
 156
 157	offset = rts + 31;
 158
 159	/* Current implementations only support 52-bit space */
 160	if (offset != 52)
 161		return -EINVAL;
 162
 163	/* Walk each level of the radix tree */
 164	for (level = 3; level >= 0; --level) {
 165		u64 addr;
 166		/* Check a valid size */
 167		if (level && bits != p9_supported_radix_bits[level])
 168			return -EINVAL;
 169		if (level == 0 && !(bits == 5 || bits == 9))
 170			return -EINVAL;
 171		offset -= bits;
 172		index = (eaddr >> offset) & ((1UL << bits) - 1);
 173		/* Check that low bits of page table base are zero */
 174		if (base & ((1UL << (bits + 3)) - 1))
 175			return -EINVAL;
 176		/* Read the entry from guest memory */
 177		addr = base + (index * sizeof(rpte));
 178
 179		kvm_vcpu_srcu_read_lock(vcpu);
 180		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
 181		kvm_vcpu_srcu_read_unlock(vcpu);
 182		if (ret) {
 183			if (pte_ret_p)
 184				*pte_ret_p = addr;
 185			return ret;
 186		}
 187		pte = __be64_to_cpu(rpte);
 188		if (!(pte & _PAGE_PRESENT))
 189			return -ENOENT;
 190		/* Check if a leaf entry */
 191		if (pte & _PAGE_PTE)
 192			break;
 193		/* Get ready to walk the next level */
 194		base = pte & RPDB_MASK;
 195		bits = pte & RPDS_MASK;
 196	}
 197
 198	/* Need a leaf at lowest level; 512GB pages not supported */
 199	if (level < 0 || level == 3)
 200		return -EINVAL;
 201
 202	/* We found a valid leaf PTE */
 203	/* Offset is now log base 2 of the page size */
 204	gpa = pte & 0x01fffffffffff000ul;
 205	if (gpa & ((1ul << offset) - 1))
 206		return -EINVAL;
 207	gpa |= eaddr & ((1ul << offset) - 1);
 208	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
 209		if (offset == mmu_psize_defs[ps].shift)
 210			break;
 211	gpte->page_size = ps;
 212	gpte->page_shift = offset;
 213
 214	gpte->eaddr = eaddr;
 215	gpte->raddr = gpa;
 216
 217	/* Work out permissions */
 218	gpte->may_read = !!(pte & _PAGE_READ);
 219	gpte->may_write = !!(pte & _PAGE_WRITE);
 220	gpte->may_execute = !!(pte & _PAGE_EXEC);
 221
 222	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
 223
 224	if (pte_ret_p)
 225		*pte_ret_p = pte;
 226
 227	return 0;
 228}
 229
 230/*
 231 * Used to walk a partition or process table radix tree in guest memory
 232 * Note: We exploit the fact that a partition table and a process
 233 * table have the same layout, a partition-scoped page table and a
 234 * process-scoped page table have the same layout, and the 2nd
 235 * doubleword of a partition table entry has the same layout as
 236 * the PTCR register.
 237 */
 238int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
 239				     struct kvmppc_pte *gpte, u64 table,
 240				     int table_index, u64 *pte_ret_p)
 241{
 242	struct kvm *kvm = vcpu->kvm;
 243	int ret;
 244	unsigned long size, ptbl, root;
 245	struct prtb_entry entry;
 246
 247	if ((table & PRTS_MASK) > 24)
 248		return -EINVAL;
 249	size = 1ul << ((table & PRTS_MASK) + 12);
 250
 251	/* Is the table big enough to contain this entry? */
 252	if ((table_index * sizeof(entry)) >= size)
 253		return -EINVAL;
 254
 255	/* Read the table to find the root of the radix tree */
 256	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
 257	kvm_vcpu_srcu_read_lock(vcpu);
 258	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
 259	kvm_vcpu_srcu_read_unlock(vcpu);
 260	if (ret)
 261		return ret;
 262
 263	/* Root is stored in the first double word */
 264	root = be64_to_cpu(entry.prtb0);
 265
 266	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
 267}
 268
 269int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 270			   struct kvmppc_pte *gpte, bool data, bool iswrite)
 271{
 272	u32 pid;
 273	u64 pte;
 274	int ret;
 275
 276	/* Work out effective PID */
 277	switch (eaddr >> 62) {
 278	case 0:
 279		pid = kvmppc_get_pid(vcpu);
 280		break;
 281	case 3:
 282		pid = 0;
 283		break;
 284	default:
 285		return -EINVAL;
 286	}
 287
 288	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
 289				vcpu->kvm->arch.process_table, pid, &pte);
 290	if (ret)
 291		return ret;
 292
 293	/* Check privilege (applies only to process scoped translations) */
 294	if (kvmppc_get_msr(vcpu) & MSR_PR) {
 295		if (pte & _PAGE_PRIVILEGED) {
 296			gpte->may_read = 0;
 297			gpte->may_write = 0;
 298			gpte->may_execute = 0;
 299		}
 300	} else {
 301		if (!(pte & _PAGE_PRIVILEGED)) {
 302			/* Check AMR/IAMR to see if strict mode is in force */
 303			if (kvmppc_get_amr_hv(vcpu) & (1ul << 62))
 304				gpte->may_read = 0;
 305			if (kvmppc_get_amr_hv(vcpu) & (1ul << 63))
 306				gpte->may_write = 0;
 307			if (vcpu->arch.iamr & (1ul << 62))
 308				gpte->may_execute = 0;
 309		}
 310	}
 311
 312	return 0;
 313}
 314
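/*
 * Editor's note: the next two helpers pick one of three invalidation
 * paths.  Running as the bare-metal hypervisor they call the radix__
 * flush routines directly; running nested on pseries they fall back to
 * either the H_TLB_INVALIDATE hcall or, when the firmware advertises
 * FW_FEATURE_RPT_INVALIDATE, the pseries_rpt_invalidate() interface.
 */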
 315void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
 316			     unsigned int pshift, u64 lpid)
 317{
 318	unsigned long psize = PAGE_SIZE;
 319	int psi;
 320	long rc;
 321	unsigned long rb;
 322
 323	if (pshift)
 324		psize = 1UL << pshift;
 325	else
 326		pshift = PAGE_SHIFT;
 327
 328	addr &= ~(psize - 1);
 329
 330	if (!kvmhv_on_pseries()) {
 331		radix__flush_tlb_lpid_page(lpid, addr, psize);
 332		return;
 333	}
 334
 335	psi = shift_to_mmu_psize(pshift);
 336
 337	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
 338		rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
 339		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
 340					lpid, rb);
 341	} else {
 342		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
 343					    H_RPTI_TYPE_NESTED |
 344					    H_RPTI_TYPE_TLB,
 345					    psize_to_rpti_pgsize(psi),
 346					    addr, addr + psize);
 347	}
 348
 349	if (rc)
 350		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
 351}
 352
 353static void kvmppc_radix_flush_pwc(struct kvm *kvm, u64 lpid)
 354{
 355	long rc;
 356
 357	if (!kvmhv_on_pseries()) {
 358		radix__flush_pwc_lpid(lpid);
 359		return;
 360	}
 361
 362	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
 363		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
 364					lpid, TLBIEL_INVAL_SET_LPID);
 365	else
 366		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
 367					    H_RPTI_TYPE_NESTED |
 368					    H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
 369					    0, -1UL);
 370	if (rc)
 371		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
 372}
 373
 374static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
 375				      unsigned long clr, unsigned long set,
 376				      unsigned long addr, unsigned int shift)
 377{
 378	return __radix_pte_update(ptep, clr, set);
 379}
 380
 381static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
 382			     pte_t *ptep, pte_t pte)
 383{
 384	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
 385}
 386
 387static struct kmem_cache *kvm_pte_cache;
 388static struct kmem_cache *kvm_pmd_cache;
 389
 390static pte_t *kvmppc_pte_alloc(void)
 391{
 392	pte_t *pte;
 393
 394	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
 395	/* pmd_populate() will only reference _pa(pte). */
 396	kmemleak_ignore(pte);
 397
 398	return pte;
 399}
 400
 401static void kvmppc_pte_free(pte_t *ptep)
 402{
 403	kmem_cache_free(kvm_pte_cache, ptep);
 404}
 405
 406static pmd_t *kvmppc_pmd_alloc(void)
 407{
 408	pmd_t *pmd;
 409
 410	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
 411	/* pud_populate() will only reference _pa(pmd). */
 412	kmemleak_ignore(pmd);
 413
 414	return pmd;
 415}
 416
 417static void kvmppc_pmd_free(pmd_t *pmdp)
 418{
 419	kmem_cache_free(kvm_pmd_cache, pmdp);
 420}
 421
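/*
 * Editor's note: kvmppc_unmap_pte() tears down a single partition-scoped
 * mapping: it clears the PTE and issues the TLB invalidation, and, for
 * L1 entries only (lpid == kvm->arch.lpid), also updates the huge-page
 * counters, removes nested rmap entries covering the range and transfers
 * the dirty bit into the memslot's dirty bitmap.
 */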
 422/* Called with kvm->mmu_lock held */
 423void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
 424		      unsigned int shift,
 425		      const struct kvm_memory_slot *memslot,
 426		      u64 lpid)
 427
 428{
 429	unsigned long old;
 430	unsigned long gfn = gpa >> PAGE_SHIFT;
 431	unsigned long page_size = PAGE_SIZE;
 432	unsigned long hpa;
 433
 434	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
 435	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
 436
 437	/* The following only applies to L1 entries */
 438	if (lpid != kvm->arch.lpid)
 439		return;
 440
 441	if (!memslot) {
 442		memslot = gfn_to_memslot(kvm, gfn);
 443		if (!memslot)
 444			return;
 445	}
 446	if (shift) { /* 1GB or 2MB page */
 447		page_size = 1ul << shift;
 448		if (shift == PMD_SHIFT)
 449			kvm->stat.num_2M_pages--;
 450		else if (shift == PUD_SHIFT)
 451			kvm->stat.num_1G_pages--;
 452	}
 453
 454	gpa &= ~(page_size - 1);
 455	hpa = old & PTE_RPN_MASK;
 456	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
 457
 458	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
 459		kvmppc_update_dirty_map(memslot, gfn, page_size);
 460}
 461
 462/*
 463 * kvmppc_free_p?d are used to free existing page tables, and recursively
 464 * descend and clear and free children.
 465 * Callers are responsible for flushing the PWC.
 466 *
 467 * When page tables are being unmapped/freed as part of page fault path
 468 * (full == false), valid ptes are generally not expected; however, there
 469 * is one situation where they arise, which is when dirty page logging is
 470 * turned off for a memslot while the VM is running.  The new memslot
 471 * becomes visible to page faults before the memslot commit function
 472 * gets to flush the memslot, which can lead to a 2MB page mapping being
 473 * installed for a guest physical address where there are already 64kB
 474 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 475 */
 476static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
 477				  u64 lpid)
 478{
 479	if (full) {
 480		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
 481	} else {
 482		pte_t *p = pte;
 483		unsigned long it;
 484
 485		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
 486			if (pte_val(*p) == 0)
 487				continue;
 488			kvmppc_unmap_pte(kvm, p,
 489					 pte_pfn(*p) << PAGE_SHIFT,
 490					 PAGE_SHIFT, NULL, lpid);
 491		}
 492	}
 493
 494	kvmppc_pte_free(pte);
 495}
 496
 497static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
 498				  u64 lpid)
 499{
 500	unsigned long im;
 501	pmd_t *p = pmd;
 502
 503	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
 504		if (!pmd_present(*p))
 505			continue;
 506		if (pmd_is_leaf(*p)) {
 507			if (full) {
 508				pmd_clear(p);
 509			} else {
 510				WARN_ON_ONCE(1);
 511				kvmppc_unmap_pte(kvm, (pte_t *)p,
 512					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
 513					 PMD_SHIFT, NULL, lpid);
 514			}
 515		} else {
 516			pte_t *pte;
 517
 518			pte = pte_offset_kernel(p, 0);
 519			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
 520			pmd_clear(p);
 521		}
 522	}
 523	kvmppc_pmd_free(pmd);
 524}
 525
 526static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
 527				  u64 lpid)
 528{
 529	unsigned long iu;
 530	pud_t *p = pud;
 531
 532	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
 533		if (!pud_present(*p))
 534			continue;
 535		if (pud_is_leaf(*p)) {
 536			pud_clear(p);
 537		} else {
 538			pmd_t *pmd;
 539
 540			pmd = pmd_offset(p, 0);
 541			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
 542			pud_clear(p);
 543		}
 544	}
 545	pud_free(kvm->mm, pud);
 546}
 547
 548void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, u64 lpid)
 549{
 550	unsigned long ig;
 551
 552	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
 553		p4d_t *p4d = p4d_offset(pgd, 0);
 554		pud_t *pud;
 555
 556		if (!p4d_present(*p4d))
 557			continue;
 558		pud = pud_offset(p4d, 0);
 559		kvmppc_unmap_free_pud(kvm, pud, lpid);
 560		p4d_clear(p4d);
 561	}
 562}
 563
 564void kvmppc_free_radix(struct kvm *kvm)
 565{
 566	if (kvm->arch.pgtable) {
 567		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
 568					  kvm->arch.lpid);
 569		pgd_free(kvm->mm, kvm->arch.pgtable);
 570		kvm->arch.pgtable = NULL;
 571	}
 572}
 573
 574static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
 575					unsigned long gpa, u64 lpid)
 576{
 577	pte_t *pte = pte_offset_kernel(pmd, 0);
 578
 579	/*
 580	 * Clearing the pmd entry then flushing the PWC ensures that the pte
  581	 * page will no longer be cached by the MMU, so it can be freed without
 582	 * flushing the PWC again.
 583	 */
 584	pmd_clear(pmd);
 585	kvmppc_radix_flush_pwc(kvm, lpid);
 586
 587	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
 588}
 589
 590static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 591					unsigned long gpa, u64 lpid)
 592{
 593	pmd_t *pmd = pmd_offset(pud, 0);
 594
 595	/*
 596	 * Clearing the pud entry then flushing the PWC ensures that the pmd
  597	 * page and any child pte pages will no longer be cached by the MMU,
  598	 * so they can be freed without flushing the PWC again.
 599	 */
 600	pud_clear(pud);
 601	kvmppc_radix_flush_pwc(kvm, lpid);
 602
 603	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
 604}
 605
 606/*
 607 * There are a number of bits which may differ between different faults to
 608 * the same partition scope entry. RC bits, in the course of cleaning and
 609 * aging. And the write bit can change, either the access could have been
 610 * upgraded, or a read fault could happen concurrently with a write fault
 611 * that sets those bits first.
 612 */
 613#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
 614
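/*
 * Editor's note: kvmppc_create_pte() follows the usual KVM pattern of
 * allocating any missing intermediate tables outside the mmu_lock, then
 * re-walking under the lock and backing out with -EAGAIN if
 * mmu_invalidate_retry() shows that an invalidation raced with us; any
 * unused preallocations are freed at out_unlock.  "level" is 0, 1 or 2
 * for base-page, 2MB and 1GB mappings respectively.
 */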
 615int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 616		      unsigned long gpa, unsigned int level,
 617		      unsigned long mmu_seq, u64 lpid,
 618		      unsigned long *rmapp, struct rmap_nested **n_rmap)
 619{
 620	pgd_t *pgd;
 621	p4d_t *p4d;
 622	pud_t *pud, *new_pud = NULL;
 623	pmd_t *pmd, *new_pmd = NULL;
 624	pte_t *ptep, *new_ptep = NULL;
 625	int ret;
 626
 627	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
 628	pgd = pgtable + pgd_index(gpa);
 629	p4d = p4d_offset(pgd, gpa);
 630
 631	pud = NULL;
 632	if (p4d_present(*p4d))
 633		pud = pud_offset(p4d, gpa);
 634	else
 635		new_pud = pud_alloc_one(kvm->mm, gpa);
 636
 637	pmd = NULL;
 638	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
 639		pmd = pmd_offset(pud, gpa);
 640	else if (level <= 1)
 641		new_pmd = kvmppc_pmd_alloc();
 642
 643	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
 644		new_ptep = kvmppc_pte_alloc();
 645
 646	/* Check if we might have been invalidated; let the guest retry if so */
 647	spin_lock(&kvm->mmu_lock);
 648	ret = -EAGAIN;
 649	if (mmu_invalidate_retry(kvm, mmu_seq))
 650		goto out_unlock;
 651
 652	/* Now traverse again under the lock and change the tree */
 653	ret = -ENOMEM;
 654	if (p4d_none(*p4d)) {
 655		if (!new_pud)
 656			goto out_unlock;
 657		p4d_populate(kvm->mm, p4d, new_pud);
 658		new_pud = NULL;
 659	}
 660	pud = pud_offset(p4d, gpa);
 661	if (pud_is_leaf(*pud)) {
 662		unsigned long hgpa = gpa & PUD_MASK;
 663
 664		/* Check if we raced and someone else has set the same thing */
 665		if (level == 2) {
 666			if (pud_raw(*pud) == pte_raw(pte)) {
 667				ret = 0;
 668				goto out_unlock;
 669			}
 670			/* Valid 1GB page here already, add our extra bits */
 671			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
 672							PTE_BITS_MUST_MATCH);
 673			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
 674					      0, pte_val(pte), hgpa, PUD_SHIFT);
 675			ret = 0;
 676			goto out_unlock;
 677		}
 678		/*
 679		 * If we raced with another CPU which has just put
 680		 * a 1GB pte in after we saw a pmd page, try again.
 681		 */
 682		if (!new_pmd) {
 683			ret = -EAGAIN;
 684			goto out_unlock;
 685		}
 686		/* Valid 1GB page here already, remove it */
 687		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
 688				 lpid);
 689	}
 690	if (level == 2) {
 691		if (!pud_none(*pud)) {
 692			/*
 693			 * There's a page table page here, but we wanted to
 694			 * install a large page, so remove and free the page
 695			 * table page.
 696			 */
 697			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
 698		}
 699		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
 700		if (rmapp && n_rmap)
 701			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 702		ret = 0;
 703		goto out_unlock;
 704	}
 705	if (pud_none(*pud)) {
 706		if (!new_pmd)
 707			goto out_unlock;
 708		pud_populate(kvm->mm, pud, new_pmd);
 709		new_pmd = NULL;
 710	}
 711	pmd = pmd_offset(pud, gpa);
 712	if (pmd_is_leaf(*pmd)) {
 713		unsigned long lgpa = gpa & PMD_MASK;
 714
 715		/* Check if we raced and someone else has set the same thing */
 716		if (level == 1) {
 717			if (pmd_raw(*pmd) == pte_raw(pte)) {
 718				ret = 0;
 719				goto out_unlock;
 720			}
 721			/* Valid 2MB page here already, add our extra bits */
 722			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
 723							PTE_BITS_MUST_MATCH);
 724			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
 725					0, pte_val(pte), lgpa, PMD_SHIFT);
 726			ret = 0;
 727			goto out_unlock;
 728		}
 729
 730		/*
 731		 * If we raced with another CPU which has just put
 732		 * a 2MB pte in after we saw a pte page, try again.
 733		 */
 734		if (!new_ptep) {
 735			ret = -EAGAIN;
 736			goto out_unlock;
 737		}
 738		/* Valid 2MB page here already, remove it */
 739		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
 740				 lpid);
 741	}
 742	if (level == 1) {
 743		if (!pmd_none(*pmd)) {
 744			/*
 745			 * There's a page table page here, but we wanted to
 746			 * install a large page, so remove and free the page
 747			 * table page.
 748			 */
 749			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
 750		}
 751		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
 752		if (rmapp && n_rmap)
 753			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 754		ret = 0;
 755		goto out_unlock;
 756	}
 757	if (pmd_none(*pmd)) {
 758		if (!new_ptep)
 759			goto out_unlock;
 760		pmd_populate(kvm->mm, pmd, new_ptep);
 761		new_ptep = NULL;
 762	}
 763	ptep = pte_offset_kernel(pmd, gpa);
 764	if (pte_present(*ptep)) {
 765		/* Check if someone else set the same thing */
 766		if (pte_raw(*ptep) == pte_raw(pte)) {
 767			ret = 0;
 768			goto out_unlock;
 769		}
 770		/* Valid page here already, add our extra bits */
 771		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
 772							PTE_BITS_MUST_MATCH);
 773		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
 774		ret = 0;
 775		goto out_unlock;
 776	}
 777	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
 778	if (rmapp && n_rmap)
 779		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 780	ret = 0;
 781
 782 out_unlock:
 783	spin_unlock(&kvm->mmu_lock);
 784	if (new_pud)
 785		pud_free(kvm->mm, new_pud);
 786	if (new_pmd)
 787		kvmppc_pmd_free(new_pmd);
 788	if (new_ptep)
 789		kvmppc_pte_free(new_ptep);
 790	return ret;
 791}
 792
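/*
 * Editor's note: this helper emulates what the hardware would do for a
 * reference/change update: it looks up the partition-scoped (or nested
 * shadow) PTE and, if the access is permitted, ORs in _PAGE_ACCESSED and
 * possibly _PAGE_DIRTY instead of taking the full fault path.
 */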
 793bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
 794			     unsigned long gpa, u64 lpid)
 795{
 796	unsigned long pgflags;
 797	unsigned int shift;
 798	pte_t *ptep;
 799
 800	/*
 801	 * Need to set an R or C bit in the 2nd-level tables;
 802	 * since we are just helping out the hardware here,
 803	 * it is sufficient to do what the hardware does.
 804	 */
 805	pgflags = _PAGE_ACCESSED;
 806	if (writing)
 807		pgflags |= _PAGE_DIRTY;
 808
 809	if (nested)
 810		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
 811	else
 812		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
 813
 814	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
 815		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
 816		return true;
 817	}
 818	return false;
 819}
 820
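/*
 * Editor's note (summary of the flow below): snapshot mmu_invalidate_seq,
 * try get_user_page_fast_only() for the common writable case, otherwise
 * fall back to __gfn_to_pfn_memslot(); then read the host process-scoped
 * PTE to pick up the page size and attribute bits, choose the largest
 * level whose gpa/hva alignment matches (unless dirty logging forces
 * single pages), and install the entry with kvmppc_create_pte().
 */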
 821int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 822				   unsigned long gpa,
 823				   struct kvm_memory_slot *memslot,
 824				   bool writing, bool kvm_ro,
 825				   pte_t *inserted_pte, unsigned int *levelp)
 826{
 827	struct kvm *kvm = vcpu->kvm;
 828	struct page *page = NULL;
 829	unsigned long mmu_seq;
 830	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
 831	bool upgrade_write = false;
 832	bool *upgrade_p = &upgrade_write;
 833	pte_t pte, *ptep;
 834	unsigned int shift, level;
 835	int ret;
 836	bool large_enable;
 837
 838	/* used to check for invalidations in progress */
 839	mmu_seq = kvm->mmu_invalidate_seq;
 840	smp_rmb();
 841
 842	/*
 843	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
 844	 * do it with !atomic && !async, which is how we call it.
 845	 * We always ask for write permission since the common case
 846	 * is that the page is writable.
 847	 */
 848	hva = gfn_to_hva_memslot(memslot, gfn);
 849	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
 850		upgrade_write = true;
 851	} else {
 852		unsigned long pfn;
 853
 854		/* Call KVM generic code to do the slow-path check */
 855		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
 856					   writing, upgrade_p, NULL);
 857		if (is_error_noslot_pfn(pfn))
 858			return -EFAULT;
 859		page = NULL;
 860		if (pfn_valid(pfn)) {
 861			page = pfn_to_page(pfn);
 862			if (PageReserved(page))
 863				page = NULL;
 864		}
 865	}
 866
 867	/*
 868	 * Read the PTE from the process' radix tree and use that
 869	 * so we get the shift and attribute bits.
 870	 */
 871	spin_lock(&kvm->mmu_lock);
 872	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
 873	pte = __pte(0);
 874	if (ptep)
 875		pte = READ_ONCE(*ptep);
 876	spin_unlock(&kvm->mmu_lock);
 877	/*
 878	 * If the PTE disappeared temporarily due to a THP
 879	 * collapse, just return and let the guest try again.
 880	 */
 881	if (!pte_present(pte)) {
 882		if (page)
 883			put_page(page);
 884		return RESUME_GUEST;
 885	}
 886
 887	/* If we're logging dirty pages, always map single pages */
 888	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
 889
 890	/* Get pte level from shift/size */
 891	if (large_enable && shift == PUD_SHIFT &&
 892	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
 893	    (hva & (PUD_SIZE - PAGE_SIZE))) {
 894		level = 2;
 895	} else if (large_enable && shift == PMD_SHIFT &&
 896		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
 897		   (hva & (PMD_SIZE - PAGE_SIZE))) {
 898		level = 1;
 899	} else {
 900		level = 0;
 901		if (shift > PAGE_SHIFT) {
 902			/*
 903			 * If the pte maps more than one page, bring over
 904			 * bits from the virtual address to get the real
 905			 * address of the specific single page we want.
 906			 */
 907			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
 908			pte = __pte(pte_val(pte) | (hva & rpnmask));
 909		}
 910	}
 911
 912	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
 913	if (writing || upgrade_write) {
 914		if (pte_val(pte) & _PAGE_WRITE)
 915			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
 916	} else {
 917		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
 918	}
 919
 920	/* Allocate space in the tree and write the PTE */
 921	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
 922				mmu_seq, kvm->arch.lpid, NULL, NULL);
 923	if (inserted_pte)
 924		*inserted_pte = pte;
 925	if (levelp)
 926		*levelp = level;
 927
 928	if (page) {
 929		if (!ret && (pte_val(pte) & _PAGE_WRITE))
 930			set_page_dirty_lock(page);
 931		put_page(page);
 932	}
 933
 934	/* Increment number of large pages if we (successfully) inserted one */
 935	if (!ret) {
 936		if (level == 1)
 937			kvm->stat.num_2M_pages++;
 938		else if (level == 2)
 939			kvm->stat.num_1G_pages++;
 940	}
 941
 942	return ret;
 943}
 944
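/*
 * Editor's note: the fault handler below reflects unusual errors back to
 * the guest as a DSI, treats a missing memslot as emulated MMIO, rejects
 * writes to read-only memslots, handles DSISR_SET_RC via the fast R/C
 * helper above, and only then falls through to instantiating a new
 * mapping.
 */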
 945int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
 946				   unsigned long ea, unsigned long dsisr)
 947{
 948	struct kvm *kvm = vcpu->kvm;
 949	unsigned long gpa, gfn;
 950	struct kvm_memory_slot *memslot;
 951	long ret;
 952	bool writing = !!(dsisr & DSISR_ISSTORE);
 953	bool kvm_ro = false;
 954
 955	/* Check for unusual errors */
 956	if (dsisr & DSISR_UNSUPP_MMU) {
 957		pr_err("KVM: Got unsupported MMU fault\n");
 958		return -EFAULT;
 959	}
 960	if (dsisr & DSISR_BADACCESS) {
 961		/* Reflect to the guest as DSI */
 962		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
 963		kvmppc_core_queue_data_storage(vcpu,
 964				kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
 965				ea, dsisr);
 966		return RESUME_GUEST;
 967	}
 968
 969	/* Translate the logical address */
 970	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
 971	gpa &= ~0xF000000000000000ul;
 972	gfn = gpa >> PAGE_SHIFT;
 973	if (!(dsisr & DSISR_PRTABLE_FAULT))
 974		gpa |= ea & 0xfff;
 975
 976	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
 977		return kvmppc_send_page_to_uv(kvm, gfn);
 978
 979	/* Get the corresponding memslot */
 980	memslot = gfn_to_memslot(kvm, gfn);
 981
 982	/* No memslot means it's an emulated MMIO region */
 983	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
 984		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
 985			     DSISR_SET_RC)) {
 986			/*
 987			 * Bad address in guest page table tree, or other
 988			 * unusual error - reflect it to the guest as DSI.
 989			 */
 990			kvmppc_core_queue_data_storage(vcpu,
 991					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
 992					ea, dsisr);
 993			return RESUME_GUEST;
 994		}
 995		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
 996	}
 997
 998	if (memslot->flags & KVM_MEM_READONLY) {
 999		if (writing) {
1000			/* give the guest a DSI */
1001			kvmppc_core_queue_data_storage(vcpu,
1002					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
1003					ea, DSISR_ISSTORE | DSISR_PROTFAULT);
1004			return RESUME_GUEST;
1005		}
1006		kvm_ro = true;
1007	}
1008
1009	/* Failed to set the reference/change bits */
1010	if (dsisr & DSISR_SET_RC) {
1011		spin_lock(&kvm->mmu_lock);
1012		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
1013					    gpa, kvm->arch.lpid))
1014			dsisr &= ~DSISR_SET_RC;
1015		spin_unlock(&kvm->mmu_lock);
1016
1017		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
1018			       DSISR_PROTFAULT | DSISR_SET_RC)))
1019			return RESUME_GUEST;
1020	}
1021
1022	/* Try to insert a pte */
1023	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
1024					     kvm_ro, NULL, NULL);
1025
1026	if (ret == 0 || ret == -EAGAIN)
1027		ret = RESUME_GUEST;
1028	return ret;
1029}
1030
1031/* Called with kvm->mmu_lock held */
1032void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1033		     unsigned long gfn)
1034{
1035	pte_t *ptep;
1036	unsigned long gpa = gfn << PAGE_SHIFT;
1037	unsigned int shift;
1038
1039	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
1040		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
1041		return;
1042	}
1043
1044	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1045	if (ptep && pte_present(*ptep))
1046		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1047				 kvm->arch.lpid);
1048}
1049
1050/* Called with kvm->mmu_lock held */
1051bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1052		   unsigned long gfn)
1053{
1054	pte_t *ptep;
1055	unsigned long gpa = gfn << PAGE_SHIFT;
1056	unsigned int shift;
1057	bool ref = false;
1058	unsigned long old, *rmapp;
1059
1060	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1061		return ref;
1062
1063	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1064	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
1065		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
1066					      gpa, shift);
1067		/* XXX need to flush tlb here? */
1068		/* Also clear bit in ptes in shadow pgtable for nested guests */
1069		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1070		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
1071					       old & PTE_RPN_MASK,
1072					       1UL << shift);
1073		ref = true;
1074	}
1075	return ref;
1076}
1077
1078/* Called with kvm->mmu_lock held */
1079bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1080			unsigned long gfn)
1081
1082{
1083	pte_t *ptep;
1084	unsigned long gpa = gfn << PAGE_SHIFT;
1085	unsigned int shift;
1086	bool ref = false;
1087
1088	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1089		return ref;
1090
1091	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1092	if (ptep && pte_present(*ptep) && pte_young(*ptep))
1093		ref = true;
1094	return ref;
1095}
1096
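/*
 * Editor's note: dirty-page harvesting walks the partition-scoped table
 * without the mmu_lock for speed, then re-takes the lock and re-reads the
 * PTE before clearing _PAGE_DIRTY; with dirty logging enabled only
 * PAGE_SIZE mappings should exist, hence the VM_BUG_ON(shift) below.
 */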
1097/* Returns the number of PAGE_SIZE pages that are dirty */
1098static int kvm_radix_test_clear_dirty(struct kvm *kvm,
1099				struct kvm_memory_slot *memslot, int pagenum)
1100{
1101	unsigned long gfn = memslot->base_gfn + pagenum;
1102	unsigned long gpa = gfn << PAGE_SHIFT;
1103	pte_t *ptep, pte;
1104	unsigned int shift;
1105	int ret = 0;
1106	unsigned long old, *rmapp;
1107
1108	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1109		return ret;
1110
1111	/*
1112	 * For performance reasons we don't hold kvm->mmu_lock while walking the
1113	 * partition scoped table.
1114	 */
1115	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
1116	if (!ptep)
1117		return 0;
1118
1119	pte = READ_ONCE(*ptep);
1120	if (pte_present(pte) && pte_dirty(pte)) {
1121		spin_lock(&kvm->mmu_lock);
1122		/*
 1123		 * Re-check the pte now that we hold the mmu_lock
1124		 */
1125		if (pte_val(pte) != pte_val(*ptep)) {
1126			/*
1127			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
1128			 * only find PAGE_SIZE pte entries here. We can continue
1129			 * to use the pte addr returned by above page table
1130			 * walk.
1131			 */
1132			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
1133				spin_unlock(&kvm->mmu_lock);
1134				return 0;
1135			}
1136		}
1137
1138		ret = 1;
1139		VM_BUG_ON(shift);
1140		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1141					      gpa, shift);
1142		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1143		/* Also clear bit in ptes in shadow pgtable for nested guests */
1144		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1145		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1146					       old & PTE_RPN_MASK,
1147					       1UL << shift);
1148		spin_unlock(&kvm->mmu_lock);
1149	}
1150	return ret;
1151}
1152
1153long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
1154			struct kvm_memory_slot *memslot, unsigned long *map)
1155{
1156	unsigned long i, j;
1157	int npages;
1158
1159	for (i = 0; i < memslot->npages; i = j) {
1160		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
1161
1162		/*
1163		 * Note that if npages > 0 then i must be a multiple of npages,
1164		 * since huge pages are only used to back the guest at guest
1165		 * real addresses that are a multiple of their size.
1166		 * Since we have at most one PTE covering any given guest
1167		 * real address, if npages > 1 we can skip to i + npages.
1168		 */
1169		j = i + 1;
1170		if (npages) {
1171			set_dirty_bits(map, i, npages);
1172			j = i + npages;
1173		}
1174	}
1175	return 0;
1176}
1177
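/*
 * Editor's note: flushing a memslot walks it page by page under the
 * mmu_lock and bumps mmu_invalidate_seq so that a concurrent fault which
 * sampled the old memslot cannot install a stale PTE afterwards.
 */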
1178void kvmppc_radix_flush_memslot(struct kvm *kvm,
1179				const struct kvm_memory_slot *memslot)
1180{
1181	unsigned long n;
1182	pte_t *ptep;
1183	unsigned long gpa;
1184	unsigned int shift;
1185
1186	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
1187		kvmppc_uvmem_drop_pages(memslot, kvm, true);
1188
1189	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1190		return;
1191
1192	gpa = memslot->base_gfn << PAGE_SHIFT;
1193	spin_lock(&kvm->mmu_lock);
1194	for (n = memslot->npages; n; --n) {
1195		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1196		if (ptep && pte_present(*ptep))
1197			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1198					 kvm->arch.lpid);
1199		gpa += PAGE_SIZE;
1200	}
1201	/*
1202	 * Increase the mmu notifier sequence number to prevent any page
1203	 * fault that read the memslot earlier from writing a PTE.
1204	 */
1205	kvm->mmu_invalidate_seq++;
1206	spin_unlock(&kvm->mmu_lock);
1207}
1208
1209static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
1210				 int psize, int *indexp)
1211{
1212	if (!mmu_psize_defs[psize].shift)
1213		return;
1214	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
1215		(mmu_psize_defs[psize].ap << 29);
1216	++(*indexp);
1217}
1218
1219int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
1220{
1221	int i;
1222
1223	if (!radix_enabled())
1224		return -EINVAL;
1225	memset(info, 0, sizeof(*info));
1226
1227	/* 4k page size */
1228	info->geometries[0].page_shift = 12;
1229	info->geometries[0].level_bits[0] = 9;
1230	for (i = 1; i < 4; ++i)
1231		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
1232	/* 64k page size */
1233	info->geometries[1].page_shift = 16;
1234	for (i = 0; i < 4; ++i)
1235		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
1236
1237	i = 0;
1238	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
1239	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
1240	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
1241	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
1242
1243	return 0;
1244}
1245
1246int kvmppc_init_vm_radix(struct kvm *kvm)
1247{
1248	kvm->arch.pgtable = pgd_alloc(kvm->mm);
1249	if (!kvm->arch.pgtable)
1250		return -ENOMEM;
1251	return 0;
1252}
1253
1254static void pte_ctor(void *addr)
1255{
1256	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1257}
1258
1259static void pmd_ctor(void *addr)
1260{
1261	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1262}
1263
1264struct debugfs_radix_state {
1265	struct kvm	*kvm;
1266	struct mutex	mutex;
1267	unsigned long	gpa;
1268	int		lpid;
1269	int		chars_left;
1270	int		buf_index;
1271	char		buf[128];
1272	u8		hdr;
1273};
1274
1275static int debugfs_radix_open(struct inode *inode, struct file *file)
1276{
1277	struct kvm *kvm = inode->i_private;
1278	struct debugfs_radix_state *p;
1279
1280	p = kzalloc(sizeof(*p), GFP_KERNEL);
1281	if (!p)
1282		return -ENOMEM;
1283
1284	kvm_get_kvm(kvm);
1285	p->kvm = kvm;
1286	mutex_init(&p->mutex);
1287	file->private_data = p;
1288
1289	return nonseekable_open(inode, file);
1290}
1291
1292static int debugfs_radix_release(struct inode *inode, struct file *file)
1293{
1294	struct debugfs_radix_state *p = file->private_data;
1295
1296	kvm_put_kvm(p->kvm);
1297	kfree(p);
1298	return 0;
1299}
1300
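/*
 * Editor's note: the debugfs "radix" file dumps the L1 partition-scoped
 * table and then each nested guest's shadow table, one line per mapping
 * in the form " <gpa>: <pte> <shift>", buffering partial lines in
 * p->buf/p->chars_left across read() calls.
 */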
1301static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1302				 size_t len, loff_t *ppos)
1303{
1304	struct debugfs_radix_state *p = file->private_data;
1305	ssize_t ret, r;
1306	unsigned long n;
1307	struct kvm *kvm;
1308	unsigned long gpa;
1309	pgd_t *pgt;
1310	struct kvm_nested_guest *nested;
1311	pgd_t *pgdp;
1312	p4d_t p4d, *p4dp;
1313	pud_t pud, *pudp;
1314	pmd_t pmd, *pmdp;
1315	pte_t *ptep;
1316	int shift;
1317	unsigned long pte;
1318
1319	kvm = p->kvm;
1320	if (!kvm_is_radix(kvm))
1321		return 0;
1322
1323	ret = mutex_lock_interruptible(&p->mutex);
1324	if (ret)
1325		return ret;
1326
1327	if (p->chars_left) {
1328		n = p->chars_left;
1329		if (n > len)
1330			n = len;
1331		r = copy_to_user(buf, p->buf + p->buf_index, n);
1332		n -= r;
1333		p->chars_left -= n;
1334		p->buf_index += n;
1335		buf += n;
1336		len -= n;
1337		ret = n;
1338		if (r) {
1339			if (!n)
1340				ret = -EFAULT;
1341			goto out;
1342		}
1343	}
1344
1345	gpa = p->gpa;
1346	nested = NULL;
1347	pgt = NULL;
1348	while (len != 0 && p->lpid >= 0) {
1349		if (gpa >= RADIX_PGTABLE_RANGE) {
1350			gpa = 0;
1351			pgt = NULL;
1352			if (nested) {
1353				kvmhv_put_nested(nested);
1354				nested = NULL;
1355			}
1356			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1357			p->hdr = 0;
1358			if (p->lpid < 0)
1359				break;
1360		}
1361		if (!pgt) {
1362			if (p->lpid == 0) {
1363				pgt = kvm->arch.pgtable;
1364			} else {
1365				nested = kvmhv_get_nested(kvm, p->lpid, false);
1366				if (!nested) {
1367					gpa = RADIX_PGTABLE_RANGE;
1368					continue;
1369				}
1370				pgt = nested->shadow_pgtable;
1371			}
1372		}
1373		n = 0;
1374		if (!p->hdr) {
1375			if (p->lpid > 0)
1376				n = scnprintf(p->buf, sizeof(p->buf),
1377					      "\nNested LPID %d: ", p->lpid);
1378			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1379				      "pgdir: %lx\n", (unsigned long)pgt);
1380			p->hdr = 1;
1381			goto copy;
1382		}
1383
1384		pgdp = pgt + pgd_index(gpa);
1385		p4dp = p4d_offset(pgdp, gpa);
1386		p4d = READ_ONCE(*p4dp);
1387		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
1388			gpa = (gpa & P4D_MASK) + P4D_SIZE;
1389			continue;
1390		}
1391
1392		pudp = pud_offset(&p4d, gpa);
1393		pud = READ_ONCE(*pudp);
1394		if (!(pud_val(pud) & _PAGE_PRESENT)) {
1395			gpa = (gpa & PUD_MASK) + PUD_SIZE;
1396			continue;
1397		}
1398		if (pud_val(pud) & _PAGE_PTE) {
1399			pte = pud_val(pud);
1400			shift = PUD_SHIFT;
1401			goto leaf;
1402		}
1403
1404		pmdp = pmd_offset(&pud, gpa);
1405		pmd = READ_ONCE(*pmdp);
1406		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1407			gpa = (gpa & PMD_MASK) + PMD_SIZE;
1408			continue;
1409		}
1410		if (pmd_val(pmd) & _PAGE_PTE) {
1411			pte = pmd_val(pmd);
1412			shift = PMD_SHIFT;
1413			goto leaf;
1414		}
1415
1416		ptep = pte_offset_kernel(&pmd, gpa);
1417		pte = pte_val(READ_ONCE(*ptep));
1418		if (!(pte & _PAGE_PRESENT)) {
1419			gpa += PAGE_SIZE;
1420			continue;
1421		}
1422		shift = PAGE_SHIFT;
1423	leaf:
1424		n = scnprintf(p->buf, sizeof(p->buf),
1425			      " %lx: %lx %d\n", gpa, pte, shift);
1426		gpa += 1ul << shift;
1427	copy:
1428		p->chars_left = n;
1429		if (n > len)
1430			n = len;
1431		r = copy_to_user(buf, p->buf, n);
1432		n -= r;
1433		p->chars_left -= n;
1434		p->buf_index = n;
1435		buf += n;
1436		len -= n;
1437		ret += n;
1438		if (r) {
1439			if (!ret)
1440				ret = -EFAULT;
1441			break;
1442		}
1443	}
1444	p->gpa = gpa;
1445	if (nested)
1446		kvmhv_put_nested(nested);
1447
1448 out:
1449	mutex_unlock(&p->mutex);
1450	return ret;
1451}
1452
1453static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1454			   size_t len, loff_t *ppos)
1455{
1456	return -EACCES;
1457}
1458
1459static const struct file_operations debugfs_radix_fops = {
1460	.owner	 = THIS_MODULE,
1461	.open	 = debugfs_radix_open,
1462	.release = debugfs_radix_release,
1463	.read	 = debugfs_radix_read,
1464	.write	 = debugfs_radix_write,
1465	.llseek	 = generic_file_llseek,
1466};
1467
1468void kvmhv_radix_debugfs_init(struct kvm *kvm)
1469{
1470	debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
1471			    &debugfs_radix_fops);
1472}
1473
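/*
 * Editor's note (assumption): the two kmem caches are created with
 * align == size, presumably so each PTE/PMD fragment is naturally
 * aligned; the radix page-directory entries require the low bits of the
 * next-level base to be zero (see the RPDB_MASK check in the walker).
 */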
1474int kvmppc_radix_init(void)
1475{
1476	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1477
1478	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1479	if (!kvm_pte_cache)
1480		return -ENOMEM;
1481
1482	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1483
1484	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1485	if (!kvm_pmd_cache) {
1486		kmem_cache_destroy(kvm_pte_cache);
1487		return -ENOMEM;
1488	}
1489
1490	return 0;
1491}
1492
1493void kvmppc_radix_exit(void)
1494{
1495	kmem_cache_destroy(kvm_pte_cache);
1496	kmem_cache_destroy(kvm_pmd_cache);
1497}
v5.4
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *
   4 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   5 */
   6
   7#include <linux/types.h>
   8#include <linux/string.h>
   9#include <linux/kvm.h>
  10#include <linux/kvm_host.h>
  11#include <linux/anon_inodes.h>
  12#include <linux/file.h>
  13#include <linux/debugfs.h>
  14
  15#include <asm/kvm_ppc.h>
  16#include <asm/kvm_book3s.h>
  17#include <asm/page.h>
  18#include <asm/mmu.h>
  19#include <asm/pgtable.h>
  20#include <asm/pgalloc.h>
  21#include <asm/pte-walk.h>
  22
  23/*
  24 * Supported radix tree geometry.
  25 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
  26 * for a page size of 64k or 4k.
  27 */
  28static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
  29
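/*
 * Editor's note: this v5.4 variant of the copy helper uses the same
 * quadrant technique described above, but calls raw_copy_from_user()/
 * raw_copy_to_user() rather than the _inatomic helpers and has no
 * nestedv2 or eaddr range checks.
 */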
  30unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
  31					      gva_t eaddr, void *to, void *from,
  32					      unsigned long n)
  33{
  34	int uninitialized_var(old_pid), old_lpid;
  35	unsigned long quadrant, ret = n;
  36	bool is_load = !!to;
  37
  38	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
  39	if (kvmhv_on_pseries())
  40		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
  41					  __pa(to), __pa(from), n);
  42
  43	quadrant = 1;
  44	if (!pid)
  45		quadrant = 2;
  46	if (is_load)
  47		from = (void *) (eaddr | (quadrant << 62));
  48	else
  49		to = (void *) (eaddr | (quadrant << 62));
  50
  51	preempt_disable();
  52
  53	/* switch the lpid first to avoid running host with unallocated pid */
  54	old_lpid = mfspr(SPRN_LPID);
  55	if (old_lpid != lpid)
  56		mtspr(SPRN_LPID, lpid);
  57	if (quadrant == 1) {
  58		old_pid = mfspr(SPRN_PID);
  59		if (old_pid != pid)
  60			mtspr(SPRN_PID, pid);
  61	}
  62	isync();
  63
  64	pagefault_disable();
  65	if (is_load)
  66		ret = raw_copy_from_user(to, from, n);
  67	else
  68		ret = raw_copy_to_user(to, from, n);
  69	pagefault_enable();
  70
  71	/* switch the pid first to avoid running host with unallocated pid */
  72	if (quadrant == 1 && pid != old_pid)
  73		mtspr(SPRN_PID, old_pid);
  74	if (lpid != old_lpid)
  75		mtspr(SPRN_LPID, old_lpid);
  76	isync();
  77
  78	preempt_enable();
  79
  80	return ret;
  81}
  82EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
  83
  84static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
  85					  void *to, void *from, unsigned long n)
  86{
  87	int lpid = vcpu->kvm->arch.lpid;
  88	int pid = vcpu->arch.pid;
  89
  90	/* This would cause a data segment intr so don't allow the access */
  91	if (eaddr & (0x3FFUL << 52))
  92		return -EINVAL;
  93
  94	/* Should we be using the nested lpid */
  95	if (vcpu->arch.nested)
  96		lpid = vcpu->arch.nested->shadow_lpid;
  97
  98	/* If accessing quadrant 3 then pid is expected to be 0 */
  99	if (((eaddr >> 62) & 0x3) == 0x3)
 100		pid = 0;
 101
 102	eaddr &= ~(0xFFFUL << 52);
 103
 104	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
 105}
 106
 107long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
 108				 unsigned long n)
 109{
 110	long ret;
 111
 112	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
 113	if (ret > 0)
 114		memset(to + (n - ret), 0, ret);
 115
 116	return ret;
 117}
 118EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);
 119
 120long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
 121			       unsigned long n)
 122{
 123	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
 124}
 125EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
 126
 127int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
 128			       struct kvmppc_pte *gpte, u64 root,
 129			       u64 *pte_ret_p)
 130{
 131	struct kvm *kvm = vcpu->kvm;
 132	int ret, level, ps;
 133	unsigned long rts, bits, offset, index;
 134	u64 pte, base, gpa;
 135	__be64 rpte;
 136
 137	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
 138		((root & RTS2_MASK) >> RTS2_SHIFT);
 139	bits = root & RPDS_MASK;
 140	base = root & RPDB_MASK;
 141
 142	offset = rts + 31;
 143
 144	/* Current implementations only support 52-bit space */
 145	if (offset != 52)
 146		return -EINVAL;
 147
 148	/* Walk each level of the radix tree */
 149	for (level = 3; level >= 0; --level) {
 150		u64 addr;
 151		/* Check a valid size */
 152		if (level && bits != p9_supported_radix_bits[level])
 153			return -EINVAL;
 154		if (level == 0 && !(bits == 5 || bits == 9))
 155			return -EINVAL;
 156		offset -= bits;
 157		index = (eaddr >> offset) & ((1UL << bits) - 1);
 158		/* Check that low bits of page table base are zero */
 159		if (base & ((1UL << (bits + 3)) - 1))
 160			return -EINVAL;
 161		/* Read the entry from guest memory */
 162		addr = base + (index * sizeof(rpte));
 163		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
 164		if (ret) {
 165			if (pte_ret_p)
 166				*pte_ret_p = addr;
 167			return ret;
 168		}
 169		pte = __be64_to_cpu(rpte);
 170		if (!(pte & _PAGE_PRESENT))
 171			return -ENOENT;
 172		/* Check if a leaf entry */
 173		if (pte & _PAGE_PTE)
 174			break;
 175		/* Get ready to walk the next level */
 176		base = pte & RPDB_MASK;
 177		bits = pte & RPDS_MASK;
 178	}
 179
 180	/* Need a leaf at lowest level; 512GB pages not supported */
 181	if (level < 0 || level == 3)
 182		return -EINVAL;
 183
 184	/* We found a valid leaf PTE */
 185	/* Offset is now log base 2 of the page size */
 186	gpa = pte & 0x01fffffffffff000ul;
 187	if (gpa & ((1ul << offset) - 1))
 188		return -EINVAL;
 189	gpa |= eaddr & ((1ul << offset) - 1);
 190	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
 191		if (offset == mmu_psize_defs[ps].shift)
 192			break;
 193	gpte->page_size = ps;
 194	gpte->page_shift = offset;
 195
 196	gpte->eaddr = eaddr;
 197	gpte->raddr = gpa;
 198
 199	/* Work out permissions */
 200	gpte->may_read = !!(pte & _PAGE_READ);
 201	gpte->may_write = !!(pte & _PAGE_WRITE);
 202	gpte->may_execute = !!(pte & _PAGE_EXEC);
 203
 204	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
 205
 206	if (pte_ret_p)
 207		*pte_ret_p = pte;
 208
 209	return 0;
 210}
 211
 212/*
 213 * Used to walk a partition or process table radix tree in guest memory
 214 * Note: We exploit the fact that a partition table and a process
 215 * table have the same layout, a partition-scoped page table and a
 216 * process-scoped page table have the same layout, and the 2nd
 217 * doubleword of a partition table entry has the same layout as
 218 * the PTCR register.
 219 */
 220int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
 221				     struct kvmppc_pte *gpte, u64 table,
 222				     int table_index, u64 *pte_ret_p)
 223{
 224	struct kvm *kvm = vcpu->kvm;
 225	int ret;
 226	unsigned long size, ptbl, root;
 227	struct prtb_entry entry;
 228
 229	if ((table & PRTS_MASK) > 24)
 230		return -EINVAL;
 231	size = 1ul << ((table & PRTS_MASK) + 12);
 232
 233	/* Is the table big enough to contain this entry? */
 234	if ((table_index * sizeof(entry)) >= size)
 235		return -EINVAL;
 236
 237	/* Read the table to find the root of the radix tree */
 238	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
 239	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
 240	if (ret)
 241		return ret;
 242
 243	/* Root is stored in the first double word */
 244	root = be64_to_cpu(entry.prtb0);
 245
 246	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
 247}
 248
 249int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 250			   struct kvmppc_pte *gpte, bool data, bool iswrite)
 251{
 252	u32 pid;
 253	u64 pte;
 254	int ret;
 255
 256	/* Work out effective PID */
 257	switch (eaddr >> 62) {
 258	case 0:
 259		pid = vcpu->arch.pid;
 260		break;
 261	case 3:
 262		pid = 0;
 263		break;
 264	default:
 265		return -EINVAL;
 266	}
 267
 268	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
 269				vcpu->kvm->arch.process_table, pid, &pte);
 270	if (ret)
 271		return ret;
 272
 273	/* Check privilege (applies only to process scoped translations) */
 274	if (kvmppc_get_msr(vcpu) & MSR_PR) {
 275		if (pte & _PAGE_PRIVILEGED) {
 276			gpte->may_read = 0;
 277			gpte->may_write = 0;
 278			gpte->may_execute = 0;
 279		}
 280	} else {
 281		if (!(pte & _PAGE_PRIVILEGED)) {
 282			/* Check AMR/IAMR to see if strict mode is in force */
 283			if (vcpu->arch.amr & (1ul << 62))
 284				gpte->may_read = 0;
 285			if (vcpu->arch.amr & (1ul << 63))
 286				gpte->may_write = 0;
 287			if (vcpu->arch.iamr & (1ul << 62))
 288				gpte->may_execute = 0;
 289		}
 290	}
 291
 292	return 0;
 293}
 294
 295void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
 296			     unsigned int pshift, unsigned int lpid)
 297{
 298	unsigned long psize = PAGE_SIZE;
 299	int psi;
 300	long rc;
 301	unsigned long rb;
 302
 303	if (pshift)
 304		psize = 1UL << pshift;
 305	else
 306		pshift = PAGE_SHIFT;
 307
 308	addr &= ~(psize - 1);
 309
 310	if (!kvmhv_on_pseries()) {
 311		radix__flush_tlb_lpid_page(lpid, addr, psize);
 312		return;
 313	}
 314
 315	psi = shift_to_mmu_psize(pshift);
 316	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
 317	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
 318				lpid, rb);
 319	if (rc)
 320		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
 321}
 322
 323static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
 324{
 325	long rc;
 326
 327	if (!kvmhv_on_pseries()) {
 328		radix__flush_pwc_lpid(lpid);
 329		return;
 330	}
 331
 332	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
 333				lpid, TLBIEL_INVAL_SET_LPID);
 334	if (rc)
 335		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
 336}
 337
 338static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
 339				      unsigned long clr, unsigned long set,
 340				      unsigned long addr, unsigned int shift)
 341{
 342	return __radix_pte_update(ptep, clr, set);
 343}
 344
 345void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
 346			     pte_t *ptep, pte_t pte)
 347{
 348	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
 349}
 350
 351static struct kmem_cache *kvm_pte_cache;
 352static struct kmem_cache *kvm_pmd_cache;
 353
 354static pte_t *kvmppc_pte_alloc(void)
 355{
 356	return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
 357}
 358
 359static void kvmppc_pte_free(pte_t *ptep)
 360{
 361	kmem_cache_free(kvm_pte_cache, ptep);
 362}
 363
 364static pmd_t *kvmppc_pmd_alloc(void)
 365{
 366	return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
 367}
 368
 369static void kvmppc_pmd_free(pmd_t *pmdp)
 370{
 371	kmem_cache_free(kvm_pmd_cache, pmdp);
 372}
 373
 374/* Called with kvm->mmu_lock held */
 375void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
 376		      unsigned int shift,
 377		      const struct kvm_memory_slot *memslot,
 378		      unsigned int lpid)
 379
 380{
 381	unsigned long old;
 382	unsigned long gfn = gpa >> PAGE_SHIFT;
 383	unsigned long page_size = PAGE_SIZE;
 384	unsigned long hpa;
 385
 386	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
 387	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
 388
 389	/* The following only applies to L1 entries */
 390	if (lpid != kvm->arch.lpid)
 391		return;
 392
 393	if (!memslot) {
 394		memslot = gfn_to_memslot(kvm, gfn);
 395		if (!memslot)
 396			return;
 397	}
 398	if (shift) { /* 1GB or 2MB page */
 399		page_size = 1ul << shift;
 400		if (shift == PMD_SHIFT)
 401			kvm->stat.num_2M_pages--;
 402		else if (shift == PUD_SHIFT)
 403			kvm->stat.num_1G_pages--;
 404	}
 405
 406	gpa &= ~(page_size - 1);
 407	hpa = old & PTE_RPN_MASK;
 408	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
 409
 410	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
 411		kvmppc_update_dirty_map(memslot, gfn, page_size);
 412}
 413
 414/*
 415 * kvmppc_free_p?d are used to free existing page tables, and recursively
 416 * descend and clear and free children.
 417 * Callers are responsible for flushing the PWC.
 418 *
 419 * When page tables are being unmapped/freed as part of page fault path
 420 * (full == false), ptes are not expected. There is code to unmap them
 421 * and emit a warning if encountered, but there may already be data
 422 * corruption due to the unexpected mappings.
 423 */
 424static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
 425				  unsigned int lpid)
 426{
 427	if (full) {
 428		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
 429	} else {
 430		pte_t *p = pte;
 431		unsigned long it;
 432
 433		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
 434			if (pte_val(*p) == 0)
 435				continue;
 436			WARN_ON_ONCE(1);
 437			kvmppc_unmap_pte(kvm, p,
 438					 pte_pfn(*p) << PAGE_SHIFT,
 439					 PAGE_SHIFT, NULL, lpid);
 440		}
 441	}
 442
 443	kvmppc_pte_free(pte);
 444}
 445
 446static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
 447				  unsigned int lpid)
 448{
 449	unsigned long im;
 450	pmd_t *p = pmd;
 451
 452	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
 453		if (!pmd_present(*p))
 454			continue;
 455		if (pmd_is_leaf(*p)) {
 456			if (full) {
 457				pmd_clear(p);
 458			} else {
 459				WARN_ON_ONCE(1);
 460				kvmppc_unmap_pte(kvm, (pte_t *)p,
 461					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
 462					 PMD_SHIFT, NULL, lpid);
 463			}
 464		} else {
 465			pte_t *pte;
 466
 467			pte = pte_offset_map(p, 0);
 468			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
 469			pmd_clear(p);
 470		}
 471	}
 472	kvmppc_pmd_free(pmd);
 473}
 474
 475static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
 476				  unsigned int lpid)
 477{
 478	unsigned long iu;
 479	pud_t *p = pud;
 480
 481	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
 482		if (!pud_present(*p))
 483			continue;
 484		if (pud_is_leaf(*p)) {
 485			pud_clear(p);
 486		} else {
 487			pmd_t *pmd;
 488
 489			pmd = pmd_offset(p, 0);
 490			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
 491			pud_clear(p);
 492		}
 493	}
 494	pud_free(kvm->mm, pud);
 495}
 496
 497void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
 498{
 499	unsigned long ig;
 500
 501	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
 502		pud_t *pud;
 503
 504		if (!pgd_present(*pgd))
 505			continue;
 506		pud = pud_offset(pgd, 0);
 507		kvmppc_unmap_free_pud(kvm, pud, lpid);
 508		pgd_clear(pgd);
 509	}
 510}
 511
 512void kvmppc_free_radix(struct kvm *kvm)
 513{
 514	if (kvm->arch.pgtable) {
 515		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
 516					  kvm->arch.lpid);
 517		pgd_free(kvm->mm, kvm->arch.pgtable);
 518		kvm->arch.pgtable = NULL;
 519	}
 520}
 521
 522static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
 523					unsigned long gpa, unsigned int lpid)
 524{
 525	pte_t *pte = pte_offset_kernel(pmd, 0);
 526
 527	/*
 528	 * Clearing the pmd entry and then flushing the PWC ensures that the
 529	 * pte page will no longer be cached by the MMU, so it can be freed
 530	 * without flushing the PWC again.
 531	 */
 532	pmd_clear(pmd);
 533	kvmppc_radix_flush_pwc(kvm, lpid);
 534
 535	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
 536}
 537
 538static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 539					unsigned long gpa, unsigned int lpid)
 540{
 541	pmd_t *pmd = pmd_offset(pud, 0);
 542
 543	/*
 544	 * Clearing the pud entry and then flushing the PWC ensures that the
 545	 * pmd page and any child pte pages will no longer be cached by the
 546	 * MMU, so they can be freed without flushing the PWC again.
 547	 */
 548	pud_clear(pud);
 549	kvmppc_radix_flush_pwc(kvm, lpid);
 550
 551	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
 552}
 553
 554/*
 555 * A number of bits may differ between different faults to the same
 556 * partition-scope entry.  The R and C bits change in the course of
 557 * cleaning and aging, and the write bit can change either because the
 558 * access was upgraded or because a read fault raced with a write fault
 559 * that set those bits first.
 560 */
 561#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
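/*
 * In other words, a racing fault that differs only in _PAGE_WRITE,
 * _PAGE_DIRTY or _PAGE_ACCESSED is tolerated and simply ORs its bits into
 * the existing entry; any other mismatch trips the WARN_ON_ONCE() checks
 * in kvmppc_create_pte() below.
 */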
 562
 563int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 564		      unsigned long gpa, unsigned int level,
 565		      unsigned long mmu_seq, unsigned int lpid,
 566		      unsigned long *rmapp, struct rmap_nested **n_rmap)
 567{
 568	pgd_t *pgd;
 569	pud_t *pud, *new_pud = NULL;
 570	pmd_t *pmd, *new_pmd = NULL;
 571	pte_t *ptep, *new_ptep = NULL;
 572	int ret;
 573
 574	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
 575	pgd = pgtable + pgd_index(gpa);
 576	pud = NULL;
 577	if (pgd_present(*pgd))
 578		pud = pud_offset(pgd, gpa);
 579	else
 580		new_pud = pud_alloc_one(kvm->mm, gpa);
 581
 582	pmd = NULL;
 583	if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
 584		pmd = pmd_offset(pud, gpa);
 585	else if (level <= 1)
 586		new_pmd = kvmppc_pmd_alloc();
 587
 588	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
 589		new_ptep = kvmppc_pte_alloc();
 590
 591	/* Check if we might have been invalidated; let the guest retry if so */
 592	spin_lock(&kvm->mmu_lock);
 593	ret = -EAGAIN;
 594	if (mmu_notifier_retry(kvm, mmu_seq))
 595		goto out_unlock;
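	/*
	 * Past this point we hold kvm->mmu_lock and know that no MMU
	 * notifier invalidation has run since mmu_seq was sampled, so the
	 * translation computed by the caller is still safe to install.
	 */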
 596
 597	/* Now traverse again under the lock and change the tree */
 598	ret = -ENOMEM;
 599	if (pgd_none(*pgd)) {
 600		if (!new_pud)
 601			goto out_unlock;
 602		pgd_populate(kvm->mm, pgd, new_pud);
 603		new_pud = NULL;
 604	}
 605	pud = pud_offset(pgd, gpa);
 606	if (pud_is_leaf(*pud)) {
 607		unsigned long hgpa = gpa & PUD_MASK;
 608
 609		/* Check if we raced and someone else has set the same thing */
 610		if (level == 2) {
 611			if (pud_raw(*pud) == pte_raw(pte)) {
 612				ret = 0;
 613				goto out_unlock;
 614			}
 615			/* Valid 1GB page here already, add our extra bits */
 616			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
 617							PTE_BITS_MUST_MATCH);
 618			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
 619					      0, pte_val(pte), hgpa, PUD_SHIFT);
 620			ret = 0;
 621			goto out_unlock;
 622		}
 623		/*
 624		 * If we raced with another CPU which has just put
 625		 * a 1GB pte in after we saw a pmd page, try again.
 626		 */
 627		if (!new_pmd) {
 628			ret = -EAGAIN;
 629			goto out_unlock;
 630		}
 631		/* Valid 1GB page here already, remove it */
 632		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
 633				 lpid);
 634	}
 635	if (level == 2) {
 636		if (!pud_none(*pud)) {
 637			/*
 638			 * There's a page table page here, but we wanted to
 639			 * install a large page, so remove and free the page
 640			 * table page.
 641			 */
 642			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
 643		}
 644		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
 645		if (rmapp && n_rmap)
 646			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 647		ret = 0;
 648		goto out_unlock;
 649	}
 650	if (pud_none(*pud)) {
 651		if (!new_pmd)
 652			goto out_unlock;
 653		pud_populate(kvm->mm, pud, new_pmd);
 654		new_pmd = NULL;
 655	}
 656	pmd = pmd_offset(pud, gpa);
 657	if (pmd_is_leaf(*pmd)) {
 658		unsigned long lgpa = gpa & PMD_MASK;
 659
 660		/* Check if we raced and someone else has set the same thing */
 661		if (level == 1) {
 662			if (pmd_raw(*pmd) == pte_raw(pte)) {
 663				ret = 0;
 664				goto out_unlock;
 665			}
 666			/* Valid 2MB page here already, add our extra bits */
 667			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
 668							PTE_BITS_MUST_MATCH);
 669			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
 670					0, pte_val(pte), lgpa, PMD_SHIFT);
 671			ret = 0;
 672			goto out_unlock;
 673		}
 674
 675		/*
 676		 * If we raced with another CPU which has just put
 677		 * a 2MB pte in after we saw a pte page, try again.
 678		 */
 679		if (!new_ptep) {
 680			ret = -EAGAIN;
 681			goto out_unlock;
 682		}
 683		/* Valid 2MB page here already, remove it */
 684		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
 685				 lpid);
 686	}
 687	if (level == 1) {
 688		if (!pmd_none(*pmd)) {
 689			/*
 690			 * There's a page table page here, but we wanted to
 691			 * install a large page, so remove and free the page
 692			 * table page.
 693			 */
 694			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
 695		}
 696		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
 697		if (rmapp && n_rmap)
 698			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 699		ret = 0;
 700		goto out_unlock;
 701	}
 702	if (pmd_none(*pmd)) {
 703		if (!new_ptep)
 704			goto out_unlock;
 705		pmd_populate(kvm->mm, pmd, new_ptep);
 706		new_ptep = NULL;
 707	}
 708	ptep = pte_offset_kernel(pmd, gpa);
 709	if (pte_present(*ptep)) {
 710		/* Check if someone else set the same thing */
 711		if (pte_raw(*ptep) == pte_raw(pte)) {
 712			ret = 0;
 713			goto out_unlock;
 714		}
 715		/* Valid page here already, add our extra bits */
 716		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
 717							PTE_BITS_MUST_MATCH);
 718		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
 719		ret = 0;
 720		goto out_unlock;
 721	}
 722	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
 723	if (rmapp && n_rmap)
 724		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 725	ret = 0;
 726
 727 out_unlock:
 728	spin_unlock(&kvm->mmu_lock);
 729	if (new_pud)
 730		pud_free(kvm->mm, new_pud);
 731	if (new_pmd)
 732		kvmppc_pmd_free(new_pmd);
 733	if (new_ptep)
 734		kvmppc_pte_free(new_ptep);
 735	return ret;
 736}
 737
 738bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
 739			     unsigned long gpa, unsigned int lpid)
 740{
 741	unsigned long pgflags;
 742	unsigned int shift;
 743	pte_t *ptep;
 744
 745	/*
 746	 * Need to set an R or C bit in the 2nd-level tables;
 747	 * since we are just helping out the hardware here,
 748	 * it is sufficient to do what the hardware does.
 749	 */
 750	pgflags = _PAGE_ACCESSED;
 751	if (writing)
 752		pgflags |= _PAGE_DIRTY;
 753	/*
 754	 * We are walking the secondary (partition-scoped) page table here.
 755	 * We can do this without disabling irq because the Linux MM
 756	 * subsystem doesn't do THP splits and collapses on this tree.
 757	 */
 758	ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
 759	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
 760		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
 761		return true;
 762	}
 763	return false;
 764}
 765
 766int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 767				   unsigned long gpa,
 768				   struct kvm_memory_slot *memslot,
 769				   bool writing, bool kvm_ro,
 770				   pte_t *inserted_pte, unsigned int *levelp)
 771{
 772	struct kvm *kvm = vcpu->kvm;
 773	struct page *page = NULL;
 774	unsigned long mmu_seq;
 775	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
 776	bool upgrade_write = false;
 777	bool *upgrade_p = &upgrade_write;
 778	pte_t pte, *ptep;
 779	unsigned int shift, level;
 780	int ret;
 781	bool large_enable;
 782
 783	/* used to check for invalidations in progress */
 784	mmu_seq = kvm->mmu_notifier_seq;
 785	smp_rmb();
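	/*
	 * The barrier ensures mmu_notifier_seq is read before the page is
	 * looked up, so any invalidation that runs afterwards is caught by
	 * the mmu_notifier_retry() check taken under the mmu_lock in
	 * kvmppc_create_pte().
	 */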
 786
 787	/*
 788	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't do
 789	 * one when called with !atomic && !async, which is how we call it.
 790	 * We always ask for write permission since the common case
 791	 * is that the page is writable.
 792	 */
 793	hva = gfn_to_hva_memslot(memslot, gfn);
 794	if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
 795		upgrade_write = true;
 796	} else {
 797		unsigned long pfn;
 798
 799		/* Call KVM generic code to do the slow-path check */
 800		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
 801					   writing, upgrade_p);
 802		if (is_error_noslot_pfn(pfn))
 803			return -EFAULT;
 804		page = NULL;
 805		if (pfn_valid(pfn)) {
 806			page = pfn_to_page(pfn);
 807			if (PageReserved(page))
 808				page = NULL;
 809		}
 810	}
 811
 812	/*
 813	 * Read the PTE from the process' radix tree and use that
 814	 * so we get the shift and attribute bits.
 815	 */
 816	local_irq_disable();
 817	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
 818	/*
 819	 * If the PTE disappeared temporarily due to a THP
 820	 * collapse, just return and let the guest try again.
 821	 */
 822	if (!ptep) {
 823		local_irq_enable();
 824		if (page)
 825			put_page(page);
 826		return RESUME_GUEST;
 827	}
 828	pte = *ptep;
 829	local_irq_enable();
 830
 831	/* If we're logging dirty pages, always map single pages */
 832	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
 833
 834	/* Get pte level from shift/size */
 835	if (large_enable && shift == PUD_SHIFT &&
 836	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
 837	    (hva & (PUD_SIZE - PAGE_SIZE))) {
 838		level = 2;
 839	} else if (large_enable && shift == PMD_SHIFT &&
 840		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
 841		   (hva & (PMD_SIZE - PAGE_SIZE))) {
 842		level = 1;
 843	} else {
 844		level = 0;
 845		if (shift > PAGE_SHIFT) {
 846			/*
 847			 * If the pte maps more than one page, bring over
 848			 * bits from the virtual address to get the real
 849			 * address of the specific single page we want.
 850			 */
 851			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
 852			pte = __pte(pte_val(pte) | (hva & rpnmask));
 853		}
 854	}
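	/*
	 * level now encodes the partition-scope mapping size: 0 for a base
	 * page, 1 for a 2MB PMD-level leaf, 2 for a 1GB PUD-level leaf.
	 * The larger sizes are only used when gpa and hva are congruent
	 * modulo the large page size, so the host large page really does
	 * back the whole guest large page.
	 */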
 855
 856	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
 857	if (writing || upgrade_write) {
 858		if (pte_val(pte) & _PAGE_WRITE)
 859			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
 860	} else {
 861		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
 862	}
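	/*
	 * _PAGE_DIRTY is only added when the host pte is writable and the
	 * access is (or has been upgraded to) a write; otherwise WRITE and
	 * DIRTY are stripped so a guest store takes another fault and goes
	 * through the upgrade path.
	 */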
 863
 864	/* Allocate space in the tree and write the PTE */
 865	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
 866				mmu_seq, kvm->arch.lpid, NULL, NULL);
 867	if (inserted_pte)
 868		*inserted_pte = pte;
 869	if (levelp)
 870		*levelp = level;
 871
 872	if (page) {
 873		if (!ret && (pte_val(pte) & _PAGE_WRITE))
 874			set_page_dirty_lock(page);
 875		put_page(page);
 876	}
 877
 878	/* Increment number of large pages if we (successfully) inserted one */
 879	if (!ret) {
 880		if (level == 1)
 881			kvm->stat.num_2M_pages++;
 882		else if (level == 2)
 883			kvm->stat.num_1G_pages++;
 884	}
 885
 886	return ret;
 887}
 888
 889int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 890				   unsigned long ea, unsigned long dsisr)
 891{
 892	struct kvm *kvm = vcpu->kvm;
 893	unsigned long gpa, gfn;
 894	struct kvm_memory_slot *memslot;
 895	long ret;
 896	bool writing = !!(dsisr & DSISR_ISSTORE);
 897	bool kvm_ro = false;
 898
 899	/* Check for unusual errors */
 900	if (dsisr & DSISR_UNSUPP_MMU) {
 901		pr_err("KVM: Got unsupported MMU fault\n");
 902		return -EFAULT;
 903	}
 904	if (dsisr & DSISR_BADACCESS) {
 905		/* Reflect to the guest as DSI */
 906		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
 907		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
 908		return RESUME_GUEST;
 909	}
 910
 911	/* Translate the logical address */
 912	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
 913	gpa &= ~0xF000000000000000ul;
 914	gfn = gpa >> PAGE_SHIFT;
 915	if (!(dsisr & DSISR_PRTABLE_FAULT))
 916		gpa |= ea & 0xfff;
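	/*
	 * fault_gpa supplies the guest-real page address; the byte offset
	 * within the page is recovered from the low 12 bits of the
	 * effective address, except for process-table faults, where the
	 * low bits of the EA are not applied.
	 */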
 917
 918	/* Get the corresponding memslot */
 919	memslot = gfn_to_memslot(kvm, gfn);
 920
 921	/* No memslot means it's an emulated MMIO region */
 922	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
 923		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
 924			     DSISR_SET_RC)) {
 925			/*
 926			 * Bad address in guest page table tree, or other
 927			 * unusual error - reflect it to the guest as DSI.
 928			 */
 929			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
 930			return RESUME_GUEST;
 931		}
 932		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
 933	}
 934
 935	if (memslot->flags & KVM_MEM_READONLY) {
 936		if (writing) {
 937			/* give the guest a DSI */
 938			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
 939						       DSISR_PROTFAULT);
 940			return RESUME_GUEST;
 941		}
 942		kvm_ro = true;
 943	}
 944
 945	/* Failed to set the reference/change bits */
 946	if (dsisr & DSISR_SET_RC) {
 947		spin_lock(&kvm->mmu_lock);
 948		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
 949					    writing, gpa, kvm->arch.lpid))
 950			dsisr &= ~DSISR_SET_RC;
 951		spin_unlock(&kvm->mmu_lock);
 952
 953		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
 954			       DSISR_PROTFAULT | DSISR_SET_RC)))
 955			return RESUME_GUEST;
 956	}
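	/*
	 * If the fault was only a failed R/C update and we fixed it up
	 * above, we have already returned to the guest; getting here means
	 * a translation is missing or needs upgrading, so fall through and
	 * instantiate one.
	 */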
 957
 958	/* Try to insert a pte */
 959	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
 960					     kvm_ro, NULL, NULL);
 961
 962	if (ret == 0 || ret == -EAGAIN)
 963		ret = RESUME_GUEST;
 964	return ret;
 965}
 966
 967/* Called with kvm->mmu_lock held */
 968int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 969		    unsigned long gfn)
 970{
 971	pte_t *ptep;
 972	unsigned long gpa = gfn << PAGE_SHIFT;
 973	unsigned int shift;
 974
 975	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
 976	if (ptep && pte_present(*ptep))
 977		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
 978				 kvm->arch.lpid);
 979	return 0;
 980}
 981
 982/* Called with kvm->mmu_lock held */
 983int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 984		  unsigned long gfn)
 985{
 986	pte_t *ptep;
 987	unsigned long gpa = gfn << PAGE_SHIFT;
 988	unsigned int shift;
 989	int ref = 0;
 990	unsigned long old, *rmapp;
 991
 992	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
 993	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
 994		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
 995					      gpa, shift);
 996		/* XXX need to flush tlb here? */
 997		/* Also clear bit in ptes in shadow pgtable for nested guests */
 998		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 999		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
1000					       old & PTE_RPN_MASK,
1001					       1UL << shift);
1002		ref = 1;
1003	}
1004	return ref;
1005}
1006
1007/* Called with kvm->mmu_lock held */
1008int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1009		       unsigned long gfn)
1010{
1011	pte_t *ptep;
1012	unsigned long gpa = gfn << PAGE_SHIFT;
1013	unsigned int shift;
1014	int ref = 0;
1015
1016	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
1017	if (ptep && pte_present(*ptep) && pte_young(*ptep))
1018		ref = 1;
1019	return ref;
1020}
1021
1022/* Returns the number of PAGE_SIZE pages that are dirty */
1023static int kvm_radix_test_clear_dirty(struct kvm *kvm,
1024				struct kvm_memory_slot *memslot, int pagenum)
1025{
1026	unsigned long gfn = memslot->base_gfn + pagenum;
1027	unsigned long gpa = gfn << PAGE_SHIFT;
1028	pte_t *ptep;
1029	unsigned int shift;
1030	int ret = 0;
1031	unsigned long old, *rmapp;
1032
1033	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
1034	if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
1035		ret = 1;
1036		if (shift)
1037			ret = 1 << (shift - PAGE_SHIFT);
1038		spin_lock(&kvm->mmu_lock);
1039		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1040					      gpa, shift);
1041		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1042		/* Also clear bit in ptes in shadow pgtable for nested guests */
1043		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1044		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1045					       old & PTE_RPN_MASK,
1046					       1UL << shift);
1047		spin_unlock(&kvm->mmu_lock);
1048	}
1049	return ret;
1050}
1051
1052long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
1053			struct kvm_memory_slot *memslot, unsigned long *map)
1054{
1055	unsigned long i, j;
1056	int npages;
1057
1058	for (i = 0; i < memslot->npages; i = j) {
1059		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
1060
1061		/*
1062		 * Note that if npages > 0 then i must be a multiple of npages,
1063		 * since huge pages are only used to back the guest at guest
1064		 * real addresses that are a multiple of their size.
1065		 * Since we have at most one PTE covering any given guest
1066		 * real address, if npages > 1 we can skip to i + npages.
1067		 */
1068		j = i + 1;
1069		if (npages) {
1070			set_dirty_bits(map, i, npages);
1071			j = i + npages;
1072		}
1073	}
1074	return 0;
1075}
1076
1077void kvmppc_radix_flush_memslot(struct kvm *kvm,
1078				const struct kvm_memory_slot *memslot)
1079{
1080	unsigned long n;
1081	pte_t *ptep;
1082	unsigned long gpa;
1083	unsigned int shift;
1084
1085	gpa = memslot->base_gfn << PAGE_SHIFT;
1086	spin_lock(&kvm->mmu_lock);
1087	for (n = memslot->npages; n; --n) {
1088		ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
1089		if (ptep && pte_present(*ptep))
1090			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1091					 kvm->arch.lpid);
1092		gpa += PAGE_SIZE;
1093	}
1094	spin_unlock(&kvm->mmu_lock);
1095}
1096
1097static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
1098				 int psize, int *indexp)
1099{
1100	if (!mmu_psize_defs[psize].shift)
1101		return;
1102	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
1103		(mmu_psize_defs[psize].ap << 29);
1104	++(*indexp);
1105}
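/*
 * Each ap_encodings[] entry packs the page shift into the low bits and the
 * "actual page size" (AP) field into bits 29 and up, both taken from
 * mmu_psize_defs[]; for 2MB pages, for instance, the entry is
 * (mmu_psize_defs[MMU_PAGE_2M].ap << 29) | 21.
 */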
1106
1107int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
1108{
1109	int i;
1110
1111	if (!radix_enabled())
1112		return -EINVAL;
1113	memset(info, 0, sizeof(*info));
1114
1115	/* 4k page size */
1116	info->geometries[0].page_shift = 12;
1117	info->geometries[0].level_bits[0] = 9;
1118	for (i = 1; i < 4; ++i)
1119		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
1120	/* 64k page size */
1121	info->geometries[1].page_shift = 16;
1122	for (i = 0; i < 4; ++i)
1123		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
1124
1125	i = 0;
1126	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
1127	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
1128	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
1129	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
1130
1131	return 0;
1132}
1133
1134int kvmppc_init_vm_radix(struct kvm *kvm)
1135{
1136	kvm->arch.pgtable = pgd_alloc(kvm->mm);
1137	if (!kvm->arch.pgtable)
1138		return -ENOMEM;
1139	return 0;
1140}
1141
1142static void pte_ctor(void *addr)
1143{
1144	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1145}
1146
1147static void pmd_ctor(void *addr)
1148{
1149	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1150}
1151
1152struct debugfs_radix_state {
1153	struct kvm	*kvm;
1154	struct mutex	mutex;
1155	unsigned long	gpa;
1156	int		lpid;
1157	int		chars_left;
1158	int		buf_index;
1159	char		buf[128];
1160	u8		hdr;
1161};
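/*
 * The reader below emits one formatted line per valid translation into
 * buf[]; chars_left and buf_index let a line that did not fit in the user
 * buffer be resumed on the next read(), while gpa and lpid record where the
 * walk of the (possibly nested) partition-scoped trees left off.
 */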
1162
1163static int debugfs_radix_open(struct inode *inode, struct file *file)
1164{
1165	struct kvm *kvm = inode->i_private;
1166	struct debugfs_radix_state *p;
1167
1168	p = kzalloc(sizeof(*p), GFP_KERNEL);
1169	if (!p)
1170		return -ENOMEM;
1171
1172	kvm_get_kvm(kvm);
1173	p->kvm = kvm;
1174	mutex_init(&p->mutex);
1175	file->private_data = p;
1176
1177	return nonseekable_open(inode, file);
1178}
1179
1180static int debugfs_radix_release(struct inode *inode, struct file *file)
1181{
1182	struct debugfs_radix_state *p = file->private_data;
1183
1184	kvm_put_kvm(p->kvm);
1185	kfree(p);
1186	return 0;
1187}
1188
1189static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1190				 size_t len, loff_t *ppos)
1191{
1192	struct debugfs_radix_state *p = file->private_data;
1193	ssize_t ret, r;
1194	unsigned long n;
1195	struct kvm *kvm;
1196	unsigned long gpa;
1197	pgd_t *pgt;
1198	struct kvm_nested_guest *nested;
1199	pgd_t pgd, *pgdp;
1200	pud_t pud, *pudp;
1201	pmd_t pmd, *pmdp;
1202	pte_t *ptep;
1203	int shift;
1204	unsigned long pte;
1205
1206	kvm = p->kvm;
1207	if (!kvm_is_radix(kvm))
1208		return 0;
1209
1210	ret = mutex_lock_interruptible(&p->mutex);
1211	if (ret)
1212		return ret;
1213
1214	if (p->chars_left) {
1215		n = p->chars_left;
1216		if (n > len)
1217			n = len;
1218		r = copy_to_user(buf, p->buf + p->buf_index, n);
1219		n -= r;
1220		p->chars_left -= n;
1221		p->buf_index += n;
1222		buf += n;
1223		len -= n;
1224		ret = n;
1225		if (r) {
1226			if (!n)
1227				ret = -EFAULT;
1228			goto out;
1229		}
1230	}
1231
1232	gpa = p->gpa;
1233	nested = NULL;
1234	pgt = NULL;
1235	while (len != 0 && p->lpid >= 0) {
1236		if (gpa >= RADIX_PGTABLE_RANGE) {
1237			gpa = 0;
1238			pgt = NULL;
1239			if (nested) {
1240				kvmhv_put_nested(nested);
1241				nested = NULL;
1242			}
1243			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1244			p->hdr = 0;
1245			if (p->lpid < 0)
1246				break;
1247		}
1248		if (!pgt) {
1249			if (p->lpid == 0) {
1250				pgt = kvm->arch.pgtable;
1251			} else {
1252				nested = kvmhv_get_nested(kvm, p->lpid, false);
1253				if (!nested) {
1254					gpa = RADIX_PGTABLE_RANGE;
1255					continue;
1256				}
1257				pgt = nested->shadow_pgtable;
1258			}
1259		}
1260		n = 0;
1261		if (!p->hdr) {
1262			if (p->lpid > 0)
1263				n = scnprintf(p->buf, sizeof(p->buf),
1264					      "\nNested LPID %d: ", p->lpid);
1265			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1266				      "pgdir: %lx\n", (unsigned long)pgt);
1267			p->hdr = 1;
1268			goto copy;
1269		}
1270
1271		pgdp = pgt + pgd_index(gpa);
1272		pgd = READ_ONCE(*pgdp);
1273		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
1274			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
1275			continue;
1276		}
1277
1278		pudp = pud_offset(&pgd, gpa);
1279		pud = READ_ONCE(*pudp);
1280		if (!(pud_val(pud) & _PAGE_PRESENT)) {
1281			gpa = (gpa & PUD_MASK) + PUD_SIZE;
1282			continue;
1283		}
1284		if (pud_val(pud) & _PAGE_PTE) {
1285			pte = pud_val(pud);
1286			shift = PUD_SHIFT;
1287			goto leaf;
1288		}
1289
1290		pmdp = pmd_offset(&pud, gpa);
1291		pmd = READ_ONCE(*pmdp);
1292		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1293			gpa = (gpa & PMD_MASK) + PMD_SIZE;
1294			continue;
1295		}
1296		if (pmd_val(pmd) & _PAGE_PTE) {
1297			pte = pmd_val(pmd);
1298			shift = PMD_SHIFT;
1299			goto leaf;
1300		}
1301
1302		ptep = pte_offset_kernel(&pmd, gpa);
1303		pte = pte_val(READ_ONCE(*ptep));
1304		if (!(pte & _PAGE_PRESENT)) {
1305			gpa += PAGE_SIZE;
1306			continue;
1307		}
1308		shift = PAGE_SHIFT;
1309	leaf:
1310		n = scnprintf(p->buf, sizeof(p->buf),
1311			      " %lx: %lx %d\n", gpa, pte, shift);
1312		gpa += 1ul << shift;
1313	copy:
1314		p->chars_left = n;
1315		if (n > len)
1316			n = len;
1317		r = copy_to_user(buf, p->buf, n);
1318		n -= r;
1319		p->chars_left -= n;
1320		p->buf_index = n;
1321		buf += n;
1322		len -= n;
1323		ret += n;
1324		if (r) {
1325			if (!ret)
1326				ret = -EFAULT;
1327			break;
1328		}
1329	}
1330	p->gpa = gpa;
1331	if (nested)
1332		kvmhv_put_nested(nested);
1333
1334 out:
1335	mutex_unlock(&p->mutex);
1336	return ret;
1337}
1338
1339static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1340			   size_t len, loff_t *ppos)
1341{
1342	return -EACCES;
1343}
1344
1345static const struct file_operations debugfs_radix_fops = {
1346	.owner	 = THIS_MODULE,
1347	.open	 = debugfs_radix_open,
1348	.release = debugfs_radix_release,
1349	.read	 = debugfs_radix_read,
1350	.write	 = debugfs_radix_write,
1351	.llseek	 = generic_file_llseek,
1352};
1353
1354void kvmhv_radix_debugfs_init(struct kvm *kvm)
1355{
1356	kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
1357						     kvm->arch.debugfs_dir, kvm,
1358						     &debugfs_radix_fops);
1359}
1360
1361int kvmppc_radix_init(void)
1362{
1363	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1364
1365	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1366	if (!kvm_pte_cache)
1367		return -ENOMEM;
1368
1369	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1370
1371	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1372	if (!kvm_pmd_cache) {
1373		kmem_cache_destroy(kvm_pte_cache);
1374		return -ENOMEM;
1375	}
1376
1377	return 0;
1378}
1379
1380void kvmppc_radix_exit(void)
1381{
1382	kmem_cache_destroy(kvm_pte_cache);
1383	kmem_cache_destroy(kvm_pmd_cache);
1384}