   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
   4 * No bombay mix was harmed in the writing of this file.
   5 *
   6 * Copyright (C) 2020 Google LLC
   7 * Author: Will Deacon <will@kernel.org>
   8 */
   9
  10#include <linux/bitfield.h>
  11#include <asm/kvm_pgtable.h>
  12#include <asm/stage2_pgtable.h>
  13
  14
  15#define KVM_PTE_TYPE			BIT(1)
  16#define KVM_PTE_TYPE_BLOCK		0
  17#define KVM_PTE_TYPE_PAGE		1
  18#define KVM_PTE_TYPE_TABLE		1
  19
  20#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)
  21
  22#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
  23#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
  24#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO	3
  25#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW	1
  26#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
  27#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
  28#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)
  29
  30#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
  31#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
  32#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
  33#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
  34#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
  35#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)
  36
  37#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 51)
  38
  39#define KVM_PTE_LEAF_ATTR_HI_SW		GENMASK(58, 55)
  40
  41#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)
  42
  43#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)
  44
  45#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
  46					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
  47					 KVM_PTE_LEAF_ATTR_HI_S2_XN)
  48
  49#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(9, 2)
  50#define KVM_MAX_OWNER_ID		1
  51
  52/*
  53 * Used to indicate a pte for which a 'break-before-make' sequence is in
  54 * progress.
  55 */
  56#define KVM_INVALID_PTE_LOCKED		BIT(10)
  57
  58struct kvm_pgtable_walk_data {
  59	struct kvm_pgtable_walker	*walker;
  60
  61	u64				addr;
  62	u64				end;
  63};
  64
  65static bool kvm_phys_is_valid(u64 phys)
  66{
  67	return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
  68}
  69
  70static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
  71{
  72	u64 granule = kvm_granule_size(ctx->level);
  73
  74	if (!kvm_level_supports_block_mapping(ctx->level))
  75		return false;
  76
  77	if (granule > (ctx->end - ctx->addr))
  78		return false;
  79
  80	if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
  81		return false;
  82
  83	return IS_ALIGNED(ctx->addr, granule);
  84}
  85
  86static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
  87{
  88	u64 shift = kvm_granule_shift(level);
  89	u64 mask = BIT(PAGE_SHIFT - 3) - 1;
  90
  91	return (data->addr >> shift) & mask;
  92}
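/*
 * For illustration, assuming 4KiB granules: PAGE_SHIFT is 12, so the index
 * mask above is BIT(9) - 1 = 0x1ff and each level consumes 9 bits of the
 * input address. kvm_granule_shift() then gives shifts of 39, 30, 21 and 12
 * for levels 0 to 3.
 */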
  93
  94static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
  95{
  96	u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
  97	u64 mask = BIT(pgt->ia_bits) - 1;
  98
  99	return (addr & mask) >> shift;
 100}
 101
 102static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
 103{
 104	struct kvm_pgtable pgt = {
 105		.ia_bits	= ia_bits,
 106		.start_level	= start_level,
 107	};
 108
 109	return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
 110}
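/*
 * For illustration: a stage-2 configuration with ia_bits = 40 and
 * start_level = 1 (4KiB granules) gives kvm_pgd_page_idx(pgt, -1ULL) = 1,
 * i.e. two concatenated pages at the initial level, as the architecture
 * permits for stage-2 translation.
 */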
 111
 112static bool kvm_pte_table(kvm_pte_t pte, u32 level)
 113{
 114	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
 115		return false;
 116
 117	if (!kvm_pte_valid(pte))
 118		return false;
 119
 120	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
 121}
 122
 123static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
 124{
 125	return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
 126}
 127
 128static void kvm_clear_pte(kvm_pte_t *ptep)
 129{
 130	WRITE_ONCE(*ptep, 0);
 131}
 132
 133static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
 134{
 135	kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
 136
 137	pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
 138	pte |= KVM_PTE_VALID;
 139	return pte;
 140}
 141
 142static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
 143{
 144	kvm_pte_t pte = kvm_phys_to_pte(pa);
 145	u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
 146							   KVM_PTE_TYPE_BLOCK;
 147
 148	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
 149	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
 150	pte |= KVM_PTE_VALID;
 151
 152	return pte;
 153}
 154
 155static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
 156{
 157	return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
 158}
 159
 160static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
 161				  const struct kvm_pgtable_visit_ctx *ctx,
 162				  enum kvm_pgtable_walk_flags visit)
 163{
 164	struct kvm_pgtable_walker *walker = data->walker;
 165
 166	/* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
 167	WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
 168	return walker->cb(ctx, visit);
 169}
 170
 171static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
 172			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);
 173
 174static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
 175				      struct kvm_pgtable_mm_ops *mm_ops,
 176				      kvm_pteref_t pteref, u32 level)
 177{
 178	enum kvm_pgtable_walk_flags flags = data->walker->flags;
 179	kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
 180	struct kvm_pgtable_visit_ctx ctx = {
 181		.ptep	= ptep,
 182		.old	= READ_ONCE(*ptep),
 183		.arg	= data->walker->arg,
 184		.mm_ops	= mm_ops,
 185		.addr	= data->addr,
 186		.end	= data->end,
 187		.level	= level,
 188		.flags	= flags,
 189	};
 190	int ret = 0;
 191	kvm_pteref_t childp;
 192	bool table = kvm_pte_table(ctx.old, level);
 193
 194	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE))
 195		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
 196
 197	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
 198		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
 199		ctx.old = READ_ONCE(*ptep);
 200		table = kvm_pte_table(ctx.old, level);
 201	}
 202
 203	if (ret)
 204		goto out;
 205
 206	if (!table) {
 207		data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
 208		data->addr += kvm_granule_size(level);
 209		goto out;
 210	}
 211
 212	childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
 213	ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
 214	if (ret)
 215		goto out;
 216
 217	if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
 218		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);
 219
 220out:
 221	return ret;
 222}
 223
 224static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
 225			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level)
 226{
 227	u32 idx;
 228	int ret = 0;
 229
 230	if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
 231		return -EINVAL;
 232
 233	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
 234		kvm_pteref_t pteref = &pgtable[idx];
 235
 236		if (data->addr >= data->end)
 237			break;
 238
 239		ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
 240		if (ret)
 241			break;
 242	}
 243
 244	return ret;
 245}
 246
 247static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
 248{
 249	u32 idx;
 250	int ret = 0;
 251	u64 limit = BIT(pgt->ia_bits);
 252
 253	if (data->addr > limit || data->end > limit)
 254		return -ERANGE;
 255
 256	if (!pgt->pgd)
 257		return -EINVAL;
 258
 259	for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
 260		kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];
 261
 262		ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
 263		if (ret)
 264			break;
 265	}
 266
 267	return ret;
 268}
 269
 270int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
 271		     struct kvm_pgtable_walker *walker)
 272{
 273	struct kvm_pgtable_walk_data walk_data = {
 274		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
 275		.end	= PAGE_ALIGN(walk_data.addr + size),
 276		.walker	= walker,
 277	};
 278	int r;
 279
 280	r = kvm_pgtable_walk_begin(walker);
 281	if (r)
 282		return r;
 283
 284	r = _kvm_pgtable_walk(pgt, &walk_data);
 285	kvm_pgtable_walk_end(walker);
 286
 287	return r;
 288}
 289
 290struct leaf_walk_data {
 291	kvm_pte_t	pte;
 292	u32		level;
 293};
 294
 295static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
 296		       enum kvm_pgtable_walk_flags visit)
 297{
 298	struct leaf_walk_data *data = ctx->arg;
 299
 300	data->pte   = ctx->old;
 301	data->level = ctx->level;
 302
 303	return 0;
 304}
 305
 306int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
 307			 kvm_pte_t *ptep, u32 *level)
 308{
 309	struct leaf_walk_data data;
 310	struct kvm_pgtable_walker walker = {
 311		.cb	= leaf_walker,
 312		.flags	= KVM_PGTABLE_WALK_LEAF,
 313		.arg	= &data,
 314	};
 315	int ret;
 316
 317	ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
 318			       PAGE_SIZE, &walker);
 319	if (!ret) {
 320		if (ptep)
 321			*ptep  = data.pte;
 322		if (level)
 323			*level = data.level;
 324	}
 325
 326	return ret;
 327}
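/*
 * Illustrative sketch (hypothetical, not part of the kernel sources): a
 * private walker built on kvm_pgtable_walk() that counts the valid leaf
 * entries in a range, following the same pattern as leaf_walker() above.
 * The function names and the counting policy are invented for the example.
 */
static int count_valid_walker(const struct kvm_pgtable_visit_ctx *ctx,
			      enum kvm_pgtable_walk_flags visit)
{
	u64 *count = ctx->arg;

	if (kvm_pte_valid(ctx->old))
		(*count)++;

	return 0;
}

static u64 count_valid_leaves(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 count = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= count_valid_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &count,
	};

	kvm_pgtable_walk(pgt, addr, size, &walker);
	return count;
}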
 328
 329struct hyp_map_data {
 330	u64				phys;
 331	kvm_pte_t			attr;
 332};
 333
 334static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
 335{
 336	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
 337	u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
 338	kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
 339	u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
 340	u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
 341					       KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
 342
 343	if (!(prot & KVM_PGTABLE_PROT_R))
 344		return -EINVAL;
 345
 346	if (prot & KVM_PGTABLE_PROT_X) {
 347		if (prot & KVM_PGTABLE_PROT_W)
 348			return -EINVAL;
 349
 350		if (device)
 351			return -EINVAL;
 352	} else {
 353		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
 354	}
 355
 356	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
 357	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
 358	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
 359	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
 360	*ptep = attr;
 361
 362	return 0;
 363}
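/*
 * For illustration: a request for KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W
 * (normal memory, no execute) produces an attribute with ATTRIDX = MT_NORMAL,
 * AP = KVM_PTE_LEAF_ATTR_LO_S1_AP_RW, SH = Inner Shareable, the Access Flag
 * set and the XN bit set, since execute permission was not requested.
 */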
 364
 365enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
 366{
 367	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
 368	u32 ap;
 369
 370	if (!kvm_pte_valid(pte))
 371		return prot;
 372
 373	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
 374		prot |= KVM_PGTABLE_PROT_X;
 375
 376	ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
 377	if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
 378		prot |= KVM_PGTABLE_PROT_R;
 379	else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
 380		prot |= KVM_PGTABLE_PROT_RW;
 381
 382	return prot;
 383}
 384
 385static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
 386				    struct hyp_map_data *data)
 387{
 388	kvm_pte_t new;
 389	u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
 390
 391	if (!kvm_block_mapping_supported(ctx, phys))
 392		return false;
 393
 394	data->phys += granule;
 395	new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
 396	if (ctx->old == new)
 397		return true;
 398	if (!kvm_pte_valid(ctx->old))
 399		ctx->mm_ops->get_page(ctx->ptep);
 400	else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
 401		return false;
 402
 403	smp_store_release(ctx->ptep, new);
 404	return true;
 405}
 406
 407static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
 408			  enum kvm_pgtable_walk_flags visit)
 409{
 410	kvm_pte_t *childp, new;
 411	struct hyp_map_data *data = ctx->arg;
 412	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 413
 414	if (hyp_map_walker_try_leaf(ctx, data))
 415		return 0;
 416
 417	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
 418		return -EINVAL;
 419
 420	childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
 421	if (!childp)
 422		return -ENOMEM;
 423
 424	new = kvm_init_table_pte(childp, mm_ops);
 425	mm_ops->get_page(ctx->ptep);
 426	smp_store_release(ctx->ptep, new);
 427
 428	return 0;
 429}
 430
 431int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
 432			enum kvm_pgtable_prot prot)
 433{
 434	int ret;
 435	struct hyp_map_data map_data = {
 436		.phys	= ALIGN_DOWN(phys, PAGE_SIZE),
 437	};
 438	struct kvm_pgtable_walker walker = {
 439		.cb	= hyp_map_walker,
 440		.flags	= KVM_PGTABLE_WALK_LEAF,
 441		.arg	= &map_data,
 442	};
 443
 444	ret = hyp_set_prot_attr(prot, &map_data.attr);
 445	if (ret)
 446		return ret;
 447
 448	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
 449	dsb(ishst);
 450	isb();
 451	return ret;
 452}
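/*
 * Illustrative call (hypothetical values, not part of the kernel sources):
 * mapping one page of normal memory into the hyp stage-1 tables as
 * read/write, non-executable. 'hyp_pgt', 'va' and 'pa' are placeholders.
 */
static int example_hyp_map_page(struct kvm_pgtable *hyp_pgt, u64 va, u64 pa)
{
	return kvm_pgtable_hyp_map(hyp_pgt, va, PAGE_SIZE, pa,
				   KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W);
}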
 453
 454static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 455			    enum kvm_pgtable_walk_flags visit)
 456{
 457	kvm_pte_t *childp = NULL;
 458	u64 granule = kvm_granule_size(ctx->level);
 459	u64 *unmapped = ctx->arg;
 460	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 461
 462	if (!kvm_pte_valid(ctx->old))
 463		return -EINVAL;
 464
 465	if (kvm_pte_table(ctx->old, ctx->level)) {
 466		childp = kvm_pte_follow(ctx->old, mm_ops);
 467
 468		if (mm_ops->page_count(childp) != 1)
 469			return 0;
 470
 471		kvm_clear_pte(ctx->ptep);
 472		dsb(ishst);
 473		__tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
 474	} else {
 475		if (ctx->end - ctx->addr < granule)
 476			return -EINVAL;
 477
 478		kvm_clear_pte(ctx->ptep);
 479		dsb(ishst);
 480		__tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
 481		*unmapped += granule;
 482	}
 483
 484	dsb(ish);
 485	isb();
 486	mm_ops->put_page(ctx->ptep);
 487
 488	if (childp)
 489		mm_ops->put_page(childp);
 490
 491	return 0;
 492}
 493
 494u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
 495{
 496	u64 unmapped = 0;
 497	struct kvm_pgtable_walker walker = {
 498		.cb	= hyp_unmap_walker,
 499		.arg	= &unmapped,
 500		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
 501	};
 502
 503	if (!pgt->mm_ops->page_count)
 504		return 0;
 505
 506	kvm_pgtable_walk(pgt, addr, size, &walker);
 507	return unmapped;
 508}
 509
 510int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
 511			 struct kvm_pgtable_mm_ops *mm_ops)
 512{
 513	u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
 514
 515	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
 516	if (!pgt->pgd)
 517		return -ENOMEM;
 518
 519	pgt->ia_bits		= va_bits;
 520	pgt->start_level	= KVM_PGTABLE_MAX_LEVELS - levels;
 521	pgt->mm_ops		= mm_ops;
 522	pgt->mmu		= NULL;
 523	pgt->force_pte_cb	= NULL;
 524
 525	return 0;
 526}
 527
 528static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
 529			   enum kvm_pgtable_walk_flags visit)
 530{
 531	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 532
 533	if (!kvm_pte_valid(ctx->old))
 534		return 0;
 535
 536	mm_ops->put_page(ctx->ptep);
 537
 538	if (kvm_pte_table(ctx->old, ctx->level))
 539		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
 540
 541	return 0;
 542}
 543
 544void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
 545{
 546	struct kvm_pgtable_walker walker = {
 547		.cb	= hyp_free_walker,
 548		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
 549	};
 550
 551	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
 552	pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
 553	pgt->pgd = NULL;
 554}
 555
 556struct stage2_map_data {
 557	u64				phys;
 558	kvm_pte_t			attr;
 559	u8				owner_id;
 560
 561	kvm_pte_t			*anchor;
 562	kvm_pte_t			*childp;
 563
 564	struct kvm_s2_mmu		*mmu;
 565	void				*memcache;
 566
 567	/* Force mappings to page granularity */
 568	bool				force_pte;
 569};
 570
 571u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
 572{
 573	u64 vtcr = VTCR_EL2_FLAGS;
 574	u8 lvls;
 575
 576	vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
 577	vtcr |= VTCR_EL2_T0SZ(phys_shift);
 578	/*
 579	 * Use a minimum 2 level page table to prevent splitting
 580	 * host PMD huge pages at stage2.
 581	 */
 582	lvls = stage2_pgtable_levels(phys_shift);
 583	if (lvls < 2)
 584		lvls = 2;
 585	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
 586
 587	/*
 588	 * Enable the Hardware Access Flag management, unconditionally
  589	 * on all CPUs. The feature is RES0 on CPUs without the support
  590	 * and must be ignored by those CPUs.
 591	 */
 592	vtcr |= VTCR_EL2_HA;
 593
 594	/* Set the vmid bits */
 595	vtcr |= (get_vmid_bits(mmfr1) == 16) ?
 596		VTCR_EL2_VS_16BIT :
 597		VTCR_EL2_VS_8BIT;
 598
 599	return vtcr;
 600}
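/*
 * For illustration, assuming 4KiB granules: a 40-bit IPA space gives
 * T0SZ = 24 and stage2_pgtable_levels(40) = 3, so SL0 is programmed for a
 * level-1 starting point with two concatenated initial-level tables.
 */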
 601
 602static bool stage2_has_fwb(struct kvm_pgtable *pgt)
 603{
 604	if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
 605		return false;
 606
 607	return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
 608}
 609
 610#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
 611
 612static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
 613				kvm_pte_t *ptep)
 614{
 615	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
 616	kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
 617			    KVM_S2_MEMATTR(pgt, NORMAL);
 618	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
 619
 620	if (!(prot & KVM_PGTABLE_PROT_X))
 621		attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
 622	else if (device)
 623		return -EINVAL;
 624
 625	if (prot & KVM_PGTABLE_PROT_R)
 626		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
 627
 628	if (prot & KVM_PGTABLE_PROT_W)
 629		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
 630
 631	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
 632	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
 633	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
 634	*ptep = attr;
 635
 636	return 0;
 637}
 638
 639enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
 640{
 641	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
 642
 643	if (!kvm_pte_valid(pte))
 644		return prot;
 645
 646	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
 647		prot |= KVM_PGTABLE_PROT_R;
 648	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
 649		prot |= KVM_PGTABLE_PROT_W;
 650	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
 651		prot |= KVM_PGTABLE_PROT_X;
 652
 653	return prot;
 654}
 655
 656static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
 657{
 658	if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
 659		return true;
 660
 661	return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
 662}
 663
 664static bool stage2_pte_is_counted(kvm_pte_t pte)
 665{
 666	/*
 667	 * The refcount tracks valid entries as well as invalid entries if they
  668	 * encode ownership of a page by an entity other than the page-table
 669	 * owner, whose id is 0.
 670	 */
 671	return !!pte;
 672}
 673
 674static bool stage2_pte_is_locked(kvm_pte_t pte)
 675{
 676	return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
 677}
 678
 679static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
 680{
 681	if (!kvm_pgtable_walk_shared(ctx)) {
 682		WRITE_ONCE(*ctx->ptep, new);
 683		return true;
 684	}
 685
 686	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
 687}
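/*
 * Note on the helper above: an exclusive (non-shared) walker may update the
 * pte with a plain WRITE_ONCE(), whereas shared walkers can race with one
 * another and therefore only succeed if the pte still holds the value
 * observed in ctx->old, hence the cmpxchg().
 */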
 688
 689/**
 690 * stage2_try_break_pte() - Invalidates a pte according to the
 691 *			    'break-before-make' requirements of the
 692 *			    architecture.
 693 *
 694 * @ctx: context of the visited pte.
 695 * @mmu: stage-2 mmu
 696 *
 697 * Returns: true if the pte was successfully broken.
 698 *
 699 * If the removed pte was valid, performs the necessary serialization and TLB
 700 * invalidation for the old value. For counted ptes, drops the reference count
 701 * on the containing table page.
 702 */
 703static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
 704				 struct kvm_s2_mmu *mmu)
 705{
 706	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 707
 708	if (stage2_pte_is_locked(ctx->old)) {
 709		/*
 710		 * Should never occur if this walker has exclusive access to the
 711		 * page tables.
 712		 */
 713		WARN_ON(!kvm_pgtable_walk_shared(ctx));
 714		return false;
 715	}
 716
 717	if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
 718		return false;
 719
 720	/*
 721	 * Perform the appropriate TLB invalidation based on the evicted pte
 722	 * value (if any).
 723	 */
 724	if (kvm_pte_table(ctx->old, ctx->level))
 725		kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
 726	else if (kvm_pte_valid(ctx->old))
 727		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
 728
 729	if (stage2_pte_is_counted(ctx->old))
 730		mm_ops->put_page(ctx->ptep);
 731
 732	return true;
 733}
 734
 735static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
 736{
 737	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 738
 739	WARN_ON(!stage2_pte_is_locked(*ctx->ptep));
 740
 741	if (stage2_pte_is_counted(new))
 742		mm_ops->get_page(ctx->ptep);
 743
 744	smp_store_release(ctx->ptep, new);
 745}
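/*
 * stage2_try_break_pte() and stage2_make_pte() are intended to be used as a
 * pair: a caller first breaks the pte (marking it KVM_INVALID_PTE_LOCKED and
 * performing the TLB invalidation), optionally performs cache maintenance for
 * the new mapping, and then installs the new pte with the release semantics
 * above. stage2_map_walker_try_leaf() below follows exactly this sequence.
 */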
 746
 747static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
 748			   struct kvm_pgtable_mm_ops *mm_ops)
 749{
 750	/*
 751	 * Clear the existing PTE, and perform break-before-make with
 752	 * TLB maintenance if it was valid.
 753	 */
 754	if (kvm_pte_valid(ctx->old)) {
 755		kvm_clear_pte(ctx->ptep);
 756		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
 757	}
 758
 759	mm_ops->put_page(ctx->ptep);
 760}
 761
 762static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
 763{
 764	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
 765	return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
 766}
 767
 768static bool stage2_pte_executable(kvm_pte_t pte)
 769{
 770	return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
 771}
 772
 773static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
 774					struct stage2_map_data *data)
 775{
 776	if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
 777		return false;
 778
 779	return kvm_block_mapping_supported(ctx, data->phys);
 780}
 781
 782static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
 783				      struct stage2_map_data *data)
 784{
 785	kvm_pte_t new;
 786	u64 granule = kvm_granule_size(ctx->level), phys = data->phys;
 787	struct kvm_pgtable *pgt = data->mmu->pgt;
 788	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 789
 790	if (!stage2_leaf_mapping_allowed(ctx, data))
 791		return -E2BIG;
 792
 793	if (kvm_phys_is_valid(phys))
 794		new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
 795	else
 796		new = kvm_init_invalid_leaf_owner(data->owner_id);
 797
 798	/*
 799	 * Skip updating the PTE if we are trying to recreate the exact
 800	 * same mapping or only change the access permissions. Instead,
  801	 * the vCPU will take one more exit from the guest if still needed
 802	 * and then go through the path of relaxing permissions.
 803	 */
 804	if (!stage2_pte_needs_update(ctx->old, new))
 805		return -EAGAIN;
 806
 807	if (!stage2_try_break_pte(ctx, data->mmu))
 808		return -EAGAIN;
 809
 810	/* Perform CMOs before installation of the guest stage-2 PTE */
 811	if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
 812		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
 813						granule);
 814
 815	if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
 816		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
 817
 818	stage2_make_pte(ctx, new);
 819
 820	if (kvm_phys_is_valid(phys))
 821		data->phys += granule;
 822	return 0;
 823}
 824
 825static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
 826				     struct stage2_map_data *data)
 827{
 828	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 829	kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
 830	int ret;
 831
 832	if (!stage2_leaf_mapping_allowed(ctx, data))
 833		return 0;
 834
 835	ret = stage2_map_walker_try_leaf(ctx, data);
 836	if (ret)
 837		return ret;
 838
 839	mm_ops->free_removed_table(childp, ctx->level);
 840	return 0;
 841}
 842
 843static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
 844				struct stage2_map_data *data)
 845{
 846	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 847	kvm_pte_t *childp, new;
 848	int ret;
 849
 850	ret = stage2_map_walker_try_leaf(ctx, data);
 851	if (ret != -E2BIG)
 852		return ret;
 853
 854	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
 855		return -EINVAL;
 856
 857	if (!data->memcache)
 858		return -ENOMEM;
 859
 860	childp = mm_ops->zalloc_page(data->memcache);
 861	if (!childp)
 862		return -ENOMEM;
 863
 864	if (!stage2_try_break_pte(ctx, data->mmu)) {
 865		mm_ops->put_page(childp);
 866		return -EAGAIN;
 867	}
 868
 869	/*
 870	 * If we've run into an existing block mapping then replace it with
 871	 * a table. Accesses beyond 'end' that fall within the new table
 872	 * will be mapped lazily.
 873	 */
 874	new = kvm_init_table_pte(childp, mm_ops);
 875	stage2_make_pte(ctx, new);
 876
 877	return 0;
 878}
 879
 880/*
 881 * The TABLE_PRE callback runs for table entries on the way down, looking
 882 * for table entries which we could conceivably replace with a block entry
 883 * for this mapping. If it finds one it replaces the entry and calls
 884 * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table.
 885 *
 886 * Otherwise, the LEAF callback performs the mapping at the existing leaves
 887 * instead.
 888 */
 889static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
 890			     enum kvm_pgtable_walk_flags visit)
 891{
 892	struct stage2_map_data *data = ctx->arg;
 893
 894	switch (visit) {
 895	case KVM_PGTABLE_WALK_TABLE_PRE:
 896		return stage2_map_walk_table_pre(ctx, data);
 897	case KVM_PGTABLE_WALK_LEAF:
 898		return stage2_map_walk_leaf(ctx, data);
 899	default:
 900		return -EINVAL;
 901	}
 902}
 903
 904int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
 905			   u64 phys, enum kvm_pgtable_prot prot,
 906			   void *mc, enum kvm_pgtable_walk_flags flags)
 907{
 908	int ret;
 909	struct stage2_map_data map_data = {
 910		.phys		= ALIGN_DOWN(phys, PAGE_SIZE),
 911		.mmu		= pgt->mmu,
 912		.memcache	= mc,
 913		.force_pte	= pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
 914	};
 915	struct kvm_pgtable_walker walker = {
 916		.cb		= stage2_map_walker,
 917		.flags		= flags |
 918				  KVM_PGTABLE_WALK_TABLE_PRE |
 919				  KVM_PGTABLE_WALK_LEAF,
 920		.arg		= &map_data,
 921	};
 922
 923	if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
 924		return -EINVAL;
 925
 926	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
 927	if (ret)
 928		return ret;
 929
 930	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
 931	dsb(ishst);
 932	return ret;
 933}
 934
 935int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
 936				 void *mc, u8 owner_id)
 937{
 938	int ret;
 939	struct stage2_map_data map_data = {
 940		.phys		= KVM_PHYS_INVALID,
 941		.mmu		= pgt->mmu,
 942		.memcache	= mc,
 943		.owner_id	= owner_id,
 944		.force_pte	= true,
 945	};
 946	struct kvm_pgtable_walker walker = {
 947		.cb		= stage2_map_walker,
 948		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
 949				  KVM_PGTABLE_WALK_LEAF,
 950		.arg		= &map_data,
 951	};
 952
 953	if (owner_id > KVM_MAX_OWNER_ID)
 954		return -EINVAL;
 955
 956	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
 957	return ret;
 958}
 959
 960static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 961			       enum kvm_pgtable_walk_flags visit)
 962{
 963	struct kvm_pgtable *pgt = ctx->arg;
 964	struct kvm_s2_mmu *mmu = pgt->mmu;
 965	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 966	kvm_pte_t *childp = NULL;
 967	bool need_flush = false;
 968
 969	if (!kvm_pte_valid(ctx->old)) {
 970		if (stage2_pte_is_counted(ctx->old)) {
 971			kvm_clear_pte(ctx->ptep);
 972			mm_ops->put_page(ctx->ptep);
 973		}
 974		return 0;
 975	}
 976
 977	if (kvm_pte_table(ctx->old, ctx->level)) {
 978		childp = kvm_pte_follow(ctx->old, mm_ops);
 979
 980		if (mm_ops->page_count(childp) != 1)
 981			return 0;
 982	} else if (stage2_pte_cacheable(pgt, ctx->old)) {
 983		need_flush = !stage2_has_fwb(pgt);
 984	}
 985
 986	/*
 987	 * This is similar to the map() path in that we unmap the entire
 988	 * block entry and rely on the remaining portions being faulted
 989	 * back lazily.
 990	 */
 991	stage2_put_pte(ctx, mmu, mm_ops);
 992
 993	if (need_flush && mm_ops->dcache_clean_inval_poc)
 994		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
 995					       kvm_granule_size(ctx->level));
 996
 997	if (childp)
 998		mm_ops->put_page(childp);
 999
1000	return 0;
1001}
1002
1003int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
1004{
1005	struct kvm_pgtable_walker walker = {
1006		.cb	= stage2_unmap_walker,
1007		.arg	= pgt,
1008		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
1009	};
1010
1011	return kvm_pgtable_walk(pgt, addr, size, &walker);
1012}
1013
1014struct stage2_attr_data {
1015	kvm_pte_t			attr_set;
1016	kvm_pte_t			attr_clr;
1017	kvm_pte_t			pte;
1018	u32				level;
1019};
1020
1021static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
1022			      enum kvm_pgtable_walk_flags visit)
1023{
1024	kvm_pte_t pte = ctx->old;
1025	struct stage2_attr_data *data = ctx->arg;
1026	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1027
1028	if (!kvm_pte_valid(ctx->old))
1029		return 0;
1030
1031	data->level = ctx->level;
1032	data->pte = pte;
1033	pte &= ~data->attr_clr;
1034	pte |= data->attr_set;
1035
1036	/*
1037	 * We may race with the CPU trying to set the access flag here,
1038	 * but worst-case the access flag update gets lost and will be
1039	 * set on the next access instead.
1040	 */
1041	if (data->pte != pte) {
1042		/*
1043		 * Invalidate instruction cache before updating the guest
1044		 * stage-2 PTE if we are going to add executable permission.
1045		 */
1046		if (mm_ops->icache_inval_pou &&
1047		    stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
1048			mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
1049						  kvm_granule_size(ctx->level));
1050
1051		if (!stage2_try_set_pte(ctx, pte))
1052			return -EAGAIN;
1053	}
1054
1055	return 0;
1056}
1057
1058static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
1059				    u64 size, kvm_pte_t attr_set,
1060				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
1061				    u32 *level, enum kvm_pgtable_walk_flags flags)
1062{
1063	int ret;
1064	kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
1065	struct stage2_attr_data data = {
1066		.attr_set	= attr_set & attr_mask,
1067		.attr_clr	= attr_clr & attr_mask,
1068	};
1069	struct kvm_pgtable_walker walker = {
1070		.cb		= stage2_attr_walker,
1071		.arg		= &data,
1072		.flags		= flags | KVM_PGTABLE_WALK_LEAF,
1073	};
1074
1075	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1076	if (ret)
1077		return ret;
1078
1079	if (orig_pte)
1080		*orig_pte = data.pte;
1081
1082	if (level)
1083		*level = data.level;
1084	return 0;
1085}
1086
1087int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
1088{
1089	return stage2_update_leaf_attrs(pgt, addr, size, 0,
1090					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
1091					NULL, NULL, 0);
1092}
1093
1094kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
1095{
1096	kvm_pte_t pte = 0;
1097	stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
1098				 &pte, NULL, 0);
1099	dsb(ishst);
1100	return pte;
1101}
1102
1103kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
1104{
1105	kvm_pte_t pte = 0;
1106	stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
1107				 &pte, NULL, 0);
1108	/*
1109	 * "But where's the TLBI?!", you scream.
1110	 * "Over in the core code", I sigh.
1111	 *
1112	 * See the '->clear_flush_young()' callback on the KVM mmu notifier.
1113	 */
1114	return pte;
1115}
1116
1117bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
1118{
1119	kvm_pte_t pte = 0;
1120	stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL, 0);
1121	return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
1122}
1123
1124int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
1125				   enum kvm_pgtable_prot prot)
1126{
1127	int ret;
1128	u32 level;
1129	kvm_pte_t set = 0, clr = 0;
1130
1131	if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
1132		return -EINVAL;
1133
1134	if (prot & KVM_PGTABLE_PROT_R)
1135		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
1136
1137	if (prot & KVM_PGTABLE_PROT_W)
1138		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
1139
1140	if (prot & KVM_PGTABLE_PROT_X)
1141		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
1142
1143	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
1144				       KVM_PGTABLE_WALK_SHARED);
1145	if (!ret)
1146		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
1147	return ret;
1148}
1149
1150static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
1151			       enum kvm_pgtable_walk_flags visit)
1152{
1153	struct kvm_pgtable *pgt = ctx->arg;
1154	struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
1155
1156	if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old))
1157		return 0;
1158
1159	if (mm_ops->dcache_clean_inval_poc)
1160		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
1161					       kvm_granule_size(ctx->level));
1162	return 0;
1163}
1164
1165int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
1166{
1167	struct kvm_pgtable_walker walker = {
1168		.cb	= stage2_flush_walker,
1169		.flags	= KVM_PGTABLE_WALK_LEAF,
1170		.arg	= pgt,
1171	};
1172
1173	if (stage2_has_fwb(pgt))
1174		return 0;
1175
1176	return kvm_pgtable_walk(pgt, addr, size, &walker);
1177}
1178
1179
1180int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
1181			      struct kvm_pgtable_mm_ops *mm_ops,
1182			      enum kvm_pgtable_stage2_flags flags,
1183			      kvm_pgtable_force_pte_cb_t force_pte_cb)
1184{
1185	size_t pgd_sz;
1186	u64 vtcr = mmu->arch->vtcr;
1187	u32 ia_bits = VTCR_EL2_IPA(vtcr);
1188	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1189	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1190
1191	pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1192	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
1193	if (!pgt->pgd)
1194		return -ENOMEM;
1195
1196	pgt->ia_bits		= ia_bits;
1197	pgt->start_level	= start_level;
1198	pgt->mm_ops		= mm_ops;
1199	pgt->mmu		= mmu;
1200	pgt->flags		= flags;
1201	pgt->force_pte_cb	= force_pte_cb;
1202
1203	/* Ensure zeroed PGD pages are visible to the hardware walker */
1204	dsb(ishst);
1205	return 0;
1206}
1207
1208size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
1209{
1210	u32 ia_bits = VTCR_EL2_IPA(vtcr);
1211	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1212	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1213
1214	return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1215}
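/*
 * For illustration: with a 40-bit IPA space and 4KiB granules the walk starts
 * at level 1 with two concatenated tables, so this returns 2 * PAGE_SIZE
 * (8KiB).
 */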
1216
1217static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
1218			      enum kvm_pgtable_walk_flags visit)
1219{
1220	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1221
1222	if (!stage2_pte_is_counted(ctx->old))
1223		return 0;
1224
1225	mm_ops->put_page(ctx->ptep);
1226
1227	if (kvm_pte_table(ctx->old, ctx->level))
1228		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
1229
1230	return 0;
1231}
1232
1233void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
1234{
1235	size_t pgd_sz;
1236	struct kvm_pgtable_walker walker = {
1237		.cb	= stage2_free_walker,
1238		.flags	= KVM_PGTABLE_WALK_LEAF |
1239			  KVM_PGTABLE_WALK_TABLE_POST,
1240	};
1241
1242	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
1243	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
1244	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
1245	pgt->pgd = NULL;
1246}
1247
1248void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
1249{
1250	kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
1251	struct kvm_pgtable_walker walker = {
1252		.cb	= stage2_free_walker,
1253		.flags	= KVM_PGTABLE_WALK_LEAF |
1254			  KVM_PGTABLE_WALK_TABLE_POST,
1255	};
1256	struct kvm_pgtable_walk_data data = {
1257		.walker	= &walker,
1258
1259		/*
1260		 * At this point the IPA really doesn't matter, as the page
1261		 * table being traversed has already been removed from the stage
1262		 * 2. Set an appropriate range to cover the entire page table.
1263		 */
1264		.addr	= 0,
1265		.end	= kvm_granule_size(level),
1266	};
1267
1268	WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1));
1269}