v4.6
 
  1/*
  2 * Based on arch/arm/mm/mmu.c
  3 *
  4 * Copyright (C) 1995-2005 Russell King
  5 * Copyright (C) 2012 ARM Ltd.
  6 *
  7 * This program is free software; you can redistribute it and/or modify
  8 * it under the terms of the GNU General Public License version 2 as
  9 * published by the Free Software Foundation.
 10 *
 11 * This program is distributed in the hope that it will be useful,
 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 * GNU General Public License for more details.
 15 *
 16 * You should have received a copy of the GNU General Public License
 17 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 18 */
 19
 20#include <linux/export.h>
 21#include <linux/kernel.h>
 22#include <linux/errno.h>
 23#include <linux/init.h>
 24#include <linux/libfdt.h>
 25#include <linux/mman.h>
 26#include <linux/nodemask.h>
 27#include <linux/memblock.h>
 28#include <linux/fs.h>
 29#include <linux/io.h>
 30#include <linux/slab.h>
 31#include <linux/stop_machine.h>
 32
 33#include <asm/barrier.h>
 34#include <asm/cputype.h>
 35#include <asm/fixmap.h>
 36#include <asm/kasan.h>
 37#include <asm/kernel-pgtable.h>
 38#include <asm/sections.h>
 39#include <asm/setup.h>
 40#include <asm/sizes.h>
 41#include <asm/tlb.h>
 42#include <asm/memblock.h>
 43#include <asm/mmu_context.h>
 44
 45#include "mm.h"
 46
 47u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
 48
 49u64 kimage_voffset __read_mostly;
 50EXPORT_SYMBOL(kimage_voffset);
 51
 52/*
 53 * Empty_zero_page is a special page that is used for zero-initialized data
 54 * and COW.
 55 */
 56unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
 57EXPORT_SYMBOL(empty_zero_page);
 58
 59static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
 60static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
 61static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
 62
 63pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 64			      unsigned long size, pgprot_t vma_prot)
 65{
 66	if (!pfn_valid(pfn))
 67		return pgprot_noncached(vma_prot);
 68	else if (file->f_flags & O_SYNC)
 69		return pgprot_writecombine(vma_prot);
 70	return vma_prot;
 71}
 72EXPORT_SYMBOL(phys_mem_access_prot);
 73
 74static phys_addr_t __init early_pgtable_alloc(void)
 75{
 76	phys_addr_t phys;
 77	void *ptr;
 78
 79	phys = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
 80	BUG_ON(!phys);
 81
 82	/*
 83	 * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
 84	 * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
 85	 * any level of table.
 86	 */
 87	ptr = pte_set_fixmap(phys);
 88
 89	memset(ptr, 0, PAGE_SIZE);
 90
 91	/*
 92	 * Implicit barriers also ensure the zeroed page is visible to the page
 93	 * table walker
 94	 */
 95	pte_clear_fixmap();
 96
 97	return phys;
 98}
 99
100/*
101 * remap a PMD into pages
102 */
103static void split_pmd(pmd_t *pmd, pte_t *pte)
104{
105	unsigned long pfn = pmd_pfn(*pmd);
106	int i = 0;
107
108	do {
109		/*
110		 * Need to have the least restrictive permissions available;
111		 * permissions will be fixed up later.
112		 */
113		set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
114		pfn++;
115	} while (pte++, i++, i < PTRS_PER_PTE);
116}
117
118static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
119				  unsigned long end, unsigned long pfn,
120				  pgprot_t prot,
121				  phys_addr_t (*pgtable_alloc)(void))
122{
123	pte_t *pte;
124
125	if (pmd_none(*pmd) || pmd_sect(*pmd)) {
126		phys_addr_t pte_phys;
127		BUG_ON(!pgtable_alloc);
128		pte_phys = pgtable_alloc();
129		pte = pte_set_fixmap(pte_phys);
130		if (pmd_sect(*pmd))
131			split_pmd(pmd, pte);
132		__pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE);
133		flush_tlb_all();
134		pte_clear_fixmap();
135	}
136	BUG_ON(pmd_bad(*pmd));
137
138	pte = pte_set_fixmap_offset(pmd, addr);
139	do {
140		set_pte(pte, pfn_pte(pfn, prot));
141		pfn++;
142	} while (pte++, addr += PAGE_SIZE, addr != end);
143
144	pte_clear_fixmap();
145}
146
147static void split_pud(pud_t *old_pud, pmd_t *pmd)
148{
149	unsigned long addr = pud_pfn(*old_pud) << PAGE_SHIFT;
150	pgprot_t prot = __pgprot(pud_val(*old_pud) ^ addr);
151	int i = 0;
152
153	do {
154		set_pmd(pmd, __pmd(addr | pgprot_val(prot)));
155		addr += PMD_SIZE;
156	} while (pmd++, i++, i < PTRS_PER_PMD);
157}
158
159#ifdef CONFIG_DEBUG_PAGEALLOC
160static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
161{
162
163	/*
164	 * If debug_page_alloc is enabled we must map the linear map
165	 * using pages. However, other mappings created by
166	 * create_mapping_noalloc must use sections in some cases. Allow
167	 * sections to be used in those cases, where no pgtable_alloc
168	 * function is provided.
169	 */
170	return !pgtable_alloc || !debug_pagealloc_enabled();
171}
172#else
173static bool block_mappings_allowed(phys_addr_t (*pgtable_alloc)(void))
174{
175	return true;
176}
177#endif
178
179static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
180				  phys_addr_t phys, pgprot_t prot,
181				  phys_addr_t (*pgtable_alloc)(void))
182{
183	pmd_t *pmd;
184	unsigned long next;
185
186	/*
187	 * Check for initial section mappings in the pgd/pud and remove them.
188	 */
189	if (pud_none(*pud) || pud_sect(*pud)) {
190		phys_addr_t pmd_phys;
191		BUG_ON(!pgtable_alloc);
192		pmd_phys = pgtable_alloc();
193		pmd = pmd_set_fixmap(pmd_phys);
194		if (pud_sect(*pud)) {
195			/*
196			 * need to have the 1G of mappings continue to be
197			 * present
198			 */
199			split_pud(pud, pmd);
200		}
201		__pud_populate(pud, pmd_phys, PUD_TYPE_TABLE);
202		flush_tlb_all();
203		pmd_clear_fixmap();
204	}
205	BUG_ON(pud_bad(*pud));
206
207	pmd = pmd_set_fixmap_offset(pud, addr);
208	do {
209		next = pmd_addr_end(addr, end);
210		/* try section mapping first */
211		if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
212		      block_mappings_allowed(pgtable_alloc)) {
213			pmd_t old_pmd = *pmd;
214			pmd_set_huge(pmd, phys, prot);
215			/*
216			 * Check for previous table entries created during
217			 * boot (__create_page_tables) and flush them.
218			 */
219			if (!pmd_none(old_pmd)) {
220				flush_tlb_all();
221				if (pmd_table(old_pmd)) {
222					phys_addr_t table = pmd_page_paddr(old_pmd);
223					if (!WARN_ON_ONCE(slab_is_available()))
224						memblock_free(table, PAGE_SIZE);
225				}
226			}
227		} else {
228			alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys),
229				       prot, pgtable_alloc);
230		}
231		phys += next - addr;
232	} while (pmd++, addr = next, addr != end);
233
234	pmd_clear_fixmap();
235}
236
237static inline bool use_1G_block(unsigned long addr, unsigned long next,
238			unsigned long phys)
239{
240	if (PAGE_SHIFT != 12)
241		return false;
242
243	if (((addr | next | phys) & ~PUD_MASK) != 0)
244		return false;
245
246	return true;
247}
248
249static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
250				  phys_addr_t phys, pgprot_t prot,
251				  phys_addr_t (*pgtable_alloc)(void))
252{
253	pud_t *pud;
254	unsigned long next;
255
256	if (pgd_none(*pgd)) {
257		phys_addr_t pud_phys;
258		BUG_ON(!pgtable_alloc);
259		pud_phys = pgtable_alloc();
260		__pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE);
261	}
262	BUG_ON(pgd_bad(*pgd));
263
264	pud = pud_set_fixmap_offset(pgd, addr);
265	do {
266		next = pud_addr_end(addr, end);
267
268		/*
269		 * For 4K granule only, attempt to put down a 1GB block
270		 */
271		if (use_1G_block(addr, next, phys) &&
272		    block_mappings_allowed(pgtable_alloc)) {
273			pud_t old_pud = *pud;
274			pud_set_huge(pud, phys, prot);
275
276			/*
277			 * If we have an old value for a pud, it will
278			 * be pointing to a pmd table that we no longer
279			 * need (from swapper_pg_dir).
280			 *
281			 * Look up the old pmd table and free it.
282			 */
283			if (!pud_none(old_pud)) {
284				flush_tlb_all();
285				if (pud_table(old_pud)) {
286					phys_addr_t table = pud_page_paddr(old_pud);
287					if (!WARN_ON_ONCE(slab_is_available()))
288						memblock_free(table, PAGE_SIZE);
289				}
290			}
291		} else {
292			alloc_init_pmd(pud, addr, next, phys, prot,
293				       pgtable_alloc);
294		}
295		phys += next - addr;
296	} while (pud++, addr = next, addr != end);
297
298	pud_clear_fixmap();
299}
300
301/*
302 * Create the page directory entries and any necessary page tables for the
303 * mapping specified by 'md'.
304 */
305static void init_pgd(pgd_t *pgd, phys_addr_t phys, unsigned long virt,
306				    phys_addr_t size, pgprot_t prot,
307				    phys_addr_t (*pgtable_alloc)(void))
308{
309	unsigned long addr, length, end, next;
310
311	/*
312	 * If the virtual and physical address don't have the same offset
313	 * within a page, we cannot map the region as the caller expects.
314	 */
315	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
316		return;
317
318	phys &= PAGE_MASK;
319	addr = virt & PAGE_MASK;
320	length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));
321
322	end = addr + length;
323	do {
324		next = pgd_addr_end(addr, end);
325		alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc);
326		phys += next - addr;
327	} while (pgd++, addr = next, addr != end);
328}
329
330static phys_addr_t late_pgtable_alloc(void)
331{
332	void *ptr = (void *)__get_free_page(PGALLOC_GFP);
333	BUG_ON(!ptr);
334
335	/* Ensure the zeroed page is visible to the page table walker */
336	dsb(ishst);
337	return __pa(ptr);
338}
339
340static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
341				 unsigned long virt, phys_addr_t size,
342				 pgprot_t prot,
343				 phys_addr_t (*alloc)(void))
344{
345	init_pgd(pgd_offset_raw(pgdir, virt), phys, virt, size, prot, alloc);
346}
347
348/*
349 * This function can only be used to modify existing table entries,
350 * without allocating new levels of table. Note that this permits the
351 * creation of new section or page entries.
352 */
353static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
354				  phys_addr_t size, pgprot_t prot)
355{
356	if (virt < VMALLOC_START) {
357		pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
358			&phys, virt);
359		return;
360	}
361	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
362			     NULL);
363}
364
365void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
366			       unsigned long virt, phys_addr_t size,
367			       pgprot_t prot)
368{
369	__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
370			     late_pgtable_alloc);
371}
372
373static void create_mapping_late(phys_addr_t phys, unsigned long virt,
374				  phys_addr_t size, pgprot_t prot)
375{
376	if (virt < VMALLOC_START) {
377		pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
378			&phys, virt);
379		return;
380	}
381
382	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot,
383			     late_pgtable_alloc);
384}
385
386static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end)
387{
388	unsigned long kernel_start = __pa(_stext);
389	unsigned long kernel_end = __pa(_etext);
390
391	/*
392	 * Take care not to create a writable alias for the
393	 * read-only text and rodata sections of the kernel image.
394	 */
395
396	/* No overlap with the kernel text */
397	if (end < kernel_start || start >= kernel_end) {
398		__create_pgd_mapping(pgd, start, __phys_to_virt(start),
399				     end - start, PAGE_KERNEL,
400				     early_pgtable_alloc);
401		return;
402	}
403
404	/*
405	 * This block overlaps the kernel text mapping.
406	 * Map the portion(s) which don't overlap.
407	 */
408	if (start < kernel_start)
409		__create_pgd_mapping(pgd, start,
410				     __phys_to_virt(start),
411				     kernel_start - start, PAGE_KERNEL,
412				     early_pgtable_alloc);
413	if (kernel_end < end)
414		__create_pgd_mapping(pgd, kernel_end,
415				     __phys_to_virt(kernel_end),
416				     end - kernel_end, PAGE_KERNEL,
417				     early_pgtable_alloc);
418
419	/*
420	 * Map the linear alias of the [_stext, _etext) interval as
421	 * read-only/non-executable. This makes the contents of the
422	 * region accessible to subsystems such as hibernate, but
423	 * protects it from inadvertent modification or execution.
424	 */
425	__create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start),
426			     kernel_end - kernel_start, PAGE_KERNEL_RO,
427			     early_pgtable_alloc);
428}
429
430static void __init map_mem(pgd_t *pgd)
431{
432	struct memblock_region *reg;
433
434	/* map all the memory banks */
435	for_each_memblock(memory, reg) {
436		phys_addr_t start = reg->base;
437		phys_addr_t end = start + reg->size;
438
439		if (start >= end)
440			break;
441		if (memblock_is_nomap(reg))
442			continue;
443
444		__map_memblock(pgd, start, end);
445	}
446}
447
448void mark_rodata_ro(void)
449{
450	unsigned long section_size;
451
452	section_size = (unsigned long)__start_rodata - (unsigned long)_stext;
453	create_mapping_late(__pa(_stext), (unsigned long)_stext,
454			    section_size, PAGE_KERNEL_ROX);
455	/*
456	 * mark .rodata as read only. Use _etext rather than __end_rodata to
457	 * cover NOTES and EXCEPTION_TABLE.
458	 */
459	section_size = (unsigned long)_etext - (unsigned long)__start_rodata;
460	create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata,
461			    section_size, PAGE_KERNEL_RO);
462}
463
464void fixup_init(void)
465{
466	/*
467	 * Unmap the __init region but leave the VM area in place. This
468	 * prevents the region from being reused for kernel modules, which
469	 * is not supported by kallsyms.
470	 */
471	unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin));
472}
473
474static void __init map_kernel_chunk(pgd_t *pgd, void *va_start, void *va_end,
475				    pgprot_t prot, struct vm_struct *vma)
476{
477	phys_addr_t pa_start = __pa(va_start);
478	unsigned long size = va_end - va_start;
479
480	BUG_ON(!PAGE_ALIGNED(pa_start));
481	BUG_ON(!PAGE_ALIGNED(size));
482
483	__create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot,
484			     early_pgtable_alloc);
485
486	vma->addr	= va_start;
487	vma->phys_addr	= pa_start;
488	vma->size	= size;
489	vma->flags	= VM_MAP;
490	vma->caller	= __builtin_return_address(0);
491
492	vm_area_add_early(vma);
493}
494
495/*
496 * Create fine-grained mappings for the kernel.
497 */
498static void __init map_kernel(pgd_t *pgd)
499{
500	static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data;
501
502	map_kernel_chunk(pgd, _stext, __start_rodata, PAGE_KERNEL_EXEC, &vmlinux_text);
503	map_kernel_chunk(pgd, __start_rodata, _etext, PAGE_KERNEL, &vmlinux_rodata);
504	map_kernel_chunk(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC,
505			 &vmlinux_init);
506	map_kernel_chunk(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data);
507
508	if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) {
509		/*
510		 * The fixmap falls in a separate pgd to the kernel, and doesn't
511		 * live in the carveout for the swapper_pg_dir. We can simply
512		 * re-use the existing dir for the fixmap.
513		 */
514		set_pgd(pgd_offset_raw(pgd, FIXADDR_START),
515			*pgd_offset_k(FIXADDR_START));
516	} else if (CONFIG_PGTABLE_LEVELS > 3) {
517		/*
518		 * The fixmap shares its top level pgd entry with the kernel
519		 * mapping. This can really only occur when we are running
520		 * with 16k/4 levels, so we can simply reuse the pud level
521		 * entry instead.
522		 */
523		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
524		set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START),
525			__pud(__pa(bm_pmd) | PUD_TYPE_TABLE));
526		pud_clear_fixmap();
527	} else {
528		BUG();
529	}
530
531	kasan_copy_shadow(pgd);
532}
533
534/*
535 * paging_init() sets up the page tables, initialises the zone memory
536 * maps and sets up the zero page.
537 */
538void __init paging_init(void)
539{
540	phys_addr_t pgd_phys = early_pgtable_alloc();
541	pgd_t *pgd = pgd_set_fixmap(pgd_phys);
542
543	map_kernel(pgd);
544	map_mem(pgd);
545
546	/*
547	 * We want to reuse the original swapper_pg_dir so we don't have to
548	 * communicate the new address to non-coherent secondaries in
549	 * secondary_entry, and so cpu_switch_mm can generate the address with
550	 * adrp+add rather than a load from some global variable.
551	 *
552	 * To do this we need to go via a temporary pgd.
553	 */
554	cpu_replace_ttbr1(__va(pgd_phys));
555	memcpy(swapper_pg_dir, pgd, PAGE_SIZE);
556	cpu_replace_ttbr1(swapper_pg_dir);
557
558	pgd_clear_fixmap();
559	memblock_free(pgd_phys, PAGE_SIZE);
560
561	/*
562	 * We only reuse the PGD from the swapper_pg_dir, not the pud + pmd
563	 * allocated with it.
564	 */
565	memblock_free(__pa(swapper_pg_dir) + PAGE_SIZE,
566		      SWAPPER_DIR_SIZE - PAGE_SIZE);
567
568	bootmem_init();
569}
570
571/*
572 * Check whether a kernel address is valid (derived from arch/x86/).
573 */
574int kern_addr_valid(unsigned long addr)
575{
576	pgd_t *pgd;
577	pud_t *pud;
578	pmd_t *pmd;
579	pte_t *pte;
580
581	if ((((long)addr) >> VA_BITS) != -1UL)
582		return 0;
583
584	pgd = pgd_offset_k(addr);
585	if (pgd_none(*pgd))
586		return 0;
587
588	pud = pud_offset(pgd, addr);
589	if (pud_none(*pud))
590		return 0;
591
592	if (pud_sect(*pud))
593		return pfn_valid(pud_pfn(*pud));
594
595	pmd = pmd_offset(pud, addr);
596	if (pmd_none(*pmd))
597		return 0;
598
599	if (pmd_sect(*pmd))
600		return pfn_valid(pmd_pfn(*pmd));
601
602	pte = pte_offset_kernel(pmd, addr);
603	if (pte_none(*pte))
604		return 0;
605
606	return pfn_valid(pte_pfn(*pte));
607}
608#ifdef CONFIG_SPARSEMEM_VMEMMAP
609#if !ARM64_SWAPPER_USES_SECTION_MAPS
610int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
611{
612	return vmemmap_populate_basepages(start, end, node);
613}
614#else	/* !ARM64_SWAPPER_USES_SECTION_MAPS */
615int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
616{
617	unsigned long addr = start;
618	unsigned long next;
619	pgd_t *pgd;
620	pud_t *pud;
621	pmd_t *pmd;
622
623	do {
624		next = pmd_addr_end(addr, end);
625
626		pgd = vmemmap_pgd_populate(addr, node);
627		if (!pgd)
628			return -ENOMEM;
629
630		pud = vmemmap_pud_populate(pgd, addr, node);
631		if (!pud)
632			return -ENOMEM;
633
634		pmd = pmd_offset(pud, addr);
635		if (pmd_none(*pmd)) {
636			void *p = NULL;
637
638			p = vmemmap_alloc_block_buf(PMD_SIZE, node);
639			if (!p)
640				return -ENOMEM;
641
642			set_pmd(pmd, __pmd(__pa(p) | PROT_SECT_NORMAL));
643		} else
644			vmemmap_verify((pte_t *)pmd, node, addr, next);
645	} while (addr = next, addr != end);
646
647	return 0;
648}
649#endif	/* !ARM64_SWAPPER_USES_SECTION_MAPS */
650void vmemmap_free(unsigned long start, unsigned long end)
651{
652}
653#endif	/* CONFIG_SPARSEMEM_VMEMMAP */
654
655static inline pud_t * fixmap_pud(unsigned long addr)
656{
657	pgd_t *pgd = pgd_offset_k(addr);
658
659	BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd));
660
661	return pud_offset_kimg(pgd, addr);
662}
663
664static inline pmd_t * fixmap_pmd(unsigned long addr)
665{
666	pud_t *pud = fixmap_pud(addr);
667
668	BUG_ON(pud_none(*pud) || pud_bad(*pud));
669
670	return pmd_offset_kimg(pud, addr);
671}
672
673static inline pte_t * fixmap_pte(unsigned long addr)
674{
675	return &bm_pte[pte_index(addr)];
676}
677
678void __init early_fixmap_init(void)
679{
680	pgd_t *pgd;
681	pud_t *pud;
682	pmd_t *pmd;
683	unsigned long addr = FIXADDR_START;
684
685	pgd = pgd_offset_k(addr);
686	if (CONFIG_PGTABLE_LEVELS > 3 &&
687	    !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa(bm_pud))) {
688		/*
689		 * We only end up here if the kernel mapping and the fixmap
690		 * share the top level pgd entry, which should only happen on
691		 * 16k/4 levels configurations.
692		 */
693		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
694		pud = pud_offset_kimg(pgd, addr);
695	} else {
696		pgd_populate(&init_mm, pgd, bm_pud);
697		pud = fixmap_pud(addr);
698	}
699	pud_populate(&init_mm, pud, bm_pmd);
700	pmd = fixmap_pmd(addr);
701	pmd_populate_kernel(&init_mm, pmd, bm_pte);
702
703	/*
704	 * The boot-ioremap range spans multiple pmds, for which
705	 * we are not prepared:
706	 */
707	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
708		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
709
710	if ((pmd != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
711	     || pmd != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
712		WARN_ON(1);
713		pr_warn("pmd %p != %p, %p\n",
714			pmd, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
715			fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
716		pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
717			fix_to_virt(FIX_BTMAP_BEGIN));
718		pr_warn("fix_to_virt(FIX_BTMAP_END):   %08lx\n",
719			fix_to_virt(FIX_BTMAP_END));
720
721		pr_warn("FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
722		pr_warn("FIX_BTMAP_BEGIN:     %d\n", FIX_BTMAP_BEGIN);
723	}
724}
725
726void __set_fixmap(enum fixed_addresses idx,
727			       phys_addr_t phys, pgprot_t flags)
728{
729	unsigned long addr = __fix_to_virt(idx);
730	pte_t *pte;
731
732	BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
733
734	pte = fixmap_pte(addr);
735
736	if (pgprot_val(flags)) {
737		set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
738	} else {
739		pte_clear(&init_mm, addr, pte);
740		flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
741	}
742}
743
744void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
745{
746	const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
747	int offset;
748	void *dt_virt;
749
750	/*
751	 * Check whether the physical FDT address is set and meets the minimum
752	 * alignment requirement. Since we are relying on MIN_FDT_ALIGN to be
753	 * at least 8 bytes so that we can always access the size field of the
754	 * FDT header after mapping the first chunk, double check here if that
755	 * is indeed the case.
756	 */
757	BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
758	if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
759		return NULL;
760
761	/*
762	 * Make sure that the FDT region can be mapped without the need to
763	 * allocate additional translation table pages, so that it is safe
764	 * to call create_mapping_noalloc() this early.
765	 *
766	 * On 64k pages, the FDT will be mapped using PTEs, so we need to
767	 * be in the same PMD as the rest of the fixmap.
768	 * On 4k pages, we'll use section mappings for the FDT so we only
769	 * have to be in the same PUD.
770	 */
771	BUILD_BUG_ON(dt_virt_base % SZ_2M);
772
773	BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> SWAPPER_TABLE_SHIFT !=
774		     __fix_to_virt(FIX_BTMAP_BEGIN) >> SWAPPER_TABLE_SHIFT);
775
776	offset = dt_phys % SWAPPER_BLOCK_SIZE;
777	dt_virt = (void *)dt_virt_base + offset;
778
779	/* map the first chunk so we can read the size from the header */
780	create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
781			dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
782
783	if (fdt_check_header(dt_virt) != 0)
784		return NULL;
785
786	*size = fdt_totalsize(dt_virt);
787	if (*size > MAX_FDT_SIZE)
788		return NULL;
789
790	if (offset + *size > SWAPPER_BLOCK_SIZE)
791		create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
792			       round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);
793
794	return dt_virt;
795}
796
797void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
798{
799	void *dt_virt;
800	int size;
801
802	dt_virt = __fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);
803	if (!dt_virt)
804		return NULL;
805
806	memblock_reserve(dt_phys, size);
807	return dt_virt;
808}
809
810int __init arch_ioremap_pud_supported(void)
811{
812	/* only 4k granule supports level 1 block mappings */
813	return IS_ENABLED(CONFIG_ARM64_4K_PAGES);
814}
815
816int __init arch_ioremap_pmd_supported(void)
817{
818	return 1;
819}
820
821int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot)
822{
823	BUG_ON(phys & ~PUD_MASK);
824	set_pud(pud, __pud(phys | PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
825	return 1;
826}
827
828int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot)
829{
830	BUG_ON(phys & ~PMD_MASK);
831	set_pmd(pmd, __pmd(phys | PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))));
832	return 1;
833}
834
835int pud_clear_huge(pud_t *pud)
836{
837	if (!pud_sect(*pud))
838		return 0;
839	pud_clear(pud);
840	return 1;
841}
842
843int pmd_clear_huge(pmd_t *pmd)
844{
845	if (!pmd_sect(*pmd))
846		return 0;
847	pmd_clear(pmd);
848	return 1;
849}
v6.2
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Based on arch/arm/mm/mmu.c
   4 *
   5 * Copyright (C) 1995-2005 Russell King
   6 * Copyright (C) 2012 ARM Ltd.
   7 */
   8
   9#include <linux/cache.h>
  10#include <linux/export.h>
  11#include <linux/kernel.h>
  12#include <linux/errno.h>
  13#include <linux/init.h>
  14#include <linux/ioport.h>
  15#include <linux/kexec.h>
  16#include <linux/libfdt.h>
  17#include <linux/mman.h>
  18#include <linux/nodemask.h>
  19#include <linux/memblock.h>
  20#include <linux/memremap.h>
  21#include <linux/memory.h>
  22#include <linux/fs.h>
  23#include <linux/io.h>
  24#include <linux/mm.h>
  25#include <linux/vmalloc.h>
  26#include <linux/set_memory.h>
  27
  28#include <asm/barrier.h>
  29#include <asm/cputype.h>
  30#include <asm/fixmap.h>
  31#include <asm/kasan.h>
  32#include <asm/kernel-pgtable.h>
  33#include <asm/sections.h>
  34#include <asm/setup.h>
  35#include <linux/sizes.h>
  36#include <asm/tlb.h>
  37#include <asm/mmu_context.h>
  38#include <asm/ptdump.h>
  39#include <asm/tlbflush.h>
  40#include <asm/pgalloc.h>
  41
  42#define NO_BLOCK_MAPPINGS	BIT(0)
  43#define NO_CONT_MAPPINGS	BIT(1)
  44#define NO_EXEC_MAPPINGS	BIT(2)	/* assumes FEAT_HPDS is not used */
  45
  46int idmap_t0sz __ro_after_init;
  47
  48#if VA_BITS > 48
  49u64 vabits_actual __ro_after_init = VA_BITS_MIN;
  50EXPORT_SYMBOL(vabits_actual);
  51#endif
  52
  53u64 kimage_vaddr __ro_after_init = (u64)&_text;
  54EXPORT_SYMBOL(kimage_vaddr);
  55
  56u64 kimage_voffset __ro_after_init;
  57EXPORT_SYMBOL(kimage_voffset);
  58
  59u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };
  60
  61/*
  62 * The booting CPU updates the failed status @__early_cpu_boot_status,
  63 * with MMU turned off.
  64 */
  65long __section(".mmuoff.data.write") __early_cpu_boot_status;
  66
  67/*
  68 * Empty_zero_page is a special page that is used for zero-initialized data
  69 * and COW.
  70 */
  71unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
  72EXPORT_SYMBOL(empty_zero_page);
  73
  74static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
  75static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
  76static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
  77
  78static DEFINE_SPINLOCK(swapper_pgdir_lock);
  79static DEFINE_MUTEX(fixmap_lock);
  80
  81void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
  82{
  83	pgd_t *fixmap_pgdp;
  84
  85	spin_lock(&swapper_pgdir_lock);
  86	fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
  87	WRITE_ONCE(*fixmap_pgdp, pgd);
  88	/*
  89	 * We need dsb(ishst) here to ensure the page-table-walker sees
  90	 * our new entry before set_p?d() returns. The fixmap's
  91	 * flush_tlb_kernel_range() via clear_fixmap() does this for us.
  92	 */
  93	pgd_clear_fixmap();
  94	spin_unlock(&swapper_pgdir_lock);
  95}
  96
  97pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
  98			      unsigned long size, pgprot_t vma_prot)
  99{
 100	if (!pfn_is_map_memory(pfn))
 101		return pgprot_noncached(vma_prot);
 102	else if (file->f_flags & O_SYNC)
 103		return pgprot_writecombine(vma_prot);
 104	return vma_prot;
 105}
 106EXPORT_SYMBOL(phys_mem_access_prot);
 107
 108static phys_addr_t __init early_pgtable_alloc(int shift)
 109{
 110	phys_addr_t phys;
 111	void *ptr;
 112
 113	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
 114					 MEMBLOCK_ALLOC_NOLEAKTRACE);
 115	if (!phys)
 116		panic("Failed to allocate page table page\n");
 117
 118	/*
 119	 * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
 120	 * slot will be free, so we can (ab)use the FIX_PTE slot to initialise
 121	 * any level of table.
 122	 */
 123	ptr = pte_set_fixmap(phys);
 124
 125	memset(ptr, 0, PAGE_SIZE);
 126
 127	/*
 128	 * Implicit barriers also ensure the zeroed page is visible to the page
 129	 * table walker
 130	 */
 131	pte_clear_fixmap();
 132
 133	return phys;
 134}
 135
 136static bool pgattr_change_is_safe(u64 old, u64 new)
 137{
 138	/*
 139	 * The following mapping attributes may be updated in live
 140	 * kernel mappings without the need for break-before-make.
 141	 */
 142	pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
 143
 144	/* creating or taking down mappings is always safe */
 145	if (old == 0 || new == 0)
 146		return true;
 147
 148	/* live contiguous mappings may not be manipulated at all */
 149	if ((old | new) & PTE_CONT)
 150		return false;
 151
 152	/* Transitioning from Non-Global to Global is unsafe */
 153	if (old & ~new & PTE_NG)
 154		return false;
 155
 156	/*
 157	 * Changing the memory type between Normal and Normal-Tagged is safe
 158	 * since Tagged is considered a permission attribute from the
 159	 * mismatched attribute aliases perspective.
 160	 */
 161	if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
 162	     (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
 163	    ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
 164	     (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
 165		mask |= PTE_ATTRINDX_MASK;
 166
 167	return ((old ^ new) & ~mask) == 0;
 168}
 169
 170static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
 171		     phys_addr_t phys, pgprot_t prot)
 172{
 173	pte_t *ptep;
 174
 175	ptep = pte_set_fixmap_offset(pmdp, addr);
 176	do {
 177		pte_t old_pte = READ_ONCE(*ptep);
 178
 179		set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
 180
 181		/*
 182		 * After the PTE entry has been populated once, we
 183		 * only allow updates to the permission attributes.
 184		 */
 185		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
 186					      READ_ONCE(pte_val(*ptep))));
 187
 188		phys += PAGE_SIZE;
 189	} while (ptep++, addr += PAGE_SIZE, addr != end);
 190
 191	pte_clear_fixmap();
 192}
 193
 194static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 195				unsigned long end, phys_addr_t phys,
 196				pgprot_t prot,
 197				phys_addr_t (*pgtable_alloc)(int),
 198				int flags)
 199{
 200	unsigned long next;
 201	pmd_t pmd = READ_ONCE(*pmdp);
 202
 203	BUG_ON(pmd_sect(pmd));
 204	if (pmd_none(pmd)) {
 205		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
 206		phys_addr_t pte_phys;
 207
 208		if (flags & NO_EXEC_MAPPINGS)
 209			pmdval |= PMD_TABLE_PXN;
 210		BUG_ON(!pgtable_alloc);
 211		pte_phys = pgtable_alloc(PAGE_SHIFT);
 212		__pmd_populate(pmdp, pte_phys, pmdval);
 213		pmd = READ_ONCE(*pmdp);
 214	}
 215	BUG_ON(pmd_bad(pmd));
 216
 217	do {
 218		pgprot_t __prot = prot;
 219
 220		next = pte_cont_addr_end(addr, end);
 221
 222		/* use a contiguous mapping if the range is suitably aligned */
 223		if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
 224		    (flags & NO_CONT_MAPPINGS) == 0)
 225			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 226
 227		init_pte(pmdp, addr, next, phys, __prot);
 228
 229		phys += next - addr;
 230	} while (addr = next, addr != end);
 231}
 232
 233static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
 234		     phys_addr_t phys, pgprot_t prot,
 235		     phys_addr_t (*pgtable_alloc)(int), int flags)
 236{
 237	unsigned long next;
 238	pmd_t *pmdp;
 239
 240	pmdp = pmd_set_fixmap_offset(pudp, addr);
 241	do {
 242		pmd_t old_pmd = READ_ONCE(*pmdp);
 243
 244		next = pmd_addr_end(addr, end);
 245
 246		/* try section mapping first */
 247		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
 248		    (flags & NO_BLOCK_MAPPINGS) == 0) {
 249			pmd_set_huge(pmdp, phys, prot);
 250
 251			/*
 252			 * After the PMD entry has been populated once, we
 253			 * only allow updates to the permission attributes.
 254			 */
 255			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
 256						      READ_ONCE(pmd_val(*pmdp))));
 257		} else {
 258			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
 259					    pgtable_alloc, flags);
 260
 261			BUG_ON(pmd_val(old_pmd) != 0 &&
 262			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
 263		}
 264		phys += next - addr;
 265	} while (pmdp++, addr = next, addr != end);
 266
 267	pmd_clear_fixmap();
 268}
 269
 270static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 271				unsigned long end, phys_addr_t phys,
 272				pgprot_t prot,
 273				phys_addr_t (*pgtable_alloc)(int), int flags)
 274{
 275	unsigned long next;
 276	pud_t pud = READ_ONCE(*pudp);
 277
 278	/*
 279	 * Check for initial section mappings in the pgd/pud.
 280	 */
 281	BUG_ON(pud_sect(pud));
 282	if (pud_none(pud)) {
 283		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
 284		phys_addr_t pmd_phys;
 285
 286		if (flags & NO_EXEC_MAPPINGS)
 287			pudval |= PUD_TABLE_PXN;
 288		BUG_ON(!pgtable_alloc);
 289		pmd_phys = pgtable_alloc(PMD_SHIFT);
 290		__pud_populate(pudp, pmd_phys, pudval);
 291		pud = READ_ONCE(*pudp);
 292	}
 293	BUG_ON(pud_bad(pud));
 294
 295	do {
 296		pgprot_t __prot = prot;
 297
 298		next = pmd_cont_addr_end(addr, end);
 299
 300		/* use a contiguous mapping if the range is suitably aligned */
 301		if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
 302		    (flags & NO_CONT_MAPPINGS) == 0)
 303			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 304
 305		init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
 306
 307		phys += next - addr;
 308	} while (addr = next, addr != end);
 309}
 310
 311static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
 312			   phys_addr_t phys, pgprot_t prot,
 313			   phys_addr_t (*pgtable_alloc)(int),
 314			   int flags)
 315{
 316	unsigned long next;
 317	pud_t *pudp;
 318	p4d_t *p4dp = p4d_offset(pgdp, addr);
 319	p4d_t p4d = READ_ONCE(*p4dp);
 320
 321	if (p4d_none(p4d)) {
 322		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
 323		phys_addr_t pud_phys;
 324
 325		if (flags & NO_EXEC_MAPPINGS)
 326			p4dval |= P4D_TABLE_PXN;
 327		BUG_ON(!pgtable_alloc);
 328		pud_phys = pgtable_alloc(PUD_SHIFT);
 329		__p4d_populate(p4dp, pud_phys, p4dval);
 330		p4d = READ_ONCE(*p4dp);
 331	}
 332	BUG_ON(p4d_bad(p4d));
 333
 334	pudp = pud_set_fixmap_offset(p4dp, addr);
 335	do {
 336		pud_t old_pud = READ_ONCE(*pudp);
 337
 338		next = pud_addr_end(addr, end);
 339
 340		/*
 341		 * For 4K granule only, attempt to put down a 1GB block
 342		 */
 343		if (pud_sect_supported() &&
 344		   ((addr | next | phys) & ~PUD_MASK) == 0 &&
 345		    (flags & NO_BLOCK_MAPPINGS) == 0) {
 346			pud_set_huge(pudp, phys, prot);
 347
 348			/*
 349			 * After the PUD entry has been populated once, we
 350			 * only allow updates to the permission attributes.
 351			 */
 352			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
 353						      READ_ONCE(pud_val(*pudp))));
 354		} else {
 355			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
 356					    pgtable_alloc, flags);
 357
 358			BUG_ON(pud_val(old_pud) != 0 &&
 359			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
 360		}
 361		phys += next - addr;
 362	} while (pudp++, addr = next, addr != end);
 363
 364	pud_clear_fixmap();
 365}
 366
 367static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
 368					unsigned long virt, phys_addr_t size,
 369					pgprot_t prot,
 370					phys_addr_t (*pgtable_alloc)(int),
 371					int flags)
 372{
 373	unsigned long addr, end, next;
 374	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
 375
 376	/*
 377	 * If the virtual and physical address don't have the same offset
 378	 * within a page, we cannot map the region as the caller expects.
 379	 */
 380	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
 381		return;
 382
 383	phys &= PAGE_MASK;
 384	addr = virt & PAGE_MASK;
 385	end = PAGE_ALIGN(virt + size);
 386
 387	do {
 388		next = pgd_addr_end(addr, end);
 389		alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
 390			       flags);
 391		phys += next - addr;
 392	} while (pgdp++, addr = next, addr != end);
 393}
 394
 395static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 396				 unsigned long virt, phys_addr_t size,
 397				 pgprot_t prot,
 398				 phys_addr_t (*pgtable_alloc)(int),
 399				 int flags)
 400{
 401	mutex_lock(&fixmap_lock);
 402	__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
 403				    pgtable_alloc, flags);
 404	mutex_unlock(&fixmap_lock);
 405}
 406
 407#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 408extern __alias(__create_pgd_mapping_locked)
 409void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
 410			     phys_addr_t size, pgprot_t prot,
 411			     phys_addr_t (*pgtable_alloc)(int), int flags);
 412#endif
 413
 414static phys_addr_t __pgd_pgtable_alloc(int shift)
 415{
 416	void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL);
 417	BUG_ON(!ptr);
 418
 419	/* Ensure the zeroed page is visible to the page table walker */
 420	dsb(ishst);
 421	return __pa(ptr);
 422}
 423
 424static phys_addr_t pgd_pgtable_alloc(int shift)
 425{
 426	phys_addr_t pa = __pgd_pgtable_alloc(shift);
 427
 428	/*
 429	 * Call proper page table ctor in case later we need to
 430	 * call core mm functions like apply_to_page_range() on
 431	 * this pre-allocated page table.
 432	 *
 433	 * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
 434	 * folded, and if so pgtable_pmd_page_ctor() becomes nop.
 435	 */
 436	if (shift == PAGE_SHIFT)
 437		BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa)));
 438	else if (shift == PMD_SHIFT)
 439		BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
 440
 441	return pa;
 442}
 443
 444/*
 445 * This function can only be used to modify existing table entries,
 446 * without allocating new levels of table. Note that this permits the
 447 * creation of new section or page entries.
 448 */
 449static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
 450				  phys_addr_t size, pgprot_t prot)
 451{
 452	if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
 453		pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
 454			&phys, virt);
 455		return;
 456	}
 457	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
 458			     NO_CONT_MAPPINGS);
 459}
 460
 461void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 462			       unsigned long virt, phys_addr_t size,
 463			       pgprot_t prot, bool page_mappings_only)
 464{
 465	int flags = 0;
 466
 467	BUG_ON(mm == &init_mm);
 468
 469	if (page_mappings_only)
 470		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 471
 472	__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
 473			     pgd_pgtable_alloc, flags);
 474}
 475
 476static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 477				phys_addr_t size, pgprot_t prot)
 478{
 479	if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
 480		pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
 481			&phys, virt);
 482		return;
 483	}
 484
 485	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
 486			     NO_CONT_MAPPINGS);
 487
 488	/* flush the TLBs after updating live kernel mappings */
 489	flush_tlb_kernel_range(virt, virt + size);
 490}
 491
 492static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
 493				  phys_addr_t end, pgprot_t prot, int flags)
 494{
 495	__create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
 496			     prot, early_pgtable_alloc, flags);
 497}
 498
 499void __init mark_linear_text_alias_ro(void)
 500{
 501	/*
 502	 * Remove the write permissions from the linear alias of .text/.rodata
 503	 */
 504	update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
 505			    (unsigned long)__init_begin - (unsigned long)_stext,
 506			    PAGE_KERNEL_RO);
 507}
 508
 509static bool crash_mem_map __initdata;
 510
 511static int __init enable_crash_mem_map(char *arg)
 512{
 513	/*
 514	 * Proper parameter parsing is done by reserve_crashkernel(). We only
 515	 * need to know if the linear map has to avoid block mappings so that
 516	 * the crashkernel reservations can be unmapped later.
 517	 */
 518	crash_mem_map = true;
 519
 520	return 0;
 521}
 522early_param("crashkernel", enable_crash_mem_map);
 523
 524static void __init map_mem(pgd_t *pgdp)
 525{
 526	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
 527	phys_addr_t kernel_start = __pa_symbol(_stext);
 528	phys_addr_t kernel_end = __pa_symbol(__init_begin);
 529	phys_addr_t start, end;
 530	int flags = NO_EXEC_MAPPINGS;
 531	u64 i;
 532
 533	/*
 534	 * Setting hierarchical PXNTable attributes on table entries covering
 535	 * the linear region is only possible if it is guaranteed that no table
 536	 * entries at any level are being shared between the linear region and
 537	 * the vmalloc region. Check whether this is true for the PGD level, in
 538	 * which case it is guaranteed to be true for all other levels as well.
 539	 */
 540	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
 541
 542	if (can_set_direct_map())
 543		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 544
 545	/*
 546	 * Take care not to create a writable alias for the
 547	 * read-only text and rodata sections of the kernel image.
 548	 * So temporarily mark them as NOMAP to skip mappings in
 549	 * the following for-loop
 550	 */
 551	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
 552
 553#ifdef CONFIG_KEXEC_CORE
 554	if (crash_mem_map) {
 555		if (defer_reserve_crashkernel())
 556			flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 557		else if (crashk_res.end)
 558			memblock_mark_nomap(crashk_res.start,
 559			    resource_size(&crashk_res));
 560	}
 561#endif
 562
 563	/* map all the memory banks */
 564	for_each_mem_range(i, &start, &end) {
 565		if (start >= end)
 566			break;
 567		/*
 568		 * The linear map must allow allocation tags reading/writing
 569		 * if MTE is present. Otherwise, it has the same attributes as
 570		 * PAGE_KERNEL.
 571		 */
 572		__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
 573			       flags);
 574	}
 575
 576	/*
 577	 * Map the linear alias of the [_stext, __init_begin) interval
 578	 * as non-executable now, and remove the write permission in
 579	 * mark_linear_text_alias_ro() below (which will be called after
 580	 * alternative patching has completed). This makes the contents
 581	 * of the region accessible to subsystems such as hibernate,
 582	 * but protects it from inadvertent modification or execution.
 583	 * Note that contiguous mappings cannot be remapped in this way,
 584	 * so we should avoid them here.
 585	 */
 586	__map_memblock(pgdp, kernel_start, kernel_end,
 587		       PAGE_KERNEL, NO_CONT_MAPPINGS);
 588	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
 589
 590	/*
 591	 * Use page-level mappings here so that we can shrink the region
 592	 * in page granularity and put back unused memory to buddy system
 593	 * through /sys/kernel/kexec_crash_size interface.
 594	 */
 595#ifdef CONFIG_KEXEC_CORE
 596	if (crash_mem_map && !defer_reserve_crashkernel()) {
 597		if (crashk_res.end) {
 598			__map_memblock(pgdp, crashk_res.start,
 599				       crashk_res.end + 1,
 600				       PAGE_KERNEL,
 601				       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
 602			memblock_clear_nomap(crashk_res.start,
 603					     resource_size(&crashk_res));
 604		}
 605	}
 606#endif
 607}
 608
 609void mark_rodata_ro(void)
 610{
 611	unsigned long section_size;
 612
 613	/*
 614	 * mark .rodata as read only. Use __init_begin rather than __end_rodata
 615	 * to cover NOTES and EXCEPTION_TABLE.
 616	 */
 617	section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
 618	update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
 619			    section_size, PAGE_KERNEL_RO);
 620
 621	debug_checkwx();
 622}
 623
 624static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
 625				      pgprot_t prot, struct vm_struct *vma,
 626				      int flags, unsigned long vm_flags)
 627{
 628	phys_addr_t pa_start = __pa_symbol(va_start);
 629	unsigned long size = va_end - va_start;
 630
 631	BUG_ON(!PAGE_ALIGNED(pa_start));
 632	BUG_ON(!PAGE_ALIGNED(size));
 633
 634	__create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot,
 635			     early_pgtable_alloc, flags);
 636
 637	if (!(vm_flags & VM_NO_GUARD))
 638		size += PAGE_SIZE;
 639
 640	vma->addr	= va_start;
 641	vma->phys_addr	= pa_start;
 642	vma->size	= size;
 643	vma->flags	= VM_MAP | vm_flags;
 644	vma->caller	= __builtin_return_address(0);
 645
 646	vm_area_add_early(vma);
 647}
 648
 649#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
 650static int __init map_entry_trampoline(void)
 651{
 652	int i;
 653
 654	pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 655	phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
 656
 657	/* The trampoline is always mapped and can therefore be global */
 658	pgprot_val(prot) &= ~PTE_NG;
 659
 660	/* Map only the text into the trampoline page table */
 661	memset(tramp_pg_dir, 0, PGD_SIZE);
 662	__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
 663			     entry_tramp_text_size(), prot,
 664			     __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);
 665
 666	/* Map both the text and data into the kernel page table */
 667	for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
 668		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
 669			     pa_start + i * PAGE_SIZE, prot);
 670
 671	if (IS_ENABLED(CONFIG_RELOCATABLE))
 672		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
 673			     pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO);
 674
 675	return 0;
 676}
 677core_initcall(map_entry_trampoline);
 678#endif
 679
 680/*
 681 * Open coded check for BTI, only for use to determine configuration
 682 * for early mappings for before the cpufeature code has run.
 683 */
 684static bool arm64_early_this_cpu_has_bti(void)
 685{
 686	u64 pfr1;
 687
 688	if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
 689		return false;
 690
 691	pfr1 = __read_sysreg_by_encoding(SYS_ID_AA64PFR1_EL1);
 692	return cpuid_feature_extract_unsigned_field(pfr1,
 693						    ID_AA64PFR1_EL1_BT_SHIFT);
 694}
 695
 696/*
 697 * Create fine-grained mappings for the kernel.
 698 */
 699static void __init map_kernel(pgd_t *pgdp)
 700{
 701	static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
 702				vmlinux_initdata, vmlinux_data;
 703
 704	/*
 705	 * External debuggers may need to write directly to the text
 706	 * mapping to install SW breakpoints. Allow this (only) when
 707	 * explicitly requested with rodata=off.
 708	 */
 709	pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 710
 711	/*
 712	 * If we have a CPU that supports BTI and a kernel built for
 713	 * BTI then mark the kernel executable text as guarded pages
 714	 * now so we don't have to rewrite the page tables later.
 715	 */
 716	if (arm64_early_this_cpu_has_bti())
 717		text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
 718
 719	/*
 720	 * Only rodata will be remapped with different permissions later on,
 721	 * all other segments are allowed to use contiguous mappings.
 722	 */
 723	map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0,
 724			   VM_NO_GUARD);
 725	map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
 726			   &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
 727	map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
 728			   &vmlinux_inittext, 0, VM_NO_GUARD);
 729	map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
 730			   &vmlinux_initdata, 0, VM_NO_GUARD);
 731	map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
 732
 733	if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdp, FIXADDR_START)))) {
 734		/*
 735		 * The fixmap falls in a separate pgd to the kernel, and doesn't
 736		 * live in the carveout for the swapper_pg_dir. We can simply
 737		 * re-use the existing dir for the fixmap.
 738		 */
 739		set_pgd(pgd_offset_pgd(pgdp, FIXADDR_START),
 740			READ_ONCE(*pgd_offset_k(FIXADDR_START)));
 741	} else if (CONFIG_PGTABLE_LEVELS > 3) {
 742		pgd_t *bm_pgdp;
 743		p4d_t *bm_p4dp;
 744		pud_t *bm_pudp;
 745		/*
 746		 * The fixmap shares its top level pgd entry with the kernel
 747		 * mapping. This can really only occur when we are running
 748		 * with 16k/4 levels, so we can simply reuse the pud level
 749		 * entry instead.
 750		 */
 751		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
 752		bm_pgdp = pgd_offset_pgd(pgdp, FIXADDR_START);
 753		bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START);
 754		bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START);
 755		pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
 756		pud_clear_fixmap();
 757	} else {
 758		BUG();
 759	}
 760
 761	kasan_copy_shadow(pgdp);
 762}
 763
 764static void __init create_idmap(void)
 765{
 766	u64 start = __pa_symbol(__idmap_text_start);
 767	u64 size = __pa_symbol(__idmap_text_end) - start;
 768	pgd_t *pgd = idmap_pg_dir;
 769	u64 pgd_phys;
 770
 771	/* check if we need an additional level of translation */
 772	if (VA_BITS < 48 && idmap_t0sz < (64 - VA_BITS_MIN)) {
 773		pgd_phys = early_pgtable_alloc(PAGE_SHIFT);
 774		set_pgd(&idmap_pg_dir[start >> VA_BITS],
 775			__pgd(pgd_phys | P4D_TYPE_TABLE));
 776		pgd = __va(pgd_phys);
 777	}
 778	__create_pgd_mapping(pgd, start, start, size, PAGE_KERNEL_ROX,
 779			     early_pgtable_alloc, 0);
 780
 781	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) {
 782		extern u32 __idmap_kpti_flag;
 783		u64 pa = __pa_symbol(&__idmap_kpti_flag);
 784
 785		/*
 786		 * The KPTI G-to-nG conversion code needs a read-write mapping
 787		 * of its synchronization flag in the ID map.
 788		 */
 789		__create_pgd_mapping(pgd, pa, pa, sizeof(u32), PAGE_KERNEL,
 790				     early_pgtable_alloc, 0);
 791	}
 792}
 793
 794void __init paging_init(void)
 795{
 796	pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
 797	extern pgd_t init_idmap_pg_dir[];
 798
 799	idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0));
 800
 801	map_kernel(pgdp);
 802	map_mem(pgdp);
 803
 804	pgd_clear_fixmap();
 805
 806	cpu_replace_ttbr1(lm_alias(swapper_pg_dir), init_idmap_pg_dir);
 807	init_mm.pgd = swapper_pg_dir;
 808
 809	memblock_phys_free(__pa_symbol(init_pg_dir),
 810			   __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
 811
 812	memblock_allow_resize();
 813
 814	create_idmap();
 815}
 816
 817#ifdef CONFIG_MEMORY_HOTPLUG
 818static void free_hotplug_page_range(struct page *page, size_t size,
 819				    struct vmem_altmap *altmap)
 820{
 821	if (altmap) {
 822		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
 823	} else {
 824		WARN_ON(PageReserved(page));
 825		free_pages((unsigned long)page_address(page), get_order(size));
 826	}
 827}
 828
 829static void free_hotplug_pgtable_page(struct page *page)
 830{
 831	free_hotplug_page_range(page, PAGE_SIZE, NULL);
 832}
 833
 834static bool pgtable_range_aligned(unsigned long start, unsigned long end,
 835				  unsigned long floor, unsigned long ceiling,
 836				  unsigned long mask)
 837{
 838	start &= mask;
 839	if (start < floor)
 840		return false;
 841
 842	if (ceiling) {
 843		ceiling &= mask;
 844		if (!ceiling)
 845			return false;
 846	}
 847
 848	if (end - 1 > ceiling - 1)
 849		return false;
 850	return true;
 851}
 852
 853static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
 854				    unsigned long end, bool free_mapped,
 855				    struct vmem_altmap *altmap)
 856{
 857	pte_t *ptep, pte;
 858
 859	do {
 860		ptep = pte_offset_kernel(pmdp, addr);
 861		pte = READ_ONCE(*ptep);
 862		if (pte_none(pte))
 863			continue;
 864
 865		WARN_ON(!pte_present(pte));
 866		pte_clear(&init_mm, addr, ptep);
 867		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 868		if (free_mapped)
 869			free_hotplug_page_range(pte_page(pte),
 870						PAGE_SIZE, altmap);
 871	} while (addr += PAGE_SIZE, addr < end);
 872}
 873
 874static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
 875				    unsigned long end, bool free_mapped,
 876				    struct vmem_altmap *altmap)
 877{
 878	unsigned long next;
 879	pmd_t *pmdp, pmd;
 880
 881	do {
 882		next = pmd_addr_end(addr, end);
 883		pmdp = pmd_offset(pudp, addr);
 884		pmd = READ_ONCE(*pmdp);
 885		if (pmd_none(pmd))
 886			continue;
 887
 888		WARN_ON(!pmd_present(pmd));
 889		if (pmd_sect(pmd)) {
 890			pmd_clear(pmdp);
 891
 892			/*
 893			 * One TLBI should be sufficient here as the PMD_SIZE
 894			 * range is mapped with a single block entry.
 895			 */
 896			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 897			if (free_mapped)
 898				free_hotplug_page_range(pmd_page(pmd),
 899							PMD_SIZE, altmap);
 900			continue;
 901		}
 902		WARN_ON(!pmd_table(pmd));
 903		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
 904	} while (addr = next, addr < end);
 905}
 906
 907static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
 908				    unsigned long end, bool free_mapped,
 909				    struct vmem_altmap *altmap)
 910{
 911	unsigned long next;
 912	pud_t *pudp, pud;
 913
 914	do {
 915		next = pud_addr_end(addr, end);
 916		pudp = pud_offset(p4dp, addr);
 917		pud = READ_ONCE(*pudp);
 918		if (pud_none(pud))
 919			continue;
 920
 921		WARN_ON(!pud_present(pud));
 922		if (pud_sect(pud)) {
 923			pud_clear(pudp);
 924
 925			/*
 926			 * One TLBI should be sufficient here as the PUD_SIZE
 927			 * range is mapped with a single block entry.
 928			 */
 929			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 930			if (free_mapped)
 931				free_hotplug_page_range(pud_page(pud),
 932							PUD_SIZE, altmap);
 933			continue;
 934		}
 935		WARN_ON(!pud_table(pud));
 936		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
 937	} while (addr = next, addr < end);
 938}
 939
 940static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
 941				    unsigned long end, bool free_mapped,
 942				    struct vmem_altmap *altmap)
 943{
 944	unsigned long next;
 945	p4d_t *p4dp, p4d;
 946
 947	do {
 948		next = p4d_addr_end(addr, end);
 949		p4dp = p4d_offset(pgdp, addr);
 950		p4d = READ_ONCE(*p4dp);
 951		if (p4d_none(p4d))
 952			continue;
 953
 954		WARN_ON(!p4d_present(p4d));
 955		unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
 956	} while (addr = next, addr < end);
 957}
 958
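/*
 * Tear down the kernel mapping of [addr, end). With @free_mapped the
 * backing pages are freed as well, via free_hotplug_page_range(), which
 * takes @altmap into account for device-backed vmemmap ranges.
 */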
 959static void unmap_hotplug_range(unsigned long addr, unsigned long end,
 960				bool free_mapped, struct vmem_altmap *altmap)
 961{
 962	unsigned long next;
 963	pgd_t *pgdp, pgd;
 964
 965	/*
 966	 * altmap can only be used as backing memory for the vmemmap mapping.
 967	 * If the backing memory itself is not being freed, then altmap is
 968	 * irrelevant. Warn about this inconsistency when it is
 969	 * encountered.
 970	 */
 971	WARN_ON(!free_mapped && altmap);
 972
 973	do {
 974		next = pgd_addr_end(addr, end);
 975		pgdp = pgd_offset_k(addr);
 976		pgd = READ_ONCE(*pgdp);
 977		if (pgd_none(pgd))
 978			continue;
 979
 980		WARN_ON(!pgd_present(pgd));
 981		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
 982	} while (addr = next, addr < end);
 983}
 984
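/*
 * After unmap_hotplug_range() has cleared the leaf entries, check whether
 * the PTE table attached to @pmdp is now completely empty and, if the
 * floor/ceiling limits permit, clear the PMD entry and free the table
 * page. The PMD and PUD variants below repeat the same pattern one level
 * up.
 */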
 985static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
 986				 unsigned long end, unsigned long floor,
 987				 unsigned long ceiling)
 988{
 989	pte_t *ptep, pte;
 990	unsigned long i, start = addr;
 991
 992	do {
 993		ptep = pte_offset_kernel(pmdp, addr);
 994		pte = READ_ONCE(*ptep);
 995
 996		/*
 997		 * This is just a sanity check which verifies that the pte
 998		 * clearing has been done by the earlier unmap loops.
 999		 */
1000		WARN_ON(!pte_none(pte));
1001	} while (addr += PAGE_SIZE, addr < end);
1002
1003	if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
1004		return;
1005
1006	/*
1007	 * Check whether we can free the pte page if the rest of the
1008	 * entries are empty. Overlap with other regions has been
1009	 * handled by the floor/ceiling check.
1010	 */
1011	ptep = pte_offset_kernel(pmdp, 0UL);
1012	for (i = 0; i < PTRS_PER_PTE; i++) {
1013		if (!pte_none(READ_ONCE(ptep[i])))
1014			return;
1015	}
1016
1017	pmd_clear(pmdp);
1018	__flush_tlb_kernel_pgtable(start);
1019	free_hotplug_pgtable_page(virt_to_page(ptep));
1020}
1021
1022static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
1023				 unsigned long end, unsigned long floor,
1024				 unsigned long ceiling)
1025{
1026	pmd_t *pmdp, pmd;
1027	unsigned long i, next, start = addr;
1028
1029	do {
1030		next = pmd_addr_end(addr, end);
1031		pmdp = pmd_offset(pudp, addr);
1032		pmd = READ_ONCE(*pmdp);
1033		if (pmd_none(pmd))
1034			continue;
1035
1036		WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
1037		free_empty_pte_table(pmdp, addr, next, floor, ceiling);
1038	} while (addr = next, addr < end);
1039
1040	if (CONFIG_PGTABLE_LEVELS <= 2)
1041		return;
1042
1043	if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
1044		return;
1045
1046	/*
1047	 * Check whether we can free the pmd page if the rest of the
1048	 * entries are empty. Overlap with other regions has been
1049	 * handled by the floor/ceiling check.
1050	 */
1051	pmdp = pmd_offset(pudp, 0UL);
1052	for (i = 0; i < PTRS_PER_PMD; i++) {
1053		if (!pmd_none(READ_ONCE(pmdp[i])))
1054			return;
1055	}
1056
1057	pud_clear(pudp);
1058	__flush_tlb_kernel_pgtable(start);
1059	free_hotplug_pgtable_page(virt_to_page(pmdp));
1060}
1061
1062static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
1063				 unsigned long end, unsigned long floor,
1064				 unsigned long ceiling)
1065{
1066	pud_t *pudp, pud;
1067	unsigned long i, next, start = addr;
1068
1069	do {
1070		next = pud_addr_end(addr, end);
1071		pudp = pud_offset(p4dp, addr);
1072		pud = READ_ONCE(*pudp);
1073		if (pud_none(pud))
1074			continue;
1075
1076		WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
1077		free_empty_pmd_table(pudp, addr, next, floor, ceiling);
1078	} while (addr = next, addr < end);
1079
1080	if (CONFIG_PGTABLE_LEVELS <= 3)
1081		return;
1082
1083	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
1084		return;
1085
1086	/*
1087	 * Check whether we can free the pud page if the rest of the
1088	 * entries are empty. Overlap with other regions have been
1089	 * handled by the floor/ceiling check.
1090	 */
1091	pudp = pud_offset(p4dp, 0UL);
1092	for (i = 0; i < PTRS_PER_PUD; i++) {
1093		if (!pud_none(READ_ONCE(pudp[i])))
1094			return;
1095	}
1096
1097	p4d_clear(p4dp);
1098	__flush_tlb_kernel_pgtable(start);
1099	free_hotplug_pgtable_page(virt_to_page(pudp));
1100}
1101
1102static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
1103				 unsigned long end, unsigned long floor,
1104				 unsigned long ceiling)
1105{
1106	unsigned long next;
1107	p4d_t *p4dp, p4d;
1108
1109	do {
1110		next = p4d_addr_end(addr, end);
1111		p4dp = p4d_offset(pgdp, addr);
1112		p4d = READ_ONCE(*p4dp);
1113		if (p4d_none(p4d))
1114			continue;
1115
1116		WARN_ON(!p4d_present(p4d));
1117		free_empty_pud_table(p4dp, addr, next, floor, ceiling);
1118	} while (addr = next, addr < end);
1119}
1120
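/*
 * Free any page table pages in [addr, end) that have become empty after
 * unmapping, bounded by the [floor, ceiling) limits of the enclosing
 * region (the linear map or the vmemmap).
 */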
1121static void free_empty_tables(unsigned long addr, unsigned long end,
1122			      unsigned long floor, unsigned long ceiling)
1123{
1124	unsigned long next;
1125	pgd_t *pgdp, pgd;
1126
1127	do {
1128		next = pgd_addr_end(addr, end);
1129		pgdp = pgd_offset_k(addr);
1130		pgd = READ_ONCE(*pgdp);
1131		if (pgd_none(pgd))
1132			continue;
1133
1134		WARN_ON(!pgd_present(pgd));
1135		free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
1136	} while (addr = next, addr < end);
1137}
1138#endif
1139
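/*
 * Helpers for the generic vmemmap_populate_hugepages() path: install a
 * PMD-sized section mapping for the vmemmap, and confirm that an existing
 * PMD entry already provides a valid mapping.
 */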
1140void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
1141			       unsigned long addr, unsigned long next)
1142{
1143	pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
1144}
1145
1146int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
1147				unsigned long addr, unsigned long next)
1148{
1149	vmemmap_verify((pte_t *)pmdp, node, addr, next);
1150	return 1;
1151}
1152
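/*
 * The vmemmap is populated with base pages on 16K and 64K page kernels,
 * and with PMD-sized huge mappings on 4K page kernels.
 */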
1153int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
1154		struct vmem_altmap *altmap)
1155{
1156	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
1157
1158	if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES))
1159		return vmemmap_populate_basepages(start, end, node, altmap);
1160	else
1161		return vmemmap_populate_hugepages(start, end, node, altmap);
1162}
1163
1164#ifdef CONFIG_MEMORY_HOTPLUG
1165void vmemmap_free(unsigned long start, unsigned long end,
1166		struct vmem_altmap *altmap)
1167{
1168	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
1169
1170	unmap_hotplug_range(start, end, true, altmap);
1171	free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
1172}
1173#endif /* CONFIG_MEMORY_HOTPLUG */
1174
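/*
 * Walk down to the fixmap's pud/pmd/pte entry covering @addr. The pte
 * level always lives in the statically allocated bm_pte[] table.
 */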
1175static inline pud_t *fixmap_pud(unsigned long addr)
1176{
1177	pgd_t *pgdp = pgd_offset_k(addr);
1178	p4d_t *p4dp = p4d_offset(pgdp, addr);
1179	p4d_t p4d = READ_ONCE(*p4dp);
1180
1181	BUG_ON(p4d_none(p4d) || p4d_bad(p4d));
1182
1183	return pud_offset_kimg(p4dp, addr);
1184}
1185
1186static inline pmd_t *fixmap_pmd(unsigned long addr)
1187{
1188	pud_t *pudp = fixmap_pud(addr);
1189	pud_t pud = READ_ONCE(*pudp);
1190
1191	BUG_ON(pud_none(pud) || pud_bad(pud));
1192
1193	return pmd_offset_kimg(pudp, addr);
1194}
1195
1196static inline pte_t *fixmap_pte(unsigned long addr)
1197{
1198	return &bm_pte[pte_index(addr)];
1199}
1200
1201/*
1202 * The p*d_populate functions call virt_to_phys implicitly so they can't be used
1203 * directly on kernel symbols (bm_p*d). This function is called too early to use
1204 * lm_alias so __p*d_populate functions must be used to populate with the
1205 * physical address from __pa_symbol.
1206 */
1207void __init early_fixmap_init(void)
1208{
1209	pgd_t *pgdp;
1210	p4d_t *p4dp, p4d;
1211	pud_t *pudp;
1212	pmd_t *pmdp;
1213	unsigned long addr = FIXADDR_START;
1214
1215	pgdp = pgd_offset_k(addr);
1216	p4dp = p4d_offset(pgdp, addr);
1217	p4d = READ_ONCE(*p4dp);
1218	if (CONFIG_PGTABLE_LEVELS > 3 &&
1219	    !(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) {
1220		/*
1221		 * We only end up here if the kernel mapping and the fixmap
1222		 * share the top level pgd entry, which should only happen on
1223		 * 16k/4-level configurations.
1224		 */
1225		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
1226		pudp = pud_offset_kimg(p4dp, addr);
1227	} else {
1228		if (p4d_none(p4d))
1229			__p4d_populate(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE);
1230		pudp = fixmap_pud(addr);
1231	}
1232	if (pud_none(READ_ONCE(*pudp)))
1233		__pud_populate(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE);
1234	pmdp = fixmap_pmd(addr);
1235	__pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);
1236
1237	/*
1238	 * The boot-ioremap range spans multiple pmds, for which
1239	 * we are not prepared:
1240	 */
1241	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
1242		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
1243
1244	if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
1245	     || pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
1246		WARN_ON(1);
1247		pr_warn("pmdp %p != %p, %p\n",
1248			pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
1249			fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
1250		pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
1251			fix_to_virt(FIX_BTMAP_BEGIN));
1252		pr_warn("fix_to_virt(FIX_BTMAP_END):   %08lx\n",
1253			fix_to_virt(FIX_BTMAP_END));
1254
1255		pr_warn("FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
1256		pr_warn("FIX_BTMAP_BEGIN:     %d\n", FIX_BTMAP_BEGIN);
1257	}
1258}
1259
1260/*
1261 * Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we
1262 * ever need to use IPIs for TLB broadcasting, then we're in trouble here.
1263 */
1264void __set_fixmap(enum fixed_addresses idx,
1265			       phys_addr_t phys, pgprot_t flags)
1266{
1267	unsigned long addr = __fix_to_virt(idx);
1268	pte_t *ptep;
1269
1270	BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
1271
1272	ptep = fixmap_pte(addr);
1273
1274	if (pgprot_val(flags)) {
1275		set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
1276	} else {
1277		pte_clear(&init_mm, addr, ptep);
1278		flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
1279	}
1280}
1281
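/*
 * Map the device tree blob at @dt_phys into the fixmap and return its
 * virtual address, or NULL if the FDT is missing, insufficiently aligned,
 * has a bad magic value or is larger than MAX_FDT_SIZE. On success *size
 * is set to the total size reported by the FDT header.
 */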
1282void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
1283{
1284	const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
1285	int offset;
1286	void *dt_virt;
1287
1288	/*
1289	 * Check whether the physical FDT address is set and meets the minimum
1290	 * alignment requirement. We rely on MIN_FDT_ALIGN being at least
1291	 * 8 bytes so that we can always access the magic and size fields of
1292	 * the FDT header after mapping the first chunk; double check here
1293	 * that this is indeed the case.
1294	 */
1295	BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
1296	if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
1297		return NULL;
1298
1299	/*
1300	 * Make sure that the FDT region can be mapped without the need to
1301	 * allocate additional translation table pages, so that it is safe
1302	 * to call create_mapping_noalloc() this early.
1303	 *
1304	 * On 64k pages, the FDT will be mapped using PTEs, so we need to
1305	 * be in the same PMD as the rest of the fixmap.
1306	 * On 4k pages, we'll use section mappings for the FDT so we only
1307	 * have to be in the same PUD.
1308	 */
1309	BUILD_BUG_ON(dt_virt_base % SZ_2M);
1310
1311	BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> SWAPPER_TABLE_SHIFT !=
1312		     __fix_to_virt(FIX_BTMAP_BEGIN) >> SWAPPER_TABLE_SHIFT);
1313
1314	offset = dt_phys % SWAPPER_BLOCK_SIZE;
1315	dt_virt = (void *)dt_virt_base + offset;
1316
1317	/* map the first chunk so we can read the size from the header */
1318	create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
1319			dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
1320
1321	if (fdt_magic(dt_virt) != FDT_MAGIC)
1322		return NULL;
1323
1324	*size = fdt_totalsize(dt_virt);
1325	if (*size > MAX_FDT_SIZE)
1326		return NULL;
1327
1328	if (offset + *size > SWAPPER_BLOCK_SIZE)
1329		create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
1330			       round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);
1331
1332	return dt_virt;
1333}
1334
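/*
 * Install a section (block) mapping at the PUD or PMD level; used by the
 * generic code when creating huge vmalloc/ioremap mappings.
 */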
1335int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
1336{
1337	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
1338
1339	/* Only allow permission changes for now */
1340	if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
1341				   pud_val(new_pud)))
1342		return 0;
1343
1344	VM_BUG_ON(phys & ~PUD_MASK);
1345	set_pud(pudp, new_pud);
1346	return 1;
1347}
1348
1349int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
1350{
1351	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));
1352
1353	/* Only allow permission changes for now */
1354	if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
1355				   pmd_val(new_pmd)))
1356		return 0;
1357
1358	VM_BUG_ON(phys & ~PMD_MASK);
1359	set_pmd(pmdp, new_pmd);
1360	return 1;
1361}
1362
1363int pud_clear_huge(pud_t *pudp)
1364{
1365	if (!pud_sect(READ_ONCE(*pudp)))
1366		return 0;
1367	pud_clear(pudp);
1368	return 1;
1369}
1370
1371int pmd_clear_huge(pmd_t *pmdp)
1372{
1373	if (!pmd_sect(READ_ONCE(*pmdp)))
1374		return 0;
1375	pmd_clear(pmdp);
1376	return 1;
1377}
1378
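/*
 * Free the next-level table hanging off a PMD or PUD entry and clear the
 * entry itself, so that the generic huge-vmap code can replace it with a
 * block mapping.
 */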
1379int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
1380{
1381	pte_t *table;
1382	pmd_t pmd;
1383
1384	pmd = READ_ONCE(*pmdp);
1385
1386	if (!pmd_table(pmd)) {
1387		VM_WARN_ON(1);
1388		return 1;
1389	}
1390
1391	table = pte_offset_kernel(pmdp, addr);
1392	pmd_clear(pmdp);
1393	__flush_tlb_kernel_pgtable(addr);
1394	pte_free_kernel(NULL, table);
1395	return 1;
1396}
1397
1398int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
1399{
1400	pmd_t *table;
1401	pmd_t *pmdp;
1402	pud_t pud;
1403	unsigned long next, end;
1404
1405	pud = READ_ONCE(*pudp);
1406
1407	if (!pud_table(pud)) {
1408		VM_WARN_ON(1);
1409		return 1;
1410	}
1411
1412	table = pmd_offset(pudp, addr);
1413	pmdp = table;
1414	next = addr;
1415	end = addr + PUD_SIZE;
1416	do {
1417		pmd_free_pte_page(pmdp, next);
1418	} while (pmdp++, next += PMD_SIZE, next != end);
1419
1420	pud_clear(pudp);
1421	__flush_tlb_kernel_pgtable(addr);
1422	pmd_free(NULL, table);
1423	return 1;
1424}
1425
1426#ifdef CONFIG_MEMORY_HOTPLUG
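/*
 * Tear down a linear-map range created by arch_add_memory(): unmap it
 * without freeing the backing pages (the hot-remove core handles those)
 * and release any page table pages that end up empty.
 */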
1427static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
1428{
1429	unsigned long end = start + size;
1430
1431	WARN_ON(pgdir != init_mm.pgd);
1432	WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
1433
1434	unmap_hotplug_range(start, end, false, NULL);
1435	free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
1436}
1437
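/*
 * Report the physical address range that the linear map can cover and
 * which is therefore eligible for memory hot-add.
 */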
1438struct range arch_get_mappable_range(void)
1439{
1440	struct range mhp_range;
1441	u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
1442	u64 end_linear_pa = __pa(PAGE_END - 1);
1443
1444	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
1445		/*
1446		 * Check for a wrap: because of the randomized linear mapping,
1447		 * the start physical address can actually be bigger than the
1448		 * end physical address. In this case set start to zero, because
1449		 * the [0, end_linear_pa] range must still be able to cover all
1450		 * addressable physical addresses.
1451		 */
1452		if (start_linear_pa > end_linear_pa)
1453			start_linear_pa = 0;
1454	}
1455
1456	WARN_ON(start_linear_pa > end_linear_pa);
1457
1458	/*
1459	 * The linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)],
1460	 * including both of its ends but excluding PAGE_END. The maximum
1461	 * physical range that can be mapped inside this linear mapping range
1462	 * must also be derived from its end points.
1463	 */
1464	mhp_range.start = start_linear_pa;
1465	mhp_range.end =  end_linear_pa;
1466
1467	return mhp_range;
1468}
1469
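/*
 * Hot-add @size bytes of memory at physical address @start: map the range
 * into the linear map, then hand it to the core hotplug code via
 * __add_pages(), undoing the mapping if that fails.
 */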
1470int arch_add_memory(int nid, u64 start, u64 size,
1471		    struct mhp_params *params)
1472{
1473	int ret, flags = NO_EXEC_MAPPINGS;
1474
1475	VM_BUG_ON(!mhp_range_allowed(start, size, true));
1476
1477	if (can_set_direct_map())
1478		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
1479
1480	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
1481			     size, params->pgprot, __pgd_pgtable_alloc,
1482			     flags);
1483
1484	memblock_clear_nomap(start, size);
1485
1486	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
1487			   params);
1488	if (ret)
1489		__remove_pgd_mapping(swapper_pg_dir,
1490				     __phys_to_virt(start), size);
1491	else {
1492		max_pfn = PFN_UP(start + size);
1493		max_low_pfn = max_pfn;
1494	}
1495
1496	return ret;
1497}
1498
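/*
 * Reverse of arch_add_memory(): release the pages from the core hotplug
 * code and tear down the corresponding linear mapping.
 */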
1499void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
1500{
1501	unsigned long start_pfn = start >> PAGE_SHIFT;
1502	unsigned long nr_pages = size >> PAGE_SHIFT;
1503
1504	__remove_pages(start_pfn, nr_pages, altmap);
1505	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
1506}
1507
1508/*
1509 * This memory hotplug notifier helps prevent boot memory from being
1510 * inadvertently removed, as it blocks the pfn range offlining process
1511 * in __offline_pages(). Hence it prevents both offlining and removal
1512 * of boot memory, which is initially always online. In the future, if
1513 * and when boot memory can be removed, this notifier should be dropped
1514 * and free_hotplug_page_range() should handle any reserved pages
1515 * allocated during boot.
1516 */
1517static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
1518					   unsigned long action, void *data)
1519{
1520	struct mem_section *ms;
1521	struct memory_notify *arg = data;
1522	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
1523	unsigned long pfn = arg->start_pfn;
1524
1525	if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
1526		return NOTIFY_OK;
1527
1528	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1529		unsigned long start = PFN_PHYS(pfn);
1530		unsigned long end = start + (1UL << PA_SECTION_SHIFT);
1531
1532		ms = __pfn_to_section(pfn);
1533		if (!early_section(ms))
1534			continue;
1535
1536		if (action == MEM_GOING_OFFLINE) {
1537			/*
1538			 * Boot memory removal is not supported. Prevent
1539			 * it by blocking any attempted offline request
1540			 * for the boot memory and just report it.
1541			 */
1542			pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end);
1543			return NOTIFY_BAD;
1544		} else if (action == MEM_OFFLINE) {
1545			/*
1546			 * This should never have happened. Boot memory
1547			 * offlining should have been prevented by this
1548			 * very notifier. Some memory removal procedure
1549			 * has probably changed, which would then require
1550			 * further debugging.
1551			 */
1552			pr_err("Boot memory [%lx %lx] offlined\n", start, end);
1553
1554			/*
1555			 * Core memory hotplug does not process a return
1556			 * code from the notifier for MEM_OFFLINE events.
1557			 * The error condition has been reported. Return
1558			 * from here as if ignored.
1559			 */
1560			return NOTIFY_DONE;
1561		}
1562	}
1563	return NOTIFY_OK;
1564}
1565
1566static struct notifier_block prevent_bootmem_remove_nb = {
1567	.notifier_call = prevent_bootmem_remove_notifier,
1568};
1569
1570/*
1571 * This ensures that boot memory sections on the platform are online
1572 * from early boot. A memory section cannot be prevented from being
1573 * offlined if, for some reason, it is not online to begin with.
1574 * This helps validate the basic assumption on which the above memory
1575 * event notifier works to prevent boot memory section offlining and
1576 * its possible removal.
1577 */
1578static void validate_bootmem_online(void)
1579{
1580	phys_addr_t start, end, addr;
1581	struct mem_section *ms;
1582	u64 i;
1583
1584	/*
1585	 * Scanning across all memblock regions might be expensive
1586	 * on some big memory systems. Hence enable this
1587	 * validation only with DEBUG_VM.
1588	 */
1589	if (!IS_ENABLED(CONFIG_DEBUG_VM))
1590		return;
1591
1592	for_each_mem_range(i, &start, &end) {
1593		for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) {
1594			ms = __pfn_to_section(PHYS_PFN(addr));
1595
1596			/*
1597			 * All memory ranges in the system at this point
1598			 * should have been marked as early sections.
1599			 */
1600			WARN_ON(!early_section(ms));
1601
1602			/*
1603			 * The memory notifier mechanism used here to prevent
1604			 * boot memory offlining depends on each early memory
1605			 * section on the system being initially online.
1606			 * Otherwise a given memory section which is already
1607			 * offline will be overlooked and can be removed
1608			 * completely. Call out such sections.
1609			 */
1610			if (!online_section(ms))
1611				pr_err("Boot memory [%llx %llx] is offline, can be removed\n",
1612					addr, addr + (1UL << PA_SECTION_SHIFT));
1613		}
1614	}
1615}
1616
1617static int __init prevent_bootmem_remove_init(void)
1618{
1619	int ret = 0;
1620
1621	if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
1622		return ret;
1623
1624	validate_bootmem_online();
1625	ret = register_memory_notifier(&prevent_bootmem_remove_nb);
1626	if (ret)
1627		pr_err("%s: Notifier registration failed %d\n", __func__, ret);
1628
1629	return ret;
1630}
1631early_initcall(prevent_bootmem_remove_init);
1632#endif
1633
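/*
 * Start/commit pair for transactional PTE protection updates; the start
 * half may need a full break-before-make sequence on CPUs affected by
 * erratum #2645198 (see the comment below).
 */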
1634pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
1635{
1636	if (IS_ENABLED(CONFIG_ARM64_ERRATUM_2645198) &&
1637	    cpus_have_const_cap(ARM64_WORKAROUND_2645198)) {
1638		/*
1639		 * Break-before-make (BBM) is required for all user space mappings
1640		 * when the permission changes from executable to non-executable
1641		 * in cases where the CPU is affected by erratum #2645198.
1642		 */
1643		if (pte_user_exec(READ_ONCE(*ptep)))
1644			return ptep_clear_flush(vma, addr, ptep);
1645	}
1646	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
1647}
1648
1649void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
1650			     pte_t old_pte, pte_t pte)
1651{
1652	set_pte_at(vma->vm_mm, addr, ptep, pte);
1653}