Linux Audio

Check our new training course

Loading...
v6.8
  1// SPDX-License-Identifier: GPL-2.0-or-later
  2/*
  3 *  This file contains pgtable related functions for 64-bit machines.
  4 *
  5 *  Derived from arch/ppc64/mm/init.c
  6 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
  7 *
  8 *  Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
  9 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 10 *    Copyright (C) 1996 Paul Mackerras
 11 *
 12 *  Derived from "arch/i386/mm/init.c"
 13 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 14 *
 15 *  Dave Engebretsen <engebret@us.ibm.com>
 16 *      Rework for PPC64 port.
 
 
 
 
 
 
 17 */
 18
 19#include <linux/signal.h>
 20#include <linux/sched.h>
 21#include <linux/kernel.h>
 22#include <linux/errno.h>
 23#include <linux/string.h>
 24#include <linux/export.h>
 25#include <linux/types.h>
 26#include <linux/mman.h>
 27#include <linux/mm.h>
 28#include <linux/swap.h>
 29#include <linux/stddef.h>
 30#include <linux/vmalloc.h>
 
 31#include <linux/slab.h>
 32#include <linux/hugetlb.h>
 33
 
 34#include <asm/page.h>
 
 
 35#include <asm/mmu_context.h>
 
 36#include <asm/mmu.h>
 37#include <asm/smp.h>
 38#include <asm/machdep.h>
 39#include <asm/tlb.h>
 40#include <asm/processor.h>
 41#include <asm/cputable.h>
 42#include <asm/sections.h>
 43#include <asm/firmware.h>
 44#include <asm/dma.h>
 45
 46#include <mm/mmu_decl.h>
 47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 48
 49#ifdef CONFIG_PPC_BOOK3S_64
 50/*
 51 * partition table and process table for ISA 3.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 52 */
 53struct prtb_entry *process_tb;	/* process table */
 54struct patb_entry *partition_tb;	/* partition table */
 55/*
 56 * page table size
 * Every variable defined below, with the single exception of
 * __kernel_io_end, is made available to modules via EXPORT_SYMBOL().
 * NOTE(review): none of these has an initializer here; presumably they
 * are filled in at boot by the active MMU backend -- confirm against the
 * radix/hash setup code.
 57 */
 58unsigned long __pte_index_size;
 59EXPORT_SYMBOL(__pte_index_size);
 60unsigned long __pmd_index_size;
 61EXPORT_SYMBOL(__pmd_index_size);
 62unsigned long __pud_index_size;
 63EXPORT_SYMBOL(__pud_index_size);
 64unsigned long __pgd_index_size;
 65EXPORT_SYMBOL(__pgd_index_size);
 66unsigned long __pud_cache_index;
 67EXPORT_SYMBOL(__pud_cache_index);
 68unsigned long __pte_table_size;
 69EXPORT_SYMBOL(__pte_table_size);
 70unsigned long __pmd_table_size;
 71EXPORT_SYMBOL(__pmd_table_size);
 72unsigned long __pud_table_size;
 73EXPORT_SYMBOL(__pud_table_size);
 74unsigned long __pgd_table_size;
 75EXPORT_SYMBOL(__pgd_table_size);
 76unsigned long __pmd_val_bits;
 77EXPORT_SYMBOL(__pmd_val_bits);
 78unsigned long __pud_val_bits;
 79EXPORT_SYMBOL(__pud_val_bits);
 80unsigned long __pgd_val_bits;
 81EXPORT_SYMBOL(__pgd_val_bits);
/* NOTE(review): names suggest kernel virtual layout boundaries -- confirm. */
 82unsigned long __kernel_virt_start;
 83EXPORT_SYMBOL(__kernel_virt_start);
 84unsigned long __vmalloc_start;
 85EXPORT_SYMBOL(__vmalloc_start);
 86unsigned long __vmalloc_end;
 87EXPORT_SYMBOL(__vmalloc_end);
 88unsigned long __kernel_io_start;
 89EXPORT_SYMBOL(__kernel_io_start);
/* Not exported, unlike its __kernel_io_start counterpart. */
 90unsigned long __kernel_io_end;
 91struct page *vmemmap;
 92EXPORT_SYMBOL(vmemmap);
 93unsigned long __pte_frag_nr;
 94EXPORT_SYMBOL(__pte_frag_nr);
 95unsigned long __pte_frag_size_shift;
 96EXPORT_SYMBOL(__pte_frag_size_shift);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 97#endif
 98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 99#ifndef __PAGETABLE_PUD_FOLDED
100/* 4 level page table */
101struct page *p4d_page(p4d_t p4d)
102{
103	if (p4d_is_leaf(p4d)) {
104		if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
105			VM_WARN_ON(!p4d_huge(p4d));
106		return pte_page(p4d_pte(p4d));
107	}
108	return virt_to_page(p4d_pgtable(p4d));
109}
110#endif
111
112struct page *pud_page(pud_t pud)
113{
114	if (pud_is_leaf(pud)) {
115		if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
116			VM_WARN_ON(!pud_huge(pud));
117		return pte_page(pud_pte(pud));
118	}
119	return virt_to_page(pud_pgtable(pud));
120}
121
122/*
123 * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
124 * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
125 */
126struct page *pmd_page(pmd_t pmd)
127{
128	if (pmd_is_leaf(pmd)) {
 
 
 
 
 
 
 
 
 
 
 
 
 
129		/*
130		 * vmalloc_to_page may be called on any vmap address (not only
131		 * vmalloc), and it uses pmd_page() etc., when huge vmap is
132		 * enabled so these checks can't be used.
133		 */
134		if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
135			VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd)));
136		return pte_page(pmd_pte(pmd));
137	}
138	return virt_to_page(pmd_page_vaddr(pmd));
 
139}
140
141#ifdef CONFIG_STRICT_KERNEL_RWX
142void mark_rodata_ro(void)
143{
144	if (!mmu_has_feature(MMU_FTR_KERNEL_RO)) {
145		pr_warn("Warning: Unable to mark rodata read only on this CPU.\n");
146		return;
 
 
 
 
 
147	}
148
149	if (radix_enabled())
150		radix__mark_rodata_ro();
151	else
152		hash__mark_rodata_ro();
 
 
 
 
 
 
 
 
153
154	// mark_initmem_nx() should have already run by now
155	ptdump_check_wx();
156}
157
/*
 * Dispatch the "mark init memory non-executable" request to the radix or
 * hash implementation, depending on which MMU mode is enabled.
 */
void mark_initmem_nx(void)
{
	if (!radix_enabled()) {
		hash__mark_initmem_nx();
		return;
	}

	radix__mark_initmem_nx();
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165#endif
v4.6
 
  1/*
  2 *  This file contains ioremap and related functions for 64-bit machines.
  3 *
  4 *  Derived from arch/ppc64/mm/init.c
  5 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
  6 *
  7 *  Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
  8 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
  9 *    Copyright (C) 1996 Paul Mackerras
 10 *
 11 *  Derived from "arch/i386/mm/init.c"
 12 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 13 *
 14 *  Dave Engebretsen <engebret@us.ibm.com>
 15 *      Rework for PPC64 port.
 16 *
 17 *  This program is free software; you can redistribute it and/or
 18 *  modify it under the terms of the GNU General Public License
 19 *  as published by the Free Software Foundation; either version
 20 *  2 of the License, or (at your option) any later version.
 21 *
 22 */
 23
 24#include <linux/signal.h>
 25#include <linux/sched.h>
 26#include <linux/kernel.h>
 27#include <linux/errno.h>
 28#include <linux/string.h>
 29#include <linux/export.h>
 30#include <linux/types.h>
 31#include <linux/mman.h>
 32#include <linux/mm.h>
 33#include <linux/swap.h>
 34#include <linux/stddef.h>
 35#include <linux/vmalloc.h>
 36#include <linux/memblock.h>
 37#include <linux/slab.h>
 38#include <linux/hugetlb.h>
 39
 40#include <asm/pgalloc.h>
 41#include <asm/page.h>
 42#include <asm/prom.h>
 43#include <asm/io.h>
 44#include <asm/mmu_context.h>
 45#include <asm/pgtable.h>
 46#include <asm/mmu.h>
 47#include <asm/smp.h>
 48#include <asm/machdep.h>
 49#include <asm/tlb.h>
 50#include <asm/processor.h>
 51#include <asm/cputable.h>
 52#include <asm/sections.h>
 53#include <asm/firmware.h>
 54#include <asm/dma.h>
 55
 56#include "mmu_decl.h"
 57
 58#define CREATE_TRACE_POINTS
 59#include <trace/events/thp.h>
 60
 61/* Some sanity checking */
 62#if TASK_SIZE_USER64 > PGTABLE_RANGE
 63#error TASK_SIZE_USER64 exceeds pagetable range
 64#endif
 65
 66#ifdef CONFIG_PPC_STD_MMU_64
 67#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
 68#error TASK_SIZE_USER64 exceeds user VSID range
 69#endif
 70#endif
 71
 72unsigned long ioremap_bot = IOREMAP_BASE;
 73
 74#ifdef CONFIG_PPC_MMU_NOHASH
 75static __ref void *early_alloc_pgtable(unsigned long size)
 76{
 77	void *pt;
 78
 79	pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
 80	memset(pt, 0, size);
 81
 82	return pt;
 83}
 84#endif /* CONFIG_PPC_MMU_NOHASH */
 85
 
 86/*
 87 * map_kernel_page currently only called by __ioremap
 88 * map_kernel_page adds an entry to the ioremap page table
 89 * and adds an entry to the HPT, possibly bolting it
 90 */
 91int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
 92{
 93	pgd_t *pgdp;
 94	pud_t *pudp;
 95	pmd_t *pmdp;
 96	pte_t *ptep;
 97
 98	if (slab_is_available()) {
 99		pgdp = pgd_offset_k(ea);
100		pudp = pud_alloc(&init_mm, pgdp, ea);
101		if (!pudp)
102			return -ENOMEM;
103		pmdp = pmd_alloc(&init_mm, pudp, ea);
104		if (!pmdp)
105			return -ENOMEM;
106		ptep = pte_alloc_kernel(pmdp, ea);
107		if (!ptep)
108			return -ENOMEM;
109		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
110							  __pgprot(flags)));
111	} else {
112#ifdef CONFIG_PPC_MMU_NOHASH
113		pgdp = pgd_offset_k(ea);
114#ifdef PUD_TABLE_SIZE
115		if (pgd_none(*pgdp)) {
116			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
117			BUG_ON(pudp == NULL);
118			pgd_populate(&init_mm, pgdp, pudp);
119		}
120#endif /* PUD_TABLE_SIZE */
121		pudp = pud_offset(pgdp, ea);
122		if (pud_none(*pudp)) {
123			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
124			BUG_ON(pmdp == NULL);
125			pud_populate(&init_mm, pudp, pmdp);
126		}
127		pmdp = pmd_offset(pudp, ea);
128		if (!pmd_present(*pmdp)) {
129			ptep = early_alloc_pgtable(PAGE_SIZE);
130			BUG_ON(ptep == NULL);
131			pmd_populate_kernel(&init_mm, pmdp, ptep);
132		}
133		ptep = pte_offset_kernel(pmdp, ea);
134		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
135							  __pgprot(flags)));
136#else /* CONFIG_PPC_MMU_NOHASH */
137		/*
138		 * If the mm subsystem is not fully up, we cannot create a
139		 * linux page table entry for this mapping.  Simply bolt an
140		 * entry in the hardware page table.
141		 *
142		 */
143		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
144				      mmu_io_psize, mmu_kernel_ssize)) {
145			printk(KERN_ERR "Failed to do bolted mapping IO "
146			       "memory at %016lx !\n", pa);
147			return -ENOMEM;
148		}
149#endif /* !CONFIG_PPC_MMU_NOHASH */
150	}
151
152	smp_wmb();
153	return 0;
154}
155
156
157/**
158 * __ioremap_at - Low level function to establish the page tables
159 *                for an IO mapping
160 */
161void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
162			    unsigned long flags)
163{
164	unsigned long i;
165
166	/* Make sure we have the base flags */
167	if ((flags & _PAGE_PRESENT) == 0)
168		flags |= pgprot_val(PAGE_KERNEL);
169
170	/* Non-cacheable page cannot be coherent */
171	if (flags & _PAGE_NO_CACHE)
172		flags &= ~_PAGE_COHERENT;
173
174	/* We don't support the 4K PFN hack with ioremap */
175	if (flags & _PAGE_4K_PFN)
176		return NULL;
177
178	WARN_ON(pa & ~PAGE_MASK);
179	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
180	WARN_ON(size & ~PAGE_MASK);
181
182	for (i = 0; i < size; i += PAGE_SIZE)
183		if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
184			return NULL;
185
186	return (void __iomem *)ea;
187}
188
189/**
190 * __iounmap_at - Low level function to tear down the page tables
191 *                for an IO mapping. This is used for mappings that
192 *                are manipulated manually, like partial unmapping of
193 *                PCI IOs or ISA space.
194 */
195void __iounmap_at(void *ea, unsigned long size)
196{
197	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
198	WARN_ON(size & ~PAGE_MASK);
199
200	unmap_kernel_range((unsigned long)ea, size);
201}
202
203void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
204				unsigned long flags, void *caller)
205{
206	phys_addr_t paligned;
207	void __iomem *ret;
208
209	/*
210	 * Choose an address to map it to.
211	 * Once the imalloc system is running, we use it.
212	 * Before that, we map using addresses going
213	 * up from ioremap_bot.  imalloc will use
214	 * the addresses from ioremap_bot through
215	 * IMALLOC_END
216	 * 
217	 */
218	paligned = addr & PAGE_MASK;
219	size = PAGE_ALIGN(addr + size) - paligned;
220
221	if ((size == 0) || (paligned == 0))
222		return NULL;
223
224	if (slab_is_available()) {
225		struct vm_struct *area;
226
227		area = __get_vm_area_caller(size, VM_IOREMAP,
228					    ioremap_bot, IOREMAP_END,
229					    caller);
230		if (area == NULL)
231			return NULL;
232
233		area->phys_addr = paligned;
234		ret = __ioremap_at(paligned, area->addr, size, flags);
235		if (!ret)
236			vunmap(area->addr);
237	} else {
238		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
239		if (ret)
240			ioremap_bot += size;
241	}
242
243	if (ret)
244		ret += addr & ~PAGE_MASK;
245	return ret;
246}
247
248void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
249			 unsigned long flags)
250{
251	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
252}
253
254void __iomem * ioremap(phys_addr_t addr, unsigned long size)
255{
256	unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
257	void *caller = __builtin_return_address(0);
258
259	if (ppc_md.ioremap)
260		return ppc_md.ioremap(addr, size, flags, caller);
261	return __ioremap_caller(addr, size, flags, caller);
262}
263
264void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
265{
266	unsigned long flags = _PAGE_NO_CACHE;
267	void *caller = __builtin_return_address(0);
268
269	if (ppc_md.ioremap)
270		return ppc_md.ioremap(addr, size, flags, caller);
271	return __ioremap_caller(addr, size, flags, caller);
272}
273
274void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
275			     unsigned long flags)
276{
277	void *caller = __builtin_return_address(0);
278
279	/* writeable implies dirty for kernel addresses */
280	if (flags & _PAGE_RW)
281		flags |= _PAGE_DIRTY;
282
283	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
284	flags &= ~(_PAGE_USER | _PAGE_EXEC);
285
286#ifdef _PAGE_BAP_SR
287	/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
288	 * which means that we just cleared supervisor access... oops ;-) This
289	 * restores it
290	 */
291	flags |= _PAGE_BAP_SR;
292#endif
293
294	if (ppc_md.ioremap)
295		return ppc_md.ioremap(addr, size, flags, caller);
296	return __ioremap_caller(addr, size, flags, caller);
297}
298
299
300/*  
301 * Unmap an IO region and remove it from imalloc'd list.
302 * Access to IO memory should be serialized by driver.
303 */
304void __iounmap(volatile void __iomem *token)
305{
306	void *addr;
307
308	if (!slab_is_available())
309		return;
310	
311	addr = (void *) ((unsigned long __force)
312			 PCI_FIX_ADDR(token) & PAGE_MASK);
313	if ((unsigned long)addr < ioremap_bot) {
314		printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
315		       " at 0x%p\n", addr);
316		return;
317	}
318	vunmap(addr);
319}
320
321void iounmap(volatile void __iomem *token)
322{
323	if (ppc_md.iounmap)
324		ppc_md.iounmap(token);
325	else
326		__iounmap(token);
327}
328
329EXPORT_SYMBOL(ioremap);
330EXPORT_SYMBOL(ioremap_wc);
331EXPORT_SYMBOL(ioremap_prot);
332EXPORT_SYMBOL(__ioremap);
333EXPORT_SYMBOL(__ioremap_at);
334EXPORT_SYMBOL(iounmap);
335EXPORT_SYMBOL(__iounmap);
336EXPORT_SYMBOL(__iounmap_at);
337
338#ifndef __PAGETABLE_PUD_FOLDED
339/* 4 level page table */
340struct page *pgd_page(pgd_t pgd)
341{
342	if (pgd_huge(pgd))
343		return pte_page(pgd_pte(pgd));
344	return virt_to_page(pgd_page_vaddr(pgd));
 
 
 
345}
346#endif
347
348struct page *pud_page(pud_t pud)
349{
350	if (pud_huge(pud))
 
 
351		return pte_page(pud_pte(pud));
352	return virt_to_page(pud_page_vaddr(pud));
 
353}
354
355/*
356 * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
357 * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
358 */
359struct page *pmd_page(pmd_t pmd)
360{
361	if (pmd_trans_huge(pmd) || pmd_huge(pmd))
362		return pte_page(pmd_pte(pmd));
363	return virt_to_page(pmd_page_vaddr(pmd));
364}
365
366#ifdef CONFIG_PPC_64K_PAGES
367static pte_t *get_from_cache(struct mm_struct *mm)
368{
369	void *pte_frag, *ret;
370
371	spin_lock(&mm->page_table_lock);
372	ret = mm->context.pte_frag;
373	if (ret) {
374		pte_frag = ret + PTE_FRAG_SIZE;
375		/*
376		 * If we have taken up all the fragments mark PTE page NULL
 
 
377		 */
378		if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
379			pte_frag = NULL;
380		mm->context.pte_frag = pte_frag;
381	}
382	spin_unlock(&mm->page_table_lock);
383	return (pte_t *)ret;
384}
385
386static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
 
387{
388	void *ret = NULL;
389	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
390				       __GFP_REPEAT | __GFP_ZERO);
391	if (!page)
392		return NULL;
393	if (!kernel && !pgtable_page_ctor(page)) {
394		__free_page(page);
395		return NULL;
396	}
397
398	ret = page_address(page);
399	spin_lock(&mm->page_table_lock);
400	/*
401	 * If we find pgtable_page set, we return
402	 * the allocated page with single fragement
403	 * count.
404	 */
405	if (likely(!mm->context.pte_frag)) {
406		set_page_count(page, PTE_FRAG_NR);
407		mm->context.pte_frag = ret + PTE_FRAG_SIZE;
408	}
409	spin_unlock(&mm->page_table_lock);
410
411	return (pte_t *)ret;
 
412}
413
414pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
415{
416	pte_t *pte;
417
418	pte = get_from_cache(mm);
419	if (pte)
420		return pte;
421
422	return __alloc_for_cache(mm, kernel);
423}
424
/*
 * Drop one reference on the page backing @table.  When the count reaches
 * zero the page is freed, running the page table destructor first for
 * user (@kernel == 0) tables.
 */
void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
{
	struct page *page = virt_to_page(table);

	if (!put_page_testzero(page))
		return;

	if (!kernel)
		pgtable_page_dtor(page);
	free_hot_cold_page(page, 0);
}
434
435#ifdef CONFIG_SMP
/*
 * Release one PTE fragment: drop a reference on its backing page and,
 * once the count hits zero, run the destructor and free the page.
 */
static void page_table_free_rcu(void *table)
{
	struct page *page = virt_to_page(table);

	if (!put_page_testzero(page))
		return;

	pgtable_page_dtor(page);
	free_hot_cold_page(page, 0);
}
444
445void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
446{
447	unsigned long pgf = (unsigned long)table;
448
449	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
450	pgf |= shift;
451	tlb_remove_table(tlb, (void *)pgf);
452}
453
454void __tlb_remove_table(void *_table)
455{
456	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
457	unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
458
459	if (!shift)
460		/* PTE page needs special handling */
461		page_table_free_rcu(table);
462	else {
463		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
464		kmem_cache_free(PGT_CACHE(shift), table);
465	}
466}
467#else
468void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
469{
470	if (!shift) {
471		/* PTE page needs special handling */
472		struct page *page = virt_to_page(table);
473		if (put_page_testzero(page)) {
474			pgtable_page_dtor(page);
475			free_hot_cold_page(page, 0);
476		}
477	} else {
478		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
479		kmem_cache_free(PGT_CACHE(shift), table);
480	}
481}
482#endif
483#endif /* CONFIG_PPC_64K_PAGES */
484
485#ifdef CONFIG_TRANSPARENT_HUGEPAGE
486
487/*
488 * This is called when relaxing access to a hugepage. It's also called in the page
489 * fault path when we don't hit any of the major fault cases, ie, a minor
490 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
491 * handled those two for us, we additionally deal with missing execute
492 * permission here on some processors
493 */
494int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
495			  pmd_t *pmdp, pmd_t entry, int dirty)
496{
497	int changed;
498#ifdef CONFIG_DEBUG_VM
499	WARN_ON(!pmd_trans_huge(*pmdp));
500	assert_spin_locked(&vma->vm_mm->page_table_lock);
501#endif
502	changed = !pmd_same(*(pmdp), entry);
503	if (changed) {
504		__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
505		/*
506		 * Since we are not supporting SW TLB systems, we don't
507		 * have any thing similar to flush_tlb_page_nohash()
508		 */
509	}
510	return changed;
511}
512
513unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
514				  pmd_t *pmdp, unsigned long clr,
515				  unsigned long set)
516{
517
518	unsigned long old, tmp;
519
520#ifdef CONFIG_DEBUG_VM
521	WARN_ON(!pmd_trans_huge(*pmdp));
522	assert_spin_locked(&mm->page_table_lock);
523#endif
524
525#ifdef PTE_ATOMIC_UPDATES
526	__asm__ __volatile__(
527	"1:	ldarx	%0,0,%3\n\
528		andi.	%1,%0,%6\n\
529		bne-	1b \n\
530		andc	%1,%0,%4 \n\
531		or	%1,%1,%7\n\
532		stdcx.	%1,0,%3 \n\
533		bne-	1b"
534	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
535	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
536	: "cc" );
537#else
538	old = pmd_val(*pmdp);
539	*pmdp = __pmd((old & ~clr) | set);
540#endif
541	trace_hugepage_update(addr, old, clr, set);
542	if (old & _PAGE_HASHPTE)
543		hpte_do_hugepage_flush(mm, addr, pmdp, old);
544	return old;
545}
546
547pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
548			  pmd_t *pmdp)
549{
550	pmd_t pmd;
551
552	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
553	VM_BUG_ON(pmd_trans_huge(*pmdp));
554
555	pmd = *pmdp;
556	pmd_clear(pmdp);
557	/*
558	 * Wait for all pending hash_page to finish. This is needed
559	 * in case of subpage collapse. When we collapse normal pages
560	 * to hugepage, we first clear the pmd, then invalidate all
561	 * the PTE entries. The assumption here is that any low level
562	 * page fault will see a none pmd and take the slow path that
563	 * will wait on mmap_sem. But we could very well be in a
564	 * hash_page with local ptep pointer value. Such a hash page
565	 * can result in adding new HPTE entries for normal subpages.
566	 * That means we could be modifying the page content as we
567	 * copy them to a huge page. So wait for parallel hash_page
568	 * to finish before invalidating HPTE entries. We can do this
569	 * by sending an IPI to all the cpus and executing a dummy
570	 * function there.
571	 */
572	kick_all_cpus_sync();
573	/*
574	 * Now invalidate the hpte entries in the range
575	 * covered by pmd. This make sure we take a
576	 * fault and will find the pmd as none, which will
577	 * result in a major fault which takes mmap_sem and
578	 * hence wait for collapse to complete. Without this
579	 * the __collapse_huge_page_copy can result in copying
580	 * the old content.
581	 */
582	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
583	return pmd;
584}
585
586int pmdp_test_and_clear_young(struct vm_area_struct *vma,
587			      unsigned long address, pmd_t *pmdp)
588{
589	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
590}
591
592/*
593 * We currently remove entries from the hashtable regardless of whether
594 * the entry was young or dirty. The generic routines only flush if the
595 * entry was young or dirty which is not good enough.
596 *
597 * We should be more intelligent about this but for the moment we override
598 * these functions and force a tlb flush unconditionally
599 */
600int pmdp_clear_flush_young(struct vm_area_struct *vma,
601				  unsigned long address, pmd_t *pmdp)
602{
603	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
604}
605
606/*
607 * We want to put the pgtable in pmd and use pgtable for tracking
608 * the base page size hptes
609 */
610void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
611				pgtable_t pgtable)
612{
613	pgtable_t *pgtable_slot;
614	assert_spin_locked(&mm->page_table_lock);
615	/*
616	 * we store the pgtable in the second half of PMD
617	 */
618	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
619	*pgtable_slot = pgtable;
620	/*
621	 * expose the deposited pgtable to other cpus.
622	 * before we set the hugepage PTE at pmd level
623	 * hash fault code looks at the deposted pgtable
624	 * to store hash index values.
625	 */
626	smp_wmb();
627}
628
629pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
630{
631	pgtable_t pgtable;
632	pgtable_t *pgtable_slot;
633
634	assert_spin_locked(&mm->page_table_lock);
635	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
636	pgtable = *pgtable_slot;
637	/*
638	 * Once we withdraw, mark the entry NULL.
639	 */
640	*pgtable_slot = NULL;
641	/*
642	 * We store HPTE information in the deposited PTE fragment.
643	 * zero out the content on withdraw.
644	 */
645	memset(pgtable, 0, PTE_FRAG_SIZE);
646	return pgtable;
647}
648
649void pmdp_huge_split_prepare(struct vm_area_struct *vma,
650			     unsigned long address, pmd_t *pmdp)
651{
652	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
653	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
654
655	/*
656	 * We can't mark the pmd none here, because that will cause a race
657	 * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
658	 * we spilt, but at the same time we wan't rest of the ppc64 code
659	 * not to insert hash pte on this, because we will be modifying
660	 * the deposited pgtable in the caller of this function. Hence
661	 * clear the _PAGE_USER so that we move the fault handling to
662	 * higher level function and that will serialize against ptl.
663	 * We need to flush existing hash pte entries here even though,
664	 * the translation is still valid, because we will withdraw
665	 * pgtable_t after this.
666	 */
667	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0);
668}
669
670
671/*
672 * set a new huge pmd. We should not be called for updating
673 * an existing pmd entry. That should go via pmd_hugepage_update.
674 */
675void set_pmd_at(struct mm_struct *mm, unsigned long addr,
676		pmd_t *pmdp, pmd_t pmd)
677{
678#ifdef CONFIG_DEBUG_VM
679	WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) ==
680		(_PAGE_PRESENT | _PAGE_USER));
681	assert_spin_locked(&mm->page_table_lock);
682	WARN_ON(!pmd_trans_huge(pmd));
683#endif
684	trace_hugepage_set_pmd(addr, pmd_val(pmd));
685	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
686}
687
688/*
689 * We use this to invalidate a pmdp entry before switching from a
690 * hugepte to regular pmd entry.
691 */
692void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
693		     pmd_t *pmdp)
694{
695	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
696
697	/*
698	 * This ensures that generic code that rely on IRQ disabling
699	 * to prevent a parallel THP split work as expected.
700	 */
701	kick_all_cpus_sync();
702}
703
704/*
705 * A linux hugepage PMD was changed and the corresponding hash table entries
706 * neesd to be flushed.
707 */
708void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
709			    pmd_t *pmdp, unsigned long old_pmd)
710{
711	int ssize;
712	unsigned int psize;
713	unsigned long vsid;
714	unsigned long flags = 0;
715	const struct cpumask *tmp;
716
717	/* get the base page size,vsid and segment size */
718#ifdef CONFIG_DEBUG_VM
719	psize = get_slice_psize(mm, addr);
720	BUG_ON(psize == MMU_PAGE_16M);
721#endif
722	if (old_pmd & _PAGE_COMBO)
723		psize = MMU_PAGE_4K;
724	else
725		psize = MMU_PAGE_64K;
726
727	if (!is_kernel_addr(addr)) {
728		ssize = user_segment_size(addr);
729		vsid = get_vsid(mm->context.id, addr, ssize);
730		WARN_ON(vsid == 0);
731	} else {
732		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
733		ssize = mmu_kernel_ssize;
734	}
735
736	tmp = cpumask_of(smp_processor_id());
737	if (cpumask_equal(mm_cpumask(mm), tmp))
738		flags |= HPTE_LOCAL_UPDATE;
739
740	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
741}
742
743static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
744{
745	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
746}
747
748pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
749{
750	unsigned long pmdv;
751
752	pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK;
753	return pmd_set_protbits(__pmd(pmdv), pgprot);
754}
755
756pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
757{
758	return pfn_pmd(page_to_pfn(page), pgprot);
759}
760
761pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
762{
763	unsigned long pmdv;
764
765	pmdv = pmd_val(pmd);
766	pmdv &= _HPAGE_CHG_MASK;
767	return pmd_set_protbits(__pmd(pmdv), newprot);
768}
769
770/*
771 * This is called at the end of handling a user page fault, when the
772 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
773 * We use it to preload an HPTE into the hash table corresponding to
774 * the updated linux HUGE PMD entry.
775 */
776void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
777			  pmd_t *pmd)
778{
779	return;
780}
781
782pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
783			      unsigned long addr, pmd_t *pmdp)
784{
785	pmd_t old_pmd;
786	pgtable_t pgtable;
787	unsigned long old;
788	pgtable_t *pgtable_slot;
789
790	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
791	old_pmd = __pmd(old);
792	/*
793	 * We have pmd == none and we are holding page_table_lock.
794	 * So we can safely go and clear the pgtable hash
795	 * index info.
796	 */
797	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
798	pgtable = *pgtable_slot;
799	/*
800	 * Let's zero out old valid and hash index details
801	 * hash fault look at them.
802	 */
803	memset(pgtable, 0, PTE_FRAG_SIZE);
804	/*
805	 * Serialize against find_linux_pte_or_hugepte which does lock-less
806	 * lookup in page tables with local interrupts disabled. For huge pages
807	 * it casts pmd_t to pte_t. Since format of pte_t is different from
808	 * pmd_t we want to prevent transit from pmd pointing to page table
809	 * to pmd pointing to huge page (and back) while interrupts are disabled.
810	 * We clear pmd to possibly replace it with page table pointer in
811	 * different code paths. So make sure we wait for the parallel
812	 * find_linux_pte_or_hugepage to finish.
813	 */
814	kick_all_cpus_sync();
815	return old_pmd;
816}
817
818int has_transparent_hugepage(void)
819{
820
821	BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER,
822		"hugepages can't be allocated by the buddy allocator");
823
824	BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2,
825			 "We need more than 2 pages to do deferred thp split");
826
827	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
828		return 0;
829	/*
830	 * We support THP only if PMD_SIZE is 16MB.
831	 */
832	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
833		return 0;
834	/*
835	 * We need to make sure that we support 16MB hugepage in a segement
836	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
837	 * of 64K.
838	 */
839	/*
840	 * If we have 64K HPTE, we will be using that by default
841	 */
842	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
843	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
844		return 0;
845	/*
846	 * Ok we only have 4K HPTE
847	 */
848	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
849		return 0;
850
851	return 1;
852}
853#endif /* CONFIG_TRANSPARENT_HUGEPAGE */