fault.c - arch/arm64/mm/fault.c - Linux diff v6.8 - Bootlin Elixir Cross Referencer

  1// SPDX-License-Identifier: GPL-2.0-only
  2/*
  3 * Based on arch/arm/mm/fault.c
  4 *
  5 * Copyright (C) 1995  Linus Torvalds
  6 * Copyright (C) 1995-2004 Russell King
  7 * Copyright (C) 2012 ARM Ltd.
  8 */
  9
 10#include <linux/acpi.h>
 11#include <linux/bitfield.h>
 12#include <linux/extable.h>
 13#include <linux/kfence.h>
 14#include <linux/signal.h>
 15#include <linux/mm.h>
 16#include <linux/hardirq.h>
 17#include <linux/init.h>
 18#include <linux/kasan.h>
 19#include <linux/kprobes.h>
 20#include <linux/uaccess.h>
 21#include <linux/page-flags.h>
 22#include <linux/sched/signal.h>
 23#include <linux/sched/debug.h>
 24#include <linux/highmem.h>
 25#include <linux/perf_event.h>
 
 26#include <linux/preempt.h>
 27#include <linux/hugetlb.h>
 28
 29#include <asm/acpi.h>
 30#include <asm/bug.h>
 31#include <asm/cmpxchg.h>
 32#include <asm/cpufeature.h>
 33#include <asm/efi.h>
 34#include <asm/exception.h>
 35#include <asm/daifflags.h>
 36#include <asm/debug-monitors.h>
 37#include <asm/esr.h>
 38#include <asm/kprobes.h>
 39#include <asm/mte.h>
 40#include <asm/processor.h>
 41#include <asm/sysreg.h>
 42#include <asm/system_misc.h>
 43#include <asm/tlbflush.h>
 44#include <asm/traps.h>
 45
 46struct fault_info {
 47	int	(*fn)(unsigned long far, unsigned long esr,
 48		      struct pt_regs *regs);
 49	int	sig;
 50	int	code;
 51	const char *name;
 52};
 53
 54static const struct fault_info fault_info[];
 55static struct fault_info debug_fault_info[];
 56
 57static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
 58{
 59	return fault_info + (esr & ESR_ELx_FSC);
 60}
 61
 62static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
 63{
 64	return debug_fault_info + DBG_ESR_EVT(esr);
 65}
 66
 67static void data_abort_decode(unsigned long esr)
 68{
 69	unsigned long iss2 = ESR_ELx_ISS2(esr);
 70
 71	pr_alert("Data abort info:\n");
 72
 73	if (esr & ESR_ELx_ISV) {
 74		pr_alert("  Access size = %u byte(s)\n",
 75			 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
 76		pr_alert("  SSE = %lu, SRT = %lu\n",
 77			 (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
 78			 (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
 79		pr_alert("  SF = %lu, AR = %lu\n",
 80			 (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
 81			 (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
 82	} else {
 83		pr_alert("  ISV = 0, ISS = 0x%08lx, ISS2 = 0x%08lx\n",
 84			 esr & ESR_ELx_ISS_MASK, iss2);
 85	}
 86
 87	pr_alert("  CM = %lu, WnR = %lu, TnD = %lu, TagAccess = %lu\n",
 88		 (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
 89		 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT,
 90		 (iss2 & ESR_ELx_TnD) >> ESR_ELx_TnD_SHIFT,
 91		 (iss2 & ESR_ELx_TagAccess) >> ESR_ELx_TagAccess_SHIFT);
 92
 93	pr_alert("  GCS = %ld, Overlay = %lu, DirtyBit = %lu, Xs = %llu\n",
 94		 (iss2 & ESR_ELx_GCS) >> ESR_ELx_GCS_SHIFT,
 95		 (iss2 & ESR_ELx_Overlay) >> ESR_ELx_Overlay_SHIFT,
 96		 (iss2 & ESR_ELx_DirtyBit) >> ESR_ELx_DirtyBit_SHIFT,
 97		 (iss2 & ESR_ELx_Xs_MASK) >> ESR_ELx_Xs_SHIFT);
 98}
 99
100static void mem_abort_decode(unsigned long esr)
101{
102	pr_alert("Mem abort info:\n");
103
104	pr_alert("  ESR = 0x%016lx\n", esr);
105	pr_alert("  EC = 0x%02lx: %s, IL = %u bits\n",
106		 ESR_ELx_EC(esr), esr_get_class_string(esr),
107		 (esr & ESR_ELx_IL) ? 32 : 16);
108	pr_alert("  SET = %lu, FnV = %lu\n",
109		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
110		 (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
111	pr_alert("  EA = %lu, S1PTW = %lu\n",
112		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
113		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
114	pr_alert("  FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
115		 esr_to_fault_info(esr)->name);
116
117	if (esr_is_data_abort(esr))
118		data_abort_decode(esr);
119}
120
121static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
122{
123	/* Either init_pg_dir or swapper_pg_dir */
124	if (mm == &init_mm)
125		return __pa_symbol(mm->pgd);
126
127	return (unsigned long)virt_to_phys(mm->pgd);
128}
129
130/*
131 * Dump out the page tables associated with 'addr' in the currently active mm.
132 */
133static void show_pte(unsigned long addr)
134{
135	struct mm_struct *mm;
136	pgd_t *pgdp;
137	pgd_t pgd;
138
139	if (is_ttbr0_addr(addr)) {
140		/* TTBR0 */
141		mm = current->active_mm;
142		if (mm == &init_mm) {
143			pr_alert("[%016lx] user address but active_mm is swapper\n",
144				 addr);
145			return;
146		}
147	} else if (is_ttbr1_addr(addr)) {
148		/* TTBR1 */
149		mm = &init_mm;
150	} else {
151		pr_alert("[%016lx] address between user and kernel address ranges\n",
152			 addr);
153		return;
154	}
155
156	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
157		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
158		 vabits_actual, mm_to_pgd_phys(mm));
159	pgdp = pgd_offset(mm, addr);
160	pgd = READ_ONCE(*pgdp);
161	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
162
163	do {
164		p4d_t *p4dp, p4d;
165		pud_t *pudp, pud;
166		pmd_t *pmdp, pmd;
167		pte_t *ptep, pte;
168
169		if (pgd_none(pgd) || pgd_bad(pgd))
170			break;
171
172		p4dp = p4d_offset(pgdp, addr);
173		p4d = READ_ONCE(*p4dp);
174		pr_cont(", p4d=%016llx", p4d_val(p4d));
175		if (p4d_none(p4d) || p4d_bad(p4d))
176			break;
177
178		pudp = pud_offset(p4dp, addr);
179		pud = READ_ONCE(*pudp);
180		pr_cont(", pud=%016llx", pud_val(pud));
181		if (pud_none(pud) || pud_bad(pud))
182			break;
183
184		pmdp = pmd_offset(pudp, addr);
185		pmd = READ_ONCE(*pmdp);
186		pr_cont(", pmd=%016llx", pmd_val(pmd));
187		if (pmd_none(pmd) || pmd_bad(pmd))
188			break;
189
190		ptep = pte_offset_map(pmdp, addr);
191		if (!ptep)
192			break;
193
194		pte = READ_ONCE(*ptep);
195		pr_cont(", pte=%016llx", pte_val(pte));
196		pte_unmap(ptep);
197	} while(0);
198
199	pr_cont("\n");
200}
201
/*
 * This function sets the access flags (dirty, accessed), as well as write
 * permission, and only to a more permissive setting.
 *
 * It needs to cope with hardware update of the accessed/dirty state by other
 * agents in the system and can safely skip the __sync_icache_dcache() call as,
 * like set_pte_at(), the PTE is never changed from no-exec to exec here.
 *
 * @vma:     VMA covering @address; only used for the TLB flush
 * @address: virtual address mapped by @ptep
 * @ptep:    pointer to the PTE to update
 * @entry:   PTE value carrying the desired access/write/dirty bits
 * @dirty:   non-zero to flush the TLB entry for @address after the update
 *
 * Returns whether or not the PTE actually changed.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	pteval_t old_pteval, pteval;
	pte_t pte = READ_ONCE(*ptep);

	/* Nothing to do if the PTE already holds the requested value. */
	if (pte_same(pte, entry))
		return 0;

	/* only preserve the access flags and write permission */
	pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;

	/*
	 * Setting the flags must be done atomically to avoid racing with the
	 * hardware update of the access/dirty state. The PTE_RDONLY bit must
	 * be set to the most permissive (lowest value) of *ptep and entry
	 * (calculated as: a & b == ~(~a | ~b)).
	 */
	pte_val(entry) ^= PTE_RDONLY;
	pteval = pte_val(pte);
	do {
		old_pteval = pteval;
		/* Invert PTE_RDONLY, OR in the new bits, then invert it back. */
		pteval ^= PTE_RDONLY;
		pteval |= pte_val(entry);
		pteval ^= PTE_RDONLY;
		/* Retry with the value found in memory if the PTE changed under us. */
		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
	} while (pteval != old_pteval);

	/* Invalidate a stale read-only entry */
	if (dirty)
		flush_tlb_page(vma, address);
	return 1;
}
246
247static bool is_el1_instruction_abort(unsigned long esr)
248{
249	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
250}
251
252static bool is_el1_data_abort(unsigned long esr)
253{
254	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
255}
256
257static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
258					   struct pt_regs *regs)
259{
260	unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE;
261
262	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
263		return false;
264
265	if (fsc_type == ESR_ELx_FSC_PERM)
266		return true;
267
268	if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
269		return fsc_type == ESR_ELx_FSC_FAULT &&
270			(regs->pstate & PSR_PAN_BIT);
271
272	return false;
273}
274
275static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
276							unsigned long esr,
277							struct pt_regs *regs)
278{
279	unsigned long flags;
280	u64 par, dfsc;
281
282	if (!is_el1_data_abort(esr) ||
283	    (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
284		return false;
285
286	local_irq_save(flags);
287	asm volatile("at s1e1r, %0" :: "r" (addr));
288	isb();
289	par = read_sysreg_par();
290	local_irq_restore(flags);
291
292	/*
293	 * If we now have a valid translation, treat the translation fault as
294	 * spurious.
295	 */
296	if (!(par & SYS_PAR_EL1_F))
297		return true;
298
299	/*
300	 * If we got a different type of fault from the AT instruction,
301	 * treat the translation fault as spurious.
302	 */
303	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
304	return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT;
305}
306
307static void die_kernel_fault(const char *msg, unsigned long addr,
308			     unsigned long esr, struct pt_regs *regs)
309{
310	bust_spinlocks(1);
311
312	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
313		 addr);
314
315	kasan_non_canonical_hook(addr);
316
317	mem_abort_decode(esr);
318
319	show_pte(addr);
320	die("Oops", regs, esr);
321	bust_spinlocks(0);
322	make_task_dead(SIGKILL);
323}
324
#ifdef CONFIG_KASAN_HW_TAGS
/* Hand a tag fault at @addr over to KASAN for reporting. */
static void report_tag_fault(unsigned long addr, unsigned long esr,
			     struct pt_regs *regs)
{
	/*
	 * SAS bits aren't set for all faults reported in EL1, so we can't
	 * find out access size; 0 is passed as the size.
	 */
	kasan_report((void *)addr, 0, (esr & ESR_ELx_WNR) != 0, regs->pc);
}
#else
/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
static inline void report_tag_fault(unsigned long addr, unsigned long esr,
				    struct pt_regs *regs)
{
}
#endif
341
342static void do_tag_recovery(unsigned long addr, unsigned long esr,
343			   struct pt_regs *regs)
344{
345
346	report_tag_fault(addr, esr, regs);
347
348	/*
349	 * Disable MTE Tag Checking on the local CPU for the current EL.
350	 * It will be done lazily on the other CPUs when they will hit a
351	 * tag fault.
352	 */
353	sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
354			 SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
355	isb();
356}
357
358static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
359{
360	unsigned long fsc = esr & ESR_ELx_FSC;
361
362	if (!is_el1_data_abort(esr))
363		return false;
364
365	if (fsc == ESR_ELx_FSC_MTE)
366		return true;
367
368	return false;
369}
370
371static bool is_translation_fault(unsigned long esr)
372{
373	return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
374}
375
376static void __do_kernel_fault(unsigned long addr, unsigned long esr,
377			      struct pt_regs *regs)
378{
379	const char *msg;
380
381	/*
382	 * Are we prepared to handle this kernel fault?
383	 * We are almost certainly not prepared to handle instruction faults.
384	 */
385	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
386		return;
387
388	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
389	    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
390		return;
391
392	if (is_el1_mte_sync_tag_check_fault(esr)) {
393		do_tag_recovery(addr, esr, regs);
394
395		return;
396	}
397
398	if (is_el1_permission_fault(addr, esr, regs)) {
399		if (esr & ESR_ELx_WNR)
400			msg = "write to read-only memory";
401		else if (is_el1_instruction_abort(esr))
402			msg = "execute from non-executable memory";
403		else
404			msg = "read from unreadable memory";
405	} else if (addr < PAGE_SIZE) {
406		msg = "NULL pointer dereference";
407	} else {
408		if (is_translation_fault(esr) &&
409		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
410			return;
411
412		msg = "paging request";
413	}
414
415	if (efi_runtime_fixup_exception(regs, msg))
416		return;
417
418	die_kernel_fault(msg, addr, esr, regs);
419}
420
421static void set_thread_esr(unsigned long address, unsigned long esr)
422{
423	current->thread.fault_address = address;
424
425	/*
426	 * If the faulting address is in the kernel, we must sanitize the ESR.
427	 * From userspace's point of view, kernel-only mappings don't exist
428	 * at all, so we report them as level 0 translation faults.
429	 * (This is not quite the way that "no mapping there at all" behaves:
430	 * an alignment fault not caused by the memory type would take
431	 * precedence over translation fault for a real access to empty
432	 * space. Unfortunately we can't easily distinguish "alignment fault
433	 * not caused by memory type" from "alignment fault caused by memory
434	 * type", so we ignore this wrinkle and just return the translation
435	 * fault.)
436	 */
437	if (!is_ttbr0_addr(current->thread.fault_address)) {
438		switch (ESR_ELx_EC(esr)) {
439		case ESR_ELx_EC_DABT_LOW:
440			/*
441			 * These bits provide only information about the
442			 * faulting instruction, which userspace knows already.
443			 * We explicitly clear bits which are architecturally
444			 * RES0 in case they are given meanings in future.
445			 * We always report the ESR as if the fault was taken
446			 * to EL1 and so ISV and the bits in ISS[23:14] are
447			 * clear. (In fact it always will be a fault to EL1.)
448			 */
449			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
450				ESR_ELx_CM | ESR_ELx_WNR;
451			esr |= ESR_ELx_FSC_FAULT;
452			break;
453		case ESR_ELx_EC_IABT_LOW:
454			/*
455			 * Claim a level 0 translation fault.
456			 * All other bits are architecturally RES0 for faults
457			 * reported with that DFSC value, so we clear them.
458			 */
459			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
460			esr |= ESR_ELx_FSC_FAULT;
461			break;
462		default:
463			/*
464			 * This should never happen (entry.S only brings us
465			 * into this code for insn and data aborts from a lower
466			 * exception level). Fail safe by not providing an ESR
467			 * context record at all.
468			 */
469			WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
470			esr = 0;
471			break;
472		}
473	}
474
475	current->thread.fault_code = esr;
476}
477
478static void do_bad_area(unsigned long far, unsigned long esr,
479			struct pt_regs *regs)
480{
481	unsigned long addr = untagged_addr(far);
482
483	/*
484	 * If we are in kernel mode at this point, we have no context to
485	 * handle this fault with.
486	 */
487	if (user_mode(regs)) {
488		const struct fault_info *inf = esr_to_fault_info(esr);
489
490		set_thread_esr(addr, esr);
491		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
492	} else {
493		__do_kernel_fault(addr, esr, regs);
494	}
495}
496
/*
 * Fault codes private to this file, used alongside the generic vm_fault_t
 * bits:
 *   VM_FAULT_BADMAP    - no VMA was found for the faulting address
 *   VM_FAULT_BADACCESS - the VMA does not permit the attempted access
 */
#define VM_FAULT_BADMAP		((__force vm_fault_t)0x010000)
#define VM_FAULT_BADACCESS	((__force vm_fault_t)0x020000)
 
 
 
 
 
 
 
 
499
500static vm_fault_t __do_page_fault(struct mm_struct *mm,
501				  struct vm_area_struct *vma, unsigned long addr,
502				  unsigned int mm_flags, unsigned long vm_flags,
503				  struct pt_regs *regs)
504{
505	/*
506	 * Ok, we have a good vm_area for this memory access, so we can handle
507	 * it.
508	 * Check that the permissions on the VMA allow for the fault which
509	 * occurred.
510	 */
511	if (!(vma->vm_flags & vm_flags))
512		return VM_FAULT_BADACCESS;
513	return handle_mm_fault(vma, addr, mm_flags, regs);
514}
515
516static bool is_el0_instruction_abort(unsigned long esr)
517{
518	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
519}
520
521/*
522 * Note: not valid for EL1 DC IVAC, but we never use that such that it
523 * should fault. EL0 cannot issue DC IVAC (undef).
524 */
525static bool is_write_abort(unsigned long esr)
526{
527	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
528}
529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Try to resolve a fault through the mm of the current task.
 *
 * For user-mode faults the VMA is first looked up under RCU and handled with
 * the per-VMA lock; otherwise, or if that fails, the mmap lock path is taken.
 * Faults that cannot be resolved result in a signal to the user task, or in
 * __do_kernel_fault() when there is no user context.
 *
 * @far:  faulting address as reported, possibly tagged
 * @esr:  syndrome value for the fault
 * @regs: register state at the time of the fault
 *
 * Every path returns 0.
 */
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
				   struct pt_regs *regs)
{
	const struct fault_info *inf;
	struct mm_struct *mm = current->mm;
	vm_fault_t fault;
	unsigned long vm_flags;
	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
	unsigned long addr = untagged_addr(far);
	struct vm_area_struct *vma;

	if (kprobe_page_fault(regs, esr))
		return 0;

	/*
	 * If we're in an interrupt or have no user context, we must not take
	 * the fault.
	 */
	if (faulthandler_disabled() || !mm)
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	/*
	 * vm_flags tells us what bits we must have in vma->vm_flags
	 * for the fault to be benign, __do_page_fault() would check
	 * vma->vm_flags & vm_flags and returns an error if the
	 * intersection is empty
	 */
	if (is_el0_instruction_abort(esr)) {
		/* It was exec fault */
		vm_flags = VM_EXEC;
		mm_flags |= FAULT_FLAG_INSTRUCTION;
	} else if (is_write_abort(esr)) {
		/* It was write fault */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else {
		/* It was read fault */
		vm_flags = VM_READ;
		/* Write implies read */
		vm_flags |= VM_WRITE;
		/* If EPAN is absent then exec implies read */
		if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN))
			vm_flags |= VM_EXEC;
	}

	/*
	 * A kernel permission fault on a user address is fatal unless the
	 * faulting PC has an exception table entry.
	 */
	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
		if (is_el1_instruction_abort(esr))
			die_kernel_fault("execution of user memory",
					 addr, esr, regs);

		if (!search_exception_tables(regs->pc))
			die_kernel_fault("access to user memory outside uaccess routines",
					 addr, esr, regs);
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

	/* The per-VMA lock path below is only attempted for user-mode faults. */
	if (!(mm_flags & FAULT_FLAG_USER))
		goto lock_mmap;

	vma = lock_vma_under_rcu(mm, addr);
	if (!vma)
		goto lock_mmap;

	/* Access not permitted by this VMA: redo the lookup under the mmap lock. */
	if (!(vma->vm_flags & vm_flags)) {
		vma_end_read(vma);
		goto lock_mmap;
	}
	fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
	/* vma_end_read() is skipped when the handler reports RETRY or COMPLETED. */
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);
	if (fault & VM_FAULT_MAJOR)
		mm_flags |= FAULT_FLAG_TRIED;

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}
lock_mmap:

retry:
	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (unlikely(!vma)) {
		fault = VM_FAULT_BADMAP;
		goto done;
	}

	fault = __do_page_fault(mm, vma, addr, mm_flags, vm_flags, regs);

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		return 0;

	if (fault & VM_FAULT_RETRY) {
		mm_flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	mmap_read_unlock(mm);

done:
	/*
	 * Handle the "normal" (no error) case first.
	 */
	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
			      VM_FAULT_BADACCESS))))
		return 0;

	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (!user_mode(regs))
		goto no_context;

	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we got
		 * oom-killed).
		 */
		pagefault_out_of_memory();
		return 0;
	}

	inf = esr_to_fault_info(esr);
	set_thread_esr(addr, esr);
	if (fault & VM_FAULT_SIGBUS) {
		/*
		 * We had some memory, but were unable to successfully fix up
		 * this page fault.
		 */
		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
		unsigned int lsb;

		/* Default to one page; large poison derives the shift from the hstate index. */
		lsb = PAGE_SHIFT;
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
	} else {
		/*
		 * Something tried to access memory that isn't in our memory
		 * map.
		 */
		arm64_force_sig_fault(SIGSEGV,
				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
				      far, inf->name);
	}

	return 0;

no_context:
	__do_kernel_fault(addr, esr, regs);
	return 0;
}
704
705static int __kprobes do_translation_fault(unsigned long far,
706					  unsigned long esr,
707					  struct pt_regs *regs)
708{
709	unsigned long addr = untagged_addr(far);
710
711	if (is_ttbr0_addr(addr))
712		return do_page_fault(far, esr, regs);
713
714	do_bad_area(far, esr, regs);
715	return 0;
716}
717
718static int do_alignment_fault(unsigned long far, unsigned long esr,
719			      struct pt_regs *regs)
720{
721	if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) &&
722	    compat_user_mode(regs))
723		return do_compat_alignment_fixup(far, regs);
724	do_bad_area(far, esr, regs);
725	return 0;
726}
727
/*
 * Catch-all handler for fault codes with no dedicated handling. The non-zero
 * return tells the caller that the fault was not handled.
 */
static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	return 1; /* "fault" */
}
732
733static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
734{
735	const struct fault_info *inf;
736	unsigned long siaddr;
737
738	inf = esr_to_fault_info(esr);
739
740	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
741		/*
742		 * APEI claimed this as a firmware-first notification.
743		 * Some processing deferred to task_work before ret_to_user().
744		 */
745		return 0;
746	}
747
748	if (esr & ESR_ELx_FnV) {
749		siaddr = 0;
750	} else {
751		/*
752		 * The architecture specifies that the tag bits of FAR_EL1 are
753		 * UNKNOWN for synchronous external aborts. Mask them out now
754		 * so that userspace doesn't see them.
755		 */
756		siaddr  = untagged_addr(far);
757	}
758	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
759
760	return 0;
761}
762
763static int do_tag_check_fault(unsigned long far, unsigned long esr,
764			      struct pt_regs *regs)
765{
766	/*
767	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
768	 * for tag check faults. Set them to corresponding bits in the untagged
769	 * address.
770	 */
771	far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
772	do_bad_area(far, esr, regs);
773	return 0;
774}
775
/*
 * Fault dispatch table, indexed by the fault status code extracted in
 * esr_to_fault_info(). Each entry gives the handler, the signal number and
 * code used for reporting, and a description. Entries are positional: the
 * entry at index N describes fault status code N.
 */
static const struct fault_info fault_info[] = {
	{ do_bad,		SIGKILL, SI_KERNEL,	"ttbr address size fault"	},
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 1 address size fault"	},
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 2 address size fault"	},
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 3 address size fault"	},
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 0 translation fault"	},
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 1 translation fault"	},
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 2 translation fault"	},
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault"	},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 8"			},
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 access flag fault"	},
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 access flag fault"	},
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 access flag fault"	},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 12"			},
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 permission fault"	},
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 permission fault"	},
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 permission fault"	},
	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous external abort"	},
	{ do_tag_check_fault,	SIGSEGV, SEGV_MTESERR,	"synchronous tag check fault"	},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 18"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 19"			},
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 (translation table walk)"	},
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 (translation table walk)"	},
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 (translation table walk)"	},
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 (translation table walk)"	},
	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous parity or ECC error" },	// Reserved when RAS is implemented
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 25"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 26"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 27"			},
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 32"			},
	{ do_alignment_fault,	SIGBUS,  BUS_ADRALN,	"alignment fault"		},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 34"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 35"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 36"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 37"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 38"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 39"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 40"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 41"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 42"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 43"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 44"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 45"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 46"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 47"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"TLB conflict abort"		},
	{ do_bad,		SIGKILL, SI_KERNEL,	"Unsupported atomic hardware update fault"	},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 50"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 51"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"implementation fault (lockdown abort)" },
	{ do_bad,		SIGBUS,  BUS_OBJERR,	"implementation fault (unsupported exclusive)" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 54"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 55"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 56"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 57"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 58" 			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 59"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 60"			},
	{ do_bad,		SIGKILL, SI_KERNEL,	"section domain fault"		},
	{ do_bad,		SIGKILL, SI_KERNEL,	"page domain fault"		},
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
};
842
843void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
844{
845	const struct fault_info *inf = esr_to_fault_info(esr);
846	unsigned long addr = untagged_addr(far);
847
848	if (!inf->fn(far, esr, regs))
849		return;
850
851	if (!user_mode(regs))
852		die_kernel_fault(inf->name, addr, esr, regs);
853
854	/*
855	 * At this point we have an unrecognized fault type whose tag bits may
856	 * have been defined as UNKNOWN. Therefore we only expose the untagged
857	 * address to the signal handler.
858	 */
859	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
860}
861NOKPROBE_SYMBOL(do_mem_abort);
862
863void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
864{
865	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
866			 addr, esr);
867}
868NOKPROBE_SYMBOL(do_sp_pc_abort);
869
/*
 * Debug exception dispatch table, indexed by the debug event number
 * extracted in esr_to_debug_fault_info(). Entries can be replaced at init
 * time through hook_debug_fault_code(), which is why the table is not const.
 *
 * __refdata because early_brk64 is __init, but the reference to it is
 * clobbered at arch_initcall time.
 * See traps.c and debug-monitors.c:debug_traps_init().
 */
static struct fault_info __refdata debug_fault_info[] = {
	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware breakpoint"	},
	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware single-step"	},
	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware watchpoint"	},
	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 3"		},
	{ do_bad,	SIGTRAP,	TRAP_BRKPT,	"aarch32 BKPT"		},
	{ do_bad,	SIGKILL,	SI_KERNEL,	"aarch32 vector catch"	},
	{ early_brk64,	SIGTRAP,	TRAP_BRKPT,	"aarch64 BRK"		},
	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 7"		},
};
885
886void __init hook_debug_fault_code(int nr,
887				  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
888				  int sig, int code, const char *name)
889{
890	BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
891
892	debug_fault_info[nr].fn		= fn;
893	debug_fault_info[nr].sig	= sig;
894	debug_fault_info[nr].code	= code;
895	debug_fault_info[nr].name	= name;
896}
897
/*
 * In debug exception context, we explicitly disable preemption despite
 * having interrupts disabled.
 * This serves two purposes: it makes it much less likely that we would
 * accidentally schedule in exception context and it will force a warning
 * if we somehow manage to schedule by accident.
 *
 * Paired with debug_exception_exit().
 */
static void debug_exception_enter(struct pt_regs *regs)
{
	preempt_disable();

	/* This code is a bit fragile.  Test it. */
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
}
NOKPROBE_SYMBOL(debug_exception_enter);
913
/* Undo debug_exception_enter(): re-enable preemption without rescheduling. */
static void debug_exception_exit(struct pt_regs *regs)
{
	preempt_enable_no_resched();
}
NOKPROBE_SYMBOL(debug_exception_exit);
919
920void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
921			struct pt_regs *regs)
922{
923	const struct fault_info *inf = esr_to_debug_fault_info(esr);
924	unsigned long pc = instruction_pointer(regs);
925
926	debug_exception_enter(regs);
927
928	if (user_mode(regs) && !is_ttbr0_addr(pc))
929		arm64_apply_bp_hardening();
930
931	if (inf->fn(addr_if_watchpoint, esr, regs)) {
932		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
933	}
934
935	debug_exception_exit(regs);
936}
937NOKPROBE_SYMBOL(do_debug_exception);
938
939/*
940 * Used during anonymous page fault handling.
941 */
942struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
943						unsigned long vaddr)
944{
945	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;
946
947	/*
948	 * If the page is mapped with PROT_MTE, initialise the tags at the
949	 * point of allocation and page zeroing as this is usually faster than
950	 * separate DC ZVA and STGM.
951	 */
952	if (vma->vm_flags & VM_MTE)
953		flags |= __GFP_ZEROTAGS;
954
955	return vma_alloc_folio(flags, 0, vma, vaddr, false);
956}
957
/*
 * Zero @page and clear its tags, then mark the page as MTE tagged. Warns
 * once if the page could not be claimed for tagging.
 */
void tag_clear_highpage(struct page *page)
{
	/* Newly allocated page, shouldn't have been tagged yet */
	WARN_ON_ONCE(!try_page_mte_tagging(page));

	mte_zero_clear_page_tags(page_address(page));
	set_page_mte_tagged(page);
}

   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Based on arch/arm/mm/fault.c
   4 *
   5 * Copyright (C) 1995  Linus Torvalds
   6 * Copyright (C) 1995-2004 Russell King
   7 * Copyright (C) 2012 ARM Ltd.
   8 */
   9
  10#include <linux/acpi.h>
  11#include <linux/bitfield.h>
  12#include <linux/extable.h>
  13#include <linux/kfence.h>
  14#include <linux/signal.h>
  15#include <linux/mm.h>
  16#include <linux/hardirq.h>
  17#include <linux/init.h>
  18#include <linux/kasan.h>
  19#include <linux/kprobes.h>
  20#include <linux/uaccess.h>
  21#include <linux/page-flags.h>
  22#include <linux/sched/signal.h>
  23#include <linux/sched/debug.h>
  24#include <linux/highmem.h>
  25#include <linux/perf_event.h>
  26#include <linux/pkeys.h>
  27#include <linux/preempt.h>
  28#include <linux/hugetlb.h>
  29
  30#include <asm/acpi.h>
  31#include <asm/bug.h>
  32#include <asm/cmpxchg.h>
  33#include <asm/cpufeature.h>
  34#include <asm/efi.h>
  35#include <asm/exception.h>
  36#include <asm/daifflags.h>
  37#include <asm/debug-monitors.h>
  38#include <asm/esr.h>
  39#include <asm/kprobes.h>
  40#include <asm/mte.h>
  41#include <asm/processor.h>
  42#include <asm/sysreg.h>
  43#include <asm/system_misc.h>
  44#include <asm/tlbflush.h>
  45#include <asm/traps.h>
  46
/*
 * Describes how one fault/debug event code is handled: the handler to call
 * and the signal number, si_code and description that the dispatchers in
 * this file use when the handler returns non-zero.
 */
  47struct fault_info {
	/* Handler; do_mem_abort() treats a zero return as "nothing more to do" */
  48	int	(*fn)(unsigned long far, unsigned long esr,
  49		      struct pt_regs *regs);
	/* Signal number passed on when the handler returns non-zero */
  50	int	sig;
	/* si_code passed along with 'sig' */
  51	int	code;
	/* Description used in the fault decode output and signal reporting */
  52	const char *name;
  53};
  54
/* Tables defined further down; fault_info[] is indexed by esr & ESR_ELx_FSC. */
  55static const struct fault_info fault_info[];
  56static struct fault_info debug_fault_info[];
  57
  58static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
  59{
  60	return fault_info + (esr & ESR_ELx_FSC);
  61}
  62
  63static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
  64{
  65	return debug_fault_info + DBG_ESR_EVT(esr);
  66}
  67
  68static void data_abort_decode(unsigned long esr)
  69{
  70	unsigned long iss2 = ESR_ELx_ISS2(esr);
  71
  72	pr_alert("Data abort info:\n");
  73
  74	if (esr & ESR_ELx_ISV) {
  75		pr_alert("  Access size = %u byte(s)\n",
  76			 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
  77		pr_alert("  SSE = %lu, SRT = %lu\n",
  78			 (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
  79			 (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
  80		pr_alert("  SF = %lu, AR = %lu\n",
  81			 (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
  82			 (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
  83	} else {
  84		pr_alert("  ISV = 0, ISS = 0x%08lx, ISS2 = 0x%08lx\n",
  85			 esr & ESR_ELx_ISS_MASK, iss2);
  86	}
  87
  88	pr_alert("  CM = %lu, WnR = %lu, TnD = %lu, TagAccess = %lu\n",
  89		 (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
  90		 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT,
  91		 (iss2 & ESR_ELx_TnD) >> ESR_ELx_TnD_SHIFT,
  92		 (iss2 & ESR_ELx_TagAccess) >> ESR_ELx_TagAccess_SHIFT);
  93
  94	pr_alert("  GCS = %ld, Overlay = %lu, DirtyBit = %lu, Xs = %llu\n",
  95		 (iss2 & ESR_ELx_GCS) >> ESR_ELx_GCS_SHIFT,
  96		 (iss2 & ESR_ELx_Overlay) >> ESR_ELx_Overlay_SHIFT,
  97		 (iss2 & ESR_ELx_DirtyBit) >> ESR_ELx_DirtyBit_SHIFT,
  98		 (iss2 & ESR_ELx_Xs_MASK) >> ESR_ELx_Xs_SHIFT);
  99}
 100
 101static void mem_abort_decode(unsigned long esr)
 102{
 103	pr_alert("Mem abort info:\n");
 104
 105	pr_alert("  ESR = 0x%016lx\n", esr);
 106	pr_alert("  EC = 0x%02lx: %s, IL = %u bits\n",
 107		 ESR_ELx_EC(esr), esr_get_class_string(esr),
 108		 (esr & ESR_ELx_IL) ? 32 : 16);
 109	pr_alert("  SET = %lu, FnV = %lu\n",
 110		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
 111		 (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
 112	pr_alert("  EA = %lu, S1PTW = %lu\n",
 113		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
 114		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
 115	pr_alert("  FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
 116		 esr_to_fault_info(esr)->name);
 117
 118	if (esr_is_data_abort(esr))
 119		data_abort_decode(esr);
 120}
 121
 122static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
 123{
 124	/* Either init_pg_dir or swapper_pg_dir */
 125	if (mm == &init_mm)
 126		return __pa_symbol(mm->pgd);
 127
 128	return (unsigned long)virt_to_phys(mm->pgd);
 129}
 130
 131/*
 132 * Dump out the page tables associated with 'addr' in the currently active mm.
 133 */
 134static void show_pte(unsigned long addr)
 135{
 136	struct mm_struct *mm;
 137	pgd_t *pgdp;
 138	pgd_t pgd;
 139
 140	if (is_ttbr0_addr(addr)) {
 141		/* TTBR0 */
 142		mm = current->active_mm;
 143		if (mm == &init_mm) {
 144			pr_alert("[%016lx] user address but active_mm is swapper\n",
 145				 addr);
 146			return;
 147		}
 148	} else if (is_ttbr1_addr(addr)) {
 149		/* TTBR1 */
 150		mm = &init_mm;
 151	} else {
 152		pr_alert("[%016lx] address between user and kernel address ranges\n",
 153			 addr);
 154		return;
 155	}
 156
 157	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
 158		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
 159		 vabits_actual, mm_to_pgd_phys(mm));
 160	pgdp = pgd_offset(mm, addr);
 161	pgd = READ_ONCE(*pgdp);
 162	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
 163
 164	do {
 165		p4d_t *p4dp, p4d;
 166		pud_t *pudp, pud;
 167		pmd_t *pmdp, pmd;
 168		pte_t *ptep, pte;
 169
 170		if (pgd_none(pgd) || pgd_bad(pgd))
 171			break;
 172
 173		p4dp = p4d_offset(pgdp, addr);
 174		p4d = READ_ONCE(*p4dp);
 175		pr_cont(", p4d=%016llx", p4d_val(p4d));
 176		if (p4d_none(p4d) || p4d_bad(p4d))
 177			break;
 178
 179		pudp = pud_offset(p4dp, addr);
 180		pud = READ_ONCE(*pudp);
 181		pr_cont(", pud=%016llx", pud_val(pud));
 182		if (pud_none(pud) || pud_bad(pud))
 183			break;
 184
 185		pmdp = pmd_offset(pudp, addr);
 186		pmd = READ_ONCE(*pmdp);
 187		pr_cont(", pmd=%016llx", pmd_val(pmd));
 188		if (pmd_none(pmd) || pmd_bad(pmd))
 189			break;
 190
 191		ptep = pte_offset_map(pmdp, addr);
 192		if (!ptep)
 193			break;
 194
 195		pte = __ptep_get(ptep);
 196		pr_cont(", pte=%016llx", pte_val(pte));
 197		pte_unmap(ptep);
 198	} while(0);
 199
 200	pr_cont("\n");
 201}
 202
 203/*
 204 * This function sets the access flags (dirty, accessed), as well as write
 205 * permission, and only to a more permissive setting.
 206 *
 207 * It needs to cope with hardware update of the accessed/dirty state by other
 208 * agents in the system and can safely skip the __sync_icache_dcache() call as,
 209 * like __set_ptes(), the PTE is never changed from no-exec to exec here.
 210 *
 211 * Returns whether or not the PTE actually changed.
 212 */
 213int __ptep_set_access_flags(struct vm_area_struct *vma,
 214			    unsigned long address, pte_t *ptep,
 215			    pte_t entry, int dirty)
 216{
 217	pteval_t old_pteval, pteval;
 218	pte_t pte = __ptep_get(ptep);
 219
	/* Nothing to update if the current PTE already equals the requested one */
 220	if (pte_same(pte, entry))
 221		return 0;
 222
 223	/* only preserve the access flags and write permission */
 224	pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;
 225
 226	/*
 227	 * Setting the flags must be done atomically to avoid racing with the
 228	 * hardware update of the access/dirty state. The PTE_RDONLY bit must
 229	 * be set to the most permissive (lowest value) of *ptep and entry
 230	 * (calculated as: a & b == ~(~a | ~b)).
 231	 */
	/* Invert PTE_RDONLY in entry once, outside the loop */
 232	pte_val(entry) ^= PTE_RDONLY;
 233	pteval = pte_val(pte);
 234	do {
		/* Remember the value this iteration was computed from */
 235		old_pteval = pteval;
		/* invert, OR in the requested bits, invert back: ANDs PTE_RDONLY */
 236		pteval ^= PTE_RDONLY;
 237		pteval |= pte_val(entry);
 238		pteval ^= PTE_RDONLY;
		/* Loop again, starting from the returned value, if *ptep changed under us */
 239		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
 240	} while (pteval != old_pteval);
 241
 242	/* Invalidate a stale read-only entry */
 243	if (dirty)
 244		flush_tlb_page(vma, address);
 245	return 1;
 246}
 247
 248static bool is_el1_instruction_abort(unsigned long esr)
 249{
 250	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
 251}
 252
 253static bool is_el1_data_abort(unsigned long esr)
 254{
 255	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
 256}
 257
 258static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
 259					   struct pt_regs *regs)
 260{
 
 
 261	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
 262		return false;
 263
 264	if (esr_fsc_is_permission_fault(esr))
 265		return true;
 266
 267	if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
 268		return esr_fsc_is_translation_fault(esr) &&
 269			(regs->pstate & PSR_PAN_BIT);
 270
 271	return false;
 272}
 273
 274static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
 275							unsigned long esr,
 276							struct pt_regs *regs)
 277{
 278	unsigned long flags;
 279	u64 par, dfsc;
 280
 281	if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr))
 
 282		return false;
 283
 284	local_irq_save(flags);
 285	asm volatile("at s1e1r, %0" :: "r" (addr));
 286	isb();
 287	par = read_sysreg_par();
 288	local_irq_restore(flags);
 289
 290	/*
 291	 * If we now have a valid translation, treat the translation fault as
 292	 * spurious.
 293	 */
 294	if (!(par & SYS_PAR_EL1_F))
 295		return true;
 296
 297	/*
 298	 * If we got a different type of fault from the AT instruction,
 299	 * treat the translation fault as spurious.
 300	 */
 301	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
 302	return !esr_fsc_is_translation_fault(dfsc);
 303}
 304
 305static void die_kernel_fault(const char *msg, unsigned long addr,
 306			     unsigned long esr, struct pt_regs *regs)
 307{
 308	bust_spinlocks(1);
 309
 310	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
 311		 addr);
 312
 313	kasan_non_canonical_hook(addr);
 314
 315	mem_abort_decode(esr);
 316
 317	show_pte(addr);
 318	die("Oops", regs, esr);
 319	bust_spinlocks(0);
 320	make_task_dead(SIGKILL);
 321}
 322
 323#ifdef CONFIG_KASAN_HW_TAGS
 324static void report_tag_fault(unsigned long addr, unsigned long esr,
 325			     struct pt_regs *regs)
 326{
 327	/*
 328	 * SAS bits aren't set for all faults reported in EL1, so we can't
 329	 * find out access size.
 330	 */
 331	bool is_write = !!(esr & ESR_ELx_WNR);
 332	kasan_report((void *)addr, 0, is_write, regs->pc);
 333}
 334#else
 335/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
 336static inline void report_tag_fault(unsigned long addr, unsigned long esr,
 337				    struct pt_regs *regs) { }
 338#endif
 339
 340static void do_tag_recovery(unsigned long addr, unsigned long esr,
 341			   struct pt_regs *regs)
 342{
 343
 344	report_tag_fault(addr, esr, regs);
 345
 346	/*
 347	 * Disable MTE Tag Checking on the local CPU for the current EL.
 348	 * It will be done lazily on the other CPUs when they will hit a
 349	 * tag fault.
 350	 */
 351	sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
 352			 SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
 353	isb();
 354}
 355
 356static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
 357{
 358	unsigned long fsc = esr & ESR_ELx_FSC;
 359
 360	if (!is_el1_data_abort(esr))
 361		return false;
 362
 363	if (fsc == ESR_ELx_FSC_MTE)
 364		return true;
 365
 366	return false;
 367}
 368
 
 
 
 
 
 369static void __do_kernel_fault(unsigned long addr, unsigned long esr,
 370			      struct pt_regs *regs)
 371{
 372	const char *msg;
 373
 374	/*
 375	 * Are we prepared to handle this kernel fault?
 376	 * We are almost certainly not prepared to handle instruction faults.
 377	 */
 378	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
 379		return;
 380
 381	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
 382	    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
 383		return;
 384
 385	if (is_el1_mte_sync_tag_check_fault(esr)) {
 386		do_tag_recovery(addr, esr, regs);
 387
 388		return;
 389	}
 390
 391	if (is_el1_permission_fault(addr, esr, regs)) {
 392		if (esr & ESR_ELx_WNR)
 393			msg = "write to read-only memory";
 394		else if (is_el1_instruction_abort(esr))
 395			msg = "execute from non-executable memory";
 396		else
 397			msg = "read from unreadable memory";
 398	} else if (addr < PAGE_SIZE) {
 399		msg = "NULL pointer dereference";
 400	} else {
 401		if (esr_fsc_is_translation_fault(esr) &&
 402		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
 403			return;
 404
 405		msg = "paging request";
 406	}
 407
 408	if (efi_runtime_fixup_exception(regs, msg))
 409		return;
 410
 411	die_kernel_fault(msg, addr, esr, regs);
 412}
 413
 414static void set_thread_esr(unsigned long address, unsigned long esr)
 415{
 416	current->thread.fault_address = address;
 417
 418	/*
 419	 * If the faulting address is in the kernel, we must sanitize the ESR.
 420	 * From userspace's point of view, kernel-only mappings don't exist
 421	 * at all, so we report them as level 0 translation faults.
 422	 * (This is not quite the way that "no mapping there at all" behaves:
 423	 * an alignment fault not caused by the memory type would take
 424	 * precedence over translation fault for a real access to empty
 425	 * space. Unfortunately we can't easily distinguish "alignment fault
 426	 * not caused by memory type" from "alignment fault caused by memory
 427	 * type", so we ignore this wrinkle and just return the translation
 428	 * fault.)
 429	 */
 430	if (!is_ttbr0_addr(current->thread.fault_address)) {
 431		switch (ESR_ELx_EC(esr)) {
 432		case ESR_ELx_EC_DABT_LOW:
 433			/*
 434			 * These bits provide only information about the
 435			 * faulting instruction, which userspace knows already.
 436			 * We explicitly clear bits which are architecturally
 437			 * RES0 in case they are given meanings in future.
 438			 * We always report the ESR as if the fault was taken
 439			 * to EL1 and so ISV and the bits in ISS[23:14] are
 440			 * clear. (In fact it always will be a fault to EL1.)
 441			 */
 442			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
 443				ESR_ELx_CM | ESR_ELx_WNR;
 444			esr |= ESR_ELx_FSC_FAULT;
 445			break;
 446		case ESR_ELx_EC_IABT_LOW:
 447			/*
 448			 * Claim a level 0 translation fault.
 449			 * All other bits are architecturally RES0 for faults
 450			 * reported with that DFSC value, so we clear them.
 451			 */
 452			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
 453			esr |= ESR_ELx_FSC_FAULT;
 454			break;
 455		default:
 456			/*
 457			 * This should never happen (entry.S only brings us
 458			 * into this code for insn and data aborts from a lower
 459			 * exception level). Fail safe by not providing an ESR
 460			 * context record at all.
 461			 */
 462			WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
 463			esr = 0;
 464			break;
 465		}
 466	}
 467
 468	current->thread.fault_code = esr;
 469}
 470
 471static void do_bad_area(unsigned long far, unsigned long esr,
 472			struct pt_regs *regs)
 473{
 474	unsigned long addr = untagged_addr(far);
 475
 476	/*
 477	 * If we are in kernel mode at this point, we have no context to
 478	 * handle this fault with.
 479	 */
 480	if (user_mode(regs)) {
 481		const struct fault_info *inf = esr_to_fault_info(esr);
 482
 483		set_thread_esr(addr, esr);
 484		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
 485	} else {
 486		__do_kernel_fault(addr, esr, regs);
 487	}
 488}
 489
 490static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
 491			unsigned int mm_flags)
 492{
 493	unsigned long iss2 = ESR_ELx_ISS2(esr);
 494
 495	if (!system_supports_poe())
 496		return false;
 497
 498	if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
 499		return true;
 500
 501	return !arch_vma_access_permitted(vma,
 502			mm_flags & FAULT_FLAG_WRITE,
 503			mm_flags & FAULT_FLAG_INSTRUCTION,
 504			false);
 505}
 506
 507static bool is_gcs_fault(unsigned long esr)
 508{
 509	if (!esr_is_data_abort(esr))
 510		return false;
 511
 512	return ESR_ELx_ISS2(esr) & ESR_ELx_GCS;
 
 
 513}
 514
 515static bool is_el0_instruction_abort(unsigned long esr)
 516{
 517	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
 518}
 519
 520/*
 521 * Note: not valid for EL1 DC IVAC, but we never use that such that it
 522 * should fault. EL0 cannot issue DC IVAC (undef).
 523 */
 524static bool is_write_abort(unsigned long esr)
 525{
 526	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 527}
 528
 529static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr)
 530{
 531	if (!system_supports_gcs())
 532		return false;
 533
 534	if (unlikely(is_gcs_fault(esr))) {
 535		/* GCS accesses must be performed on a GCS page */
 536		if (!(vma->vm_flags & VM_SHADOW_STACK))
 537			return true;
 538	} else if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) {
 539		/* Only GCS operations can write to a GCS page */
 540		return esr_is_data_abort(esr) && is_write_abort(esr);
 541	}
 542
 543	return false;
 544}
 545
/*
 * Handle a fault that may be resolvable through the mm. The required VMA
 * permissions and the fault flags are derived from @esr; the fault is first
 * attempted under the per-VMA lock (user-mode faults only) and then under
 * the mmap lock. If it cannot be resolved, a signal is sent for user-mode
 * faults and __do_kernel_fault() is called otherwise.
 *
 * Every return path visible here returns 0.
 */
 546static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 547				   struct pt_regs *regs)
 548{
 549	const struct fault_info *inf;
 550	struct mm_struct *mm = current->mm;
 551	vm_fault_t fault;
 552	unsigned long vm_flags;
 553	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
 554	unsigned long addr = untagged_addr(far);
 555	struct vm_area_struct *vma;
 556	int si_code;
	/* Only given a real value on the SEGV_PKUERR paths below */
 557	int pkey = -1;
 558
 559	if (kprobe_page_fault(regs, esr))
 560		return 0;
 561
 562	/*
 563	 * If we're in an interrupt or have no user context, we must not take
 564	 * the fault.
 565	 */
 566	if (faulthandler_disabled() || !mm)
 567		goto no_context;
 568
 569	if (user_mode(regs))
 570		mm_flags |= FAULT_FLAG_USER;
 571
 572	/*
 573	 * vm_flags tells us what bits we must have in vma->vm_flags
 574	 * for the fault to be benign, __do_page_fault() would check
 575	 * vma->vm_flags & vm_flags and returns an error if the
 576	 * intersection is empty
 577	 */
 578	if (is_el0_instruction_abort(esr)) {
 579		/* It was exec fault */
 580		vm_flags = VM_EXEC;
 581		mm_flags |= FAULT_FLAG_INSTRUCTION;
 582	} else if (is_gcs_fault(esr)) {
 583		/*
 584		 * The GCS permission on a page implies both read and
 585		 * write so always handle any GCS fault as a write fault,
 586		 * we need to trigger CoW even for GCS reads.
 587		 */
 588		vm_flags = VM_WRITE;
 589		mm_flags |= FAULT_FLAG_WRITE;
 590	} else if (is_write_abort(esr)) {
 591		/* It was write fault */
 592		vm_flags = VM_WRITE;
 593		mm_flags |= FAULT_FLAG_WRITE;
 594	} else {
 595		/* It was read fault */
 596		vm_flags = VM_READ;
 597		/* Write implies read */
 598		vm_flags |= VM_WRITE;
 599		/* If EPAN is absent then exec implies read */
 600		if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN))
 601			vm_flags |= VM_EXEC;
 602	}
 603
	/*
	 * An EL1 permission fault on a TTBR0 address is fatal when it is an
	 * instruction abort, or when regs->pc has no exception table entry.
	 */
 604	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
 605		if (is_el1_instruction_abort(esr))
 606			die_kernel_fault("execution of user memory",
 607					 addr, esr, regs);
 608
 609		if (!search_exception_tables(regs->pc))
 610			die_kernel_fault("access to user memory outside uaccess routines",
 611					 addr, esr, regs);
 612	}
 613
 614	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 615
	/* The per-VMA lock path is only attempted for user-mode faults */
 616	if (!(mm_flags & FAULT_FLAG_USER))
 617		goto lock_mmap;
 618
 619	vma = lock_vma_under_rcu(mm, addr);
 620	if (!vma)
 621		goto lock_mmap;
 622
	/*
	 * NOTE(review): unlike the two rejection paths that follow, this one
	 * does not call count_vm_vma_lock_event(VMA_LOCK_SUCCESS), and
	 * is_invalid_gcs_access() is not repeated on the mmap-lock path after
	 * 'retry:' -- confirm both are intentional.
	 */
 623	if (is_invalid_gcs_access(vma, esr)) {
 624		vma_end_read(vma);
 625		fault = 0;
 626		si_code = SEGV_ACCERR;
 627		goto bad_area;
 628	}
 629
	/* The VMA grants none of the permissions this access needs */
 630	if (!(vma->vm_flags & vm_flags)) {
 631		vma_end_read(vma);
 632		fault = 0;
 633		si_code = SEGV_ACCERR;
 634		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
 635		goto bad_area;
 636	}
 637
	/* Read the pkey before the VMA read lock is dropped */
 638	if (fault_from_pkey(esr, vma, mm_flags)) {
 639		pkey = vma_pkey(vma);
 640		vma_end_read(vma);
 641		fault = 0;
 642		si_code = SEGV_PKUERR;
 643		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
 644		goto bad_area;
 645	}
 646
 647	fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
	/* vma_end_read() is skipped when RETRY or COMPLETED is reported */
 648	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 649		vma_end_read(vma);
 650
 651	if (!(fault & VM_FAULT_RETRY)) {
 652		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
 653		goto done;
 654	}
 655	count_vm_vma_lock_event(VMA_LOCK_RETRY);
 656	if (fault & VM_FAULT_MAJOR)
 657		mm_flags |= FAULT_FLAG_TRIED;
 658
 659	/* Quick path to respond to signals */
 660	if (fault_signal_pending(fault, regs)) {
 661		if (!user_mode(regs))
 662			goto no_context;
 663		return 0;
 664	}
 665lock_mmap:
 666
 667retry:
	/* Fallback path: look the VMA up with the mmap lock held */
 668	vma = lock_mm_and_find_vma(mm, addr, regs);
 669	if (unlikely(!vma)) {
 670		fault = 0;
 671		si_code = SEGV_MAPERR;
 672		goto bad_area;
 673	}
 674
 675	if (!(vma->vm_flags & vm_flags)) {
 676		mmap_read_unlock(mm);
 677		fault = 0;
 678		si_code = SEGV_ACCERR;
 679		goto bad_area;
 680	}
 681
 682	if (fault_from_pkey(esr, vma, mm_flags)) {
 683		pkey = vma_pkey(vma);
 684		mmap_read_unlock(mm);
 685		fault = 0;
 686		si_code = SEGV_PKUERR;
 687		goto bad_area;
 688	}
 689
 690	fault = handle_mm_fault(vma, addr, mm_flags, regs);
 691
 692	/* Quick path to respond to signals */
 693	if (fault_signal_pending(fault, regs)) {
 694		if (!user_mode(regs))
 695			goto no_context;
 696		return 0;
 697	}
 698
 699	/* The fault is fully completed (including releasing mmap lock) */
 700	if (fault & VM_FAULT_COMPLETED)
 701		return 0;
 702
 703	if (fault & VM_FAULT_RETRY) {
 704		mm_flags |= FAULT_FLAG_TRIED;
 705		goto retry;
 706	}
 707	mmap_read_unlock(mm);
 708
 709done:
 710	/* Handle the "normal" (no error) case first. */
 711	if (likely(!(fault & VM_FAULT_ERROR)))



 712		return 0;
 713
	/* handle_mm_fault() reported an error: default to SEGV_MAPERR */
 714	si_code = SEGV_MAPERR;
 715bad_area:
 716	/*
 717	 * If we are in kernel mode at this point, we have no context to
 718	 * handle this fault with.
 719	 */
 720	if (!user_mode(regs))
 721		goto no_context;
 722
 723	if (fault & VM_FAULT_OOM) {
 724		/*
 725		 * We ran out of memory, call the OOM killer, and return to
 726		 * userspace (which will retry the fault, or kill us if we got
 727		 * oom-killed).
 728		 */
 729		pagefault_out_of_memory();
 730		return 0;
 731	}
 732
 733	inf = esr_to_fault_info(esr);
 734	set_thread_esr(addr, esr);
 735	if (fault & VM_FAULT_SIGBUS) {
 736		/*
 737		 * We had some memory, but were unable to successfully fix up
 738		 * this page fault.
 739		 */
 740		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
 741	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
 742		unsigned int lsb;
 743
		/* lsb: PAGE_SHIFT, or the hstate's shift for a large poisoned page */
 744		lsb = PAGE_SHIFT;
 745		if (fault & VM_FAULT_HWPOISON_LARGE)
 746			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
 747
 748		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
 749	} else {
 750		/*
 751		 * The pkey value that we return to userspace can be different
 752		 * from the pkey that caused the fault.
 753		 *
 754		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
 755		 * 2. T1   : set POR_EL0 to deny access to pkey=4, touches, page
 756		 * 3. T1   : faults...
 757		 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
 758		 * 5. T1   : enters fault handler, takes mmap_lock, etc...
 759		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
 760		 *	     faulted on a pte with its pkey=4.
 761		 */
 762		/* Something tried to access memory that is outside the memory map */
 763		if (si_code == SEGV_PKUERR)
 764			arm64_force_sig_fault_pkey(far, inf->name, pkey);
 765		else
 766			arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
 767	}
 768
 769	return 0;
 770
 771no_context:
 772	__do_kernel_fault(addr, esr, regs);
 773	return 0;
 774}
 775
 776static int __kprobes do_translation_fault(unsigned long far,
 777					  unsigned long esr,
 778					  struct pt_regs *regs)
 779{
 780	unsigned long addr = untagged_addr(far);
 781
 782	if (is_ttbr0_addr(addr))
 783		return do_page_fault(far, esr, regs);
 784
 785	do_bad_area(far, esr, regs);
 786	return 0;
 787}
 788
 789static int do_alignment_fault(unsigned long far, unsigned long esr,
 790			      struct pt_regs *regs)
 791{
 792	if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) &&
 793	    compat_user_mode(regs))
 794		return do_compat_alignment_fixup(far, regs);
 795	do_bad_area(far, esr, regs);
 796	return 0;
 797}
 798
 799static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
 800{
 801	return 1; /* "fault" */
 802}
 803
 804static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
 805{
 806	const struct fault_info *inf;
 807	unsigned long siaddr;
 808
 809	inf = esr_to_fault_info(esr);
 810
 811	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
 812		/*
 813		 * APEI claimed this as a firmware-first notification.
 814		 * Some processing deferred to task_work before ret_to_user().
 815		 */
 816		return 0;
 817	}
 818
 819	if (esr & ESR_ELx_FnV) {
 820		siaddr = 0;
 821	} else {
 822		/*
 823		 * The architecture specifies that the tag bits of FAR_EL1 are
 824		 * UNKNOWN for synchronous external aborts. Mask them out now
 825		 * so that userspace doesn't see them.
 826		 */
 827		siaddr  = untagged_addr(far);
 828	}
 829	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 830
 831	return 0;
 832}
 833
 834static int do_tag_check_fault(unsigned long far, unsigned long esr,
 835			      struct pt_regs *regs)
 836{
 837	/*
 838	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
 839	 * for tag check faults. Set them to corresponding bits in the untagged
 840	 * address.
 841	 */
 842	far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
 843	do_bad_area(far, esr, regs);
 844	return 0;
 845}
 846
/*
 * Fault dispatch table, looked up by esr_to_fault_info() with
 * (esr & ESR_ELx_FSC) as the index; entry N describes fault status code N.
 * Columns: handler, signal, si_code, description.
 */
 847static const struct fault_info fault_info[] = {
 848	{ do_bad,		SIGKILL, SI_KERNEL,	"ttbr address size fault"	},
 849	{ do_bad,		SIGKILL, SI_KERNEL,	"level 1 address size fault"	},
 850	{ do_bad,		SIGKILL, SI_KERNEL,	"level 2 address size fault"	},
 851	{ do_bad,		SIGKILL, SI_KERNEL,	"level 3 address size fault"	},
 852	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 0 translation fault"	},
 853	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 1 translation fault"	},
 854	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 2 translation fault"	},
 855	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault"	},
 856	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 0 access flag fault"	},
 857	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 access flag fault"	},
 858	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 access flag fault"	},
 859	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 access flag fault"	},
 860	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 0 permission fault"	},
 861	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 permission fault"	},
 862	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 permission fault"	},
 863	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 permission fault"	},
 864	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous external abort"	},
 865	{ do_tag_check_fault,	SIGSEGV, SEGV_MTESERR,	"synchronous tag check fault"	},
 866	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 18"			},
 867	{ do_sea,		SIGKILL, SI_KERNEL,	"level -1 (translation table walk)"	},
 868	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 (translation table walk)"	},
 869	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 (translation table walk)"	},
 870	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 (translation table walk)"	},
 871	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 (translation table walk)"	},
 872	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous parity or ECC error" },	// Reserved when RAS is implemented
 873	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 25"			},
 874	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 26"			},
 875	{ do_sea,		SIGKILL, SI_KERNEL,	"level -1 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
 876	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
 877	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
 878	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
 879	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 synchronous parity error (translation table walk)"	},	// Reserved when RAS is implemented
 880	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 32"			},
 881	{ do_alignment_fault,	SIGBUS,  BUS_ADRALN,	"alignment fault"		},
 882	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 34"			},
 883	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 35"			},
 884	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 36"			},
 885	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 37"			},
 886	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 38"			},
 887	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 39"			},
 888	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 40"			},
 889	{ do_bad,		SIGKILL, SI_KERNEL,	"level -1 address size fault"	},
 890	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 42"			},
 891	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level -1 translation fault"	},
 892	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 44"			},
 893	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 45"			},
 894	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 46"			},
 895	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 47"			},
 896	{ do_bad,		SIGKILL, SI_KERNEL,	"TLB conflict abort"		},
 897	{ do_bad,		SIGKILL, SI_KERNEL,	"Unsupported atomic hardware update fault"	},
 898	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 50"			},
 899	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 51"			},
 900	{ do_bad,		SIGKILL, SI_KERNEL,	"implementation fault (lockdown abort)" },
 901	{ do_bad,		SIGBUS,  BUS_OBJERR,	"implementation fault (unsupported exclusive)" },
 902	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 54"			},
 903	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 55"			},
 904	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 56"			},
 905	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 57"			},
 906	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 58" 			},
 907	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 59"			},
 908	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 60"			},
 909	{ do_bad,		SIGKILL, SI_KERNEL,	"section domain fault"		},
 910	{ do_bad,		SIGKILL, SI_KERNEL,	"page domain fault"		},
 911	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63"			},
 912};
 913
 914void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
 915{
 916	const struct fault_info *inf = esr_to_fault_info(esr);
 917	unsigned long addr = untagged_addr(far);
 918
 919	if (!inf->fn(far, esr, regs))
 920		return;
 921
 922	if (!user_mode(regs))
 923		die_kernel_fault(inf->name, addr, esr, regs);
 924
 925	/*
 926	 * At this point we have an unrecognized fault type whose tag bits may
 927	 * have been defined as UNKNOWN. Therefore we only expose the untagged
 928	 * address to the signal handler.
 929	 */
 930	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
 931}
 932NOKPROBE_SYMBOL(do_mem_abort);
 933
 934void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
 935{
 936	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
 937			 addr, esr);
 938}
 939NOKPROBE_SYMBOL(do_sp_pc_abort);
 940
 941/*
 942 * __refdata because early_brk64 is __init, but the reference to it is
 943 * clobbered at arch_initcall time.
 944 * See traps.c and debug-monitors.c:debug_traps_init().
 945 */
/*
 * Debug exception dispatch table, looked up by esr_to_debug_fault_info()
 * with DBG_ESR_EVT(esr) as the index. Not const: hook_debug_fault_code()
 * overwrites entries at init time.
 */
 946static struct fault_info __refdata debug_fault_info[] = {
 947	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware breakpoint"	},
 948	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware single-step"	},
 949	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware watchpoint"	},
 950	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 3"		},
 951	{ do_bad,	SIGTRAP,	TRAP_BRKPT,	"aarch32 BKPT"		},
 952	{ do_bad,	SIGKILL,	SI_KERNEL,	"aarch32 vector catch"	},
 953	{ early_brk64,	SIGTRAP,	TRAP_BRKPT,	"aarch64 BRK"		},
 954	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 7"		},
 955};
 956
 957void __init hook_debug_fault_code(int nr,
 958				  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
 959				  int sig, int code, const char *name)
 960{
 961	BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));
 962
 963	debug_fault_info[nr].fn		= fn;
 964	debug_fault_info[nr].sig	= sig;
 965	debug_fault_info[nr].code	= code;
 966	debug_fault_info[nr].name	= name;
 967}
 968
 969/*
 970 * In debug exception context, we explicitly disable preemption despite
 971 * having interrupts disabled.
 972 * This serves two purposes: it makes it much less likely that we would
 973 * accidentally schedule in exception context and it will force a warning
 974 * if we somehow manage to schedule by accident.
 975 */
 976static void debug_exception_enter(struct pt_regs *regs)
 977{
 978	preempt_disable();
 979
 980	/* This code is a bit fragile.  Test it. */
 981	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
 982}
 983NOKPROBE_SYMBOL(debug_exception_enter);
 984
 985static void debug_exception_exit(struct pt_regs *regs)
 986{
 987	preempt_enable_no_resched();
 988}
 989NOKPROBE_SYMBOL(debug_exception_exit);
 990
 991void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
 992			struct pt_regs *regs)
 993{
 994	const struct fault_info *inf = esr_to_debug_fault_info(esr);
 995	unsigned long pc = instruction_pointer(regs);
 996
 997	debug_exception_enter(regs);
 998
 999	if (user_mode(regs) && !is_ttbr0_addr(pc))
1000		arm64_apply_bp_hardening();
1001
1002	if (inf->fn(addr_if_watchpoint, esr, regs)) {
1003		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
1004	}
1005
1006	debug_exception_exit(regs);
1007}
1008NOKPROBE_SYMBOL(do_debug_exception);
1009
1010/*
1011 * Used during anonymous page fault handling.
1012 */
1013struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
1014						unsigned long vaddr)
1015{
1016	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;
1017
1018	/*
1019	 * If the page is mapped with PROT_MTE, initialise the tags at the
1020	 * point of allocation and page zeroing as this is usually faster than
1021	 * separate DC ZVA and STGM.
1022	 */
1023	if (vma->vm_flags & VM_MTE)
1024		flags |= __GFP_ZEROTAGS;
1025
1026	return vma_alloc_folio(flags, 0, vma, vaddr);
1027}
1028
/* Zero @page and clear its MTE tags, then mark the page as tagged. */
void tag_clear_highpage(struct page *page)
{
	void *kaddr;

	/* Newly allocated page, shouldn't have been tagged yet */
	if (!try_page_mte_tagging(page))
		WARN_ON_ONCE(1);

	kaddr = page_address(page);
	mte_zero_clear_page_tags(kaddr);

	set_page_mte_tagged(page);
}