// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This code is based in part on work published here:
 *
 *	https://github.com/IAIK/KAISER
 *
 * The original work was written by and signed off for the Linux
 * kernel by:
 *
 *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
 *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
 *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
 *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
 *
 * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
 * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
 *		       Andy Lutomirsky <luto@amacapital.net>
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/cpu.h>

#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/sections.h>

#undef pr_fmt
#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt

/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK	0
#endif

/*
 * Define the page-table levels we clone for user-space on 32
 * and 64 bit.
 */
#ifdef CONFIG_X86_64
#define	PTI_LEVEL_KERNEL_IMAGE	PTI_CLONE_PMD
#else
#define	PTI_LEVEL_KERNEL_IMAGE	PTI_CLONE_PTE
#endif
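
/*
 * Rough rationale for the different levels: on 64 bit the kernel image
 * is mapped with huge pages that can be shared wholesale at PMD level.
 * On 32 bit the kernel sections are not necessarily huge-page aligned,
 * so a PMD-level clone could also expose adjacent kernel memory;
 * cloning at PTE granularity avoids that.
 */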

static void __init pti_print_if_insecure(const char *reason)
{
	if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

static void __init pti_print_if_secure(const char *reason)
{
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		pr_info("%s\n", reason);
}

static enum pti_mode {
	PTI_AUTO = 0,
	PTI_FORCE_OFF,
	PTI_FORCE_ON
} pti_mode;

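/*
 * Boot-time configuration, parsed below from the kernel command line:
 *
 *   pti=off   - force-disable isolation
 *   pti=on    - force-enable, even on CPUs not affected by Meltdown
 *   pti=auto  - enable only if X86_BUG_CPU_MELTDOWN is set (the default)
 *   nopti     - equivalent to pti=off
 *
 * "mitigations=off" also disables PTI, via cpu_mitigations_off().
 */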
void __init pti_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	/* Assume mode is auto unless overridden. */
	pti_mode = PTI_AUTO;

	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
		pti_mode = PTI_FORCE_OFF;
		pti_print_if_insecure("disabled on XEN PV.");
		return;
	}

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0)  {
		if (ret == 3 && !strncmp(arg, "off", 3)) {
			pti_mode = PTI_FORCE_OFF;
			pti_print_if_insecure("disabled on command line.");
			return;
		}
		if (ret == 2 && !strncmp(arg, "on", 2)) {
			pti_mode = PTI_FORCE_ON;
			pti_print_if_secure("force enabled on command line.");
			goto enable;
		}
		if (ret == 4 && !strncmp(arg, "auto", 4)) {
			pti_mode = PTI_AUTO;
			goto autosel;
		}
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti") ||
	    cpu_mitigations_off()) {
		pti_mode = PTI_FORCE_OFF;
		pti_print_if_insecure("disabled on command line.");
		return;
	}

autosel:
	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
		return;
enable:
	setup_force_cpu_cap(X86_FEATURE_PTI);
}

pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
	/*
	 * Changes to the high (kernel) portion of the kernelmode page
	 * tables are not automatically propagated to the usermode tables.
	 *
	 * Users should keep in mind that, unlike the kernelmode tables,
	 * there is no vmalloc_fault equivalent for the usermode tables.
	 * Top-level entries added to init_mm's usermode pgd after boot
	 * will not be automatically propagated to other mms.
	 */
	if (!pgdp_maps_userspace(pgdp))
		return pgd;

	/*
	 * The user page tables get the full PGD, accessible from
	 * userspace:
	 */
	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;

	/*
	 * If this is normal user memory, make it NX in the kernel
	 * pagetables so that, if we somehow screw up and return to
	 * usermode with the kernel CR3 loaded, we'll get a page fault
	 * instead of allowing user code to execute with the wrong CR3.
	 *
	 * As exceptions, we don't set NX if:
	 *  - _PAGE_USER is not set.  This could be an executable
	 *     EFI runtime mapping or something similar, and the kernel
	 *     may execute from it
	 *  - we don't have NX support
	 *  - we're clearing the PGD (i.e. the new pgd is not present).
	 */
	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
	    (__supported_pte_mask & _PAGE_NX))
		pgd.pgd |= _PAGE_NX;

	/* return the copy of the PGD we want the kernel to use: */
	return pgd;
}
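
/*
 * __pti_set_user_pgtbl() is rarely called directly; most PGD updates
 * reach it through the pti_set_user_pgtbl() wrapper (see
 * asm/pgtable.h), which is a no-op unless X86_FEATURE_PTI is set.
 */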

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a P4D on success, or NULL on failure.
 */
static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (address < PAGE_OFFSET) {
		WARN_ONCE(1, "attempt to walk user address\n");
		return NULL;
	}

	if (pgd_none(*pgd)) {
		unsigned long new_p4d_page = __get_free_page(gfp);
		if (WARN_ON_ONCE(!new_p4d_page))
			return NULL;

		set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	return p4d_offset(pgd, address);
}
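
/*
 * A sketch of the layout that kernel_to_user_pgdp() above relies on
 * (see asm/pgtable_64.h): with PTI enabled, each PGD is an order-1
 * allocation holding both copies, so switching between them only
 * flips one address bit, mirroring the CR3 switch done at kernel
 * entry/exit:
 *
 *   pgd              -> kernel-mode page-table (used while in the kernel)
 *   pgd + PAGE_SIZE  -> user-mode page-table (loaded on return to user)
 */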

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a PMD on success, or NULL on failure.
 */
static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	p4d_t *p4d;
	pud_t *pud;

	p4d = pti_user_pagetable_walk_p4d(address);
	if (!p4d)
		return NULL;

	BUILD_BUG_ON(p4d_large(*p4d) != 0);
	if (p4d_none(*p4d)) {
		unsigned long new_pud_page = __get_free_page(gfp);
		if (WARN_ON_ONCE(!new_pud_page))
			return NULL;

		set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
	}

	pud = pud_offset(p4d, address);
	/* The user page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (WARN_ON_ONCE(!new_pmd_page))
			return NULL;

		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
	}

	return pmd_offset(pud, address);
}

/*
 * Walk the shadow copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.  Does not support large pages.
 *
 * Note: this is only used when mapping *new* kernel data into the
 * user/shadow page tables.  It is never used for userspace data.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	pmd_t *pmd;
	pte_t *pte;

	pmd = pti_user_pagetable_walk_pmd(address);
	if (!pmd)
		return NULL;

	/* We can't do anything sensible if we hit a large mapping. */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}

	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;

		set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
	}

	pte = pte_offset_kernel(pmd, address);
	if (pte_flags(*pte) & _PAGE_USER) {
		WARN_ONCE(1, "attempt to walk to user pte\n");
		return NULL;
	}
	return pte;
}

#ifdef CONFIG_X86_VSYSCALL_EMULATION
static void __init pti_setup_vsyscall(void)
{
	pte_t *pte, *target_pte;
	unsigned int level;

	pte = lookup_address(VSYSCALL_ADDR, &level);
	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
		return;

	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
	if (WARN_ON(!target_pte))
		return;

	*target_pte = *pte;
	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif

enum pti_clone_level {
	PTI_CLONE_PMD,
	PTI_CLONE_PTE,
};

static void
pti_clone_pgtable(unsigned long start, unsigned long end,
		  enum pti_clone_level level)
{
	unsigned long addr;

	/*
	 * Clone the populated PMDs which cover start to end. These PMD areas
	 * can have holes.
	 */
	for (addr = start; addr < end;) {
		pte_t *pte, *target_pte;
		pmd_t *pmd, *target_pmd;
		pgd_t *pgd;
		p4d_t *p4d;
		pud_t *pud;

		/* Overflow check */
		if (addr < start)
			break;

		pgd = pgd_offset_k(addr);
		if (WARN_ON(pgd_none(*pgd)))
			return;
		p4d = p4d_offset(pgd, addr);
		if (WARN_ON(p4d_none(*p4d)))
			return;

		pud = pud_offset(p4d, addr);
		if (pud_none(*pud)) {
			WARN_ON_ONCE(addr & ~PUD_MASK);
			addr = round_up(addr + 1, PUD_SIZE);
			continue;
		}

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			WARN_ON_ONCE(addr & ~PMD_MASK);
			addr = round_up(addr + 1, PMD_SIZE);
			continue;
		}

		if (pmd_large(*pmd) || level == PTI_CLONE_PMD) {
			target_pmd = pti_user_pagetable_walk_pmd(addr);
			if (WARN_ON(!target_pmd))
				return;

			/*
			 * Only clone present PMDs.  This ensures only setting
			 * _PAGE_GLOBAL on present PMDs.  This should only be
			 * called on well-known addresses anyway, so a non-
			 * present PMD would be a surprise.
			 */
			if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)))
				return;

			/*
			 * Setting 'target_pmd' below creates a mapping in both
			 * the user and kernel page tables.  It is effectively
			 * global, so set it as global in both copies.  Note:
			 * the X86_FEATURE_PGE check is not _required_ because
			 * the CPU ignores _PAGE_GLOBAL when PGE is not
			 * supported.  The check keeps consistency with
			 * code that only sets this bit when supported.
			 */
			if (boot_cpu_has(X86_FEATURE_PGE))
				*pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL);

			/*
			 * Copy the PMD.  That is, the kernelmode and usermode
			 * tables will share the last-level page tables of this
			 * address range.
			 */
			*target_pmd = *pmd;

			addr += PMD_SIZE;

		} else if (level == PTI_CLONE_PTE) {

			/* Walk the page-table down to the pte level */
			pte = pte_offset_kernel(pmd, addr);
			if (pte_none(*pte)) {
				addr += PAGE_SIZE;
				continue;
			}

			/* Only clone present PTEs */
			if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT)))
				return;

			/* Allocate PTE in the user page-table */
			target_pte = pti_user_pagetable_walk_pte(addr);
			if (WARN_ON(!target_pte))
				return;

			/* Set GLOBAL bit in both PTEs */
			if (boot_cpu_has(X86_FEATURE_PGE))
				*pte = pte_set_flags(*pte, _PAGE_GLOBAL);

			/* Clone the PTE */
			*target_pte = *pte;

			addr += PAGE_SIZE;

		} else {
			BUG();
		}
	}
}

#ifdef CONFIG_X86_64
/*
 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
 * next-level entry on 5-level systems).
 */
static void __init pti_clone_p4d(unsigned long addr)
{
	p4d_t *kernel_p4d, *user_p4d;
	pgd_t *kernel_pgd;

	user_p4d = pti_user_pagetable_walk_p4d(addr);
	if (!user_p4d)
		return;

	kernel_pgd = pgd_offset_k(addr);
	kernel_p4d = p4d_offset(kernel_pgd, addr);
	*user_p4d = *kernel_p4d;
}

/*
 * Clone the CPU_ENTRY_AREA and associated data into the user space visible
 * page table.
 */
static void __init pti_clone_user_shared(void)
{
	unsigned int cpu;

	pti_clone_p4d(CPU_ENTRY_AREA_BASE);

	for_each_possible_cpu(cpu) {
		/*
		 * The SYSCALL64 entry code needs to be able to find the
		 * thread stack and needs one word of scratch space in which
		 * to spill a register.  All of this lives in the TSS, in
		 * the sp1 and sp2 slots.
		 *
		 * This is done for all possible CPUs during boot to ensure
		 * that it's propagated to all mms.  If we were to add one of
		 * these mappings during CPU hotplug, we would need to take
		 * some measure to make sure that every mm that subsequently
		 * ran on that CPU would have the relevant PGD entry in its
		 * pagetables.  The usual vmalloc_fault() mechanism would not
		 * work for page faults taken in entry_SYSCALL_64 before RSP
		 * is set up.
		 */

		unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);
		phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
		pte_t *target_pte;

		target_pte = pti_user_pagetable_walk_pte(va);
		if (WARN_ON(!target_pte))
			return;

		*target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL);
	}
}

#else /* CONFIG_X86_64 */

/*
 * On 32 bit PAE systems with 1GB of Kernel address space there is only
 * one pgd/p4d for the whole kernel. Cloning that would map the whole
 * address space into the user page-tables, making PTI useless. So clone
 * the page-table on the PMD level to prevent that.
 */
static void __init pti_clone_user_shared(void)
{
	unsigned long start, end;

	start = CPU_ENTRY_AREA_BASE;
	end   = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);

	pti_clone_pgtable(start, end, PTI_CLONE_PMD);
}
#endif /* CONFIG_X86_64 */

/*
 * Clone the ESPFIX P4D into the user space visible page table
 */
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
	pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}

/*
 * Clone the populated PMDs of the entry and irqentry text and force it RO.
 */
static void pti_clone_entry_text(void)
{
	pti_clone_pgtable((unsigned long) __entry_text_start,
			  (unsigned long) __irqentry_text_end,
			  PTI_CLONE_PMD);
}
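
/*
 * pti_clone_entry_text() runs twice: once from pti_init() and again
 * from pti_finalize(), after the entry sections have received their
 * final protections (see the comment above pti_finalize() below).
 */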

/*
 * Global pages and PCIDs are both ways to make kernel TLB entries
 * live longer, reduce TLB misses and improve kernel performance.
 * But, leaving all kernel text Global makes it potentially accessible
 * to Meltdown-style attacks which make it trivial to find gadgets or
 * defeat KASLR.
 *
 * Only use global pages when it is really worth it.
 */
static inline bool pti_kernel_image_global_ok(void)
{
	/*
	 * Systems with PCIDs get little benefit from global
	 * kernel text and are not worth the downsides.
	 */
	if (cpu_feature_enabled(X86_FEATURE_PCID))
		return false;

	/*
	 * Only do global kernel image for pti=auto.  Do the most
	 * secure thing (not global) if pti=on specified.
	 */
	if (pti_mode != PTI_AUTO)
		return false;

	/*
	 * K8 may not tolerate the cleared _PAGE_RW on the userspace
	 * global kernel image pages.  Do the safe thing (disable
	 * global kernel image).  This is unlikely to ever be
	 * noticed because PTI is disabled by default on AMD CPUs.
	 */
	if (boot_cpu_has(X86_FEATURE_K8))
		return false;

	/*
	 * RANDSTRUCT derives its hardening benefits from the
	 * attacker's lack of knowledge about the layout of kernel
	 * data structures.  Keep the kernel image non-global in
	 * cases where RANDSTRUCT is in use to help keep the layout a
	 * secret.
	 */
	if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT))
		return false;

	return true;
}

/*
 * This is the only user for these and it is not arch-generic
 * like the other set_memory.h functions.  Just extern them.
 */
extern int set_memory_nonglobal(unsigned long addr, int numpages);
extern int set_memory_global(unsigned long addr, int numpages);

/*
 * For some configurations, map all of kernel text into the user page
 * tables.  This reduces TLB misses, especially on non-PCID systems.
 */
static void pti_clone_kernel_text(void)
{
	/*
	 * rodata is part of the kernel image and is normally
	 * readable on the filesystem or on the web.  But, do not
	 * clone the areas past rodata, they might contain secrets.
	 */
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end_clone  = (unsigned long)__end_rodata_aligned;
	unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);

	if (!pti_kernel_image_global_ok())
		return;

	pr_debug("mapping partial kernel image into user address space\n");

	/*
	 * Note that this will undo _some_ of the work that
	 * pti_set_kernel_image_nonglobal() did to clear the
	 * global bit.
	 */
	pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);

	/*
	 * pti_clone_pgtable() will set the global bit in any PMDs
	 * that it clones, but we also need to get any PTEs in
	 * the last level for areas that are not huge-page-aligned.
	 */

	/* Set the global bit for normal non-__init kernel text: */
	set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
}

static void pti_set_kernel_image_nonglobal(void)
{
	/*
	 * The identity map is created with PMDs, regardless of the
	 * actual length of the kernel.  We need to clear
	 * _PAGE_GLOBAL up to a PMD boundary, not just to the end
	 * of the image.
	 */
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);

	/*
	 * This clears _PAGE_GLOBAL from the entire kernel image.
	 * pti_clone_kernel_text() may put _PAGE_GLOBAL back for
	 * areas that are mapped to userspace.
	 */
	set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
}

/*
 * Initialize kernel page table isolation
 */
void __init pti_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("enabled\n");

#ifdef CONFIG_X86_32
	/*
	 * We check for X86_FEATURE_PCID here. But the init-code will
	 * clear the feature flag on 32 bit because the feature is not
	 * supported on 32 bit anyway. To print the warning we need to
	 * check with cpuid directly again.
	 */
	if (cpuid_ecx(0x1) & BIT(17)) {
		/* Use printk to work around pr_fmt() */
		printk(KERN_WARNING "\n");
		printk(KERN_WARNING "************************************************************\n");
		printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!  **\n");
		printk(KERN_WARNING "**                                                        **\n");
		printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n");
		printk(KERN_WARNING "** Your performance will increase dramatically if you     **\n");
		printk(KERN_WARNING "** switch to a 64-bit kernel!                             **\n");
		printk(KERN_WARNING "**                                                        **\n");
		printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!  **\n");
		printk(KERN_WARNING "************************************************************\n");
	}
#endif

	pti_clone_user_shared();

	/* Undo all global bits from the init pagetables in head_64.S: */
	pti_set_kernel_image_nonglobal();
	/* Replace some of the global bits just for shared entry text: */
	pti_clone_entry_text();
	pti_setup_espfix64();
	pti_setup_vsyscall();
}
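
/*
 * Call order (assuming the usual init/main.c boot flow): pti_init()
 * above runs from mm_init() early during boot, while pti_finalize()
 * below runs much later, from mark_readonly(), once the kernel image
 * has received its final RO/NX protections.
 */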

/*
 * Finalize the kernel mappings in the userspace page-table. Some of the
 * mappings for the kernel image might have changed since pti_init()
 * cloned them. This is because parts of the kernel image have been
 * mapped RO and/or NX.  These changes need to be cloned again to the
 * userspace page-table.
 */
void pti_finalize(void)
{
	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;
	/*
	 * We need to clone everything (again) that maps parts of the
	 * kernel image.
	 */
	pti_clone_entry_text();
	pti_clone_kernel_text();

	debug_checkwx_user();
}