Note: File does not exist in v6.2.
// SPDX-License-Identifier: GPL-2.0
/*
 * mpx.c - Memory Protection eXtensions
 *
 * Copyright (c) 2014, Intel Corporation.
 * Qiaowei Ren <qiaowei.ren@intel.com>
 * Dave Hansen <dave.hansen@intel.com>
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm_types.h>
#include <linux/syscalls.h>
#include <linux/sched/sysctl.h>

#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/mman.h>
#include <asm/mmu_context.h>
#include <asm/mpx.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/mpx.h>
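/*
 * Note: defining CREATE_TRACE_POINTS before including the trace header
 * is the usual kernel tracepoint idiom; it instantiates the
 * trace_mpx_*() events (trace_mpx_bounds_register_exception(),
 * trace_mpx_new_bounds_table(), trace_mpx_unmap_zap() and
 * trace_mpx_unmap_search()) that are called further down in this file.
 */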

static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm)
{
	if (is_64bit_mm(mm))
		return MPX_BD_SIZE_BYTES_64;
	else
		return MPX_BD_SIZE_BYTES_32;
}

static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
{
	if (is_64bit_mm(mm))
		return MPX_BT_SIZE_BYTES_64;
	else
		return MPX_BT_SIZE_BYTES_32;
}

/*
 * This is really a simplified "vm_mmap". It only handles MPX
 * bounds tables (the bounds directory is user-allocated).
 */
static unsigned long mpx_mmap(unsigned long len)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, populate;

	/* Only bounds table can be allocated here */
	if (len != mpx_bt_size_bytes(mm))
		return -EINVAL;

	down_write(&mm->mmap_sem);
	addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
		       MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL);
	up_write(&mm->mmap_sem);
	if (populate)
		mm_populate(addr, populate);

	return addr;
}
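/*
 * A note on the return convention above (an observation about this code,
 * not upstream documentation): mpx_mmap() returns either a userspace
 * address or a negative errno smuggled through the unsigned long, so a
 * caller such as allocate_bt() below checks it with
 * IS_ERR((void *)bt_addr) / PTR_ERR((void *)bt_addr) rather than
 * comparing against MAP_FAILED.
 */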

static int mpx_insn_decode(struct insn *insn,
			   struct pt_regs *regs)
{
	unsigned char buf[MAX_INSN_SIZE];
	int x86_64 = !test_thread_flag(TIF_IA32);
	int not_copied;
	int nr_copied;

	not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf));
	nr_copied = sizeof(buf) - not_copied;
	/*
	 * The decoder _should_ fail nicely if we pass it a short buffer.
	 * But, let's not depend on that implementation detail.  If we
	 * did not get anything, just error out now.
	 */
	if (!nr_copied)
		return -EFAULT;
	insn_init(insn, buf, nr_copied, x86_64);
	insn_get_length(insn);
	/*
	 * copy_from_user() tries to get as many bytes as we could see in
	 * the largest possible instruction.  If the instruction we are
	 * after is shorter than that _and_ we attempt to copy from
	 * something unreadable, we might get a short read.  This is OK
	 * as long as the read did not stop in the middle of the
	 * instruction.  Check to see if we got a partial instruction.
	 */
	if (nr_copied < insn->length)
		return -EFAULT;

	insn_get_opcode(insn);
	/*
	 * We only _really_ need to decode bndcl/bndcn/bndcu
	 * Error out on anything else.
	 */
	if (insn->opcode.bytes[0] != 0x0f)
		goto bad_opcode;
	if ((insn->opcode.bytes[1] != 0x1a) &&
	    (insn->opcode.bytes[1] != 0x1b))
		goto bad_opcode;

	return 0;
bad_opcode:
	return -EINVAL;
}
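/*
 * Background on the opcode check above (from the instruction set
 * reference, not spelled out in this file): BNDCL, BNDCU and BNDCN all
 * share the two-byte opcodes 0F 1A / 0F 1B and are distinguished only
 * by their mandatory prefixes, which is why matching the two opcode
 * bytes is enough here.  The ModRM reg field names the bound register
 * (bnd0-bnd3); mpx_generate_siginfo() below extracts it with
 * X86_MODRM_REG() and rejects values above 3.
 */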

/*
 * If a bounds overflow occurs then a #BR is generated. This
 * function decodes MPX instructions to get the violation address
 * and stores this address in the extended struct siginfo.
 *
 * Note that this is not a super precise way of doing this.
 * Userspace could have, by the time we get here, written
 * anything it wants in to the instructions.  We can not
 * trust anything about it.  They might not be valid
 * instructions or might encode invalid registers, etc...
 *
 * The caller is expected to kfree() the returned siginfo_t.
 */
siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
{
	const struct mpx_bndreg_state *bndregs;
	const struct mpx_bndreg *bndreg;
	siginfo_t *info = NULL;
	struct insn insn;
	uint8_t bndregno;
	int err;

	err = mpx_insn_decode(&insn, regs);
	if (err)
		goto err_out;

	/*
	 * We know at this point that we are only dealing with
	 * MPX instructions.
	 */
	insn_get_modrm(&insn);
	bndregno = X86_MODRM_REG(insn.modrm.value);
	if (bndregno > 3) {
		err = -EINVAL;
		goto err_out;
	}
	/* get bndregs field from current task's xsave area */
	bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS);
	if (!bndregs) {
		err = -EINVAL;
		goto err_out;
	}
	/* now go select the individual register in the set of 4 */
	bndreg = &bndregs->bndreg[bndregno];

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info) {
		err = -ENOMEM;
		goto err_out;
	}
	/*
	 * The registers are always 64-bit, but the upper 32
	 * bits are ignored in 32-bit mode.  Also, note that the
	 * upper bounds are architecturally represented in 1's
	 * complement form.
	 *
	 * The 'unsigned long' cast is because the compiler
	 * complains when casting from integers to different-size
	 * pointers.
	 */
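	/*
	 * Worked example (illustrative, not from the original source): a
	 * bound register still in its INIT state holds lower_bound == 0
	 * and upper_bound == 0; decoding the upper bound as ~0 yields
	 * 0xffffffff (or 0xffffffffffffffff), i.e. "all addresses
	 * allowed", which is why the one's-complement form is used.
	 */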
	info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound;
	info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound;
	info->si_addr_lsb = 0;
	info->si_signo = SIGSEGV;
	info->si_errno = 0;
	info->si_code = SEGV_BNDERR;
	info->si_addr = insn_get_addr_ref(&insn, regs);
	/*
	 * We were not able to extract an address from the instruction,
	 * probably because there was something invalid in it.
	 */
	if (info->si_addr == (void __user *)-1) {
		err = -EINVAL;
		goto err_out;
	}
	trace_mpx_bounds_register_exception(info->si_addr, bndreg);
	return info;
err_out:
	/* info might be NULL, but kfree() handles that */
	kfree(info);
	return ERR_PTR(err);
}

static __user void *mpx_get_bounds_dir(void)
{
	const struct mpx_bndcsr *bndcsr;

	if (!cpu_feature_enabled(X86_FEATURE_MPX))
		return MPX_INVALID_BOUNDS_DIR;

	/*
	 * The bounds directory pointer is stored in a register
	 * only accessible if we first do an xsave.
	 */
	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
	if (!bndcsr)
		return MPX_INVALID_BOUNDS_DIR;

	/*
	 * Make sure the register looks valid by checking the
	 * enable bit.
	 */
	if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG))
		return MPX_INVALID_BOUNDS_DIR;

	/*
	 * Lastly, mask off the low bits used for configuration
	 * flags, and return the address of the bounds directory.
	 */
	return (void __user *)(unsigned long)
		(bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
}
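/*
 * For reference (Intel SDM terminology, not spelled out in this file):
 * BNDCFGU keeps the enable and BNDPRESERVE flags in its low bits and
 * the 4KB-aligned base of the bounds directory in the remaining high
 * bits, so MPX_BNDCFG_ADDR_MASK simply strips the flag bits to recover
 * the directory base that userspace programmed via XRSTOR.
 */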

int mpx_enable_management(void)
{
	void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
	struct mm_struct *mm = current->mm;
	int ret = 0;

	/*
	 * The runtime in userspace is responsible for allocating the
	 * bounds directory. It then saves the base of the bounds
	 * directory into the XSAVE/XRSTOR save area and enables MPX
	 * through the XRSTOR instruction.
	 *
	 * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is
	 * expected to be relatively expensive. Storing the bounds
	 * directory here means that we do not have to do xsave in the
	 * unmap path; we can just use mm->context.bd_addr instead.
	 */
	bd_base = mpx_get_bounds_dir();
	down_write(&mm->mmap_sem);

	/* MPX doesn't support addresses above 47 bits yet. */
	if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
		pr_warn_once("%s (%d): MPX cannot handle addresses "
				"above 47-bits. Disabling.",
				current->comm, current->pid);
		ret = -ENXIO;
		goto out;
	}
	mm->context.bd_addr = bd_base;
	if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
		ret = -ENXIO;
out:
	up_write(&mm->mmap_sem);
	return ret;
}

int mpx_disable_management(void)
{
	struct mm_struct *mm = current->mm;

	if (!cpu_feature_enabled(X86_FEATURE_MPX))
		return -ENXIO;

	down_write(&mm->mmap_sem);
	mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
	up_write(&mm->mmap_sem);
	return 0;
}
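/*
 * These two helpers are reached from the prctl() path; as documented in
 * Documentation/x86/intel_mpx.txt of this era, an MPX-aware runtime is
 * expected to do roughly (illustrative call, constants from
 * <linux/prctl.h>):
 *
 *	prctl(PR_MPX_ENABLE_MANAGEMENT, 0, 0, 0, 0);
 *
 * after setting up its bounds directory, and the matching
 * PR_MPX_DISABLE_MANAGEMENT before tearing it down.
 */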

static int mpx_cmpxchg_bd_entry(struct mm_struct *mm,
		unsigned long *curval,
		unsigned long __user *addr,
		unsigned long old_val, unsigned long new_val)
{
	int ret;
	/*
	 * user_atomic_cmpxchg_inatomic() actually uses sizeof()
	 * the pointer that we pass to it to figure out how much
	 * data to cmpxchg.  We have to be careful here not to
	 * pass a pointer to a 64-bit data type when we only want
	 * a 32-bit copy.
	 */
	if (is_64bit_mm(mm)) {
		ret = user_atomic_cmpxchg_inatomic(curval,
				addr, old_val, new_val);
	} else {
		u32 uninitialized_var(curval_32);
		u32 old_val_32 = old_val;
		u32 new_val_32 = new_val;
		u32 __user *addr_32 = (u32 __user *)addr;

		ret = user_atomic_cmpxchg_inatomic(&curval_32,
				addr_32, old_val_32, new_val_32);
		*curval = curval_32;
	}
	return ret;
}

/*
 * With 32-bit mode, a bounds directory is 4MB, and the size of each
 * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB,
 * and the size of each bounds table is 4MB.
 */
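/*
 * Putting numbers on that (derived from the sizes above plus the 8-byte
 * and 4-byte directory entries used in mpx_get_bd_entry_offset(); not a
 * quote from the original source): a 2GB 64-bit directory holds
 * 2GB / 8 = 2^28 entries, each naming one 4MB bounds table; a 4MB
 * 32-bit directory holds 4MB / 4 = 2^20 entries, each naming one 16KB
 * table.
 */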
static int allocate_bt(struct mm_struct *mm, long __user *bd_entry)
{
	unsigned long expected_old_val = 0;
	unsigned long actual_old_val = 0;
	unsigned long bt_addr;
	unsigned long bd_new_entry;
	int ret = 0;

	/*
	 * Carve the virtual space out of userspace for the new
	 * bounds table:
	 */
	bt_addr = mpx_mmap(mpx_bt_size_bytes(mm));
	if (IS_ERR((void *)bt_addr))
		return PTR_ERR((void *)bt_addr);
	/*
	 * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
	 */
	bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG;

	/*
	 * Go poke the address of the new bounds table in to the
	 * bounds directory entry out in userspace memory.  Note:
	 * we may race with another CPU instantiating the same table.
	 * In that case the cmpxchg will see an unexpected
	 * 'actual_old_val'.
	 *
	 * This can fault, but that's OK because we do not hold
	 * mmap_sem at this point, unlike some of the other parts
	 * of the MPX code that have to pagefault_disable().
	 */
	ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry,
				   expected_old_val, bd_new_entry);
	if (ret)
		goto out_unmap;

	/*
	 * The user_atomic_cmpxchg_inatomic() will only return nonzero
	 * for faults, *not* if the cmpxchg itself fails.  Now we must
	 * verify that the cmpxchg itself completed successfully.
	 */
	/*
	 * We expected an empty 'expected_old_val', but instead found
	 * an apparently valid entry.  Assume we raced with another
	 * thread to instantiate this table and declare success.
	 */
	if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) {
		ret = 0;
		goto out_unmap;
	}
	/*
	 * We found a non-empty bd_entry but it did not have the
	 * VALID_FLAG set.  Return an error which will result in
	 * a SEGV since this probably means that somebody scribbled
	 * some invalid data in to a bounds table.
	 */
	if (expected_old_val != actual_old_val) {
		ret = -EINVAL;
		goto out_unmap;
	}
	trace_mpx_new_bounds_table(bt_addr);
	return 0;
out_unmap:
	vm_munmap(bt_addr, mpx_bt_size_bytes(mm));
	return ret;
}

/*
 * When a BNDSTX instruction attempts to save bounds to a bounds
 * table, it will first attempt to look up the table in the
 * first-level bounds directory.  If it does not find a table in
 * the directory, a #BR is generated and we get here in order to
 * allocate a new table.
 *
 * With 32-bit mode, the size of the BD is 4MB, and the size of each
 * bounds table is 16KB. With 64-bit mode, the size of the BD is 2GB,
 * and the size of each bounds table is 4MB.
 */
static int do_mpx_bt_fault(void)
{
	unsigned long bd_entry, bd_base;
	const struct mpx_bndcsr *bndcsr;
	struct mm_struct *mm = current->mm;

	bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
	if (!bndcsr)
		return -EINVAL;
	/*
	 * Mask off the preserve and enable bits
	 */
	bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK;
	/*
	 * The hardware provides the address of the missing or invalid
	 * entry via BNDSTATUS, so we don't have to go look it up.
	 */
	bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK;
	/*
	 * Make sure the directory entry is within where we think
	 * the directory is.
	 */
	if ((bd_entry < bd_base) ||
	    (bd_entry >= bd_base + mpx_bd_size_bytes(mm)))
		return -EINVAL;

	return allocate_bt(mm, (long __user *)bd_entry);
}

int mpx_handle_bd_fault(void)
{
	/*
	 * Userspace never asked us to manage the bounds tables,
	 * so refuse to help.
	 */
	if (!kernel_managing_mpx_tables(current->mm))
		return -EINVAL;

	return do_mpx_bt_fault();
}

/*
 * A thin wrapper around get_user_pages().  Returns 0 if the
 * fault was resolved or -errno if not.
 */
static int mpx_resolve_fault(long __user *addr, int write)
{
	long gup_ret;
	int nr_pages = 1;

	gup_ret = get_user_pages((unsigned long)addr, nr_pages,
			write ? FOLL_WRITE : 0, NULL, NULL);
	/*
	 * get_user_pages() returns number of pages gotten.
	 * 0 means we failed to fault in and get anything,
	 * probably because 'addr' is bad.
	 */
	if (!gup_ret)
		return -EFAULT;
	/* Other error, return it */
	if (gup_ret < 0)
		return gup_ret;
	/* must have gup'd a page and gup_ret>0, success */
	return 0;
}

static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
					     unsigned long bd_entry)
{
	unsigned long bt_addr = bd_entry;
	int align_to_bytes;
	/*
	 * Bit 0 in a bd_entry is always the valid bit.
	 */
	bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG;
	/*
	 * Tables are naturally aligned at 8-byte boundaries
	 * on 64-bit and 4-byte boundaries on 32-bit.  The
	 * documentation makes it appear that the low bits
	 * are ignored by the hardware, so we do the same.
	 */
	if (is_64bit_mm(mm))
		align_to_bytes = 8;
	else
		align_to_bytes = 4;
	bt_addr &= ~(align_to_bytes-1);
	return bt_addr;
}
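/*
 * Illustrative example (hypothetical value, not from the original
 * source): a 64-bit directory entry of 0x00007f0012345679 has the
 * valid bit (bit 0) set; masking MPX_BD_ENTRY_VALID_FLAG and the low
 * alignment bits recovers the table base 0x00007f0012345678.
 */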

/*
 * We only want to do a 4-byte get_user() on 32-bit.  Otherwise,
 * we might run off the end of the bounds table if we are on
 * a 64-bit kernel and try to get 8 bytes.
 */
static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
		long __user *bd_entry_ptr)
{
	u32 bd_entry_32;
	int ret;

	if (is_64bit_mm(mm))
		return get_user(*bd_entry_ret, bd_entry_ptr);

	/*
	 * Note that get_user() uses the type of the *pointer* to
	 * establish the size of the get, not the destination.
	 */
	ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr);
	*bd_entry_ret = bd_entry_32;
	return ret;
}

/*
 * Get the base of the bounds table pointed to by the given bounds
 * directory entry.
 */
static int get_bt_addr(struct mm_struct *mm,
			long __user *bd_entry_ptr,
			unsigned long *bt_addr_result)
{
	int ret;
	int valid_bit;
	unsigned long bd_entry;
	unsigned long bt_addr;

	if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr)))
		return -EFAULT;

	while (1) {
		int need_write = 0;

		pagefault_disable();
		ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr);
		pagefault_enable();
		if (!ret)
			break;
		if (ret == -EFAULT)
			ret = mpx_resolve_fault(bd_entry_ptr, need_write);
		/*
		 * If we could not resolve the fault, consider it
		 * userspace's fault and error out.
		 */
		if (ret)
			return ret;
	}

	valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG;
	bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry);

	/*
	 * When the kernel is managing bounds tables, a bounds directory
	 * entry will either have a valid address (plus the valid bit)
	 * *OR* be completely empty. If we see a !valid entry *and* some
	 * data in the address field, we know something is wrong. This
	 * -EINVAL return will cause a SIGSEGV.
	 */
	if (!valid_bit && bt_addr)
		return -EINVAL;
	/*
	 * Do we have a completely zeroed bd entry?  That is OK.  It
	 * just means there was no bounds table for this memory.  Make
	 * sure to distinguish this from -EINVAL, which will cause
	 * a SEGV.
	 */
	if (!valid_bit)
		return -ENOENT;

	*bt_addr_result = bt_addr;
	return 0;
}
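/*
 * A note on the retry loop above (an observation, not a quote from the
 * original source): this path can run from the munmap code with
 * mmap_sem already held, so taking a normal page fault here could
 * deadlock (compare the allocate_bt() comment about
 * pagefault_disable()).  The entry is therefore probed with page
 * faults disabled and, only if that faults, populated via
 * get_user_pages() -- which expects mmap_sem to be held -- before
 * retrying.  unmap_entire_bt() below uses the same pattern for its
 * cmpxchg.
 */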

static inline int bt_entry_size_bytes(struct mm_struct *mm)
{
	if (is_64bit_mm(mm))
		return MPX_BT_ENTRY_BYTES_64;
	else
		return MPX_BT_ENTRY_BYTES_32;
}

/*
 * Takes a virtual address and turns it into the offset in bytes
 * inside of the bounds table where the bounds table entry
 * controlling 'addr' can be found.
 */
static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
		unsigned long addr)
{
	unsigned long bt_table_nr_entries;
	unsigned long offset = addr;

	if (is_64bit_mm(mm)) {
		/* Bottom 3 bits are ignored on 64-bit */
		offset >>= 3;
		bt_table_nr_entries = MPX_BT_NR_ENTRIES_64;
	} else {
		/* Bottom 2 bits are ignored on 32-bit */
		offset >>= 2;
		bt_table_nr_entries = MPX_BT_NR_ENTRIES_32;
	}
	/*
	 * We know the size of the table in to which we are
	 * indexing, and we have eliminated all the low bits
	 * which are ignored for indexing.
	 *
	 * Mask out all the high bits which we do not need
	 * to index in to the table.  Note that the tables
	 * are always powers of two so this gives us a proper
	 * mask.
	 */
	offset &= (bt_table_nr_entries-1);
	/*
	 * We now have an entry offset in terms of *entries* in
	 * the table.  We need to scale it back up to bytes.
	 */
	offset *= bt_entry_size_bytes(mm);
	return offset;
}
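/*
 * Worked example (illustrative numbers; the 2^17 x 32-byte geometry is
 * the usual 64-bit MPX layout implied by the 4MB table size noted
 * above, not something this file spells out): for
 * addr = 0x00007f0000002040 on a 64-bit mm, offset >>= 3 gives
 * 0xfe000000408, masking with (2^17 - 1) leaves entry index 0x408, and
 * multiplying by the 32-byte entry size yields a byte offset of 0x8100
 * into the bounds table.
 */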

/*
 * How much virtual address space does a single bounds
 * directory entry cover?
 *
 * Note, we need a long long because 4GB doesn't fit in
 * to a long on 32-bit.
 */
static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
{
	unsigned long long virt_space;
	unsigned long long GB = (1ULL << 30);

	/*
	 * This covers 32-bit emulation as well as 32-bit kernels
	 * running on 64-bit hardware.
	 */
	if (!is_64bit_mm(mm))
		return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;

	/*
	 * 'x86_virt_bits' returns what the hardware is capable
	 * of, and returns the full >32-bit address space when
	 * running 32-bit kernels on 64-bit hardware.
	 */
	virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
	return virt_space / MPX_BD_NR_ENTRIES_64;
}
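/*
 * Putting numbers on that (derived from the sizes discussed earlier,
 * assuming the common 48 virtual address bits on 64-bit hardware): a
 * 64-bit directory entry covers 2^48 / 2^28 = 1MB of virtual space,
 * which matches one 4MB table of 2^17 entries each guarding 8 bytes;
 * a 32-bit entry covers 4GB / 2^20 = 4KB, i.e. one 16KB table of 2^10
 * entries guarding 4 bytes each.
 */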

/*
 * Free the backing physical pages of bounds table 'bt_addr'.
 * Assume start...end is within that bounds table.
 */
static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
		unsigned long bt_addr,
		unsigned long start_mapping, unsigned long end_mapping)
{
	struct vm_area_struct *vma;
	unsigned long addr, len;
	unsigned long start;
	unsigned long end;

	/*
	 * If we 'end' on a boundary, the offset will be 0 which
	 * is not what we want.  Back it up a byte to get the
	 * last bt entry.  Then once we have the entry itself,
	 * move 'end' back up by the table entry size.
	 */
	start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping);
	end   = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1);
	/*
	 * Move end back up by one entry.  Among other things
	 * this ensures that it remains page-aligned and does
	 * not screw up zap_page_range()
	 */
	end += bt_entry_size_bytes(mm);

	/*
	 * Find the first overlapping vma. If vma->vm_start > start, there
	 * will be a hole in the bounds table. This -EINVAL return will
	 * cause a SIGSEGV.
	 */
	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EINVAL;

	/*
	 * A NUMA policy on a VM_MPX VMA could cause this bounds table to
	 * be split. So we need to look across the entire 'start -> end'
	 * range of this bounds table, find all of the VM_MPX VMAs, and
	 * zap only those.
	 */
	addr = start;
	while (vma && vma->vm_start < end) {
		/*
		 * We followed a bounds directory entry down
		 * here.  If we find a non-MPX VMA, that's bad,
		 * so stop immediately and return an error.  This
		 * probably results in a SIGSEGV.
		 */
		if (!(vma->vm_flags & VM_MPX))
			return -EINVAL;

		len = min(vma->vm_end, end) - addr;
		zap_page_range(vma, addr, len);
		trace_mpx_unmap_zap(addr, addr+len);

		vma = vma->vm_next;
		/* guard against walking off the end of the VMA list */
		if (vma)
			addr = vma->vm_start;
	}
	return 0;
}

static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm,
		unsigned long addr)
{
	/*
	 * There are several ways to derive the bd offsets.  We
	 * use the following approach here:
	 * 1. We know the size of the virtual address space
	 * 2. We know the number of entries in a bounds table
	 * 3. We know that each entry covers a fixed amount of
	 *    virtual address space.
	 * So, we can just divide the virtual address by the
	 * virtual space used by one entry to determine which
	 * entry "controls" the given virtual address.
	 */
	if (is_64bit_mm(mm)) {
		int bd_entry_size = 8; /* 64-bit pointer */
		/*
		 * Take the 64-bit addressing hole in to account.
		 */
		addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1);
		return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
	} else {
		int bd_entry_size = 4; /* 32-bit pointer */
		/*
		 * 32-bit has no hole so this case needs no mask
		 */
		return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
	}
	/*
	 * The two return calls above are exact copies.  If we
	 * pull out a single copy and put it in here, gcc won't
	 * realize that we're doing a power-of-2 divide and use
	 * shifts.  It uses a real divide.  If we put them up
	 * there, it manages to figure it out (gcc 4.8.3).
	 */
}
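/*
 * Worked example (hypothetical address, reusing the 1MB-per-entry
 * figure derived above for a 64-bit mm with 48 virtual bits): for
 * addr = 0x500000 (5MB), the 48-bit mask leaves it unchanged, dividing
 * by the 1MB covered per directory entry gives index 5, and
 * multiplying by the 8-byte entry size gives byte offset 40 into the
 * bounds directory.
 */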

static int unmap_entire_bt(struct mm_struct *mm,
		long __user *bd_entry, unsigned long bt_addr)
{
	unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
	unsigned long uninitialized_var(actual_old_val);
	int ret;

	while (1) {
		int need_write = 1;
		unsigned long cleared_bd_entry = 0;

		pagefault_disable();
		ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,
				bd_entry, expected_old_val, cleared_bd_entry);
		pagefault_enable();
		if (!ret)
			break;
		if (ret == -EFAULT)
			ret = mpx_resolve_fault(bd_entry, need_write);
		/*
		 * If we could not resolve the fault, consider it
		 * userspace's fault and error out.
		 */
		if (ret)
			return ret;
	}
	/*
	 * The cmpxchg was performed, check the results.
	 */
	if (actual_old_val != expected_old_val) {
		/*
		 * Someone else raced with us to unmap the table.
		 * That is OK, since we were both trying to do
		 * the same thing.  Declare success.
		 */
		if (!actual_old_val)
			return 0;
		/*
		 * Something messed with the bounds directory
		 * entry.  We hold mmap_sem for read or write
		 * here, so it could not be a _new_ bounds table
		 * that someone just allocated.  Something is
		 * wrong, so pass up the error and SIGSEGV.
		 */
		return -EINVAL;
	}
	/*
	 * Note, we are likely being called under do_munmap() already. To
	 * avoid recursion, do_munmap() will check whether it comes
	 * from one bounds table through VM_MPX flag.
	 */
	return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL);
}

static int try_unmap_single_bt(struct mm_struct *mm,
	       unsigned long start, unsigned long end)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	/*
	 * "bta" == Bounds Table Area: the area controlled by the
	 * bounds table that we are unmapping.
	 */
	unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1);
	unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm);
	unsigned long uninitialized_var(bt_addr);
	void __user *bde_vaddr;
	int ret;
	/*
	 * We already unlinked the VMAs from the mm's rbtree so 'start'
	 * is guaranteed to be in a hole. This gets us the first VMA
	 * before the hole in to 'prev' and the next VMA after the hole
	 * in to 'next'.
	 */
	next = find_vma_prev(mm, start, &prev);
	/*
	 * Do not count other MPX bounds table VMAs as neighbors.
	 * Although theoretically possible, we do not allow bounds
	 * tables for bounds tables so our heads do not explode.
	 * If we count them as neighbors here, we may end up with
	 * lots of tables even though we have no actual table
	 * entries in use.
	 */
	while (next && (next->vm_flags & VM_MPX))
		next = next->vm_next;
	while (prev && (prev->vm_flags & VM_MPX))
		prev = prev->vm_prev;
	/*
	 * We know 'start' and 'end' lie within an area controlled
	 * by a single bounds table.  See if there are any other
	 * VMAs controlled by that bounds table.  If there are not
	 * then we can "expand" the area we are unmapping to possibly
	 * cover the entire table.
	 */
	next = find_vma_prev(mm, start, &prev);
	if ((!prev || prev->vm_end <= bta_start_vaddr) &&
	    (!next || next->vm_start >= bta_end_vaddr)) {
		/*
		 * No neighbor VMAs controlled by same bounds
		 * table.  Try to unmap the whole thing
		 */
		start = bta_start_vaddr;
		end = bta_end_vaddr;
	}

	bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start);
	ret = get_bt_addr(mm, bde_vaddr, &bt_addr);
	/*
	 * No bounds table there, so nothing to unmap.
	 */
	if (ret == -ENOENT) {
		ret = 0;
		return 0;
	}
	if (ret)
		return ret;
	/*
	 * We are unmapping an entire table.  Either because the
	 * unmap that started this whole process was large enough
	 * to cover an entire table, or that the unmap was small
	 * but was the area covered by a bounds table.
	 */
	if ((start == bta_start_vaddr) &&
	    (end == bta_end_vaddr))
		return unmap_entire_bt(mm, bde_vaddr, bt_addr);
	return zap_bt_entries_mapping(mm, bt_addr, start, end);
}

static int mpx_unmap_tables(struct mm_struct *mm,
		unsigned long start, unsigned long end)
{
	unsigned long one_unmap_start;
	trace_mpx_unmap_search(start, end);

	one_unmap_start = start;
	while (one_unmap_start < end) {
		int ret;
		unsigned long next_unmap_start = ALIGN(one_unmap_start+1,
						       bd_entry_virt_space(mm));
		unsigned long one_unmap_end = end;
		/*
		 * If the end is beyond the current bounds table,
		 * move it back so we only deal with a single one
		 * at a time
		 */
		if (one_unmap_end > next_unmap_start)
			one_unmap_end = next_unmap_start;
		ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end);
		if (ret)
			return ret;

		one_unmap_start = next_unmap_start;
	}
	return 0;
}
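/*
 * Illustrative walk-through (hypothetical range, using the 1MB of
 * virtual space covered per 64-bit directory entry derived earlier):
 * unmapping [0x4ff000, 0x601000) touches three bounds tables, so the
 * loop above calls try_unmap_single_bt() three times, for
 * [0x4ff000, 0x500000), [0x500000, 0x600000) and [0x600000, 0x601000).
 */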

/*
 * Free unused bounds tables covered in a virtual address region being
 * munmap()ed. Assume end > start.
 *
 * This function will be called by do_munmap(), and the VMAs covering
 * the virtual address region start...end have already been split if
 * necessary, and the 'vma' is the first vma in this range (start -> end).
 */
void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long start, unsigned long end)
{
	int ret;

	/*
	 * Refuse to do anything unless userspace has asked
	 * the kernel to help manage the bounds tables.
	 */
	if (!kernel_managing_mpx_tables(current->mm))
		return;
	/*
	 * This will look across the entire 'start -> end' range,
	 * and find all of the non-VM_MPX VMAs.
	 *
	 * To avoid recursion, if a VM_MPX vma is found in the range
	 * (start->end), we will not continue follow-up work. This
	 * recursion represents having bounds tables for bounds tables,
	 * which should not occur normally. Being strict about it here
	 * helps ensure that we do not have an exploitable stack overflow.
	 */
	do {
		if (vma->vm_flags & VM_MPX)
			return;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);

	ret = mpx_unmap_tables(mm, start, end);
	if (ret)
		force_sig(SIGSEGV, current);
}

/* MPX cannot handle addresses above 47 bits yet. */
unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
		unsigned long flags)
{
	if (!kernel_managing_mpx_tables(current->mm))
		return addr;
	if (addr + len <= DEFAULT_MAP_WINDOW)
		return addr;
	if (flags & MAP_FIXED)
		return -ENOMEM;

	/*
	 * Requested len is larger than the whole area we're allowed to map in.
	 * Resetting hinting address wouldn't do much good -- fail early.
	 */
	if (len > DEFAULT_MAP_WINDOW)
		return -ENOMEM;

	/* Look for an unmapped area within DEFAULT_MAP_WINDOW */
	return 0;
}