1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/list.h>
5#include <linux/stringify.h>
6#include <linux/kprobes.h>
7#include <linux/mm.h>
8#include <linux/vmalloc.h>
9#include <linux/memory.h>
10#include <linux/stop_machine.h>
11#include <linux/slab.h>
12#include <asm/alternative.h>
13#include <asm/sections.h>
14#include <asm/pgtable.h>
15#include <asm/mce.h>
16#include <asm/nmi.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/io.h>
20#include <asm/fixmap.h>
21
22#define MAX_PATCH_LEN (255-1)
23
24#ifdef CONFIG_HOTPLUG_CPU
25static int smp_alt_once;
26
27static int __init bootonly(char *str)
28{
29 smp_alt_once = 1;
30 return 1;
31}
32__setup("smp-alt-boot", bootonly);
33#else
34#define smp_alt_once 1
35#endif
36
37static int __initdata_or_module debug_alternative;
38
39static int __init debug_alt(char *str)
40{
41 debug_alternative = 1;
42 return 1;
43}
44__setup("debug-alternative", debug_alt);
45
46static int noreplace_smp;
47
48static int __init setup_noreplace_smp(char *str)
49{
50 noreplace_smp = 1;
51 return 1;
52}
53__setup("noreplace-smp", setup_noreplace_smp);
54
55#ifdef CONFIG_PARAVIRT
56static int __initdata_or_module noreplace_paravirt = 0;
57
58static int __init setup_noreplace_paravirt(char *str)
59{
60 noreplace_paravirt = 1;
61 return 1;
62}
63__setup("noreplace-paravirt", setup_noreplace_paravirt);
64#endif
65
66#define DPRINTK(fmt, args...) do { if (debug_alternative) \
67 printk(KERN_DEBUG fmt, args); } while (0)
68
69/*
70 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
71 * that correspond to that nop. Getting from one nop to the next, we
72 * add to the array the offset that is equal to the sum of all sizes of
73 * nops preceding the one we are after.
74 *
75 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
76 * nice symmetry of sizes of the previous nops.
77 */
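
/*
 * Worked example (illustrative): with the layout below, intel_nops[3] is
 * intelnops + 1 + 2, i.e. it points at the first byte of GENERIC_NOP3,
 * because the 1-byte and 2-byte nops are stored immediately before it.
 * In general intel_nops[n] points at an n-byte nop, and the extra slot
 * intel_nops[ASM_NOP_MAX + 1] points at the 5-byte atomic nop appended
 * after GENERIC_NOP8.
 */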
78#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
79static const unsigned char intelnops[] =
80{
81 GENERIC_NOP1,
82 GENERIC_NOP2,
83 GENERIC_NOP3,
84 GENERIC_NOP4,
85 GENERIC_NOP5,
86 GENERIC_NOP6,
87 GENERIC_NOP7,
88 GENERIC_NOP8,
89 GENERIC_NOP5_ATOMIC
90};
91static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
92{
93 NULL,
94 intelnops,
95 intelnops + 1,
96 intelnops + 1 + 2,
97 intelnops + 1 + 2 + 3,
98 intelnops + 1 + 2 + 3 + 4,
99 intelnops + 1 + 2 + 3 + 4 + 5,
100 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
101 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
102 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
103};
104#endif
105
106#ifdef K8_NOP1
107static const unsigned char k8nops[] =
108{
109 K8_NOP1,
110 K8_NOP2,
111 K8_NOP3,
112 K8_NOP4,
113 K8_NOP5,
114 K8_NOP6,
115 K8_NOP7,
116 K8_NOP8,
117 K8_NOP5_ATOMIC
118};
119static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
120{
121 NULL,
122 k8nops,
123 k8nops + 1,
124 k8nops + 1 + 2,
125 k8nops + 1 + 2 + 3,
126 k8nops + 1 + 2 + 3 + 4,
127 k8nops + 1 + 2 + 3 + 4 + 5,
128 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
129 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
130 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
131};
132#endif
133
134#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
135static const unsigned char k7nops[] =
136{
137 K7_NOP1,
138 K7_NOP2,
139 K7_NOP3,
140 K7_NOP4,
141 K7_NOP5,
142 K7_NOP6,
143 K7_NOP7,
144 K7_NOP8,
145 K7_NOP5_ATOMIC
146};
147static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
148{
149 NULL,
150 k7nops,
151 k7nops + 1,
152 k7nops + 1 + 2,
153 k7nops + 1 + 2 + 3,
154 k7nops + 1 + 2 + 3 + 4,
155 k7nops + 1 + 2 + 3 + 4 + 5,
156 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
157 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
158 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
159};
160#endif
161
162#ifdef P6_NOP1
163static const unsigned char p6nops[] =
164{
165 P6_NOP1,
166 P6_NOP2,
167 P6_NOP3,
168 P6_NOP4,
169 P6_NOP5,
170 P6_NOP6,
171 P6_NOP7,
172 P6_NOP8,
173 P6_NOP5_ATOMIC
174};
175static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
176{
177 NULL,
178 p6nops,
179 p6nops + 1,
180 p6nops + 1 + 2,
181 p6nops + 1 + 2 + 3,
182 p6nops + 1 + 2 + 3 + 4,
183 p6nops + 1 + 2 + 3 + 4 + 5,
184 p6nops + 1 + 2 + 3 + 4 + 5 + 6,
185 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
186 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
187};
188#endif
189
190/* Initialize these to a safe default */
191#ifdef CONFIG_X86_64
192const unsigned char * const *ideal_nops = p6_nops;
193#else
194const unsigned char * const *ideal_nops = intel_nops;
195#endif
196
197void __init arch_init_ideal_nops(void)
198{
199 switch (boot_cpu_data.x86_vendor) {
200 case X86_VENDOR_INTEL:
201 /*
202 * Due to a decoder implementation quirk, some
203 * specific Intel CPUs actually perform better with
204 * the "k8_nops" than with the SDM-recommended NOPs.
205 */
206 if (boot_cpu_data.x86 == 6 &&
207 boot_cpu_data.x86_model >= 0x0f &&
208 boot_cpu_data.x86_model != 0x1c &&
209 boot_cpu_data.x86_model != 0x26 &&
210 boot_cpu_data.x86_model != 0x27 &&
211 boot_cpu_data.x86_model < 0x30) {
212 ideal_nops = k8_nops;
213 } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
214 ideal_nops = p6_nops;
215 } else {
216#ifdef CONFIG_X86_64
217 ideal_nops = k8_nops;
218#else
219 ideal_nops = intel_nops;
220#endif
221 }
222 break;
223 default:
224#ifdef CONFIG_X86_64
225 ideal_nops = k8_nops;
226#else
227 if (boot_cpu_has(X86_FEATURE_K8))
228 ideal_nops = k8_nops;
229 else if (boot_cpu_has(X86_FEATURE_K7))
230 ideal_nops = k7_nops;
231 else
232 ideal_nops = intel_nops;
233#endif
234 }
235}
236
237/* Use this to add nops to a buffer, then text_poke the whole buffer. */
238static void __init_or_module add_nops(void *insns, unsigned int len)
239{
240 while (len > 0) {
241 unsigned int noplen = len;
242 if (noplen > ASM_NOP_MAX)
243 noplen = ASM_NOP_MAX;
244 memcpy(insns, ideal_nops[noplen], noplen);
245 insns += noplen;
246 len -= noplen;
247 }
248}
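
/*
 * Example (illustrative): add_nops(buf, 12) with ASM_NOP_MAX == 8 emits one
 * 8-byte nop followed by one 4-byte nop; the largest ideal nop is used
 * greedily until the requested length is consumed.
 */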
249
250extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
251extern s32 __smp_locks[], __smp_locks_end[];
252void *text_poke_early(void *addr, const void *opcode, size_t len);
253
254/* Replace instructions with better alternatives for this CPU type.
255 This runs before SMP is initialized to avoid SMP problems with
256 self modifying code. This implies that asymmetric systems where
257 APs have less capabilities than the boot processor are not handled.
258 Tough. Make sure you disable such features by hand. */
259
260void __init_or_module apply_alternatives(struct alt_instr *start,
261 struct alt_instr *end)
262{
263 struct alt_instr *a;
264 u8 *instr, *replacement;
265 u8 insnbuf[MAX_PATCH_LEN];
266
267 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
268 /*
269 * The scan order should be from start to end. A later scanned
270 * alternative code can overwrite previously scanned alternative code.
271 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
272 * patch code.
273 *
274 * So be careful if you want to change the scan order to any other
275 * order.
276 */
277 for (a = start; a < end; a++) {
278 instr = (u8 *)&a->instr_offset + a->instr_offset;
279 replacement = (u8 *)&a->repl_offset + a->repl_offset;
280 BUG_ON(a->replacementlen > a->instrlen);
281 BUG_ON(a->instrlen > sizeof(insnbuf));
282 BUG_ON(a->cpuid >= NCAPINTS*32);
283 if (!boot_cpu_has(a->cpuid))
284 continue;
285
286 memcpy(insnbuf, replacement, a->replacementlen);
287
288 /* 0xe8 is a relative CALL; fix the offset. */
289 if (*insnbuf == 0xe8 && a->replacementlen == 5)
290 *(s32 *)(insnbuf + 1) += replacement - instr;
291
292 add_nops(insnbuf + a->replacementlen,
293 a->instrlen - a->replacementlen);
294
295 text_poke_early(instr, insnbuf, a->instrlen);
296 }
297}
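
/*
 * Usage sketch (simplified; see asm/alternative.h for the real macros): a
 * statement such as
 *
 *	alternative("old insns", "new insns", X86_FEATURE_SOMETHING);
 *
 * places the original bytes in .text, the replacement in
 * .altinstr_replacement, and a struct alt_instr record in .altinstructions
 * whose instr_offset/repl_offset are stored relative to the record fields
 * themselves, which is exactly what the
 * "(u8 *)&a->instr_offset + a->instr_offset" arithmetic above undoes at
 * patch time.  X86_FEATURE_SOMETHING is a placeholder feature bit.
 */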
298
299#ifdef CONFIG_SMP
300
301static void alternatives_smp_lock(const s32 *start, const s32 *end,
302 u8 *text, u8 *text_end)
303{
304 const s32 *poff;
305
306 mutex_lock(&text_mutex);
307 for (poff = start; poff < end; poff++) {
308 u8 *ptr = (u8 *)poff + *poff;
309
310 if (!*poff || ptr < text || ptr >= text_end)
311 continue;
312 /* turn DS segment override prefix into lock prefix */
313 if (*ptr == 0x3e)
314 text_poke(ptr, ((unsigned char []){0xf0}), 1);
315 }
316 mutex_unlock(&text_mutex);
317}
318
319static void alternatives_smp_unlock(const s32 *start, const s32 *end,
320 u8 *text, u8 *text_end)
321{
322 const s32 *poff;
323
324 if (noreplace_smp)
325 return;
326
327 mutex_lock(&text_mutex);
328 for (poff = start; poff < end; poff++) {
329 u8 *ptr = (u8 *)poff + *poff;
330
331 if (!*poff || ptr < text || ptr >= text_end)
332 continue;
333 /* turn lock prefix into DS segment override prefix */
334 if (*ptr == 0xf0)
335 text_poke(ptr, ((unsigned char []){0x3E}), 1);
336 }
337 mutex_unlock(&text_mutex);
338}
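
/*
 * Background sketch (simplified): SMP-safe locked instructions are emitted
 * through the LOCK_PREFIX macro, which writes the 0xf0 lock prefix and also
 * records its address in the .smp_locks section as a self-relative s32,
 * which is what the (u8 *)poff + *poff arithmetic above recovers.  On a UP
 * kernel only that single prefix byte is rewritten, e.g.:
 *
 *	f0 0f b1 ...	lock cmpxchg ...   <-->   3e 0f b1 ...	ds cmpxchg ...
 *
 * The DS segment override is architecturally a no-op here, so the UP case
 * skips the cost of the locked operation.
 */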
339
340struct smp_alt_module {
341 /* the module owning these lock prefixes (NULL for the core kernel) */
342 struct module *mod;
343 char *name;
344
345 /* ptrs to lock prefixes */
346 const s32 *locks;
347 const s32 *locks_end;
348
349 /* .text segment, needed to avoid patching init code ;) */
350 u8 *text;
351 u8 *text_end;
352
353 struct list_head next;
354};
355static LIST_HEAD(smp_alt_modules);
356static DEFINE_MUTEX(smp_alt);
357static int smp_mode = 1; /* protected by smp_alt */
358
359void __init_or_module alternatives_smp_module_add(struct module *mod,
360 char *name,
361 void *locks, void *locks_end,
362 void *text, void *text_end)
363{
364 struct smp_alt_module *smp;
365
366 if (noreplace_smp)
367 return;
368
369 if (smp_alt_once) {
370 if (boot_cpu_has(X86_FEATURE_UP))
371 alternatives_smp_unlock(locks, locks_end,
372 text, text_end);
373 return;
374 }
375
376 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
377 if (NULL == smp)
378 return; /* we'll run the (safe but slow) SMP code then ... */
379
380 smp->mod = mod;
381 smp->name = name;
382 smp->locks = locks;
383 smp->locks_end = locks_end;
384 smp->text = text;
385 smp->text_end = text_end;
386 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
387 __func__, smp->locks, smp->locks_end,
388 smp->text, smp->text_end, smp->name);
389
390 mutex_lock(&smp_alt);
391 list_add_tail(&smp->next, &smp_alt_modules);
392 if (boot_cpu_has(X86_FEATURE_UP))
393 alternatives_smp_unlock(smp->locks, smp->locks_end,
394 smp->text, smp->text_end);
395 mutex_unlock(&smp_alt);
396}
397
398void __init_or_module alternatives_smp_module_del(struct module *mod)
399{
400 struct smp_alt_module *item;
401
402 if (smp_alt_once || noreplace_smp)
403 return;
404
405 mutex_lock(&smp_alt);
406 list_for_each_entry(item, &smp_alt_modules, next) {
407 if (mod != item->mod)
408 continue;
409 list_del(&item->next);
410 mutex_unlock(&smp_alt);
411 DPRINTK("%s: %s\n", __func__, item->name);
412 kfree(item);
413 return;
414 }
415 mutex_unlock(&smp_alt);
416}
417
418bool skip_smp_alternatives;
419void alternatives_smp_switch(int smp)
420{
421 struct smp_alt_module *mod;
422
423#ifdef CONFIG_LOCKDEP
424 /*
425 * Older binutils section handling bug prevented
426 * alternatives-replacement from working reliably.
427 *
428 * If this still occurs then you should see a hang
429 * or crash shortly after this line:
430 */
431 printk("lockdep: fixing up alternatives.\n");
432#endif
433
434 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
435 return;
436 BUG_ON(!smp && (num_online_cpus() > 1));
437
438 mutex_lock(&smp_alt);
439
440 /*
441 * Avoid unnecessary switches because it forces JIT based VMs to
442 * throw away all cached translations, which can be quite costly.
443 */
444 if (smp == smp_mode) {
445 /* nothing */
446 } else if (smp) {
447 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
448 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
449 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
450 list_for_each_entry(mod, &smp_alt_modules, next)
451 alternatives_smp_lock(mod->locks, mod->locks_end,
452 mod->text, mod->text_end);
453 } else {
454 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
455 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
456 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
457 list_for_each_entry(mod, &smp_alt_modules, next)
458 alternatives_smp_unlock(mod->locks, mod->locks_end,
459 mod->text, mod->text_end);
460 }
461 smp_mode = smp;
462 mutex_unlock(&smp_alt);
463}
464
465/* Return 1 if the address range is reserved for smp-alternatives */
466int alternatives_text_reserved(void *start, void *end)
467{
468 struct smp_alt_module *mod;
469 const s32 *poff;
470 u8 *text_start = start;
471 u8 *text_end = end;
472
473 list_for_each_entry(mod, &smp_alt_modules, next) {
474 if (mod->text > text_end || mod->text_end < text_start)
475 continue;
476 for (poff = mod->locks; poff < mod->locks_end; poff++) {
477 const u8 *ptr = (const u8 *)poff + *poff;
478
479 if (text_start <= ptr && text_end > ptr)
480 return 1;
481 }
482 }
483
484 return 0;
485}
486#endif
487
488#ifdef CONFIG_PARAVIRT
489void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
490 struct paravirt_patch_site *end)
491{
492 struct paravirt_patch_site *p;
493 char insnbuf[MAX_PATCH_LEN];
494
495 if (noreplace_paravirt)
496 return;
497
498 for (p = start; p < end; p++) {
499 unsigned int used;
500
501 BUG_ON(p->len > MAX_PATCH_LEN);
502 /* prep the buffer with the original instructions */
503 memcpy(insnbuf, p->instr, p->len);
504 used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
505 (unsigned long)p->instr, p->len);
506
507 BUG_ON(used > p->len);
508
509 /* Pad the rest with nops */
510 add_nops(insnbuf + used, p->len - used);
511 text_poke_early(p->instr, insnbuf, p->len);
512 }
513}
514extern struct paravirt_patch_site __start_parainstructions[],
515 __stop_parainstructions[];
516#endif /* CONFIG_PARAVIRT */
517
518void __init alternative_instructions(void)
519{
520 /* The patching is not fully atomic, so try to avoid local interruptions
521 that might execute the to-be-patched code.
522 Other CPUs are not running. */
523 stop_nmi();
524
525 /*
526 * Don't stop machine check exceptions while patching.
527 * MCEs only happen when something got corrupted and in this
528 * case we must do something about the corruption.
529 * Ignoring it is worse than an unlikely patching race.
530 * Also machine checks tend to be broadcast and if one CPU
531 * goes into machine check the others follow quickly, so we don't
532 expect a machine check to cause undue problems during code
533 * patching.
534 */
535
536 apply_alternatives(__alt_instructions, __alt_instructions_end);
537
538 /* switch to patch-once-at-boottime-only mode and free the
539 * tables in case we know the number of CPUs will never ever
540 * change */
541#ifdef CONFIG_HOTPLUG_CPU
542 if (num_possible_cpus() < 2)
543 smp_alt_once = 1;
544#endif
545
546#ifdef CONFIG_SMP
547 if (smp_alt_once) {
548 if (1 == num_possible_cpus()) {
549 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
550 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
551 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
552
553 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
554 _text, _etext);
555 }
556 } else {
557 alternatives_smp_module_add(NULL, "core kernel",
558 __smp_locks, __smp_locks_end,
559 _text, _etext);
560
561 /* Only switch to UP mode if we don't immediately boot others */
562 if (num_present_cpus() == 1 || setup_max_cpus <= 1)
563 alternatives_smp_switch(0);
564 }
565#endif
566 apply_paravirt(__parainstructions, __parainstructions_end);
567
568 if (smp_alt_once)
569 free_init_pages("SMP alternatives",
570 (unsigned long)__smp_locks,
571 (unsigned long)__smp_locks_end);
572
573 restart_nmi();
574}
575
576/**
577 * text_poke_early - Update instructions on a live kernel at boot time
578 * @addr: address to modify
579 * @opcode: source of the copy
580 * @len: length to copy
581 *
582 * When you use this code to patch more than one byte of an instruction
583 * you need to make sure that other CPUs cannot execute this code in parallel.
584 * Also no thread must be currently preempted in the middle of these
585 * instructions. And on the local CPU you need to be protected against NMI or MCE
586 * handlers seeing an inconsistent instruction while you patch.
587 */
588void *__init_or_module text_poke_early(void *addr, const void *opcode,
589 size_t len)
590{
591 unsigned long flags;
592 local_irq_save(flags);
593 memcpy(addr, opcode, len);
594 sync_core();
595 local_irq_restore(flags);
596 /* Could also do a CLFLUSH here to speed up CPU recovery; but
597 that causes hangs on some VIA CPUs. */
598 return addr;
599}
600
601/**
602 * text_poke - Update instructions on a live kernel
603 * @addr: address to modify
604 * @opcode: source of the copy
605 * @len: length to copy
606 *
607 * Only atomic text poke/set should be allowed when not doing early patching.
608 * It means the size must be writable atomically and the address must be aligned
609 * in a way that permits an atomic write. It also makes sure we fit on a single
610 * page.
611 *
612 * Note: Must be called under text_mutex.
613 */
614void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
615{
616 unsigned long flags;
617 char *vaddr;
618 struct page *pages[2];
619 int i;
620
621 if (!core_kernel_text((unsigned long)addr)) {
622 pages[0] = vmalloc_to_page(addr);
623 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
624 } else {
625 pages[0] = virt_to_page(addr);
626 WARN_ON(!PageReserved(pages[0]));
627 pages[1] = virt_to_page(addr + PAGE_SIZE);
628 }
629 BUG_ON(!pages[0]);
630 local_irq_save(flags);
631 set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
632 if (pages[1])
633 set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
634 vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
635 memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
636 clear_fixmap(FIX_TEXT_POKE0);
637 if (pages[1])
638 clear_fixmap(FIX_TEXT_POKE1);
639 local_flush_tlb();
640 sync_core();
641 /* Could also do a CLFLUSH here to speed up CPU recovery; but
642 that causes hangs on some VIA CPUs. */
643 for (i = 0; i < len; i++)
644 BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
645 local_irq_restore(flags);
646 return addr;
647}
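
/*
 * Usage sketch (illustrative): arming an x86 kprobe boils down to something
 * like
 *
 *	mutex_lock(&text_mutex);
 *	text_poke(p->addr, ((unsigned char []){0xcc}), 1);	<- plant an INT3
 *	mutex_unlock(&text_mutex);
 *
 * i.e. a single-byte (and therefore naturally atomic) store done through the
 * temporary FIX_TEXT_POKE0 mapping, which also works on read-only kernel
 * text.
 */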
648
649/*
650 * Cross-modifying kernel text with stop_machine().
651 * This code originally comes from immediate value.
652 */
653static atomic_t stop_machine_first;
654static int wrote_text;
655
656struct text_poke_params {
657 struct text_poke_param *params;
658 int nparams;
659};
660
661static int __kprobes stop_machine_text_poke(void *data)
662{
663 struct text_poke_params *tpp = data;
664 struct text_poke_param *p;
665 int i;
666
667 if (atomic_dec_and_test(&stop_machine_first)) {
668 for (i = 0; i < tpp->nparams; i++) {
669 p = &tpp->params[i];
670 text_poke(p->addr, p->opcode, p->len);
671 }
672 smp_wmb(); /* Make sure other cpus see that this has run */
673 wrote_text = 1;
674 } else {
675 while (!wrote_text)
676 cpu_relax();
677 smp_mb(); /* Load wrote_text before following execution */
678 }
679
680 for (i = 0; i < tpp->nparams; i++) {
681 p = &tpp->params[i];
682 flush_icache_range((unsigned long)p->addr,
683 (unsigned long)p->addr + p->len);
684 }
685 /*
686 * Intel Architecture Software Developer's Manual section 7.1.3 specifies
687 * that a core serializing instruction such as "cpuid" should be
688 * executed on _each_ core before the new instruction is made visible.
689 */
690 sync_core();
691 return 0;
692}
693
694/**
695 * text_poke_smp - Update instructions on a live kernel on SMP
696 * @addr: address to modify
697 * @opcode: source of the copy
698 * @len: length to copy
699 *
700 * Modify multi-byte instructions by using stop_machine() on SMP. This allows
701 * the user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modification
702 * should be allowed, since stop_machine() does _not_ protect code against
703 * NMI and MCE.
704 *
705 * Note: Must be called under get_online_cpus() and text_mutex.
706 */
707void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
708{
709 struct text_poke_params tpp;
710 struct text_poke_param p;
711
712 p.addr = addr;
713 p.opcode = opcode;
714 p.len = len;
715 tpp.params = &p;
716 tpp.nparams = 1;
717 atomic_set(&stop_machine_first, 1);
718 wrote_text = 0;
719 /* Use __stop_machine() because the caller already got online_cpus. */
720 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
721 return addr;
722}
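
/*
 * Usage sketch (illustrative): the optimized-kprobes code uses this (or the
 * batch variant below) to turn a previously planted INT3 into a 5-byte
 * relative jump to its out-of-line trampoline, roughly:
 *
 *	u8 jmp_insn[5] = { 0xe9, };	<- rel32 displacement follows
 *	...
 *	text_poke_smp(kp_addr, jmp_insn, 5);
 *
 * All online CPUs rendezvous inside stop_machine_text_poke(), so none of
 * them can be executing in the middle of the five bytes being rewritten.
 */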
723
724/**
725 * text_poke_smp_batch - Update instructions on a live kernel on SMP
726 * @params: an array of text_poke parameters
727 * @n: the number of elements in params.
728 *
729 * Modify multi-byte instructions by using stop_machine() on SMP. Since
730 * stop_machine() is a heavy operation, it is better to aggregate text_poke requests
731 * and do it once if possible.
732 *
733 * Note: Must be called under get_online_cpus() and text_mutex.
734 */
735void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
736{
737 struct text_poke_params tpp = {.params = params, .nparams = n};
738
739 atomic_set(&stop_machine_first, 1);
740 wrote_text = 0;
741 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
742}
1// SPDX-License-Identifier: GPL-2.0-only
2#define pr_fmt(fmt) "SMP alternatives: " fmt
3
4#include <linux/module.h>
5#include <linux/sched.h>
6#include <linux/perf_event.h>
7#include <linux/mutex.h>
8#include <linux/list.h>
9#include <linux/stringify.h>
10#include <linux/highmem.h>
11#include <linux/mm.h>
12#include <linux/vmalloc.h>
13#include <linux/memory.h>
14#include <linux/stop_machine.h>
15#include <linux/slab.h>
16#include <linux/kdebug.h>
17#include <linux/kprobes.h>
18#include <linux/mmu_context.h>
19#include <linux/bsearch.h>
20#include <linux/sync_core.h>
21#include <asm/text-patching.h>
22#include <asm/alternative.h>
23#include <asm/sections.h>
24#include <asm/mce.h>
25#include <asm/nmi.h>
26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h>
28#include <asm/insn.h>
29#include <asm/io.h>
30#include <asm/fixmap.h>
31#include <asm/paravirt.h>
32#include <asm/asm-prototypes.h>
33
34int __read_mostly alternatives_patched;
35
36EXPORT_SYMBOL_GPL(alternatives_patched);
37
38#define MAX_PATCH_LEN (255-1)
39
40static int __initdata_or_module debug_alternative;
41
42static int __init debug_alt(char *str)
43{
44 debug_alternative = 1;
45 return 1;
46}
47__setup("debug-alternative", debug_alt);
48
49static int noreplace_smp;
50
51static int __init setup_noreplace_smp(char *str)
52{
53 noreplace_smp = 1;
54 return 1;
55}
56__setup("noreplace-smp", setup_noreplace_smp);
57
58#define DPRINTK(fmt, args...) \
59do { \
60 if (debug_alternative) \
61 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
62} while (0)
63
64#define DUMP_BYTES(buf, len, fmt, args...) \
65do { \
66 if (unlikely(debug_alternative)) { \
67 int j; \
68 \
69 if (!(len)) \
70 break; \
71 \
72 printk(KERN_DEBUG pr_fmt(fmt), ##args); \
73 for (j = 0; j < (len) - 1; j++) \
74 printk(KERN_CONT "%02hhx ", buf[j]); \
75 printk(KERN_CONT "%02hhx\n", buf[j]); \
76 } \
77} while (0)
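
/*
 * Example (illustrative): booting with "debug-alternative" makes the patch
 * sites below log lines roughly like
 *
 *	SMP alternatives: ffffffff81xxxxxx: old_insn: 90 90 90 90 90
 *	SMP alternatives: ffffffff81xxxxxx: final_insn: 0f 1f 44 00 00
 *
 * which is handy when bisecting a misbehaving alternatives entry.
 */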
78
79static const unsigned char x86nops[] =
80{
81 BYTES_NOP1,
82 BYTES_NOP2,
83 BYTES_NOP3,
84 BYTES_NOP4,
85 BYTES_NOP5,
86 BYTES_NOP6,
87 BYTES_NOP7,
88 BYTES_NOP8,
89};
90
91const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
92{
93 NULL,
94 x86nops,
95 x86nops + 1,
96 x86nops + 1 + 2,
97 x86nops + 1 + 2 + 3,
98 x86nops + 1 + 2 + 3 + 4,
99 x86nops + 1 + 2 + 3 + 4 + 5,
100 x86nops + 1 + 2 + 3 + 4 + 5 + 6,
101 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
102};
103
104/* Use this to add nops to a buffer, then text_poke the whole buffer. */
105static void __init_or_module add_nops(void *insns, unsigned int len)
106{
107 while (len > 0) {
108 unsigned int noplen = len;
109 if (noplen > ASM_NOP_MAX)
110 noplen = ASM_NOP_MAX;
111 memcpy(insns, x86_nops[noplen], noplen);
112 insns += noplen;
113 len -= noplen;
114 }
115}
116
117extern s32 __retpoline_sites[], __retpoline_sites_end[];
118extern s32 __return_sites[], __return_sites_end[];
119extern s32 __cfi_sites[], __cfi_sites_end[];
120extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
121extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
122extern s32 __smp_locks[], __smp_locks_end[];
123void text_poke_early(void *addr, const void *opcode, size_t len);
124
125/*
126 * Are we looking at a near JMP with a 1- or 4-byte displacement?
127 */
128static inline bool is_jmp(const u8 opcode)
129{
130 return opcode == 0xeb || opcode == 0xe9;
131}
132
133static void __init_or_module
134recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
135{
136 u8 *next_rip, *tgt_rip;
137 s32 n_dspl, o_dspl;
138 int repl_len;
139
140 if (a->replacementlen != 5)
141 return;
142
143 o_dspl = *(s32 *)(insn_buff + 1);
144
145 /* next_rip of the replacement JMP */
146 next_rip = repl_insn + a->replacementlen;
147 /* target rip of the replacement JMP */
148 tgt_rip = next_rip + o_dspl;
149 n_dspl = tgt_rip - orig_insn;
150
151 DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
152
153 if (tgt_rip - orig_insn >= 0) {
154 if (n_dspl - 2 <= 127)
155 goto two_byte_jmp;
156 else
157 goto five_byte_jmp;
158 /* negative offset */
159 } else {
160 if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
161 goto two_byte_jmp;
162 else
163 goto five_byte_jmp;
164 }
165
166two_byte_jmp:
167 n_dspl -= 2;
168
169 insn_buff[0] = 0xeb;
170 insn_buff[1] = (s8)n_dspl;
171 add_nops(insn_buff + 2, 3);
172
173 repl_len = 2;
174 goto done;
175
176five_byte_jmp:
177 n_dspl -= 5;
178
179 insn_buff[0] = 0xe9;
180 *(s32 *)&insn_buff[1] = n_dspl;
181
182 repl_len = 5;
183
184done:
185
186 DPRINTK("final displ: 0x%08x, JMP 0x%lx",
187 n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
188}
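
/*
 * Worked example (illustrative): suppose the replacement bytes are
 * "e9 10 00 00 00" (JMP.d32 +0x10) and the replacement buffer happens to sit
 * 0x40 bytes after the original site.  Then o_dspl = 0x10,
 * tgt_rip = repl_insn + 5 + 0x10 and n_dspl = tgt_rip - orig_insn = 0x55.
 * Since 0x55 - 2 fits in a signed byte, the buffer is rewritten to
 * "eb 53" plus three nops: a JMP.d8 that, executed at the original location,
 * lands on the very same target.
 */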
189
190/*
191 * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
192 *
193 * @instr: instruction byte stream
194 * @instrlen: length of the above
195 * @off: offset within @instr where the first NOP has been detected
196 *
197 * Return: number of NOPs found (and replaced).
198 */
199static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
200{
201 unsigned long flags;
202 int i = off, nnops;
203
204 while (i < instrlen) {
205 if (instr[i] != 0x90)
206 break;
207
208 i++;
209 }
210
211 nnops = i - off;
212
213 if (nnops <= 1)
214 return nnops;
215
216 local_irq_save(flags);
217 add_nops(instr + off, nnops);
218 local_irq_restore(flags);
219
220 DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
221
222 return nnops;
223}
224
225/*
226 * "noinline" to cause control flow change and thus invalidate I$ and
227 * cause refetch after modification.
228 */
229static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
230{
231 struct insn insn;
232 int i = 0;
233
234 /*
235 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
236 * ones.
237 */
238 for (;;) {
239 if (insn_decode_kernel(&insn, &instr[i]))
240 return;
241
242 /*
243 * See if this and any potentially following NOPs can be
244 * optimized.
245 */
246 if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
247 i += optimize_nops_range(instr, len, i);
248 else
249 i += insn.length;
250
251 if (i >= len)
252 return;
253 }
254}
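
/*
 * Example (illustrative): a leftover "90 90 90 90" run inside a patched site
 * is collapsed by optimize_nops_range() into the single four-byte nop
 * (BYTES_NOP4, "0f 1f 40 00" on a 64-bit build), which decodes as one
 * instruction instead of four.
 */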
255
256/*
257 * Replace instructions with better alternatives for this CPU type. This runs
258 * before SMP is initialized to avoid SMP problems with self-modifying code.
259 * This implies that asymmetric systems where APs have fewer capabilities than
260 * the boot processor are not handled. Tough. Make sure you disable such
261 * features by hand.
262 *
263 * Marked "noinline" to cause control flow change and thus insn cache
264 * to refetch changed I$ lines.
265 */
266void __init_or_module noinline apply_alternatives(struct alt_instr *start,
267 struct alt_instr *end)
268{
269 struct alt_instr *a;
270 u8 *instr, *replacement;
271 u8 insn_buff[MAX_PATCH_LEN];
272
273 DPRINTK("alt table %px -> %px", start, end);
274 /*
275 * The scan order should be from start to end. A later scanned
276 * alternative code can overwrite previously scanned alternative code.
277 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
278 * patch code.
279 *
280 * So be careful if you want to change the scan order to any other
281 * order.
282 */
283 for (a = start; a < end; a++) {
284 int insn_buff_sz = 0;
285 /* Mask away "NOT" flag bit for feature to test. */
286 u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
287
288 instr = (u8 *)&a->instr_offset + a->instr_offset;
289 replacement = (u8 *)&a->repl_offset + a->repl_offset;
290 BUG_ON(a->instrlen > sizeof(insn_buff));
291 BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
292
293 /*
294 * Patch if either:
295 * - feature is present
296 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
297 * patch if feature is *NOT* present.
298 */
299 if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
300 goto next;
301
302 DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
303 (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
304 feature >> 5,
305 feature & 0x1f,
306 instr, instr, a->instrlen,
307 replacement, a->replacementlen);
308
309 DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
310 DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
311
312 memcpy(insn_buff, replacement, a->replacementlen);
313 insn_buff_sz = a->replacementlen;
314
315 /*
316 * 0xe8 is a relative CALL; fix the offset.
317 *
318 * Instruction length is checked before the opcode to avoid
319 * accessing uninitialized bytes for zero-length replacements.
320 */
321 if (a->replacementlen == 5 && *insn_buff == 0xe8) {
322 *(s32 *)(insn_buff + 1) += replacement - instr;
323 DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
324 *(s32 *)(insn_buff + 1),
325 (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
326 }
327
328 if (a->replacementlen && is_jmp(replacement[0]))
329 recompute_jump(a, instr, replacement, insn_buff);
330
331 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
332 insn_buff[insn_buff_sz] = 0x90;
333
334 DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
335
336 text_poke_early(instr, insn_buff, insn_buff_sz);
337
338next:
339 optimize_nops(instr, a->instrlen);
340 }
341}
342
343#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
344
345/*
346 * CALL/JMP *%\reg
347 */
348static int emit_indirect(int op, int reg, u8 *bytes)
349{
350 int i = 0;
351 u8 modrm;
352
353 switch (op) {
354 case CALL_INSN_OPCODE:
355 modrm = 0x10; /* Reg = 2; CALL r/m */
356 break;
357
358 case JMP32_INSN_OPCODE:
359 modrm = 0x20; /* Reg = 4; JMP r/m */
360 break;
361
362 default:
363 WARN_ON_ONCE(1);
364 return -1;
365 }
366
367 if (reg >= 8) {
368 bytes[i++] = 0x41; /* REX.B prefix */
369 reg -= 8;
370 }
371
372 modrm |= 0xc0; /* Mod = 3 */
373 modrm += reg;
374
375 bytes[i++] = 0xff; /* opcode */
376 bytes[i++] = modrm;
377
378 return i;
379}
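
/*
 * Worked example: emit_indirect(CALL_INSN_OPCODE, 11, bytes) produces
 * "41 ff d3", i.e. CALL *%r11: the 0x41 REX.B prefix selects r8-r15, 0xff is
 * the opcode and the ModRM byte 0xd3 encodes mod=3, reg=2 (CALL), rm=3 (r11
 * together with REX.B).
 */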
380
381static inline bool is_jcc32(struct insn *insn)
382{
383 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
384 return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
385}
386
387static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
388{
389 u8 op = insn->opcode.bytes[0];
390 int i = 0;
391
392 /*
393 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
394 * tail-calls. Deal with them.
395 */
396 if (is_jcc32(insn)) {
397 bytes[i++] = op;
398 op = insn->opcode.bytes[1];
399 goto clang_jcc;
400 }
401
402 if (insn->length == 6)
403 bytes[i++] = 0x2e; /* CS-prefix */
404
405 switch (op) {
406 case CALL_INSN_OPCODE:
407 __text_gen_insn(bytes+i, op, addr+i,
408 __x86_indirect_call_thunk_array[reg],
409 CALL_INSN_SIZE);
410 i += CALL_INSN_SIZE;
411 break;
412
413 case JMP32_INSN_OPCODE:
414clang_jcc:
415 __text_gen_insn(bytes+i, op, addr+i,
416 __x86_indirect_jump_thunk_array[reg],
417 JMP32_INSN_SIZE);
418 i += JMP32_INSN_SIZE;
419 break;
420
421 default:
422 WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
423 return -1;
424 }
425
426 WARN_ON_ONCE(i != insn->length);
427
428 return i;
429}
430
431/*
432 * Rewrite the compiler generated retpoline thunk calls.
433 *
434 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
435 * indirect instructions, avoiding the extra indirection.
436 *
437 * For example, convert:
438 *
439 * CALL __x86_indirect_thunk_\reg
440 *
441 * into:
442 *
443 * CALL *%\reg
444 *
445 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
446 */
447static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
448{
449 retpoline_thunk_t *target;
450 int reg, ret, i = 0;
451 u8 op, cc;
452
453 target = addr + insn->length + insn->immediate.value;
454 reg = target - __x86_indirect_thunk_array;
455
456 if (WARN_ON_ONCE(reg & ~0xf))
457 return -1;
458
459 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
460 BUG_ON(reg == 4);
461
462 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
463 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
464 if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
465 return emit_call_track_retpoline(addr, insn, reg, bytes);
466
467 return -1;
468 }
469
470 op = insn->opcode.bytes[0];
471
472 /*
473 * Convert:
474 *
475 * Jcc.d32 __x86_indirect_thunk_\reg
476 *
477 * into:
478 *
479 * Jncc.d8 1f
480 * [ LFENCE ]
481 * JMP *%\reg
482 * [ NOP ]
483 * 1:
484 */
485 if (is_jcc32(insn)) {
486 cc = insn->opcode.bytes[1] & 0xf;
487 cc ^= 1; /* invert condition */
488
489 bytes[i++] = 0x70 + cc; /* Jcc.d8 */
490 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
491
492 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
493 op = JMP32_INSN_OPCODE;
494 }
495
496 /*
497 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
498 */
499 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
500 bytes[i++] = 0x0f;
501 bytes[i++] = 0xae;
502 bytes[i++] = 0xe8; /* LFENCE */
503 }
504
505 ret = emit_indirect(op, reg, bytes + i);
506 if (ret < 0)
507 return ret;
508 i += ret;
509
510 /*
511 * The compiler is supposed to EMIT an INT3 after every unconditional
512 * JMP instruction due to AMD BTC. However, if the compiler is too old
513 * or SLS isn't enabled, we still need an INT3 after indirect JMPs
514 * even on Intel.
515 */
516 if (op == JMP32_INSN_OPCODE && i < insn->length)
517 bytes[i++] = INT3_INSN_OPCODE;
518
519 for (; i < insn->length;)
520 bytes[i++] = BYTES_NOP1;
521
522 return i;
523}
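
/*
 * Example (illustrative): with retpolines disabled, a compiler-generated
 *
 *	e8 xx xx xx xx		call __x86_indirect_thunk_rax
 *
 * is rewritten in place to
 *
 *	ff d0 90 90 90		call *%rax + nop padding
 *
 * while with spectre_v2=retpoline,lfence the same five bytes become
 *
 *	0f ae e8 ff d0		lfence; call *%rax
 *
 * Sites where the emitted sequence does not fit are simply left pointing at
 * the thunk (see the length check in apply_retpolines() below).
 */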
524
525/*
526 * Generated by 'objtool --retpoline'.
527 */
528void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
529{
530 s32 *s;
531
532 for (s = start; s < end; s++) {
533 void *addr = (void *)s + *s;
534 struct insn insn;
535 int len, ret;
536 u8 bytes[16];
537 u8 op1, op2;
538
539 ret = insn_decode_kernel(&insn, addr);
540 if (WARN_ON_ONCE(ret < 0))
541 continue;
542
543 op1 = insn.opcode.bytes[0];
544 op2 = insn.opcode.bytes[1];
545
546 switch (op1) {
547 case CALL_INSN_OPCODE:
548 case JMP32_INSN_OPCODE:
549 break;
550
551 case 0x0f: /* escape */
552 if (op2 >= 0x80 && op2 <= 0x8f)
553 break;
554 fallthrough;
555 default:
556 WARN_ON_ONCE(1);
557 continue;
558 }
559
560 DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
561 addr, addr, insn.length,
562 addr + insn.length + insn.immediate.value);
563
564 len = patch_retpoline(addr, &insn, bytes);
565 if (len == insn.length) {
566 optimize_nops(bytes, len);
567 DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
568 DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
569 text_poke_early(addr, bytes, len);
570 }
571 }
572}
573
574#ifdef CONFIG_RETHUNK
575
576#ifdef CONFIG_CALL_THUNKS
577void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
578#endif
579
580/*
581 * Rewrite the compiler generated return thunk tail-calls.
582 *
583 * For example, convert:
584 *
585 * JMP __x86_return_thunk
586 *
587 * into:
588 *
589 * RET
590 */
591static int patch_return(void *addr, struct insn *insn, u8 *bytes)
592{
593 int i = 0;
594
595 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
596 if (x86_return_thunk == __x86_return_thunk)
597 return -1;
598
599 i = JMP32_INSN_SIZE;
600 __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
601 } else {
602 bytes[i++] = RET_INSN_OPCODE;
603 }
604
605 for (; i < insn->length;)
606 bytes[i++] = INT3_INSN_OPCODE;
607 return i;
608}
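
/*
 * Example (illustrative): a compiler-generated tail call
 *
 *	e9 xx xx xx xx		jmp __x86_return_thunk
 *
 * becomes, without X86_FEATURE_RETHUNK,
 *
 *	c3 cc cc cc cc		ret; int3 padding
 *
 * while with an alternative return thunk installed the JMP is simply
 * retargeted at x86_return_thunk.
 */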
609
610void __init_or_module noinline apply_returns(s32 *start, s32 *end)
611{
612 s32 *s;
613
614 for (s = start; s < end; s++) {
615 void *dest = NULL, *addr = (void *)s + *s;
616 struct insn insn;
617 int len, ret;
618 u8 bytes[16];
619 u8 op;
620
621 ret = insn_decode_kernel(&insn, addr);
622 if (WARN_ON_ONCE(ret < 0))
623 continue;
624
625 op = insn.opcode.bytes[0];
626 if (op == JMP32_INSN_OPCODE)
627 dest = addr + insn.length + insn.immediate.value;
628
629 if (__static_call_fixup(addr, op, dest) ||
630 WARN_ONCE(dest != &__x86_return_thunk,
631 "missing return thunk: %pS-%pS: %*ph",
632 addr, dest, 5, addr))
633 continue;
634
635 DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
636 addr, addr, insn.length,
637 addr + insn.length + insn.immediate.value);
638
639 len = patch_return(addr, &insn, bytes);
640 if (len == insn.length) {
641 DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
642 DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
643 text_poke_early(addr, bytes, len);
644 }
645 }
646}
647#else
648void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
649#endif /* CONFIG_RETHUNK */
650
651#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
652
653void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
654void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
655
656#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
657
658#ifdef CONFIG_X86_KERNEL_IBT
659
660static void poison_endbr(void *addr, bool warn)
661{
662 u32 endbr, poison = gen_endbr_poison();
663
664 if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
665 return;
666
667 if (!is_endbr(endbr)) {
668 WARN_ON_ONCE(warn);
669 return;
670 }
671
672 DPRINTK("ENDBR at: %pS (%px)", addr, addr);
673
674 /*
675 * When we have IBT, the lack of ENDBR will trigger #CP
676 */
677 DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
678 DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
679 text_poke_early(addr, &poison, 4);
680}
681
682/*
683 * Generated by: objtool --ibt
684 */
685void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
686{
687 s32 *s;
688
689 for (s = start; s < end; s++) {
690 void *addr = (void *)s + *s;
691
692 poison_endbr(addr, true);
693 if (IS_ENABLED(CONFIG_FINEIBT))
694 poison_endbr(addr - 16, false);
695 }
696}
697
698#else
699
700void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { }
701
702#endif /* CONFIG_X86_KERNEL_IBT */
703
704#ifdef CONFIG_FINEIBT
705
706enum cfi_mode {
707 CFI_DEFAULT,
708 CFI_OFF,
709 CFI_KCFI,
710 CFI_FINEIBT,
711};
712
713static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
714static bool cfi_rand __ro_after_init = true;
715static u32 cfi_seed __ro_after_init;
716
717/*
718 * Re-hash the CFI hash with a boot-time seed while making sure the result is
719 * not a valid ENDBR instruction.
720 */
721static u32 cfi_rehash(u32 hash)
722{
723 hash ^= cfi_seed;
724 while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
725 bool lsb = hash & 1;
726 hash >>= 1;
727 if (lsb)
728 hash ^= 0x80200003;
729 }
730 return hash;
731}
732
733static __init int cfi_parse_cmdline(char *str)
734{
735 if (!str)
736 return -EINVAL;
737
738 while (str) {
739 char *next = strchr(str, ',');
740 if (next) {
741 *next = 0;
742 next++;
743 }
744
745 if (!strcmp(str, "auto")) {
746 cfi_mode = CFI_DEFAULT;
747 } else if (!strcmp(str, "off")) {
748 cfi_mode = CFI_OFF;
749 cfi_rand = false;
750 } else if (!strcmp(str, "kcfi")) {
751 cfi_mode = CFI_KCFI;
752 } else if (!strcmp(str, "fineibt")) {
753 cfi_mode = CFI_FINEIBT;
754 } else if (!strcmp(str, "norand")) {
755 cfi_rand = false;
756 } else {
757 pr_err("Ignoring unknown cfi option (%s).", str);
758 }
759
760 str = next;
761 }
762
763 return 0;
764}
765early_param("cfi", cfi_parse_cmdline);
766
767/*
768 *	kCFI						FineIBT
769 *
770 * __cfi_\func:					__cfi_\func:
771 *	movl   $0x12345678,%eax		// 5		endbr64				// 4
772 *	nop						subl   $0x12345678,%r10d	// 7
773 *	nop						jz     1f			// 2
774 *	nop						ud2				// 2
775 *	nop					1:	nop				// 1
776 * nop
777 * nop
778 * nop
779 * nop
780 * nop
781 * nop
782 * nop
783 *
784 *
785 * caller:						caller:
786 *	movl	$(-0x12345678),%r10d	// 6		movl	$0x12345678,%r10d	// 6
787 *	addl	$-15(%r11),%r10d	// 4		sub	$16,%r11		// 4
788 *	je	1f			// 2		nop4				// 4
789 *	ud2				// 2
790 * 1:	call	__x86_indirect_thunk_r11 // 5		call	*%r11; nop2;		// 5
791 *
792 */
793
794asm( ".pushsection .rodata \n"
795 "fineibt_preamble_start: \n"
796 " endbr64 \n"
797 " subl $0x12345678, %r10d \n"
798 " je fineibt_preamble_end \n"
799 " ud2 \n"
800 " nop \n"
801 "fineibt_preamble_end: \n"
802 ".popsection\n"
803);
804
805extern u8 fineibt_preamble_start[];
806extern u8 fineibt_preamble_end[];
807
808#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
809#define fineibt_preamble_hash 7
810
811asm( ".pushsection .rodata \n"
812 "fineibt_caller_start: \n"
813 " movl $0x12345678, %r10d \n"
814 " sub $16, %r11 \n"
815 ASM_NOP4
816 "fineibt_caller_end: \n"
817 ".popsection \n"
818);
819
820extern u8 fineibt_caller_start[];
821extern u8 fineibt_caller_end[];
822
823#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
824#define fineibt_caller_hash 2
825
826#define fineibt_caller_jmp (fineibt_caller_size - 2)
827
828static u32 decode_preamble_hash(void *addr)
829{
830 u8 *p = addr;
831
832 /* b8 78 56 34 12 mov $0x12345678,%eax */
833 if (p[0] == 0xb8)
834 return *(u32 *)(addr + 1);
835
836 return 0; /* invalid hash value */
837}
838
839static u32 decode_caller_hash(void *addr)
840{
841 u8 *p = addr;
842
843 /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
844 if (p[0] == 0x41 && p[1] == 0xba)
845 return -*(u32 *)(addr + 2);
846
847 /* eb 0c 78 56 34 12 jmp.d8 +12 */
848 if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
849 return -*(u32 *)(addr + 2);
850
851 return 0; /* invalid hash value */
852}
853
854/* .retpoline_sites */
855static int cfi_disable_callers(s32 *start, s32 *end)
856{
857 /*
858 * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
859 * intact for later usage. Also see decode_caller_hash() and
860 * cfi_rewrite_callers().
861 */
862 const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
863 s32 *s;
864
865 for (s = start; s < end; s++) {
866 void *addr = (void *)s + *s;
867 u32 hash;
868
869 addr -= fineibt_caller_size;
870 hash = decode_caller_hash(addr);
871 if (!hash) /* nocfi callers */
872 continue;
873
874 text_poke_early(addr, jmp, 2);
875 }
876
877 return 0;
878}
879
880static int cfi_enable_callers(s32 *start, s32 *end)
881{
882 /*
883 * Re-enable kCFI, undo what cfi_disable_callers() did.
884 */
885 const u8 mov[] = { 0x41, 0xba };
886 s32 *s;
887
888 for (s = start; s < end; s++) {
889 void *addr = (void *)s + *s;
890 u32 hash;
891
892 addr -= fineibt_caller_size;
893 hash = decode_caller_hash(addr);
894 if (!hash) /* nocfi callers */
895 continue;
896
897 text_poke_early(addr, mov, 2);
898 }
899
900 return 0;
901}
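
/*
 * Example (illustrative): a kCFI caller check starts out as
 *
 *	41 ba 78 56 34 12	movl $0x12345678, %r10d
 *
 * cfi_disable_callers() turns the first two bytes into "eb 0c" (JMP.d8 +12),
 * which skips the whole check while leaving the hash immediate in place, and
 * cfi_enable_callers() restores the "41 ba" to re-arm it; these two helpers
 * only ever rewrite those two opcode bytes.
 */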
902
903/* .cfi_sites */
904static int cfi_rand_preamble(s32 *start, s32 *end)
905{
906 s32 *s;
907
908 for (s = start; s < end; s++) {
909 void *addr = (void *)s + *s;
910 u32 hash;
911
912 hash = decode_preamble_hash(addr);
913 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
914 addr, addr, 5, addr))
915 return -EINVAL;
916
917 hash = cfi_rehash(hash);
918 text_poke_early(addr + 1, &hash, 4);
919 }
920
921 return 0;
922}
923
924static int cfi_rewrite_preamble(s32 *start, s32 *end)
925{
926 s32 *s;
927
928 for (s = start; s < end; s++) {
929 void *addr = (void *)s + *s;
930 u32 hash;
931
932 hash = decode_preamble_hash(addr);
933 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
934 addr, addr, 5, addr))
935 return -EINVAL;
936
937 text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
938 WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
939 text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
940 }
941
942 return 0;
943}
944
945/* .retpoline_sites */
946static int cfi_rand_callers(s32 *start, s32 *end)
947{
948 s32 *s;
949
950 for (s = start; s < end; s++) {
951 void *addr = (void *)s + *s;
952 u32 hash;
953
954 addr -= fineibt_caller_size;
955 hash = decode_caller_hash(addr);
956 if (hash) {
957 hash = -cfi_rehash(hash);
958 text_poke_early(addr + 2, &hash, 4);
959 }
960 }
961
962 return 0;
963}
964
965static int cfi_rewrite_callers(s32 *start, s32 *end)
966{
967 s32 *s;
968
969 for (s = start; s < end; s++) {
970 void *addr = (void *)s + *s;
971 u32 hash;
972
973 addr -= fineibt_caller_size;
974 hash = decode_caller_hash(addr);
975 if (hash) {
976 text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
977 WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
978 text_poke_early(addr + fineibt_caller_hash, &hash, 4);
979 }
980 /* rely on apply_retpolines() */
981 }
982
983 return 0;
984}
985
986static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
987 s32 *start_cfi, s32 *end_cfi, bool builtin)
988{
989 int ret;
990
991 if (WARN_ONCE(fineibt_preamble_size != 16,
992 "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
993 return;
994
995 if (cfi_mode == CFI_DEFAULT) {
996 cfi_mode = CFI_KCFI;
997 if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
998 cfi_mode = CFI_FINEIBT;
999 }
1000
1001 /*
1002 * Rewrite the callers to not use the __cfi_ stubs, such that we might
1003 * rewrite them. This disables all CFI. If this succeeds but any of the
1004 * later stages fail, we're without CFI.
1005 */
1006 ret = cfi_disable_callers(start_retpoline, end_retpoline);
1007 if (ret)
1008 goto err;
1009
1010 if (cfi_rand) {
1011 if (builtin)
1012 cfi_seed = get_random_u32();
1013
1014 ret = cfi_rand_preamble(start_cfi, end_cfi);
1015 if (ret)
1016 goto err;
1017
1018 ret = cfi_rand_callers(start_retpoline, end_retpoline);
1019 if (ret)
1020 goto err;
1021 }
1022
1023 switch (cfi_mode) {
1024 case CFI_OFF:
1025 if (builtin)
1026 pr_info("Disabling CFI\n");
1027 return;
1028
1029 case CFI_KCFI:
1030 ret = cfi_enable_callers(start_retpoline, end_retpoline);
1031 if (ret)
1032 goto err;
1033
1034 if (builtin)
1035 pr_info("Using kCFI\n");
1036 return;
1037
1038 case CFI_FINEIBT:
1039 ret = cfi_rewrite_preamble(start_cfi, end_cfi);
1040 if (ret)
1041 goto err;
1042
1043 ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
1044 if (ret)
1045 goto err;
1046
1047 if (builtin)
1048 pr_info("Using FineIBT CFI\n");
1049 return;
1050
1051 default:
1052 break;
1053 }
1054
1055err:
1056 pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1057}
1058
1059#else
1060
1061static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1062 s32 *start_cfi, s32 *end_cfi, bool builtin)
1063{
1064}
1065
1066#endif
1067
1068void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1069 s32 *start_cfi, s32 *end_cfi)
1070{
1071 return __apply_fineibt(start_retpoline, end_retpoline,
1072 start_cfi, end_cfi,
1073 /* .builtin = */ false);
1074}
1075
1076#ifdef CONFIG_SMP
1077static void alternatives_smp_lock(const s32 *start, const s32 *end,
1078 u8 *text, u8 *text_end)
1079{
1080 const s32 *poff;
1081
1082 for (poff = start; poff < end; poff++) {
1083 u8 *ptr = (u8 *)poff + *poff;
1084
1085 if (!*poff || ptr < text || ptr >= text_end)
1086 continue;
1087 /* turn DS segment override prefix into lock prefix */
1088 if (*ptr == 0x3e)
1089 text_poke(ptr, ((unsigned char []){0xf0}), 1);
1090 }
1091}
1092
1093static void alternatives_smp_unlock(const s32 *start, const s32 *end,
1094 u8 *text, u8 *text_end)
1095{
1096 const s32 *poff;
1097
1098 for (poff = start; poff < end; poff++) {
1099 u8 *ptr = (u8 *)poff + *poff;
1100
1101 if (!*poff || ptr < text || ptr >= text_end)
1102 continue;
1103 /* turn lock prefix into DS segment override prefix */
1104 if (*ptr == 0xf0)
1105 text_poke(ptr, ((unsigned char []){0x3E}), 1);
1106 }
1107}
1108
1109struct smp_alt_module {
1110 /* the module owning these lock prefixes (NULL for the core kernel) */
1111 struct module *mod;
1112 char *name;
1113
1114 /* ptrs to lock prefixes */
1115 const s32 *locks;
1116 const s32 *locks_end;
1117
1118 /* .text segment, needed to avoid patching init code ;) */
1119 u8 *text;
1120 u8 *text_end;
1121
1122 struct list_head next;
1123};
1124static LIST_HEAD(smp_alt_modules);
1125static bool uniproc_patched = false; /* protected by text_mutex */
1126
1127void __init_or_module alternatives_smp_module_add(struct module *mod,
1128 char *name,
1129 void *locks, void *locks_end,
1130 void *text, void *text_end)
1131{
1132 struct smp_alt_module *smp;
1133
1134 mutex_lock(&text_mutex);
1135 if (!uniproc_patched)
1136 goto unlock;
1137
1138 if (num_possible_cpus() == 1)
1139 /* Don't bother remembering, we'll never have to undo it. */
1140 goto smp_unlock;
1141
1142 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
1143 if (NULL == smp)
1144 /* we'll run the (safe but slow) SMP code then ... */
1145 goto unlock;
1146
1147 smp->mod = mod;
1148 smp->name = name;
1149 smp->locks = locks;
1150 smp->locks_end = locks_end;
1151 smp->text = text;
1152 smp->text_end = text_end;
1153 DPRINTK("locks %p -> %p, text %p -> %p, name %s",
1154 smp->locks, smp->locks_end,
1155 smp->text, smp->text_end, smp->name);
1156
1157 list_add_tail(&smp->next, &smp_alt_modules);
1158smp_unlock:
1159 alternatives_smp_unlock(locks, locks_end, text, text_end);
1160unlock:
1161 mutex_unlock(&text_mutex);
1162}
1163
1164void __init_or_module alternatives_smp_module_del(struct module *mod)
1165{
1166 struct smp_alt_module *item;
1167
1168 mutex_lock(&text_mutex);
1169 list_for_each_entry(item, &smp_alt_modules, next) {
1170 if (mod != item->mod)
1171 continue;
1172 list_del(&item->next);
1173 kfree(item);
1174 break;
1175 }
1176 mutex_unlock(&text_mutex);
1177}
1178
1179void alternatives_enable_smp(void)
1180{
1181 struct smp_alt_module *mod;
1182
1183 /* Why bother if there are no other CPUs? */
1184 BUG_ON(num_possible_cpus() == 1);
1185
1186 mutex_lock(&text_mutex);
1187
1188 if (uniproc_patched) {
1189 pr_info("switching to SMP code\n");
1190 BUG_ON(num_online_cpus() != 1);
1191 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
1192 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
1193 list_for_each_entry(mod, &smp_alt_modules, next)
1194 alternatives_smp_lock(mod->locks, mod->locks_end,
1195 mod->text, mod->text_end);
1196 uniproc_patched = false;
1197 }
1198 mutex_unlock(&text_mutex);
1199}
1200
1201/*
1202 * Return 1 if the address range is reserved for SMP-alternatives.
1203 * Must hold text_mutex.
1204 */
1205int alternatives_text_reserved(void *start, void *end)
1206{
1207 struct smp_alt_module *mod;
1208 const s32 *poff;
1209 u8 *text_start = start;
1210 u8 *text_end = end;
1211
1212 lockdep_assert_held(&text_mutex);
1213
1214 list_for_each_entry(mod, &smp_alt_modules, next) {
1215 if (mod->text > text_end || mod->text_end < text_start)
1216 continue;
1217 for (poff = mod->locks; poff < mod->locks_end; poff++) {
1218 const u8 *ptr = (const u8 *)poff + *poff;
1219
1220 if (text_start <= ptr && text_end > ptr)
1221 return 1;
1222 }
1223 }
1224
1225 return 0;
1226}
1227#endif /* CONFIG_SMP */
1228
1229#ifdef CONFIG_PARAVIRT
1230void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
1231 struct paravirt_patch_site *end)
1232{
1233 struct paravirt_patch_site *p;
1234 char insn_buff[MAX_PATCH_LEN];
1235
1236 for (p = start; p < end; p++) {
1237 unsigned int used;
1238
1239 BUG_ON(p->len > MAX_PATCH_LEN);
1240 /* prep the buffer with the original instructions */
1241 memcpy(insn_buff, p->instr, p->len);
1242 used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
1243
1244 BUG_ON(used > p->len);
1245
1246 /* Pad the rest with nops */
1247 add_nops(insn_buff + used, p->len - used);
1248 text_poke_early(p->instr, insn_buff, p->len);
1249 }
1250}
1251extern struct paravirt_patch_site __start_parainstructions[],
1252 __stop_parainstructions[];
1253#endif /* CONFIG_PARAVIRT */
1254
1255/*
1256 * Self-test for the INT3 based CALL emulation code.
1257 *
1258 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
1259 * properly and that there is a stack gap between the INT3 frame and the
1260 * previous context. Without this gap doing a virtual PUSH on the interrupted
1261 * stack would corrupt the INT3 IRET frame.
1262 *
1263 * See entry_{32,64}.S for more details.
1264 */
1265
1266/*
1267 * We define the int3_magic() function in assembly to control the calling
1268 * convention such that we can 'call' it from assembly.
1269 */
1270
1271extern void int3_magic(unsigned int *ptr); /* defined in asm */
1272
1273asm (
1274" .pushsection .init.text, \"ax\", @progbits\n"
1275" .type int3_magic, @function\n"
1276"int3_magic:\n"
1277 ANNOTATE_NOENDBR
1278" movl $1, (%" _ASM_ARG1 ")\n"
1279 ASM_RET
1280" .size int3_magic, .-int3_magic\n"
1281" .popsection\n"
1282);
1283
1284extern void int3_selftest_ip(void); /* defined in asm below */
1285
1286static int __init
1287int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
1288{
1289 unsigned long selftest = (unsigned long)&int3_selftest_ip;
1290 struct die_args *args = data;
1291 struct pt_regs *regs = args->regs;
1292
1293 OPTIMIZER_HIDE_VAR(selftest);
1294
1295 if (!regs || user_mode(regs))
1296 return NOTIFY_DONE;
1297
1298 if (val != DIE_INT3)
1299 return NOTIFY_DONE;
1300
1301 if (regs->ip - INT3_INSN_SIZE != selftest)
1302 return NOTIFY_DONE;
1303
1304 int3_emulate_call(regs, (unsigned long)&int3_magic);
1305 return NOTIFY_STOP;
1306}
1307
1308/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
1309static noinline void __init int3_selftest(void)
1310{
1311 static __initdata struct notifier_block int3_exception_nb = {
1312 .notifier_call = int3_exception_notify,
1313 .priority = INT_MAX-1, /* last */
1314 };
1315 unsigned int val = 0;
1316
1317 BUG_ON(register_die_notifier(&int3_exception_nb));
1318
1319 /*
1320 * Basically: int3_magic(&val); but really complicated :-)
1321 *
1322 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
1323 * notifier above will emulate CALL for us.
1324 */
1325 asm volatile ("int3_selftest_ip:\n\t"
1326 ANNOTATE_NOENDBR
1327 " int3; nop; nop; nop; nop\n\t"
1328 : ASM_CALL_CONSTRAINT
1329 : __ASM_SEL_RAW(a, D) (&val)
1330 : "memory");
1331
1332 BUG_ON(val != 1);
1333
1334 unregister_die_notifier(&int3_exception_nb);
1335}
1336
1337void __init alternative_instructions(void)
1338{
1339 int3_selftest();
1340
1341 /*
1342 * The patching is not fully atomic, so try to avoid local
1343 * interruptions that might execute the to-be-patched code.
1344 * Other CPUs are not running.
1345 */
1346 stop_nmi();
1347
1348 /*
1349 * Don't stop machine check exceptions while patching.
1350 * MCEs only happen when something got corrupted and in this
1351 * case we must do something about the corruption.
1352 * Ignoring it is worse than an unlikely patching race.
1353 * Also machine checks tend to be broadcast and if one CPU
1354 * goes into machine check the others follow quickly, so we don't
1355 * expect a machine check to cause undue problems during code
1356 * patching.
1357 */
1358
1359 /*
1360 * Paravirt patching and alternative patching can be combined to
1361 * replace a function call with a short direct code sequence (e.g.
1362 * by setting a constant return value instead of doing that in an
1363 * external function).
1364 * In order to make this work the following sequence is required:
1365 * 1. set (artificial) features depending on used paravirt
1366 * functions which can later influence alternative patching
1367 * 2. apply paravirt patching (generally replacing an indirect
1368 * function call with a direct one)
1369 * 3. apply alternative patching (e.g. replacing a direct function
1370 * call with a custom code sequence)
1371 * Doing paravirt patching after alternative patching would clobber
1372 * the optimization of the custom code with a function call again.
1373 */
1374 paravirt_set_cap();
1375
1376 /*
1377 * First patch paravirt functions, such that we overwrite the indirect
1378 * call with the direct call.
1379 */
1380 apply_paravirt(__parainstructions, __parainstructions_end);
1381
1382 __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
1383 __cfi_sites, __cfi_sites_end, true);
1384
1385 /*
1386 * Rewrite the retpolines, must be done before alternatives since
1387 * those can rewrite the retpoline thunks.
1388 */
1389 apply_retpolines(__retpoline_sites, __retpoline_sites_end);
1390 apply_returns(__return_sites, __return_sites_end);
1391
1392 /*
1393 * Then patch alternatives, such that those paravirt calls that are in
1394 * alternatives can be overwritten by their immediate fragments.
1395 */
1396 apply_alternatives(__alt_instructions, __alt_instructions_end);
1397
1398 /*
1399 * Now all calls are established. Apply the call thunks if
1400 * required.
1401 */
1402 callthunks_patch_builtin_calls();
1403
1404 apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
1405
1406#ifdef CONFIG_SMP
1407	/* Patch to UP if other CPUs are not imminent. */
1408 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
1409 uniproc_patched = true;
1410 alternatives_smp_module_add(NULL, "core kernel",
1411 __smp_locks, __smp_locks_end,
1412 _text, _etext);
1413 }
1414
1415 if (!uniproc_patched || num_possible_cpus() == 1) {
1416 free_init_pages("SMP alternatives",
1417 (unsigned long)__smp_locks,
1418 (unsigned long)__smp_locks_end);
1419 }
1420#endif
1421
1422 restart_nmi();
1423 alternatives_patched = 1;
1424}
1425
1426/**
1427 * text_poke_early - Update instructions on a live kernel at boot time
1428 * @addr: address to modify
1429 * @opcode: source of the copy
1430 * @len: length to copy
1431 *
1432 * When you use this code to patch more than one byte of an instruction,
1433 * you need to make sure that other CPUs cannot execute it in parallel.
1434 * Also, no thread may be preempted in the middle of these instructions.
1435 * And on the local CPU you need to be protected against NMI or MCE
1436 * handlers seeing an inconsistent instruction while you patch.
1437 */
1438void __init_or_module text_poke_early(void *addr, const void *opcode,
1439 size_t len)
1440{
1441 unsigned long flags;
1442
1443 if (boot_cpu_has(X86_FEATURE_NX) &&
1444 is_module_text_address((unsigned long)addr)) {
1445 /*
1446 * Modules text is marked initially as non-executable, so the
1447 * code cannot be running and speculative code-fetches are
1448 * prevented. Just change the code.
1449 */
1450 memcpy(addr, opcode, len);
1451 } else {
1452 local_irq_save(flags);
1453 memcpy(addr, opcode, len);
1454 local_irq_restore(flags);
1455 sync_core();
1456
1457 /*
1458 * Could also do a CLFLUSH here to speed up CPU recovery; but
1459 * that causes hangs on some VIA CPUs.
1460 */
1461 }
1462}
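
/*
 * Illustrative sketch only (not part of the original file): a hypothetical
 * boot-time caller that overwrites a 5-byte instruction at "site" with a
 * 5-byte NOP, using the x86_nops[] table referenced elsewhere in this file.
 *
 *	static void __init example_nop_out(void *site)
 *	{
 *		text_poke_early(site, x86_nops[5], 5);
 *	}
 */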
1463
1464typedef struct {
1465 struct mm_struct *mm;
1466} temp_mm_state_t;
1467
1468/*
1469 * Using a temporary mm allows setting temporary mappings that are not accessible
1470 * by other CPUs. Such mappings are needed to perform sensitive memory writes
1471 * that override the kernel memory protections (e.g., W^X), without exposing the
1472 * temporary page-table mappings that are required for these write operations to
1473 * other CPUs. Using a temporary mm also avoids TLB shootdowns when the
1474 * mapping is torn down.
1475 *
1476 * Context: The temporary mm needs to be used exclusively by a single core. To
1477 *          harden security, IRQs must be disabled while the temporary mm is
1478 *          loaded, thereby preventing interrupt handler bugs from overriding
1479 *          the kernel memory protections.
1480 */
1481static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1482{
1483 temp_mm_state_t temp_state;
1484
1485 lockdep_assert_irqs_disabled();
1486
1487 /*
1488 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1489 * with a stale address space WITHOUT being in lazy mode after
1490 * restoring the previous mm.
1491 */
1492 if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1493 leave_mm(smp_processor_id());
1494
1495 temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1496 switch_mm_irqs_off(NULL, mm, current);
1497
1498 /*
1499 * If breakpoints are enabled, disable them while the temporary mm is
1500 * used. Userspace might set up watchpoints on addresses that are used
1501 * in the temporary mm, which would lead to wrong signals being sent or
1502 * crashes.
1503 *
1504 * Note that breakpoints are not disabled selectively, which also causes
1505 * kernel breakpoints (e.g., perf's) to be disabled. This might be
1506 * undesirable, but still seems reasonable as the code that runs in the
1507 * temporary mm should be short.
1508 */
1509 if (hw_breakpoint_active())
1510 hw_breakpoint_disable();
1511
1512 return temp_state;
1513}
1514
1515static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
1516{
1517 lockdep_assert_irqs_disabled();
1518 switch_mm_irqs_off(NULL, prev_state.mm, current);
1519
1520 /*
1521 * Restore the breakpoints if they were disabled before the temporary mm
1522 * was loaded.
1523 */
1524 if (hw_breakpoint_active())
1525 hw_breakpoint_restore();
1526}
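
/*
 * Illustrative usage pattern of the two helpers above; this is a condensed
 * sketch of what __text_poke() below actually does, not a separate API:
 *
 *	local_irq_save(flags);
 *	... map the target page(s) at poking_addr in poking_mm ...
 *	prev = use_temporary_mm(poking_mm);
 *	... write through the poking_addr alias ...
 *	... clear the temporary PTEs ...
 *	unuse_temporary_mm(prev);
 *	... flush the poking_mm TLB range ...
 *	local_irq_restore(flags);
 */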
1527
1528__ro_after_init struct mm_struct *poking_mm;
1529__ro_after_init unsigned long poking_addr;
1530
1531static void text_poke_memcpy(void *dst, const void *src, size_t len)
1532{
1533 memcpy(dst, src, len);
1534}
1535
1536static void text_poke_memset(void *dst, const void *src, size_t len)
1537{
1538 int c = *(const int *)src;
1539
1540 memset(dst, c, len);
1541}
1542
1543typedef void text_poke_f(void *dst, const void *src, size_t len);
1544
1545static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
1546{
1547 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
1548 struct page *pages[2] = {NULL};
1549 temp_mm_state_t prev;
1550 unsigned long flags;
1551 pte_t pte, *ptep;
1552 spinlock_t *ptl;
1553 pgprot_t pgprot;
1554
1555 /*
1556	 * While the boot memory allocator is running we cannot use struct pages,
1557	 * as they are not yet initialized. There is no way to recover.
1558 */
1559 BUG_ON(!after_bootmem);
1560
1561 if (!core_kernel_text((unsigned long)addr)) {
1562 pages[0] = vmalloc_to_page(addr);
1563 if (cross_page_boundary)
1564 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
1565 } else {
1566 pages[0] = virt_to_page(addr);
1567 WARN_ON(!PageReserved(pages[0]));
1568 if (cross_page_boundary)
1569 pages[1] = virt_to_page(addr + PAGE_SIZE);
1570 }
1571 /*
1572 * If something went wrong, crash and burn since recovery paths are not
1573 * implemented.
1574 */
1575 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
1576
1577 /*
1578 * Map the page without the global bit, as TLB flushing is done with
1579 * flush_tlb_mm_range(), which is intended for non-global PTEs.
1580 */
1581 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
1582
1583 /*
1584	 * The lock is not really needed, but it lets us avoid open-coding the PTE lookup.
1585 */
1586 ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
1587
1588 /*
1589 * This must not fail; preallocated in poking_init().
1590 */
1591 VM_BUG_ON(!ptep);
1592
1593 local_irq_save(flags);
1594
1595 pte = mk_pte(pages[0], pgprot);
1596 set_pte_at(poking_mm, poking_addr, ptep, pte);
1597
1598 if (cross_page_boundary) {
1599 pte = mk_pte(pages[1], pgprot);
1600 set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
1601 }
1602
1603 /*
1604 * Loading the temporary mm behaves as a compiler barrier, which
1605 * guarantees that the PTE will be set at the time memcpy() is done.
1606 */
1607 prev = use_temporary_mm(poking_mm);
1608
1609 kasan_disable_current();
1610 func((u8 *)poking_addr + offset_in_page(addr), src, len);
1611 kasan_enable_current();
1612
1613 /*
1614 * Ensure that the PTE is only cleared after the instructions of memcpy
1615 * were issued by using a compiler barrier.
1616 */
1617 barrier();
1618
1619 pte_clear(poking_mm, poking_addr, ptep);
1620 if (cross_page_boundary)
1621 pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
1622
1623 /*
1624 * Loading the previous page-table hierarchy requires a serializing
1625 * instruction that already allows the core to see the updated version.
1626 * Xen-PV is assumed to serialize execution in a similar manner.
1627 */
1628 unuse_temporary_mm(prev);
1629
1630 /*
1631 * Flushing the TLB might involve IPIs, which would require enabled
1632	 * IRQs, but not when the mm is not in use, as is the case at this point.
1633 */
1634 flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
1635 (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
1636 PAGE_SHIFT, false);
1637
1638 if (func == text_poke_memcpy) {
1639 /*
1640 * If the text does not match what we just wrote then something is
1641 * fundamentally screwy; there's nothing we can really do about that.
1642 */
1643 BUG_ON(memcmp(addr, src, len));
1644 }
1645
1646 local_irq_restore(flags);
1647 pte_unmap_unlock(ptep, ptl);
1648 return addr;
1649}
1650
1651/**
1652 * text_poke - Update instructions on a live kernel
1653 * @addr: address to modify
1654 * @opcode: source of the copy
1655 * @len: length to copy
1656 *
1657 * Only atomic text poke/set should be allowed when not doing early patching.
1658 * This means the size must be writable atomically and the address must be
1659 * aligned in a way that permits an atomic write. It also makes sure the
1660 * patched range fits on a single page.
1661 *
1662 * Note that the caller must ensure that if the modified code is part of a
1663 * module, the module is not removed during poking. This can be achieved
1664 * by registering a module notifier, and ordering module removal and patching
1665 * through a mutex.
1666 */
1667void *text_poke(void *addr, const void *opcode, size_t len)
1668{
1669 lockdep_assert_held(&text_mutex);
1670
1671 return __text_poke(text_poke_memcpy, addr, opcode, len);
1672}
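
/*
 * Illustrative example only ("site", "insn" and "insn_len" are hypothetical):
 * an atomic single-instruction poke done while holding text_mutex, followed
 * by a core sync so every CPU sees the new instruction before it may run it:
 *
 *	mutex_lock(&text_mutex);
 *	text_poke(site, insn, insn_len);
 *	text_poke_sync();
 *	mutex_unlock(&text_mutex);
 */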
1673
1674/**
1675 * text_poke_kgdb - Update instructions on a live kernel by kgdb
1676 * @addr: address to modify
1677 * @opcode: source of the copy
1678 * @len: length to copy
1679 *
1680 * Only atomic text poke/set should be allowed when not doing early patching.
1681 * This means the size must be writable atomically and the address must be
1682 * aligned in a way that permits an atomic write. It also makes sure the
1683 * patched range fits on a single page.
1684 *
1685 * Context: should only be used by kgdb, which ensures no other core is running,
1686 * despite the fact it does not hold the text_mutex.
1687 */
1688void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
1689{
1690 return __text_poke(text_poke_memcpy, addr, opcode, len);
1691}
1692
1693void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
1694 bool core_ok)
1695{
1696 unsigned long start = (unsigned long)addr;
1697 size_t patched = 0;
1698
1699 if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
1700 return NULL;
1701
1702 while (patched < len) {
1703 unsigned long ptr = start + patched;
1704 size_t s;
1705
1706 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1707
1708 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
1709 patched += s;
1710 }
1711 return addr;
1712}
1713
1714/**
1715 * text_poke_copy - Copy instructions into (an unused part of) RX memory
1716 * @addr: address to modify
1717 * @opcode: source of the copy
1718 * @len: length to copy, could be more than 2x PAGE_SIZE
1719 *
1720 * Not safe against concurrent execution; useful for JITs to dump
1721 * new code blocks into unused regions of RX memory. Can be used in
1722 * conjunction with synchronize_rcu_tasks() to wait for existing
1723 * execution to quiesce after having made sure that no existing function
1724 * pointers are live.
1725 */
1726void *text_poke_copy(void *addr, const void *opcode, size_t len)
1727{
1728 mutex_lock(&text_mutex);
1729 addr = text_poke_copy_locked(addr, opcode, len, false);
1730 mutex_unlock(&text_mutex);
1731 return addr;
1732}
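
/*
 * Illustrative example only: a JIT publishing a freshly generated image into
 * an RX region it owns ("rx_buf", "image" and "image_size" are hypothetical):
 *
 *	text_poke_copy(rx_buf, image, image_size);
 *
 * optionally followed by synchronize_rcu_tasks() when, as noted above, old
 * code in that region must first quiesce.
 */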
1733
1734/**
1735 * text_poke_set - memset into (an unused part of) RX memory
1736 * @addr: address to modify
1737 * @c: the byte to fill the area with
1738 * @len: length to copy, could be more than 2x PAGE_SIZE
1739 *
1740 * This is useful to overwrite unused regions of RX memory with illegal
1741 * instructions.
1742 */
1743void *text_poke_set(void *addr, int c, size_t len)
1744{
1745 unsigned long start = (unsigned long)addr;
1746 size_t patched = 0;
1747
1748 if (WARN_ON_ONCE(core_kernel_text(start)))
1749 return NULL;
1750
1751 mutex_lock(&text_mutex);
1752 while (patched < len) {
1753 unsigned long ptr = start + patched;
1754 size_t s;
1755
1756 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1757
1758 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
1759 patched += s;
1760 }
1761 mutex_unlock(&text_mutex);
1762 return addr;
1763}
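
/*
 * Illustrative example only: poisoning a retired RX region with INT3 so that
 * any stale jump into it traps ("rx_buf" and "rx_buf_size" are hypothetical):
 *
 *	text_poke_set(rx_buf, INT3_INSN_OPCODE, rx_buf_size);
 */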
1764
1765static void do_sync_core(void *info)
1766{
1767 sync_core();
1768}
1769
1770void text_poke_sync(void)
1771{
1772 on_each_cpu(do_sync_core, NULL, 1);
1773}
1774
1775struct text_poke_loc {
1776 /* addr := _stext + rel_addr */
1777 s32 rel_addr;
1778 s32 disp;
1779 u8 len;
1780 u8 opcode;
1781 const u8 text[POKE_MAX_OPCODE_SIZE];
1782 /* see text_poke_bp_batch() */
1783 u8 old;
1784};
1785
1786struct bp_patching_desc {
1787 struct text_poke_loc *vec;
1788 int nr_entries;
1789 atomic_t refs;
1790};
1791
1792static struct bp_patching_desc bp_desc;
1793
1794static __always_inline
1795struct bp_patching_desc *try_get_desc(void)
1796{
1797 struct bp_patching_desc *desc = &bp_desc;
1798
1799 if (!arch_atomic_inc_not_zero(&desc->refs))
1800 return NULL;
1801
1802 return desc;
1803}
1804
1805static __always_inline void put_desc(void)
1806{
1807 struct bp_patching_desc *desc = &bp_desc;
1808
1809 smp_mb__before_atomic();
1810 arch_atomic_dec(&desc->refs);
1811}
1812
1813static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
1814{
1815 return _stext + tp->rel_addr;
1816}
1817
1818static __always_inline int patch_cmp(const void *key, const void *elt)
1819{
1820 struct text_poke_loc *tp = (struct text_poke_loc *) elt;
1821
1822 if (key < text_poke_addr(tp))
1823 return -1;
1824 if (key > text_poke_addr(tp))
1825 return 1;
1826 return 0;
1827}
1828
1829noinstr int poke_int3_handler(struct pt_regs *regs)
1830{
1831 struct bp_patching_desc *desc;
1832 struct text_poke_loc *tp;
1833 int ret = 0;
1834 void *ip;
1835
1836 if (user_mode(regs))
1837 return 0;
1838
1839 /*
1840 * Having observed our INT3 instruction, we now must observe
1841 * bp_desc with non-zero refcount:
1842 *
1843 * bp_desc.refs = 1 INT3
1844 * WMB RMB
1845 * write INT3 if (bp_desc.refs != 0)
1846 */
1847 smp_rmb();
1848
1849 desc = try_get_desc();
1850 if (!desc)
1851 return 0;
1852
1853 /*
1854 * Discount the INT3. See text_poke_bp_batch().
1855 */
1856 ip = (void *) regs->ip - INT3_INSN_SIZE;
1857
1858 /*
1859 * Skip the binary search if there is a single member in the vector.
1860 */
1861 if (unlikely(desc->nr_entries > 1)) {
1862 tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
1863 sizeof(struct text_poke_loc),
1864 patch_cmp);
1865 if (!tp)
1866 goto out_put;
1867 } else {
1868 tp = desc->vec;
1869 if (text_poke_addr(tp) != ip)
1870 goto out_put;
1871 }
1872
1873 ip += tp->len;
1874
1875 switch (tp->opcode) {
1876 case INT3_INSN_OPCODE:
1877 /*
1878 * Someone poked an explicit INT3, they'll want to handle it,
1879 * do not consume.
1880 */
1881 goto out_put;
1882
1883 case RET_INSN_OPCODE:
1884 int3_emulate_ret(regs);
1885 break;
1886
1887 case CALL_INSN_OPCODE:
1888 int3_emulate_call(regs, (long)ip + tp->disp);
1889 break;
1890
1891 case JMP32_INSN_OPCODE:
1892 case JMP8_INSN_OPCODE:
1893 int3_emulate_jmp(regs, (long)ip + tp->disp);
1894 break;
1895
1896 default:
1897 BUG();
1898 }
1899
1900 ret = 1;
1901
1902out_put:
1903 put_desc();
1904 return ret;
1905}
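
/*
 * Worked example (illustrative only): suppose a CALL is being installed at a
 * hypothetical address "site", so tp->len == CALL_INSN_SIZE and tp->disp is
 * the new instruction's relative immediate. A CPU that executes the INT3
 * enters here with regs->ip == site + INT3_INSN_SIZE; the local ip is rewound
 * to "site" and then advanced to site + CALL_INSN_SIZE, so
 * int3_emulate_call(regs, ip + tp->disp) resumes execution at the call
 * target with site + CALL_INSN_SIZE as the return address, which is exactly
 * what the finished CALL would have done.
 */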
1906
1907#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
1908static struct text_poke_loc tp_vec[TP_VEC_MAX];
1909static int tp_vec_nr;
1910
1911/**
1912 * text_poke_bp_batch() -- update instructions on live kernel on SMP
1913 * @tp: vector of instructions to patch
1914 * @nr_entries: number of entries in the vector
1915 *
1916 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
1917 * We completely avoid stop_machine() here, and achieve the
1918 * synchronization using the int3 breakpoint.
1919 *
1920 * The way it is done:
1921 * - For each entry in the vector:
1922 * - add an int3 trap to the address that will be patched
1923 * - sync cores
1924 * - For each entry in the vector:
1925 * - update all but the first byte of the patched range
1926 * - sync cores
1927 * - For each entry in the vector:
1928 * - replace the first byte (int3) with the first byte of
1929 * the replacement opcode
1930 * - sync cores
1931 */
1932static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
1933{
1934 unsigned char int3 = INT3_INSN_OPCODE;
1935 unsigned int i;
1936 int do_sync;
1937
1938 lockdep_assert_held(&text_mutex);
1939
1940 bp_desc.vec = tp;
1941 bp_desc.nr_entries = nr_entries;
1942
1943 /*
1944 * Corresponds to the implicit memory barrier in try_get_desc() to
1945 * ensure reading a non-zero refcount provides up to date bp_desc data.
1946 */
1947 atomic_set_release(&bp_desc.refs, 1);
1948
1949 /*
1950	 * Corresponding read barrier in the int3 notifier, making sure the
1951	 * nr_entries and handler are correctly ordered w.r.t. patching.
1952 */
1953 smp_wmb();
1954
1955 /*
1956	 * First step: add an int3 trap to the address that will be patched.
1957 */
1958 for (i = 0; i < nr_entries; i++) {
1959 tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
1960 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
1961 }
1962
1963 text_poke_sync();
1964
1965 /*
1966 * Second step: update all but the first byte of the patched range.
1967 */
1968 for (do_sync = 0, i = 0; i < nr_entries; i++) {
1969 u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
1970 int len = tp[i].len;
1971
1972 if (len - INT3_INSN_SIZE > 0) {
1973 memcpy(old + INT3_INSN_SIZE,
1974 text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
1975 len - INT3_INSN_SIZE);
1976 text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
1977 (const char *)tp[i].text + INT3_INSN_SIZE,
1978 len - INT3_INSN_SIZE);
1979 do_sync++;
1980 }
1981
1982 /*
1983 * Emit a perf event to record the text poke, primarily to
1984 * support Intel PT decoding which must walk the executable code
1985 * to reconstruct the trace. The flow up to here is:
1986 * - write INT3 byte
1987 * - IPI-SYNC
1988 * - write instruction tail
1989 * At this point the actual control flow will be through the
1990 * INT3 and handler and not hit the old or new instruction.
1991 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
1992 * can still be decoded. Subsequently:
1993 * - emit RECORD_TEXT_POKE with the new instruction
1994 * - IPI-SYNC
1995 * - write first byte
1996 * - IPI-SYNC
1997 * So before the text poke event timestamp, the decoder will see
1998 * either the old instruction flow or FUP/TIP of INT3. After the
1999 * text poke event timestamp, the decoder will see either the
2000 * new instruction flow or FUP/TIP of INT3. Thus decoders can
2001 * use the timestamp as the point at which to modify the
2002 * executable code.
2003 * The old instruction is recorded so that the event can be
2004 * processed forwards or backwards.
2005 */
2006 perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
2007 tp[i].text, len);
2008 }
2009
2010 if (do_sync) {
2011 /*
2012 * According to Intel, this core syncing is very likely
2013 * not necessary and we'd be safe even without it. But
2014 * better safe than sorry (plus there's not only Intel).
2015 */
2016 text_poke_sync();
2017 }
2018
2019 /*
2020	 * Third step: replace the first byte (int3) with the first byte of
2021	 * the replacement opcode.
2022 */
2023 for (do_sync = 0, i = 0; i < nr_entries; i++) {
2024 if (tp[i].text[0] == INT3_INSN_OPCODE)
2025 continue;
2026
2027 text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
2028 do_sync++;
2029 }
2030
2031 if (do_sync)
2032 text_poke_sync();
2033
2034 /*
2035 * Remove and wait for refs to be zero.
2036 */
2037 if (!atomic_dec_and_test(&bp_desc.refs))
2038 atomic_cond_read_acquire(&bp_desc.refs, !VAL);
2039}
2040
2041static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
2042 const void *opcode, size_t len, const void *emulate)
2043{
2044 struct insn insn;
2045 int ret, i;
2046
2047 memcpy((void *)tp->text, opcode, len);
2048 if (!emulate)
2049 emulate = opcode;
2050
2051 ret = insn_decode_kernel(&insn, emulate);
2052 BUG_ON(ret < 0);
2053
2054 tp->rel_addr = addr - (void *)_stext;
2055 tp->len = len;
2056 tp->opcode = insn.opcode.bytes[0];
2057
2058 switch (tp->opcode) {
2059 case RET_INSN_OPCODE:
2060 case JMP32_INSN_OPCODE:
2061 case JMP8_INSN_OPCODE:
2062 /*
2063 * Control flow instructions without implied execution of the
2064 * next instruction can be padded with INT3.
2065 */
2066 for (i = insn.length; i < len; i++)
2067 BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
2068 break;
2069
2070 default:
2071 BUG_ON(len != insn.length);
2072 }
2073
2074
2075 switch (tp->opcode) {
2076 case INT3_INSN_OPCODE:
2077 case RET_INSN_OPCODE:
2078 break;
2079
2080 case CALL_INSN_OPCODE:
2081 case JMP32_INSN_OPCODE:
2082 case JMP8_INSN_OPCODE:
2083 tp->disp = insn.immediate.value;
2084 break;
2085
2086 default: /* assume NOP */
2087 switch (len) {
2088 case 2: /* NOP2 -- emulate as JMP8+0 */
2089 BUG_ON(memcmp(emulate, x86_nops[len], len));
2090 tp->opcode = JMP8_INSN_OPCODE;
2091 tp->disp = 0;
2092 break;
2093
2094 case 5: /* NOP5 -- emulate as JMP32+0 */
2095 BUG_ON(memcmp(emulate, x86_nops[len], len));
2096 tp->opcode = JMP32_INSN_OPCODE;
2097 tp->disp = 0;
2098 break;
2099
2100 default: /* unknown instruction */
2101 BUG();
2102 }
2103 break;
2104 }
2105}
2106
2107/*
2108 * We rely heavily on tp_vec being ordered; ensure this is so by flushing
2109 * early if needed.
2110 */
2111static bool tp_order_fail(void *addr)
2112{
2113 struct text_poke_loc *tp;
2114
2115 if (!tp_vec_nr)
2116 return false;
2117
2118 if (!addr) /* force */
2119 return true;
2120
2121 tp = &tp_vec[tp_vec_nr - 1];
2122 if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
2123 return true;
2124
2125 return false;
2126}
2127
2128static void text_poke_flush(void *addr)
2129{
2130 if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
2131 text_poke_bp_batch(tp_vec, tp_vec_nr);
2132 tp_vec_nr = 0;
2133 }
2134}
2135
2136void text_poke_finish(void)
2137{
2138 text_poke_flush(NULL);
2139}
2140
2141void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
2142{
2143 struct text_poke_loc *tp;
2144
2145 text_poke_flush(addr);
2146
2147 tp = &tp_vec[tp_vec_nr++];
2148 text_poke_loc_init(tp, addr, opcode, len, emulate);
2149}
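
/*
 * Illustrative batching pattern (hypothetical "site[]", "insn[]" and "len[]"
 * arrays). Addresses should be queued in ascending order, otherwise the
 * queue is flushed early; see tp_order_fail():
 *
 *	mutex_lock(&text_mutex);
 *	for (i = 0; i < nr; i++)
 *		text_poke_queue(site[i], insn[i], len[i], NULL);
 *	text_poke_finish();
 *	mutex_unlock(&text_mutex);
 */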
2150
2151/**
2152 * text_poke_bp() -- update instructions on live kernel on SMP
2153 * @addr: address to patch
2154 * @opcode: opcode of new instruction
2155 * @len: length to copy
2156 * @emulate: instruction to be emulated
2157 *
2158 * Update a single instruction with the vector on the stack, avoiding
2159 * dynamically allocated memory. This function should be used when it is
2160 * not possible to allocate memory.
2161 */
2162void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
2163{
2164 struct text_poke_loc tp;
2165
2166 text_poke_loc_init(&tp, addr, opcode, len, emulate);
2167 text_poke_bp_batch(&tp, 1);
2168}
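
/*
 * Illustrative example only: live-patching a 5-byte NOP at a hypothetical
 * address "site" into a direct call to "target". The caller must hold
 * text_mutex; the CALL displacement is relative to the end of the CALL
 * instruction itself.
 *
 *	u8 insn[CALL_INSN_SIZE] = { CALL_INSN_OPCODE, };
 *
 *	*(s32 *)(insn + 1) = (long)target - (long)site - CALL_INSN_SIZE;
 *	text_poke_bp(site, insn, CALL_INSN_SIZE, NULL);
 */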