1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/list.h>
5#include <linux/stringify.h>
6#include <linux/kprobes.h>
7#include <linux/mm.h>
8#include <linux/vmalloc.h>
9#include <linux/memory.h>
10#include <linux/stop_machine.h>
11#include <linux/slab.h>
12#include <asm/alternative.h>
13#include <asm/sections.h>
14#include <asm/pgtable.h>
15#include <asm/mce.h>
16#include <asm/nmi.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/io.h>
20#include <asm/fixmap.h>
21
22#define MAX_PATCH_LEN (255-1)
23
24#ifdef CONFIG_HOTPLUG_CPU
25static int smp_alt_once;
26
27static int __init bootonly(char *str)
28{
29 smp_alt_once = 1;
30 return 1;
31}
32__setup("smp-alt-boot", bootonly);
33#else
34#define smp_alt_once 1
35#endif
36
37static int __initdata_or_module debug_alternative;
38
39static int __init debug_alt(char *str)
40{
41 debug_alternative = 1;
42 return 1;
43}
44__setup("debug-alternative", debug_alt);
45
46static int noreplace_smp;
47
48static int __init setup_noreplace_smp(char *str)
49{
50 noreplace_smp = 1;
51 return 1;
52}
53__setup("noreplace-smp", setup_noreplace_smp);
54
55#ifdef CONFIG_PARAVIRT
56static int __initdata_or_module noreplace_paravirt = 0;
57
58static int __init setup_noreplace_paravirt(char *str)
59{
60 noreplace_paravirt = 1;
61 return 1;
62}
63__setup("noreplace-paravirt", setup_noreplace_paravirt);
64#endif
65
66#define DPRINTK(fmt, args...) if (debug_alternative) \
67 printk(KERN_DEBUG fmt, args)
68
69/*
70 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
71 * that correspond to that nop. Getting from one nop to the next, we
72 * add to the array the offset that is equal to the sum of all sizes of
73 * nops preceding the one we are after.
74 *
75 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
76 * nice symmetry of sizes of the previous nops.
77 */
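/*
 * For illustration: with this layout intel_nops[3] points at
 * intelnops + 1 + 2, i.e. the first byte of GENERIC_NOP3, so
 * ideal_nops[len] yields a len-byte NOP for any 1 <= len <= ASM_NOP_MAX,
 * and index ASM_NOP_MAX + 1 holds the 5-byte atomic NOP.
 */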
78#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
79static const unsigned char intelnops[] =
80{
81 GENERIC_NOP1,
82 GENERIC_NOP2,
83 GENERIC_NOP3,
84 GENERIC_NOP4,
85 GENERIC_NOP5,
86 GENERIC_NOP6,
87 GENERIC_NOP7,
88 GENERIC_NOP8,
89 GENERIC_NOP5_ATOMIC
90};
91static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
92{
93 NULL,
94 intelnops,
95 intelnops + 1,
96 intelnops + 1 + 2,
97 intelnops + 1 + 2 + 3,
98 intelnops + 1 + 2 + 3 + 4,
99 intelnops + 1 + 2 + 3 + 4 + 5,
100 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
101 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
102 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
103};
104#endif
105
106#ifdef K8_NOP1
107static const unsigned char k8nops[] =
108{
109 K8_NOP1,
110 K8_NOP2,
111 K8_NOP3,
112 K8_NOP4,
113 K8_NOP5,
114 K8_NOP6,
115 K8_NOP7,
116 K8_NOP8,
117 K8_NOP5_ATOMIC
118};
119static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
120{
121 NULL,
122 k8nops,
123 k8nops + 1,
124 k8nops + 1 + 2,
125 k8nops + 1 + 2 + 3,
126 k8nops + 1 + 2 + 3 + 4,
127 k8nops + 1 + 2 + 3 + 4 + 5,
128 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
129 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
130 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
131};
132#endif
133
134#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
135static const unsigned char k7nops[] =
136{
137 K7_NOP1,
138 K7_NOP2,
139 K7_NOP3,
140 K7_NOP4,
141 K7_NOP5,
142 K7_NOP6,
143 K7_NOP7,
144 K7_NOP8,
145 K7_NOP5_ATOMIC
146};
147static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
148{
149 NULL,
150 k7nops,
151 k7nops + 1,
152 k7nops + 1 + 2,
153 k7nops + 1 + 2 + 3,
154 k7nops + 1 + 2 + 3 + 4,
155 k7nops + 1 + 2 + 3 + 4 + 5,
156 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
157 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
158 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
159};
160#endif
161
162#ifdef P6_NOP1
163static const unsigned char p6nops[] =
164{
165 P6_NOP1,
166 P6_NOP2,
167 P6_NOP3,
168 P6_NOP4,
169 P6_NOP5,
170 P6_NOP6,
171 P6_NOP7,
172 P6_NOP8,
173 P6_NOP5_ATOMIC
174};
175static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
176{
177 NULL,
178 p6nops,
179 p6nops + 1,
180 p6nops + 1 + 2,
181 p6nops + 1 + 2 + 3,
182 p6nops + 1 + 2 + 3 + 4,
183 p6nops + 1 + 2 + 3 + 4 + 5,
184 p6nops + 1 + 2 + 3 + 4 + 5 + 6,
185 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
186 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
187};
188#endif
189
190/* Initialize these to a safe default */
191#ifdef CONFIG_X86_64
192const unsigned char * const *ideal_nops = p6_nops;
193#else
194const unsigned char * const *ideal_nops = intel_nops;
195#endif
196
197void __init arch_init_ideal_nops(void)
198{
199 switch (boot_cpu_data.x86_vendor) {
200 case X86_VENDOR_INTEL:
201 /*
202 * Due to a decoder implementation quirk, some
203 * specific Intel CPUs actually perform better with
204 * the "k8_nops" than with the SDM-recommended NOPs.
205 */
206 if (boot_cpu_data.x86 == 6 &&
207 boot_cpu_data.x86_model >= 0x0f &&
208 boot_cpu_data.x86_model != 0x1c &&
209 boot_cpu_data.x86_model != 0x26 &&
210 boot_cpu_data.x86_model != 0x27 &&
211 boot_cpu_data.x86_model < 0x30) {
212 ideal_nops = k8_nops;
213 } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
214 ideal_nops = p6_nops;
215 } else {
216#ifdef CONFIG_X86_64
217 ideal_nops = k8_nops;
218#else
219 ideal_nops = intel_nops;
220#endif
221 }
222 break;
223 default:
224#ifdef CONFIG_X86_64
225 ideal_nops = k8_nops;
226#else
227 if (boot_cpu_has(X86_FEATURE_K8))
228 ideal_nops = k8_nops;
229 else if (boot_cpu_has(X86_FEATURE_K7))
230 ideal_nops = k7_nops;
231 else
232 ideal_nops = intel_nops;
233#endif
234 }
235}
236
237/* Use this to add nops to a buffer, then text_poke the whole buffer. */
238static void __init_or_module add_nops(void *insns, unsigned int len)
239{
240 while (len > 0) {
241 unsigned int noplen = len;
242 if (noplen > ASM_NOP_MAX)
243 noplen = ASM_NOP_MAX;
244 memcpy(insns, ideal_nops[noplen], noplen);
245 insns += noplen;
246 len -= noplen;
247 }
248}
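/*
 * Illustrative (hypothetical) usage, mirroring how apply_alternatives()
 * pads a patch buffer before poking it in; "replacement", "repl_len" and
 * "target" are stand-in names:
 *
 *	u8 buf[16];
 *
 *	memcpy(buf, replacement, repl_len);
 *	add_nops(buf + repl_len, 16 - repl_len);
 *	text_poke_early(target, buf, 16);
 */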
249
250extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
251extern s32 __smp_locks[], __smp_locks_end[];
252void *text_poke_early(void *addr, const void *opcode, size_t len);
253
254/* Replace instructions with better alternatives for this CPU type.
255 This runs before SMP is initialized to avoid SMP problems with
256 self modifying code. This implies that asymmetric systems where
257 APs have fewer capabilities than the boot processor are not handled.
258 Tough. Make sure you disable such features by hand. */
259
260void __init_or_module apply_alternatives(struct alt_instr *start,
261 struct alt_instr *end)
262{
263 struct alt_instr *a;
264 u8 *instr, *replacement;
265 u8 insnbuf[MAX_PATCH_LEN];
266
267 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
268 /*
269 * The scan order should be from start to end. A later scanned
270 * alternative code can overwrite previously scanned alternative code.
271 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
272 * patch code.
273 *
274 * So be careful if you want to change the scan order to any other
275 * order.
276 */
277 for (a = start; a < end; a++) {
278 instr = (u8 *)&a->instr_offset + a->instr_offset;
279 replacement = (u8 *)&a->repl_offset + a->repl_offset;
280 BUG_ON(a->replacementlen > a->instrlen);
281 BUG_ON(a->instrlen > sizeof(insnbuf));
282 BUG_ON(a->cpuid >= NCAPINTS*32);
283 if (!boot_cpu_has(a->cpuid))
284 continue;
285
286 memcpy(insnbuf, replacement, a->replacementlen);
287
288 /* 0xe8 is a relative call; fix the offset. */
289 if (*insnbuf == 0xe8 && a->replacementlen == 5)
290 *(s32 *)(insnbuf + 1) += replacement - instr;
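		/*
		 * Worked example with hypothetical addresses: a CALL encoded
		 * at R in .altinstr_replacement stores T - (R + 5) as its
		 * immediate. After copying to the original site I, the
		 * immediate must be T - (I + 5), i.e. the old value plus
		 * (R - I) -- exactly the "replacement - instr" added above.
		 */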
291
292 add_nops(insnbuf + a->replacementlen,
293 a->instrlen - a->replacementlen);
294
295 text_poke_early(instr, insnbuf, a->instrlen);
296 }
297}
298
299#ifdef CONFIG_SMP
300
301static void alternatives_smp_lock(const s32 *start, const s32 *end,
302 u8 *text, u8 *text_end)
303{
304 const s32 *poff;
305
306 mutex_lock(&text_mutex);
307 for (poff = start; poff < end; poff++) {
308 u8 *ptr = (u8 *)poff + *poff;
309
310 if (!*poff || ptr < text || ptr >= text_end)
311 continue;
312 /* turn DS segment override prefix into lock prefix */
313 if (*ptr == 0x3e)
314 text_poke(ptr, ((unsigned char []){0xf0}), 1);
315 };
316 mutex_unlock(&text_mutex);
317}
318
319static void alternatives_smp_unlock(const s32 *start, const s32 *end,
320 u8 *text, u8 *text_end)
321{
322 const s32 *poff;
323
324 if (noreplace_smp)
325 return;
326
327 mutex_lock(&text_mutex);
328 for (poff = start; poff < end; poff++) {
329 u8 *ptr = (u8 *)poff + *poff;
330
331 if (!*poff || ptr < text || ptr >= text_end)
332 continue;
333 /* turn lock prefix into DS segment override prefix */
334 if (*ptr == 0xf0)
335 text_poke(ptr, ((unsigned char []){0x3E}), 1);
336 };
337 mutex_unlock(&text_mutex);
338}
339
340struct smp_alt_module {
341 /* what is this ??? */
342 struct module *mod;
343 char *name;
344
345 /* ptrs to lock prefixes */
346 const s32 *locks;
347 const s32 *locks_end;
348
349 /* .text segment, needed to avoid patching init code ;) */
350 u8 *text;
351 u8 *text_end;
352
353 struct list_head next;
354};
355static LIST_HEAD(smp_alt_modules);
356static DEFINE_MUTEX(smp_alt);
357static int smp_mode = 1; /* protected by smp_alt */
358
359void __init_or_module alternatives_smp_module_add(struct module *mod,
360 char *name,
361 void *locks, void *locks_end,
362 void *text, void *text_end)
363{
364 struct smp_alt_module *smp;
365
366 if (noreplace_smp)
367 return;
368
369 if (smp_alt_once) {
370 if (boot_cpu_has(X86_FEATURE_UP))
371 alternatives_smp_unlock(locks, locks_end,
372 text, text_end);
373 return;
374 }
375
376 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
377 if (NULL == smp)
378 return; /* we'll run the (safe but slow) SMP code then ... */
379
380 smp->mod = mod;
381 smp->name = name;
382 smp->locks = locks;
383 smp->locks_end = locks_end;
384 smp->text = text;
385 smp->text_end = text_end;
386 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
387 __func__, smp->locks, smp->locks_end,
388 smp->text, smp->text_end, smp->name);
389
390 mutex_lock(&smp_alt);
391 list_add_tail(&smp->next, &smp_alt_modules);
392 if (boot_cpu_has(X86_FEATURE_UP))
393 alternatives_smp_unlock(smp->locks, smp->locks_end,
394 smp->text, smp->text_end);
395 mutex_unlock(&smp_alt);
396}
397
398void __init_or_module alternatives_smp_module_del(struct module *mod)
399{
400 struct smp_alt_module *item;
401
402 if (smp_alt_once || noreplace_smp)
403 return;
404
405 mutex_lock(&smp_alt);
406 list_for_each_entry(item, &smp_alt_modules, next) {
407 if (mod != item->mod)
408 continue;
409 list_del(&item->next);
410 mutex_unlock(&smp_alt);
411 DPRINTK("%s: %s\n", __func__, item->name);
412 kfree(item);
413 return;
414 }
415 mutex_unlock(&smp_alt);
416}
417
418bool skip_smp_alternatives;
419void alternatives_smp_switch(int smp)
420{
421 struct smp_alt_module *mod;
422
423#ifdef CONFIG_LOCKDEP
424 /*
425 * Older binutils section handling bug prevented
426 * alternatives-replacement from working reliably.
427 *
428 * If this still occurs then you should see a hang
429 * or crash shortly after this line:
430 */
431 printk("lockdep: fixing up alternatives.\n");
432#endif
433
434 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
435 return;
436 BUG_ON(!smp && (num_online_cpus() > 1));
437
438 mutex_lock(&smp_alt);
439
440 /*
441 * Avoid unnecessary switches because it forces JIT based VMs to
442 * throw away all cached translations, which can be quite costly.
443 */
444 if (smp == smp_mode) {
445 /* nothing */
446 } else if (smp) {
447 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
448 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
449 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
450 list_for_each_entry(mod, &smp_alt_modules, next)
451 alternatives_smp_lock(mod->locks, mod->locks_end,
452 mod->text, mod->text_end);
453 } else {
454 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
455 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
456 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
457 list_for_each_entry(mod, &smp_alt_modules, next)
458 alternatives_smp_unlock(mod->locks, mod->locks_end,
459 mod->text, mod->text_end);
460 }
461 smp_mode = smp;
462 mutex_unlock(&smp_alt);
463}
464
465/* Return 1 if the address range is reserved for smp-alternatives */
466int alternatives_text_reserved(void *start, void *end)
467{
468 struct smp_alt_module *mod;
469 const s32 *poff;
470 u8 *text_start = start;
471 u8 *text_end = end;
472
473 list_for_each_entry(mod, &smp_alt_modules, next) {
474 if (mod->text > text_end || mod->text_end < text_start)
475 continue;
476 for (poff = mod->locks; poff < mod->locks_end; poff++) {
477 const u8 *ptr = (const u8 *)poff + *poff;
478
479 if (text_start <= ptr && text_end > ptr)
480 return 1;
481 }
482 }
483
484 return 0;
485}
486#endif
487
488#ifdef CONFIG_PARAVIRT
489void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
490 struct paravirt_patch_site *end)
491{
492 struct paravirt_patch_site *p;
493 char insnbuf[MAX_PATCH_LEN];
494
495 if (noreplace_paravirt)
496 return;
497
498 for (p = start; p < end; p++) {
499 unsigned int used;
500
501 BUG_ON(p->len > MAX_PATCH_LEN);
502 /* prep the buffer with the original instructions */
503 memcpy(insnbuf, p->instr, p->len);
504 used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
505 (unsigned long)p->instr, p->len);
506
507 BUG_ON(used > p->len);
508
509 /* Pad the rest with nops */
510 add_nops(insnbuf + used, p->len - used);
511 text_poke_early(p->instr, insnbuf, p->len);
512 }
513}
514extern struct paravirt_patch_site __start_parainstructions[],
515 __stop_parainstructions[];
516#endif /* CONFIG_PARAVIRT */
517
518void __init alternative_instructions(void)
519{
520 /* The patching is not fully atomic, so try to avoid local interrupts
521 that might execute the code being patched.
522 Other CPUs are not running. */
523 stop_nmi();
524
525 /*
526 * Don't stop machine check exceptions while patching.
527 * MCEs only happen when something got corrupted and in this
528 * case we must do something about the corruption.
529 * Ignoring it is worse than an unlikely patching race.
530 * Also machine checks tend to be broadcast and if one CPU
531 * goes into machine check the others follow quickly, so we don't
532 * expect a machine check to cause undue problems during code
533 * patching.
534 */
535
536 apply_alternatives(__alt_instructions, __alt_instructions_end);
537
538 /* switch to patch-once-at-boottime-only mode and free the
539 * tables in case we know the number of CPUs will never ever
540 * change */
541#ifdef CONFIG_HOTPLUG_CPU
542 if (num_possible_cpus() < 2)
543 smp_alt_once = 1;
544#endif
545
546#ifdef CONFIG_SMP
547 if (smp_alt_once) {
548 if (1 == num_possible_cpus()) {
549 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
550 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
551 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
552
553 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
554 _text, _etext);
555 }
556 } else {
557 alternatives_smp_module_add(NULL, "core kernel",
558 __smp_locks, __smp_locks_end,
559 _text, _etext);
560
561 /* Only switch to UP mode if we don't immediately boot others */
562 if (num_present_cpus() == 1 || setup_max_cpus <= 1)
563 alternatives_smp_switch(0);
564 }
565#endif
566 apply_paravirt(__parainstructions, __parainstructions_end);
567
568 if (smp_alt_once)
569 free_init_pages("SMP alternatives",
570 (unsigned long)__smp_locks,
571 (unsigned long)__smp_locks_end);
572
573 restart_nmi();
574}
575
576/**
577 * text_poke_early - Update instructions on a live kernel at boot time
578 * @addr: address to modify
579 * @opcode: source of the copy
580 * @len: length to copy
581 *
582 * When you use this code to patch more than one byte of an instruction
583 * you need to make sure that other CPUs cannot execute this code in parallel.
584 * Also no thread must be currently preempted in the middle of these
585 * instructions. And on the local CPU you need to be protected against NMI or MCE
586 * handlers seeing an inconsistent instruction while you patch.
587 */
588void *__init_or_module text_poke_early(void *addr, const void *opcode,
589 size_t len)
590{
591 unsigned long flags;
592 local_irq_save(flags);
593 memcpy(addr, opcode, len);
594 sync_core();
595 local_irq_restore(flags);
596 /* Could also do a CLFLUSH here to speed up CPU recovery; but
597 that causes hangs on some VIA CPUs. */
598 return addr;
599}
600
601/**
602 * text_poke - Update instructions on a live kernel
603 * @addr: address to modify
604 * @opcode: source of the copy
605 * @len: length to copy
606 *
607 * Only atomic text poke/set should be allowed when not doing early patching.
608 * It means the size must be writable atomically and the address must be aligned
609 * in a way that permits an atomic write. It also makes sure we fit on a single
610 * page.
611 *
612 * Note: Must be called under text_mutex.
613 */
614void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
615{
616 unsigned long flags;
617 char *vaddr;
618 struct page *pages[2];
619 int i;
620
621 if (!core_kernel_text((unsigned long)addr)) {
622 pages[0] = vmalloc_to_page(addr);
623 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
624 } else {
625 pages[0] = virt_to_page(addr);
626 WARN_ON(!PageReserved(pages[0]));
627 pages[1] = virt_to_page(addr + PAGE_SIZE);
628 }
629 BUG_ON(!pages[0]);
630 local_irq_save(flags);
631 set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
632 if (pages[1])
633 set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
634 vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
635 memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
636 clear_fixmap(FIX_TEXT_POKE0);
637 if (pages[1])
638 clear_fixmap(FIX_TEXT_POKE1);
639 local_flush_tlb();
640 sync_core();
641 /* Could also do a CLFLUSH here to speed up CPU recovery; but
642 that causes hangs on some VIA CPUs. */
643 for (i = 0; i < len; i++)
644 BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
645 local_irq_restore(flags);
646 return addr;
647}
648
649/*
650 * Cross-modifying kernel text with stop_machine().
651 * This code originally comes from immediate value.
652 */
653static atomic_t stop_machine_first;
654static int wrote_text;
655
656struct text_poke_params {
657 struct text_poke_param *params;
658 int nparams;
659};
660
661static int __kprobes stop_machine_text_poke(void *data)
662{
663 struct text_poke_params *tpp = data;
664 struct text_poke_param *p;
665 int i;
666
667 if (atomic_dec_and_test(&stop_machine_first)) {
668 for (i = 0; i < tpp->nparams; i++) {
669 p = &tpp->params[i];
670 text_poke(p->addr, p->opcode, p->len);
671 }
672 smp_wmb(); /* Make sure other cpus see that this has run */
673 wrote_text = 1;
674 } else {
675 while (!wrote_text)
676 cpu_relax();
677 smp_mb(); /* Load wrote_text before following execution */
678 }
679
680 for (i = 0; i < tpp->nparams; i++) {
681 p = &tpp->params[i];
682 flush_icache_range((unsigned long)p->addr,
683 (unsigned long)p->addr + p->len);
684 }
685 /*
686 * Intel Architecture Software Developer's Manual section 7.1.3 specifies
687 * that a core serializing instruction such as "cpuid" should be
688 * executed on _each_ core before the new instruction is made visible.
689 */
690 sync_core();
691 return 0;
692}
693
694/**
695 * text_poke_smp - Update instructions on a live kernel on SMP
696 * @addr: address to modify
697 * @opcode: source of the copy
698 * @len: length to copy
699 *
700 * Modify a multi-byte instruction by using stop_machine() on SMP. This allows
701 * a user to poke/set multi-byte text on SMP. Only modification of non-NMI/MCE
702 * code should be allowed, since stop_machine() does _not_ protect code against
703 * NMI and MCE.
704 *
705 * Note: Must be called under get_online_cpus() and text_mutex.
706 */
707void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
708{
709 struct text_poke_params tpp;
710 struct text_poke_param p;
711
712 p.addr = addr;
713 p.opcode = opcode;
714 p.len = len;
715 tpp.params = &p;
716 tpp.nparams = 1;
717 atomic_set(&stop_machine_first, 1);
718 wrote_text = 0;
719 /* Use __stop_machine() because the caller already got online_cpus. */
720 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
721 return addr;
722}
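/*
 * Sketch of a typical caller (hypothetical, e.g. an optimized-kprobes style
 * user swapping a 5-byte instruction), holding the locks required above:
 *
 *	get_online_cpus();
 *	mutex_lock(&text_mutex);
 *	text_poke_smp(addr, new_insn, 5);
 *	mutex_unlock(&text_mutex);
 *	put_online_cpus();
 */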
723
724/**
725 * text_poke_smp_batch - Update instructions on a live kernel on SMP
726 * @params: an array of text_poke parameters
727 * @n: the number of elements in params.
728 *
729 * Modify multi-byte instructions by using stop_machine() on SMP. Since
730 * stop_machine() is a heavy task, it is better to aggregate text_poke requests
731 * and do them all in a single stop_machine() call if possible.
732 *
733 * Note: Must be called under get_online_cpus() and text_mutex.
734 */
735void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
736{
737 struct text_poke_params tpp = {.params = params, .nparams = n};
738
739 atomic_set(&stop_machine_first, 1);
740 wrote_text = 0;
741 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
742}
1// SPDX-License-Identifier: GPL-2.0-only
2#define pr_fmt(fmt) "SMP alternatives: " fmt
3
4#include <linux/module.h>
5#include <linux/sched.h>
6#include <linux/perf_event.h>
7#include <linux/mutex.h>
8#include <linux/list.h>
9#include <linux/stringify.h>
10#include <linux/highmem.h>
11#include <linux/mm.h>
12#include <linux/vmalloc.h>
13#include <linux/memory.h>
14#include <linux/stop_machine.h>
15#include <linux/slab.h>
16#include <linux/kdebug.h>
17#include <linux/kprobes.h>
18#include <linux/mmu_context.h>
19#include <linux/bsearch.h>
20#include <linux/sync_core.h>
21#include <asm/text-patching.h>
22#include <asm/alternative.h>
23#include <asm/sections.h>
24#include <asm/mce.h>
25#include <asm/nmi.h>
26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h>
28#include <asm/insn.h>
29#include <asm/io.h>
30#include <asm/fixmap.h>
31#include <asm/paravirt.h>
32#include <asm/asm-prototypes.h>
33#include <asm/cfi.h>
34
35int __read_mostly alternatives_patched;
36
37EXPORT_SYMBOL_GPL(alternatives_patched);
38
39#define MAX_PATCH_LEN (255-1)
40
41#define DA_ALL (~0)
42#define DA_ALT 0x01
43#define DA_RET 0x02
44#define DA_RETPOLINE 0x04
45#define DA_ENDBR 0x08
46#define DA_SMP 0x10
47
48static unsigned int debug_alternative;
49
50static int __init debug_alt(char *str)
51{
52 if (str && *str == '=')
53 str++;
54
55 if (!str || kstrtouint(str, 0, &debug_alternative))
56 debug_alternative = DA_ALL;
57
58 return 1;
59}
60__setup("debug-alternative", debug_alt);
61
62static int noreplace_smp;
63
64static int __init setup_noreplace_smp(char *str)
65{
66 noreplace_smp = 1;
67 return 1;
68}
69__setup("noreplace-smp", setup_noreplace_smp);
70
71#define DPRINTK(type, fmt, args...) \
72do { \
73 if (debug_alternative & DA_##type) \
74 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
75} while (0)
76
77#define DUMP_BYTES(type, buf, len, fmt, args...) \
78do { \
79 if (unlikely(debug_alternative & DA_##type)) { \
80 int j; \
81 \
82 if (!(len)) \
83 break; \
84 \
85 printk(KERN_DEBUG pr_fmt(fmt), ##args); \
86 for (j = 0; j < (len) - 1; j++) \
87 printk(KERN_CONT "%02hhx ", buf[j]); \
88 printk(KERN_CONT "%02hhx\n", buf[j]); \
89 } \
90} while (0)
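/*
 * Illustrative usage: booting with "debug-alternative=0x01" sets DA_ALT, so
 *
 *	DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 *
 * dumps the patched bytes, while e.g. DPRINTK(SMP, ...) stays silent.
 */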
91
92static const unsigned char x86nops[] =
93{
94 BYTES_NOP1,
95 BYTES_NOP2,
96 BYTES_NOP3,
97 BYTES_NOP4,
98 BYTES_NOP5,
99 BYTES_NOP6,
100 BYTES_NOP7,
101 BYTES_NOP8,
102#ifdef CONFIG_64BIT
103 BYTES_NOP9,
104 BYTES_NOP10,
105 BYTES_NOP11,
106#endif
107};
108
109const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
110{
111 NULL,
112 x86nops,
113 x86nops + 1,
114 x86nops + 1 + 2,
115 x86nops + 1 + 2 + 3,
116 x86nops + 1 + 2 + 3 + 4,
117 x86nops + 1 + 2 + 3 + 4 + 5,
118 x86nops + 1 + 2 + 3 + 4 + 5 + 6,
119 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
120#ifdef CONFIG_64BIT
121 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
122 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
123 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
124#endif
125};
126
127/*
128 * Fill the buffer with a single effective instruction of size @len.
129 *
130 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
131 * for every single-byte NOP, try to generate the maximally available NOP of
132 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
133 * each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
134 * *jump* over instead of executing long and daft NOPs.
135 */
136static void add_nop(u8 *instr, unsigned int len)
137{
138 u8 *target = instr + len;
139
140 if (!len)
141 return;
142
143 if (len <= ASM_NOP_MAX) {
144 memcpy(instr, x86_nops[len], len);
145 return;
146 }
147
148 if (len < 128) {
149 __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
150 instr += JMP8_INSN_SIZE;
151 } else {
152 __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
153 instr += JMP32_INSN_SIZE;
154 }
155
156 for (;instr < target; instr++)
157 *instr = INT3_INSN_OPCODE;
158}
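/*
 * Illustrative behaviour: add_nop(buf, 5) emits BYTES_NOP5, whereas
 * add_nop(buf, 40) cannot use one NOP (40 > ASM_NOP_MAX) and instead emits
 * a 2-byte JMP8 to buf + 40 followed by 38 INT3 bytes that are jumped over.
 */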
159
160extern s32 __retpoline_sites[], __retpoline_sites_end[];
161extern s32 __return_sites[], __return_sites_end[];
162extern s32 __cfi_sites[], __cfi_sites_end[];
163extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
164extern s32 __smp_locks[], __smp_locks_end[];
165void text_poke_early(void *addr, const void *opcode, size_t len);
166
167/*
168 * Matches NOP and NOPL, not any of the other possible NOPs.
169 */
170static bool insn_is_nop(struct insn *insn)
171{
172 /* Anything NOP, but no REP NOP */
173 if (insn->opcode.bytes[0] == 0x90 &&
174 (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
175 return true;
176
177 /* NOPL */
178 if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
179 return true;
180
181 /* TODO: more nops */
182
183 return false;
184}
185
186/*
187 * Find the offset of the first non-NOP instruction starting at @offset
188 * but no further than @len.
189 */
190static int skip_nops(u8 *instr, int offset, int len)
191{
192 struct insn insn;
193
194 for (; offset < len; offset += insn.length) {
195 if (insn_decode_kernel(&insn, &instr[offset]))
196 break;
197
198 if (!insn_is_nop(&insn))
199 break;
200 }
201
202 return offset;
203}
204
205/*
206 * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
207 * to the end of the NOP sequence into a single NOP.
208 */
209static bool
210__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
211{
212 int i = *next - insn->length;
213
214 switch (insn->opcode.bytes[0]) {
215 case JMP8_INSN_OPCODE:
216 case JMP32_INSN_OPCODE:
217 *prev = i;
218 *target = *next + insn->immediate.value;
219 return false;
220 }
221
222 if (insn_is_nop(insn)) {
223 int nop = i;
224
225 *next = skip_nops(instr, *next, len);
226 if (*target && *next == *target)
227 nop = *prev;
228
229 add_nop(instr + nop, *next - nop);
230 DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
231 return true;
232 }
233
234 *target = 0;
235 return false;
236}
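/*
 * Illustrative case: a 5-byte region containing "jmp +3; nop; nop; nop"
 * jumps to the end of its own NOP run, so the pass above collapses the
 * whole region into a single BYTES_NOP5 via add_nop().
 */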
237
238/*
239 * "noinline" to cause control flow change and thus invalidate I$ and
240 * cause refetch after modification.
241 */
242static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
243{
244 int prev, target = 0;
245
246 for (int next, i = 0; i < len; i = next) {
247 struct insn insn;
248
249 if (insn_decode_kernel(&insn, &instr[i]))
250 return;
251
252 next = i + insn.length;
253
254 __optimize_nops(instr, len, &insn, &next, &prev, &target);
255 }
256}
257
258static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len)
259{
260 unsigned long flags;
261
262 local_irq_save(flags);
263 optimize_nops(instr, len);
264 sync_core();
265 local_irq_restore(flags);
266}
267
268/*
269 * In this context, "source" is where the instructions are placed in the
270 * section .altinstr_replacement, for example during kernel build by the
271 * toolchain.
272 * "Destination" is where the instructions are being patched in by this
273 * machinery.
274 *
275 * The source offset is:
276 *
277 * src_imm = target - src_next_ip (1)
278 *
279 * and the target offset is:
280 *
281 * dst_imm = target - dst_next_ip (2)
282 *
283 * so rework (1) as an expression for target like:
284 *
285 * target = src_imm + src_next_ip (1a)
286 *
287 * and substitute in (2) to get:
288 *
289 * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
290 *
291 * Now, since the instruction stream is 'identical' at src and dst (it
292 * is being copied after all) it can be stated that:
293 *
294 * src_next_ip = src + ip_offset
295 * dst_next_ip = dst + ip_offset (4)
296 *
297 * Substitute (4) in (3) and observe ip_offset being cancelled out to
298 * obtain:
299 *
300 * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
301 * = src_imm + src - dst + ip_offset - ip_offset
302 * = src_imm + src - dst (5)
303 *
304 * IOW, only the relative displacement of the code block matters.
305 */
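/*
 * Worked example with made-up numbers: a CALL at src = 0x1000 with
 * src_imm = 0x100 targets 0x1000 + 5 + 0x100 = 0x1105. Copied to
 * dst = 0x3000, formula (5) gives dst_imm = 0x100 + 0x1000 - 0x3000 =
 * -0x1f00, and 0x3000 + 5 + (-0x1f00) is again 0x1105.
 */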
306
307#define apply_reloc_n(n_, p_, d_) \
308 do { \
309 s32 v = *(s##n_ *)(p_); \
310 v += (d_); \
311 BUG_ON((v >> 31) != (v >> (n_-1))); \
312 *(s##n_ *)(p_) = (s##n_)v; \
313 } while (0)
314
315
316static __always_inline
317void apply_reloc(int n, void *ptr, uintptr_t diff)
318{
319 switch (n) {
320 case 1: apply_reloc_n(8, ptr, diff); break;
321 case 2: apply_reloc_n(16, ptr, diff); break;
322 case 4: apply_reloc_n(32, ptr, diff); break;
323 default: BUG();
324 }
325}
326
327static __always_inline
328bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
329{
330 u8 *target = src + offset;
331 /*
332 * If the target is inside the patched block, it's relative to the
333 * block itself and does not need relocation.
334 */
335 return (target < src || target > src + src_len);
336}
337
338void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
339{
340 int prev, target = 0;
341
342 for (int next, i = 0; i < len; i = next) {
343 struct insn insn;
344
345 if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
346 return;
347
348 next = i + insn.length;
349
350 if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
351 continue;
352
353 switch (insn.opcode.bytes[0]) {
354 case 0x0f:
355 if (insn.opcode.bytes[1] < 0x80 ||
356 insn.opcode.bytes[1] > 0x8f)
357 break;
358
359 fallthrough; /* Jcc.d32 */
360 case 0x70 ... 0x7f: /* Jcc.d8 */
361 case JMP8_INSN_OPCODE:
362 case JMP32_INSN_OPCODE:
363 case CALL_INSN_OPCODE:
364 if (need_reloc(next + insn.immediate.value, src, src_len)) {
365 apply_reloc(insn.immediate.nbytes,
366 buf + i + insn_offset_immediate(&insn),
367 src - dest);
368 }
369
370 /*
371 * Where possible, convert JMP.d32 into JMP.d8.
372 */
373 if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
374 s32 imm = insn.immediate.value;
375 imm += src - dest;
376 imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
377 if ((imm >> 31) == (imm >> 7)) {
378 buf[i+0] = JMP8_INSN_OPCODE;
379 buf[i+1] = (s8)imm;
380
381 memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
382 }
383 }
384 break;
385 }
386
387 if (insn_rip_relative(&insn)) {
388 if (need_reloc(next + insn.displacement.value, src, src_len)) {
389 apply_reloc(insn.displacement.nbytes,
390 buf + i + insn_offset_displacement(&insn),
391 src - dest);
392 }
393 }
394 }
395}
396
397/* Low-level backend functions usable from alternative code replacements. */
398DEFINE_ASM_FUNC(nop_func, "", .entry.text);
399EXPORT_SYMBOL_GPL(nop_func);
400
401noinstr void BUG_func(void)
402{
403 BUG();
404}
405EXPORT_SYMBOL(BUG_func);
406
407#define CALL_RIP_REL_OPCODE 0xff
408#define CALL_RIP_REL_MODRM 0x15
409
410/*
411 * Rewrite the "call BUG_func" replacement to point to the target of the
412 * indirect pv_ops call "call *disp(%ip)".
413 */
414static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
415{
416 void *target, *bug = &BUG_func;
417 s32 disp;
418
419 if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
420 pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
421 BUG();
422 }
423
424 if (a->instrlen != 6 ||
425 instr[0] != CALL_RIP_REL_OPCODE ||
426 instr[1] != CALL_RIP_REL_MODRM) {
427 pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
428 BUG();
429 }
430
431 /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
432 disp = *(s32 *)(instr + 2);
433#ifdef CONFIG_X86_64
434 /* ff 15 00 00 00 00 call *0x0(%rip) */
435 /* target address is stored at "next instruction + disp". */
436 target = *(void **)(instr + a->instrlen + disp);
437#else
438 /* ff 15 00 00 00 00 call *0x0 */
439 /* target address is stored at disp. */
440 target = *(void **)disp;
441#endif
442 if (!target)
443 target = bug;
444
445 /* (BUG_func - .) + (target - BUG_func) := target - . */
446 *(s32 *)(insn_buff + 1) += target - bug;
447
448 if (target == &nop_func)
449 return 0;
450
451 return 5;
452}
453
454/*
455 * Replace instructions with better alternatives for this CPU type. This runs
456 * before SMP is initialized to avoid SMP problems with self modifying code.
457 * This implies that asymmetric systems where APs have fewer capabilities than
458 * the boot processor are not handled. Tough. Make sure you disable such
459 * features by hand.
460 *
461 * Marked "noinline" to cause control flow change and thus insn cache
462 * to refetch changed I$ lines.
463 */
464void __init_or_module noinline apply_alternatives(struct alt_instr *start,
465 struct alt_instr *end)
466{
467 struct alt_instr *a;
468 u8 *instr, *replacement;
469 u8 insn_buff[MAX_PATCH_LEN];
470
471 DPRINTK(ALT, "alt table %px, -> %px", start, end);
472
473 /*
474 * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
475 * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
476 * During the process, KASAN becomes confused seeing partial LA57
477 * conversion and triggers a false-positive out-of-bound report.
478 *
479 * Disable KASAN until the patching is complete.
480 */
481 kasan_disable_current();
482
483 /*
484 * The scan order should be from start to end. A later scanned
485 * alternative code can overwrite previously scanned alternative code.
486 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
487 * patch code.
488 *
489 * So be careful if you want to change the scan order to any other
490 * order.
491 */
492 for (a = start; a < end; a++) {
493 int insn_buff_sz = 0;
494
495 instr = (u8 *)&a->instr_offset + a->instr_offset;
496 replacement = (u8 *)&a->repl_offset + a->repl_offset;
497 BUG_ON(a->instrlen > sizeof(insn_buff));
498 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
499
500 /*
501 * Patch if either:
502 * - feature is present
503 * - feature not present but ALT_FLAG_NOT is set to mean,
504 * patch if feature is *NOT* present.
505 */
506 if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
507 optimize_nops_inplace(instr, a->instrlen);
508 continue;
509 }
510
511 DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
512 a->cpuid >> 5,
513 a->cpuid & 0x1f,
514 instr, instr, a->instrlen,
515 replacement, a->replacementlen, a->flags);
516
517 memcpy(insn_buff, replacement, a->replacementlen);
518 insn_buff_sz = a->replacementlen;
519
520 if (a->flags & ALT_FLAG_DIRECT_CALL) {
521 insn_buff_sz = alt_replace_call(instr, insn_buff, a);
522 if (insn_buff_sz < 0)
523 continue;
524 }
525
526 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
527 insn_buff[insn_buff_sz] = 0x90;
528
529 apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
530
531 DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
532 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
533 DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
534
535 text_poke_early(instr, insn_buff, insn_buff_sz);
536 }
537
538 kasan_enable_current();
539}
540
541static inline bool is_jcc32(struct insn *insn)
542{
543 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
544 return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
545}
546
547#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
548
549/*
550 * CALL/JMP *%\reg
551 */
552static int emit_indirect(int op, int reg, u8 *bytes)
553{
554 int i = 0;
555 u8 modrm;
556
557 switch (op) {
558 case CALL_INSN_OPCODE:
559 modrm = 0x10; /* Reg = 2; CALL r/m */
560 break;
561
562 case JMP32_INSN_OPCODE:
563 modrm = 0x20; /* Reg = 4; JMP r/m */
564 break;
565
566 default:
567 WARN_ON_ONCE(1);
568 return -1;
569 }
570
571 if (reg >= 8) {
572 bytes[i++] = 0x41; /* REX.B prefix */
573 reg -= 8;
574 }
575
576 modrm |= 0xc0; /* Mod = 3 */
577 modrm += reg;
578
579 bytes[i++] = 0xff; /* opcode */
580 bytes[i++] = modrm;
581
582 return i;
583}
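/*
 * Example encodings (illustrative): emit_indirect(CALL_INSN_OPCODE, 0, buf)
 * produces "ff d0" (call *%rax), while reg 11 yields "41 ff d3"
 * (call *%r11) because of the REX.B prefix.
 */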
584
585static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
586{
587 u8 op = insn->opcode.bytes[0];
588 int i = 0;
589
590 /*
591 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
592 * tail-calls. Deal with them.
593 */
594 if (is_jcc32(insn)) {
595 bytes[i++] = op;
596 op = insn->opcode.bytes[1];
597 goto clang_jcc;
598 }
599
600 if (insn->length == 6)
601 bytes[i++] = 0x2e; /* CS-prefix */
602
603 switch (op) {
604 case CALL_INSN_OPCODE:
605 __text_gen_insn(bytes+i, op, addr+i,
606 __x86_indirect_call_thunk_array[reg],
607 CALL_INSN_SIZE);
608 i += CALL_INSN_SIZE;
609 break;
610
611 case JMP32_INSN_OPCODE:
612clang_jcc:
613 __text_gen_insn(bytes+i, op, addr+i,
614 __x86_indirect_jump_thunk_array[reg],
615 JMP32_INSN_SIZE);
616 i += JMP32_INSN_SIZE;
617 break;
618
619 default:
620 WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
621 return -1;
622 }
623
624 WARN_ON_ONCE(i != insn->length);
625
626 return i;
627}
628
629/*
630 * Rewrite the compiler generated retpoline thunk calls.
631 *
632 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
633 * indirect instructions, avoiding the extra indirection.
634 *
635 * For example, convert:
636 *
637 * CALL __x86_indirect_thunk_\reg
638 *
639 * into:
640 *
641 * CALL *%\reg
642 *
643 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
644 */
645static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
646{
647 retpoline_thunk_t *target;
648 int reg, ret, i = 0;
649 u8 op, cc;
650
651 target = addr + insn->length + insn->immediate.value;
652 reg = target - __x86_indirect_thunk_array;
653
654 if (WARN_ON_ONCE(reg & ~0xf))
655 return -1;
656
657 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
658 BUG_ON(reg == 4);
659
660 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
661 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
662 if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
663 return emit_call_track_retpoline(addr, insn, reg, bytes);
664
665 return -1;
666 }
667
668 op = insn->opcode.bytes[0];
669
670 /*
671 * Convert:
672 *
673 * Jcc.d32 __x86_indirect_thunk_\reg
674 *
675 * into:
676 *
677 * Jncc.d8 1f
678 * [ LFENCE ]
679 * JMP *%\reg
680 * [ NOP ]
681 * 1:
682 */
683 if (is_jcc32(insn)) {
684 cc = insn->opcode.bytes[1] & 0xf;
685 cc ^= 1; /* invert condition */
686
687 bytes[i++] = 0x70 + cc; /* Jcc.d8 */
688 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
689
690 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
691 op = JMP32_INSN_OPCODE;
692 }
693
694 /*
695 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
696 */
697 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
698 bytes[i++] = 0x0f;
699 bytes[i++] = 0xae;
700 bytes[i++] = 0xe8; /* LFENCE */
701 }
702
703 ret = emit_indirect(op, reg, bytes + i);
704 if (ret < 0)
705 return ret;
706 i += ret;
707
708 /*
709 * The compiler is supposed to EMIT an INT3 after every unconditional
710 * JMP instruction due to AMD BTC. However, if the compiler is too old
711 * or MITIGATION_SLS isn't enabled, we still need an INT3 after
712 * indirect JMPs even on Intel.
713 */
714 if (op == JMP32_INSN_OPCODE && i < insn->length)
715 bytes[i++] = INT3_INSN_OPCODE;
716
717 for (; i < insn->length;)
718 bytes[i++] = BYTES_NOP1;
719
720 return i;
721}
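/*
 * Illustrative result for spectre_v2=off: the 5-byte
 * "call __x86_indirect_thunk_rax" (e8 xx xx xx xx) becomes "ff d0"
 * (call *%rax) plus three single-byte NOPs, while a tail-call
 * "jmp __x86_indirect_thunk_rax" becomes "ff e0" followed by an INT3 and
 * NOP padding.
 */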
722
723/*
724 * Generated by 'objtool --retpoline'.
725 */
726void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
727{
728 s32 *s;
729
730 for (s = start; s < end; s++) {
731 void *addr = (void *)s + *s;
732 struct insn insn;
733 int len, ret;
734 u8 bytes[16];
735 u8 op1, op2;
736
737 ret = insn_decode_kernel(&insn, addr);
738 if (WARN_ON_ONCE(ret < 0))
739 continue;
740
741 op1 = insn.opcode.bytes[0];
742 op2 = insn.opcode.bytes[1];
743
744 switch (op1) {
745 case CALL_INSN_OPCODE:
746 case JMP32_INSN_OPCODE:
747 break;
748
749 case 0x0f: /* escape */
750 if (op2 >= 0x80 && op2 <= 0x8f)
751 break;
752 fallthrough;
753 default:
754 WARN_ON_ONCE(1);
755 continue;
756 }
757
758 DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
759 addr, addr, insn.length,
760 addr + insn.length + insn.immediate.value);
761
762 len = patch_retpoline(addr, &insn, bytes);
763 if (len == insn.length) {
764 optimize_nops(bytes, len);
765 DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
766 DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
767 text_poke_early(addr, bytes, len);
768 }
769 }
770}
771
772#ifdef CONFIG_MITIGATION_RETHUNK
773
774/*
775 * Rewrite the compiler generated return thunk tail-calls.
776 *
777 * For example, convert:
778 *
779 * JMP __x86_return_thunk
780 *
781 * into:
782 *
783 * RET
784 */
785static int patch_return(void *addr, struct insn *insn, u8 *bytes)
786{
787 int i = 0;
788
789 /* Patch the custom return thunks... */
790 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
791 i = JMP32_INSN_SIZE;
792 __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
793 } else {
794 /* ... or patch them out if not needed. */
795 bytes[i++] = RET_INSN_OPCODE;
796 }
797
798 for (; i < insn->length;)
799 bytes[i++] = INT3_INSN_OPCODE;
800 return i;
801}
802
803void __init_or_module noinline apply_returns(s32 *start, s32 *end)
804{
805 s32 *s;
806
807 if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
808 static_call_force_reinit();
809
810 for (s = start; s < end; s++) {
811 void *dest = NULL, *addr = (void *)s + *s;
812 struct insn insn;
813 int len, ret;
814 u8 bytes[16];
815 u8 op;
816
817 ret = insn_decode_kernel(&insn, addr);
818 if (WARN_ON_ONCE(ret < 0))
819 continue;
820
821 op = insn.opcode.bytes[0];
822 if (op == JMP32_INSN_OPCODE)
823 dest = addr + insn.length + insn.immediate.value;
824
825 if (__static_call_fixup(addr, op, dest) ||
826 WARN_ONCE(dest != &__x86_return_thunk,
827 "missing return thunk: %pS-%pS: %*ph",
828 addr, dest, 5, addr))
829 continue;
830
831 DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
832 addr, addr, insn.length,
833 addr + insn.length + insn.immediate.value);
834
835 len = patch_return(addr, &insn, bytes);
836 if (len == insn.length) {
837 DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
838 DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
839 text_poke_early(addr, bytes, len);
840 }
841 }
842}
843#else
844void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
845#endif /* CONFIG_MITIGATION_RETHUNK */
846
847#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
848
849void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
850void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
851
852#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
853
854#ifdef CONFIG_X86_KERNEL_IBT
855
856static void poison_cfi(void *addr);
857
858static void __init_or_module poison_endbr(void *addr, bool warn)
859{
860 u32 endbr, poison = gen_endbr_poison();
861
862 if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
863 return;
864
865 if (!is_endbr(endbr)) {
866 WARN_ON_ONCE(warn);
867 return;
868 }
869
870 DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
871
872 /*
873 * When we have IBT, the lack of ENDBR will trigger #CP
874 */
875 DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
876 DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
877 text_poke_early(addr, &poison, 4);
878}
879
880/*
881 * Generated by: objtool --ibt
882 *
883 * Seal the functions for indirect calls by clobbering the ENDBR instructions
884 * and the kCFI hash value.
885 */
886void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
887{
888 s32 *s;
889
890 for (s = start; s < end; s++) {
891 void *addr = (void *)s + *s;
892
893 poison_endbr(addr, true);
894 if (IS_ENABLED(CONFIG_FINEIBT))
895 poison_cfi(addr - 16);
896 }
897}
898
899#else
900
901void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
902
903#endif /* CONFIG_X86_KERNEL_IBT */
904
905#ifdef CONFIG_FINEIBT
906#define __CFI_DEFAULT CFI_DEFAULT
907#elif defined(CONFIG_CFI_CLANG)
908#define __CFI_DEFAULT CFI_KCFI
909#else
910#define __CFI_DEFAULT CFI_OFF
911#endif
912
913enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
914
915#ifdef CONFIG_CFI_CLANG
916struct bpf_insn;
917
918/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
919extern unsigned int __bpf_prog_runX(const void *ctx,
920 const struct bpf_insn *insn);
921
922/*
923 * Force a reference to the external symbol so the compiler generates
924 * __kcfi_typid.
925 */
926__ADDRESSABLE(__bpf_prog_runX);
927
928/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
929asm (
930" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
931" .type cfi_bpf_hash,@object \n"
932" .globl cfi_bpf_hash \n"
933" .p2align 2, 0x0 \n"
934"cfi_bpf_hash: \n"
935" .long __kcfi_typeid___bpf_prog_runX \n"
936" .size cfi_bpf_hash, 4 \n"
937" .popsection \n"
938);
939
940/* Must match bpf_callback_t */
941extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
942
943__ADDRESSABLE(__bpf_callback_fn);
944
945/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
946asm (
947" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
948" .type cfi_bpf_subprog_hash,@object \n"
949" .globl cfi_bpf_subprog_hash \n"
950" .p2align 2, 0x0 \n"
951"cfi_bpf_subprog_hash: \n"
952" .long __kcfi_typeid___bpf_callback_fn \n"
953" .size cfi_bpf_subprog_hash, 4 \n"
954" .popsection \n"
955);
956
957u32 cfi_get_func_hash(void *func)
958{
959 u32 hash;
960
961 func -= cfi_get_offset();
962 switch (cfi_mode) {
963 case CFI_FINEIBT:
964 func += 7;
965 break;
966 case CFI_KCFI:
967 func += 1;
968 break;
969 default:
970 return 0;
971 }
972
973 if (get_kernel_nofault(hash, func))
974 return 0;
975
976 return hash;
977}
978#endif
979
980#ifdef CONFIG_FINEIBT
981
982static bool cfi_rand __ro_after_init = true;
983static u32 cfi_seed __ro_after_init;
984
985/*
986 * Re-hash the CFI hash with a boot-time seed while making sure the result is
987 * not a valid ENDBR instruction.
988 */
989static u32 cfi_rehash(u32 hash)
990{
991 hash ^= cfi_seed;
992 while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
993 bool lsb = hash & 1;
994 hash >>= 1;
995 if (lsb)
996 hash ^= 0x80200003;
997 }
998 return hash;
999}
1000
1001static __init int cfi_parse_cmdline(char *str)
1002{
1003 if (!str)
1004 return -EINVAL;
1005
1006 while (str) {
1007 char *next = strchr(str, ',');
1008 if (next) {
1009 *next = 0;
1010 next++;
1011 }
1012
1013 if (!strcmp(str, "auto")) {
1014 cfi_mode = CFI_DEFAULT;
1015 } else if (!strcmp(str, "off")) {
1016 cfi_mode = CFI_OFF;
1017 cfi_rand = false;
1018 } else if (!strcmp(str, "kcfi")) {
1019 cfi_mode = CFI_KCFI;
1020 } else if (!strcmp(str, "fineibt")) {
1021 cfi_mode = CFI_FINEIBT;
1022 } else if (!strcmp(str, "norand")) {
1023 cfi_rand = false;
1024 } else {
1025 pr_err("Ignoring unknown cfi option (%s).", str);
1026 }
1027
1028 str = next;
1029 }
1030
1031 return 0;
1032}
1033early_param("cfi", cfi_parse_cmdline);
1034
1035/*
1036 * kCFI preamble (__cfi_\func), 16 bytes:
1037 *	movl $0x12345678, %eax		// 5
1038 *	nop * 11			// 11
1039 *
1040 * FineIBT preamble (__cfi_\func), 16 bytes:
1041 *	endbr64				// 4
1042 *	subl $0x12345678, %r10d		// 7
1043 *	jz 1f				// 2
1044 *	ud2				// 2
1045 * 1:	nop				// 1
1046 *
1047 * kCFI caller:
1048 *	movl $(-0x12345678), %r10d	// 6
1049 *	addl $-15(%r11), %r10d		// 4
1050 *	je 1f				// 2
1051 *	ud2				// 2
1052 * 1:	call __x86_indirect_thunk_r11	// 5
1053 *
1054 * FineIBT caller:
1055 *	movl $0x12345678, %r10d		// 6
1056 *	sub $16, %r11			// 4
1057 *	nop4				// 4
1058 *	call *%r11; nop2;		// 5
1059 *
1060 */
1061
1062asm( ".pushsection .rodata \n"
1063 "fineibt_preamble_start: \n"
1064 " endbr64 \n"
1065 " subl $0x12345678, %r10d \n"
1066 " je fineibt_preamble_end \n"
1067 " ud2 \n"
1068 " nop \n"
1069 "fineibt_preamble_end: \n"
1070 ".popsection\n"
1071);
1072
1073extern u8 fineibt_preamble_start[];
1074extern u8 fineibt_preamble_end[];
1075
1076#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
1077#define fineibt_preamble_hash 7
1078
1079asm( ".pushsection .rodata \n"
1080 "fineibt_caller_start: \n"
1081 " movl $0x12345678, %r10d \n"
1082 " sub $16, %r11 \n"
1083 ASM_NOP4
1084 "fineibt_caller_end: \n"
1085 ".popsection \n"
1086);
1087
1088extern u8 fineibt_caller_start[];
1089extern u8 fineibt_caller_end[];
1090
1091#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
1092#define fineibt_caller_hash 2
1093
1094#define fineibt_caller_jmp (fineibt_caller_size - 2)
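/*
 * Note: fineibt_caller_size is 14 bytes (6 + 4 + 4 above), so
 * fineibt_caller_jmp is 12: the "jmp.d8 +12" planted by cfi_disable_callers()
 * at the start of a kCFI caller sequence skips the remaining 12 bytes of
 * hash checking and lands directly on the indirect call.
 */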
1095
1096static u32 decode_preamble_hash(void *addr)
1097{
1098 u8 *p = addr;
1099
1100 /* b8 78 56 34 12 mov $0x12345678,%eax */
1101 if (p[0] == 0xb8)
1102 return *(u32 *)(addr + 1);
1103
1104 return 0; /* invalid hash value */
1105}
1106
1107static u32 decode_caller_hash(void *addr)
1108{
1109 u8 *p = addr;
1110
1111 /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
1112 if (p[0] == 0x41 && p[1] == 0xba)
1113 return -*(u32 *)(addr + 2);
1114
1115 /* eb 0c 78 56 34 12 jmp.d8 +12 */
1116 if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
1117 return -*(u32 *)(addr + 2);
1118
1119 return 0; /* invalid hash value */
1120}
1121
1122/* .retpoline_sites */
1123static int cfi_disable_callers(s32 *start, s32 *end)
1124{
1125 /*
1126 * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
1127 * intact for later usage. Also see decode_caller_hash() and
1128 * cfi_rewrite_callers().
1129 */
1130 const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
1131 s32 *s;
1132
1133 for (s = start; s < end; s++) {
1134 void *addr = (void *)s + *s;
1135 u32 hash;
1136
1137 addr -= fineibt_caller_size;
1138 hash = decode_caller_hash(addr);
1139 if (!hash) /* nocfi callers */
1140 continue;
1141
1142 text_poke_early(addr, jmp, 2);
1143 }
1144
1145 return 0;
1146}
1147
1148static int cfi_enable_callers(s32 *start, s32 *end)
1149{
1150 /*
1151 * Re-enable kCFI, undo what cfi_disable_callers() did.
1152 */
1153 const u8 mov[] = { 0x41, 0xba };
1154 s32 *s;
1155
1156 for (s = start; s < end; s++) {
1157 void *addr = (void *)s + *s;
1158 u32 hash;
1159
1160 addr -= fineibt_caller_size;
1161 hash = decode_caller_hash(addr);
1162 if (!hash) /* nocfi callers */
1163 continue;
1164
1165 text_poke_early(addr, mov, 2);
1166 }
1167
1168 return 0;
1169}
1170
1171/* .cfi_sites */
1172static int cfi_rand_preamble(s32 *start, s32 *end)
1173{
1174 s32 *s;
1175
1176 for (s = start; s < end; s++) {
1177 void *addr = (void *)s + *s;
1178 u32 hash;
1179
1180 hash = decode_preamble_hash(addr);
1181 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1182 addr, addr, 5, addr))
1183 return -EINVAL;
1184
1185 hash = cfi_rehash(hash);
1186 text_poke_early(addr + 1, &hash, 4);
1187 }
1188
1189 return 0;
1190}
1191
1192static int cfi_rewrite_preamble(s32 *start, s32 *end)
1193{
1194 s32 *s;
1195
1196 for (s = start; s < end; s++) {
1197 void *addr = (void *)s + *s;
1198 u32 hash;
1199
1200 hash = decode_preamble_hash(addr);
1201 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1202 addr, addr, 5, addr))
1203 return -EINVAL;
1204
1205 text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
1206 WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
1207 text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
1208 }
1209
1210 return 0;
1211}
1212
1213static void cfi_rewrite_endbr(s32 *start, s32 *end)
1214{
1215 s32 *s;
1216
1217 for (s = start; s < end; s++) {
1218 void *addr = (void *)s + *s;
1219
1220 poison_endbr(addr+16, false);
1221 }
1222}
1223
1224/* .retpoline_sites */
1225static int cfi_rand_callers(s32 *start, s32 *end)
1226{
1227 s32 *s;
1228
1229 for (s = start; s < end; s++) {
1230 void *addr = (void *)s + *s;
1231 u32 hash;
1232
1233 addr -= fineibt_caller_size;
1234 hash = decode_caller_hash(addr);
1235 if (hash) {
1236 hash = -cfi_rehash(hash);
1237 text_poke_early(addr + 2, &hash, 4);
1238 }
1239 }
1240
1241 return 0;
1242}
1243
1244static int cfi_rewrite_callers(s32 *start, s32 *end)
1245{
1246 s32 *s;
1247
1248 for (s = start; s < end; s++) {
1249 void *addr = (void *)s + *s;
1250 u32 hash;
1251
1252 addr -= fineibt_caller_size;
1253 hash = decode_caller_hash(addr);
1254 if (hash) {
1255 text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
1256 WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
1257 text_poke_early(addr + fineibt_caller_hash, &hash, 4);
1258 }
1259 /* rely on apply_retpolines() */
1260 }
1261
1262 return 0;
1263}
1264
1265static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1266 s32 *start_cfi, s32 *end_cfi, bool builtin)
1267{
1268 int ret;
1269
1270 if (WARN_ONCE(fineibt_preamble_size != 16,
1271 "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
1272 return;
1273
1274 if (cfi_mode == CFI_DEFAULT) {
1275 cfi_mode = CFI_KCFI;
1276 if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
1277 cfi_mode = CFI_FINEIBT;
1278 }
1279
1280 /*
1281 * Rewrite the callers to not use the __cfi_ stubs, such that we might
1282 * rewrite them. This disables all CFI. If this succeeds but any of the
1283 * later stages fails, we're without CFI.
1284 */
1285 ret = cfi_disable_callers(start_retpoline, end_retpoline);
1286 if (ret)
1287 goto err;
1288
1289 if (cfi_rand) {
1290 if (builtin) {
1291 cfi_seed = get_random_u32();
1292 cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
1293 cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
1294 }
1295
1296 ret = cfi_rand_preamble(start_cfi, end_cfi);
1297 if (ret)
1298 goto err;
1299
1300 ret = cfi_rand_callers(start_retpoline, end_retpoline);
1301 if (ret)
1302 goto err;
1303 }
1304
1305 switch (cfi_mode) {
1306 case CFI_OFF:
1307 if (builtin)
1308 pr_info("Disabling CFI\n");
1309 return;
1310
1311 case CFI_KCFI:
1312 ret = cfi_enable_callers(start_retpoline, end_retpoline);
1313 if (ret)
1314 goto err;
1315
1316 if (builtin)
1317 pr_info("Using kCFI\n");
1318 return;
1319
1320 case CFI_FINEIBT:
1321 /* place the FineIBT preamble at func()-16 */
1322 ret = cfi_rewrite_preamble(start_cfi, end_cfi);
1323 if (ret)
1324 goto err;
1325
1326 /* rewrite the callers to target func()-16 */
1327 ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
1328 if (ret)
1329 goto err;
1330
1331 /* now that nobody targets func()+0, remove ENDBR there */
1332 cfi_rewrite_endbr(start_cfi, end_cfi);
1333
1334 if (builtin)
1335 pr_info("Using FineIBT CFI\n");
1336 return;
1337
1338 default:
1339 break;
1340 }
1341
1342err:
1343 pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1344}
1345
1346static inline void poison_hash(void *addr)
1347{
1348 *(u32 *)addr = 0;
1349}
1350
1351static void poison_cfi(void *addr)
1352{
1353 switch (cfi_mode) {
1354 case CFI_FINEIBT:
1355 /*
1356 * __cfi_\func:
1357 * osp nopl (%rax)
1358 * subl $0, %r10d
1359 * jz 1f
1360 * ud2
1361 * 1: nop
1362 */
1363 poison_endbr(addr, false);
1364 poison_hash(addr + fineibt_preamble_hash);
1365 break;
1366
1367 case CFI_KCFI:
1368 /*
1369 * __cfi_\func:
1370 * movl $0, %eax
1371 * .skip 11, 0x90
1372 */
1373 poison_hash(addr + 1);
1374 break;
1375
1376 default:
1377 break;
1378 }
1379}
1380
1381#else
1382
1383static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1384 s32 *start_cfi, s32 *end_cfi, bool builtin)
1385{
1386}
1387
1388#ifdef CONFIG_X86_KERNEL_IBT
1389static void poison_cfi(void *addr) { }
1390#endif
1391
1392#endif
1393
1394void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1395 s32 *start_cfi, s32 *end_cfi)
1396{
1397 return __apply_fineibt(start_retpoline, end_retpoline,
1398 start_cfi, end_cfi,
1399 /* .builtin = */ false);
1400}
1401
1402#ifdef CONFIG_SMP
1403static void alternatives_smp_lock(const s32 *start, const s32 *end,
1404 u8 *text, u8 *text_end)
1405{
1406 const s32 *poff;
1407
1408 for (poff = start; poff < end; poff++) {
1409 u8 *ptr = (u8 *)poff + *poff;
1410
1411 if (!*poff || ptr < text || ptr >= text_end)
1412 continue;
1413 /* turn DS segment override prefix into lock prefix */
1414 if (*ptr == 0x3e)
1415 text_poke(ptr, ((unsigned char []){0xf0}), 1);
1416 }
1417}
1418
1419static void alternatives_smp_unlock(const s32 *start, const s32 *end,
1420 u8 *text, u8 *text_end)
1421{
1422 const s32 *poff;
1423
1424 for (poff = start; poff < end; poff++) {
1425 u8 *ptr = (u8 *)poff + *poff;
1426
1427 if (!*poff || ptr < text || ptr >= text_end)
1428 continue;
1429 /* turn lock prefix into DS segment override prefix */
1430 if (*ptr == 0xf0)
1431 text_poke(ptr, ((unsigned char []){0x3E}), 1);
1432 }
1433}
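/*
 * For reference (a sketch of <asm/alternative.h>, reproduced here only as a
 * comment so nothing is redefined): the entries walked above are emitted by
 * the LOCK_PREFIX macro, roughly:
 *
 *	#define LOCK_PREFIX_HERE			\
 *		".pushsection .smp_locks,\"a\"\n"	\
 *		".balign 4\n"				\
 *		".long 671f - .\n"			\
 *		".popsection\n"				\
 *		"671:"
 *
 *	#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
 *
 * Each .smp_locks entry is therefore a 32-bit self-relative offset to the
 * LOCK (or DS) prefix byte, which is what the "(u8 *)poff + *poff"
 * arithmetic in the two helpers above recovers.
 */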
1434
1435struct smp_alt_module {
1437 /* owning module; NULL for the core kernel */
1437 struct module *mod;
1438 char *name;
1439
1440 /* ptrs to lock prefixes */
1441 const s32 *locks;
1442 const s32 *locks_end;
1443
1444 /* .text segment, needed to avoid patching init code ;) */
1445 u8 *text;
1446 u8 *text_end;
1447
1448 struct list_head next;
1449};
1450static LIST_HEAD(smp_alt_modules);
1451static bool uniproc_patched = false; /* protected by text_mutex */
1452
1453void __init_or_module alternatives_smp_module_add(struct module *mod,
1454 char *name,
1455 void *locks, void *locks_end,
1456 void *text, void *text_end)
1457{
1458 struct smp_alt_module *smp;
1459
1460 mutex_lock(&text_mutex);
1461 if (!uniproc_patched)
1462 goto unlock;
1463
1464 if (num_possible_cpus() == 1)
1465 /* Don't bother remembering, we'll never have to undo it. */
1466 goto smp_unlock;
1467
1468 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
1469 if (!smp)
1470 /* we'll run the (safe but slow) SMP code then ... */
1471 goto unlock;
1472
1473 smp->mod = mod;
1474 smp->name = name;
1475 smp->locks = locks;
1476 smp->locks_end = locks_end;
1477 smp->text = text;
1478 smp->text_end = text_end;
1479 DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
1480 smp->locks, smp->locks_end,
1481 smp->text, smp->text_end, smp->name);
1482
1483 list_add_tail(&smp->next, &smp_alt_modules);
1484smp_unlock:
1485 alternatives_smp_unlock(locks, locks_end, text, text_end);
1486unlock:
1487 mutex_unlock(&text_mutex);
1488}
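/*
 * Usage sketch (the shapes are an assumption based on the module loader;
 * the variable names are illustrative): after a module's sections have been
 * laid out, module_finalize() hands the .smp_locks range to this helper:
 *
 *	void *lseg = (void *)locks->sh_addr;
 *	void *tseg = (void *)text->sh_addr;
 *
 *	alternatives_smp_module_add(me, me->name,
 *				    lseg, lseg + locks->sh_size,
 *				    tseg, tseg + text->sh_size);
 */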
1489
1490void __init_or_module alternatives_smp_module_del(struct module *mod)
1491{
1492 struct smp_alt_module *item;
1493
1494 mutex_lock(&text_mutex);
1495 list_for_each_entry(item, &smp_alt_modules, next) {
1496 if (mod != item->mod)
1497 continue;
1498 list_del(&item->next);
1499 kfree(item);
1500 break;
1501 }
1502 mutex_unlock(&text_mutex);
1503}
1504
1505void alternatives_enable_smp(void)
1506{
1507 struct smp_alt_module *mod;
1508
1509 /* Why bother if there are no other CPUs? */
1510 BUG_ON(num_possible_cpus() == 1);
1511
1512 mutex_lock(&text_mutex);
1513
1514 if (uniproc_patched) {
1515 pr_info("switching to SMP code\n");
1516 BUG_ON(num_online_cpus() != 1);
1517 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
1518 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
1519 list_for_each_entry(mod, &smp_alt_modules, next)
1520 alternatives_smp_lock(mod->locks, mod->locks_end,
1521 mod->text, mod->text_end);
1522 uniproc_patched = false;
1523 }
1524 mutex_unlock(&text_mutex);
1525}
1526
1527/*
1528 * Return 1 if the address range is reserved for SMP-alternatives.
1529 * Must hold text_mutex.
1530 */
1531int alternatives_text_reserved(void *start, void *end)
1532{
1533 struct smp_alt_module *mod;
1534 const s32 *poff;
1535 u8 *text_start = start;
1536 u8 *text_end = end;
1537
1538 lockdep_assert_held(&text_mutex);
1539
1540 list_for_each_entry(mod, &smp_alt_modules, next) {
1541 if (mod->text > text_end || mod->text_end < text_start)
1542 continue;
1543 for (poff = mod->locks; poff < mod->locks_end; poff++) {
1544 const u8 *ptr = (const u8 *)poff + *poff;
1545
1546 if (text_start <= ptr && text_end > ptr)
1547 return 1;
1548 }
1549 }
1550
1551 return 0;
1552}
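/*
 * Usage sketch (hypothetical caller, not an existing interface): a patcher
 * deciding whether it may touch [addr, addr + len) without colliding with an
 * SMP-alternative lock-prefix site.
 */
static __maybe_unused bool example_range_is_patchable(void *addr, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return !alternatives_text_reserved(addr, addr + len);
}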
1553#endif /* CONFIG_SMP */
1554
1555/*
1556 * Self-test for the INT3 based CALL emulation code.
1557 *
1558 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
1559 * properly and that there is a stack gap between the INT3 frame and the
1560 * previous context. Without this gap doing a virtual PUSH on the interrupted
1561 * stack would corrupt the INT3 IRET frame.
1562 *
1563 * See entry_{32,64}.S for more details.
1564 */
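/*
 * For reference, int3_emulate_call() from <asm/text-patching.h> does roughly
 * the following (simplified sketch): it pushes the return address into the
 * gap below the INT3 frame and redirects the instruction pointer,
 *
 *	regs->sp -= sizeof(unsigned long);
 *	*(unsigned long *)regs->sp = regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE;
 *	regs->ip = func;
 *
 * which is exactly why the stack gap described above has to exist.
 */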
1565
1566/*
1567 * We define the int3_magic() function in assembly to control the calling
1568 * convention such that we can 'call' it from assembly.
1569 */
1570
1571extern void int3_magic(unsigned int *ptr); /* defined in asm */
1572
1573asm (
1574" .pushsection .init.text, \"ax\", @progbits\n"
1575" .type int3_magic, @function\n"
1576"int3_magic:\n"
1577 ANNOTATE_NOENDBR
1578" movl $1, (%" _ASM_ARG1 ")\n"
1579 ASM_RET
1580" .size int3_magic, .-int3_magic\n"
1581" .popsection\n"
1582);
1583
1584extern void int3_selftest_ip(void); /* defined in asm below */
1585
1586static int __init
1587int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
1588{
1589 unsigned long selftest = (unsigned long)&int3_selftest_ip;
1590 struct die_args *args = data;
1591 struct pt_regs *regs = args->regs;
1592
1593 OPTIMIZER_HIDE_VAR(selftest);
1594
1595 if (!regs || user_mode(regs))
1596 return NOTIFY_DONE;
1597
1598 if (val != DIE_INT3)
1599 return NOTIFY_DONE;
1600
1601 if (regs->ip - INT3_INSN_SIZE != selftest)
1602 return NOTIFY_DONE;
1603
1604 int3_emulate_call(regs, (unsigned long)&int3_magic);
1605 return NOTIFY_STOP;
1606}
1607
1608/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
1609static noinline void __init int3_selftest(void)
1610{
1611 static __initdata struct notifier_block int3_exception_nb = {
1612 .notifier_call = int3_exception_notify,
1613 .priority = INT_MAX-1, /* last */
1614 };
1615 unsigned int val = 0;
1616
1617 BUG_ON(register_die_notifier(&int3_exception_nb));
1618
1619 /*
1620 * Basically: int3_magic(&val); but really complicated :-)
1621 *
1622 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
1623 * notifier above will emulate CALL for us.
1624 */
1625 asm volatile ("int3_selftest_ip:\n\t"
1626 ANNOTATE_NOENDBR
1627 " int3; nop; nop; nop; nop\n\t"
1628 : ASM_CALL_CONSTRAINT
1629 : __ASM_SEL_RAW(a, D) (&val)
1630 : "memory");
1631
1632 BUG_ON(val != 1);
1633
1634 unregister_die_notifier(&int3_exception_nb);
1635}
1636
1637static __initdata int __alt_reloc_selftest_addr;
1638
1639extern void __init __alt_reloc_selftest(void *arg);
1640__visible noinline void __init __alt_reloc_selftest(void *arg)
1641{
1642 WARN_ON(arg != &__alt_reloc_selftest_addr);
1643}
1644
1645static noinline void __init alt_reloc_selftest(void)
1646{
1647 /*
1648 * Tests apply_relocation().
1649 *
1650 * This has a relative immediate (CALL) in a place other than the first
1651 * instruction and additionally on x86_64 we get a RIP-relative LEA:
1652 *
1653 * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
1654 * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
1655 *
1656 * Getting this wrong will either crash and burn or tickle the WARN
1657 * above.
1658 */
1659 asm_inline volatile (
1660 ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
1661 : /* output */
1662 : [mem] "m" (__alt_reloc_selftest_addr)
1663 : _ASM_ARG1
1664 );
1665}
1666
1667void __init alternative_instructions(void)
1668{
1669 int3_selftest();
1670
1671 /*
1672 * The patching is not fully atomic, so try to avoid local
1673 * interruptions that might execute the code that is about to be patched.
1674 * Other CPUs are not running.
1675 */
1676 stop_nmi();
1677
1678 /*
1679 * Don't stop machine check exceptions while patching.
1680 * MCEs only happen when something got corrupted and in this
1681 * case we must do something about the corruption.
1682 * Ignoring it is worse than an unlikely patching race.
1683 * Also machine checks tend to be broadcast and if one CPU
1684 * goes into machine check the others follow quickly, so we don't
1685 * expect a machine check to cause undue problems during to code
1686 * patching.
1687 */
1688
1689 /*
1690 * Make sure to set (artificial) features depending on used paravirt
1691 * functions which can later influence alternative patching.
1692 */
1693 paravirt_set_cap();
1694
1695 __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
1696 __cfi_sites, __cfi_sites_end, true);
1697
1698 /*
1699 * Rewrite the retpolines; this must be done before alternatives since
1700 * those can rewrite the retpoline thunks.
1701 */
1702 apply_retpolines(__retpoline_sites, __retpoline_sites_end);
1703 apply_returns(__return_sites, __return_sites_end);
1704
1705 apply_alternatives(__alt_instructions, __alt_instructions_end);
1706
1707 /*
1708 * Now all calls are established. Apply the call thunks if
1709 * required.
1710 */
1711 callthunks_patch_builtin_calls();
1712
1713 /*
1714 * Seal all functions that do not have their address taken.
1715 */
1716 apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
1717
1718#ifdef CONFIG_SMP
1719 /* Patch to UP if other CPUs are not imminent. */
1720 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
1721 uniproc_patched = true;
1722 alternatives_smp_module_add(NULL, "core kernel",
1723 __smp_locks, __smp_locks_end,
1724 _text, _etext);
1725 }
1726
1727 if (!uniproc_patched || num_possible_cpus() == 1) {
1728 free_init_pages("SMP alternatives",
1729 (unsigned long)__smp_locks,
1730 (unsigned long)__smp_locks_end);
1731 }
1732#endif
1733
1734 restart_nmi();
1735 alternatives_patched = 1;
1736
1737 alt_reloc_selftest();
1738}
1739
1740/**
1741 * text_poke_early - Update instructions on a live kernel at boot time
1742 * @addr: address to modify
1743 * @opcode: source of the copy
1744 * @len: length to copy
1745 *
1746 * When you use this code to patch more than one byte of an instruction
1747 * you need to make sure that other CPUs cannot execute this code in parallel.
1748 * Also no thread must be currently preempted in the middle of these
1749 * instructions. And on the local CPU you need to be protected against NMI or
1750 * MCE handlers seeing an inconsistent instruction while you patch.
1751 */
1752void __init_or_module text_poke_early(void *addr, const void *opcode,
1753 size_t len)
1754{
1755 unsigned long flags;
1756
1757 if (boot_cpu_has(X86_FEATURE_NX) &&
1758 is_module_text_address((unsigned long)addr)) {
1759 /*
1760 * Module text is initially marked as non-executable, so the
1761 * code cannot be running and speculative code-fetches are
1762 * prevented. Just change the code.
1763 */
1764 memcpy(addr, opcode, len);
1765 } else {
1766 local_irq_save(flags);
1767 memcpy(addr, opcode, len);
1768 sync_core();
1769 local_irq_restore(flags);
1770
1771 /*
1772 * Could also do a CLFLUSH here to speed up CPU recovery; but
1773 * that causes hangs on some VIA CPUs.
1774 */
1775 }
1776}
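/*
 * Usage sketch (@site and @target are assumptions, not existing symbols):
 * rewrite a 5-byte NOP into a relative CALL while the constraints from the
 * comment above hold, i.e. nothing can execute or be preempted inside the
 * patched bytes.
 */
static void __init_or_module __maybe_unused example_poke_call_early(void *site, void *target)
{
	u8 insn[5] = { 0xe8, };		/* CALL rel32 */
	s32 disp = (s32)((long)target - (long)(site + sizeof(insn)));

	memcpy(insn + 1, &disp, sizeof(disp));
	text_poke_early(site, insn, sizeof(insn));
}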
1777
1778typedef struct {
1779 struct mm_struct *mm;
1780} temp_mm_state_t;
1781
1782/*
1783 * Using a temporary mm allows us to set temporary mappings that are not accessible
1784 * by other CPUs. Such mappings are needed to perform sensitive memory writes
1785 * that override the kernel memory protections (e.g., W^X), without exposing the
1786 * temporary page-table mappings that are required for these write operations to
1787 * other CPUs. Using a temporary mm also avoids TLB shootdowns when the
1788 * mapping is torn down.
1789 *
1790 * Context: The temporary mm needs to be used exclusively by a single core. To
1791 * harden security, IRQs must be disabled while the temporary mm is
1792 * loaded, thereby preventing interrupt handler bugs from overriding
1793 * the kernel memory protection.
1794 */
1795static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1796{
1797 temp_mm_state_t temp_state;
1798
1799 lockdep_assert_irqs_disabled();
1800
1801 /*
1802 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1803 * with a stale address space WITHOUT being in lazy mode after
1804 * restoring the previous mm.
1805 */
1806 if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1807 leave_mm();
1808
1809 temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1810 switch_mm_irqs_off(NULL, mm, current);
1811
1812 /*
1813 * If breakpoints are enabled, disable them while the temporary mm is
1814 * used. Userspace might set up watchpoints on addresses that are used
1815 * in the temporary mm, which would lead to wrong signals being sent or
1816 * crashes.
1817 *
1818 * Note that breakpoints are not disabled selectively, which also causes
1819 * kernel breakpoints (e.g., perf's) to be disabled. This might be
1820 * undesirable, but still seems reasonable as the code that runs in the
1821 * temporary mm should be short.
1822 */
1823 if (hw_breakpoint_active())
1824 hw_breakpoint_disable();
1825
1826 return temp_state;
1827}
1828
1829static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
1830{
1831 lockdep_assert_irqs_disabled();
1832 switch_mm_irqs_off(NULL, prev_state.mm, current);
1833
1834 /*
1835 * Restore the breakpoints if they were disabled before the temporary mm
1836 * was loaded.
1837 */
1838 if (hw_breakpoint_active())
1839 hw_breakpoint_restore();
1840}
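/*
 * Typical usage pattern of the two helpers above (sketch; __text_poke()
 * below is the real user):
 *
 *	local_irq_save(flags);
 *	... install PTEs for poking_addr in poking_mm ...
 *	prev = use_temporary_mm(poking_mm);
 *	... write through the temporary mapping ...
 *	unuse_temporary_mm(prev);
 *	... clear the PTEs and flush the TLB ...
 *	local_irq_restore(flags);
 */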
1841
1842__ro_after_init struct mm_struct *poking_mm;
1843__ro_after_init unsigned long poking_addr;
1844
1845static void text_poke_memcpy(void *dst, const void *src, size_t len)
1846{
1847 memcpy(dst, src, len);
1848}
1849
1850static void text_poke_memset(void *dst, const void *src, size_t len)
1851{
1852 int c = *(const int *)src;
1853
1854 memset(dst, c, len);
1855}
1856
1857typedef void text_poke_f(void *dst, const void *src, size_t len);
1858
1859static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
1860{
1861 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
1862 struct page *pages[2] = {NULL};
1863 temp_mm_state_t prev;
1864 unsigned long flags;
1865 pte_t pte, *ptep;
1866 spinlock_t *ptl;
1867 pgprot_t pgprot;
1868
1869 /*
1870 * While the boot memory allocator is running we cannot use struct pages as
1871 * they are not yet initialized. There is no way to recover.
1872 */
1873 BUG_ON(!after_bootmem);
1874
1875 if (!core_kernel_text((unsigned long)addr)) {
1876 pages[0] = vmalloc_to_page(addr);
1877 if (cross_page_boundary)
1878 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
1879 } else {
1880 pages[0] = virt_to_page(addr);
1881 WARN_ON(!PageReserved(pages[0]));
1882 if (cross_page_boundary)
1883 pages[1] = virt_to_page(addr + PAGE_SIZE);
1884 }
1885 /*
1886 * If something went wrong, crash and burn since recovery paths are not
1887 * implemented.
1888 */
1889 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
1890
1891 /*
1892 * Map the page without the global bit, as TLB flushing is done with
1893 * flush_tlb_mm_range(), which is intended for non-global PTEs.
1894 */
1895 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
1896
1897 /*
1898 * The lock is not really needed, but it lets us avoid open-coding the PTE lookup.
1899 */
1900 ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
1901
1902 /*
1903 * This must not fail; preallocated in poking_init().
1904 */
1905 VM_BUG_ON(!ptep);
1906
1907 local_irq_save(flags);
1908
1909 pte = mk_pte(pages[0], pgprot);
1910 set_pte_at(poking_mm, poking_addr, ptep, pte);
1911
1912 if (cross_page_boundary) {
1913 pte = mk_pte(pages[1], pgprot);
1914 set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
1915 }
1916
1917 /*
1918 * Loading the temporary mm behaves as a compiler barrier, which
1919 * guarantees that the PTE will have been set by the time memcpy() is performed.
1920 */
1921 prev = use_temporary_mm(poking_mm);
1922
1923 kasan_disable_current();
1924 func((u8 *)poking_addr + offset_in_page(addr), src, len);
1925 kasan_enable_current();
1926
1927 /*
1928 * Ensure that the PTE is only cleared after the instructions of memcpy
1929 * have been issued, by using a compiler barrier.
1930 */
1931 barrier();
1932
1933 pte_clear(poking_mm, poking_addr, ptep);
1934 if (cross_page_boundary)
1935 pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
1936
1937 /*
1938 * Loading the previous page-table hierarchy requires a serializing
1939 * instruction that already allows the core to see the updated version.
1940 * Xen-PV is assumed to serialize execution in a similar manner.
1941 */
1942 unuse_temporary_mm(prev);
1943
1944 /*
1945 * Flushing the TLB might involve IPIs, which would require enabled
1946 * IRQs, but none are needed here because the temporary mm is no longer in use.
1947 */
1948 flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
1949 (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
1950 PAGE_SHIFT, false);
1951
1952 if (func == text_poke_memcpy) {
1953 /*
1954 * If the text does not match what we just wrote then something is
1955 * fundamentally screwy; there's nothing we can really do about that.
1956 */
1957 BUG_ON(memcmp(addr, src, len));
1958 }
1959
1960 local_irq_restore(flags);
1961 pte_unmap_unlock(ptep, ptl);
1962 return addr;
1963}
1964
1965/**
1966 * text_poke - Update instructions on a live kernel
1967 * @addr: address to modify
1968 * @opcode: source of the copy
1969 * @len: length to copy
1970 *
1971 * Only atomic text poke/set should be allowed when not doing early patching.
1972 * It means the size must be writable atomically and the address must be aligned
1973 * in a way that permits an atomic write. It also makes sure we fit on a single
1974 * page.
1975 *
1976 * Note that the caller must ensure that if the modified code is part of a
1977 * module, the module would not be removed during poking. This can be achieved
1978 * by registering a module notifier, and ordering module removal and patching
1979 * through a mutex.
1980 */
1981void *text_poke(void *addr, const void *opcode, size_t len)
1982{
1983 lockdep_assert_held(&text_mutex);
1984
1985 return __text_poke(text_poke_memcpy, addr, opcode, len);
1986}
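/*
 * Sketch of the module-removal ordering described above (everything here is
 * hypothetical, not an existing interface): a subsystem serializes its own
 * patching against module teardown with a notifier plus a mutex (text_mutex
 * is used here for simplicity), so a module can never vanish in the middle
 * of a text_poke() on its code.
 */
#ifdef CONFIG_MODULES
static __maybe_unused int example_patch_module_notify(struct notifier_block *nb,
						      unsigned long action, void *data)
{
	if (action != MODULE_STATE_GOING)
		return NOTIFY_DONE;

	mutex_lock(&text_mutex);
	/* forget/undo any patch sites that live in the departing module */
	mutex_unlock(&text_mutex);

	return NOTIFY_DONE;
}
#endif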
1987
1988/**
1989 * text_poke_kgdb - Update instructions on a live kernel by kgdb
1990 * @addr: address to modify
1991 * @opcode: source of the copy
1992 * @len: length to copy
1993 *
1994 * Only atomic text poke/set should be allowed when not doing early patching.
1995 * It means the size must be writable atomically and the address must be aligned
1996 * in a way that permits an atomic write. It also makes sure we fit on a single
1997 * page.
1998 *
1999 * Context: should only be used by kgdb, which ensures no other core is running,
2000 * despite the fact it does not hold the text_mutex.
2001 */
2002void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
2003{
2004 return __text_poke(text_poke_memcpy, addr, opcode, len);
2005}
2006
2007void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
2008 bool core_ok)
2009{
2010 unsigned long start = (unsigned long)addr;
2011 size_t patched = 0;
2012
2013 if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
2014 return NULL;
2015
2016 while (patched < len) {
2017 unsigned long ptr = start + patched;
2018 size_t s;
2019
2020 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2021
2022 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
2023 patched += s;
2024 }
2025 return addr;
2026}
2027
2028/**
2029 * text_poke_copy - Copy instructions into (an unused part of) RX memory
2030 * @addr: address to modify
2031 * @opcode: source of the copy
2032 * @len: length to copy, could be more than 2x PAGE_SIZE
2033 *
2034 * Not safe against concurrent execution; useful for JITs to dump
2035 * new code blocks into unused regions of RX memory. Can be used in
2036 * conjunction with synchronize_rcu_tasks() to wait for existing
2037 * execution to quiesce after having made sure no existing function
2038 * pointers are live.
2039 */
2040void *text_poke_copy(void *addr, const void *opcode, size_t len)
2041{
2042 mutex_lock(&text_mutex);
2043 addr = text_poke_copy_locked(addr, opcode, len, false);
2044 mutex_unlock(&text_mutex);
2045 return addr;
2046}
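/*
 * Usage sketch (@image is an assumption: a pre-allocated RX region of at
 * least @len bytes): a JIT dumping a freshly generated body into place.
 */
static __maybe_unused void *example_jit_dump(void *image, const void *insns, size_t len)
{
	if (!text_poke_copy(image, insns, len))
		return NULL;

	/* per the comment above, pair with synchronize_rcu_tasks() before reusing old code */
	return image;
}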
2047
2048/**
2049 * text_poke_set - memset into (an unused part of) RX memory
2050 * @addr: address to modify
2051 * @c: the byte to fill the area with
2052 * @len: length to copy, could be more than 2x PAGE_SIZE
2053 *
2054 * This is useful to overwrite unused regions of RX memory with illegal
2055 * instructions.
2056 */
2057void *text_poke_set(void *addr, int c, size_t len)
2058{
2059 unsigned long start = (unsigned long)addr;
2060 size_t patched = 0;
2061
2062 if (WARN_ON_ONCE(core_kernel_text(start)))
2063 return NULL;
2064
2065 mutex_lock(&text_mutex);
2066 while (patched < len) {
2067 unsigned long ptr = start + patched;
2068 size_t s;
2069
2070 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2071
2072 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
2073 patched += s;
2074 }
2075 mutex_unlock(&text_mutex);
2076 return addr;
2077}
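/*
 * Usage sketch: poison a no-longer-used RX region with INT3 so that any stale
 * reference traps instead of executing leftover bytes (roughly what a JIT
 * might do when invalidating freed images).
 */
static __maybe_unused void example_poison_rx_region(void *addr, size_t len)
{
	text_poke_set(addr, 0xcc, len);		/* 0xcc == INT3 */
}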
2078
2079static void do_sync_core(void *info)
2080{
2081 sync_core();
2082}
2083
2084void text_poke_sync(void)
2085{
2086 on_each_cpu(do_sync_core, NULL, 1);
2087}
2088
2089/*
2090 * NOTE: a somewhat tricky scheme that allows patching Jcc.d32 without increasing
2091 * the size of struct text_poke_loc. When len == 6 everything is prefixed with 0x0f
2092 * and we map the opcode to its Jcc.d8 form, using len to distinguish the two.
2093 */
2094struct text_poke_loc {
2095 /* addr := _stext + rel_addr */
2096 s32 rel_addr;
2097 s32 disp;
2098 u8 len;
2099 u8 opcode;
2100 const u8 text[POKE_MAX_OPCODE_SIZE];
2101 /* see text_poke_bp_batch() */
2102 u8 old;
2103};
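/*
 * Worked example of the len == 6 scheme (illustrative): patching in
 * "jne .+0x1234", whose bytes are 0f 85 2e 12 00 00 (Jcc.d32), is stored as
 *
 *	tp->len    = 6
 *	tp->opcode = 0x75			(jne, in its Jcc.d8 form)
 *	tp->text   = { 85 2e 12 00 00 }		(leading 0x0f stripped)
 *
 * text_poke_loc_init() below strips the 0x0f and maps the opcode, and
 * text_poke_bp_batch() re-emits the 0x0f when the instruction is written out.
 */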
2104
2105struct bp_patching_desc {
2106 struct text_poke_loc *vec;
2107 int nr_entries;
2108 atomic_t refs;
2109};
2110
2111static struct bp_patching_desc bp_desc;
2112
2113static __always_inline
2114struct bp_patching_desc *try_get_desc(void)
2115{
2116 struct bp_patching_desc *desc = &bp_desc;
2117
2118 if (!raw_atomic_inc_not_zero(&desc->refs))
2119 return NULL;
2120
2121 return desc;
2122}
2123
2124static __always_inline void put_desc(void)
2125{
2126 struct bp_patching_desc *desc = &bp_desc;
2127
2128 smp_mb__before_atomic();
2129 raw_atomic_dec(&desc->refs);
2130}
2131
2132static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
2133{
2134 return _stext + tp->rel_addr;
2135}
2136
2137static __always_inline int patch_cmp(const void *key, const void *elt)
2138{
2139 struct text_poke_loc *tp = (struct text_poke_loc *) elt;
2140
2141 if (key < text_poke_addr(tp))
2142 return -1;
2143 if (key > text_poke_addr(tp))
2144 return 1;
2145 return 0;
2146}
2147
2148noinstr int poke_int3_handler(struct pt_regs *regs)
2149{
2150 struct bp_patching_desc *desc;
2151 struct text_poke_loc *tp;
2152 int ret = 0;
2153 void *ip;
2154
2155 if (user_mode(regs))
2156 return 0;
2157
2158 /*
2159 * Having observed our INT3 instruction, we now must observe
2160 * bp_desc with non-zero refcount:
2161 *
2162 * bp_desc.refs = 1 INT3
2163 * WMB RMB
2164 * write INT3 if (bp_desc.refs != 0)
2165 */
2166 smp_rmb();
2167
2168 desc = try_get_desc();
2169 if (!desc)
2170 return 0;
2171
2172 /*
2173 * Discount the INT3. See text_poke_bp_batch().
2174 */
2175 ip = (void *) regs->ip - INT3_INSN_SIZE;
2176
2177 /*
2178 * Skip the binary search if there is a single member in the vector.
2179 */
2180 if (unlikely(desc->nr_entries > 1)) {
2181 tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
2182 sizeof(struct text_poke_loc),
2183 patch_cmp);
2184 if (!tp)
2185 goto out_put;
2186 } else {
2187 tp = desc->vec;
2188 if (text_poke_addr(tp) != ip)
2189 goto out_put;
2190 }
2191
2192 ip += tp->len;
2193
2194 switch (tp->opcode) {
2195 case INT3_INSN_OPCODE:
2196 /*
2197 * Someone poked an explicit INT3; they'll want to handle it,
2198 * so do not consume it here.
2199 */
2200 goto out_put;
2201
2202 case RET_INSN_OPCODE:
2203 int3_emulate_ret(regs);
2204 break;
2205
2206 case CALL_INSN_OPCODE:
2207 int3_emulate_call(regs, (long)ip + tp->disp);
2208 break;
2209
2210 case JMP32_INSN_OPCODE:
2211 case JMP8_INSN_OPCODE:
2212 int3_emulate_jmp(regs, (long)ip + tp->disp);
2213 break;
2214
2215 case 0x70 ... 0x7f: /* Jcc */
2216 int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
2217 break;
2218
2219 default:
2220 BUG();
2221 }
2222
2223 ret = 1;
2224
2225out_put:
2226 put_desc();
2227 return ret;
2228}
2229
2230#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
2231static struct text_poke_loc tp_vec[TP_VEC_MAX];
2232static int tp_vec_nr;
2233
2234/**
2235 * text_poke_bp_batch() -- update instructions on live kernel on SMP
2236 * @tp: vector of instructions to patch
2237 * @nr_entries: number of entries in the vector
2238 *
2239 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
2240 * We completely avoid stop_machine() here, and achieve the
2241 * synchronization using an int3 breakpoint.
2242 *
2243 * The way it is done:
2244 * - For each entry in the vector:
2245 * - add an int3 trap to the address that will be patched
2246 * - sync cores
2247 * - For each entry in the vector:
2248 * - update all but the first byte of the patched range
2249 * - sync cores
2250 * - For each entry in the vector:
2251 * - replace the first byte (int3) with the first byte of
2252 * the replacement opcode
2253 * - sync cores
2254 */
2255static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
2256{
2257 unsigned char int3 = INT3_INSN_OPCODE;
2258 unsigned int i;
2259 int do_sync;
2260
2261 lockdep_assert_held(&text_mutex);
2262
2263 bp_desc.vec = tp;
2264 bp_desc.nr_entries = nr_entries;
2265
2266 /*
2267 * Corresponds to the implicit memory barrier in try_get_desc() to
2268 * ensure reading a non-zero refcount provides up to date bp_desc data.
2269 */
2270 atomic_set_release(&bp_desc.refs, 1);
2271
2272 /*
2273 * Function tracing can enable thousands of places that need to be
2274 * updated. This can take quite some time, and with full kernel debugging
2275 * enabled, this could cause the softlockup watchdog to trigger.
2276 * This function gets called after every 256 entries have been queued for patching.
2277 * Call cond_resched() here to make sure that other tasks can get scheduled
2278 * while processing all the functions being patched.
2279 */
2280 cond_resched();
2281
2282 /*
2283 * Corresponding read barrier in poke_int3_handler() for making sure the
2284 * nr_entries and handler are correctly ordered wrt. patching.
2285 */
2286 smp_wmb();
2287
2288 /*
2289 * First step: add an int3 trap to the address that will be patched.
2290 */
2291 for (i = 0; i < nr_entries; i++) {
2292 tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
2293 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
2294 }
2295
2296 text_poke_sync();
2297
2298 /*
2299 * Second step: update all but the first byte of the patched range.
2300 */
2301 for (do_sync = 0, i = 0; i < nr_entries; i++) {
2302 u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
2303 u8 _new[POKE_MAX_OPCODE_SIZE+1];
2304 const u8 *new = tp[i].text;
2305 int len = tp[i].len;
2306
2307 if (len - INT3_INSN_SIZE > 0) {
2308 memcpy(old + INT3_INSN_SIZE,
2309 text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2310 len - INT3_INSN_SIZE);
2311
2312 if (len == 6) {
2313 _new[0] = 0x0f;
2314 memcpy(_new + 1, new, 5);
2315 new = _new;
2316 }
2317
2318 text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2319 new + INT3_INSN_SIZE,
2320 len - INT3_INSN_SIZE);
2321
2322 do_sync++;
2323 }
2324
2325 /*
2326 * Emit a perf event to record the text poke, primarily to
2327 * support Intel PT decoding which must walk the executable code
2328 * to reconstruct the trace. The flow up to here is:
2329 * - write INT3 byte
2330 * - IPI-SYNC
2331 * - write instruction tail
2332 * At this point the actual control flow will be through the
2333 * INT3 and handler and not hit the old or new instruction.
2334 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
2335 * can still be decoded. Subsequently:
2336 * - emit RECORD_TEXT_POKE with the new instruction
2337 * - IPI-SYNC
2338 * - write first byte
2339 * - IPI-SYNC
2340 * So before the text poke event timestamp, the decoder will see
2341 * either the old instruction flow or FUP/TIP of INT3. After the
2342 * text poke event timestamp, the decoder will see either the
2343 * new instruction flow or FUP/TIP of INT3. Thus decoders can
2344 * use the timestamp as the point at which to modify the
2345 * executable code.
2346 * The old instruction is recorded so that the event can be
2347 * processed forwards or backwards.
2348 */
2349 perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
2350 }
2351
2352 if (do_sync) {
2353 /*
2354 * According to Intel, this core syncing is very likely
2355 * not necessary and we'd be safe even without it. But
2356 * better safe than sorry (plus there's not only Intel).
2357 */
2358 text_poke_sync();
2359 }
2360
2361 /*
2362 * Third step: replace the first byte (int3) with the first byte of
2363 * the replacement opcode.
2364 */
2365 for (do_sync = 0, i = 0; i < nr_entries; i++) {
2366 u8 byte = tp[i].text[0];
2367
2368 if (tp[i].len == 6)
2369 byte = 0x0f;
2370
2371 if (byte == INT3_INSN_OPCODE)
2372 continue;
2373
2374 text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
2375 do_sync++;
2376 }
2377
2378 if (do_sync)
2379 text_poke_sync();
2380
2381 /*
2382 * Remove and wait for refs to be zero.
2383 */
2384 if (!atomic_dec_and_test(&bp_desc.refs))
2385 atomic_cond_read_acquire(&bp_desc.refs, !VAL);
2386}
2387
2388static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
2389 const void *opcode, size_t len, const void *emulate)
2390{
2391 struct insn insn;
2392 int ret, i = 0;
2393
2394 if (len == 6)
2395 i = 1;
2396 memcpy((void *)tp->text, opcode+i, len-i);
2397 if (!emulate)
2398 emulate = opcode;
2399
2400 ret = insn_decode_kernel(&insn, emulate);
2401 BUG_ON(ret < 0);
2402
2403 tp->rel_addr = addr - (void *)_stext;
2404 tp->len = len;
2405 tp->opcode = insn.opcode.bytes[0];
2406
2407 if (is_jcc32(&insn)) {
2408 /*
2409 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
2410 */
2411 tp->opcode = insn.opcode.bytes[1] - 0x10;
2412 }
2413
2414 switch (tp->opcode) {
2415 case RET_INSN_OPCODE:
2416 case JMP32_INSN_OPCODE:
2417 case JMP8_INSN_OPCODE:
2418 /*
2419 * Control flow instructions without implied execution of the
2420 * next instruction can be padded with INT3.
2421 */
2422 for (i = insn.length; i < len; i++)
2423 BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
2424 break;
2425
2426 default:
2427 BUG_ON(len != insn.length);
2428 }
2429
2430 switch (tp->opcode) {
2431 case INT3_INSN_OPCODE:
2432 case RET_INSN_OPCODE:
2433 break;
2434
2435 case CALL_INSN_OPCODE:
2436 case JMP32_INSN_OPCODE:
2437 case JMP8_INSN_OPCODE:
2438 case 0x70 ... 0x7f: /* Jcc */
2439 tp->disp = insn.immediate.value;
2440 break;
2441
2442 default: /* assume NOP */
2443 switch (len) {
2444 case 2: /* NOP2 -- emulate as JMP8+0 */
2445 BUG_ON(memcmp(emulate, x86_nops[len], len));
2446 tp->opcode = JMP8_INSN_OPCODE;
2447 tp->disp = 0;
2448 break;
2449
2450 case 5: /* NOP5 -- emulate as JMP32+0 */
2451 BUG_ON(memcmp(emulate, x86_nops[len], len));
2452 tp->opcode = JMP32_INSN_OPCODE;
2453 tp->disp = 0;
2454 break;
2455
2456 default: /* unknown instruction */
2457 BUG();
2458 }
2459 break;
2460 }
2461}
2462
2463/*
2464 * We rely heavily on the tp_vec being ordered; ensure this is so by flushing
2465 * early if needed.
2466 */
2467static bool tp_order_fail(void *addr)
2468{
2469 struct text_poke_loc *tp;
2470
2471 if (!tp_vec_nr)
2472 return false;
2473
2474 if (!addr) /* force */
2475 return true;
2476
2477 tp = &tp_vec[tp_vec_nr - 1];
2478 if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
2479 return true;
2480
2481 return false;
2482}
2483
2484static void text_poke_flush(void *addr)
2485{
2486 if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
2487 text_poke_bp_batch(tp_vec, tp_vec_nr);
2488 tp_vec_nr = 0;
2489 }
2490}
2491
2492void text_poke_finish(void)
2493{
2494 text_poke_flush(NULL);
2495}
2496
2497void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
2498{
2499 struct text_poke_loc *tp;
2500
2501 text_poke_flush(addr);
2502
2503 tp = &tp_vec[tp_vec_nr++];
2504 text_poke_loc_init(tp, addr, opcode, len, emulate);
2505}
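/*
 * Batching usage sketch (sites/insns/lens are assumptions): queue the sites
 * in ascending address order (see tp_order_fail() above) and flush once, so
 * the whole batch costs a constant number of IPI syncs.
 */
static __maybe_unused void example_patch_batch(void **sites, const void **insns,
					       const size_t *lens, int nr)
{
	int i;

	lockdep_assert_held(&text_mutex);

	for (i = 0; i < nr; i++)
		text_poke_queue(sites[i], insns[i], lens[i], NULL);
	text_poke_finish();
}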
2506
2507/**
2508 * text_poke_bp() -- update instructions on live kernel on SMP
2509 * @addr: address to patch
2510 * @opcode: opcode of new instruction
2511 * @len: length to copy
2512 * @emulate: instruction to be emulated
2513 *
2514 * Update a single instruction with the vector on the stack, avoiding
2515 * dynamically allocated memory. This function should be used when it is
2516 * not possible to allocate memory.
2517 */
2518void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
2519{
2520 struct text_poke_loc tp;
2521
2522 text_poke_loc_init(&tp, addr, opcode, len, emulate);
2523 text_poke_bp_batch(&tp, 1);
2524}
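/*
 * Minimal usage sketch (@site and @target are assumptions): turn a 5-byte NOP
 * at @site into "call target" while other CPUs keep running, e.g. the way the
 * static-call machinery retargets its trampolines.
 */
static __maybe_unused void example_poke_call_live(void *site, void *target)
{
	u8 call[5] = { 0xe8, };		/* CALL rel32 */
	s32 disp = (s32)((long)target - (long)(site + sizeof(call)));

	memcpy(call + 1, &disp, sizeof(disp));

	mutex_lock(&text_mutex);
	text_poke_bp(site, call, sizeof(call), NULL);
	mutex_unlock(&text_mutex);
}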