1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/list.h>
5#include <linux/stringify.h>
6#include <linux/kprobes.h>
7#include <linux/mm.h>
8#include <linux/vmalloc.h>
9#include <linux/memory.h>
10#include <linux/stop_machine.h>
11#include <linux/slab.h>
12#include <asm/alternative.h>
13#include <asm/sections.h>
14#include <asm/pgtable.h>
15#include <asm/mce.h>
16#include <asm/nmi.h>
17#include <asm/cacheflush.h>
18#include <asm/tlbflush.h>
19#include <asm/io.h>
20#include <asm/fixmap.h>
21
22#define MAX_PATCH_LEN (255-1)
23
24#ifdef CONFIG_HOTPLUG_CPU
25static int smp_alt_once;
26
27static int __init bootonly(char *str)
28{
29 smp_alt_once = 1;
30 return 1;
31}
32__setup("smp-alt-boot", bootonly);
33#else
34#define smp_alt_once 1
35#endif
36
37static int __initdata_or_module debug_alternative;
38
39static int __init debug_alt(char *str)
40{
41 debug_alternative = 1;
42 return 1;
43}
44__setup("debug-alternative", debug_alt);
45
46static int noreplace_smp;
47
48static int __init setup_noreplace_smp(char *str)
49{
50 noreplace_smp = 1;
51 return 1;
52}
53__setup("noreplace-smp", setup_noreplace_smp);
54
55#ifdef CONFIG_PARAVIRT
56static int __initdata_or_module noreplace_paravirt = 0;
57
58static int __init setup_noreplace_paravirt(char *str)
59{
60 noreplace_paravirt = 1;
61 return 1;
62}
63__setup("noreplace-paravirt", setup_noreplace_paravirt);
64#endif
65
66#define DPRINTK(fmt, args...) if (debug_alternative) \
67 printk(KERN_DEBUG fmt, args)
68
69/*
70 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
71 * that correspond to that nop. Getting from one nop to the next, we
72 * add to the array the offset that is equal to the sum of all sizes of
73 * nops preceding the one we are after.
74 *
75 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
76 * nice symmetry of sizes of the previous nops.
77 */
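/*
 * For illustration: because the GENERIC_NOPn byte strings below are laid
 * out back to back, the pointer-table entry for an n-byte nop starts at the
 * sum of the preceding sizes, e.g. intel_nops[3] == intelnops + 1 + 2 points
 * at GENERIC_NOP3, while index ASM_NOP_MAX + 1 points at the trailing
 * GENERIC_NOP5_ATOMIC.
 */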
78#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
79static const unsigned char intelnops[] =
80{
81 GENERIC_NOP1,
82 GENERIC_NOP2,
83 GENERIC_NOP3,
84 GENERIC_NOP4,
85 GENERIC_NOP5,
86 GENERIC_NOP6,
87 GENERIC_NOP7,
88 GENERIC_NOP8,
89 GENERIC_NOP5_ATOMIC
90};
91static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
92{
93 NULL,
94 intelnops,
95 intelnops + 1,
96 intelnops + 1 + 2,
97 intelnops + 1 + 2 + 3,
98 intelnops + 1 + 2 + 3 + 4,
99 intelnops + 1 + 2 + 3 + 4 + 5,
100 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
101 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
102 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
103};
104#endif
105
106#ifdef K8_NOP1
107static const unsigned char k8nops[] =
108{
109 K8_NOP1,
110 K8_NOP2,
111 K8_NOP3,
112 K8_NOP4,
113 K8_NOP5,
114 K8_NOP6,
115 K8_NOP7,
116 K8_NOP8,
117 K8_NOP5_ATOMIC
118};
119static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
120{
121 NULL,
122 k8nops,
123 k8nops + 1,
124 k8nops + 1 + 2,
125 k8nops + 1 + 2 + 3,
126 k8nops + 1 + 2 + 3 + 4,
127 k8nops + 1 + 2 + 3 + 4 + 5,
128 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
129 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
130 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
131};
132#endif
133
134#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
135static const unsigned char k7nops[] =
136{
137 K7_NOP1,
138 K7_NOP2,
139 K7_NOP3,
140 K7_NOP4,
141 K7_NOP5,
142 K7_NOP6,
143 K7_NOP7,
144 K7_NOP8,
145 K7_NOP5_ATOMIC
146};
147static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
148{
149 NULL,
150 k7nops,
151 k7nops + 1,
152 k7nops + 1 + 2,
153 k7nops + 1 + 2 + 3,
154 k7nops + 1 + 2 + 3 + 4,
155 k7nops + 1 + 2 + 3 + 4 + 5,
156 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
157 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
158 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
159};
160#endif
161
162#ifdef P6_NOP1
163static const unsigned char p6nops[] =
164{
165 P6_NOP1,
166 P6_NOP2,
167 P6_NOP3,
168 P6_NOP4,
169 P6_NOP5,
170 P6_NOP6,
171 P6_NOP7,
172 P6_NOP8,
173 P6_NOP5_ATOMIC
174};
175static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
176{
177 NULL,
178 p6nops,
179 p6nops + 1,
180 p6nops + 1 + 2,
181 p6nops + 1 + 2 + 3,
182 p6nops + 1 + 2 + 3 + 4,
183 p6nops + 1 + 2 + 3 + 4 + 5,
184 p6nops + 1 + 2 + 3 + 4 + 5 + 6,
185 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
186 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
187};
188#endif
189
190/* Initialize these to a safe default */
191#ifdef CONFIG_X86_64
192const unsigned char * const *ideal_nops = p6_nops;
193#else
194const unsigned char * const *ideal_nops = intel_nops;
195#endif
196
197void __init arch_init_ideal_nops(void)
198{
199 switch (boot_cpu_data.x86_vendor) {
200 case X86_VENDOR_INTEL:
201 /*
202 * Due to a decoder implementation quirk, some
203 * specific Intel CPUs actually perform better with
204 * the "k8_nops" than with the SDM-recommended NOPs.
205 */
206 if (boot_cpu_data.x86 == 6 &&
207 boot_cpu_data.x86_model >= 0x0f &&
208 boot_cpu_data.x86_model != 0x1c &&
209 boot_cpu_data.x86_model != 0x26 &&
210 boot_cpu_data.x86_model != 0x27 &&
211 boot_cpu_data.x86_model < 0x30) {
212 ideal_nops = k8_nops;
213 } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
214 ideal_nops = p6_nops;
215 } else {
216#ifdef CONFIG_X86_64
217 ideal_nops = k8_nops;
218#else
219 ideal_nops = intel_nops;
220#endif
221 }
222 break;
223 default:
224#ifdef CONFIG_X86_64
225 ideal_nops = k8_nops;
226#else
227 if (boot_cpu_has(X86_FEATURE_K8))
228 ideal_nops = k8_nops;
229 else if (boot_cpu_has(X86_FEATURE_K7))
230 ideal_nops = k7_nops;
231 else
232 ideal_nops = intel_nops;
233#endif
234 }
235}
236
237/* Use this to add nops to a buffer, then text_poke the whole buffer. */
238static void __init_or_module add_nops(void *insns, unsigned int len)
239{
240 while (len > 0) {
241 unsigned int noplen = len;
242 if (noplen > ASM_NOP_MAX)
243 noplen = ASM_NOP_MAX;
244 memcpy(insns, ideal_nops[noplen], noplen);
245 insns += noplen;
246 len -= noplen;
247 }
248}
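/*
 * Illustrative example: padding an 11-byte hole with add_nops() copies
 * ideal_nops[8] followed by ideal_nops[3], i.e. one 8-byte nop plus one
 * 3-byte nop, rather than eleven single-byte 0x90 nops.
 */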
249
250extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
251extern s32 __smp_locks[], __smp_locks_end[];
252void *text_poke_early(void *addr, const void *opcode, size_t len);
253
254/* Replace instructions with better alternatives for this CPU type.
255 This runs before SMP is initialized to avoid SMP problems with
256 self modifying code. This implies that asymmetric systems where
257   APs have fewer capabilities than the boot processor are not handled.
258 Tough. Make sure you disable such features by hand. */
259
260void __init_or_module apply_alternatives(struct alt_instr *start,
261 struct alt_instr *end)
262{
263 struct alt_instr *a;
264 u8 *instr, *replacement;
265 u8 insnbuf[MAX_PATCH_LEN];
266
267 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
268 /*
269 * The scan order should be from start to end. A later scanned
270 * alternative code can overwrite a previously scanned alternative code.
271 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
272 * patch code.
273 *
274 * So be careful if you want to change the scan order to any other
275 * order.
276 */
277 for (a = start; a < end; a++) {
278 instr = (u8 *)&a->instr_offset + a->instr_offset;
279 replacement = (u8 *)&a->repl_offset + a->repl_offset;
280 BUG_ON(a->replacementlen > a->instrlen);
281 BUG_ON(a->instrlen > sizeof(insnbuf));
282 BUG_ON(a->cpuid >= NCAPINTS*32);
283 if (!boot_cpu_has(a->cpuid))
284 continue;
285
286 memcpy(insnbuf, replacement, a->replacementlen);
287
288 /* 0xe8 is a relative call; fix the offset. */
289 if (*insnbuf == 0xe8 && a->replacementlen == 5)
290 *(s32 *)(insnbuf + 1) += replacement - instr;
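		/*
		 * Note: the call displacement is relative to the next
		 * instruction, so after moving the 5-byte insn from
		 * @replacement to @instr the new displacement is
		 * old + (replacement - instr), which is the adjustment above.
		 */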
291
292 add_nops(insnbuf + a->replacementlen,
293 a->instrlen - a->replacementlen);
294
295 text_poke_early(instr, insnbuf, a->instrlen);
296 }
297}
298
299#ifdef CONFIG_SMP
300
301static void alternatives_smp_lock(const s32 *start, const s32 *end,
302 u8 *text, u8 *text_end)
303{
304 const s32 *poff;
305
306 mutex_lock(&text_mutex);
307 for (poff = start; poff < end; poff++) {
308 u8 *ptr = (u8 *)poff + *poff;
309
310 if (!*poff || ptr < text || ptr >= text_end)
311 continue;
312 /* turn DS segment override prefix into lock prefix */
313 if (*ptr == 0x3e)
314 text_poke(ptr, ((unsigned char []){0xf0}), 1);
315 }
316 mutex_unlock(&text_mutex);
317}
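/*
 * Illustrative example: each smp_locks entry points at the one-byte prefix
 * of a lockable instruction, so switching modes is a single-byte swap,
 * e.g. "3e 01 0a" (ds-prefixed add) becomes "f0 01 0a" (lock add) here,
 * and alternatives_smp_unlock() performs the reverse.
 */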
318
319static void alternatives_smp_unlock(const s32 *start, const s32 *end,
320 u8 *text, u8 *text_end)
321{
322 const s32 *poff;
323
324 if (noreplace_smp)
325 return;
326
327 mutex_lock(&text_mutex);
328 for (poff = start; poff < end; poff++) {
329 u8 *ptr = (u8 *)poff + *poff;
330
331 if (!*poff || ptr < text || ptr >= text_end)
332 continue;
333 /* turn lock prefix into DS segment override prefix */
334 if (*ptr == 0xf0)
335 text_poke(ptr, ((unsigned char []){0x3E}), 1);
336 }
337 mutex_unlock(&text_mutex);
338}
339
340struct smp_alt_module {
341 /* module that owns these lock prefixes (NULL for the core kernel) */
342 struct module *mod;
343 char *name;
344
345 /* ptrs to lock prefixes */
346 const s32 *locks;
347 const s32 *locks_end;
348
349 /* .text segment, needed to avoid patching init code ;) */
350 u8 *text;
351 u8 *text_end;
352
353 struct list_head next;
354};
355static LIST_HEAD(smp_alt_modules);
356static DEFINE_MUTEX(smp_alt);
357static int smp_mode = 1; /* protected by smp_alt */
358
359void __init_or_module alternatives_smp_module_add(struct module *mod,
360 char *name,
361 void *locks, void *locks_end,
362 void *text, void *text_end)
363{
364 struct smp_alt_module *smp;
365
366 if (noreplace_smp)
367 return;
368
369 if (smp_alt_once) {
370 if (boot_cpu_has(X86_FEATURE_UP))
371 alternatives_smp_unlock(locks, locks_end,
372 text, text_end);
373 return;
374 }
375
376 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
377 if (NULL == smp)
378 return; /* we'll run the (safe but slow) SMP code then ... */
379
380 smp->mod = mod;
381 smp->name = name;
382 smp->locks = locks;
383 smp->locks_end = locks_end;
384 smp->text = text;
385 smp->text_end = text_end;
386 DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
387 __func__, smp->locks, smp->locks_end,
388 smp->text, smp->text_end, smp->name);
389
390 mutex_lock(&smp_alt);
391 list_add_tail(&smp->next, &smp_alt_modules);
392 if (boot_cpu_has(X86_FEATURE_UP))
393 alternatives_smp_unlock(smp->locks, smp->locks_end,
394 smp->text, smp->text_end);
395 mutex_unlock(&smp_alt);
396}
397
398void __init_or_module alternatives_smp_module_del(struct module *mod)
399{
400 struct smp_alt_module *item;
401
402 if (smp_alt_once || noreplace_smp)
403 return;
404
405 mutex_lock(&smp_alt);
406 list_for_each_entry(item, &smp_alt_modules, next) {
407 if (mod != item->mod)
408 continue;
409 list_del(&item->next);
410 mutex_unlock(&smp_alt);
411 DPRINTK("%s: %s\n", __func__, item->name);
412 kfree(item);
413 return;
414 }
415 mutex_unlock(&smp_alt);
416}
417
418bool skip_smp_alternatives;
419void alternatives_smp_switch(int smp)
420{
421 struct smp_alt_module *mod;
422
423#ifdef CONFIG_LOCKDEP
424 /*
425 * Older binutils section handling bug prevented
426 * alternatives-replacement from working reliably.
427 *
428 * If this still occurs then you should see a hang
429 * or crash shortly after this line:
430 */
431 printk("lockdep: fixing up alternatives.\n");
432#endif
433
434 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
435 return;
436 BUG_ON(!smp && (num_online_cpus() > 1));
437
438 mutex_lock(&smp_alt);
439
440 /*
441 * Avoid unnecessary switches because it forces JIT based VMs to
442 * throw away all cached translations, which can be quite costly.
443 */
444 if (smp == smp_mode) {
445 /* nothing */
446 } else if (smp) {
447 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
448 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
449 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
450 list_for_each_entry(mod, &smp_alt_modules, next)
451 alternatives_smp_lock(mod->locks, mod->locks_end,
452 mod->text, mod->text_end);
453 } else {
454 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
455 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
456 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
457 list_for_each_entry(mod, &smp_alt_modules, next)
458 alternatives_smp_unlock(mod->locks, mod->locks_end,
459 mod->text, mod->text_end);
460 }
461 smp_mode = smp;
462 mutex_unlock(&smp_alt);
463}
464
465/* Return 1 if the address range is reserved for smp-alternatives */
466int alternatives_text_reserved(void *start, void *end)
467{
468 struct smp_alt_module *mod;
469 const s32 *poff;
470 u8 *text_start = start;
471 u8 *text_end = end;
472
473 list_for_each_entry(mod, &smp_alt_modules, next) {
474 if (mod->text > text_end || mod->text_end < text_start)
475 continue;
476 for (poff = mod->locks; poff < mod->locks_end; poff++) {
477 const u8 *ptr = (const u8 *)poff + *poff;
478
479 if (text_start <= ptr && text_end > ptr)
480 return 1;
481 }
482 }
483
484 return 0;
485}
486#endif
487
488#ifdef CONFIG_PARAVIRT
489void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
490 struct paravirt_patch_site *end)
491{
492 struct paravirt_patch_site *p;
493 char insnbuf[MAX_PATCH_LEN];
494
495 if (noreplace_paravirt)
496 return;
497
498 for (p = start; p < end; p++) {
499 unsigned int used;
500
501 BUG_ON(p->len > MAX_PATCH_LEN);
502 /* prep the buffer with the original instructions */
503 memcpy(insnbuf, p->instr, p->len);
504 used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
505 (unsigned long)p->instr, p->len);
506
507 BUG_ON(used > p->len);
508
509 /* Pad the rest with nops */
510 add_nops(insnbuf + used, p->len - used);
511 text_poke_early(p->instr, insnbuf, p->len);
512 }
513}
514extern struct paravirt_patch_site __start_parainstructions[],
515 __stop_parainstructions[];
516#endif /* CONFIG_PARAVIRT */
517
518void __init alternative_instructions(void)
519{
520 /* The patching is not fully atomic, so try to avoid local interruptions
521 that might execute the code being patched.
522 Other CPUs are not running. */
523 stop_nmi();
524
525 /*
526 * Don't stop machine check exceptions while patching.
527 * MCEs only happen when something got corrupted and in this
528 * case we must do something about the corruption.
529 * Ignoring it is worse than an unlikely patching race.
530 * Also machine checks tend to be broadcast and if one CPU
531 * goes into machine check the others follow quickly, so we don't
532 * expect a machine check to cause undue problems during code
533 * patching.
534 */
535
536 apply_alternatives(__alt_instructions, __alt_instructions_end);
537
538 /* switch to patch-once-at-boottime-only mode and free the
539 * tables in case we know the number of CPUs will never ever
540 * change */
541#ifdef CONFIG_HOTPLUG_CPU
542 if (num_possible_cpus() < 2)
543 smp_alt_once = 1;
544#endif
545
546#ifdef CONFIG_SMP
547 if (smp_alt_once) {
548 if (1 == num_possible_cpus()) {
549 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
550 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
551 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
552
553 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
554 _text, _etext);
555 }
556 } else {
557 alternatives_smp_module_add(NULL, "core kernel",
558 __smp_locks, __smp_locks_end,
559 _text, _etext);
560
561 /* Only switch to UP mode if we don't immediately boot others */
562 if (num_present_cpus() == 1 || setup_max_cpus <= 1)
563 alternatives_smp_switch(0);
564 }
565#endif
566 apply_paravirt(__parainstructions, __parainstructions_end);
567
568 if (smp_alt_once)
569 free_init_pages("SMP alternatives",
570 (unsigned long)__smp_locks,
571 (unsigned long)__smp_locks_end);
572
573 restart_nmi();
574}
575
576/**
577 * text_poke_early - Update instructions on a live kernel at boot time
578 * @addr: address to modify
579 * @opcode: source of the copy
580 * @len: length to copy
581 *
582 * When you use this code to patch more than one byte of an instruction
583 * you need to make sure that other CPUs cannot execute this code in parallel.
584 * Also no thread must be currently preempted in the middle of these
585 * instructions. And on the local CPU you need to be protected against NMI or MCE
586 * handlers seeing an inconsistent instruction while you patch.
587 */
588void *__init_or_module text_poke_early(void *addr, const void *opcode,
589 size_t len)
590{
591 unsigned long flags;
592 local_irq_save(flags);
593 memcpy(addr, opcode, len);
594 sync_core();
595 local_irq_restore(flags);
596 /* Could also do a CLFLUSH here to speed up CPU recovery; but
597 that causes hangs on some VIA CPUs. */
598 return addr;
599}
600
601/**
602 * text_poke - Update instructions on a live kernel
603 * @addr: address to modify
604 * @opcode: source of the copy
605 * @len: length to copy
606 *
607 * Only atomic text poke/set should be allowed when not doing early patching.
608 * It means the size must be writable atomically and the address must be aligned
609 * in a way that permits an atomic write. It also makes sure we fit on a single
610 * page.
611 *
612 * Note: Must be called under text_mutex.
613 */
614void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
615{
616 unsigned long flags;
617 char *vaddr;
618 struct page *pages[2];
619 int i;
620
621 if (!core_kernel_text((unsigned long)addr)) {
622 pages[0] = vmalloc_to_page(addr);
623 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
624 } else {
625 pages[0] = virt_to_page(addr);
626 WARN_ON(!PageReserved(pages[0]));
627 pages[1] = virt_to_page(addr + PAGE_SIZE);
628 }
629 BUG_ON(!pages[0]);
630 local_irq_save(flags);
631 set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
632 if (pages[1])
633 set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
634 vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
635 memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
636 clear_fixmap(FIX_TEXT_POKE0);
637 if (pages[1])
638 clear_fixmap(FIX_TEXT_POKE1);
639 local_flush_tlb();
640 sync_core();
641 /* Could also do a CLFLUSH here to speed up CPU recovery; but
642 that causes hangs on some VIA CPUs. */
643 for (i = 0; i < len; i++)
644 BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
645 local_irq_restore(flags);
646 return addr;
647}
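/*
 * Note: text_poke() writes through a temporary FIX_TEXT_POKE* alias rather
 * than the kernel text mapping, so .text can remain read-only; the second
 * fixmap slot is only needed when the patched range straddles a page
 * boundary.
 */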
648
649/*
650 * Cross-modifying kernel text with stop_machine().
651 * This code originally comes from immediate value.
652 */
653static atomic_t stop_machine_first;
654static int wrote_text;
655
656struct text_poke_params {
657 struct text_poke_param *params;
658 int nparams;
659};
660
661static int __kprobes stop_machine_text_poke(void *data)
662{
663 struct text_poke_params *tpp = data;
664 struct text_poke_param *p;
665 int i;
666
667 if (atomic_dec_and_test(&stop_machine_first)) {
668 for (i = 0; i < tpp->nparams; i++) {
669 p = &tpp->params[i];
670 text_poke(p->addr, p->opcode, p->len);
671 }
672 smp_wmb(); /* Make sure other cpus see that this has run */
673 wrote_text = 1;
674 } else {
675 while (!wrote_text)
676 cpu_relax();
677 smp_mb(); /* Load wrote_text before following execution */
678 }
679
680 for (i = 0; i < tpp->nparams; i++) {
681 p = &tpp->params[i];
682 flush_icache_range((unsigned long)p->addr,
683 (unsigned long)p->addr + p->len);
684 }
685 /*
686 * Intel Architecture Software Developer's Manual section 7.1.3 specifies
687 * that a core serializing instruction such as "cpuid" should be
688 * executed on _each_ core before the new instruction is made visible.
689 */
690 sync_core();
691 return 0;
692}
693
694/**
695 * text_poke_smp - Update instructions on a live kernel on SMP
696 * @addr: address to modify
697 * @opcode: source of the copy
698 * @len: length to copy
699 *
700 * Modify multi-byte instruction by using stop_machine() on SMP. This allows
701 * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
702 * should be allowed, since stop_machine() does _not_ protect code against
703 * NMI and MCE.
704 *
705 * Note: Must be called under get_online_cpus() and text_mutex.
706 */
707void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
708{
709 struct text_poke_params tpp;
710 struct text_poke_param p;
711
712 p.addr = addr;
713 p.opcode = opcode;
714 p.len = len;
715 tpp.params = &p;
716 tpp.nparams = 1;
717 atomic_set(&stop_machine_first, 1);
718 wrote_text = 0;
719 /* Use __stop_machine() because the caller already got online_cpus. */
720 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
721 return addr;
722}
723
724/**
725 * text_poke_smp_batch - Update instructions on a live kernel on SMP
726 * @params: an array of text_poke parameters
727 * @n: the number of elements in params.
728 *
729 * Modify multi-byte instruction by using stop_machine() on SMP. Since the
730 * stop_machine() is heavy task, it is better to aggregate text_poke requests
731 * stop_machine() is a heavy task, it is better to aggregate text_poke requests
732 *
733 * Note: Must be called under get_online_cpus() and text_mutex.
734 */
735void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
736{
737 struct text_poke_params tpp = {.params = params, .nparams = n};
738
739 atomic_set(&stop_machine_first, 1);
740 wrote_text = 0;
741 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
742}
1// SPDX-License-Identifier: GPL-2.0-only
2#define pr_fmt(fmt) "SMP alternatives: " fmt
3
4#include <linux/module.h>
5#include <linux/sched.h>
6#include <linux/perf_event.h>
7#include <linux/mutex.h>
8#include <linux/list.h>
9#include <linux/stringify.h>
10#include <linux/highmem.h>
11#include <linux/mm.h>
12#include <linux/vmalloc.h>
13#include <linux/memory.h>
14#include <linux/stop_machine.h>
15#include <linux/slab.h>
16#include <linux/kdebug.h>
17#include <linux/kprobes.h>
18#include <linux/mmu_context.h>
19#include <linux/bsearch.h>
20#include <linux/sync_core.h>
21#include <asm/text-patching.h>
22#include <asm/alternative.h>
23#include <asm/sections.h>
24#include <asm/mce.h>
25#include <asm/nmi.h>
26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h>
28#include <asm/insn.h>
29#include <asm/io.h>
30#include <asm/fixmap.h>
31#include <asm/paravirt.h>
32#include <asm/asm-prototypes.h>
33#include <asm/cfi.h>
34
35int __read_mostly alternatives_patched;
36
37EXPORT_SYMBOL_GPL(alternatives_patched);
38
39#define MAX_PATCH_LEN (255-1)
40
41#define DA_ALL (~0)
42#define DA_ALT 0x01
43#define DA_RET 0x02
44#define DA_RETPOLINE 0x04
45#define DA_ENDBR 0x08
46#define DA_SMP 0x10
47
48static unsigned int __initdata_or_module debug_alternative;
49
50static int __init debug_alt(char *str)
51{
52 if (str && *str == '=')
53 str++;
54
55 if (!str || kstrtouint(str, 0, &debug_alternative))
56 debug_alternative = DA_ALL;
57
58 return 1;
59}
60__setup("debug-alternative", debug_alt);
61
62static int noreplace_smp;
63
64static int __init setup_noreplace_smp(char *str)
65{
66 noreplace_smp = 1;
67 return 1;
68}
69__setup("noreplace-smp", setup_noreplace_smp);
70
71#define DPRINTK(type, fmt, args...) \
72do { \
73 if (debug_alternative & DA_##type) \
74 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
75} while (0)
76
77#define DUMP_BYTES(type, buf, len, fmt, args...) \
78do { \
79 if (unlikely(debug_alternative & DA_##type)) { \
80 int j; \
81 \
82 if (!(len)) \
83 break; \
84 \
85 printk(KERN_DEBUG pr_fmt(fmt), ##args); \
86 for (j = 0; j < (len) - 1; j++) \
87 printk(KERN_CONT "%02hhx ", buf[j]); \
88 printk(KERN_CONT "%02hhx\n", buf[j]); \
89 } \
90} while (0)
91
92static const unsigned char x86nops[] =
93{
94 BYTES_NOP1,
95 BYTES_NOP2,
96 BYTES_NOP3,
97 BYTES_NOP4,
98 BYTES_NOP5,
99 BYTES_NOP6,
100 BYTES_NOP7,
101 BYTES_NOP8,
102#ifdef CONFIG_64BIT
103 BYTES_NOP9,
104 BYTES_NOP10,
105 BYTES_NOP11,
106#endif
107};
108
109const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
110{
111 NULL,
112 x86nops,
113 x86nops + 1,
114 x86nops + 1 + 2,
115 x86nops + 1 + 2 + 3,
116 x86nops + 1 + 2 + 3 + 4,
117 x86nops + 1 + 2 + 3 + 4 + 5,
118 x86nops + 1 + 2 + 3 + 4 + 5 + 6,
119 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
120#ifdef CONFIG_64BIT
121 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
122 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
123 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
124#endif
125};
126
127/*
128 * Fill the buffer with a single effective instruction of size @len.
129 *
130 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
131 * for every single-byte NOP, try to generate the maximally available NOP of
132 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
133 * each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
134 * *jump* over instead of executing long and daft NOPs.
135 */
136static void __init_or_module add_nop(u8 *instr, unsigned int len)
137{
138 u8 *target = instr + len;
139
140 if (!len)
141 return;
142
143 if (len <= ASM_NOP_MAX) {
144 memcpy(instr, x86_nops[len], len);
145 return;
146 }
147
148 if (len < 128) {
149 __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
150 instr += JMP8_INSN_SIZE;
151 } else {
152 __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
153 instr += JMP32_INSN_SIZE;
154 }
155
156 for (;instr < target; instr++)
157 *instr = INT3_INSN_OPCODE;
158}
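/*
 * Illustrative example: for len == 7 this simply copies x86_nops[7]; for
 * len == 130 it emits a 5-byte JMP32 to instr + 130 and fills the remaining
 * 125 bytes with INT3 (0xcc).
 */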
159
160extern s32 __retpoline_sites[], __retpoline_sites_end[];
161extern s32 __return_sites[], __return_sites_end[];
162extern s32 __cfi_sites[], __cfi_sites_end[];
163extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
164extern s32 __smp_locks[], __smp_locks_end[];
165void text_poke_early(void *addr, const void *opcode, size_t len);
166
167/*
168 * Matches NOP and NOPL, not any of the other possible NOPs.
169 */
170static bool insn_is_nop(struct insn *insn)
171{
172 /* Anything NOP, but no REP NOP */
173 if (insn->opcode.bytes[0] == 0x90 &&
174 (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
175 return true;
176
177 /* NOPL */
178 if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
179 return true;
180
181 /* TODO: more nops */
182
183 return false;
184}
185
186/*
187 * Find the offset of the first non-NOP instruction starting at @offset
188 * but no further than @len.
189 */
190static int skip_nops(u8 *instr, int offset, int len)
191{
192 struct insn insn;
193
194 for (; offset < len; offset += insn.length) {
195 if (insn_decode_kernel(&insn, &instr[offset]))
196 break;
197
198 if (!insn_is_nop(&insn))
199 break;
200 }
201
202 return offset;
203}
204
205/*
206 * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
207 * to the end of the NOP sequence into a single NOP.
208 */
209static bool __init_or_module
210__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
211{
212 int i = *next - insn->length;
213
214 switch (insn->opcode.bytes[0]) {
215 case JMP8_INSN_OPCODE:
216 case JMP32_INSN_OPCODE:
217 *prev = i;
218 *target = *next + insn->immediate.value;
219 return false;
220 }
221
222 if (insn_is_nop(insn)) {
223 int nop = i;
224
225 *next = skip_nops(instr, *next, len);
226 if (*target && *next == *target)
227 nop = *prev;
228
229 add_nop(instr + nop, *next - nop);
230 DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
231 return true;
232 }
233
234 *target = 0;
235 return false;
236}
237
238/*
239 * "noinline" to cause control flow change and thus invalidate I$ and
240 * cause refetch after modification.
241 */
242static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
243{
244 int prev, target = 0;
245
246 for (int next, i = 0; i < len; i = next) {
247 struct insn insn;
248
249 if (insn_decode_kernel(&insn, &instr[i]))
250 return;
251
252 next = i + insn.length;
253
254 __optimize_nops(instr, len, &insn, &next, &prev, &target);
255 }
256}
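/*
 * Illustrative example: a buffer holding "eb 03 90 90 90" (a JMP8 over
 * three single-byte NOPs) is rewritten from the JMP onwards into one
 * 5-byte NOP, and a plain "90 90 90" run becomes the single 3-byte NOP
 * from x86_nops[3].
 */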
257
258static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len)
259{
260 unsigned long flags;
261
262 local_irq_save(flags);
263 optimize_nops(instr, len);
264 sync_core();
265 local_irq_restore(flags);
266}
267
268/*
269 * In this context, "source" is where the instructions are placed in the
270 * section .altinstr_replacement, for example during kernel build by the
271 * toolchain.
272 * "Destination" is where the instructions are being patched in by this
273 * machinery.
274 *
275 * The source offset is:
276 *
277 * src_imm = target - src_next_ip (1)
278 *
279 * and the target offset is:
280 *
281 * dst_imm = target - dst_next_ip (2)
282 *
283 * so rework (1) as an expression for target like:
284 *
285 * target = src_imm + src_next_ip (1a)
286 *
287 * and substitute in (2) to get:
288 *
289 * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
290 *
291 * Now, since the instruction stream is 'identical' at src and dst (it
292 * is being copied after all) it can be stated that:
293 *
294 * src_next_ip = src + ip_offset
295 * dst_next_ip = dst + ip_offset (4)
296 *
297 * Substitute (4) in (3) and observe ip_offset being cancelled out to
298 * obtain:
299 *
300 * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
301 * = src_imm + src - dst + ip_offset - ip_offset
302 * = src_imm + src - dst (5)
303 *
304 * IOW, only the relative displacement of the code block matters.
305 */
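/*
 * Worked example: a CALL at the start of a replacement block at
 * src == 0x1000 targeting 0x5000 has src_imm = 0x5000 - 0x1005 = 0x3ffb.
 * After copying the block to dst == 0x2000, (5) gives
 * dst_imm = 0x3ffb + 0x1000 - 0x2000 = 0x2ffb, and indeed
 * 0x2005 + 0x2ffb == 0x5000.
 */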
306
307#define apply_reloc_n(n_, p_, d_) \
308 do { \
309 s32 v = *(s##n_ *)(p_); \
310 v += (d_); \
311 BUG_ON((v >> 31) != (v >> (n_-1))); \
312 *(s##n_ *)(p_) = (s##n_)v; \
313 } while (0)
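/*
 * Note: the BUG_ON() asserts that the adjusted value still fits in a signed
 * n-bit immediate: for an in-range value, bits n-1 through 31 are all copies
 * of the sign bit, so the two arithmetic shifts compare equal.
 */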
314
315
316static __always_inline
317void apply_reloc(int n, void *ptr, uintptr_t diff)
318{
319 switch (n) {
320 case 1: apply_reloc_n(8, ptr, diff); break;
321 case 2: apply_reloc_n(16, ptr, diff); break;
322 case 4: apply_reloc_n(32, ptr, diff); break;
323 default: BUG();
324 }
325}
326
327static __always_inline
328bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
329{
330 u8 *target = src + offset;
331 /*
332 * If the target is inside the patched block, it's relative to the
333 * block itself and does not need relocation.
334 */
335 return (target < src || target > src + src_len);
336}
337
338static void __init_or_module noinline
339apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
340{
341 int prev, target = 0;
342
343 for (int next, i = 0; i < len; i = next) {
344 struct insn insn;
345
346 if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
347 return;
348
349 next = i + insn.length;
350
351 if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
352 continue;
353
354 switch (insn.opcode.bytes[0]) {
355 case 0x0f:
356 if (insn.opcode.bytes[1] < 0x80 ||
357 insn.opcode.bytes[1] > 0x8f)
358 break;
359
360 fallthrough; /* Jcc.d32 */
361 case 0x70 ... 0x7f: /* Jcc.d8 */
362 case JMP8_INSN_OPCODE:
363 case JMP32_INSN_OPCODE:
364 case CALL_INSN_OPCODE:
365 if (need_reloc(next + insn.immediate.value, src, src_len)) {
366 apply_reloc(insn.immediate.nbytes,
367 buf + i + insn_offset_immediate(&insn),
368 src - dest);
369 }
370
371 /*
372 * Where possible, convert JMP.d32 into JMP.d8.
373 */
374 if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
375 s32 imm = insn.immediate.value;
376 imm += src - dest;
377 imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
378 if ((imm >> 31) == (imm >> 7)) {
379 buf[i+0] = JMP8_INSN_OPCODE;
380 buf[i+1] = (s8)imm;
381
382 memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
383 }
384 }
385 break;
386 }
387
388 if (insn_rip_relative(&insn)) {
389 if (need_reloc(next + insn.displacement.value, src, src_len)) {
390 apply_reloc(insn.displacement.nbytes,
391 buf + i + insn_offset_displacement(&insn),
392 src - dest);
393 }
394 }
395 }
396}
397
398/* Low-level backend functions usable from alternative code replacements. */
399DEFINE_ASM_FUNC(nop_func, "", .entry.text);
400EXPORT_SYMBOL_GPL(nop_func);
401
402noinstr void BUG_func(void)
403{
404 BUG();
405}
406EXPORT_SYMBOL(BUG_func);
407
408#define CALL_RIP_REL_OPCODE 0xff
409#define CALL_RIP_REL_MODRM 0x15
410
411/*
412 * Rewrite the "call BUG_func" replacement to point to the target of the
413 * indirect pv_ops call "call *disp(%ip)".
414 */
415static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
416{
417 void *target, *bug = &BUG_func;
418 s32 disp;
419
420 if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
421 pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
422 BUG();
423 }
424
425 if (a->instrlen != 6 ||
426 instr[0] != CALL_RIP_REL_OPCODE ||
427 instr[1] != CALL_RIP_REL_MODRM) {
428 pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
429 BUG();
430 }
431
432 /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
433 disp = *(s32 *)(instr + 2);
434#ifdef CONFIG_X86_64
435 /* ff 15 00 00 00 00 call *0x0(%rip) */
436 /* target address is stored at "next instruction + disp". */
437 target = *(void **)(instr + a->instrlen + disp);
438#else
439 /* ff 15 00 00 00 00 call *0x0 */
440 /* target address is stored at disp. */
441 target = *(void **)disp;
442#endif
443 if (!target)
444 target = bug;
445
446 /* (BUG_func - .) + (target - BUG_func) := target - . */
447 *(s32 *)(insn_buff + 1) += target - bug;
448
449 if (target == &nop_func)
450 return 0;
451
452 return 5;
453}
454
455/*
456 * Replace instructions with better alternatives for this CPU type. This runs
457 * before SMP is initialized to avoid SMP problems with self modifying code.
458 * This implies that asymmetric systems where APs have fewer capabilities than
459 * the boot processor are not handled. Tough. Make sure you disable such
460 * features by hand.
461 *
462 * Marked "noinline" to cause control flow change and thus insn cache
463 * to refetch changed I$ lines.
464 */
465void __init_or_module noinline apply_alternatives(struct alt_instr *start,
466 struct alt_instr *end)
467{
468 struct alt_instr *a;
469 u8 *instr, *replacement;
470 u8 insn_buff[MAX_PATCH_LEN];
471
472 DPRINTK(ALT, "alt table %px, -> %px", start, end);
473
474 /*
475 * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
476 * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
477 * During the process, KASAN becomes confused seeing partial LA57
478 * conversion and triggers a false-positive out-of-bound report.
479 *
480 * Disable KASAN until the patching is complete.
481 */
482 kasan_disable_current();
483
484 /*
485 * The scan order should be from start to end. A later scanned
486 * alternative code can overwrite previously scanned alternative code.
487 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
488 * patch code.
489 *
490 * So be careful if you want to change the scan order to any other
491 * order.
492 */
493 for (a = start; a < end; a++) {
494 int insn_buff_sz = 0;
495
496 instr = (u8 *)&a->instr_offset + a->instr_offset;
497 replacement = (u8 *)&a->repl_offset + a->repl_offset;
498 BUG_ON(a->instrlen > sizeof(insn_buff));
499 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
500
501 /*
502 * Patch if either:
503 * - feature is present
504 * - feature not present but ALT_FLAG_NOT is set to mean,
505 * patch if feature is *NOT* present.
506 */
507 if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
508 optimize_nops_inplace(instr, a->instrlen);
509 continue;
510 }
511
512 DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
513 a->cpuid >> 5,
514 a->cpuid & 0x1f,
515 instr, instr, a->instrlen,
516 replacement, a->replacementlen, a->flags);
517
518 memcpy(insn_buff, replacement, a->replacementlen);
519 insn_buff_sz = a->replacementlen;
520
521 if (a->flags & ALT_FLAG_DIRECT_CALL) {
522 insn_buff_sz = alt_replace_call(instr, insn_buff, a);
523 if (insn_buff_sz < 0)
524 continue;
525 }
526
527 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
528 insn_buff[insn_buff_sz] = 0x90;
529
530 apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
531
532 DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
533 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
534 DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
535
536 text_poke_early(instr, insn_buff, insn_buff_sz);
537 }
538
539 kasan_enable_current();
540}
541
542static inline bool is_jcc32(struct insn *insn)
543{
544 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
545 return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
546}
547
548#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
549
550/*
551 * CALL/JMP *%\reg
552 */
553static int emit_indirect(int op, int reg, u8 *bytes)
554{
555 int i = 0;
556 u8 modrm;
557
558 switch (op) {
559 case CALL_INSN_OPCODE:
560 modrm = 0x10; /* Reg = 2; CALL r/m */
561 break;
562
563 case JMP32_INSN_OPCODE:
564 modrm = 0x20; /* Reg = 4; JMP r/m */
565 break;
566
567 default:
568 WARN_ON_ONCE(1);
569 return -1;
570 }
571
572 if (reg >= 8) {
573 bytes[i++] = 0x41; /* REX.B prefix */
574 reg -= 8;
575 }
576
577 modrm |= 0xc0; /* Mod = 3 */
578 modrm += reg;
579
580 bytes[i++] = 0xff; /* opcode */
581 bytes[i++] = modrm;
582
583 return i;
584}
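/*
 * Illustrative encodings: reg == 11 (%r11) takes the REX.B path, so the
 * CALL case emits "41 ff d3" (call *%r11) and the JMP32 case "41 ff e3"
 * (jmp *%r11); a low register such as %rax needs no prefix ("ff d0").
 */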
585
586static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
587{
588 u8 op = insn->opcode.bytes[0];
589 int i = 0;
590
591 /*
592 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
593 * tail-calls. Deal with them.
594 */
595 if (is_jcc32(insn)) {
596 bytes[i++] = op;
597 op = insn->opcode.bytes[1];
598 goto clang_jcc;
599 }
600
601 if (insn->length == 6)
602 bytes[i++] = 0x2e; /* CS-prefix */
603
604 switch (op) {
605 case CALL_INSN_OPCODE:
606 __text_gen_insn(bytes+i, op, addr+i,
607 __x86_indirect_call_thunk_array[reg],
608 CALL_INSN_SIZE);
609 i += CALL_INSN_SIZE;
610 break;
611
612 case JMP32_INSN_OPCODE:
613clang_jcc:
614 __text_gen_insn(bytes+i, op, addr+i,
615 __x86_indirect_jump_thunk_array[reg],
616 JMP32_INSN_SIZE);
617 i += JMP32_INSN_SIZE;
618 break;
619
620 default:
621 WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
622 return -1;
623 }
624
625 WARN_ON_ONCE(i != insn->length);
626
627 return i;
628}
629
630/*
631 * Rewrite the compiler generated retpoline thunk calls.
632 *
633 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
634 * indirect instructions, avoiding the extra indirection.
635 *
636 * For example, convert:
637 *
638 * CALL __x86_indirect_thunk_\reg
639 *
640 * into:
641 *
642 * CALL *%\reg
643 *
644 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
645 */
646static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
647{
648 retpoline_thunk_t *target;
649 int reg, ret, i = 0;
650 u8 op, cc;
651
652 target = addr + insn->length + insn->immediate.value;
653 reg = target - __x86_indirect_thunk_array;
654
655 if (WARN_ON_ONCE(reg & ~0xf))
656 return -1;
657
658 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
659 BUG_ON(reg == 4);
660
661 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
662 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
663 if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
664 return emit_call_track_retpoline(addr, insn, reg, bytes);
665
666 return -1;
667 }
668
669 op = insn->opcode.bytes[0];
670
671 /*
672 * Convert:
673 *
674 * Jcc.d32 __x86_indirect_thunk_\reg
675 *
676 * into:
677 *
678 * Jncc.d8 1f
679 * [ LFENCE ]
680 * JMP *%\reg
681 * [ NOP ]
682 * 1:
683 */
684 if (is_jcc32(insn)) {
685 cc = insn->opcode.bytes[1] & 0xf;
686 cc ^= 1; /* invert condition */
687
688 bytes[i++] = 0x70 + cc; /* Jcc.d8 */
689 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
690
691 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
692 op = JMP32_INSN_OPCODE;
693 }
694
695 /*
696 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
697 */
698 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
699 bytes[i++] = 0x0f;
700 bytes[i++] = 0xae;
701 bytes[i++] = 0xe8; /* LFENCE */
702 }
703
704 ret = emit_indirect(op, reg, bytes + i);
705 if (ret < 0)
706 return ret;
707 i += ret;
708
709 /*
710 * The compiler is supposed to EMIT an INT3 after every unconditional
711 * JMP instruction due to AMD BTC. However, if the compiler is too old
712 * or SLS isn't enabled, we still need an INT3 after indirect JMPs
713 * even on Intel.
714 */
715 if (op == JMP32_INSN_OPCODE && i < insn->length)
716 bytes[i++] = INT3_INSN_OPCODE;
717
718 for (; i < insn->length;)
719 bytes[i++] = BYTES_NOP1;
720
721 return i;
722}
723
724/*
725 * Generated by 'objtool --retpoline'.
726 */
727void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
728{
729 s32 *s;
730
731 for (s = start; s < end; s++) {
732 void *addr = (void *)s + *s;
733 struct insn insn;
734 int len, ret;
735 u8 bytes[16];
736 u8 op1, op2;
737
738 ret = insn_decode_kernel(&insn, addr);
739 if (WARN_ON_ONCE(ret < 0))
740 continue;
741
742 op1 = insn.opcode.bytes[0];
743 op2 = insn.opcode.bytes[1];
744
745 switch (op1) {
746 case CALL_INSN_OPCODE:
747 case JMP32_INSN_OPCODE:
748 break;
749
750 case 0x0f: /* escape */
751 if (op2 >= 0x80 && op2 <= 0x8f)
752 break;
753 fallthrough;
754 default:
755 WARN_ON_ONCE(1);
756 continue;
757 }
758
759 DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
760 addr, addr, insn.length,
761 addr + insn.length + insn.immediate.value);
762
763 len = patch_retpoline(addr, &insn, bytes);
764 if (len == insn.length) {
765 optimize_nops(bytes, len);
766 DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
767 DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
768 text_poke_early(addr, bytes, len);
769 }
770 }
771}
772
773#ifdef CONFIG_RETHUNK
774
775/*
776 * Rewrite the compiler generated return thunk tail-calls.
777 *
778 * For example, convert:
779 *
780 * JMP __x86_return_thunk
781 *
782 * into:
783 *
784 * RET
785 */
786static int patch_return(void *addr, struct insn *insn, u8 *bytes)
787{
788 int i = 0;
789
790 /* Patch the custom return thunks... */
791 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
792 i = JMP32_INSN_SIZE;
793 __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
794 } else {
795 /* ... or patch them out if not needed. */
796 bytes[i++] = RET_INSN_OPCODE;
797 }
798
799 for (; i < insn->length;)
800 bytes[i++] = INT3_INSN_OPCODE;
801 return i;
802}
803
804void __init_or_module noinline apply_returns(s32 *start, s32 *end)
805{
806 s32 *s;
807
808 if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
809 static_call_force_reinit();
810
811 for (s = start; s < end; s++) {
812 void *dest = NULL, *addr = (void *)s + *s;
813 struct insn insn;
814 int len, ret;
815 u8 bytes[16];
816 u8 op;
817
818 ret = insn_decode_kernel(&insn, addr);
819 if (WARN_ON_ONCE(ret < 0))
820 continue;
821
822 op = insn.opcode.bytes[0];
823 if (op == JMP32_INSN_OPCODE)
824 dest = addr + insn.length + insn.immediate.value;
825
826 if (__static_call_fixup(addr, op, dest) ||
827 WARN_ONCE(dest != &__x86_return_thunk,
828 "missing return thunk: %pS-%pS: %*ph",
829 addr, dest, 5, addr))
830 continue;
831
832 DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
833 addr, addr, insn.length,
834 addr + insn.length + insn.immediate.value);
835
836 len = patch_return(addr, &insn, bytes);
837 if (len == insn.length) {
838 DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
839 DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
840 text_poke_early(addr, bytes, len);
841 }
842 }
843}
844#else
845void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
846#endif /* CONFIG_RETHUNK */
847
848#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
849
850void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
851void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
852
853#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
854
855#ifdef CONFIG_X86_KERNEL_IBT
856
857static void poison_cfi(void *addr);
858
859static void __init_or_module poison_endbr(void *addr, bool warn)
860{
861 u32 endbr, poison = gen_endbr_poison();
862
863 if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
864 return;
865
866 if (!is_endbr(endbr)) {
867 WARN_ON_ONCE(warn);
868 return;
869 }
870
871 DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
872
873 /*
874 * When we have IBT, the lack of ENDBR will trigger #CP
875 */
876 DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
877 DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
878 text_poke_early(addr, &poison, 4);
879}
880
881/*
882 * Generated by: objtool --ibt
883 *
884 * Seal the functions for indirect calls by clobbering the ENDBR instructions
885 * and the kCFI hash value.
886 */
887void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
888{
889 s32 *s;
890
891 for (s = start; s < end; s++) {
892 void *addr = (void *)s + *s;
893
894 poison_endbr(addr, true);
895 if (IS_ENABLED(CONFIG_FINEIBT))
896 poison_cfi(addr - 16);
897 }
898}
899
900#else
901
902void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
903
904#endif /* CONFIG_X86_KERNEL_IBT */
905
906#ifdef CONFIG_FINEIBT
907#define __CFI_DEFAULT CFI_DEFAULT
908#elif defined(CONFIG_CFI_CLANG)
909#define __CFI_DEFAULT CFI_KCFI
910#else
911#define __CFI_DEFAULT CFI_OFF
912#endif
913
914enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
915
916#ifdef CONFIG_CFI_CLANG
917struct bpf_insn;
918
919/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
920extern unsigned int __bpf_prog_runX(const void *ctx,
921 const struct bpf_insn *insn);
922
923/*
924 * Force a reference to the external symbol so the compiler generates
925 * __kcfi_typeid.
926 */
927__ADDRESSABLE(__bpf_prog_runX);
928
929/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
930asm (
931" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
932" .type cfi_bpf_hash,@object \n"
933" .globl cfi_bpf_hash \n"
934" .p2align 2, 0x0 \n"
935"cfi_bpf_hash: \n"
936" .long __kcfi_typeid___bpf_prog_runX \n"
937" .size cfi_bpf_hash, 4 \n"
938" .popsection \n"
939);
940
941/* Must match bpf_callback_t */
942extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
943
944__ADDRESSABLE(__bpf_callback_fn);
945
946/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
947asm (
948" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
949" .type cfi_bpf_subprog_hash,@object \n"
950" .globl cfi_bpf_subprog_hash \n"
951" .p2align 2, 0x0 \n"
952"cfi_bpf_subprog_hash: \n"
953" .long __kcfi_typeid___bpf_callback_fn \n"
954" .size cfi_bpf_subprog_hash, 4 \n"
955" .popsection \n"
956);
957
958u32 cfi_get_func_hash(void *func)
959{
960 u32 hash;
961
962 func -= cfi_get_offset();
963 switch (cfi_mode) {
964 case CFI_FINEIBT:
965 func += 7;
966 break;
967 case CFI_KCFI:
968 func += 1;
969 break;
970 default:
971 return 0;
972 }
973
974 if (get_kernel_nofault(hash, func))
975 return 0;
976
977 return hash;
978}
979#endif
980
981#ifdef CONFIG_FINEIBT
982
983static bool cfi_rand __ro_after_init = true;
984static u32 cfi_seed __ro_after_init;
985
986/*
987 * Re-hash the CFI hash with a boot-time seed while making sure the result is
988 * not a valid ENDBR instruction.
989 */
990static u32 cfi_rehash(u32 hash)
991{
992 hash ^= cfi_seed;
993 while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
994 bool lsb = hash & 1;
995 hash >>= 1;
996 if (lsb)
997 hash ^= 0x80200003;
998 }
999 return hash;
1000}
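/*
 * Note: both hash and -hash are checked because kCFI callers embed the
 * negated hash as an immediate (movl $(-hash), %r10d; see the caller layout
 * comment below), so neither value may accidentally encode a valid ENDBR
 * pattern in the instruction stream.
 */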
1001
1002static __init int cfi_parse_cmdline(char *str)
1003{
1004 if (!str)
1005 return -EINVAL;
1006
1007 while (str) {
1008 char *next = strchr(str, ',');
1009 if (next) {
1010 *next = 0;
1011 next++;
1012 }
1013
1014 if (!strcmp(str, "auto")) {
1015 cfi_mode = CFI_DEFAULT;
1016 } else if (!strcmp(str, "off")) {
1017 cfi_mode = CFI_OFF;
1018 cfi_rand = false;
1019 } else if (!strcmp(str, "kcfi")) {
1020 cfi_mode = CFI_KCFI;
1021 } else if (!strcmp(str, "fineibt")) {
1022 cfi_mode = CFI_FINEIBT;
1023 } else if (!strcmp(str, "norand")) {
1024 cfi_rand = false;
1025 } else {
1026 pr_err("Ignoring unknown cfi option (%s).", str);
1027 }
1028
1029 str = next;
1030 }
1031
1032 return 0;
1033}
1034early_param("cfi", cfi_parse_cmdline);
1035
1036/*
1037 * kCFI FineIBT
1038 *
1039 * __cfi_\func: __cfi_\func:
1040 * movl $0x12345678,%eax // 5 endbr64 // 4
1041 * nop subl $0x12345678,%r10d // 7
1042 * nop jz 1f // 2
1043 * nop ud2 // 2
1044 * nop 1: nop // 1
1045 * nop
1046 * nop
1047 * nop
1048 * nop
1049 * nop
1050 * nop
1051 * nop
1052 *
1053 *
1054 * caller: caller:
1055 * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
1056 * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4
1057 * je 1f // 2 nop4 // 4
1058 * ud2 // 2
1059 * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5
1060 *
1061 */
1062
1063asm( ".pushsection .rodata \n"
1064 "fineibt_preamble_start: \n"
1065 " endbr64 \n"
1066 " subl $0x12345678, %r10d \n"
1067 " je fineibt_preamble_end \n"
1068 " ud2 \n"
1069 " nop \n"
1070 "fineibt_preamble_end: \n"
1071 ".popsection\n"
1072);
1073
1074extern u8 fineibt_preamble_start[];
1075extern u8 fineibt_preamble_end[];
1076
1077#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
1078#define fineibt_preamble_hash 7
1079
1080asm( ".pushsection .rodata \n"
1081 "fineibt_caller_start: \n"
1082 " movl $0x12345678, %r10d \n"
1083 " sub $16, %r11 \n"
1084 ASM_NOP4
1085 "fineibt_caller_end: \n"
1086 ".popsection \n"
1087);
1088
1089extern u8 fineibt_caller_start[];
1090extern u8 fineibt_caller_end[];
1091
1092#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
1093#define fineibt_caller_hash 2
1094
1095#define fineibt_caller_jmp (fineibt_caller_size - 2)
1096
1097static u32 decode_preamble_hash(void *addr)
1098{
1099 u8 *p = addr;
1100
1101 /* b8 78 56 34 12 mov $0x12345678,%eax */
1102 if (p[0] == 0xb8)
1103 return *(u32 *)(addr + 1);
1104
1105 return 0; /* invalid hash value */
1106}
1107
1108static u32 decode_caller_hash(void *addr)
1109{
1110 u8 *p = addr;
1111
1112 /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
1113 if (p[0] == 0x41 && p[1] == 0xba)
1114 return -*(u32 *)(addr + 2);
1115
1116 /* eb 0c 78 56 34 12 jmp.d8 +12 */
1117 if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
1118 return -*(u32 *)(addr + 2);
1119
1120 return 0; /* invalid hash value */
1121}
1122
1123/* .retpoline_sites */
1124static int cfi_disable_callers(s32 *start, s32 *end)
1125{
1126 /*
1127 * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
1128 * in tact for later usage. Also see decode_caller_hash() and
1129 * cfi_rewrite_callers().
1130 */
1131 const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
1132 s32 *s;
1133
1134 for (s = start; s < end; s++) {
1135 void *addr = (void *)s + *s;
1136 u32 hash;
1137
1138 addr -= fineibt_caller_size;
1139 hash = decode_caller_hash(addr);
1140 if (!hash) /* nocfi callers */
1141 continue;
1142
1143 text_poke_early(addr, jmp, 2);
1144 }
1145
1146 return 0;
1147}
1148
1149static int cfi_enable_callers(s32 *start, s32 *end)
1150{
1151 /*
1152 * Re-enable kCFI, undo what cfi_disable_callers() did.
1153 */
1154 const u8 mov[] = { 0x41, 0xba };
1155 s32 *s;
1156
1157 for (s = start; s < end; s++) {
1158 void *addr = (void *)s + *s;
1159 u32 hash;
1160
1161 addr -= fineibt_caller_size;
1162 hash = decode_caller_hash(addr);
1163 if (!hash) /* nocfi callers */
1164 continue;
1165
1166 text_poke_early(addr, mov, 2);
1167 }
1168
1169 return 0;
1170}
1171
1172/* .cfi_sites */
1173static int cfi_rand_preamble(s32 *start, s32 *end)
1174{
1175 s32 *s;
1176
1177 for (s = start; s < end; s++) {
1178 void *addr = (void *)s + *s;
1179 u32 hash;
1180
1181 hash = decode_preamble_hash(addr);
1182 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1183 addr, addr, 5, addr))
1184 return -EINVAL;
1185
1186 hash = cfi_rehash(hash);
1187 text_poke_early(addr + 1, &hash, 4);
1188 }
1189
1190 return 0;
1191}
1192
1193static int cfi_rewrite_preamble(s32 *start, s32 *end)
1194{
1195 s32 *s;
1196
1197 for (s = start; s < end; s++) {
1198 void *addr = (void *)s + *s;
1199 u32 hash;
1200
1201 hash = decode_preamble_hash(addr);
1202 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1203 addr, addr, 5, addr))
1204 return -EINVAL;
1205
1206 text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
1207 WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
1208 text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
1209 }
1210
1211 return 0;
1212}
1213
1214static void cfi_rewrite_endbr(s32 *start, s32 *end)
1215{
1216 s32 *s;
1217
1218 for (s = start; s < end; s++) {
1219 void *addr = (void *)s + *s;
1220
1221 poison_endbr(addr+16, false);
1222 }
1223}
1224
1225/* .retpoline_sites */
1226static int cfi_rand_callers(s32 *start, s32 *end)
1227{
1228 s32 *s;
1229
1230 for (s = start; s < end; s++) {
1231 void *addr = (void *)s + *s;
1232 u32 hash;
1233
1234 addr -= fineibt_caller_size;
1235 hash = decode_caller_hash(addr);
1236 if (hash) {
1237 hash = -cfi_rehash(hash);
1238 text_poke_early(addr + 2, &hash, 4);
1239 }
1240 }
1241
1242 return 0;
1243}
1244
1245static int cfi_rewrite_callers(s32 *start, s32 *end)
1246{
1247 s32 *s;
1248
1249 for (s = start; s < end; s++) {
1250 void *addr = (void *)s + *s;
1251 u32 hash;
1252
1253 addr -= fineibt_caller_size;
1254 hash = decode_caller_hash(addr);
1255 if (hash) {
1256 text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
1257 WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
1258 text_poke_early(addr + fineibt_caller_hash, &hash, 4);
1259 }
1260 /* rely on apply_retpolines() */
1261 }
1262
1263 return 0;
1264}
1265
1266static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1267 s32 *start_cfi, s32 *end_cfi, bool builtin)
1268{
1269 int ret;
1270
1271 if (WARN_ONCE(fineibt_preamble_size != 16,
1272 "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
1273 return;
1274
1275 if (cfi_mode == CFI_DEFAULT) {
1276 cfi_mode = CFI_KCFI;
1277 if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
1278 cfi_mode = CFI_FINEIBT;
1279 }
1280
1281 /*
1282 * Rewrite the callers to not use the __cfi_ stubs, such that we might
1283 * rewrite them. This disables all CFI. If this succeeds but any of the
1284 * later stages fails, we're without CFI.
1285 */
1286 ret = cfi_disable_callers(start_retpoline, end_retpoline);
1287 if (ret)
1288 goto err;
1289
1290 if (cfi_rand) {
1291 if (builtin) {
1292 cfi_seed = get_random_u32();
1293 cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
1294 cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
1295 }
1296
1297 ret = cfi_rand_preamble(start_cfi, end_cfi);
1298 if (ret)
1299 goto err;
1300
1301 ret = cfi_rand_callers(start_retpoline, end_retpoline);
1302 if (ret)
1303 goto err;
1304 }
1305
1306 switch (cfi_mode) {
1307 case CFI_OFF:
1308 if (builtin)
1309 pr_info("Disabling CFI\n");
1310 return;
1311
1312 case CFI_KCFI:
1313 ret = cfi_enable_callers(start_retpoline, end_retpoline);
1314 if (ret)
1315 goto err;
1316
1317 if (builtin)
1318 pr_info("Using kCFI\n");
1319 return;
1320
1321 case CFI_FINEIBT:
1322 /* place the FineIBT preamble at func()-16 */
1323 ret = cfi_rewrite_preamble(start_cfi, end_cfi);
1324 if (ret)
1325 goto err;
1326
1327 /* rewrite the callers to target func()-16 */
1328 ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
1329 if (ret)
1330 goto err;
1331
1332 /* now that nobody targets func()+0, remove ENDBR there */
1333 cfi_rewrite_endbr(start_cfi, end_cfi);
1334
1335 if (builtin)
1336 pr_info("Using FineIBT CFI\n");
1337 return;
1338
1339 default:
1340 break;
1341 }
1342
1343err:
1344 pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1345}
1346
1347static inline void poison_hash(void *addr)
1348{
1349 *(u32 *)addr = 0;
1350}
1351
1352static void poison_cfi(void *addr)
1353{
1354 switch (cfi_mode) {
1355 case CFI_FINEIBT:
1356 /*
1357 * __cfi_\func:
1358 * osp nopl (%rax)
1359 * subl $0, %r10d
1360 * jz 1f
1361 * ud2
1362 * 1: nop
1363 */
1364 poison_endbr(addr, false);
1365 poison_hash(addr + fineibt_preamble_hash);
1366 break;
1367
1368 case CFI_KCFI:
1369 /*
1370 * __cfi_\func:
1371 * movl $0, %eax
1372 * .skip 11, 0x90
1373 */
1374 poison_hash(addr + 1);
1375 break;
1376
1377 default:
1378 break;
1379 }
1380}
1381
1382#else
1383
1384static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1385 s32 *start_cfi, s32 *end_cfi, bool builtin)
1386{
1387}
1388
1389#ifdef CONFIG_X86_KERNEL_IBT
1390static void poison_cfi(void *addr) { }
1391#endif
1392
1393#endif
1394
1395void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1396 s32 *start_cfi, s32 *end_cfi)
1397{
1398 return __apply_fineibt(start_retpoline, end_retpoline,
1399 start_cfi, end_cfi,
1400 /* .builtin = */ false);
1401}
1402
1403#ifdef CONFIG_SMP
1404static void alternatives_smp_lock(const s32 *start, const s32 *end,
1405 u8 *text, u8 *text_end)
1406{
1407 const s32 *poff;
1408
1409 for (poff = start; poff < end; poff++) {
1410 u8 *ptr = (u8 *)poff + *poff;
1411
1412 if (!*poff || ptr < text || ptr >= text_end)
1413 continue;
1414 /* turn DS segment override prefix into lock prefix */
1415 if (*ptr == 0x3e)
1416 text_poke(ptr, ((unsigned char []){0xf0}), 1);
1417 }
1418}
1419
1420static void alternatives_smp_unlock(const s32 *start, const s32 *end,
1421 u8 *text, u8 *text_end)
1422{
1423 const s32 *poff;
1424
1425 for (poff = start; poff < end; poff++) {
1426 u8 *ptr = (u8 *)poff + *poff;
1427
1428 if (!*poff || ptr < text || ptr >= text_end)
1429 continue;
1430 /* turn lock prefix into DS segment override prefix */
1431 if (*ptr == 0xf0)
1432 text_poke(ptr, ((unsigned char []){0x3E}), 1);
1433 }
1434}
1435
1436struct smp_alt_module {
1437 /* the module owning these lock sites; NULL for the core kernel */
1438 struct module *mod;
1439 char *name;
1440
1441 /* ptrs to lock prefixes */
1442 const s32 *locks;
1443 const s32 *locks_end;
1444
1445 /* .text segment, needed to avoid patching init code ;) */
1446 u8 *text;
1447 u8 *text_end;
1448
1449 struct list_head next;
1450};
1451static LIST_HEAD(smp_alt_modules);
1452static bool uniproc_patched = false; /* protected by text_mutex */
1453
1454void __init_or_module alternatives_smp_module_add(struct module *mod,
1455 char *name,
1456 void *locks, void *locks_end,
1457 void *text, void *text_end)
1458{
1459 struct smp_alt_module *smp;
1460
1461 mutex_lock(&text_mutex);
1462 if (!uniproc_patched)
1463 goto unlock;
1464
1465 if (num_possible_cpus() == 1)
1466 /* Don't bother remembering, we'll never have to undo it. */
1467 goto smp_unlock;
1468
1469 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
1470 if (!smp)
1471 /* we'll run the (safe but slow) SMP code then ... */
1472 goto unlock;
1473
1474 smp->mod = mod;
1475 smp->name = name;
1476 smp->locks = locks;
1477 smp->locks_end = locks_end;
1478 smp->text = text;
1479 smp->text_end = text_end;
1480 DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
1481 smp->locks, smp->locks_end,
1482 smp->text, smp->text_end, smp->name);
1483
1484 list_add_tail(&smp->next, &smp_alt_modules);
1485smp_unlock:
1486 alternatives_smp_unlock(locks, locks_end, text, text_end);
1487unlock:
1488 mutex_unlock(&text_mutex);
1489}
1490
1491void __init_or_module alternatives_smp_module_del(struct module *mod)
1492{
1493 struct smp_alt_module *item;
1494
1495 mutex_lock(&text_mutex);
1496 list_for_each_entry(item, &smp_alt_modules, next) {
1497 if (mod != item->mod)
1498 continue;
1499 list_del(&item->next);
1500 kfree(item);
1501 break;
1502 }
1503 mutex_unlock(&text_mutex);
1504}
1505
1506void alternatives_enable_smp(void)
1507{
1508 struct smp_alt_module *mod;
1509
1510 /* Why bother if there are no other CPUs? */
1511 BUG_ON(num_possible_cpus() == 1);
1512
1513 mutex_lock(&text_mutex);
1514
1515 if (uniproc_patched) {
1516 pr_info("switching to SMP code\n");
1517 BUG_ON(num_online_cpus() != 1);
1518 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
1519 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
1520 list_for_each_entry(mod, &smp_alt_modules, next)
1521 alternatives_smp_lock(mod->locks, mod->locks_end,
1522 mod->text, mod->text_end);
1523 uniproc_patched = false;
1524 }
1525 mutex_unlock(&text_mutex);
1526}
1527
1528/*
1529 * Return 1 if the address range is reserved for SMP-alternatives.
1530 * Must hold text_mutex.
1531 */
1532int alternatives_text_reserved(void *start, void *end)
1533{
1534 struct smp_alt_module *mod;
1535 const s32 *poff;
1536 u8 *text_start = start;
1537 u8 *text_end = end;
1538
1539 lockdep_assert_held(&text_mutex);
1540
1541 list_for_each_entry(mod, &smp_alt_modules, next) {
1542 if (mod->text > text_end || mod->text_end < text_start)
1543 continue;
1544 for (poff = mod->locks; poff < mod->locks_end; poff++) {
1545 const u8 *ptr = (const u8 *)poff + *poff;
1546
1547 if (text_start <= ptr && text_end > ptr)
1548 return 1;
1549 }
1550 }
1551
1552 return 0;
1553}
1554#endif /* CONFIG_SMP */
1555
1556/*
1557 * Self-test for the INT3 based CALL emulation code.
1558 *
1559 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
1560 * properly and that there is a stack gap between the INT3 frame and the
1561 * previous context. Without this gap doing a virtual PUSH on the interrupted
1562 * stack would corrupt the INT3 IRET frame.
1563 *
1564 * See entry_{32,64}.S for more details.
1565 */
1566
1567/*
1568 * We define the int3_magic() function in assembly to control the calling
1569 * convention such that we can 'call' it from assembly.
1570 */
1571
1572extern void int3_magic(unsigned int *ptr); /* defined in asm */
1573
1574asm (
1575" .pushsection .init.text, \"ax\", @progbits\n"
1576" .type int3_magic, @function\n"
1577"int3_magic:\n"
1578 ANNOTATE_NOENDBR
1579" movl $1, (%" _ASM_ARG1 ")\n"
1580 ASM_RET
1581" .size int3_magic, .-int3_magic\n"
1582" .popsection\n"
1583);
1584
1585extern void int3_selftest_ip(void); /* defined in asm below */
1586
1587static int __init
1588int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
1589{
1590 unsigned long selftest = (unsigned long)&int3_selftest_ip;
1591 struct die_args *args = data;
1592 struct pt_regs *regs = args->regs;
1593
1594 OPTIMIZER_HIDE_VAR(selftest);
1595
1596 if (!regs || user_mode(regs))
1597 return NOTIFY_DONE;
1598
1599 if (val != DIE_INT3)
1600 return NOTIFY_DONE;
1601
1602 if (regs->ip - INT3_INSN_SIZE != selftest)
1603 return NOTIFY_DONE;
1604
1605 int3_emulate_call(regs, (unsigned long)&int3_magic);
1606 return NOTIFY_STOP;
1607}
1608
1609/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
1610static noinline void __init int3_selftest(void)
1611{
1612 static __initdata struct notifier_block int3_exception_nb = {
1613 .notifier_call = int3_exception_notify,
1614 .priority = INT_MAX-1, /* last */
1615 };
1616 unsigned int val = 0;
1617
1618 BUG_ON(register_die_notifier(&int3_exception_nb));
1619
1620 /*
1621 * Basically: int3_magic(&val); but really complicated :-)
1622 *
1623 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
1624 * notifier above will emulate CALL for us.
1625 */
1626 asm volatile ("int3_selftest_ip:\n\t"
1627 ANNOTATE_NOENDBR
1628 " int3; nop; nop; nop; nop\n\t"
1629 : ASM_CALL_CONSTRAINT
1630 : __ASM_SEL_RAW(a, D) (&val)
1631 : "memory");
1632
1633 BUG_ON(val != 1);
1634
1635 unregister_die_notifier(&int3_exception_nb);
1636}
1637
1638static __initdata int __alt_reloc_selftest_addr;
1639
1640extern void __init __alt_reloc_selftest(void *arg);
1641__visible noinline void __init __alt_reloc_selftest(void *arg)
1642{
1643 WARN_ON(arg != &__alt_reloc_selftest_addr);
1644}
1645
1646static noinline void __init alt_reloc_selftest(void)
1647{
1648 /*
1649 * Tests apply_relocation().
1650 *
1651 * This has a relative immediate (CALL) in a place other than the first
1652 * instruction and additionally on x86_64 we get a RIP-relative LEA:
1653 *
1654 * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
1655 * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
1656 *
1657 * Getting this wrong will either crash and burn or tickle the WARN
1658 * above.
1659 */
1660 asm_inline volatile (
1661 ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
1662 : /* output */
1663 : [mem] "m" (__alt_reloc_selftest_addr)
1664 : _ASM_ARG1
1665 );
1666}
1667
1668void __init alternative_instructions(void)
1669{
1670 int3_selftest();
1671
1672 /*
1673 * The patching is not fully atomic, so try to avoid local
1674 * interruptions that might execute the code being patched.
1675 * Other CPUs are not running.
1676 */
1677 stop_nmi();
1678
1679 /*
1680 * Don't stop machine check exceptions while patching.
1681 * MCEs only happen when something got corrupted and in this
1682 * case we must do something about the corruption.
1683 * Ignoring it is worse than an unlikely patching race.
1684 * Also machine checks tend to be broadcast and if one CPU
1685 * goes into machine check the others follow quickly, so we don't
1686 * expect a machine check to cause undue problems during code
1687 * patching.
1688 */
1689
1690 /*
1691 * Make sure to set (artificial) features depending on used paravirt
1692 * functions which can later influence alternative patching.
1693 */
1694 paravirt_set_cap();
1695
1696 __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
1697 __cfi_sites, __cfi_sites_end, true);
1698
1699 /*
1700 * Rewrite the retpolines; this must be done before alternatives, since
1701 * the alternatives can rewrite the retpoline thunks.
1702 */
1703 apply_retpolines(__retpoline_sites, __retpoline_sites_end);
1704 apply_returns(__return_sites, __return_sites_end);
1705
1706 apply_alternatives(__alt_instructions, __alt_instructions_end);
1707
1708 /*
1709 * Now all calls are established. Apply the call thunks if
1710 * required.
1711 */
1712 callthunks_patch_builtin_calls();
1713
1714 /*
1715 * Seal all functions that do not have their address taken.
1716 */
1717 apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
1718
1719#ifdef CONFIG_SMP
1720 /* Patch to UP if no other CPUs are expected to be brought up. */
1721 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
1722 uniproc_patched = true;
1723 alternatives_smp_module_add(NULL, "core kernel",
1724 __smp_locks, __smp_locks_end,
1725 _text, _etext);
1726 }
1727
1728 if (!uniproc_patched || num_possible_cpus() == 1) {
1729 free_init_pages("SMP alternatives",
1730 (unsigned long)__smp_locks,
1731 (unsigned long)__smp_locks_end);
1732 }
1733#endif
1734
1735 restart_nmi();
1736 alternatives_patched = 1;
1737
1738 alt_reloc_selftest();
1739}
1740
1741/**
1742 * text_poke_early - Update instructions on a live kernel at boot time
1743 * @addr: address to modify
1744 * @opcode: source of the copy
1745 * @len: length to copy
1746 *
1747 * When you use this code to patch more than one byte of an instruction
1748 * you need to make sure that other CPUs cannot execute this code in parallel.
1749 * Also, no thread may be preempted in the middle of these instructions,
1750 * and on the local CPU you need to be protected against NMI or MCE
1751 * handlers seeing an inconsistent instruction while you patch.
1752 */
1753void __init_or_module text_poke_early(void *addr, const void *opcode,
1754 size_t len)
1755{
1756 unsigned long flags;
1757
1758 if (boot_cpu_has(X86_FEATURE_NX) &&
1759 is_module_text_address((unsigned long)addr)) {
1760 /*
1761 * Module text is initially marked non-executable, so the
1762 * code cannot be running and speculative code-fetches are
1763 * prevented. Just change the code.
1764 */
1765 memcpy(addr, opcode, len);
1766 } else {
1767 local_irq_save(flags);
1768 memcpy(addr, opcode, len);
1769 sync_core();
1770 local_irq_restore(flags);
1771
1772 /*
1773 * Could also do a CLFLUSH here to speed up CPU recovery; but
1774 * that causes hangs on some VIA CPUs.
1775 */
1776 }
1777}
1778
1779typedef struct {
1780 struct mm_struct *mm;
1781} temp_mm_state_t;
1782
1783/*
1784 * Using a temporary mm allows setting up temporary mappings that are not
1785 * accessible by other CPUs. Such mappings are needed to perform sensitive
1786 * memory writes that override the kernel memory protections (e.g., W^X), without
1787 * exposing the temporary page-table mappings that are required for these write
1788 * operations to other CPUs. Using a temporary mm also avoids TLB shootdowns when
1789 * the mapping is torn down.
1790 *
1791 * Context: The temporary mm needs to be used exclusively by a single core. To
1792 * harden security, IRQs must be disabled while the temporary mm is
1793 * loaded, thereby preventing interrupt handler bugs from overriding
1794 * the kernel memory protection.
1795 */
1796static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1797{
1798 temp_mm_state_t temp_state;
1799
1800 lockdep_assert_irqs_disabled();
1801
1802 /*
1803 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1804 * with a stale address space WITHOUT being in lazy mode after
1805 * restoring the previous mm.
1806 */
1807 if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1808 leave_mm(smp_processor_id());
1809
1810 temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1811 switch_mm_irqs_off(NULL, mm, current);
1812
1813 /*
1814 * If breakpoints are enabled, disable them while the temporary mm is
1815 * used. Userspace might set up watchpoints on addresses that are used
1816 * in the temporary mm, which would lead to wrong signals being sent or
1817 * crashes.
1818 *
1819 * Note that breakpoints are not disabled selectively, which also causes
1820 * kernel breakpoints (e.g., perf's) to be disabled. This might be
1821 * undesirable, but still seems reasonable as the code that runs in the
1822 * temporary mm should be short.
1823 */
1824 if (hw_breakpoint_active())
1825 hw_breakpoint_disable();
1826
1827 return temp_state;
1828}
1829
1830static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
1831{
1832 lockdep_assert_irqs_disabled();
1833 switch_mm_irqs_off(NULL, prev_state.mm, current);
1834
1835 /*
1836 * Restore the breakpoints if they were disabled before the temporary mm
1837 * was loaded.
1838 */
1839 if (hw_breakpoint_active())
1840 hw_breakpoint_restore();
1841}
1842
1843__ro_after_init struct mm_struct *poking_mm;
1844__ro_after_init unsigned long poking_addr;
1845
1846static void text_poke_memcpy(void *dst, const void *src, size_t len)
1847{
1848 memcpy(dst, src, len);
1849}
1850
1851static void text_poke_memset(void *dst, const void *src, size_t len)
1852{
1853 int c = *(const int *)src;
1854
1855 memset(dst, c, len);
1856}
1857
1858typedef void text_poke_f(void *dst, const void *src, size_t len);
1859
1860static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
1861{
1862 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
1863 struct page *pages[2] = {NULL};
1864 temp_mm_state_t prev;
1865 unsigned long flags;
1866 pte_t pte, *ptep;
1867 spinlock_t *ptl;
1868 pgprot_t pgprot;
1869
1870 /*
1871 * While the boot memory allocator is running we cannot use struct pages, as
1872 * they are not yet initialized. There is no way to recover.
1873 */
1874 BUG_ON(!after_bootmem);
1875
1876 if (!core_kernel_text((unsigned long)addr)) {
1877 pages[0] = vmalloc_to_page(addr);
1878 if (cross_page_boundary)
1879 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
1880 } else {
1881 pages[0] = virt_to_page(addr);
1882 WARN_ON(!PageReserved(pages[0]));
1883 if (cross_page_boundary)
1884 pages[1] = virt_to_page(addr + PAGE_SIZE);
1885 }
1886 /*
1887 * If something went wrong, crash and burn since recovery paths are not
1888 * implemented.
1889 */
1890 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
1891
1892 /*
1893 * Map the page without the global bit, as TLB flushing is done with
1894 * flush_tlb_mm_range(), which is intended for non-global PTEs.
1895 */
1896 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
1897
1898 /*
1899 * The lock is not really needed, but it avoids open-coding the PTE lookup.
1900 */
1901 ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
1902
1903 /*
1904 * This must not fail; preallocated in poking_init().
1905 */
1906 VM_BUG_ON(!ptep);
1907
1908 local_irq_save(flags);
1909
1910 pte = mk_pte(pages[0], pgprot);
1911 set_pte_at(poking_mm, poking_addr, ptep, pte);
1912
1913 if (cross_page_boundary) {
1914 pte = mk_pte(pages[1], pgprot);
1915 set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
1916 }
1917
1918 /*
1919 * Loading the temporary mm behaves as a compiler barrier, which
1920 * guarantees that the PTE will be set at the time memcpy() is done.
1921 */
1922 prev = use_temporary_mm(poking_mm);
1923
1924 kasan_disable_current();
1925 func((u8 *)poking_addr + offset_in_page(addr), src, len);
1926 kasan_enable_current();
1927
1928 /*
1929 * Ensure that the PTE is only cleared after the instructions of memcpy
1930 * were issued by using a compiler barrier.
1931 */
1932 barrier();
1933
1934 pte_clear(poking_mm, poking_addr, ptep);
1935 if (cross_page_boundary)
1936 pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
1937
1938 /*
1939 * Loading the previous page-table hierarchy requires a serializing
1940 * instruction that already allows the core to see the updated version.
1941 * Xen-PV is assumed to serialize execution in a similar manner.
1942 */
1943 unuse_temporary_mm(prev);
1944
1945 /*
1946 * Flushing the TLB might involve IPIs, which would require enabled
1947 * IRQs, but none are needed since the mm is not in use at this point.
1948 */
1949 flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
1950 (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
1951 PAGE_SHIFT, false);
1952
1953 if (func == text_poke_memcpy) {
1954 /*
1955 * If the text does not match what we just wrote then something is
1956 * fundamentally screwy; there's nothing we can really do about that.
1957 */
1958 BUG_ON(memcmp(addr, src, len));
1959 }
1960
1961 local_irq_restore(flags);
1962 pte_unmap_unlock(ptep, ptl);
1963 return addr;
1964}
1965
1966/**
1967 * text_poke - Update instructions on a live kernel
1968 * @addr: address to modify
1969 * @opcode: source of the copy
1970 * @len: length to copy
1971 *
1972 * Only atomic text poke/set should be allowed when not doing early patching.
1973 * It means the size must be writable atomically and the address must be aligned
1974 * in a way that permits an atomic write. It also makes sure we fit on a single
1975 * page.
1976 *
1977 * Note that the caller must ensure that if the modified code is part of a
1978 * module, the module would not be removed during poking. This can be achieved
1979 * by registering a module notifier, and ordering module removal and patching
1980 * through a mutex.
1981 */
1982void *text_poke(void *addr, const void *opcode, size_t len)
1983{
1984 lockdep_assert_held(&text_mutex);
1985
1986 return __text_poke(text_poke_memcpy, addr, opcode, len);
1987}
1988
1989/**
1990 * text_poke_kgdb - Update instructions on a live kernel by kgdb
1991 * @addr: address to modify
1992 * @opcode: source of the copy
1993 * @len: length to copy
1994 *
1995 * Only atomic text poke/set should be allowed when not doing early patching.
1996 * It means the size must be writable atomically and the address must be aligned
1997 * in a way that permits an atomic write. It also makes sure we fit on a single
1998 * page.
1999 *
2000 * Context: should only be used by kgdb, which ensures no other core is running,
2001 * despite the fact it does not hold the text_mutex.
2002 */
2003void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
2004{
2005 return __text_poke(text_poke_memcpy, addr, opcode, len);
2006}
2007
2008void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
2009 bool core_ok)
2010{
2011 unsigned long start = (unsigned long)addr;
2012 size_t patched = 0;
2013
2014 if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
2015 return NULL;
2016
2017 while (patched < len) {
2018 unsigned long ptr = start + patched;
2019 size_t s;
2020
2021 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2022
2023 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
2024 patched += s;
2025 }
2026 return addr;
2027}
2028
2029/**
2030 * text_poke_copy - Copy instructions into (an unused part of) RX memory
2031 * @addr: address to modify
2032 * @opcode: source of the copy
2033 * @len: length to copy, could be more than 2x PAGE_SIZE
2034 *
2035 * Not safe against concurrent execution; useful for JITs to dump
2036 * new code blocks into unused regions of RX memory. Can be used in
2037 * conjunction with synchronize_rcu_tasks() to wait for existing
2038 * execution to quiesce after having made sure no existing function
2039 * pointers are live.
2040 */
2041void *text_poke_copy(void *addr, const void *opcode, size_t len)
2042{
2043 mutex_lock(&text_mutex);
2044 addr = text_poke_copy_locked(addr, opcode, len, false);
2045 mutex_unlock(&text_mutex);
2046 return addr;
2047}
2048
2049/**
2050 * text_poke_set - memset into (an unused part of) RX memory
2051 * @addr: address to modify
2052 * @c: the byte to fill the area with
2053 * @len: length to copy, could be more than 2x PAGE_SIZE
2054 *
2055 * This is useful to overwrite unused regions of RX memory with illegal
2056 * instructions.
2057 */
2058void *text_poke_set(void *addr, int c, size_t len)
2059{
2060 unsigned long start = (unsigned long)addr;
2061 size_t patched = 0;
2062
2063 if (WARN_ON_ONCE(core_kernel_text(start)))
2064 return NULL;
2065
2066 mutex_lock(&text_mutex);
2067 while (patched < len) {
2068 unsigned long ptr = start + patched;
2069 size_t s;
2070
2071 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2072
2073 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
2074 patched += s;
2075 }
2076 mutex_unlock(&text_mutex);
2077 return addr;
2078}
2079
2080static void do_sync_core(void *info)
2081{
2082 sync_core();
2083}
2084
2085void text_poke_sync(void)
2086{
2087 on_each_cpu(do_sync_core, NULL, 1);
2088}
2089
2090/*
2091 * NOTE: crazy scheme to allow patching Jcc.d32 without increasing the size
2092 * of struct text_poke_loc. When len == 6 the instruction is prefixed with
2093 * 0x0f and we map the opcode to its Jcc.d8 form, using len to distinguish.
2094 */
2095struct text_poke_loc {
2096 /* addr := _stext + rel_addr */
2097 s32 rel_addr;
2098 s32 disp;
2099 u8 len;
2100 u8 opcode;
2101 const u8 text[POKE_MAX_OPCODE_SIZE];
2102 /* see text_poke_bp_batch() */
2103 u8 old;
2104};
2105
2106struct bp_patching_desc {
2107 struct text_poke_loc *vec;
2108 int nr_entries;
2109 atomic_t refs;
2110};
2111
2112static struct bp_patching_desc bp_desc;
2113
2114static __always_inline
2115struct bp_patching_desc *try_get_desc(void)
2116{
2117 struct bp_patching_desc *desc = &bp_desc;
2118
2119 if (!raw_atomic_inc_not_zero(&desc->refs))
2120 return NULL;
2121
2122 return desc;
2123}
2124
2125static __always_inline void put_desc(void)
2126{
2127 struct bp_patching_desc *desc = &bp_desc;
2128
2129 smp_mb__before_atomic();
2130 raw_atomic_dec(&desc->refs);
2131}
2132
2133static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
2134{
2135 return _stext + tp->rel_addr;
2136}
2137
2138static __always_inline int patch_cmp(const void *key, const void *elt)
2139{
2140 struct text_poke_loc *tp = (struct text_poke_loc *) elt;
2141
2142 if (key < text_poke_addr(tp))
2143 return -1;
2144 if (key > text_poke_addr(tp))
2145 return 1;
2146 return 0;
2147}
2148
2149noinstr int poke_int3_handler(struct pt_regs *regs)
2150{
2151 struct bp_patching_desc *desc;
2152 struct text_poke_loc *tp;
2153 int ret = 0;
2154 void *ip;
2155
2156 if (user_mode(regs))
2157 return 0;
2158
2159 /*
2160 * Having observed our INT3 instruction, we now must observe
2161 * bp_desc with non-zero refcount:
2162 *
2163 * bp_desc.refs = 1 INT3
2164 * WMB RMB
2165 * write INT3 if (bp_desc.refs != 0)
2166 */
2167 smp_rmb();
2168
2169 desc = try_get_desc();
2170 if (!desc)
2171 return 0;
2172
2173 /*
2174 * Discount the INT3. See text_poke_bp_batch().
2175 */
2176 ip = (void *) regs->ip - INT3_INSN_SIZE;
2177
2178 /*
2179 * Skip the binary search if there is a single member in the vector.
2180 */
2181 if (unlikely(desc->nr_entries > 1)) {
2182 tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
2183 sizeof(struct text_poke_loc),
2184 patch_cmp);
2185 if (!tp)
2186 goto out_put;
2187 } else {
2188 tp = desc->vec;
2189 if (text_poke_addr(tp) != ip)
2190 goto out_put;
2191 }
2192
2193 ip += tp->len;
2194
2195 switch (tp->opcode) {
2196 case INT3_INSN_OPCODE:
2197 /*
2198 * Someone poked an explicit INT3, they'll want to handle it,
2199 * do not consume.
2200 */
2201 goto out_put;
2202
2203 case RET_INSN_OPCODE:
2204 int3_emulate_ret(regs);
2205 break;
2206
2207 case CALL_INSN_OPCODE:
2208 int3_emulate_call(regs, (long)ip + tp->disp);
2209 break;
2210
2211 case JMP32_INSN_OPCODE:
2212 case JMP8_INSN_OPCODE:
2213 int3_emulate_jmp(regs, (long)ip + tp->disp);
2214 break;
2215
2216 case 0x70 ... 0x7f: /* Jcc */
2217 int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
2218 break;
2219
2220 default:
2221 BUG();
2222 }
2223
2224 ret = 1;
2225
2226out_put:
2227 put_desc();
2228 return ret;
2229}
2230
2231#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
2232static struct text_poke_loc tp_vec[TP_VEC_MAX];
2233static int tp_vec_nr;
2234
2235/**
2236 * text_poke_bp_batch() -- update instructions on live kernel on SMP
2237 * @tp: vector of instructions to patch
2238 * @nr_entries: number of entries in the vector
2239 *
2240 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
2241 * We completely avoid stop_machine() here, and achieve the
2242 * synchronization using the int3 breakpoint.
2243 *
2244 * The way it is done:
2245 * - For each entry in the vector:
2246 * - add an int3 trap to the address that will be patched
2247 * - sync cores
2248 * - For each entry in the vector:
2249 * - update all but the first byte of the patched range
2250 * - sync cores
2251 * - For each entry in the vector:
2252 * - replace the first byte (int3) by the first byte of
2253 * replacing opcode
2254 * - sync cores
2255 */
2256static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
2257{
2258 unsigned char int3 = INT3_INSN_OPCODE;
2259 unsigned int i;
2260 int do_sync;
2261
2262 lockdep_assert_held(&text_mutex);
2263
2264 bp_desc.vec = tp;
2265 bp_desc.nr_entries = nr_entries;
2266
2267 /*
2268 * Corresponds to the implicit memory barrier in try_get_desc() to
2269 * ensure reading a non-zero refcount provides up to date bp_desc data.
2270 */
2271 atomic_set_release(&bp_desc.refs, 1);
2272
2273 /*
2274 * Function tracing can enable thousands of places that need to be
2275 * updated. This can take quite some time, and with full kernel debugging
2276 * enabled, this could cause the softlockup watchdog to trigger.
2277 * This function gets called every 256 entries added to be patched.
2278 * Call cond_resched() here to make sure that other tasks can get scheduled
2279 * while processing all the functions being patched.
2280 */
2281 cond_resched();
2282
2283 /*
2284 * The corresponding read barrier is in poke_int3_handler(), making sure
2285 * the nr_entries and vector are correctly ordered wrt. patching.
2286 */
2287 smp_wmb();
2288
2289 /*
2290 * First step: add an int3 trap to the address that will be patched.
2291 */
2292 for (i = 0; i < nr_entries; i++) {
2293 tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
2294 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
2295 }
2296
2297 text_poke_sync();
2298
2299 /*
2300 * Second step: update all but the first byte of the patched range.
2301 */
2302 for (do_sync = 0, i = 0; i < nr_entries; i++) {
2303 u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
2304 u8 _new[POKE_MAX_OPCODE_SIZE+1];
2305 const u8 *new = tp[i].text;
2306 int len = tp[i].len;
2307
2308 if (len - INT3_INSN_SIZE > 0) {
2309 memcpy(old + INT3_INSN_SIZE,
2310 text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2311 len - INT3_INSN_SIZE);
2312
2313 if (len == 6) {
2314 _new[0] = 0x0f;
2315 memcpy(_new + 1, new, 5);
2316 new = _new;
2317 }
2318
2319 text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2320 new + INT3_INSN_SIZE,
2321 len - INT3_INSN_SIZE);
2322
2323 do_sync++;
2324 }
2325
2326 /*
2327 * Emit a perf event to record the text poke, primarily to
2328 * support Intel PT decoding which must walk the executable code
2329 * to reconstruct the trace. The flow up to here is:
2330 * - write INT3 byte
2331 * - IPI-SYNC
2332 * - write instruction tail
2333 * At this point the actual control flow will be through the
2334 * INT3 and handler and not hit the old or new instruction.
2335 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
2336 * can still be decoded. Subsequently:
2337 * - emit RECORD_TEXT_POKE with the new instruction
2338 * - IPI-SYNC
2339 * - write first byte
2340 * - IPI-SYNC
2341 * So before the text poke event timestamp, the decoder will see
2342 * either the old instruction flow or FUP/TIP of INT3. After the
2343 * text poke event timestamp, the decoder will see either the
2344 * new instruction flow or FUP/TIP of INT3. Thus decoders can
2345 * use the timestamp as the point at which to modify the
2346 * executable code.
2347 * The old instruction is recorded so that the event can be
2348 * processed forwards or backwards.
2349 */
2350 perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
2351 }
2352
2353 if (do_sync) {
2354 /*
2355 * According to Intel, this core syncing is very likely
2356 * not necessary and we'd be safe even without it. But
2357 * better safe than sorry (plus there's not only Intel).
2358 */
2359 text_poke_sync();
2360 }
2361
2362 /*
2363 * Third step: replace the first byte (int3) by the first byte of
2364 * replacing opcode.
2365 */
2366 for (do_sync = 0, i = 0; i < nr_entries; i++) {
2367 u8 byte = tp[i].text[0];
2368
2369 if (tp[i].len == 6)
2370 byte = 0x0f;
2371
2372 if (byte == INT3_INSN_OPCODE)
2373 continue;
2374
2375 text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
2376 do_sync++;
2377 }
2378
2379 if (do_sync)
2380 text_poke_sync();
2381
2382 /*
2383 * Remove and wait for refs to be zero.
2384 */
2385 if (!atomic_dec_and_test(&bp_desc.refs))
2386 atomic_cond_read_acquire(&bp_desc.refs, !VAL);
2387}
2388
2389static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
2390 const void *opcode, size_t len, const void *emulate)
2391{
2392 struct insn insn;
2393 int ret, i = 0;
2394
2395 if (len == 6)
2396 i = 1;
2397 memcpy((void *)tp->text, opcode+i, len-i);
2398 if (!emulate)
2399 emulate = opcode;
2400
2401 ret = insn_decode_kernel(&insn, emulate);
2402 BUG_ON(ret < 0);
2403
2404 tp->rel_addr = addr - (void *)_stext;
2405 tp->len = len;
2406 tp->opcode = insn.opcode.bytes[0];
2407
2408 if (is_jcc32(&insn)) {
2409 /*
2410 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
2411 */
2412 tp->opcode = insn.opcode.bytes[1] - 0x10;
2413 }
2414
2415 switch (tp->opcode) {
2416 case RET_INSN_OPCODE:
2417 case JMP32_INSN_OPCODE:
2418 case JMP8_INSN_OPCODE:
2419 /*
2420 * Control flow instructions without implied execution of the
2421 * next instruction can be padded with INT3.
2422 */
2423 for (i = insn.length; i < len; i++)
2424 BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
2425 break;
2426
2427 default:
2428 BUG_ON(len != insn.length);
2429 }
2430
2431 switch (tp->opcode) {
2432 case INT3_INSN_OPCODE:
2433 case RET_INSN_OPCODE:
2434 break;
2435
2436 case CALL_INSN_OPCODE:
2437 case JMP32_INSN_OPCODE:
2438 case JMP8_INSN_OPCODE:
2439 case 0x70 ... 0x7f: /* Jcc */
2440 tp->disp = insn.immediate.value;
2441 break;
2442
2443 default: /* assume NOP */
2444 switch (len) {
2445 case 2: /* NOP2 -- emulate as JMP8+0 */
2446 BUG_ON(memcmp(emulate, x86_nops[len], len));
2447 tp->opcode = JMP8_INSN_OPCODE;
2448 tp->disp = 0;
2449 break;
2450
2451 case 5: /* NOP5 -- emulate as JMP32+0 */
2452 BUG_ON(memcmp(emulate, x86_nops[len], len));
2453 tp->opcode = JMP32_INSN_OPCODE;
2454 tp->disp = 0;
2455 break;
2456
2457 default: /* unknown instruction */
2458 BUG();
2459 }
2460 break;
2461 }
2462}
2463
2464/*
2465 * We rely hard on tp_vec being address-ordered; ensure this is so by flushing
2466 * early if needed.
2467 */
2468static bool tp_order_fail(void *addr)
2469{
2470 struct text_poke_loc *tp;
2471
2472 if (!tp_vec_nr)
2473 return false;
2474
2475 if (!addr) /* force */
2476 return true;
2477
2478 tp = &tp_vec[tp_vec_nr - 1];
2479 if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
2480 return true;
2481
2482 return false;
2483}
2484
2485static void text_poke_flush(void *addr)
2486{
2487 if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
2488 text_poke_bp_batch(tp_vec, tp_vec_nr);
2489 tp_vec_nr = 0;
2490 }
2491}
2492
2493void text_poke_finish(void)
2494{
2495 text_poke_flush(NULL);
2496}
2497
2498void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
2499{
2500 struct text_poke_loc *tp;
2501
2502 text_poke_flush(addr);
2503
2504 tp = &tp_vec[tp_vec_nr++];
2505 text_poke_loc_init(tp, addr, opcode, len, emulate);
2506}
2507
2508/**
2509 * text_poke_bp() -- update instructions on live kernel on SMP
2510 * @addr: address to patch
2511 * @opcode: opcode of new instruction
2512 * @len: length to copy
2513 * @emulate: instruction to be emulated
2514 *
2515 * Update a single instruction with the vector in the stack, avoiding
2516 * dynamically allocated memory. This function should be used when it is
2517 * not possible to allocate memory.
2518 */
2519void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
2520{
2521 struct text_poke_loc tp;
2522
2523 text_poke_loc_init(&tp, addr, opcode, len, emulate);
2524 text_poke_bp_batch(&tp, 1);
2525}