1// SPDX-License-Identifier: GPL-2.0-only
2#define pr_fmt(fmt) "SMP alternatives: " fmt
3
4#include <linux/module.h>
5#include <linux/sched.h>
6#include <linux/perf_event.h>
7#include <linux/mutex.h>
8#include <linux/list.h>
9#include <linux/stringify.h>
10#include <linux/highmem.h>
11#include <linux/mm.h>
12#include <linux/vmalloc.h>
13#include <linux/memory.h>
14#include <linux/stop_machine.h>
15#include <linux/slab.h>
16#include <linux/kdebug.h>
17#include <linux/kprobes.h>
18#include <linux/mmu_context.h>
19#include <linux/bsearch.h>
20#include <linux/sync_core.h>
21#include <asm/text-patching.h>
22#include <asm/alternative.h>
23#include <asm/sections.h>
24#include <asm/mce.h>
25#include <asm/nmi.h>
26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h>
28#include <asm/insn.h>
29#include <asm/io.h>
30#include <asm/fixmap.h>
31#include <asm/paravirt.h>
32#include <asm/asm-prototypes.h>
33#include <asm/cfi.h>
34
35int __read_mostly alternatives_patched;
36
37EXPORT_SYMBOL_GPL(alternatives_patched);
38
39#define MAX_PATCH_LEN (255-1)
40
41#define DA_ALL (~0)
42#define DA_ALT 0x01
43#define DA_RET 0x02
44#define DA_RETPOLINE 0x04
45#define DA_ENDBR 0x08
46#define DA_SMP 0x10
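/* e.g. booting with debug-alternative=0x06 enables only the RET and RETPOLINE debug output */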
47
48static unsigned int debug_alternative;
49
50static int __init debug_alt(char *str)
51{
52 if (str && *str == '=')
53 str++;
54
55 if (!str || kstrtouint(str, 0, &debug_alternative))
56 debug_alternative = DA_ALL;
57
58 return 1;
59}
60__setup("debug-alternative", debug_alt);
61
62static int noreplace_smp;
63
64static int __init setup_noreplace_smp(char *str)
65{
66 noreplace_smp = 1;
67 return 1;
68}
69__setup("noreplace-smp", setup_noreplace_smp);
70
71#define DPRINTK(type, fmt, args...) \
72do { \
73 if (debug_alternative & DA_##type) \
74 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
75} while (0)
76
77#define DUMP_BYTES(type, buf, len, fmt, args...) \
78do { \
79 if (unlikely(debug_alternative & DA_##type)) { \
80 int j; \
81 \
82 if (!(len)) \
83 break; \
84 \
85 printk(KERN_DEBUG pr_fmt(fmt), ##args); \
86 for (j = 0; j < (len) - 1; j++) \
87 printk(KERN_CONT "%02hhx ", buf[j]); \
88 printk(KERN_CONT "%02hhx\n", buf[j]); \
89 } \
90} while (0)
91
92static const unsigned char x86nops[] =
93{
94 BYTES_NOP1,
95 BYTES_NOP2,
96 BYTES_NOP3,
97 BYTES_NOP4,
98 BYTES_NOP5,
99 BYTES_NOP6,
100 BYTES_NOP7,
101 BYTES_NOP8,
102#ifdef CONFIG_64BIT
103 BYTES_NOP9,
104 BYTES_NOP10,
105 BYTES_NOP11,
106#endif
107};
108
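/*
 * x86_nops[len] points at the len-byte NOP inside the concatenated x86nops[]
 * blob above; index 0 is unused.
 */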
109const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
110{
111 NULL,
112 x86nops,
113 x86nops + 1,
114 x86nops + 1 + 2,
115 x86nops + 1 + 2 + 3,
116 x86nops + 1 + 2 + 3 + 4,
117 x86nops + 1 + 2 + 3 + 4 + 5,
118 x86nops + 1 + 2 + 3 + 4 + 5 + 6,
119 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
120#ifdef CONFIG_64BIT
121 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
122 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
123 x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
124#endif
125};
126
127/*
128 * Nomenclature for variable names to simplify and clarify this code and ease
129 * any potential staring at it:
130 *
131 * @instr: source address of the original instructions in the kernel text as
132 * generated by the compiler.
133 *
134 * @buf: temporary buffer on which the patching operates. This buffer is
135 * eventually text-poked into the kernel image.
136 *
137 * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
138 * in the .altinstr_replacement section.
139 */
140
141/*
142 * Fill the buffer with a single effective instruction of size @len.
143 *
144 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
145 * for every single-byte NOP, try to generate the maximally available NOP of
146 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
147 * each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
148 * *jump* over instead of executing long and daft NOPs.
149 */
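/* For example, @len == 20 (> ASM_NOP_MAX) becomes a 2-byte JMP8 to buf + 20 followed by 18 INT3 bytes. */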
150static void add_nop(u8 *buf, unsigned int len)
151{
152 u8 *target = buf + len;
153
154 if (!len)
155 return;
156
157 if (len <= ASM_NOP_MAX) {
158 memcpy(buf, x86_nops[len], len);
159 return;
160 }
161
162 if (len < 128) {
163 __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
164 buf += JMP8_INSN_SIZE;
165 } else {
166 __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
167 buf += JMP32_INSN_SIZE;
168 }
169
170 for (;buf < target; buf++)
171 *buf = INT3_INSN_OPCODE;
172}
173
174extern s32 __retpoline_sites[], __retpoline_sites_end[];
175extern s32 __return_sites[], __return_sites_end[];
176extern s32 __cfi_sites[], __cfi_sites_end[];
177extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
178extern s32 __smp_locks[], __smp_locks_end[];
179void text_poke_early(void *addr, const void *opcode, size_t len);
180
181/*
182 * Matches NOP and NOPL, not any of the other possible NOPs.
183 */
184static bool insn_is_nop(struct insn *insn)
185{
186 /* Anything NOP, but no REP NOP */
187 if (insn->opcode.bytes[0] == 0x90 &&
188 (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
189 return true;
190
191 /* NOPL */
192 if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
193 return true;
194
195 /* TODO: more nops */
196
197 return false;
198}
199
200/*
201 * Find the offset of the first non-NOP instruction starting at @offset
202 * but no further than @len.
203 */
204static int skip_nops(u8 *buf, int offset, int len)
205{
206 struct insn insn;
207
208 for (; offset < len; offset += insn.length) {
209 if (insn_decode_kernel(&insn, &buf[offset]))
210 break;
211
212 if (!insn_is_nop(&insn))
213 break;
214 }
215
216 return offset;
217}
218
219/*
220 * "noinline" to cause control flow change and thus invalidate I$ and
221 * cause refetch after modification.
222 */
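/*
 * For example, three consecutive single-byte NOPs (90 90 90) left over as
 * alternative padding are rewritten into a single NOP3 (0f 1f 00).
 */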
223static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
224{
225 for (int next, i = 0; i < len; i = next) {
226 struct insn insn;
227
228 if (insn_decode_kernel(&insn, &buf[i]))
229 return;
230
231 next = i + insn.length;
232
233 if (insn_is_nop(&insn)) {
234 int nop = i;
235
236 /* Has the NOP already been optimized? */
237 if (i + insn.length == len)
238 return;
239
240 next = skip_nops(buf, next, len);
241
242 add_nop(buf + nop, next - nop);
243 DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
244 }
245 }
246}
247
248/*
249 * In this context, "source" is where the instructions are placed in the
250 * section .altinstr_replacement, for example during kernel build by the
251 * toolchain.
252 * "Destination" is where the instructions are being patched in by this
253 * machinery.
254 *
255 * The source offset is:
256 *
257 * src_imm = target - src_next_ip (1)
258 *
259 * and the target offset is:
260 *
261 * dst_imm = target - dst_next_ip (2)
262 *
263 * so rework (1) as an expression for target like:
264 *
265 * target = src_imm + src_next_ip (1a)
266 *
267 * and substitute in (2) to get:
268 *
269 * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
270 *
271 * Now, since the instruction stream is 'identical' at src and dst (it
272 * is being copied after all) it can be stated that:
273 *
274 * src_next_ip = src + ip_offset
275 * dst_next_ip = dst + ip_offset (4)
276 *
277 * Substitute (4) in (3) and observe ip_offset being cancelled out to
278 * obtain:
279 *
280 * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
281 * = src_imm + src - dst + ip_offset - ip_offset
282 * = src_imm + src - dst (5)
283 *
284 * IOW, only the relative displacement of the code block matters.
285 */
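/*
 * Worked example (made-up numbers): a CALL in the replacement with
 * src_imm = 0x100 targets src_next_ip + 0x100. If that code is copied to
 * dst = src - 0x40, the immediate must become
 * dst_imm = src_imm + (src - dst) = 0x100 + 0x40 = 0x140 for the CALL to
 * still reach the same target.
 */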
286
287#define apply_reloc_n(n_, p_, d_) \
288 do { \
289 s32 v = *(s##n_ *)(p_); \
290 v += (d_); \
291 BUG_ON((v >> 31) != (v >> (n_-1))); \
292 *(s##n_ *)(p_) = (s##n_)v; \
293 } while (0)
294
295
296static __always_inline
297void apply_reloc(int n, void *ptr, uintptr_t diff)
298{
299 switch (n) {
300 case 1: apply_reloc_n(8, ptr, diff); break;
301 case 2: apply_reloc_n(16, ptr, diff); break;
302 case 4: apply_reloc_n(32, ptr, diff); break;
303 default: BUG();
304 }
305}
306
307static __always_inline
308bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
309{
310 u8 *target = src + offset;
311 /*
312 * If the target is inside the patched block, it's relative to the
313 * block itself and does not need relocation.
314 */
315 return (target < src || target > src + src_len);
316}
317
318static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
319{
320 for (int next, i = 0; i < instrlen; i = next) {
321 struct insn insn;
322
323 if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
324 return;
325
326 next = i + insn.length;
327
328 switch (insn.opcode.bytes[0]) {
329 case 0x0f:
330 if (insn.opcode.bytes[1] < 0x80 ||
331 insn.opcode.bytes[1] > 0x8f)
332 break;
333
334 fallthrough; /* Jcc.d32 */
335 case 0x70 ... 0x7f: /* Jcc.d8 */
336 case JMP8_INSN_OPCODE:
337 case JMP32_INSN_OPCODE:
338 case CALL_INSN_OPCODE:
339 if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
340 apply_reloc(insn.immediate.nbytes,
341 buf + i + insn_offset_immediate(&insn),
342 repl - instr);
343 }
344
345 /*
346 * Where possible, convert JMP.d32 into JMP.d8.
347 */
348 if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
349 s32 imm = insn.immediate.value;
350 imm += repl - instr;
351 imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
352 if ((imm >> 31) == (imm >> 7)) {
353 buf[i+0] = JMP8_INSN_OPCODE;
354 buf[i+1] = (s8)imm;
355
356 memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
357 }
358 }
359 break;
360 }
361
362 if (insn_rip_relative(&insn)) {
363 if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
364 apply_reloc(insn.displacement.nbytes,
365 buf + i + insn_offset_displacement(&insn),
366 repl - instr);
367 }
368 }
369 }
370}
371
372void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
373{
374 __apply_relocation(buf, instr, instrlen, repl, repl_len);
375 optimize_nops(instr, buf, instrlen);
376}
377
378/* Low-level backend functions usable from alternative code replacements. */
379DEFINE_ASM_FUNC(nop_func, "", .entry.text);
380EXPORT_SYMBOL_GPL(nop_func);
381
382noinstr void BUG_func(void)
383{
384 BUG();
385}
386EXPORT_SYMBOL(BUG_func);
387
388#define CALL_RIP_REL_OPCODE 0xff
389#define CALL_RIP_REL_MODRM 0x15
390
391/*
392 * Rewrite the "call BUG_func" replacement to point to the target of the
393 * indirect pv_ops call "call *disp(%ip)".
394 */
395static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a,
396 struct module *mod)
397{
398 u8 *wr_instr = module_writable_address(mod, instr);
399 void *target, *bug = &BUG_func;
400 s32 disp;
401
402 if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
403 pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
404 BUG();
405 }
406
407 if (a->instrlen != 6 ||
408 wr_instr[0] != CALL_RIP_REL_OPCODE ||
409 wr_instr[1] != CALL_RIP_REL_MODRM) {
410 pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
411 BUG();
412 }
413
414 /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
415 disp = *(s32 *)(wr_instr + 2);
416#ifdef CONFIG_X86_64
417 /* ff 15 00 00 00 00 call *0x0(%rip) */
418 /* target address is stored at "next instruction + disp". */
419 target = *(void **)(instr + a->instrlen + disp);
420#else
421 /* ff 15 00 00 00 00 call *0x0 */
422 /* target address is stored at disp. */
423 target = *(void **)disp;
424#endif
425 if (!target)
426 target = bug;
427
428 /* (BUG_func - .) + (target - BUG_func) := target - . */
429 *(s32 *)(insn_buff + 1) += target - bug;
430
431 if (target == &nop_func)
432 return 0;
433
434 return 5;
435}
436
437static inline u8 * instr_va(struct alt_instr *i)
438{
439 return (u8 *)&i->instr_offset + i->instr_offset;
440}
441
442/*
443 * Replace instructions with better alternatives for this CPU type. This runs
444 * before SMP is initialized to avoid SMP problems with self modifying code.
445 * This implies that asymmetric systems where APs have fewer capabilities than
446 * the boot processor are not handled. Tough. Make sure you disable such
447 * features by hand.
448 *
449 * Marked "noinline" to cause control flow change and thus insn cache
450 * to refetch changed I$ lines.
451 */
452void __init_or_module noinline apply_alternatives(struct alt_instr *start,
453 struct alt_instr *end,
454 struct module *mod)
455{
456 u8 insn_buff[MAX_PATCH_LEN];
457 u8 *instr, *replacement;
458 struct alt_instr *a, *b;
459
460 DPRINTK(ALT, "alt table %px, -> %px", start, end);
461
462 /*
463 * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
464 * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
465 * During the process, KASAN becomes confused seeing partial LA57
466 * conversion and triggers a false-positive out-of-bound report.
467 *
468 * Disable KASAN until the patching is complete.
469 */
470 kasan_disable_current();
471
472 /*
473 * The scan order should be from start to end. Alternative code scanned
474 * later can overwrite alternative code scanned earlier.
475 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
476 * patch code.
477 *
478 * So be careful if you want to change the scan order to any other
479 * order.
480 */
481 for (a = start; a < end; a++) {
482 int insn_buff_sz = 0;
483 u8 *wr_instr, *wr_replacement;
484
485 /*
486 * In case of nested ALTERNATIVE()s the outer alternative might
487 * add more padding. To ensure consistent patching find the max
488 * padding for all alt_instr entries for this site (nested
489 * alternatives result in consecutive entries).
490 */
491 for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
492 u8 len = max(a->instrlen, b->instrlen);
493 a->instrlen = b->instrlen = len;
494 }
495
496 instr = instr_va(a);
497 wr_instr = module_writable_address(mod, instr);
498
499 replacement = (u8 *)&a->repl_offset + a->repl_offset;
500 wr_replacement = module_writable_address(mod, replacement);
501
502 BUG_ON(a->instrlen > sizeof(insn_buff));
503 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
504
505 /*
506 * Patch if either:
507 * - feature is present
508 * - feature not present but ALT_FLAG_NOT is set to mean,
509 * patch if feature is *NOT* present.
510 */
511 if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
512 memcpy(insn_buff, wr_instr, a->instrlen);
513 optimize_nops(instr, insn_buff, a->instrlen);
514 text_poke_early(wr_instr, insn_buff, a->instrlen);
515 continue;
516 }
517
518 DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
519 a->cpuid >> 5,
520 a->cpuid & 0x1f,
521 instr, instr, a->instrlen,
522 replacement, a->replacementlen, a->flags);
523
524 memcpy(insn_buff, wr_replacement, a->replacementlen);
525 insn_buff_sz = a->replacementlen;
526
527 if (a->flags & ALT_FLAG_DIRECT_CALL) {
528 insn_buff_sz = alt_replace_call(instr, insn_buff, a,
529 mod);
530 if (insn_buff_sz < 0)
531 continue;
532 }
533
534 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
535 insn_buff[insn_buff_sz] = 0x90;
536
537 apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
538
539 DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr);
540 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
541 DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
542
543 text_poke_early(wr_instr, insn_buff, insn_buff_sz);
544 }
545
546 kasan_enable_current();
547}
548
549static inline bool is_jcc32(struct insn *insn)
550{
551 /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
552 return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
553}
554
555#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
556
557/*
558 * CALL/JMP *%\reg
559 */
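/* For example, with reg == 11: CALL *%r11 encodes as 41 ff d3 and JMP *%r11 as 41 ff e3. */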
560static int emit_indirect(int op, int reg, u8 *bytes)
561{
562 int i = 0;
563 u8 modrm;
564
565 switch (op) {
566 case CALL_INSN_OPCODE:
567 modrm = 0x10; /* Reg = 2; CALL r/m */
568 break;
569
570 case JMP32_INSN_OPCODE:
571 modrm = 0x20; /* Reg = 4; JMP r/m */
572 break;
573
574 default:
575 WARN_ON_ONCE(1);
576 return -1;
577 }
578
579 if (reg >= 8) {
580 bytes[i++] = 0x41; /* REX.B prefix */
581 reg -= 8;
582 }
583
584 modrm |= 0xc0; /* Mod = 3 */
585 modrm += reg;
586
587 bytes[i++] = 0xff; /* opcode */
588 bytes[i++] = modrm;
589
590 return i;
591}
592
593static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
594{
595 u8 op = insn->opcode.bytes[0];
596 int i = 0;
597
598 /*
599 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
600 * tail-calls. Deal with them.
601 */
602 if (is_jcc32(insn)) {
603 bytes[i++] = op;
604 op = insn->opcode.bytes[1];
605 goto clang_jcc;
606 }
607
608 if (insn->length == 6)
609 bytes[i++] = 0x2e; /* CS-prefix */
610
611 switch (op) {
612 case CALL_INSN_OPCODE:
613 __text_gen_insn(bytes+i, op, addr+i,
614 __x86_indirect_call_thunk_array[reg],
615 CALL_INSN_SIZE);
616 i += CALL_INSN_SIZE;
617 break;
618
619 case JMP32_INSN_OPCODE:
620clang_jcc:
621 __text_gen_insn(bytes+i, op, addr+i,
622 __x86_indirect_jump_thunk_array[reg],
623 JMP32_INSN_SIZE);
624 i += JMP32_INSN_SIZE;
625 break;
626
627 default:
628 WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
629 return -1;
630 }
631
632 WARN_ON_ONCE(i != insn->length);
633
634 return i;
635}
636
637/*
638 * Rewrite the compiler generated retpoline thunk calls.
639 *
640 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
641 * indirect instructions, avoiding the extra indirection.
642 *
643 * For example, convert:
644 *
645 * CALL __x86_indirect_thunk_\reg
646 *
647 * into:
648 *
649 * CALL *%\reg
650 *
651 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
652 */
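/* E.g. a 5-byte "call __x86_indirect_thunk_r11" becomes "call *%r11" (41 ff d3) with the leftover bytes NOP-padded. */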
653static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
654{
655 retpoline_thunk_t *target;
656 int reg, ret, i = 0;
657 u8 op, cc;
658
659 target = addr + insn->length + insn->immediate.value;
660 reg = target - __x86_indirect_thunk_array;
661
662 if (WARN_ON_ONCE(reg & ~0xf))
663 return -1;
664
665 /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
666 BUG_ON(reg == 4);
667
668 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
669 !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
670 if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
671 return emit_call_track_retpoline(addr, insn, reg, bytes);
672
673 return -1;
674 }
675
676 op = insn->opcode.bytes[0];
677
678 /*
679 * Convert:
680 *
681 * Jcc.d32 __x86_indirect_thunk_\reg
682 *
683 * into:
684 *
685 * Jncc.d8 1f
686 * [ LFENCE ]
687 * JMP *%\reg
688 * [ NOP ]
689 * 1:
690 */
691 if (is_jcc32(insn)) {
692 cc = insn->opcode.bytes[1] & 0xf;
693 cc ^= 1; /* invert condition */
694
695 bytes[i++] = 0x70 + cc; /* Jcc.d8 */
696 bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
697
698 /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
699 op = JMP32_INSN_OPCODE;
700 }
701
702 /*
703 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
704 */
705 if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
706 bytes[i++] = 0x0f;
707 bytes[i++] = 0xae;
708 bytes[i++] = 0xe8; /* LFENCE */
709 }
710
711 ret = emit_indirect(op, reg, bytes + i);
712 if (ret < 0)
713 return ret;
714 i += ret;
715
716 /*
717 * The compiler is supposed to EMIT an INT3 after every unconditional
718 * JMP instruction due to AMD BTC. However, if the compiler is too old
719 * or MITIGATION_SLS isn't enabled, we still need an INT3 after
720 * indirect JMPs even on Intel.
721 */
722 if (op == JMP32_INSN_OPCODE && i < insn->length)
723 bytes[i++] = INT3_INSN_OPCODE;
724
725 for (; i < insn->length;)
726 bytes[i++] = BYTES_NOP1;
727
728 return i;
729}
730
731/*
732 * Generated by 'objtool --retpoline'.
733 */
734void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
735 struct module *mod)
736{
737 s32 *s;
738
739 for (s = start; s < end; s++) {
740 void *addr = (void *)s + *s;
741 void *wr_addr = module_writable_address(mod, addr);
742 struct insn insn;
743 int len, ret;
744 u8 bytes[16];
745 u8 op1, op2;
746
747 ret = insn_decode_kernel(&insn, wr_addr);
748 if (WARN_ON_ONCE(ret < 0))
749 continue;
750
751 op1 = insn.opcode.bytes[0];
752 op2 = insn.opcode.bytes[1];
753
754 switch (op1) {
755 case CALL_INSN_OPCODE:
756 case JMP32_INSN_OPCODE:
757 break;
758
759 case 0x0f: /* escape */
760 if (op2 >= 0x80 && op2 <= 0x8f)
761 break;
762 fallthrough;
763 default:
764 WARN_ON_ONCE(1);
765 continue;
766 }
767
768 DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
769 addr, addr, insn.length,
770 addr + insn.length + insn.immediate.value);
771
772 len = patch_retpoline(addr, &insn, bytes);
773 if (len == insn.length) {
774 optimize_nops(addr, bytes, len);
775 DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr);
776 DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
777 text_poke_early(wr_addr, bytes, len);
778 }
779 }
780}
781
782#ifdef CONFIG_MITIGATION_RETHUNK
783
784/*
785 * Rewrite the compiler generated return thunk tail-calls.
786 *
787 * For example, convert:
788 *
789 * JMP __x86_return_thunk
790 *
791 * into:
792 *
793 * RET
794 */
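/* Any leftover bytes of the original tail-call are filled with INT3. */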
795static int patch_return(void *addr, struct insn *insn, u8 *bytes)
796{
797 int i = 0;
798
799 /* Patch the custom return thunks... */
800 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
801 i = JMP32_INSN_SIZE;
802 __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
803 } else {
804 /* ... or patch them out if not needed. */
805 bytes[i++] = RET_INSN_OPCODE;
806 }
807
808 for (; i < insn->length;)
809 bytes[i++] = INT3_INSN_OPCODE;
810 return i;
811}
812
813void __init_or_module noinline apply_returns(s32 *start, s32 *end,
814 struct module *mod)
815{
816 s32 *s;
817
818 if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
819 static_call_force_reinit();
820
821 for (s = start; s < end; s++) {
822 void *dest = NULL, *addr = (void *)s + *s;
823 void *wr_addr = module_writable_address(mod, addr);
824 struct insn insn;
825 int len, ret;
826 u8 bytes[16];
827 u8 op;
828
829 ret = insn_decode_kernel(&insn, wr_addr);
830 if (WARN_ON_ONCE(ret < 0))
831 continue;
832
833 op = insn.opcode.bytes[0];
834 if (op == JMP32_INSN_OPCODE)
835 dest = addr + insn.length + insn.immediate.value;
836
837 if (__static_call_fixup(addr, op, dest) ||
838 WARN_ONCE(dest != &__x86_return_thunk,
839 "missing return thunk: %pS-%pS: %*ph",
840 addr, dest, 5, addr))
841 continue;
842
843 DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
844 addr, addr, insn.length,
845 addr + insn.length + insn.immediate.value);
846
847 len = patch_return(addr, &insn, bytes);
848 if (len == insn.length) {
849 DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr);
850 DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
851 text_poke_early(wr_addr, bytes, len);
852 }
853 }
854}
855#else
856void __init_or_module noinline apply_returns(s32 *start, s32 *end,
857 struct module *mod) { }
858#endif /* CONFIG_MITIGATION_RETHUNK */
859
860#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
861
862void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
863 struct module *mod) { }
864void __init_or_module noinline apply_returns(s32 *start, s32 *end,
865 struct module *mod) { }
866
867#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
868
869#ifdef CONFIG_X86_KERNEL_IBT
870
871static void poison_cfi(void *addr, void *wr_addr);
872
873static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
874{
875 u32 endbr, poison = gen_endbr_poison();
876
877 if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr)))
878 return;
879
880 if (!is_endbr(endbr)) {
881 WARN_ON_ONCE(warn);
882 return;
883 }
884
885 DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
886
887 /*
888 * When we have IBT, the lack of ENDBR will trigger #CP
889 */
890 DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
891 DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
892 text_poke_early(wr_addr, &poison, 4);
893}
894
895/*
896 * Generated by: objtool --ibt
897 *
898 * Seal the functions for indirect calls by clobbering the ENDBR instructions
899 * and the kCFI hash value.
900 */
901void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
902{
903 s32 *s;
904
905 for (s = start; s < end; s++) {
906 void *addr = (void *)s + *s;
907 void *wr_addr = module_writable_address(mod, addr);
908
909 poison_endbr(addr, wr_addr, true);
910 if (IS_ENABLED(CONFIG_FINEIBT))
911 poison_cfi(addr - 16, wr_addr - 16);
912 }
913}
914
915#else
916
917void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { }
918
919#endif /* CONFIG_X86_KERNEL_IBT */
920
921#ifdef CONFIG_CFI_AUTO_DEFAULT
922#define __CFI_DEFAULT CFI_AUTO
923#elif defined(CONFIG_CFI_CLANG)
924#define __CFI_DEFAULT CFI_KCFI
925#else
926#define __CFI_DEFAULT CFI_OFF
927#endif
928
929enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
930
931#ifdef CONFIG_CFI_CLANG
932struct bpf_insn;
933
934/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
935extern unsigned int __bpf_prog_runX(const void *ctx,
936 const struct bpf_insn *insn);
937
938/*
939 * Force a reference to the external symbol so the compiler generates
940 * __kcfi_typid.
941 */
942__ADDRESSABLE(__bpf_prog_runX);
943
944/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
945asm (
946" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
947" .type cfi_bpf_hash,@object \n"
948" .globl cfi_bpf_hash \n"
949" .p2align 2, 0x0 \n"
950"cfi_bpf_hash: \n"
951" .long __kcfi_typeid___bpf_prog_runX \n"
952" .size cfi_bpf_hash, 4 \n"
953" .popsection \n"
954);
955
956/* Must match bpf_callback_t */
957extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
958
959__ADDRESSABLE(__bpf_callback_fn);
960
961/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
962asm (
963" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
964" .type cfi_bpf_subprog_hash,@object \n"
965" .globl cfi_bpf_subprog_hash \n"
966" .p2align 2, 0x0 \n"
967"cfi_bpf_subprog_hash: \n"
968" .long __kcfi_typeid___bpf_callback_fn \n"
969" .size cfi_bpf_subprog_hash, 4 \n"
970" .popsection \n"
971);
972
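/*
 * The hash immediate lives at +7 in the FineIBT preamble (after the 4-byte
 * ENDBR64 and the 3 opcode/ModRM bytes of SUBL $hash,%r10d) and at +1 in the
 * kCFI preamble (after the 1-byte opcode of MOVL $hash,%eax).
 */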
973u32 cfi_get_func_hash(void *func)
974{
975 u32 hash;
976
977 func -= cfi_get_offset();
978 switch (cfi_mode) {
979 case CFI_FINEIBT:
980 func += 7;
981 break;
982 case CFI_KCFI:
983 func += 1;
984 break;
985 default:
986 return 0;
987 }
988
989 if (get_kernel_nofault(hash, func))
990 return 0;
991
992 return hash;
993}
994#endif
995
996#ifdef CONFIG_FINEIBT
997
998static bool cfi_rand __ro_after_init = true;
999static u32 cfi_seed __ro_after_init;
1000
1001/*
1002 * Re-hash the CFI hash with a boot-time seed while making sure the result is
1003 * not a valid ENDBR instruction.
1004 */
1005static u32 cfi_rehash(u32 hash)
1006{
1007 hash ^= cfi_seed;
1008 while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
1009 bool lsb = hash & 1;
1010 hash >>= 1;
1011 if (lsb)
1012 hash ^= 0x80200003;
1013 }
1014 return hash;
1015}
1016
1017static __init int cfi_parse_cmdline(char *str)
1018{
1019 if (!str)
1020 return -EINVAL;
1021
1022 while (str) {
1023 char *next = strchr(str, ',');
1024 if (next) {
1025 *next = 0;
1026 next++;
1027 }
1028
1029 if (!strcmp(str, "auto")) {
1030 cfi_mode = CFI_AUTO;
1031 } else if (!strcmp(str, "off")) {
1032 cfi_mode = CFI_OFF;
1033 cfi_rand = false;
1034 } else if (!strcmp(str, "kcfi")) {
1035 cfi_mode = CFI_KCFI;
1036 } else if (!strcmp(str, "fineibt")) {
1037 cfi_mode = CFI_FINEIBT;
1038 } else if (!strcmp(str, "norand")) {
1039 cfi_rand = false;
1040 } else {
1041 pr_err("Ignoring unknown cfi option (%s).", str);
1042 }
1043
1044 str = next;
1045 }
1046
1047 return 0;
1048}
1049early_param("cfi", cfi_parse_cmdline);
1050
1051/*
1052 * kCFI                                            FineIBT
1053 *
1054 * __cfi_\func:                                    __cfi_\func:
1055 *      movl   $0x12345678,%eax         // 5           endbr64                     // 4
1056 *      nop                                            subl   $0x12345678,%r10d    // 7
1057 *      nop                                            jz     1f                   // 2
1058 *      nop                                            ud2                         // 2
1059 *      nop                                        1:  nop                         // 1
1060 *      nop
1061 *      nop
1062 *      nop
1063 *      nop
1064 *      nop
1065 *      nop
1066 *      nop
1067 *
1068 *
1069 * caller:                                         caller:
1070 *      movl   $(-0x12345678),%r10d     // 6           movl   $0x12345678,%r10d    // 6
1071 *      addl   $-15(%r11),%r10d         // 4           sub    $16,%r11             // 4
1072 *      je     1f                       // 2           nop4                        // 4
1073 *      ud2                             // 2
1074 * 1:   call   __x86_indirect_thunk_r11 // 5           call   *%r11; nop2;         // 5
1075 *
1076 */
1077
1078asm( ".pushsection .rodata \n"
1079 "fineibt_preamble_start: \n"
1080 " endbr64 \n"
1081 " subl $0x12345678, %r10d \n"
1082 " je fineibt_preamble_end \n"
1083 " ud2 \n"
1084 " nop \n"
1085 "fineibt_preamble_end: \n"
1086 ".popsection\n"
1087);
1088
1089extern u8 fineibt_preamble_start[];
1090extern u8 fineibt_preamble_end[];
1091
1092#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
1093#define fineibt_preamble_hash 7
1094
1095asm( ".pushsection .rodata \n"
1096 "fineibt_caller_start: \n"
1097 " movl $0x12345678, %r10d \n"
1098 " sub $16, %r11 \n"
1099 ASM_NOP4
1100 "fineibt_caller_end: \n"
1101 ".popsection \n"
1102);
1103
1104extern u8 fineibt_caller_start[];
1105extern u8 fineibt_caller_end[];
1106
1107#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
1108#define fineibt_caller_hash 2
1109
1110#define fineibt_caller_jmp (fineibt_caller_size - 2)
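/*
 * JMP8 displacement that skips the rest of the fineibt_caller sequence,
 * i.e. everything after the 2-byte JMP8 itself; see cfi_disable_callers().
 */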
1111
1112static u32 decode_preamble_hash(void *addr)
1113{
1114 u8 *p = addr;
1115
1116 /* b8 78 56 34 12 mov $0x12345678,%eax */
1117 if (p[0] == 0xb8)
1118 return *(u32 *)(addr + 1);
1119
1120 return 0; /* invalid hash value */
1121}
1122
1123static u32 decode_caller_hash(void *addr)
1124{
1125 u8 *p = addr;
1126
1127 /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
1128 if (p[0] == 0x41 && p[1] == 0xba)
1129 return -*(u32 *)(addr + 2);
1130
1131 /* eb 0c 78 56 34 12 jmp.d8 +12 */
1132 if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
1133 return -*(u32 *)(addr + 2);
1134
1135 return 0; /* invalid hash value */
1136}
1137
1138/* .retpoline_sites */
1139static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod)
1140{
1141 /*
1142 * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
1143 * intact for later usage. Also see decode_caller_hash() and
1144 * cfi_rewrite_callers().
1145 */
1146 const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
1147 s32 *s;
1148
1149 for (s = start; s < end; s++) {
1150 void *addr = (void *)s + *s;
1151 void *wr_addr;
1152 u32 hash;
1153
1154 addr -= fineibt_caller_size;
1155 wr_addr = module_writable_address(mod, addr);
1156 hash = decode_caller_hash(wr_addr);
1157
1158 if (!hash) /* nocfi callers */
1159 continue;
1160
1161 text_poke_early(wr_addr, jmp, 2);
1162 }
1163
1164 return 0;
1165}
1166
1167static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod)
1168{
1169 /*
1170 * Re-enable kCFI, undo what cfi_disable_callers() did.
1171 */
1172 const u8 mov[] = { 0x41, 0xba };
1173 s32 *s;
1174
1175 for (s = start; s < end; s++) {
1176 void *addr = (void *)s + *s;
1177 void *wr_addr;
1178 u32 hash;
1179
1180 addr -= fineibt_caller_size;
1181 wr_addr = module_writable_address(mod, addr);
1182 hash = decode_caller_hash(wr_addr);
1183 if (!hash) /* nocfi callers */
1184 continue;
1185
1186 text_poke_early(wr_addr, mov, 2);
1187 }
1188
1189 return 0;
1190}
1191
1192/* .cfi_sites */
1193static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod)
1194{
1195 s32 *s;
1196
1197 for (s = start; s < end; s++) {
1198 void *addr = (void *)s + *s;
1199 void *wr_addr = module_writable_address(mod, addr);
1200 u32 hash;
1201
1202 hash = decode_preamble_hash(wr_addr);
1203 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1204 addr, addr, 5, addr))
1205 return -EINVAL;
1206
1207 hash = cfi_rehash(hash);
1208 text_poke_early(wr_addr + 1, &hash, 4);
1209 }
1210
1211 return 0;
1212}
1213
1214static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod)
1215{
1216 s32 *s;
1217
1218 for (s = start; s < end; s++) {
1219 void *addr = (void *)s + *s;
1220 void *wr_addr = module_writable_address(mod, addr);
1221 u32 hash;
1222
1223 hash = decode_preamble_hash(wr_addr);
1224 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1225 addr, addr, 5, addr))
1226 return -EINVAL;
1227
1228 text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size);
1229 WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678);
1230 text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4);
1231 }
1232
1233 return 0;
1234}
1235
1236static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod)
1237{
1238 s32 *s;
1239
1240 for (s = start; s < end; s++) {
1241 void *addr = (void *)s + *s;
1242 void *wr_addr = module_writable_address(mod, addr);
1243
1244 poison_endbr(addr + 16, wr_addr + 16, false);
1245 }
1246}
1247
1248/* .retpoline_sites */
1249static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod)
1250{
1251 s32 *s;
1252
1253 for (s = start; s < end; s++) {
1254 void *addr = (void *)s + *s;
1255 void *wr_addr;
1256 u32 hash;
1257
1258 addr -= fineibt_caller_size;
1259 wr_addr = module_writable_address(mod, addr);
1260 hash = decode_caller_hash(wr_addr);
1261 if (hash) {
1262 hash = -cfi_rehash(hash);
1263 text_poke_early(wr_addr + 2, &hash, 4);
1264 }
1265 }
1266
1267 return 0;
1268}
1269
1270static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod)
1271{
1272 s32 *s;
1273
1274 for (s = start; s < end; s++) {
1275 void *addr = (void *)s + *s;
1276 void *wr_addr;
1277 u32 hash;
1278
1279 addr -= fineibt_caller_size;
1280 wr_addr = module_writable_address(mod, addr);
1281 hash = decode_caller_hash(wr_addr);
1282 if (hash) {
1283 text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size);
1284 WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678);
1285 text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4);
1286 }
1287 /* rely on apply_retpolines() */
1288 }
1289
1290 return 0;
1291}
1292
1293static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1294 s32 *start_cfi, s32 *end_cfi, struct module *mod)
1295{
1296 bool builtin = mod ? false : true;
1297 int ret;
1298
1299 if (WARN_ONCE(fineibt_preamble_size != 16,
1300 "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
1301 return;
1302
1303 if (cfi_mode == CFI_AUTO) {
1304 cfi_mode = CFI_KCFI;
1305 if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
1306 cfi_mode = CFI_FINEIBT;
1307 }
1308
1309 /*
1310 * Rewrite the callers to not use the __cfi_ stubs, such that we might
1311 * rewrite them. This disables all CFI. If this succeeds but any of the
1312 * later stages fails, we're without CFI.
1313 */
1314 ret = cfi_disable_callers(start_retpoline, end_retpoline, mod);
1315 if (ret)
1316 goto err;
1317
1318 if (cfi_rand) {
1319 if (builtin) {
1320 cfi_seed = get_random_u32();
1321 cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
1322 cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
1323 }
1324
1325 ret = cfi_rand_preamble(start_cfi, end_cfi, mod);
1326 if (ret)
1327 goto err;
1328
1329 ret = cfi_rand_callers(start_retpoline, end_retpoline, mod);
1330 if (ret)
1331 goto err;
1332 }
1333
1334 switch (cfi_mode) {
1335 case CFI_OFF:
1336 if (builtin)
1337 pr_info("Disabling CFI\n");
1338 return;
1339
1340 case CFI_KCFI:
1341 ret = cfi_enable_callers(start_retpoline, end_retpoline, mod);
1342 if (ret)
1343 goto err;
1344
1345 if (builtin)
1346 pr_info("Using kCFI\n");
1347 return;
1348
1349 case CFI_FINEIBT:
1350 /* place the FineIBT preamble at func()-16 */
1351 ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod);
1352 if (ret)
1353 goto err;
1354
1355 /* rewrite the callers to target func()-16 */
1356 ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod);
1357 if (ret)
1358 goto err;
1359
1360 /* now that nobody targets func()+0, remove ENDBR there */
1361 cfi_rewrite_endbr(start_cfi, end_cfi, mod);
1362
1363 if (builtin)
1364 pr_info("Using FineIBT CFI\n");
1365 return;
1366
1367 default:
1368 break;
1369 }
1370
1371err:
1372 pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1373}
1374
1375static inline void poison_hash(void *addr)
1376{
1377 *(u32 *)addr = 0;
1378}
1379
1380static void poison_cfi(void *addr, void *wr_addr)
1381{
1382 switch (cfi_mode) {
1383 case CFI_FINEIBT:
1384 /*
1385 * __cfi_\func:
1386 * osp nopl (%rax)
1387 * subl $0, %r10d
1388 * jz 1f
1389 * ud2
1390 * 1: nop
1391 */
1392 poison_endbr(addr, wr_addr, false);
1393 poison_hash(wr_addr + fineibt_preamble_hash);
1394 break;
1395
1396 case CFI_KCFI:
1397 /*
1398 * __cfi_\func:
1399 * movl $0, %eax
1400 * .skip 11, 0x90
1401 */
1402 poison_hash(wr_addr + 1);
1403 break;
1404
1405 default:
1406 break;
1407 }
1408}
1409
1410#else
1411
1412static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1413 s32 *start_cfi, s32 *end_cfi, struct module *mod)
1414{
1415}
1416
1417#ifdef CONFIG_X86_KERNEL_IBT
1418static void poison_cfi(void *addr, void *wr_addr) { }
1419#endif
1420
1421#endif
1422
1423void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1424 s32 *start_cfi, s32 *end_cfi, struct module *mod)
1425{
1426 return __apply_fineibt(start_retpoline, end_retpoline,
1427 start_cfi, end_cfi, mod);
1428}
1429
1430#ifdef CONFIG_SMP
1431static void alternatives_smp_lock(const s32 *start, const s32 *end,
1432 u8 *text, u8 *text_end)
1433{
1434 const s32 *poff;
1435
1436 for (poff = start; poff < end; poff++) {
1437 u8 *ptr = (u8 *)poff + *poff;
1438
1439 if (!*poff || ptr < text || ptr >= text_end)
1440 continue;
1441 /* turn DS segment override prefix into lock prefix */
1442 if (*ptr == 0x3e)
1443 text_poke(ptr, ((unsigned char []){0xf0}), 1);
1444 }
1445}
1446
1447static void alternatives_smp_unlock(const s32 *start, const s32 *end,
1448 u8 *text, u8 *text_end)
1449{
1450 const s32 *poff;
1451
1452 for (poff = start; poff < end; poff++) {
1453 u8 *ptr = (u8 *)poff + *poff;
1454
1455 if (!*poff || ptr < text || ptr >= text_end)
1456 continue;
1457 /* turn lock prefix into DS segment override prefix */
1458 if (*ptr == 0xf0)
1459 text_poke(ptr, ((unsigned char []){0x3E}), 1);
1460 }
1461}
1462
1463struct smp_alt_module {
1464 /* what is this ??? */
1465 struct module *mod;
1466 char *name;
1467
1468 /* ptrs to lock prefixes */
1469 const s32 *locks;
1470 const s32 *locks_end;
1471
1472 /* .text segment, needed to avoid patching init code ;) */
1473 u8 *text;
1474 u8 *text_end;
1475
1476 struct list_head next;
1477};
1478static LIST_HEAD(smp_alt_modules);
1479static bool uniproc_patched = false; /* protected by text_mutex */
1480
1481void __init_or_module alternatives_smp_module_add(struct module *mod,
1482 char *name,
1483 void *locks, void *locks_end,
1484 void *text, void *text_end)
1485{
1486 struct smp_alt_module *smp;
1487
1488 mutex_lock(&text_mutex);
1489 if (!uniproc_patched)
1490 goto unlock;
1491
1492 if (num_possible_cpus() == 1)
1493 /* Don't bother remembering, we'll never have to undo it. */
1494 goto smp_unlock;
1495
1496 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
1497 if (NULL == smp)
1498 /* we'll run the (safe but slow) SMP code then ... */
1499 goto unlock;
1500
1501 smp->mod = mod;
1502 smp->name = name;
1503 smp->locks = locks;
1504 smp->locks_end = locks_end;
1505 smp->text = text;
1506 smp->text_end = text_end;
1507 DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
1508 smp->locks, smp->locks_end,
1509 smp->text, smp->text_end, smp->name);
1510
1511 list_add_tail(&smp->next, &smp_alt_modules);
1512smp_unlock:
1513 alternatives_smp_unlock(locks, locks_end, text, text_end);
1514unlock:
1515 mutex_unlock(&text_mutex);
1516}
1517
1518void __init_or_module alternatives_smp_module_del(struct module *mod)
1519{
1520 struct smp_alt_module *item;
1521
1522 mutex_lock(&text_mutex);
1523 list_for_each_entry(item, &smp_alt_modules, next) {
1524 if (mod != item->mod)
1525 continue;
1526 list_del(&item->next);
1527 kfree(item);
1528 break;
1529 }
1530 mutex_unlock(&text_mutex);
1531}
1532
1533void alternatives_enable_smp(void)
1534{
1535 struct smp_alt_module *mod;
1536
1537 /* Why bother if there are no other CPUs? */
1538 BUG_ON(num_possible_cpus() == 1);
1539
1540 mutex_lock(&text_mutex);
1541
1542 if (uniproc_patched) {
1543 pr_info("switching to SMP code\n");
1544 BUG_ON(num_online_cpus() != 1);
1545 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
1546 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
1547 list_for_each_entry(mod, &smp_alt_modules, next)
1548 alternatives_smp_lock(mod->locks, mod->locks_end,
1549 mod->text, mod->text_end);
1550 uniproc_patched = false;
1551 }
1552 mutex_unlock(&text_mutex);
1553}
1554
1555/*
1556 * Return 1 if the address range is reserved for SMP-alternatives.
1557 * Must hold text_mutex.
1558 */
1559int alternatives_text_reserved(void *start, void *end)
1560{
1561 struct smp_alt_module *mod;
1562 const s32 *poff;
1563 u8 *text_start = start;
1564 u8 *text_end = end;
1565
1566 lockdep_assert_held(&text_mutex);
1567
1568 list_for_each_entry(mod, &smp_alt_modules, next) {
1569 if (mod->text > text_end || mod->text_end < text_start)
1570 continue;
1571 for (poff = mod->locks; poff < mod->locks_end; poff++) {
1572 const u8 *ptr = (const u8 *)poff + *poff;
1573
1574 if (text_start <= ptr && text_end > ptr)
1575 return 1;
1576 }
1577 }
1578
1579 return 0;
1580}
1581#endif /* CONFIG_SMP */
1582
1583/*
1584 * Self-test for the INT3 based CALL emulation code.
1585 *
1586 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
1587 * properly and that there is a stack gap between the INT3 frame and the
1588 * previous context. Without this gap doing a virtual PUSH on the interrupted
1589 * stack would corrupt the INT3 IRET frame.
1590 *
1591 * See entry_{32,64}.S for more details.
1592 */
1593
1594/*
1595 * We define the int3_magic() function in assembly to control the calling
1596 * convention such that we can 'call' it from assembly.
1597 */
1598
1599extern void int3_magic(unsigned int *ptr); /* defined in asm */
1600
1601asm (
1602" .pushsection .init.text, \"ax\", @progbits\n"
1603" .type int3_magic, @function\n"
1604"int3_magic:\n"
1605 ANNOTATE_NOENDBR
1606" movl $1, (%" _ASM_ARG1 ")\n"
1607 ASM_RET
1608" .size int3_magic, .-int3_magic\n"
1609" .popsection\n"
1610);
1611
1612extern void int3_selftest_ip(void); /* defined in asm below */
1613
1614static int __init
1615int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
1616{
1617 unsigned long selftest = (unsigned long)&int3_selftest_ip;
1618 struct die_args *args = data;
1619 struct pt_regs *regs = args->regs;
1620
1621 OPTIMIZER_HIDE_VAR(selftest);
1622
1623 if (!regs || user_mode(regs))
1624 return NOTIFY_DONE;
1625
1626 if (val != DIE_INT3)
1627 return NOTIFY_DONE;
1628
1629 if (regs->ip - INT3_INSN_SIZE != selftest)
1630 return NOTIFY_DONE;
1631
1632 int3_emulate_call(regs, (unsigned long)&int3_magic);
1633 return NOTIFY_STOP;
1634}
1635
1636/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
1637static noinline void __init int3_selftest(void)
1638{
1639 static __initdata struct notifier_block int3_exception_nb = {
1640 .notifier_call = int3_exception_notify,
1641 .priority = INT_MAX-1, /* last */
1642 };
1643 unsigned int val = 0;
1644
1645 BUG_ON(register_die_notifier(&int3_exception_nb));
1646
1647 /*
1648 * Basically: int3_magic(&val); but really complicated :-)
1649 *
1650 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
1651 * notifier above will emulate CALL for us.
1652 */
1653 asm volatile ("int3_selftest_ip:\n\t"
1654 ANNOTATE_NOENDBR
1655 " int3; nop; nop; nop; nop\n\t"
1656 : ASM_CALL_CONSTRAINT
1657 : __ASM_SEL_RAW(a, D) (&val)
1658 : "memory");
1659
1660 BUG_ON(val != 1);
1661
1662 unregister_die_notifier(&int3_exception_nb);
1663}
1664
1665static __initdata int __alt_reloc_selftest_addr;
1666
1667extern void __init __alt_reloc_selftest(void *arg);
1668__visible noinline void __init __alt_reloc_selftest(void *arg)
1669{
1670 WARN_ON(arg != &__alt_reloc_selftest_addr);
1671}
1672
1673static noinline void __init alt_reloc_selftest(void)
1674{
1675 /*
1676 * Tests apply_relocation().
1677 *
1678 * This has a relative immediate (CALL) in a place other than the first
1679 * instruction and additionally on x86_64 we get a RIP-relative LEA:
1680 *
1681 * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c
1682 * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4
1683 *
1684 * Getting this wrong will either crash and burn or tickle the WARN
1685 * above.
1686 */
1687 asm_inline volatile (
1688 ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
1689 : ASM_CALL_CONSTRAINT
1690 : [mem] "m" (__alt_reloc_selftest_addr)
1691 : _ASM_ARG1
1692 );
1693}
1694
1695void __init alternative_instructions(void)
1696{
1697 int3_selftest();
1698
1699 /*
1700 * The patching is not fully atomic, so try to avoid local
1701 * interruptions that might execute the code being patched.
1702 * Other CPUs are not running.
1703 */
1704 stop_nmi();
1705
1706 /*
1707 * Don't stop machine check exceptions while patching.
1708 * MCEs only happen when something got corrupted and in this
1709 * case we must do something about the corruption.
1710 * Ignoring it is worse than an unlikely patching race.
1711 * Also machine checks tend to be broadcast and if one CPU
1712 * goes into machine check the others follow quickly, so we don't
1713 * expect a machine check to cause undue problems during code
1714 * patching.
1715 */
1716
1717 /*
1718 * Make sure to set (artificial) features depending on used paravirt
1719 * functions which can later influence alternative patching.
1720 */
1721 paravirt_set_cap();
1722
1723 __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
1724 __cfi_sites, __cfi_sites_end, NULL);
1725
1726 /*
1727 * Rewrite the retpolines, must be done before alternatives since
1728 * those can rewrite the retpoline thunks.
1729 */
1730 apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL);
1731 apply_returns(__return_sites, __return_sites_end, NULL);
1732
1733 apply_alternatives(__alt_instructions, __alt_instructions_end, NULL);
1734
1735 /*
1736 * Now all calls are established. Apply the call thunks if
1737 * required.
1738 */
1739 callthunks_patch_builtin_calls();
1740
1741 /*
1742 * Seal all functions that do not have their address taken.
1743 */
1744 apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL);
1745
1746#ifdef CONFIG_SMP
1747 /* Patch to UP if other cpus not imminent. */
1748 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
1749 uniproc_patched = true;
1750 alternatives_smp_module_add(NULL, "core kernel",
1751 __smp_locks, __smp_locks_end,
1752 _text, _etext);
1753 }
1754
1755 if (!uniproc_patched || num_possible_cpus() == 1) {
1756 free_init_pages("SMP alternatives",
1757 (unsigned long)__smp_locks,
1758 (unsigned long)__smp_locks_end);
1759 }
1760#endif
1761
1762 restart_nmi();
1763 alternatives_patched = 1;
1764
1765 alt_reloc_selftest();
1766}
1767
1768/**
1769 * text_poke_early - Update instructions on a live kernel at boot time
1770 * @addr: address to modify
1771 * @opcode: source of the copy
1772 * @len: length to copy
1773 *
1774 * When you use this code to patch more than one byte of an instruction
1775 * you need to make sure that other CPUs cannot execute this code in parallel.
1776 * Also, no thread may currently be preempted in the middle of these
1777 * instructions. And on the local CPU you need to be protected against NMI or
1778 * MCE handlers seeing an inconsistent instruction while you patch.
1779 */
1780void __init_or_module text_poke_early(void *addr, const void *opcode,
1781 size_t len)
1782{
1783 unsigned long flags;
1784
1785 if (boot_cpu_has(X86_FEATURE_NX) &&
1786 is_module_text_address((unsigned long)addr)) {
1787 /*
1788 * Modules text is marked initially as non-executable, so the
1789 * code cannot be running and speculative code-fetches are
1790 * prevented. Just change the code.
1791 */
1792 memcpy(addr, opcode, len);
1793 } else {
1794 local_irq_save(flags);
1795 memcpy(addr, opcode, len);
1796 sync_core();
1797 local_irq_restore(flags);
1798
1799 /*
1800 * Could also do a CLFLUSH here to speed up CPU recovery; but
1801 * that causes hangs on some VIA CPUs.
1802 */
1803 }
1804}
1805
1806typedef struct {
1807 struct mm_struct *mm;
1808} temp_mm_state_t;
1809
1810/*
1811 * Using a temporary mm allows us to set temporary mappings that are not accessible
1812 * by other CPUs. Such mappings are needed to perform sensitive memory writes
1813 * that override the kernel memory protections (e.g., W^X), without exposing the
1814 * temporary page-table mappings that are required for these write operations to
1815 * other CPUs. Using a temporary mm also avoids TLB shootdowns when the
1816 * mapping is torn down.
1817 *
1818 * Context: The temporary mm needs to be used exclusively by a single core. To
1819 * harden security IRQs must be disabled while the temporary mm is
1820 * loaded, thereby preventing interrupt handler bugs from overriding
1821 * the kernel memory protection.
1822 */
1823static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1824{
1825 temp_mm_state_t temp_state;
1826
1827 lockdep_assert_irqs_disabled();
1828
1829 /*
1830 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1831 * with a stale address space WITHOUT being in lazy mode after
1832 * restoring the previous mm.
1833 */
1834 if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1835 leave_mm();
1836
1837 temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1838 switch_mm_irqs_off(NULL, mm, current);
1839
1840 /*
1841 * If breakpoints are enabled, disable them while the temporary mm is
1842 * used. Userspace might set up watchpoints on addresses that are used
1843 * in the temporary mm, which would lead to wrong signals being sent or
1844 * crashes.
1845 *
1846 * Note that breakpoints are not disabled selectively, which also causes
1847 * kernel breakpoints (e.g., perf's) to be disabled. This might be
1848 * undesirable, but still seems reasonable as the code that runs in the
1849 * temporary mm should be short.
1850 */
1851 if (hw_breakpoint_active())
1852 hw_breakpoint_disable();
1853
1854 return temp_state;
1855}
1856
1857static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
1858{
1859 lockdep_assert_irqs_disabled();
1860 switch_mm_irqs_off(NULL, prev_state.mm, current);
1861
1862 /*
1863 * Restore the breakpoints if they were disabled before the temporary mm
1864 * was loaded.
1865 */
1866 if (hw_breakpoint_active())
1867 hw_breakpoint_restore();
1868}
1869
1870__ro_after_init struct mm_struct *poking_mm;
1871__ro_after_init unsigned long poking_addr;
1872
1873static void text_poke_memcpy(void *dst, const void *src, size_t len)
1874{
1875 memcpy(dst, src, len);
1876}
1877
1878static void text_poke_memset(void *dst, const void *src, size_t len)
1879{
1880 int c = *(const int *)src;
1881
1882 memset(dst, c, len);
1883}
1884
1885typedef void text_poke_f(void *dst, const void *src, size_t len);
1886
1887static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
1888{
1889 bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
1890 struct page *pages[2] = {NULL};
1891 temp_mm_state_t prev;
1892 unsigned long flags;
1893 pte_t pte, *ptep;
1894 spinlock_t *ptl;
1895 pgprot_t pgprot;
1896
1897 /*
1898 * While the boot memory allocator is running we cannot use struct pages as
1899 * they are not yet initialized. There is no way to recover.
1900 */
1901 BUG_ON(!after_bootmem);
1902
1903 if (!core_kernel_text((unsigned long)addr)) {
1904 pages[0] = vmalloc_to_page(addr);
1905 if (cross_page_boundary)
1906 pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
1907 } else {
1908 pages[0] = virt_to_page(addr);
1909 WARN_ON(!PageReserved(pages[0]));
1910 if (cross_page_boundary)
1911 pages[1] = virt_to_page(addr + PAGE_SIZE);
1912 }
1913 /*
1914 * If something went wrong, crash and burn since recovery paths are not
1915 * implemented.
1916 */
1917 BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
1918
1919 /*
1920 * Map the page without the global bit, as TLB flushing is done with
1921 * flush_tlb_mm_range(), which is intended for non-global PTEs.
1922 */
1923 pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
1924
1925 /*
1926 * The lock is not really needed, but it allows us to avoid open-coding.
1927 */
1928 ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
1929
1930 /*
1931 * This must not fail; preallocated in poking_init().
1932 */
1933 VM_BUG_ON(!ptep);
1934
1935 local_irq_save(flags);
1936
1937 pte = mk_pte(pages[0], pgprot);
1938 set_pte_at(poking_mm, poking_addr, ptep, pte);
1939
1940 if (cross_page_boundary) {
1941 pte = mk_pte(pages[1], pgprot);
1942 set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
1943 }
1944
1945 /*
1946 * Loading the temporary mm behaves as a compiler barrier, which
1947 * guarantees that the PTE will be set at the time memcpy() is done.
1948 */
1949 prev = use_temporary_mm(poking_mm);
1950
1951 kasan_disable_current();
1952 func((u8 *)poking_addr + offset_in_page(addr), src, len);
1953 kasan_enable_current();
1954
1955 /*
1956 * Ensure that the PTE is only cleared after the instructions of memcpy
1957 * were issued by using a compiler barrier.
1958 */
1959 barrier();
1960
1961 pte_clear(poking_mm, poking_addr, ptep);
1962 if (cross_page_boundary)
1963 pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
1964
1965 /*
1966 * Loading the previous page-table hierarchy requires a serializing
1967 * instruction that already allows the core to see the updated version.
1968 * Xen-PV is assumed to serialize execution in a similar manner.
1969 */
1970 unuse_temporary_mm(prev);
1971
1972 /*
1973 * Flushing the TLB might involve IPIs, which would require enabled
1974 * IRQs, but not if the mm is not used, as is the case at this point.
1975 */
1976 flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
1977 (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
1978 PAGE_SHIFT, false);
1979
1980 if (func == text_poke_memcpy) {
1981 /*
1982 * If the text does not match what we just wrote then something is
1983 * fundamentally screwy; there's nothing we can really do about that.
1984 */
1985 BUG_ON(memcmp(addr, src, len));
1986 }
1987
1988 local_irq_restore(flags);
1989 pte_unmap_unlock(ptep, ptl);
1990 return addr;
1991}
1992
1993/**
1994 * text_poke - Update instructions on a live kernel
1995 * @addr: address to modify
1996 * @opcode: source of the copy
1997 * @len: length to copy
1998 *
1999 * Only atomic text poke/set should be allowed when not doing early patching.
2000 * It means the size must be writable atomically and the address must be aligned
2001 * in a way that permits an atomic write. It also makes sure we fit on a single
2002 * page.
2003 *
2004 * Note that the caller must ensure that if the modified code is part of a
2005 * module, the module would not be removed during poking. This can be achieved
2006 * by registering a module notifier, and ordering module removal and patching
2007 * through a mutex.
2008 */
2009void *text_poke(void *addr, const void *opcode, size_t len)
2010{
2011 lockdep_assert_held(&text_mutex);
2012
2013 return __text_poke(text_poke_memcpy, addr, opcode, len);
2014}
2015
2016/**
2017 * text_poke_kgdb - Update instructions on a live kernel by kgdb
2018 * @addr: address to modify
2019 * @opcode: source of the copy
2020 * @len: length to copy
2021 *
2022 * Only atomic text poke/set should be allowed when not doing early patching.
2023 * It means the size must be writable atomically and the address must be aligned
2024 * in a way that permits an atomic write. It also makes sure we fit on a single
2025 * page.
2026 *
2027 * Context: should only be used by kgdb, which ensures no other core is running,
2028 * despite the fact it does not hold the text_mutex.
2029 */
2030void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
2031{
2032 return __text_poke(text_poke_memcpy, addr, opcode, len);
2033}
2034
2035void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
2036 bool core_ok)
2037{
2038 unsigned long start = (unsigned long)addr;
2039 size_t patched = 0;
2040
2041 if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
2042 return NULL;
2043
2044 while (patched < len) {
2045 unsigned long ptr = start + patched;
2046 size_t s;
2047
2048 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2049
2050 __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
2051 patched += s;
2052 }
2053 return addr;
2054}
2055
2056/**
2057 * text_poke_copy - Copy instructions into (an unused part of) RX memory
2058 * @addr: address to modify
2059 * @opcode: source of the copy
2060 * @len: length to copy, could be more than 2x PAGE_SIZE
2061 *
2062 * Not safe against concurrent execution; useful for JITs to dump
2063 * new code blocks into unused regions of RX memory. Can be used in
2064 * conjunction with synchronize_rcu_tasks() to wait for existing
2065 * execution to quiesce after having made sure no existing function
2066 * pointers are live.
2067 */
2068void *text_poke_copy(void *addr, const void *opcode, size_t len)
2069{
2070 mutex_lock(&text_mutex);
2071 addr = text_poke_copy_locked(addr, opcode, len, false);
2072 mutex_unlock(&text_mutex);
2073 return addr;
2074}
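
/*
 * Illustrative sketch only: how a JIT-like user might publish a freshly
 * built image into RX memory with text_poke_copy().  The helper name is
 * made up; the key assumptions are that @rx_buf is not yet reachable by
 * any execution path and that the caller handles quiescing (e.g. via
 * synchronize_rcu_tasks()) before exposing function pointers into it.
 */
static void __maybe_unused example_jit_publish(void *rx_buf, const void *image,
					       size_t size)
{
	if (!text_poke_copy(rx_buf, image, size))
		pr_warn("refused to copy over core kernel text\n");
}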
2075
2076/**
2077 * text_poke_set - memset into (an unused part of) RX memory
2078 * @addr: address to modify
2079 * @c: the byte to fill the area with
2080 * @len: length to copy, could be more than 2x PAGE_SIZE
2081 *
2082 * This is useful to overwrite unused regions of RX memory with illegal
2083 * instructions.
2084 */
2085void *text_poke_set(void *addr, int c, size_t len)
2086{
2087 unsigned long start = (unsigned long)addr;
2088 size_t patched = 0;
2089
2090 if (WARN_ON_ONCE(core_kernel_text(start)))
2091 return NULL;
2092
2093 mutex_lock(&text_mutex);
2094 while (patched < len) {
2095 unsigned long ptr = start + patched;
2096 size_t s;
2097
2098 s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2099
2100 __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
2101 patched += s;
2102 }
2103 mutex_unlock(&text_mutex);
2104 return addr;
2105}
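
/*
 * Illustrative sketch only: fill the unused tail of an RX region with INT3
 * (0xcc) so that a stray jump into it traps immediately.  The helper name
 * is hypothetical; text_poke_set() is the interface documented above.
 */
static void __maybe_unused example_poison_rx_tail(void *unused_rx, size_t size)
{
	text_poke_set(unused_rx, INT3_INSN_OPCODE, size);
}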
2106
2107static void do_sync_core(void *info)
2108{
2109 sync_core();
2110}
2111
2112void text_poke_sync(void)
2113{
2114 on_each_cpu(do_sync_core, NULL, 1);
2115}
2116
2117/*
2118 * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
2119 * this thing. When len == 6 everything is prefixed with 0x0f and we map
2120 * opcode to Jcc.d8, using len to distinguish.
2121 */
2122struct text_poke_loc {
2123 /* addr := _stext + rel_addr */
2124 s32 rel_addr;
2125 s32 disp;
2126 u8 len;
2127 u8 opcode;
2128 const u8 text[POKE_MAX_OPCODE_SIZE];
2129 /* see text_poke_bp_batch() */
2130 u8 old;
2131};
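
/*
 * Worked example of the scheme above (byte values for illustration only):
 * a 6-byte "jne" Jcc.d32 is encoded as 0f 85 <rel32>.  text_poke_loc_init()
 * stores it with the 0x0f prefix dropped (text[] holds 85 <rel32>), maps the
 * opcode to its Jcc.d8 form (0x85 - 0x10 == 0x75) and sets len == 6;
 * text_poke_bp_batch() re-inserts the 0x0f prefix based on len when writing
 * the instruction back.
 */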
2132
2133struct bp_patching_desc {
2134 struct text_poke_loc *vec;
2135 int nr_entries;
2136 atomic_t refs;
2137};
2138
2139static struct bp_patching_desc bp_desc;
2140
2141static __always_inline
2142struct bp_patching_desc *try_get_desc(void)
2143{
2144 struct bp_patching_desc *desc = &bp_desc;
2145
2146 if (!raw_atomic_inc_not_zero(&desc->refs))
2147 return NULL;
2148
2149 return desc;
2150}
2151
2152static __always_inline void put_desc(void)
2153{
2154 struct bp_patching_desc *desc = &bp_desc;
2155
2156 smp_mb__before_atomic();
2157 raw_atomic_dec(&desc->refs);
2158}
2159
2160static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
2161{
2162 return _stext + tp->rel_addr;
2163}
2164
2165static __always_inline int patch_cmp(const void *key, const void *elt)
2166{
2167 struct text_poke_loc *tp = (struct text_poke_loc *) elt;
2168
2169 if (key < text_poke_addr(tp))
2170 return -1;
2171 if (key > text_poke_addr(tp))
2172 return 1;
2173 return 0;
2174}
2175
2176noinstr int poke_int3_handler(struct pt_regs *regs)
2177{
2178 struct bp_patching_desc *desc;
2179 struct text_poke_loc *tp;
2180 int ret = 0;
2181 void *ip;
2182
2183 if (user_mode(regs))
2184 return 0;
2185
2186 /*
2187 * Having observed our INT3 instruction, we now must observe
2188 * bp_desc with non-zero refcount:
2189 *
2190 * bp_desc.refs = 1 INT3
2191 * WMB RMB
2192 * write INT3 if (bp_desc.refs != 0)
2193 */
2194 smp_rmb();
2195
2196 desc = try_get_desc();
2197 if (!desc)
2198 return 0;
2199
2200 /*
2201 * Discount the INT3. See text_poke_bp_batch().
2202 */
2203 ip = (void *) regs->ip - INT3_INSN_SIZE;
2204
2205 /*
2206 * Skip the binary search if there is a single member in the vector.
2207 */
2208 if (unlikely(desc->nr_entries > 1)) {
2209 tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
2210 sizeof(struct text_poke_loc),
2211 patch_cmp);
2212 if (!tp)
2213 goto out_put;
2214 } else {
2215 tp = desc->vec;
2216 if (text_poke_addr(tp) != ip)
2217 goto out_put;
2218 }
2219
2220 ip += tp->len;
2221
2222 switch (tp->opcode) {
2223 case INT3_INSN_OPCODE:
2224 /*
2225 * Someone poked an explicit INT3, they'll want to handle it,
2226 * do not consume.
2227 */
2228 goto out_put;
2229
2230 case RET_INSN_OPCODE:
2231 int3_emulate_ret(regs);
2232 break;
2233
2234 case CALL_INSN_OPCODE:
2235 int3_emulate_call(regs, (long)ip + tp->disp);
2236 break;
2237
2238 case JMP32_INSN_OPCODE:
2239 case JMP8_INSN_OPCODE:
2240 int3_emulate_jmp(regs, (long)ip + tp->disp);
2241 break;
2242
2243 case 0x70 ... 0x7f: /* Jcc */
2244 int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
2245 break;
2246
2247 default:
2248 BUG();
2249 }
2250
2251 ret = 1;
2252
2253out_put:
2254 put_desc();
2255 return ret;
2256}
2257
2258#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
2259static struct text_poke_loc tp_vec[TP_VEC_MAX];
2260static int tp_vec_nr;
2261
2262/**
2263 * text_poke_bp_batch() -- update instructions on live kernel on SMP
2264 * @tp: vector of instructions to patch
2265 * @nr_entries: number of entries in the vector
2266 *
2267 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
2268 * We completely avoid stop_machine() here, and achieve the
2269 * synchronization using the int3 breakpoint.
2270 *
2271 * The way it is done:
2272 * - For each entry in the vector:
2273 * - add an int3 trap to the address that will be patched
2274 * - sync cores
2275 * - For each entry in the vector:
2276 * - update all but the first byte of the patched range
2277 * - sync cores
2278 * - For each entry in the vector:
2279 * - replace the first byte (int3) with the first byte of the
2280 * replacement opcode
2281 * - sync cores
2282 */
2283static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
2284{
2285 unsigned char int3 = INT3_INSN_OPCODE;
2286 unsigned int i;
2287 int do_sync;
2288
2289 lockdep_assert_held(&text_mutex);
2290
2291 bp_desc.vec = tp;
2292 bp_desc.nr_entries = nr_entries;
2293
2294 /*
2295 * Corresponds to the implicit memory barrier in try_get_desc() to
2296 * ensure reading a non-zero refcount provides up to date bp_desc data.
2297 */
2298 atomic_set_release(&bp_desc.refs, 1);
2299
2300 /*
2301 * Function tracing can enable thousands of places that need to be
2302 * updated. This can take quite some time, and with full kernel debugging
2303 * enabled, this could cause the softlockup watchdog to trigger.
2304 * This function gets called every 256 entries added to be patched.
2305 * Call cond_resched() here to make sure that other tasks can get scheduled
2306 * while processing all the functions being patched.
2307 */
2308 cond_resched();
2309
2310 /*
2311 * Corresponding read barrier in int3 notifier for making sure the
2312 * nr_entries and handler are correctly ordered wrt. patching.
2313 */
2314 smp_wmb();
2315
2316 /*
2317 * First step: add an int3 trap to the address that will be patched.
2318 */
2319 for (i = 0; i < nr_entries; i++) {
2320 tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
2321 text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
2322 }
2323
2324 text_poke_sync();
2325
2326 /*
2327 * Second step: update all but the first byte of the patched range.
2328 */
2329 for (do_sync = 0, i = 0; i < nr_entries; i++) {
2330 u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
2331 u8 _new[POKE_MAX_OPCODE_SIZE+1];
2332 const u8 *new = tp[i].text;
2333 int len = tp[i].len;
2334
2335 if (len - INT3_INSN_SIZE > 0) {
2336 memcpy(old + INT3_INSN_SIZE,
2337 text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2338 len - INT3_INSN_SIZE);
2339
2340 if (len == 6) {
2341 _new[0] = 0x0f;
2342 memcpy(_new + 1, new, 5);
2343 new = _new;
2344 }
2345
2346 text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2347 new + INT3_INSN_SIZE,
2348 len - INT3_INSN_SIZE);
2349
2350 do_sync++;
2351 }
2352
2353 /*
2354 * Emit a perf event to record the text poke, primarily to
2355 * support Intel PT decoding which must walk the executable code
2356 * to reconstruct the trace. The flow up to here is:
2357 * - write INT3 byte
2358 * - IPI-SYNC
2359 * - write instruction tail
2360 * At this point the actual control flow will be through the
2361 * INT3 and handler and not hit the old or new instruction.
2362 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
2363 * can still be decoded. Subsequently:
2364 * - emit RECORD_TEXT_POKE with the new instruction
2365 * - IPI-SYNC
2366 * - write first byte
2367 * - IPI-SYNC
2368 * So before the text poke event timestamp, the decoder will see
2369 * either the old instruction flow or FUP/TIP of INT3. After the
2370 * text poke event timestamp, the decoder will see either the
2371 * new instruction flow or FUP/TIP of INT3. Thus decoders can
2372 * use the timestamp as the point at which to modify the
2373 * executable code.
2374 * The old instruction is recorded so that the event can be
2375 * processed forwards or backwards.
2376 */
2377 perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
2378 }
2379
2380 if (do_sync) {
2381 /*
2382 * According to Intel, this core syncing is very likely
2383 * not necessary and we'd be safe even without it. But
2384 * better safe than sorry (plus there's not only Intel).
2385 */
2386 text_poke_sync();
2387 }
2388
2389 /*
2390 * Third step: replace the first byte (int3) with the first byte of the
2391 * replacement opcode.
2392 */
2393 for (do_sync = 0, i = 0; i < nr_entries; i++) {
2394 u8 byte = tp[i].text[0];
2395
2396 if (tp[i].len == 6)
2397 byte = 0x0f;
2398
2399 if (byte == INT3_INSN_OPCODE)
2400 continue;
2401
2402 text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
2403 do_sync++;
2404 }
2405
2406 if (do_sync)
2407 text_poke_sync();
2408
2409 /*
2410 * Remove and wait for refs to be zero.
2411 */
2412 if (!atomic_dec_and_test(&bp_desc.refs))
2413 atomic_cond_read_acquire(&bp_desc.refs, !VAL);
2414}
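
/*
 * Illustrative sketch only: batching several pokes through the
 * text_poke_queue()/text_poke_finish() interface defined below, which feeds
 * text_poke_bp_batch() so the whole set is synchronized with a handful of
 * IPI rounds instead of one set per patch site.  The helper name and the
 * idea of NOPing out 5-byte sites are hypothetical.
 */
static void __maybe_unused example_nop_out_sites(void **sites, int nr)
{
	int i;

	mutex_lock(&text_mutex);
	/*
	 * Queueing in ascending address order keeps the batches large; an
	 * out-of-order address merely forces an early flush.
	 */
	for (i = 0; i < nr; i++)
		text_poke_queue(sites[i], x86_nops[5], 5, NULL);
	text_poke_finish();
	mutex_unlock(&text_mutex);
}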
2415
2416static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
2417 const void *opcode, size_t len, const void *emulate)
2418{
2419 struct insn insn;
2420 int ret, i = 0;
2421
2422 if (len == 6)
2423 i = 1;
2424 memcpy((void *)tp->text, opcode+i, len-i);
2425 if (!emulate)
2426 emulate = opcode;
2427
2428 ret = insn_decode_kernel(&insn, emulate);
2429 BUG_ON(ret < 0);
2430
2431 tp->rel_addr = addr - (void *)_stext;
2432 tp->len = len;
2433 tp->opcode = insn.opcode.bytes[0];
2434
2435 if (is_jcc32(&insn)) {
2436 /*
2437 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
2438 */
2439 tp->opcode = insn.opcode.bytes[1] - 0x10;
2440 }
2441
2442 switch (tp->opcode) {
2443 case RET_INSN_OPCODE:
2444 case JMP32_INSN_OPCODE:
2445 case JMP8_INSN_OPCODE:
2446 /*
2447 * Control flow instructions without implied execution of the
2448 * next instruction can be padded with INT3.
2449 */
2450 for (i = insn.length; i < len; i++)
2451 BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
2452 break;
2453
2454 default:
2455 BUG_ON(len != insn.length);
2456 }
2457
2458 switch (tp->opcode) {
2459 case INT3_INSN_OPCODE:
2460 case RET_INSN_OPCODE:
2461 break;
2462
2463 case CALL_INSN_OPCODE:
2464 case JMP32_INSN_OPCODE:
2465 case JMP8_INSN_OPCODE:
2466 case 0x70 ... 0x7f: /* Jcc */
2467 tp->disp = insn.immediate.value;
2468 break;
2469
2470 default: /* assume NOP */
2471 switch (len) {
2472 case 2: /* NOP2 -- emulate as JMP8+0 */
2473 BUG_ON(memcmp(emulate, x86_nops[len], len));
2474 tp->opcode = JMP8_INSN_OPCODE;
2475 tp->disp = 0;
2476 break;
2477
2478 case 5: /* NOP5 -- emulate as JMP32+0 */
2479 BUG_ON(memcmp(emulate, x86_nops[len], len));
2480 tp->opcode = JMP32_INSN_OPCODE;
2481 tp->disp = 0;
2482 break;
2483
2484 default: /* unknown instruction */
2485 BUG();
2486 }
2487 break;
2488 }
2489}
2490
2491/*
2492 * We rely heavily on tp_vec being ordered; ensure this is so by flushing
2493 * early if needed.
2494 */
2495static bool tp_order_fail(void *addr)
2496{
2497 struct text_poke_loc *tp;
2498
2499 if (!tp_vec_nr)
2500 return false;
2501
2502 if (!addr) /* force */
2503 return true;
2504
2505 tp = &tp_vec[tp_vec_nr - 1];
2506 if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
2507 return true;
2508
2509 return false;
2510}
2511
2512static void text_poke_flush(void *addr)
2513{
2514 if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
2515 text_poke_bp_batch(tp_vec, tp_vec_nr);
2516 tp_vec_nr = 0;
2517 }
2518}
2519
2520void text_poke_finish(void)
2521{
2522 text_poke_flush(NULL);
2523}
2524
2525void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
2526{
2527 struct text_poke_loc *tp;
2528
2529 text_poke_flush(addr);
2530
2531 tp = &tp_vec[tp_vec_nr++];
2532 text_poke_loc_init(tp, addr, opcode, len, emulate);
2533}
2534
2535/**
2536 * text_poke_bp() -- update instructions on live kernel on SMP
2537 * @addr: address to patch
2538 * @opcode: opcode of new instruction
2539 * @len: length to copy
2540 * @emulate: instruction to be emulated
2541 *
2542 * Update a single instruction with the vector on the stack, avoiding
2543 * dynamically allocated memory. This function should be used when it is
2544 * not possible to allocate memory.
2545 */
2546void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
2547{
2548 struct text_poke_loc tp;
2549
2550 text_poke_loc_init(&tp, addr, opcode, len, emulate);
2551 text_poke_bp_batch(&tp, 1);
2552}
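
/*
 * Illustrative sketch only: redirecting a 5-byte call site with
 * text_poke_bp().  The helper name is hypothetical; the encoding (0xe8
 * followed by a 32-bit displacement relative to the end of the instruction)
 * and the requirement to hold text_mutex are not.
 */
static void __maybe_unused example_redirect_call(void *site, void *target)
{
	u8 insn[CALL_INSN_SIZE];
	s32 disp = (long)target - ((long)site + CALL_INSN_SIZE);

	insn[0] = CALL_INSN_OPCODE;
	memcpy(insn + 1, &disp, sizeof(disp));

	mutex_lock(&text_mutex);
	text_poke_bp(site, insn, sizeof(insn), NULL);
	mutex_unlock(&text_mutex);
}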