v6.13.7
   1// SPDX-License-Identifier: GPL-2.0-only
   2#define pr_fmt(fmt) "SMP alternatives: " fmt
   3
   4#include <linux/module.h>
   5#include <linux/sched.h>
   6#include <linux/perf_event.h>
   7#include <linux/mutex.h>
   8#include <linux/list.h>
   9#include <linux/stringify.h>
  10#include <linux/highmem.h>
  11#include <linux/mm.h>
  12#include <linux/vmalloc.h>
  13#include <linux/memory.h>
  14#include <linux/stop_machine.h>
  15#include <linux/slab.h>
  16#include <linux/kdebug.h>
  17#include <linux/kprobes.h>
  18#include <linux/mmu_context.h>
  19#include <linux/bsearch.h>
  20#include <linux/sync_core.h>
  21#include <asm/text-patching.h>
  22#include <asm/alternative.h>
  23#include <asm/sections.h>
  24#include <asm/mce.h>
  25#include <asm/nmi.h>
  26#include <asm/cacheflush.h>
  27#include <asm/tlbflush.h>
  28#include <asm/insn.h>
  29#include <asm/io.h>
  30#include <asm/fixmap.h>
  31#include <asm/paravirt.h>
  32#include <asm/asm-prototypes.h>
  33#include <asm/cfi.h>
  34
  35int __read_mostly alternatives_patched;
  36
  37EXPORT_SYMBOL_GPL(alternatives_patched);
  38
  39#define MAX_PATCH_LEN (255-1)
  40
  41#define DA_ALL		(~0)
  42#define DA_ALT		0x01
  43#define DA_RET		0x02
  44#define DA_RETPOLINE	0x04
  45#define DA_ENDBR	0x08
  46#define DA_SMP		0x10
  47
  48static unsigned int debug_alternative;
  49
  50static int __init debug_alt(char *str)
  51{
  52	if (str && *str == '=')
  53		str++;
  54
  55	if (!str || kstrtouint(str, 0, &debug_alternative))
  56		debug_alternative = DA_ALL;
  57
  58	return 1;
  59}
  60__setup("debug-alternative", debug_alt);
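/*
 * Editorial note, not part of the original file: the DA_* bits above are
 * selected from the kernel command line, which debug_alt() parses with
 * kstrtouint(..., 0, ...), so plain decimal or 0x-prefixed hex both work:
 *
 *   debug-alternative            all debug output (DA_ALL)
 *   debug-alternative=0x06       DA_RET | DA_RETPOLINE only
 *   debug-alternative=0x10       DA_SMP only
 */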
  61
  62static int noreplace_smp;
  63
  64static int __init setup_noreplace_smp(char *str)
  65{
  66	noreplace_smp = 1;
  67	return 1;
  68}
  69__setup("noreplace-smp", setup_noreplace_smp);
  70
  71#define DPRINTK(type, fmt, args...)					\
  72do {									\
  73	if (debug_alternative & DA_##type)				\
  74		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
  75} while (0)
  76
  77#define DUMP_BYTES(type, buf, len, fmt, args...)			\
  78do {									\
  79	if (unlikely(debug_alternative & DA_##type)) {			\
  80		int j;							\
  81									\
  82		if (!(len))						\
  83			break;						\
  84									\
  85		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
  86		for (j = 0; j < (len) - 1; j++)				\
  87			printk(KERN_CONT "%02hhx ", buf[j]);		\
  88		printk(KERN_CONT "%02hhx\n", buf[j]);			\
  89	}								\
  90} while (0)
  91
  92static const unsigned char x86nops[] =
  93{
  94	BYTES_NOP1,
  95	BYTES_NOP2,
  96	BYTES_NOP3,
  97	BYTES_NOP4,
  98	BYTES_NOP5,
  99	BYTES_NOP6,
 100	BYTES_NOP7,
 101	BYTES_NOP8,
 102#ifdef CONFIG_64BIT
 103	BYTES_NOP9,
 104	BYTES_NOP10,
 105	BYTES_NOP11,
 106#endif
 107};
 108
 109const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
 110{
 111	NULL,
 112	x86nops,
 113	x86nops + 1,
 114	x86nops + 1 + 2,
 115	x86nops + 1 + 2 + 3,
 116	x86nops + 1 + 2 + 3 + 4,
 117	x86nops + 1 + 2 + 3 + 4 + 5,
 118	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
 119	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 120#ifdef CONFIG_64BIT
 121	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 122	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
 123	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
 124#endif
 125};
 126
 127/*
 128 * Nomenclature for variable names to simplify and clarify this code and ease
 129 * any potential staring at it:
 130 *
 131 * @instr: source address of the original instructions in the kernel text as
 132 * generated by the compiler.
 133 *
 134 * @buf: temporary buffer on which the patching operates. This buffer is
 135 * eventually text-poked into the kernel image.
 136 *
 137 * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
 138 * in the .altinstr_replacement section.
 139 */
 140
 141/*
 142 * Fill the buffer with a single effective instruction of size @len.
 143 *
 144 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
 145 * for every single-byte NOP, try to generate the maximally available NOP of
 146 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
  147 * each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
 148 * *jump* over instead of executing long and daft NOPs.
 149 */
 150static void add_nop(u8 *buf, unsigned int len)
 151{
 152	u8 *target = buf + len;
 153
 154	if (!len)
 155		return;
 156
 157	if (len <= ASM_NOP_MAX) {
 158		memcpy(buf, x86_nops[len], len);
 159		return;
 160	}
 161
 162	if (len < 128) {
 163		__text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
 164		buf += JMP8_INSN_SIZE;
 165	} else {
 166		__text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
 167		buf += JMP32_INSN_SIZE;
 168	}
 169
 170	for (;buf < target; buf++)
 171		*buf = INT3_INSN_OPCODE;
 172}
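/*
 * Illustrative examples of add_nop() output (editorial addition, assuming
 * the 64-bit table above, i.e. ASM_NOP_MAX == 11):
 *
 *   len == 4    ->  x86_nops[4] (0f 1f 40 00)         one 4-byte NOP
 *   len == 20   ->  eb 12, then 18 x cc               JMP8 +18 over INT3 padding
 *   len == 200  ->  e9 c3 00 00 00, then 195 x cc     JMP32 +195 over INT3 padding
 */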
 173
 174extern s32 __retpoline_sites[], __retpoline_sites_end[];
 175extern s32 __return_sites[], __return_sites_end[];
 176extern s32 __cfi_sites[], __cfi_sites_end[];
 177extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
 178extern s32 __smp_locks[], __smp_locks_end[];
 179void text_poke_early(void *addr, const void *opcode, size_t len);
 180
 181/*
 182 * Matches NOP and NOPL, not any of the other possible NOPs.
 183 */
 184static bool insn_is_nop(struct insn *insn)
 185{
 186	/* Anything NOP, but no REP NOP */
 187	if (insn->opcode.bytes[0] == 0x90 &&
 188	    (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
 189		return true;
 190
 191	/* NOPL */
 192	if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
 193		return true;
 194
 195	/* TODO: more nops */
 196
 197	return false;
 198}
 199
 200/*
 201 * Find the offset of the first non-NOP instruction starting at @offset
 202 * but no further than @len.
 203 */
 204static int skip_nops(u8 *buf, int offset, int len)
 205{
 206	struct insn insn;
 207
 208	for (; offset < len; offset += insn.length) {
 209		if (insn_decode_kernel(&insn, &buf[offset]))
 210			break;
 211
 212		if (!insn_is_nop(&insn))
 213			break;
 214	}
 215
 216	return offset;
 217}
 218
 219/*
 220 * "noinline" to cause control flow change and thus invalidate I$ and
 221 * cause refetch after modification.
 222 */
 223static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
 224{
 225	for (int next, i = 0; i < len; i = next) {
 226		struct insn insn;
 227
 228		if (insn_decode_kernel(&insn, &buf[i]))
 229			return;
 230
 231		next = i + insn.length;
 232
 233		if (insn_is_nop(&insn)) {
 234			int nop = i;
 235
 236			/* Has the NOP already been optimized? */
 237			if (i + insn.length == len)
 238				return;
 239
 240			next = skip_nops(buf, next, len);
 241
 242			add_nop(buf + nop, next - nop);
 243			DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
 244		}
 245	}
 246}
 247
 248/*
 249 * In this context, "source" is where the instructions are placed in the
 250 * section .altinstr_replacement, for example during kernel build by the
 251 * toolchain.
 252 * "Destination" is where the instructions are being patched in by this
 253 * machinery.
 254 *
 255 * The source offset is:
 256 *
 257 *   src_imm = target - src_next_ip                  (1)
 258 *
 259 * and the target offset is:
 260 *
 261 *   dst_imm = target - dst_next_ip                  (2)
 262 *
 263 * so rework (1) as an expression for target like:
 264 *
 265 *   target = src_imm + src_next_ip                  (1a)
 266 *
 267 * and substitute in (2) to get:
 268 *
 269 *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
 270 *
 271 * Now, since the instruction stream is 'identical' at src and dst (it
 272 * is being copied after all) it can be stated that:
 273 *
 274 *   src_next_ip = src + ip_offset
 275 *   dst_next_ip = dst + ip_offset                   (4)
 276 *
 277 * Substitute (4) in (3) and observe ip_offset being cancelled out to
 278 * obtain:
 279 *
 280 *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
 281 *           = src_imm + src - dst + ip_offset - ip_offset
 282 *           = src_imm + src - dst                   (5)
 283 *
 284 * IOW, only the relative displacement of the code block matters.
 285 */
 286
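/*
 * Worked example of (5), added for clarity (not in the original file):
 *
 *   src = 0x1000, dst = 0x4000, src_imm = 0x123
 *
 *   dst_imm = src_imm + src - dst
 *           = 0x123 + 0x1000 - 0x4000
 *           = -0x2edd
 *
 * Only the immediate changes; the rest of the instruction is copied as-is.
 */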
 287#define apply_reloc_n(n_, p_, d_)				\
 288	do {							\
 289		s32 v = *(s##n_ *)(p_);				\
 290		v += (d_);					\
 291		BUG_ON((v >> 31) != (v >> (n_-1)));		\
 292		*(s##n_ *)(p_) = (s##n_)v;			\
 293	} while (0)
 294
 295
 296static __always_inline
 297void apply_reloc(int n, void *ptr, uintptr_t diff)
 298{
 299	switch (n) {
 300	case 1: apply_reloc_n(8, ptr, diff); break;
 301	case 2: apply_reloc_n(16, ptr, diff); break;
 302	case 4: apply_reloc_n(32, ptr, diff); break;
 303	default: BUG();
 304	}
 305}
 306
 307static __always_inline
 308bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
 309{
 310	u8 *target = src + offset;
 311	/*
 312	 * If the target is inside the patched block, it's relative to the
 313	 * block itself and does not need relocation.
 314	 */
 315	return (target < src || target > src + src_len);
 316}
 317
 318static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
 319{
 320	for (int next, i = 0; i < instrlen; i = next) {
 321		struct insn insn;
 322
 323		if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
 324			return;
 325
 326		next = i + insn.length;
 327
 328		switch (insn.opcode.bytes[0]) {
 329		case 0x0f:
 330			if (insn.opcode.bytes[1] < 0x80 ||
 331			    insn.opcode.bytes[1] > 0x8f)
 332				break;
 333
 334			fallthrough;	/* Jcc.d32 */
 335		case 0x70 ... 0x7f:	/* Jcc.d8 */
 336		case JMP8_INSN_OPCODE:
 337		case JMP32_INSN_OPCODE:
 338		case CALL_INSN_OPCODE:
 339			if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
 340				apply_reloc(insn.immediate.nbytes,
 341					    buf + i + insn_offset_immediate(&insn),
 342					    repl - instr);
 343			}
 344
 345			/*
 346			 * Where possible, convert JMP.d32 into JMP.d8.
 347			 */
 348			if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
 349				s32 imm = insn.immediate.value;
 350				imm += repl - instr;
 351				imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
 352				if ((imm >> 31) == (imm >> 7)) {
 353					buf[i+0] = JMP8_INSN_OPCODE;
 354					buf[i+1] = (s8)imm;
 355
 356					memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
 357				}
 358			}
 359			break;
 360		}
 361
 362		if (insn_rip_relative(&insn)) {
 363			if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
 364				apply_reloc(insn.displacement.nbytes,
 365					    buf + i + insn_offset_displacement(&insn),
 366					    repl - instr);
 367			}
 368		}
 369	}
 370}
 371
 372void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
 373{
 374	__apply_relocation(buf, instr, instrlen, repl, repl_len);
 375	optimize_nops(instr, buf, instrlen);
 376}
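/*
 * Editorial example of the JMP.d32 -> JMP.d8 shrink performed above, shown
 * with repl == instr for simplicity so the relocation term is zero:
 *
 *   before:  e9 20 00 00 00      jmp .+0x25
 *   after:   eb 23 cc cc cc      jmp .+0x25, followed by INT3 padding
 *
 * The displacement grows by JMP32_INSN_SIZE - JMP8_INSN_SIZE == 3 because
 * the next-instruction address the displacement is relative to moves back
 * by the three bytes saved by the shorter encoding.
 */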
 377
 378/* Low-level backend functions usable from alternative code replacements. */
 379DEFINE_ASM_FUNC(nop_func, "", .entry.text);
 380EXPORT_SYMBOL_GPL(nop_func);
 381
 382noinstr void BUG_func(void)
 383{
 384	BUG();
 385}
 386EXPORT_SYMBOL(BUG_func);
 387
 388#define CALL_RIP_REL_OPCODE	0xff
 389#define CALL_RIP_REL_MODRM	0x15
 390
 391/*
 392 * Rewrite the "call BUG_func" replacement to point to the target of the
 393 * indirect pv_ops call "call *disp(%ip)".
 394 */
 395static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a,
 396			    struct module *mod)
 397{
 398	u8 *wr_instr = module_writable_address(mod, instr);
 399	void *target, *bug = &BUG_func;
 400	s32 disp;
 401
 402	if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
 403		pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
 404		BUG();
 405	}
 406
 407	if (a->instrlen != 6 ||
 408	    wr_instr[0] != CALL_RIP_REL_OPCODE ||
 409	    wr_instr[1] != CALL_RIP_REL_MODRM) {
 410		pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
 411		BUG();
 412	}
 413
 414	/* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
 415	disp = *(s32 *)(wr_instr + 2);
 416#ifdef CONFIG_X86_64
 417	/* ff 15 00 00 00 00   call   *0x0(%rip) */
 418	/* target address is stored at "next instruction + disp". */
 419	target = *(void **)(instr + a->instrlen + disp);
 420#else
 421	/* ff 15 00 00 00 00   call   *0x0 */
 422	/* target address is stored at disp. */
 423	target = *(void **)disp;
 424#endif
 425	if (!target)
 426		target = bug;
 427
 428	/* (BUG_func - .) + (target - BUG_func) := target - . */
 429	*(s32 *)(insn_buff + 1) += target - bug;
 430
 431	if (target == &nop_func)
 432		return 0;
 433
 434	return 5;
 435}
 436
 437static inline u8 * instr_va(struct alt_instr *i)
 438{
 439	return (u8 *)&i->instr_offset + i->instr_offset;
 440}
 441
 442/*
 443 * Replace instructions with better alternatives for this CPU type. This runs
  444 * before SMP is initialized to avoid SMP problems with self-modifying code.
  445 * This implies that asymmetric systems where APs have fewer capabilities than
 446 * the boot processor are not handled. Tough. Make sure you disable such
 447 * features by hand.
 448 *
 449 * Marked "noinline" to cause control flow change and thus insn cache
 450 * to refetch changed I$ lines.
 451 */
 452void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 453						  struct alt_instr *end,
 454						  struct module *mod)
 455{
 456	u8 insn_buff[MAX_PATCH_LEN];
 457	u8 *instr, *replacement;
 458	struct alt_instr *a, *b;
 459
 460	DPRINTK(ALT, "alt table %px, -> %px", start, end);
 461
 462	/*
 463	 * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
 464	 * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
 465	 * During the process, KASAN becomes confused seeing partial LA57
  466	 * conversion and triggers a false-positive out-of-bounds report.
 467	 *
 468	 * Disable KASAN until the patching is complete.
 469	 */
 470	kasan_disable_current();
 471
 472	/*
  473	 * The scan order should be from start to end. An alternative scanned
  474	 * later can overwrite code already patched by an earlier entry.
  475	 * Some kernel functions (e.g. memcpy, memset, etc.) use this order to
 476	 * patch code.
 477	 *
 478	 * So be careful if you want to change the scan order to any other
 479	 * order.
 480	 */
 481	for (a = start; a < end; a++) {
 482		int insn_buff_sz = 0;
 483		u8 *wr_instr, *wr_replacement;
 484
 485		/*
 486		 * In case of nested ALTERNATIVE()s the outer alternative might
 487		 * add more padding. To ensure consistent patching find the max
 488		 * padding for all alt_instr entries for this site (nested
 489		 * alternatives result in consecutive entries).
 490		 */
 491		for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
 492			u8 len = max(a->instrlen, b->instrlen);
 493			a->instrlen = b->instrlen = len;
 494		}
 495
 496		instr = instr_va(a);
 497		wr_instr = module_writable_address(mod, instr);
 498
 499		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 500		wr_replacement = module_writable_address(mod, replacement);
 501
 502		BUG_ON(a->instrlen > sizeof(insn_buff));
 503		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
 504
 505		/*
 506		 * Patch if either:
 507		 * - feature is present
  508		 * - feature is not present but ALT_FLAG_NOT is set, meaning:
  509		 *   patch if the feature is *NOT* present.
 510		 */
 511		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
 512			memcpy(insn_buff, wr_instr, a->instrlen);
 513			optimize_nops(instr, insn_buff, a->instrlen);
 514			text_poke_early(wr_instr, insn_buff, a->instrlen);
 515			continue;
 516		}
 517
 518		DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
 519			a->cpuid >> 5,
 520			a->cpuid & 0x1f,
 521			instr, instr, a->instrlen,
 522			replacement, a->replacementlen, a->flags);
 523
 524		memcpy(insn_buff, wr_replacement, a->replacementlen);
 525		insn_buff_sz = a->replacementlen;
 526
 527		if (a->flags & ALT_FLAG_DIRECT_CALL) {
 528			insn_buff_sz = alt_replace_call(instr, insn_buff, a,
 529							mod);
 530			if (insn_buff_sz < 0)
 531				continue;
 532		}
 533
 534		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
 535			insn_buff[insn_buff_sz] = 0x90;
 536
 537		apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
 538
 539		DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px:   old_insn: ", instr);
 540		DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
 541		DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 542
 543		text_poke_early(wr_instr, insn_buff, insn_buff_sz);
 544	}
 545
 546	kasan_enable_current();
 547}
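/*
 * Editorial summary of one patch site as seen by the loop above (this
 * comment is not part of the original file). An ALTERNATIVE() use emits the
 * original instruction(s) padded out to @instrlen, plus an alt_instr entry
 * recording instr_offset, repl_offset, cpuid, instrlen, replacementlen and
 * flags, with the replacement bytes parked in .altinstr_replacement:
 *
 *   condition selected by cpuid/ALT_FLAG_NOT holds:
 *       replacement copied in, 0x90-padded to instrlen, relocated via
 *       apply_relocation(), then text_poke_early()'d over the site.
 *   condition does not hold:
 *       the original bytes stay; only their padding NOPs get collapsed
 *       by optimize_nops().
 */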
 548
 549static inline bool is_jcc32(struct insn *insn)
 550{
 551	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
 552	return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
 553}
 554
 555#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
 556
 557/*
 558 * CALL/JMP *%\reg
 559 */
 560static int emit_indirect(int op, int reg, u8 *bytes)
 561{
 562	int i = 0;
 563	u8 modrm;
 564
 565	switch (op) {
 566	case CALL_INSN_OPCODE:
 567		modrm = 0x10; /* Reg = 2; CALL r/m */
 568		break;
 569
 570	case JMP32_INSN_OPCODE:
 571		modrm = 0x20; /* Reg = 4; JMP r/m */
 572		break;
 573
 574	default:
 575		WARN_ON_ONCE(1);
 576		return -1;
 577	}
 578
 579	if (reg >= 8) {
 580		bytes[i++] = 0x41; /* REX.B prefix */
 581		reg -= 8;
 582	}
 583
 584	modrm |= 0xc0; /* Mod = 3 */
 585	modrm += reg;
 586
 587	bytes[i++] = 0xff; /* opcode */
 588	bytes[i++] = modrm;
 589
 590	return i;
 591}
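/*
 * Byte-level examples of emit_indirect() output (editorial addition):
 *
 *   CALL, reg == 11 (%r11):   41 ff d3      call *%r11
 *   JMP,  reg ==  0 (%rax):   ff e0         jmp  *%rax
 *
 * The 0x41 REX.B prefix is only emitted for r8..r15.
 */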
 592
 593static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
 594{
 595	u8 op = insn->opcode.bytes[0];
 596	int i = 0;
 597
 598	/*
 599	 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
 600	 * tail-calls. Deal with them.
 601	 */
 602	if (is_jcc32(insn)) {
 603		bytes[i++] = op;
 604		op = insn->opcode.bytes[1];
 605		goto clang_jcc;
 606	}
 607
 608	if (insn->length == 6)
 609		bytes[i++] = 0x2e; /* CS-prefix */
 610
 611	switch (op) {
 612	case CALL_INSN_OPCODE:
 613		__text_gen_insn(bytes+i, op, addr+i,
 614				__x86_indirect_call_thunk_array[reg],
 615				CALL_INSN_SIZE);
 616		i += CALL_INSN_SIZE;
 617		break;
 618
 619	case JMP32_INSN_OPCODE:
 620clang_jcc:
 621		__text_gen_insn(bytes+i, op, addr+i,
 622				__x86_indirect_jump_thunk_array[reg],
 623				JMP32_INSN_SIZE);
 624		i += JMP32_INSN_SIZE;
 625		break;
 626
 627	default:
 628		WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
 629		return -1;
 630	}
 631
 632	WARN_ON_ONCE(i != insn->length);
 633
 634	return i;
 635}
 636
 637/*
 638 * Rewrite the compiler generated retpoline thunk calls.
 639 *
 640 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
 641 * indirect instructions, avoiding the extra indirection.
 642 *
 643 * For example, convert:
 644 *
 645 *   CALL __x86_indirect_thunk_\reg
 646 *
 647 * into:
 648 *
 649 *   CALL *%\reg
 650 *
 651 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
 652 */
 653static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
 654{
 655	retpoline_thunk_t *target;
 656	int reg, ret, i = 0;
 657	u8 op, cc;
 658
 659	target = addr + insn->length + insn->immediate.value;
 660	reg = target - __x86_indirect_thunk_array;
 661
 662	if (WARN_ON_ONCE(reg & ~0xf))
 663		return -1;
 664
 665	/* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
 666	BUG_ON(reg == 4);
 667
 668	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
 669	    !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
 670		if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
 671			return emit_call_track_retpoline(addr, insn, reg, bytes);
 672
 673		return -1;
 674	}
 675
 676	op = insn->opcode.bytes[0];
 677
 678	/*
 679	 * Convert:
 680	 *
 681	 *   Jcc.d32 __x86_indirect_thunk_\reg
 682	 *
 683	 * into:
 684	 *
 685	 *   Jncc.d8 1f
 686	 *   [ LFENCE ]
 687	 *   JMP *%\reg
 688	 *   [ NOP ]
 689	 * 1:
 690	 */
 691	if (is_jcc32(insn)) {
 692		cc = insn->opcode.bytes[1] & 0xf;
 693		cc ^= 1; /* invert condition */
 694
 695		bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
 696		bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
 697
 698		/* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
 699		op = JMP32_INSN_OPCODE;
 700	}
 701
 702	/*
 703	 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
 704	 */
 705	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
 706		bytes[i++] = 0x0f;
 707		bytes[i++] = 0xae;
 708		bytes[i++] = 0xe8; /* LFENCE */
 709	}
 710
 711	ret = emit_indirect(op, reg, bytes + i);
 712	if (ret < 0)
 713		return ret;
 714	i += ret;
 715
 716	/*
 717	 * The compiler is supposed to EMIT an INT3 after every unconditional
 718	 * JMP instruction due to AMD BTC. However, if the compiler is too old
 719	 * or MITIGATION_SLS isn't enabled, we still need an INT3 after
 720	 * indirect JMPs even on Intel.
 721	 */
 722	if (op == JMP32_INSN_OPCODE && i < insn->length)
 723		bytes[i++] = INT3_INSN_OPCODE;
 724
 725	for (; i < insn->length;)
 726		bytes[i++] = BYTES_NOP1;
 727
 728	return i;
 729}
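/*
 * Concrete outcomes for a 5-byte "e8 xx xx xx xx" thunk call site
 * (editorial sketch, not part of the original file):
 *
 *   spectre_v2=off, call __x86_indirect_thunk_r11:
 *       41 ff d3 90 90            call *%r11, NOP padding
 *
 *   spectre_v2=retpoline,lfence, call __x86_indirect_thunk_rax:
 *       0f ae e8 ff d0            lfence; call *%rax
 *
 * When the rewritten sequence would not fit the original length (e.g. the
 * LFENCE variant with an extended register), apply_retpolines() below sees
 * len != insn.length and leaves the thunk call untouched.
 */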
 730
 731/*
 732 * Generated by 'objtool --retpoline'.
 733 */
 734void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
 735						struct module *mod)
 736{
 737	s32 *s;
 738
 739	for (s = start; s < end; s++) {
 740		void *addr = (void *)s + *s;
 741		void *wr_addr = module_writable_address(mod, addr);
 742		struct insn insn;
 743		int len, ret;
 744		u8 bytes[16];
 745		u8 op1, op2;
 746
 747		ret = insn_decode_kernel(&insn, wr_addr);
 748		if (WARN_ON_ONCE(ret < 0))
 749			continue;
 750
 751		op1 = insn.opcode.bytes[0];
 752		op2 = insn.opcode.bytes[1];
 753
 754		switch (op1) {
 755		case CALL_INSN_OPCODE:
 756		case JMP32_INSN_OPCODE:
 757			break;
 758
 759		case 0x0f: /* escape */
 760			if (op2 >= 0x80 && op2 <= 0x8f)
 761				break;
 762			fallthrough;
 763		default:
 764			WARN_ON_ONCE(1);
 765			continue;
 766		}
 767
 768		DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
 769			addr, addr, insn.length,
 770			addr + insn.length + insn.immediate.value);
 771
 772		len = patch_retpoline(addr, &insn, bytes);
 773		if (len == insn.length) {
 774			optimize_nops(addr, bytes, len);
 775			DUMP_BYTES(RETPOLINE, ((u8*)wr_addr),  len, "%px: orig: ", addr);
 776			DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
 777			text_poke_early(wr_addr, bytes, len);
 778		}
 779	}
 780}
 781
 782#ifdef CONFIG_MITIGATION_RETHUNK
 783
 784/*
 785 * Rewrite the compiler generated return thunk tail-calls.
 786 *
 787 * For example, convert:
 788 *
 789 *   JMP __x86_return_thunk
 790 *
 791 * into:
 792 *
 793 *   RET
 794 */
 795static int patch_return(void *addr, struct insn *insn, u8 *bytes)
 796{
 797	int i = 0;
 798
 799	/* Patch the custom return thunks... */
 800	if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
 801		i = JMP32_INSN_SIZE;
 802		__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
 803	} else {
 804		/* ... or patch them out if not needed. */
 805		bytes[i++] = RET_INSN_OPCODE;
 806	}
 807
 808	for (; i < insn->length;)
 809		bytes[i++] = INT3_INSN_OPCODE;
 810	return i;
 811}
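/*
 * Editorial example for a 5-byte "jmp __x86_return_thunk" site:
 *
 *   X86_FEATURE_RETHUNK set:     e9 xx xx xx xx      JMP32 to x86_return_thunk
 *   X86_FEATURE_RETHUNK clear:   c3 cc cc cc cc      RET plus INT3 padding
 */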
 812
 813void __init_or_module noinline apply_returns(s32 *start, s32 *end,
 814					     struct module *mod)
 815{
 816	s32 *s;
 817
 818	if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
 819		static_call_force_reinit();
 820
 821	for (s = start; s < end; s++) {
 822		void *dest = NULL, *addr = (void *)s + *s;
 823		void *wr_addr = module_writable_address(mod, addr);
 824		struct insn insn;
 825		int len, ret;
 826		u8 bytes[16];
 827		u8 op;
 828
 829		ret = insn_decode_kernel(&insn, wr_addr);
 830		if (WARN_ON_ONCE(ret < 0))
 831			continue;
 832
 833		op = insn.opcode.bytes[0];
 834		if (op == JMP32_INSN_OPCODE)
 835			dest = addr + insn.length + insn.immediate.value;
 836
 837		if (__static_call_fixup(addr, op, dest) ||
 838		    WARN_ONCE(dest != &__x86_return_thunk,
 839			      "missing return thunk: %pS-%pS: %*ph",
 840			      addr, dest, 5, addr))
 841			continue;
 842
 843		DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
 844			addr, addr, insn.length,
 845			addr + insn.length + insn.immediate.value);
 846
 847		len = patch_return(addr, &insn, bytes);
 848		if (len == insn.length) {
 849			DUMP_BYTES(RET, ((u8*)wr_addr),  len, "%px: orig: ", addr);
 850			DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
 851			text_poke_early(wr_addr, bytes, len);
 852		}
 853	}
 854}
 855#else
 856void __init_or_module noinline apply_returns(s32 *start, s32 *end,
 857					     struct module *mod) { }
 858#endif /* CONFIG_MITIGATION_RETHUNK */
 859
 860#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
 861
 862void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
 863						struct module *mod) { }
 864void __init_or_module noinline apply_returns(s32 *start, s32 *end,
 865					     struct module *mod) { }
 866
 867#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
 868
 869#ifdef CONFIG_X86_KERNEL_IBT
 870
 871static void poison_cfi(void *addr, void *wr_addr);
 872
 873static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
 874{
 875	u32 endbr, poison = gen_endbr_poison();
 876
 877	if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr)))
 878		return;
 879
 880	if (!is_endbr(endbr)) {
 881		WARN_ON_ONCE(warn);
 882		return;
 883	}
 884
 885	DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
 886
 887	/*
 888	 * When we have IBT, the lack of ENDBR will trigger #CP
 889	 */
 890	DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
 891	DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
 892	text_poke_early(wr_addr, &poison, 4);
 893}
 894
 895/*
 896 * Generated by: objtool --ibt
 897 *
 898 * Seal the functions for indirect calls by clobbering the ENDBR instructions
 899 * and the kCFI hash value.
 900 */
 901void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
 902{
 903	s32 *s;
 904
 905	for (s = start; s < end; s++) {
 906		void *addr = (void *)s + *s;
 907		void *wr_addr = module_writable_address(mod, addr);
 908
 909		poison_endbr(addr, wr_addr, true);
 910		if (IS_ENABLED(CONFIG_FINEIBT))
 911			poison_cfi(addr - 16, wr_addr - 16);
 912	}
 913}
 914
 915#else
 916
 917void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { }
 918
 919#endif /* CONFIG_X86_KERNEL_IBT */
 920
 921#ifdef CONFIG_CFI_AUTO_DEFAULT
 922#define __CFI_DEFAULT	CFI_AUTO
 923#elif defined(CONFIG_CFI_CLANG)
 924#define __CFI_DEFAULT	CFI_KCFI
 925#else
 926#define __CFI_DEFAULT	CFI_OFF
 927#endif
 928
 929enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
 930
 931#ifdef CONFIG_CFI_CLANG
 932struct bpf_insn;
 933
 934/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
 935extern unsigned int __bpf_prog_runX(const void *ctx,
 936				    const struct bpf_insn *insn);
 937
 938/*
 939 * Force a reference to the external symbol so the compiler generates
 940 * __kcfi_typid.
 941 */
 942__ADDRESSABLE(__bpf_prog_runX);
 943
 944/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
 945asm (
 946"	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
 947"	.type	cfi_bpf_hash,@object				\n"
 948"	.globl	cfi_bpf_hash					\n"
 949"	.p2align	2, 0x0					\n"
 950"cfi_bpf_hash:							\n"
 951"	.long	__kcfi_typeid___bpf_prog_runX			\n"
 952"	.size	cfi_bpf_hash, 4					\n"
 953"	.popsection						\n"
 954);
 955
 956/* Must match bpf_callback_t */
 957extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
 958
 959__ADDRESSABLE(__bpf_callback_fn);
 960
 961/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
 962asm (
 963"	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
 964"	.type	cfi_bpf_subprog_hash,@object			\n"
 965"	.globl	cfi_bpf_subprog_hash				\n"
 966"	.p2align	2, 0x0					\n"
 967"cfi_bpf_subprog_hash:						\n"
 968"	.long	__kcfi_typeid___bpf_callback_fn			\n"
 969"	.size	cfi_bpf_subprog_hash, 4				\n"
 970"	.popsection						\n"
 971);
 972
 973u32 cfi_get_func_hash(void *func)
 974{
 975	u32 hash;
 976
 977	func -= cfi_get_offset();
 978	switch (cfi_mode) {
 979	case CFI_FINEIBT:
 980		func += 7;
 981		break;
 982	case CFI_KCFI:
 983		func += 1;
 984		break;
 985	default:
 986		return 0;
 987	}
 988
 989	if (get_kernel_nofault(hash, func))
 990		return 0;
 991
 992	return hash;
 993}
 994#endif
 995
 996#ifdef CONFIG_FINEIBT
 997
 998static bool cfi_rand __ro_after_init = true;
 999static u32  cfi_seed __ro_after_init;
1000
1001/*
1002 * Re-hash the CFI hash with a boot-time seed while making sure the result is
1003 * not a valid ENDBR instruction.
1004 */
1005static u32 cfi_rehash(u32 hash)
1006{
1007	hash ^= cfi_seed;
1008	while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
1009		bool lsb = hash & 1;
1010		hash >>= 1;
1011		if (lsb)
1012			hash ^= 0x80200003;
1013	}
1014	return hash;
1015}
1016
1017static __init int cfi_parse_cmdline(char *str)
1018{
1019	if (!str)
1020		return -EINVAL;
1021
1022	while (str) {
1023		char *next = strchr(str, ',');
1024		if (next) {
1025			*next = 0;
1026			next++;
1027		}
1028
1029		if (!strcmp(str, "auto")) {
1030			cfi_mode = CFI_AUTO;
1031		} else if (!strcmp(str, "off")) {
1032			cfi_mode = CFI_OFF;
1033			cfi_rand = false;
1034		} else if (!strcmp(str, "kcfi")) {
1035			cfi_mode = CFI_KCFI;
1036		} else if (!strcmp(str, "fineibt")) {
1037			cfi_mode = CFI_FINEIBT;
1038		} else if (!strcmp(str, "norand")) {
1039			cfi_rand = false;
1040		} else {
1041			pr_err("Ignoring unknown cfi option (%s).", str);
1042		}
1043
1044		str = next;
1045	}
1046
1047	return 0;
1048}
1049early_param("cfi", cfi_parse_cmdline);
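/*
 * Command-line sketch (editorial): options are comma-separated and applied
 * left to right by the parser above, e.g.
 *
 *   cfi=off             no CFI checking (also disables hash randomization)
 *   cfi=kcfi,norand     plain kCFI hashes without the boot-time reseed
 *   cfi=fineibt         force FineIBT even where CFI_AUTO would pick kCFI
 */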
1050
1051/*
1052 * kCFI						FineIBT
1053 *
1054 * __cfi_\func:					__cfi_\func:
1055 *	movl   $0x12345678,%eax		// 5	     endbr64			// 4
1056 *	nop					     subl   $0x12345678,%r10d   // 7
1057 *	nop					     jz     1f			// 2
1058 *	nop					     ud2			// 2
1059 *	nop					1:   nop			// 1
1060 *	nop
1061 *	nop
1062 *	nop
1063 *	nop
1064 *	nop
1065 *	nop
1066 *	nop
1067 *
1068 *
1069 * caller:					caller:
1070 *	movl	$(-0x12345678),%r10d	 // 6	     movl   $0x12345678,%r10d	// 6
1071 *	addl	$-15(%r11),%r10d	 // 4	     sub    $16,%r11		// 4
1072 *	je	1f			 // 2	     nop4			// 4
1073 *	ud2				 // 2
1074 * 1:	call	__x86_indirect_thunk_r11 // 5	     call   *%r11; nop2;	// 5
1075 *
1076 */
1077
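/*
 * Worked example of the kCFI caller check shown above (editorial addition),
 * with hash = 0x12345678 and %r11 pointing at the target function:
 *
 *   %r10d  = -0x12345678                 loaded by the caller
 *   %r10d += *(u32 *)(%r11 - 15)         the movl immediate at __cfi_\func+1
 *          =  0                          if the callee carries the same hash
 *
 * ZF set -> JE skips the UD2 and the indirect call proceeds; any mismatch
 * leaves a non-zero sum and traps. FineIBT moves the equivalent SUBL/JZ
 * check into the 16-byte preamble at func()-16 instead.
 */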
1078asm(	".pushsection .rodata			\n"
1079	"fineibt_preamble_start:		\n"
1080	"	endbr64				\n"
1081	"	subl	$0x12345678, %r10d	\n"
1082	"	je	fineibt_preamble_end	\n"
1083	"	ud2				\n"
1084	"	nop				\n"
1085	"fineibt_preamble_end:			\n"
1086	".popsection\n"
1087);
1088
1089extern u8 fineibt_preamble_start[];
1090extern u8 fineibt_preamble_end[];
1091
1092#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
1093#define fineibt_preamble_hash 7
1094
1095asm(	".pushsection .rodata			\n"
1096	"fineibt_caller_start:			\n"
1097	"	movl	$0x12345678, %r10d	\n"
1098	"	sub	$16, %r11		\n"
1099	ASM_NOP4
1100	"fineibt_caller_end:			\n"
1101	".popsection				\n"
1102);
1103
1104extern u8 fineibt_caller_start[];
1105extern u8 fineibt_caller_end[];
1106
1107#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
1108#define fineibt_caller_hash 2
1109
1110#define fineibt_caller_jmp (fineibt_caller_size - 2)
1111
1112static u32 decode_preamble_hash(void *addr)
1113{
1114	u8 *p = addr;
1115
1116	/* b8 78 56 34 12          mov    $0x12345678,%eax */
1117	if (p[0] == 0xb8)
1118		return *(u32 *)(addr + 1);
1119
1120	return 0; /* invalid hash value */
1121}
1122
1123static u32 decode_caller_hash(void *addr)
1124{
1125	u8 *p = addr;
1126
1127	/* 41 ba 78 56 34 12       mov    $0x12345678,%r10d */
1128	if (p[0] == 0x41 && p[1] == 0xba)
1129		return -*(u32 *)(addr + 2);
1130
 1131	/* eb 0c 78 56 34 12	   jmp.d8  +12 */
1132	if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
1133		return -*(u32 *)(addr + 2);
1134
1135	return 0; /* invalid hash value */
1136}
1137
1138/* .retpoline_sites */
1139static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod)
1140{
1141	/*
 1142	 * Disable kCFI by patching in a JMP.d8; this leaves the hash immediate
 1143	 * intact for later use. Also see decode_caller_hash() and
1144	 * cfi_rewrite_callers().
1145	 */
1146	const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
1147	s32 *s;
1148
1149	for (s = start; s < end; s++) {
1150		void *addr = (void *)s + *s;
1151		void *wr_addr;
1152		u32 hash;
1153
1154		addr -= fineibt_caller_size;
1155		wr_addr = module_writable_address(mod, addr);
1156		hash = decode_caller_hash(wr_addr);
1157
1158		if (!hash) /* nocfi callers */
1159			continue;
1160
1161		text_poke_early(wr_addr, jmp, 2);
1162	}
1163
1164	return 0;
1165}
1166
1167static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod)
1168{
1169	/*
1170	 * Re-enable kCFI, undo what cfi_disable_callers() did.
1171	 */
1172	const u8 mov[] = { 0x41, 0xba };
1173	s32 *s;
1174
1175	for (s = start; s < end; s++) {
1176		void *addr = (void *)s + *s;
1177		void *wr_addr;
1178		u32 hash;
1179
1180		addr -= fineibt_caller_size;
1181		wr_addr = module_writable_address(mod, addr);
1182		hash = decode_caller_hash(wr_addr);
1183		if (!hash) /* nocfi callers */
1184			continue;
1185
1186		text_poke_early(wr_addr, mov, 2);
1187	}
1188
1189	return 0;
1190}
1191
1192/* .cfi_sites */
1193static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod)
1194{
1195	s32 *s;
1196
1197	for (s = start; s < end; s++) {
1198		void *addr = (void *)s + *s;
1199		void *wr_addr = module_writable_address(mod, addr);
1200		u32 hash;
1201
1202		hash = decode_preamble_hash(wr_addr);
1203		if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1204			 addr, addr, 5, addr))
1205			return -EINVAL;
1206
1207		hash = cfi_rehash(hash);
1208		text_poke_early(wr_addr + 1, &hash, 4);
1209	}
1210
1211	return 0;
1212}
1213
1214static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod)
1215{
1216	s32 *s;
1217
1218	for (s = start; s < end; s++) {
1219		void *addr = (void *)s + *s;
1220		void *wr_addr = module_writable_address(mod, addr);
1221		u32 hash;
1222
1223		hash = decode_preamble_hash(wr_addr);
1224		if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
1225			 addr, addr, 5, addr))
1226			return -EINVAL;
1227
1228		text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size);
1229		WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678);
1230		text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4);
1231	}
1232
1233	return 0;
1234}
1235
1236static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod)
1237{
1238	s32 *s;
1239
1240	for (s = start; s < end; s++) {
1241		void *addr = (void *)s + *s;
1242		void *wr_addr = module_writable_address(mod, addr);
1243
1244		poison_endbr(addr + 16, wr_addr + 16, false);
1245	}
1246}
1247
1248/* .retpoline_sites */
1249static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod)
1250{
1251	s32 *s;
1252
1253	for (s = start; s < end; s++) {
1254		void *addr = (void *)s + *s;
1255		void *wr_addr;
1256		u32 hash;
1257
1258		addr -= fineibt_caller_size;
1259		wr_addr = module_writable_address(mod, addr);
1260		hash = decode_caller_hash(wr_addr);
1261		if (hash) {
1262			hash = -cfi_rehash(hash);
1263			text_poke_early(wr_addr + 2, &hash, 4);
1264		}
1265	}
1266
1267	return 0;
1268}
1269
1270static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod)
1271{
1272	s32 *s;
1273
1274	for (s = start; s < end; s++) {
1275		void *addr = (void *)s + *s;
1276		void *wr_addr;
1277		u32 hash;
1278
1279		addr -= fineibt_caller_size;
1280		wr_addr = module_writable_address(mod, addr);
1281		hash = decode_caller_hash(wr_addr);
1282		if (hash) {
1283			text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size);
1284			WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678);
1285			text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4);
1286		}
1287		/* rely on apply_retpolines() */
1288	}
1289
1290	return 0;
1291}
1292
1293static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1294			    s32 *start_cfi, s32 *end_cfi, struct module *mod)
1295{
1296	bool builtin = mod ? false : true;
1297	int ret;
1298
1299	if (WARN_ONCE(fineibt_preamble_size != 16,
1300		      "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
1301		return;
1302
1303	if (cfi_mode == CFI_AUTO) {
1304		cfi_mode = CFI_KCFI;
1305		if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
1306			cfi_mode = CFI_FINEIBT;
1307	}
1308
1309	/*
1310	 * Rewrite the callers to not use the __cfi_ stubs, such that we might
1311	 * rewrite them. This disables all CFI. If this succeeds but any of the
1312	 * later stages fails, we're without CFI.
1313	 */
1314	ret = cfi_disable_callers(start_retpoline, end_retpoline, mod);
1315	if (ret)
1316		goto err;
1317
1318	if (cfi_rand) {
1319		if (builtin) {
1320			cfi_seed = get_random_u32();
1321			cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
1322			cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
1323		}
1324
1325		ret = cfi_rand_preamble(start_cfi, end_cfi, mod);
1326		if (ret)
1327			goto err;
1328
1329		ret = cfi_rand_callers(start_retpoline, end_retpoline, mod);
1330		if (ret)
1331			goto err;
1332	}
1333
1334	switch (cfi_mode) {
1335	case CFI_OFF:
1336		if (builtin)
1337			pr_info("Disabling CFI\n");
1338		return;
1339
1340	case CFI_KCFI:
1341		ret = cfi_enable_callers(start_retpoline, end_retpoline, mod);
1342		if (ret)
1343			goto err;
1344
1345		if (builtin)
1346			pr_info("Using kCFI\n");
1347		return;
1348
1349	case CFI_FINEIBT:
1350		/* place the FineIBT preamble at func()-16 */
1351		ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod);
1352		if (ret)
1353			goto err;
1354
1355		/* rewrite the callers to target func()-16 */
1356		ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod);
1357		if (ret)
1358			goto err;
1359
1360		/* now that nobody targets func()+0, remove ENDBR there */
1361		cfi_rewrite_endbr(start_cfi, end_cfi, mod);
1362
1363		if (builtin)
1364			pr_info("Using FineIBT CFI\n");
1365		return;
1366
1367	default:
1368		break;
1369	}
1370
1371err:
1372	pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1373}
1374
1375static inline void poison_hash(void *addr)
1376{
1377	*(u32 *)addr = 0;
1378}
1379
1380static void poison_cfi(void *addr, void *wr_addr)
1381{
1382	switch (cfi_mode) {
1383	case CFI_FINEIBT:
1384		/*
1385		 * __cfi_\func:
1386		 *	osp nopl (%rax)
1387		 *	subl	$0, %r10d
1388		 *	jz	1f
1389		 *	ud2
1390		 * 1:	nop
1391		 */
1392		poison_endbr(addr, wr_addr, false);
1393		poison_hash(wr_addr + fineibt_preamble_hash);
1394		break;
1395
1396	case CFI_KCFI:
1397		/*
1398		 * __cfi_\func:
1399		 *	movl	$0, %eax
1400		 *	.skip	11, 0x90
1401		 */
1402		poison_hash(wr_addr + 1);
1403		break;
1404
1405	default:
1406		break;
1407	}
1408}
1409
1410#else
1411
1412static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1413			    s32 *start_cfi, s32 *end_cfi, struct module *mod)
1414{
1415}
1416
1417#ifdef CONFIG_X86_KERNEL_IBT
1418static void poison_cfi(void *addr, void *wr_addr) { }
1419#endif
1420
1421#endif
1422
1423void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1424		   s32 *start_cfi, s32 *end_cfi, struct module *mod)
1425{
1426	return __apply_fineibt(start_retpoline, end_retpoline,
1427			       start_cfi, end_cfi, mod);
1428}
1429
1430#ifdef CONFIG_SMP
1431static void alternatives_smp_lock(const s32 *start, const s32 *end,
1432				  u8 *text, u8 *text_end)
1433{
1434	const s32 *poff;
1435
1436	for (poff = start; poff < end; poff++) {
1437		u8 *ptr = (u8 *)poff + *poff;
1438
1439		if (!*poff || ptr < text || ptr >= text_end)
1440			continue;
1441		/* turn DS segment override prefix into lock prefix */
1442		if (*ptr == 0x3e)
1443			text_poke(ptr, ((unsigned char []){0xf0}), 1);
1444	}
1445}
1446
1447static void alternatives_smp_unlock(const s32 *start, const s32 *end,
1448				    u8 *text, u8 *text_end)
1449{
1450	const s32 *poff;
1451
1452	for (poff = start; poff < end; poff++) {
1453		u8 *ptr = (u8 *)poff + *poff;
1454
1455		if (!*poff || ptr < text || ptr >= text_end)
1456			continue;
1457		/* turn lock prefix into DS segment override prefix */
1458		if (*ptr == 0xf0)
1459			text_poke(ptr, ((unsigned char []){0x3E}), 1);
1460	}
1461}
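/*
 * Editorial byte-level example of the prefix flip above, for a LOCK_PREFIX
 * "lock incl (%rax)" recorded in the __smp_locks table:
 *
 *   SMP form:  f0 ff 00      lock incl (%rax)
 *   UP  form:  3e ff 00      ds   incl (%rax)    harmless segment override
 *
 * Only the single prefix byte is rewritten, so the instruction length and
 * everything following it are unaffected.
 */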
1462
1463struct smp_alt_module {
1464	/* what is this ??? */
1465	struct module	*mod;
1466	char		*name;
1467
1468	/* ptrs to lock prefixes */
1469	const s32	*locks;
1470	const s32	*locks_end;
1471
1472	/* .text segment, needed to avoid patching init code ;) */
1473	u8		*text;
1474	u8		*text_end;
1475
1476	struct list_head next;
1477};
1478static LIST_HEAD(smp_alt_modules);
1479static bool uniproc_patched = false;	/* protected by text_mutex */
1480
1481void __init_or_module alternatives_smp_module_add(struct module *mod,
1482						  char *name,
1483						  void *locks, void *locks_end,
1484						  void *text,  void *text_end)
1485{
1486	struct smp_alt_module *smp;
1487
1488	mutex_lock(&text_mutex);
1489	if (!uniproc_patched)
1490		goto unlock;
1491
1492	if (num_possible_cpus() == 1)
1493		/* Don't bother remembering, we'll never have to undo it. */
1494		goto smp_unlock;
1495
1496	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
1497	if (NULL == smp)
1498		/* we'll run the (safe but slow) SMP code then ... */
1499		goto unlock;
1500
1501	smp->mod	= mod;
1502	smp->name	= name;
1503	smp->locks	= locks;
1504	smp->locks_end	= locks_end;
1505	smp->text	= text;
1506	smp->text_end	= text_end;
1507	DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
1508		smp->locks, smp->locks_end,
1509		smp->text, smp->text_end, smp->name);
1510
1511	list_add_tail(&smp->next, &smp_alt_modules);
1512smp_unlock:
1513	alternatives_smp_unlock(locks, locks_end, text, text_end);
1514unlock:
1515	mutex_unlock(&text_mutex);
1516}
1517
1518void __init_or_module alternatives_smp_module_del(struct module *mod)
1519{
1520	struct smp_alt_module *item;
1521
1522	mutex_lock(&text_mutex);
1523	list_for_each_entry(item, &smp_alt_modules, next) {
1524		if (mod != item->mod)
1525			continue;
1526		list_del(&item->next);
1527		kfree(item);
1528		break;
1529	}
1530	mutex_unlock(&text_mutex);
1531}
1532
1533void alternatives_enable_smp(void)
1534{
1535	struct smp_alt_module *mod;
1536
1537	/* Why bother if there are no other CPUs? */
1538	BUG_ON(num_possible_cpus() == 1);
1539
1540	mutex_lock(&text_mutex);
1541
1542	if (uniproc_patched) {
1543		pr_info("switching to SMP code\n");
1544		BUG_ON(num_online_cpus() != 1);
1545		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
1546		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
1547		list_for_each_entry(mod, &smp_alt_modules, next)
1548			alternatives_smp_lock(mod->locks, mod->locks_end,
1549					      mod->text, mod->text_end);
1550		uniproc_patched = false;
1551	}
1552	mutex_unlock(&text_mutex);
1553}
1554
1555/*
1556 * Return 1 if the address range is reserved for SMP-alternatives.
1557 * Must hold text_mutex.
1558 */
1559int alternatives_text_reserved(void *start, void *end)
1560{
1561	struct smp_alt_module *mod;
1562	const s32 *poff;
1563	u8 *text_start = start;
1564	u8 *text_end = end;
1565
1566	lockdep_assert_held(&text_mutex);
1567
1568	list_for_each_entry(mod, &smp_alt_modules, next) {
1569		if (mod->text > text_end || mod->text_end < text_start)
1570			continue;
1571		for (poff = mod->locks; poff < mod->locks_end; poff++) {
1572			const u8 *ptr = (const u8 *)poff + *poff;
1573
1574			if (text_start <= ptr && text_end > ptr)
1575				return 1;
1576		}
1577	}
1578
1579	return 0;
1580}
1581#endif /* CONFIG_SMP */
1582
1583/*
1584 * Self-test for the INT3 based CALL emulation code.
1585 *
1586 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
1587 * properly and that there is a stack gap between the INT3 frame and the
1588 * previous context. Without this gap doing a virtual PUSH on the interrupted
1589 * stack would corrupt the INT3 IRET frame.
1590 *
1591 * See entry_{32,64}.S for more details.
1592 */
1593
1594/*
1595 * We define the int3_magic() function in assembly to control the calling
1596 * convention such that we can 'call' it from assembly.
1597 */
1598
1599extern void int3_magic(unsigned int *ptr); /* defined in asm */
1600
1601asm (
1602"	.pushsection	.init.text, \"ax\", @progbits\n"
1603"	.type		int3_magic, @function\n"
1604"int3_magic:\n"
1605	ANNOTATE_NOENDBR
1606"	movl	$1, (%" _ASM_ARG1 ")\n"
1607	ASM_RET
1608"	.size		int3_magic, .-int3_magic\n"
1609"	.popsection\n"
1610);
1611
1612extern void int3_selftest_ip(void); /* defined in asm below */
1613
1614static int __init
1615int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
1616{
1617	unsigned long selftest = (unsigned long)&int3_selftest_ip;
1618	struct die_args *args = data;
1619	struct pt_regs *regs = args->regs;
1620
1621	OPTIMIZER_HIDE_VAR(selftest);
1622
1623	if (!regs || user_mode(regs))
1624		return NOTIFY_DONE;
1625
1626	if (val != DIE_INT3)
1627		return NOTIFY_DONE;
1628
1629	if (regs->ip - INT3_INSN_SIZE != selftest)
1630		return NOTIFY_DONE;
1631
1632	int3_emulate_call(regs, (unsigned long)&int3_magic);
1633	return NOTIFY_STOP;
1634}
1635
1636/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
1637static noinline void __init int3_selftest(void)
1638{
1639	static __initdata struct notifier_block int3_exception_nb = {
1640		.notifier_call	= int3_exception_notify,
1641		.priority	= INT_MAX-1, /* last */
1642	};
1643	unsigned int val = 0;
1644
1645	BUG_ON(register_die_notifier(&int3_exception_nb));
1646
1647	/*
1648	 * Basically: int3_magic(&val); but really complicated :-)
1649	 *
1650	 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
1651	 * notifier above will emulate CALL for us.
1652	 */
1653	asm volatile ("int3_selftest_ip:\n\t"
1654		      ANNOTATE_NOENDBR
1655		      "    int3; nop; nop; nop; nop\n\t"
1656		      : ASM_CALL_CONSTRAINT
1657		      : __ASM_SEL_RAW(a, D) (&val)
1658		      : "memory");
1659
1660	BUG_ON(val != 1);
1661
1662	unregister_die_notifier(&int3_exception_nb);
1663}
1664
1665static __initdata int __alt_reloc_selftest_addr;
1666
1667extern void __init __alt_reloc_selftest(void *arg);
1668__visible noinline void __init __alt_reloc_selftest(void *arg)
1669{
1670	WARN_ON(arg != &__alt_reloc_selftest_addr);
1671}
1672
1673static noinline void __init alt_reloc_selftest(void)
1674{
1675	/*
1676	 * Tests apply_relocation().
1677	 *
1678	 * This has a relative immediate (CALL) in a place other than the first
1679	 * instruction and additionally on x86_64 we get a RIP-relative LEA:
1680	 *
1681	 *   lea    0x0(%rip),%rdi  # 5d0: R_X86_64_PC32    .init.data+0x5566c
1682	 *   call   +0              # 5d5: R_X86_64_PLT32   __alt_reloc_selftest-0x4
1683	 *
1684	 * Getting this wrong will either crash and burn or tickle the WARN
1685	 * above.
1686	 */
1687	asm_inline volatile (
1688		ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
1689		: ASM_CALL_CONSTRAINT
1690		: [mem] "m" (__alt_reloc_selftest_addr)
1691		: _ASM_ARG1
1692	);
1693}
1694
1695void __init alternative_instructions(void)
1696{
1697	int3_selftest();
1698
1699	/*
1700	 * The patching is not fully atomic, so try to avoid local
 1701	 * interruptions that might execute the code that is being patched.
1702	 * Other CPUs are not running.
1703	 */
1704	stop_nmi();
1705
1706	/*
1707	 * Don't stop machine check exceptions while patching.
1708	 * MCEs only happen when something got corrupted and in this
1709	 * case we must do something about the corruption.
1710	 * Ignoring it is worse than an unlikely patching race.
1711	 * Also machine checks tend to be broadcast and if one CPU
1712	 * goes into machine check the others follow quickly, so we don't
 1713	 * expect a machine check to cause undue problems during code
1714	 * patching.
1715	 */
1716
1717	/*
1718	 * Make sure to set (artificial) features depending on used paravirt
1719	 * functions which can later influence alternative patching.
1720	 */
1721	paravirt_set_cap();
1722
1723	__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
1724			__cfi_sites, __cfi_sites_end, NULL);
1725
1726	/*
1727	 * Rewrite the retpolines, must be done before alternatives since
1728	 * those can rewrite the retpoline thunks.
1729	 */
1730	apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL);
1731	apply_returns(__return_sites, __return_sites_end, NULL);
1732
1733	apply_alternatives(__alt_instructions, __alt_instructions_end, NULL);
1734
1735	/*
1736	 * Now all calls are established. Apply the call thunks if
1737	 * required.
1738	 */
1739	callthunks_patch_builtin_calls();
1740
1741	/*
1742	 * Seal all functions that do not have their address taken.
1743	 */
1744	apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL);
1745
1746#ifdef CONFIG_SMP
 1747	/* Patch to UP if other CPUs are not imminent. */
1748	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
1749		uniproc_patched = true;
1750		alternatives_smp_module_add(NULL, "core kernel",
1751					    __smp_locks, __smp_locks_end,
1752					    _text, _etext);
1753	}
1754
1755	if (!uniproc_patched || num_possible_cpus() == 1) {
1756		free_init_pages("SMP alternatives",
1757				(unsigned long)__smp_locks,
1758				(unsigned long)__smp_locks_end);
1759	}
1760#endif
1761
1762	restart_nmi();
1763	alternatives_patched = 1;
1764
1765	alt_reloc_selftest();
1766}
1767
1768/**
1769 * text_poke_early - Update instructions on a live kernel at boot time
1770 * @addr: address to modify
1771 * @opcode: source of the copy
1772 * @len: length to copy
1773 *
1774 * When you use this code to patch more than one byte of an instruction
1775 * you need to make sure that other CPUs cannot execute this code in parallel.
1776 * Also no thread must be currently preempted in the middle of these
1777 * instructions. And on the local CPU you need to be protected against NMI or
1778 * MCE handlers seeing an inconsistent instruction while you patch.
1779 */
1780void __init_or_module text_poke_early(void *addr, const void *opcode,
1781				      size_t len)
1782{
1783	unsigned long flags;
1784
1785	if (boot_cpu_has(X86_FEATURE_NX) &&
1786	    is_module_text_address((unsigned long)addr)) {
1787		/*
 1788		 * Module text is initially marked non-executable, so the
1789		 * code cannot be running and speculative code-fetches are
1790		 * prevented. Just change the code.
1791		 */
1792		memcpy(addr, opcode, len);
1793	} else {
1794		local_irq_save(flags);
1795		memcpy(addr, opcode, len);
1796		sync_core();
1797		local_irq_restore(flags);
1798
1799		/*
1800		 * Could also do a CLFLUSH here to speed up CPU recovery; but
1801		 * that causes hangs on some VIA CPUs.
1802		 */
1803	}
1804}
1805
1806typedef struct {
1807	struct mm_struct *mm;
1808} temp_mm_state_t;
1809
1810/*
 1811 * Using a temporary mm allows setting temporary mappings that are not accessible
1812 * by other CPUs. Such mappings are needed to perform sensitive memory writes
1813 * that override the kernel memory protections (e.g., W^X), without exposing the
1814 * temporary page-table mappings that are required for these write operations to
 1815 * other CPUs. Using a temporary mm also avoids TLB shootdowns when the
1816 * mapping is torn down.
1817 *
1818 * Context: The temporary mm needs to be used exclusively by a single core. To
 1819 *          harden security, IRQs must be disabled while the temporary mm is
1820 *          loaded, thereby preventing interrupt handler bugs from overriding
1821 *          the kernel memory protection.
1822 */
1823static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1824{
1825	temp_mm_state_t temp_state;
1826
1827	lockdep_assert_irqs_disabled();
1828
1829	/*
1830	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1831	 * with a stale address space WITHOUT being in lazy mode after
1832	 * restoring the previous mm.
1833	 */
1834	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1835		leave_mm();
1836
1837	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1838	switch_mm_irqs_off(NULL, mm, current);
1839
1840	/*
1841	 * If breakpoints are enabled, disable them while the temporary mm is
1842	 * used. Userspace might set up watchpoints on addresses that are used
1843	 * in the temporary mm, which would lead to wrong signals being sent or
1844	 * crashes.
1845	 *
1846	 * Note that breakpoints are not disabled selectively, which also causes
1847	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
1848	 * undesirable, but still seems reasonable as the code that runs in the
1849	 * temporary mm should be short.
1850	 */
1851	if (hw_breakpoint_active())
1852		hw_breakpoint_disable();
1853
1854	return temp_state;
1855}
1856
1857static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
1858{
1859	lockdep_assert_irqs_disabled();
1860	switch_mm_irqs_off(NULL, prev_state.mm, current);
1861
1862	/*
1863	 * Restore the breakpoints if they were disabled before the temporary mm
1864	 * was loaded.
1865	 */
1866	if (hw_breakpoint_active())
1867		hw_breakpoint_restore();
1868}
1869
1870__ro_after_init struct mm_struct *poking_mm;
1871__ro_after_init unsigned long poking_addr;
1872
1873static void text_poke_memcpy(void *dst, const void *src, size_t len)
1874{
1875	memcpy(dst, src, len);
1876}
1877
1878static void text_poke_memset(void *dst, const void *src, size_t len)
1879{
1880	int c = *(const int *)src;
1881
1882	memset(dst, c, len);
1883}
1884
1885typedef void text_poke_f(void *dst, const void *src, size_t len);
1886
1887static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
1888{
1889	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
1890	struct page *pages[2] = {NULL};
1891	temp_mm_state_t prev;
1892	unsigned long flags;
1893	pte_t pte, *ptep;
1894	spinlock_t *ptl;
1895	pgprot_t pgprot;
1896
1897	/*
 1898	 * While the boot memory allocator is running we cannot use struct pages as
1899	 * they are not yet initialized. There is no way to recover.
1900	 */
1901	BUG_ON(!after_bootmem);
1902
1903	if (!core_kernel_text((unsigned long)addr)) {
1904		pages[0] = vmalloc_to_page(addr);
1905		if (cross_page_boundary)
1906			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
1907	} else {
1908		pages[0] = virt_to_page(addr);
1909		WARN_ON(!PageReserved(pages[0]));
1910		if (cross_page_boundary)
1911			pages[1] = virt_to_page(addr + PAGE_SIZE);
1912	}
1913	/*
1914	 * If something went wrong, crash and burn since recovery paths are not
1915	 * implemented.
1916	 */
1917	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
1918
1919	/*
1920	 * Map the page without the global bit, as TLB flushing is done with
1921	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
1922	 */
1923	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
1924
1925	/*
 1926	 * The lock is not really needed, but it lets us avoid open-coding the PTE lookup.
1927	 */
1928	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
1929
1930	/*
1931	 * This must not fail; preallocated in poking_init().
1932	 */
1933	VM_BUG_ON(!ptep);
1934
1935	local_irq_save(flags);
1936
1937	pte = mk_pte(pages[0], pgprot);
1938	set_pte_at(poking_mm, poking_addr, ptep, pte);
1939
1940	if (cross_page_boundary) {
1941		pte = mk_pte(pages[1], pgprot);
1942		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
1943	}
1944
1945	/*
1946	 * Loading the temporary mm behaves as a compiler barrier, which
1947	 * guarantees that the PTE will be set at the time memcpy() is done.
1948	 */
1949	prev = use_temporary_mm(poking_mm);
1950
1951	kasan_disable_current();
1952	func((u8 *)poking_addr + offset_in_page(addr), src, len);
1953	kasan_enable_current();
1954
1955	/*
1956	 * Ensure that the PTE is only cleared after the instructions of memcpy
1957	 * were issued by using a compiler barrier.
1958	 */
1959	barrier();
1960
1961	pte_clear(poking_mm, poking_addr, ptep);
1962	if (cross_page_boundary)
1963		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
1964
1965	/*
1966	 * Loading the previous page-table hierarchy requires a serializing
1967	 * instruction that already allows the core to see the updated version.
1968	 * Xen-PV is assumed to serialize execution in a similar manner.
1969	 */
1970	unuse_temporary_mm(prev);
1971
1972	/*
1973	 * Flushing the TLB might involve IPIs, which would require enabled
1974	 * IRQs; no IPIs are needed here though, since the mm is not in use anymore.
1975	 */
1976	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
1977			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
1978			   PAGE_SHIFT, false);
1979
1980	if (func == text_poke_memcpy) {
1981		/*
1982		 * If the text does not match what we just wrote then something is
1983		 * fundamentally screwy; there's nothing we can really do about that.
1984		 */
1985		BUG_ON(memcmp(addr, src, len));
1986	}
1987
1988	local_irq_restore(flags);
1989	pte_unmap_unlock(ptep, ptl);
1990	return addr;
1991}
1992
1993/**
1994 * text_poke - Update instructions on a live kernel
1995 * @addr: address to modify
1996 * @opcode: source of the copy
1997 * @len: length to copy
1998 *
1999 * Only atomic text poke/set should be allowed when not doing early patching.
2000 * It means the size must be writable atomically and the address must be aligned
2001 * in a way that permits an atomic write. It also makes sure we fit on a single
2002 * page.
2003 *
2004 * Note that the caller must ensure that if the modified code is part of a
2005 * module, the module would not be removed during poking. This can be achieved
2006 * by registering a module notifier, and ordering module removal and patching
2007 * through a mutex.
2008 */
2009void *text_poke(void *addr, const void *opcode, size_t len)
2010{
2011	lockdep_assert_held(&text_mutex);
2012
2013	return __text_poke(text_poke_memcpy, addr, opcode, len);
2014}
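/*
 * Minimal usage sketch (illustrative only; "ip", "insn" and "len" stand in
 * for the caller's data):
 *
 *	mutex_lock(&text_mutex);
 *	text_poke(ip, insn, len);
 *	text_poke_sync();
 *	mutex_unlock(&text_mutex);
 *
 * text_poke() only performs the store; a caller that needs other CPUs to
 * serialize against the new bytes issues text_poke_sync() separately.
 */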
2015
2016/**
2017 * text_poke_kgdb - Update instructions on a live kernel by kgdb
2018 * @addr: address to modify
2019 * @opcode: source of the copy
2020 * @len: length to copy
2021 *
2022 * Only atomic text poke/set should be allowed when not doing early patching.
2023 * It means the size must be writable atomically and the address must be aligned
2024 * in a way that permits an atomic write. It also makes sure we fit on a single
2025 * page.
2026 *
2027 * Context: should only be used by kgdb, which ensures no other core is running,
2028 *	    despite the fact it does not hold the text_mutex.
2029 */
2030void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
2031{
2032	return __text_poke(text_poke_memcpy, addr, opcode, len);
2033}
2034
2035void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
2036			    bool core_ok)
2037{
2038	unsigned long start = (unsigned long)addr;
2039	size_t patched = 0;
2040
2041	if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
2042		return NULL;
2043
2044	while (patched < len) {
2045		unsigned long ptr = start + patched;
2046		size_t s;
2047
2048		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2049
2050		__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
2051		patched += s;
2052	}
2053	return addr;
2054}
2055
2056/**
2057 * text_poke_copy - Copy instructions into (an unused part of) RX memory
2058 * @addr: address to modify
2059 * @opcode: source of the copy
2060 * @len: length to copy, could be more than 2x PAGE_SIZE
2061 *
2062 * Not safe against concurrent execution; useful for JITs to dump
2063 * new code blocks into unused regions of RX memory. Can be used in
2064 * conjunction with synchronize_rcu_tasks() to wait for existing
2065 * execution to quiesce after having made sure no existing function
2066 * pointers are live.
2067 */
2068void *text_poke_copy(void *addr, const void *opcode, size_t len)
2069{
2070	mutex_lock(&text_mutex);
2071	addr = text_poke_copy_locked(addr, opcode, len, false);
2072	mutex_unlock(&text_mutex);
2073	return addr;
2074}
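/*
 * Illustrative JIT-style use of text_poke_copy(); "image", "generated_code"
 * and "size" are placeholder names.  The destination is expected to be an
 * unused, not-yet-executed region of RX memory:
 *
 *	if (!text_poke_copy(image, generated_code, size))
 *		return -EINVAL;
 *
 * A NULL return means the destination was core kernel text, which this
 * helper refuses to touch.
 */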
2075
2076/**
2077 * text_poke_set - memset into (an unused part of) RX memory
2078 * @addr: address to modify
2079 * @c: the byte to fill the area with
2080 * @len: length to copy, could be more than 2x PAGE_SIZE
2081 *
2082 * This is useful to overwrite unused regions of RX memory with illegal
2083 * instructions.
2084 */
2085void *text_poke_set(void *addr, int c, size_t len)
2086{
2087	unsigned long start = (unsigned long)addr;
2088	size_t patched = 0;
2089
2090	if (WARN_ON_ONCE(core_kernel_text(start)))
2091		return NULL;
2092
2093	mutex_lock(&text_mutex);
2094	while (patched < len) {
2095		unsigned long ptr = start + patched;
2096		size_t s;
2097
2098		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
2099
2100		__text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
2101		patched += s;
2102	}
2103	mutex_unlock(&text_mutex);
2104	return addr;
2105}
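/*
 * Illustrative use of text_poke_set(); "area" and "size" are placeholder
 * names.  One use is to fill a released chunk of executable memory with
 * INT3 so that stray jumps into it trap instead of running stale bytes:
 *
 *	text_poke_set(area, INT3_INSN_OPCODE, size);
 */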
2106
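/*
 * do_sync_core()/text_poke_sync(): run a serializing operation on every
 * online CPU so that stale prefetched or speculatively decoded copies of
 * the just-patched bytes are discarded before execution continues past the
 * patch site.
 */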
2107static void do_sync_core(void *info)
2108{
2109	sync_core();
2110}
2111
2112void text_poke_sync(void)
2113{
2114	on_each_cpu(do_sync_core, NULL, 1);
2115}
2116
2117/*
2118 * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
2119 * this thing. When len == 6 everything is prefixed with 0x0f and we map
2120 * opcode to Jcc.d8, using len to distinguish.
2121 */
2122struct text_poke_loc {
2123	/* addr := _stext + rel_addr */
2124	s32 rel_addr;
2125	s32 disp;
2126	u8 len;
2127	u8 opcode;
2128	const u8 text[POKE_MAX_OPCODE_SIZE];
2129	/* see text_poke_bp_batch() */
2130	u8 old;
2131};
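/*
 * Illustrative example of the len == 6 case above: "jne" with a 32-bit
 * displacement is encoded as "0f 85 <imm32>".  text_poke_loc_init() stores
 * only "85 <imm32>" in @text and records the short-form opcode 0x75 in
 * @opcode; the leading 0x0f is recreated from len == 6 both when the tail
 * bytes are written and when the first byte is restored.
 */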
2132
2133struct bp_patching_desc {
2134	struct text_poke_loc *vec;
2135	int nr_entries;
2136	atomic_t refs;
2137};
2138
2139static struct bp_patching_desc bp_desc;
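/*
 * bp_desc is published to the INT3 handler by raising its refcount from 0
 * to 1 in text_poke_bp_batch() and is torn down by dropping that final
 * reference; poke_int3_handler() only dereferences it after a successful
 * inc_not_zero in try_get_desc().
 */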
2140
2141static __always_inline
2142struct bp_patching_desc *try_get_desc(void)
2143{
2144	struct bp_patching_desc *desc = &bp_desc;
2145
2146	if (!raw_atomic_inc_not_zero(&desc->refs))
2147		return NULL;
2148
2149	return desc;
2150}
2151
2152static __always_inline void put_desc(void)
2153{
2154	struct bp_patching_desc *desc = &bp_desc;
2155
2156	smp_mb__before_atomic();
2157	raw_atomic_dec(&desc->refs);
2158}
2159
2160static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
2161{
2162	return _stext + tp->rel_addr;
2163}
2164
2165static __always_inline int patch_cmp(const void *key, const void *elt)
2166{
2167	struct text_poke_loc *tp = (struct text_poke_loc *) elt;
2168
2169	if (key < text_poke_addr(tp))
2170		return -1;
2171	if (key > text_poke_addr(tp))
2172		return 1;
2173	return 0;
2174}
2175
2176noinstr int poke_int3_handler(struct pt_regs *regs)
2177{
2178	struct bp_patching_desc *desc;
2179	struct text_poke_loc *tp;
2180	int ret = 0;
2181	void *ip;
2182
2183	if (user_mode(regs))
2184		return 0;
2185
2186	/*
2187	 * Having observed our INT3 instruction, we now must observe
2188	 * bp_desc with non-zero refcount:
2189	 *
2190	 *	bp_desc.refs = 1		INT3
2191	 *	WMB				RMB
2192	 *	write INT3			if (bp_desc.refs != 0)
2193	 */
2194	smp_rmb();
2195
2196	desc = try_get_desc();
2197	if (!desc)
2198		return 0;
2199
2200	/*
2201	 * Discount the INT3. See text_poke_bp_batch().
2202	 */
2203	ip = (void *) regs->ip - INT3_INSN_SIZE;
2204
2205	/*
2206	 * Skip the binary search if there is a single member in the vector.
2207	 */
2208	if (unlikely(desc->nr_entries > 1)) {
2209		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
2210				      sizeof(struct text_poke_loc),
2211				      patch_cmp);
2212		if (!tp)
2213			goto out_put;
2214	} else {
2215		tp = desc->vec;
2216		if (text_poke_addr(tp) != ip)
2217			goto out_put;
2218	}
2219
2220	ip += tp->len;
2221
2222	switch (tp->opcode) {
2223	case INT3_INSN_OPCODE:
2224		/*
2225		 * Someone poked an explicit INT3, they'll want to handle it,
2226		 * do not consume.
2227		 */
2228		goto out_put;
2229
2230	case RET_INSN_OPCODE:
2231		int3_emulate_ret(regs);
2232		break;
2233
2234	case CALL_INSN_OPCODE:
2235		int3_emulate_call(regs, (long)ip + tp->disp);
2236		break;
2237
2238	case JMP32_INSN_OPCODE:
2239	case JMP8_INSN_OPCODE:
2240		int3_emulate_jmp(regs, (long)ip + tp->disp);
2241		break;
2242
2243	case 0x70 ... 0x7f: /* Jcc */
2244		int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
2245		break;
2246
2247	default:
2248		BUG();
2249	}
2250
2251	ret = 1;
2252
2253out_put:
2254	put_desc();
2255	return ret;
2256}
2257
2258#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
2259static struct text_poke_loc tp_vec[TP_VEC_MAX];
2260static int tp_vec_nr;
2261
2262/**
2263 * text_poke_bp_batch() -- update instructions on live kernel on SMP
2264 * @tp:			vector of instructions to patch
2265 * @nr_entries:		number of entries in the vector
2266 *
2267 * Modify multi-byte instruction by using int3 breakpoint on SMP.
2268 * We completely avoid stop_machine() here, and achieve the
2269 * synchronization using int3 breakpoint.
2270 *
2271 * The way it is done:
2272 *	- For each entry in the vector:
2273 *		- add an int3 trap to the address that will be patched
2274 *	- sync cores
2275 *	- For each entry in the vector:
2276 *		- update all but the first byte of the patched range
2277 *	- sync cores
2278 *	- For each entry in the vector:
2279 *		- replace the first byte (int3) with the first byte of the
2280 *		  replacement opcode
2281 *	- sync cores
2282 */
2283static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
2284{
2285	unsigned char int3 = INT3_INSN_OPCODE;
2286	unsigned int i;
2287	int do_sync;
2288
2289	lockdep_assert_held(&text_mutex);
2290
2291	bp_desc.vec = tp;
2292	bp_desc.nr_entries = nr_entries;
2293
2294	/*
2295	 * Corresponds to the implicit memory barrier in try_get_desc() to
2296	 * ensure reading a non-zero refcount provides up to date bp_desc data.
2297	 */
2298	atomic_set_release(&bp_desc.refs, 1);
2299
2300	/*
2301	 * Function tracing can enable thousands of places that need to be
2302	 * updated. This can take quite some time, and with full kernel debugging
2303	 * enabled, this could cause the softlockup watchdog to trigger.
2304	 * This function gets called for every 256 entries queued to be patched.
2305	 * Call cond_resched() here to make sure that other tasks can get scheduled
2306	 * while processing all the functions being patched.
2307	 */
2308	cond_resched();
2309
2310	/*
2311	 * Pairs with the read barrier in the int3 notifier to make sure that
2312	 * nr_entries and the handler are correctly ordered wrt. patching.
2313	 */
2314	smp_wmb();
2315
2316	/*
2317	 * First step: add an int3 trap to the address that will be patched.
2318	 */
2319	for (i = 0; i < nr_entries; i++) {
2320		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
2321		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
2322	}
2323
2324	text_poke_sync();
2325
2326	/*
2327	 * Second step: update all but the first byte of the patched range.
2328	 */
2329	for (do_sync = 0, i = 0; i < nr_entries; i++) {
2330		u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
2331		u8 _new[POKE_MAX_OPCODE_SIZE+1];
2332		const u8 *new = tp[i].text;
2333		int len = tp[i].len;
2334
2335		if (len - INT3_INSN_SIZE > 0) {
2336			memcpy(old + INT3_INSN_SIZE,
2337			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2338			       len - INT3_INSN_SIZE);
2339
2340			if (len == 6) {
2341				_new[0] = 0x0f;
2342				memcpy(_new + 1, new, 5);
2343				new = _new;
2344			}
2345
2346			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
2347				  new + INT3_INSN_SIZE,
2348				  len - INT3_INSN_SIZE);
2349
2350			do_sync++;
2351		}
2352
2353		/*
2354		 * Emit a perf event to record the text poke, primarily to
2355		 * support Intel PT decoding which must walk the executable code
2356		 * to reconstruct the trace. The flow up to here is:
2357		 *   - write INT3 byte
2358		 *   - IPI-SYNC
2359		 *   - write instruction tail
2360		 * At this point the actual control flow will be through the
2361		 * INT3 and handler and not hit the old or new instruction.
2362		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
2363		 * can still be decoded. Subsequently:
2364		 *   - emit RECORD_TEXT_POKE with the new instruction
2365		 *   - IPI-SYNC
2366		 *   - write first byte
2367		 *   - IPI-SYNC
2368		 * So before the text poke event timestamp, the decoder will see
2369		 * either the old instruction flow or FUP/TIP of INT3. After the
2370		 * text poke event timestamp, the decoder will see either the
2371		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
2372		 * use the timestamp as the point at which to modify the
2373		 * executable code.
2374		 * The old instruction is recorded so that the event can be
2375		 * processed forwards or backwards.
2376		 */
2377		perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
2378	}
2379
2380	if (do_sync) {
2381		/*
2382		 * According to Intel, this core syncing is very likely
2383		 * not necessary and we'd be safe even without it. But
2384		 * better safe than sorry (plus there's not only Intel).
2385		 */
2386		text_poke_sync();
2387	}
2388
2389	/*
2390	 * Third step: replace the first byte (int3) with the first byte of the
2391	 * replacement opcode.
2392	 */
2393	for (do_sync = 0, i = 0; i < nr_entries; i++) {
2394		u8 byte = tp[i].text[0];
2395
2396		if (tp[i].len == 6)
2397			byte = 0x0f;
2398
2399		if (byte == INT3_INSN_OPCODE)
2400			continue;
2401
2402		text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
2403		do_sync++;
2404	}
2405
2406	if (do_sync)
2407		text_poke_sync();
2408
2409	/*
2410	 * Remove and wait for refs to be zero.
2411	 */
2412	if (!atomic_dec_and_test(&bp_desc.refs))
2413		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
2414}
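/*
 * Worked example of the three steps above for a single 5-byte site being
 * switched from "call old" to "call new"; the byte values are made up:
 *
 *	initial:	e8 11 22 33 44
 *	step 1:		cc 11 22 33 44		(write INT3, sync)
 *	step 2:		cc aa bb cc dd		(write tail bytes, sync)
 *	step 3:		e8 aa bb cc dd		(restore first byte, sync)
 *
 * A CPU that executes the site between step 1 and step 3 traps into
 * poke_int3_handler(), which emulates the *new* call using tp->disp.
 */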
2415
2416static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
2417			       const void *opcode, size_t len, const void *emulate)
2418{
2419	struct insn insn;
2420	int ret, i = 0;
2421
2422	if (len == 6)
2423		i = 1;
2424	memcpy((void *)tp->text, opcode+i, len-i);
2425	if (!emulate)
2426		emulate = opcode;
2427
2428	ret = insn_decode_kernel(&insn, emulate);
2429	BUG_ON(ret < 0);
2430
2431	tp->rel_addr = addr - (void *)_stext;
2432	tp->len = len;
2433	tp->opcode = insn.opcode.bytes[0];
2434
2435	if (is_jcc32(&insn)) {
2436		/*
2437		 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
2438		 */
2439		tp->opcode = insn.opcode.bytes[1] - 0x10;
2440	}
2441
2442	switch (tp->opcode) {
2443	case RET_INSN_OPCODE:
2444	case JMP32_INSN_OPCODE:
2445	case JMP8_INSN_OPCODE:
2446		/*
2447		 * Control flow instructions without implied execution of the
2448		 * next instruction can be padded with INT3.
2449		 */
2450		for (i = insn.length; i < len; i++)
2451			BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
2452		break;
2453
2454	default:
2455		BUG_ON(len != insn.length);
2456	}
2457
2458	switch (tp->opcode) {
2459	case INT3_INSN_OPCODE:
2460	case RET_INSN_OPCODE:
2461		break;
2462
2463	case CALL_INSN_OPCODE:
2464	case JMP32_INSN_OPCODE:
2465	case JMP8_INSN_OPCODE:
2466	case 0x70 ... 0x7f: /* Jcc */
2467		tp->disp = insn.immediate.value;
2468		break;
2469
2470	default: /* assume NOP */
2471		switch (len) {
2472		case 2: /* NOP2 -- emulate as JMP8+0 */
2473			BUG_ON(memcmp(emulate, x86_nops[len], len));
2474			tp->opcode = JMP8_INSN_OPCODE;
2475			tp->disp = 0;
2476			break;
2477
2478		case 5: /* NOP5 -- emulate as JMP32+0 */
2479			BUG_ON(memcmp(emulate, x86_nops[len], len));
2480			tp->opcode = JMP32_INSN_OPCODE;
2481			tp->disp = 0;
2482			break;
2483
2484		default: /* unknown instruction */
2485			BUG();
2486		}
2487		break;
2488	}
2489}
2490
2491/*
2492 * We rely hard on the tp_vec being ordered; ensure this is so by flushing
2493 * early if needed.
2494 */
2495static bool tp_order_fail(void *addr)
2496{
2497	struct text_poke_loc *tp;
2498
2499	if (!tp_vec_nr)
2500		return false;
2501
2502	if (!addr) /* force */
2503		return true;
2504
2505	tp = &tp_vec[tp_vec_nr - 1];
2506	if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
2507		return true;
2508
2509	return false;
2510}
2511
2512static void text_poke_flush(void *addr)
2513{
2514	if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
2515		text_poke_bp_batch(tp_vec, tp_vec_nr);
2516		tp_vec_nr = 0;
2517	}
2518}
2519
2520void text_poke_finish(void)
2521{
2522	text_poke_flush(NULL);
2523}
2524
2525void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
2526{
2527	struct text_poke_loc *tp;
2528
2529	text_poke_flush(addr);
2530
2531	tp = &tp_vec[tp_vec_nr++];
2532	text_poke_loc_init(tp, addr, opcode, len, emulate);
2533}
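/*
 * Minimal batched-usage sketch (illustrative; "site[]" and "nr" are
 * placeholder names).  Sites should be queued in ascending address order;
 * an out-of-order address makes text_poke_queue() flush the pending vector
 * first:
 *
 *	mutex_lock(&text_mutex);
 *	for (i = 0; i < nr; i++)
 *		text_poke_queue(site[i].addr, site[i].insn, site[i].len, NULL);
 *	text_poke_finish();
 *	mutex_unlock(&text_mutex);
 */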
2534
2535/**
2536 * text_poke_bp() -- update instructions on live kernel on SMP
2537 * @addr:	address to patch
2538 * @opcode:	opcode of new instruction
2539 * @len:	length to copy
2540 * @emulate:	instruction to be emulated
2541 *
2542 * Update a single instruction with the vector on the stack, avoiding
2543 * dynamically allocated memory. This function should be used when it is
2544 * not possible to allocate memory.
2545 */
2546void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
2547{
2548	struct text_poke_loc tp;
2549
2550	text_poke_loc_init(&tp, addr, opcode, len, emulate);
2551	text_poke_bp_batch(&tp, 1);
2552}
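/*
 * Minimal single-site sketch (illustrative; "site" is a placeholder): NOP
 * out a 5-byte call while other CPUs may be executing it:
 *
 *	mutex_lock(&text_mutex);
 *	text_poke_bp(site, x86_nops[5], 5, NULL);
 *	mutex_unlock(&text_mutex);
 *
 * With a NOP5 and no explicit @emulate, text_poke_loc_init() emulates the
 * transient INT3 window as a JMP32 with zero displacement.
 */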