v3.5.6
  1#include <linux/module.h>
  2#include <linux/sched.h>
 
  3#include <linux/mutex.h>
  4#include <linux/list.h>
  5#include <linux/stringify.h>
  6#include <linux/kprobes.h>
  7#include <linux/mm.h>
  8#include <linux/vmalloc.h>
  9#include <linux/memory.h>
 10#include <linux/stop_machine.h>
 11#include <linux/slab.h>
 12#include <asm/alternative.h>
 13#include <asm/sections.h>
 14#include <asm/pgtable.h>
 15#include <asm/mce.h>
 16#include <asm/nmi.h>
 17#include <asm/cacheflush.h>
 18#include <asm/tlbflush.h>
 
 19#include <asm/io.h>
 20#include <asm/fixmap.h>
 21
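/* Upper bound, in bytes, on a single patched instruction sequence (size of the on-stack patch buffer). */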
 22#define MAX_PATCH_LEN (255-1)
 23
 24#ifdef CONFIG_HOTPLUG_CPU
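/* When set, SMP lock-prefix alternatives are handled once at boot and are never switched between SMP and UP variants afterwards. */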
 25static int smp_alt_once;
 26
 27static int __init bootonly(char *str)
 28{
 29	smp_alt_once = 1;
 30	return 1;
 31}
 32__setup("smp-alt-boot", bootonly);
 33#else
 34#define smp_alt_once 1
 35#endif
 36
 37static int __initdata_or_module debug_alternative;
 38
 39static int __init debug_alt(char *str)
 40{
 41	debug_alternative = 1;
 42	return 1;
 43}
 44__setup("debug-alternative", debug_alt);
 45
 46static int noreplace_smp;
 47
 48static int __init setup_noreplace_smp(char *str)
 49{
 50	noreplace_smp = 1;
 51	return 1;
 52}
 53__setup("noreplace-smp", setup_noreplace_smp);
 54
 55#ifdef CONFIG_PARAVIRT
 56static int __initdata_or_module noreplace_paravirt = 0;
 57
 58static int __init setup_noreplace_paravirt(char *str)
 59{
 60	noreplace_paravirt = 1;
 61	return 1;
 62}
 63__setup("noreplace-paravirt", setup_noreplace_paravirt);
 64#endif
 65
 66#define DPRINTK(fmt, args...) if (debug_alternative) \
 67	printk(KERN_DEBUG fmt, args)
 68
 69/*
 70 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
 71 * that correspond to that nop. Getting from one nop to the next, we
 72 * add to the array the offset that is equal to the sum of all sizes of
 73 * nops preceding the one we are after.
 74 *
 75 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
 76 * nice symmetry of sizes of the previous nops.
 77 */
 78#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
 79static const unsigned char intelnops[] =
 80{
 81	GENERIC_NOP1,
 82	GENERIC_NOP2,
 83	GENERIC_NOP3,
 84	GENERIC_NOP4,
 85	GENERIC_NOP5,
 86	GENERIC_NOP6,
 87	GENERIC_NOP7,
 88	GENERIC_NOP8,
 89	GENERIC_NOP5_ATOMIC
 90};
 91static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
 92{
 93	NULL,
 94	intelnops,
 95	intelnops + 1,
 96	intelnops + 1 + 2,
 97	intelnops + 1 + 2 + 3,
 98	intelnops + 1 + 2 + 3 + 4,
 99	intelnops + 1 + 2 + 3 + 4 + 5,
100	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
101	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
102	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
103};
104#endif
105
106#ifdef K8_NOP1
107static const unsigned char k8nops[] =
108{
109	K8_NOP1,
110	K8_NOP2,
111	K8_NOP3,
112	K8_NOP4,
113	K8_NOP5,
114	K8_NOP6,
115	K8_NOP7,
116	K8_NOP8,
117	K8_NOP5_ATOMIC
118};
119static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
120{
121	NULL,
122	k8nops,
123	k8nops + 1,
124	k8nops + 1 + 2,
125	k8nops + 1 + 2 + 3,
126	k8nops + 1 + 2 + 3 + 4,
127	k8nops + 1 + 2 + 3 + 4 + 5,
128	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
129	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
130	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
131};
132#endif
133
134#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
135static const unsigned char k7nops[] =
136{
137	K7_NOP1,
138	K7_NOP2,
139	K7_NOP3,
140	K7_NOP4,
141	K7_NOP5,
142	K7_NOP6,
143	K7_NOP7,
144	K7_NOP8,
145	K7_NOP5_ATOMIC
146};
147static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
148{
149	NULL,
150	k7nops,
151	k7nops + 1,
152	k7nops + 1 + 2,
153	k7nops + 1 + 2 + 3,
154	k7nops + 1 + 2 + 3 + 4,
155	k7nops + 1 + 2 + 3 + 4 + 5,
156	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
157	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
158	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
159};
160#endif
161
162#ifdef P6_NOP1
163static const unsigned char p6nops[] =
164{
165	P6_NOP1,
166	P6_NOP2,
167	P6_NOP3,
168	P6_NOP4,
169	P6_NOP5,
170	P6_NOP6,
171	P6_NOP7,
172	P6_NOP8,
173	P6_NOP5_ATOMIC
174};
175static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
176{
177	NULL,
178	p6nops,
179	p6nops + 1,
180	p6nops + 1 + 2,
181	p6nops + 1 + 2 + 3,
182	p6nops + 1 + 2 + 3 + 4,
183	p6nops + 1 + 2 + 3 + 4 + 5,
184	p6nops + 1 + 2 + 3 + 4 + 5 + 6,
185	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
186	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
187};
188#endif
189
190/* Initialize these to a safe default */
191#ifdef CONFIG_X86_64
192const unsigned char * const *ideal_nops = p6_nops;
193#else
194const unsigned char * const *ideal_nops = intel_nops;
195#endif
196
197void __init arch_init_ideal_nops(void)
198{
199	switch (boot_cpu_data.x86_vendor) {
200	case X86_VENDOR_INTEL:
201		/*
202		 * Due to a decoder implementation quirk, some
203		 * specific Intel CPUs actually perform better with
204		 * the "k8_nops" than with the SDM-recommended NOPs.
205		 */
206		if (boot_cpu_data.x86 == 6 &&
207		    boot_cpu_data.x86_model >= 0x0f &&
208		    boot_cpu_data.x86_model != 0x1c &&
209		    boot_cpu_data.x86_model != 0x26 &&
210		    boot_cpu_data.x86_model != 0x27 &&
211		    boot_cpu_data.x86_model < 0x30) {
212			ideal_nops = k8_nops;
213		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
214			   ideal_nops = p6_nops;
215		} else {
216#ifdef CONFIG_X86_64
217			ideal_nops = k8_nops;
218#else
219			ideal_nops = intel_nops;
220#endif
221		}
222		break;
223	default:
224#ifdef CONFIG_X86_64
225		ideal_nops = k8_nops;
226#else
227		if (boot_cpu_has(X86_FEATURE_K8))
228			ideal_nops = k8_nops;
229		else if (boot_cpu_has(X86_FEATURE_K7))
230			ideal_nops = k7_nops;
231		else
232			ideal_nops = intel_nops;
233#endif
234	}
235}
236
237/* Use this to add nops to a buffer, then text_poke the whole buffer. */
238static void __init_or_module add_nops(void *insns, unsigned int len)
239{
240	while (len > 0) {
241		unsigned int noplen = len;
242		if (noplen > ASM_NOP_MAX)
243			noplen = ASM_NOP_MAX;
244		memcpy(insns, ideal_nops[noplen], noplen);
245		insns += noplen;
246		len -= noplen;
247	}
248}
249
250extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
251extern s32 __smp_locks[], __smp_locks_end[];
252void *text_poke_early(void *addr, const void *opcode, size_t len);
253
254/* Replace instructions with better alternatives for this CPU type.
255   This runs before SMP is initialized to avoid SMP problems with
256   self-modifying code. This implies that asymmetric systems where
257   APs have fewer capabilities than the boot processor are not handled.
258   Tough. Make sure you disable such features by hand. */
259
260void __init_or_module apply_alternatives(struct alt_instr *start,
261					 struct alt_instr *end)
262{
263	struct alt_instr *a;
264	u8 *instr, *replacement;
265	u8 insnbuf[MAX_PATCH_LEN];
266
267	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
268	/*
269	 * The scan order should be from start to end. A later scanned
270	 * alternative code can overwrite previously scanned alternative code.
271	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
272	 * patch code.
273	 *
274	 * So be careful if you want to change the scan order to any other
275	 * order.
276	 */
277	for (a = start; a < end; a++) {
278		instr = (u8 *)&a->instr_offset + a->instr_offset;
279		replacement = (u8 *)&a->repl_offset + a->repl_offset;
280		BUG_ON(a->replacementlen > a->instrlen);
281		BUG_ON(a->instrlen > sizeof(insnbuf));
282		BUG_ON(a->cpuid >= NCAPINTS*32);
283		if (!boot_cpu_has(a->cpuid))
284			continue;
285
286		memcpy(insnbuf, replacement, a->replacementlen);
 
287
288		/* 0xe8 is a relative jump; fix the offset. */
289		if (*insnbuf == 0xe8 && a->replacementlen == 5)
290		    *(s32 *)(insnbuf + 1) += replacement - instr;
291
292		add_nops(insnbuf + a->replacementlen,
293			 a->instrlen - a->replacementlen);
294
295		text_poke_early(instr, insnbuf, a->instrlen);
296	}
297}
298
299#ifdef CONFIG_SMP
300
301static void alternatives_smp_lock(const s32 *start, const s32 *end,
302				  u8 *text, u8 *text_end)
303{
304	const s32 *poff;
305
306	mutex_lock(&text_mutex);
307	for (poff = start; poff < end; poff++) {
308		u8 *ptr = (u8 *)poff + *poff;
309
310		if (!*poff || ptr < text || ptr >= text_end)
311			continue;
312		/* turn DS segment override prefix into lock prefix */
313		if (*ptr == 0x3e)
314			text_poke(ptr, ((unsigned char []){0xf0}), 1);
315	};
316	mutex_unlock(&text_mutex);
317}
318
319static void alternatives_smp_unlock(const s32 *start, const s32 *end,
320				    u8 *text, u8 *text_end)
321{
322	const s32 *poff;
323
324	if (noreplace_smp)
325		return;
326
327	mutex_lock(&text_mutex);
328	for (poff = start; poff < end; poff++) {
329		u8 *ptr = (u8 *)poff + *poff;
330
331		if (!*poff || ptr < text || ptr >= text_end)
332			continue;
333		/* turn lock prefix into DS segment override prefix */
334		if (*ptr == 0xf0)
335			text_poke(ptr, ((unsigned char []){0x3E}), 1);
336	};
337	mutex_unlock(&text_mutex);
338}
339
340struct smp_alt_module {
341	/* what is this ??? */
342	struct module	*mod;
343	char		*name;
344
345	/* ptrs to lock prefixes */
346	const s32	*locks;
347	const s32	*locks_end;
348
349	/* .text segment, needed to avoid patching init code ;) */
350	u8		*text;
351	u8		*text_end;
352
353	struct list_head next;
354};
355static LIST_HEAD(smp_alt_modules);
356static DEFINE_MUTEX(smp_alt);
357static int smp_mode = 1;	/* protected by smp_alt */
358
359void __init_or_module alternatives_smp_module_add(struct module *mod,
360						  char *name,
361						  void *locks, void *locks_end,
362						  void *text,  void *text_end)
363{
364	struct smp_alt_module *smp;
365
366	if (noreplace_smp)
367		return;
 
368
369	if (smp_alt_once) {
370		if (boot_cpu_has(X86_FEATURE_UP))
371			alternatives_smp_unlock(locks, locks_end,
372						text, text_end);
373		return;
374	}
375
376	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
377	if (NULL == smp)
378		return; /* we'll run the (safe but slow) SMP code then ... */
 
379
380	smp->mod	= mod;
381	smp->name	= name;
382	smp->locks	= locks;
383	smp->locks_end	= locks_end;
384	smp->text	= text;
385	smp->text_end	= text_end;
386	DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
387		__func__, smp->locks, smp->locks_end,
388		smp->text, smp->text_end, smp->name);
389
390	mutex_lock(&smp_alt);
391	list_add_tail(&smp->next, &smp_alt_modules);
392	if (boot_cpu_has(X86_FEATURE_UP))
393		alternatives_smp_unlock(smp->locks, smp->locks_end,
394					smp->text, smp->text_end);
395	mutex_unlock(&smp_alt);
396}
397
398void __init_or_module alternatives_smp_module_del(struct module *mod)
399{
400	struct smp_alt_module *item;
401
402	if (smp_alt_once || noreplace_smp)
403		return;
404
405	mutex_lock(&smp_alt);
406	list_for_each_entry(item, &smp_alt_modules, next) {
407		if (mod != item->mod)
408			continue;
409		list_del(&item->next);
410		mutex_unlock(&smp_alt);
411		DPRINTK("%s: %s\n", __func__, item->name);
412		kfree(item);
413		return;
414	}
415	mutex_unlock(&smp_alt);
416}
417
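/* When true, alternatives_smp_switch() returns immediately and leaves the text untouched. */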
418bool skip_smp_alternatives;
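/* Switch all registered lock prefixes between their SMP (lock) and UP (DS segment override) forms. */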
419void alternatives_smp_switch(int smp)
420{
421	struct smp_alt_module *mod;
422
423#ifdef CONFIG_LOCKDEP
424	/*
425	 * Older binutils section handling bug prevented
426	 * alternatives-replacement from working reliably.
427	 *
428	 * If this still occurs then you should see a hang
429	 * or crash shortly after this line:
430	 */
431	printk("lockdep: fixing up alternatives.\n");
432#endif
433
434	if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
435		return;
436	BUG_ON(!smp && (num_online_cpus() > 1));
437
438	mutex_lock(&smp_alt);
439
440	/*
441	 * Avoid unnecessary switches because it forces JIT based VMs to
442	 * throw away all cached translations, which can be quite costly.
443	 */
444	if (smp == smp_mode) {
445		/* nothing */
446	} else if (smp) {
447		printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
448		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
449		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
450		list_for_each_entry(mod, &smp_alt_modules, next)
451			alternatives_smp_lock(mod->locks, mod->locks_end,
452					      mod->text, mod->text_end);
453	} else {
454		printk(KERN_INFO "SMP alternatives: switching to UP code\n");
455		set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
456		set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
457		list_for_each_entry(mod, &smp_alt_modules, next)
458			alternatives_smp_unlock(mod->locks, mod->locks_end,
459						mod->text, mod->text_end);
460	}
461	smp_mode = smp;
462	mutex_unlock(&smp_alt);
463}
464
465/* Return 1 if the address range is reserved for smp-alternatives */
466int alternatives_text_reserved(void *start, void *end)
467{
468	struct smp_alt_module *mod;
469	const s32 *poff;
470	u8 *text_start = start;
471	u8 *text_end = end;
472
473	list_for_each_entry(mod, &smp_alt_modules, next) {
474		if (mod->text > text_end || mod->text_end < text_start)
475			continue;
476		for (poff = mod->locks; poff < mod->locks_end; poff++) {
477			const u8 *ptr = (const u8 *)poff + *poff;
478
479			if (text_start <= ptr && text_end > ptr)
480				return 1;
481		}
482	}
483
484	return 0;
485}
486#endif
487
488#ifdef CONFIG_PARAVIRT
489void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
490				     struct paravirt_patch_site *end)
491{
492	struct paravirt_patch_site *p;
493	char insnbuf[MAX_PATCH_LEN];
494
495	if (noreplace_paravirt)
496		return;
497
498	for (p = start; p < end; p++) {
499		unsigned int used;
500
501		BUG_ON(p->len > MAX_PATCH_LEN);
502		/* prep the buffer with the original instructions */
503		memcpy(insnbuf, p->instr, p->len);
504		used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
505					 (unsigned long)p->instr, p->len);
506
507		BUG_ON(used > p->len);
508
509		/* Pad the rest with nops */
510		add_nops(insnbuf + used, p->len - used);
511		text_poke_early(p->instr, insnbuf, p->len);
512	}
513}
514extern struct paravirt_patch_site __start_parainstructions[],
515	__stop_parainstructions[];
516#endif	/* CONFIG_PARAVIRT */
517
518void __init alternative_instructions(void)
519{
520	/* The patching is not fully atomic, so try to avoid local interruptions
521	   that might execute the code being patched.
522	   Other CPUs are not running. */
523	stop_nmi();
524
525	/*
526	 * Don't stop machine check exceptions while patching.
527	 * MCEs only happen when something got corrupted and in this
528	 * case we must do something about the corruption.
529	 * Ignoring it is worse than an unlikely patching race.
530	 * Also machine checks tend to be broadcast and if one CPU
531	 * goes into machine check the others follow quickly, so we don't
532	 * expect a machine check to cause undue problems during code
533	 * patching.
534	 */
535
536	apply_alternatives(__alt_instructions, __alt_instructions_end);
537
538	/* switch to patch-once-at-boottime-only mode and free the
539	 * tables in case we know the number of CPUs will never ever
540	 * change */
541#ifdef CONFIG_HOTPLUG_CPU
542	if (num_possible_cpus() < 2)
543		smp_alt_once = 1;
544#endif
545
546#ifdef CONFIG_SMP
547	if (smp_alt_once) {
548		if (1 == num_possible_cpus()) {
549			printk(KERN_INFO "SMP alternatives: switching to UP code\n");
550			set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
551			set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
552
553			alternatives_smp_unlock(__smp_locks, __smp_locks_end,
554						_text, _etext);
555		}
556	} else {
557		alternatives_smp_module_add(NULL, "core kernel",
558					    __smp_locks, __smp_locks_end,
559					    _text, _etext);
560
561		/* Only switch to UP mode if we don't immediately boot others */
562		if (num_present_cpus() == 1 || setup_max_cpus <= 1)
563			alternatives_smp_switch(0);
564	}
565#endif
566 	apply_paravirt(__parainstructions, __parainstructions_end);
567
568	if (smp_alt_once)
569		free_init_pages("SMP alternatives",
570				(unsigned long)__smp_locks,
571				(unsigned long)__smp_locks_end);
572
573	restart_nmi();
 
574}
575
576/**
577 * text_poke_early - Update instructions on a live kernel at boot time
578 * @addr: address to modify
579 * @opcode: source of the copy
580 * @len: length to copy
581 *
582 * When you use this code to patch more than one byte of an instruction
583 * you need to make sure that other CPUs cannot execute this code in parallel.
584 * Also no thread must be currently preempted in the middle of these
585 * instructions. And on the local CPU you need to be protected against NMI or MCE
586 * handlers seeing an inconsistent instruction while you patch.
587 */
588void *__init_or_module text_poke_early(void *addr, const void *opcode,
589					      size_t len)
590{
591	unsigned long flags;
592	local_irq_save(flags);
593	memcpy(addr, opcode, len);
594	sync_core();
595	local_irq_restore(flags);
596	/* Could also do a CLFLUSH here to speed up CPU recovery; but
597	   that causes hangs on some VIA CPUs. */
598	return addr;
599}
600
601/**
602 * text_poke - Update instructions on a live kernel
603 * @addr: address to modify
604 * @opcode: source of the copy
605 * @len: length to copy
606 *
607 * Only atomic text poke/set should be allowed when not doing early patching.
608 * It means the size must be writable atomically and the address must be aligned
609 * in a way that permits an atomic write. It also makes sure we fit on a single
610 * page.
611 *
612 * Note: Must be called under text_mutex.
613 */
614void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
615{
616	unsigned long flags;
617	char *vaddr;
618	struct page *pages[2];
619	int i;
620
621	if (!core_kernel_text((unsigned long)addr)) {
622		pages[0] = vmalloc_to_page(addr);
623		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
624	} else {
625		pages[0] = virt_to_page(addr);
626		WARN_ON(!PageReserved(pages[0]));
627		pages[1] = virt_to_page(addr + PAGE_SIZE);
628	}
629	BUG_ON(!pages[0]);
630	local_irq_save(flags);
631	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
632	if (pages[1])
633		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
634	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
635	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
636	clear_fixmap(FIX_TEXT_POKE0);
637	if (pages[1])
638		clear_fixmap(FIX_TEXT_POKE1);
639	local_flush_tlb();
640	sync_core();
641	/* Could also do a CLFLUSH here to speed up CPU recovery; but
642	   that causes hangs on some VIA CPUs. */
643	for (i = 0; i < len; i++)
644		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
645	local_irq_restore(flags);
646	return addr;
647}
648
649/*
650 * Cross-modifying kernel text with stop_machine().
651 * This code originally comes from immediate value.
652 */
653static atomic_t stop_machine_first;
654static int wrote_text;
655
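/* A batch of text_poke requests applied together in a single stop_machine() call. */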
656struct text_poke_params {
657	struct text_poke_param *params;
658	int nparams;
 
659};
660
661static int __kprobes stop_machine_text_poke(void *data)
662{
663	struct text_poke_params *tpp = data;
664	struct text_poke_param *p;
665	int i;
666
667	if (atomic_dec_and_test(&stop_machine_first)) {
668		for (i = 0; i < tpp->nparams; i++) {
669			p = &tpp->params[i];
670			text_poke(p->addr, p->opcode, p->len);
671		}
672		smp_wmb();	/* Make sure other cpus see that this has run */
673		wrote_text = 1;
674	} else {
675		while (!wrote_text)
676			cpu_relax();
677		smp_mb();	/* Load wrote_text before following execution */
678	}
679
680	for (i = 0; i < tpp->nparams; i++) {
681		p = &tpp->params[i];
682		flush_icache_range((unsigned long)p->addr,
683				   (unsigned long)p->addr + p->len);
684	}
685	/*
686	 * Intel Architecture Software Developer's Manual section 7.1.3 specifies
687	 * that a core serializing instruction such as "cpuid" should be
688	 * executed on _each_ core before the new instruction is made visible.
689	 */
690	sync_core();
691	return 0;
692}
693
694/**
695 * text_poke_smp - Update instructions on a live kernel on SMP
696 * @addr: address to modify
697 * @opcode: source of the copy
698 * @len: length to copy
699 *
700 * Modify multi-byte instruction by using stop_machine() on SMP. This allows
701 * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
702 * should be allowed, since stop_machine() does _not_ protect code against
703 * NMI and MCE.
704 *
705 * Note: Must be called under get_online_cpus() and text_mutex.
706 */
707void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
708{
709	struct text_poke_params tpp;
710	struct text_poke_param p;
711
712	p.addr = addr;
713	p.opcode = opcode;
714	p.len = len;
715	tpp.params = &p;
716	tpp.nparams = 1;
717	atomic_set(&stop_machine_first, 1);
718	wrote_text = 0;
719	/* Use __stop_machine() because the caller already got online_cpus. */
720	__stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
721	return addr;
722}
723
724/**
725 * text_poke_smp_batch - Update instructions on a live kernel on SMP
726 * @params: an array of text_poke parameters
727 * @n: the number of elements in params.
728 *
729 * Modify multi-byte instruction by using stop_machine() on SMP. Since
730 * stop_machine() is a heavy task, it is better to aggregate text_poke requests
731 * and do it once if possible.
732 *
733 * Note: Must be called under get_online_cpus() and text_mutex.
734 */
735void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
736{
737	struct text_poke_params tpp = {.params = params, .nparams = n};
738
739	atomic_set(&stop_machine_first, 1);
740	wrote_text = 0;
741	__stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
742}
v6.2
   1// SPDX-License-Identifier: GPL-2.0-only
   2#define pr_fmt(fmt) "SMP alternatives: " fmt
   3
   4#include <linux/module.h>
   5#include <linux/sched.h>
   6#include <linux/perf_event.h>
   7#include <linux/mutex.h>
   8#include <linux/list.h>
   9#include <linux/stringify.h>
  10#include <linux/highmem.h>
  11#include <linux/mm.h>
  12#include <linux/vmalloc.h>
  13#include <linux/memory.h>
  14#include <linux/stop_machine.h>
  15#include <linux/slab.h>
  16#include <linux/kdebug.h>
  17#include <linux/kprobes.h>
  18#include <linux/mmu_context.h>
  19#include <linux/bsearch.h>
  20#include <linux/sync_core.h>
  21#include <asm/text-patching.h>
  22#include <asm/alternative.h>
  23#include <asm/sections.h>
 
  24#include <asm/mce.h>
  25#include <asm/nmi.h>
  26#include <asm/cacheflush.h>
  27#include <asm/tlbflush.h>
  28#include <asm/insn.h>
  29#include <asm/io.h>
  30#include <asm/fixmap.h>
  31#include <asm/paravirt.h>
  32#include <asm/asm-prototypes.h>
  33
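/* Nonzero once the boot-time alternatives have been applied. */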
  34int __read_mostly alternatives_patched;
  35
  36EXPORT_SYMBOL_GPL(alternatives_patched);
 
  37
  38#define MAX_PATCH_LEN (255-1)
  39
  40static int __initdata_or_module debug_alternative;
  41
  42static int __init debug_alt(char *str)
  43{
  44	debug_alternative = 1;
  45	return 1;
  46}
  47__setup("debug-alternative", debug_alt);
  48
  49static int noreplace_smp;
  50
  51static int __init setup_noreplace_smp(char *str)
  52{
  53	noreplace_smp = 1;
  54	return 1;
  55}
  56__setup("noreplace-smp", setup_noreplace_smp);
  57
  58#define DPRINTK(fmt, args...)						\
  59do {									\
  60	if (debug_alternative)						\
  61		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
  62} while (0)
  63
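/* Hex-dump a buffer via printk when booted with debug-alternative. */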
  64#define DUMP_BYTES(buf, len, fmt, args...)				\
  65do {									\
  66	if (unlikely(debug_alternative)) {				\
  67		int j;							\
  68									\
  69		if (!(len))						\
  70			break;						\
  71									\
  72		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
  73		for (j = 0; j < (len) - 1; j++)				\
  74			printk(KERN_CONT "%02hhx ", buf[j]);		\
  75		printk(KERN_CONT "%02hhx\n", buf[j]);			\
  76	}								\
  77} while (0)
  78
  79static const unsigned char x86nops[] =
  80{
  81	BYTES_NOP1,
  82	BYTES_NOP2,
  83	BYTES_NOP3,
  84	BYTES_NOP4,
  85	BYTES_NOP5,
  86	BYTES_NOP6,
  87	BYTES_NOP7,
  88	BYTES_NOP8,
  89};
  90
  91const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
  92{
  93	NULL,
  94	x86nops,
  95	x86nops + 1,
  96	x86nops + 1 + 2,
  97	x86nops + 1 + 2 + 3,
  98	x86nops + 1 + 2 + 3 + 4,
  99	x86nops + 1 + 2 + 3 + 4 + 5,
 100	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
 101	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 
 102};
 103
 104/* Use this to add nops to a buffer, then text_poke the whole buffer. */
 105static void __init_or_module add_nops(void *insns, unsigned int len)
 106{
 107	while (len > 0) {
 108		unsigned int noplen = len;
 109		if (noplen > ASM_NOP_MAX)
 110			noplen = ASM_NOP_MAX;
 111		memcpy(insns, x86_nops[noplen], noplen);
 112		insns += noplen;
 113		len -= noplen;
 114	}
 115}
 116
 117extern s32 __retpoline_sites[], __retpoline_sites_end[];
 118extern s32 __return_sites[], __return_sites_end[];
 119extern s32 __cfi_sites[], __cfi_sites_end[];
 120extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
 121extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 122extern s32 __smp_locks[], __smp_locks_end[];
 123void text_poke_early(void *addr, const void *opcode, size_t len);
 124
 125/*
 126 * Are we looking at a near JMP with a 1 or 4-byte displacement?
 127 */
 128static inline bool is_jmp(const u8 opcode)
 129{
 130	return opcode == 0xeb || opcode == 0xe9;
 131}
 132
 133static void __init_or_module
 134recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
 135{
 136	u8 *next_rip, *tgt_rip;
 137	s32 n_dspl, o_dspl;
 138	int repl_len;
 139
 140	if (a->replacementlen != 5)
 141		return;
 142
 143	o_dspl = *(s32 *)(insn_buff + 1);
 144
 145	/* next_rip of the replacement JMP */
 146	next_rip = repl_insn + a->replacementlen;
 147	/* target rip of the replacement JMP */
 148	tgt_rip  = next_rip + o_dspl;
 149	n_dspl = tgt_rip - orig_insn;
 150
 151	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
 152
 153	if (tgt_rip - orig_insn >= 0) {
 154		if (n_dspl - 2 <= 127)
 155			goto two_byte_jmp;
 156		else
 157			goto five_byte_jmp;
 158	/* negative offset */
 159	} else {
 160		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
 161			goto two_byte_jmp;
 162		else
 163			goto five_byte_jmp;
 164	}
 165
 166two_byte_jmp:
 167	n_dspl -= 2;
 168
 169	insn_buff[0] = 0xeb;
 170	insn_buff[1] = (s8)n_dspl;
 171	add_nops(insn_buff + 2, 3);
 172
 173	repl_len = 2;
 174	goto done;
 175
 176five_byte_jmp:
 177	n_dspl -= 5;
 178
 179	insn_buff[0] = 0xe9;
 180	*(s32 *)&insn_buff[1] = n_dspl;
 181
 182	repl_len = 5;
 183
 184done:
 185
 186	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
 187		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
 188}
 189
 190/*
 191 * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
 192 *
 193 * @instr: instruction byte stream
 194 * @instrlen: length of the above
 195 * @off: offset within @instr where the first NOP has been detected
 196 *
 197 * Return: number of NOPs found (and replaced).
 198 */
 199static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
 200{
 201	unsigned long flags;
 202	int i = off, nnops;
 203
 204	while (i < instrlen) {
 205		if (instr[i] != 0x90)
 206			break;
 207
 208		i++;
 209	}
 210
 211	nnops = i - off;
 212
 213	if (nnops <= 1)
 214		return nnops;
 215
 216	local_irq_save(flags);
 217	add_nops(instr + off, nnops);
 218	local_irq_restore(flags);
 219
 220	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
 221
 222	return nnops;
 223}
 224
 225/*
 226 * "noinline" to cause control flow change and thus invalidate I$ and
 227 * cause refetch after modification.
 228 */
 229static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
 230{
 231	struct insn insn;
 232	int i = 0;
 233
 234	/*
 235	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
 236	 * ones.
 237	 */
 238	for (;;) {
 239		if (insn_decode_kernel(&insn, &instr[i]))
 240			return;
 241
 242		/*
 243		 * See if this and any potentially following NOPs can be
 244		 * optimized.
 245		 */
 246		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
 247			i += optimize_nops_range(instr, len, i);
 248		else
 249			i += insn.length;
 250
 251		if (i >= len)
 252			return;
 253	}
 254}
 255
 256/*
 257 * Replace instructions with better alternatives for this CPU type. This runs
 258 * before SMP is initialized to avoid SMP problems with self-modifying code.
 259 * This implies that asymmetric systems where APs have fewer capabilities than
 260 * the boot processor are not handled. Tough. Make sure you disable such
 261 * features by hand.
 262 *
 263 * Marked "noinline" to cause control flow change and thus insn cache
 264 * to refetch changed I$ lines.
 265 */
 266void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 267						  struct alt_instr *end)
 268{
 269	struct alt_instr *a;
 270	u8 *instr, *replacement;
 271	u8 insn_buff[MAX_PATCH_LEN];
 272
 273	DPRINTK("alt table %px, -> %px", start, end);
 274	/*
 275	 * The scan order should be from start to end. A later scanned
 276	 * alternative code can overwrite previously scanned alternative code.
 277	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
 278	 * patch code.
 279	 *
 280	 * So be careful if you want to change the scan order to any other
 281	 * order.
 282	 */
 283	for (a = start; a < end; a++) {
 284		int insn_buff_sz = 0;
 285		/* Mask away "NOT" flag bit for feature to test. */
 286		u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
 287
 288		instr = (u8 *)&a->instr_offset + a->instr_offset;
 289		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 290		BUG_ON(a->instrlen > sizeof(insn_buff));
 291		BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
 292
 293		/*
 294		 * Patch if either:
 295		 * - feature is present
 296		 * - feature not present but ALTINSTR_FLAG_INV is set to mean,
 297		 *   patch if feature is *NOT* present.
 298		 */
 299		if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
 300			goto next;
 301
 302		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
 303			(a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
 304			feature >> 5,
 305			feature & 0x1f,
 306			instr, instr, a->instrlen,
 307			replacement, a->replacementlen);
 308
 309		DUMP_BYTES(instr, a->instrlen, "%px:   old_insn: ", instr);
 310		DUMP_BYTES(replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
 311
 312		memcpy(insn_buff, replacement, a->replacementlen);
 313		insn_buff_sz = a->replacementlen;
 314
 315		/*
 316		 * 0xe8 is a relative jump; fix the offset.
 317		 *
 318		 * Instruction length is checked before the opcode to avoid
 319		 * accessing uninitialized bytes for zero-length replacements.
 320		 */
 321		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
 322			*(s32 *)(insn_buff + 1) += replacement - instr;
 323			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
 324				*(s32 *)(insn_buff + 1),
 325				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
 326		}
 327
 328		if (a->replacementlen && is_jmp(replacement[0]))
 329			recompute_jump(a, instr, replacement, insn_buff);
 330
 331		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
 332			insn_buff[insn_buff_sz] = 0x90;
 333
 334		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 335
 336		text_poke_early(instr, insn_buff, insn_buff_sz);
 337
 338next:
 339		optimize_nops(instr, a->instrlen);
 340	}
 341}
 342
 343#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
 344
 345/*
 346 * CALL/JMP *%\reg
 347 */
 348static int emit_indirect(int op, int reg, u8 *bytes)
 349{
 350	int i = 0;
 351	u8 modrm;
 352
 353	switch (op) {
 354	case CALL_INSN_OPCODE:
 355		modrm = 0x10; /* Reg = 2; CALL r/m */
 356		break;
 357
 358	case JMP32_INSN_OPCODE:
 359		modrm = 0x20; /* Reg = 4; JMP r/m */
 360		break;
 361
 362	default:
 363		WARN_ON_ONCE(1);
 364		return -1;
 365	}
 366
 367	if (reg >= 8) {
 368		bytes[i++] = 0x41; /* REX.B prefix */
 369		reg -= 8;
 370	}
 371
 372	modrm |= 0xc0; /* Mod = 3 */
 373	modrm += reg;
 374
 375	bytes[i++] = 0xff; /* opcode */
 376	bytes[i++] = modrm;
 377
 378	return i;
 379}
 380
 381static inline bool is_jcc32(struct insn *insn)
 382{
 383	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
 384	return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
 385}
 386
 387static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
 388{
 389	u8 op = insn->opcode.bytes[0];
 390	int i = 0;
 391
 392	/*
 393	 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
 394	 * tail-calls. Deal with them.
 395	 */
 396	if (is_jcc32(insn)) {
 397		bytes[i++] = op;
 398		op = insn->opcode.bytes[1];
 399		goto clang_jcc;
 400	}
 401
 402	if (insn->length == 6)
 403		bytes[i++] = 0x2e; /* CS-prefix */
 404
 405	switch (op) {
 406	case CALL_INSN_OPCODE:
 407		__text_gen_insn(bytes+i, op, addr+i,
 408				__x86_indirect_call_thunk_array[reg],
 409				CALL_INSN_SIZE);
 410		i += CALL_INSN_SIZE;
 411		break;
 412
 413	case JMP32_INSN_OPCODE:
 414clang_jcc:
 415		__text_gen_insn(bytes+i, op, addr+i,
 416				__x86_indirect_jump_thunk_array[reg],
 417				JMP32_INSN_SIZE);
 418		i += JMP32_INSN_SIZE;
 419		break;
 420
 421	default:
 422		WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
 423		return -1;
 424	}
 425
 426	WARN_ON_ONCE(i != insn->length);
 427
 428	return i;
 429}
 430
 431/*
 432 * Rewrite the compiler generated retpoline thunk calls.
 433 *
 434 * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
 435 * indirect instructions, avoiding the extra indirection.
 436 *
 437 * For example, convert:
 438 *
 439 *   CALL __x86_indirect_thunk_\reg
 440 *
 441 * into:
 442 *
 443 *   CALL *%\reg
 444 *
 445 * It also tries to inline spectre_v2=retpoline,lfence when size permits.
 446 */
 447static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
 448{
 449	retpoline_thunk_t *target;
 450	int reg, ret, i = 0;
 451	u8 op, cc;
 452
 453	target = addr + insn->length + insn->immediate.value;
 454	reg = target - __x86_indirect_thunk_array;
 455
 456	if (WARN_ON_ONCE(reg & ~0xf))
 457		return -1;
 458
 459	/* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
 460	BUG_ON(reg == 4);
 461
 462	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
 463	    !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
 464		if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
 465			return emit_call_track_retpoline(addr, insn, reg, bytes);
 466
 467		return -1;
 468	}
 469
 470	op = insn->opcode.bytes[0];
 471
 472	/*
 473	 * Convert:
 474	 *
 475	 *   Jcc.d32 __x86_indirect_thunk_\reg
 476	 *
 477	 * into:
 478	 *
 479	 *   Jncc.d8 1f
 480	 *   [ LFENCE ]
 481	 *   JMP *%\reg
 482	 *   [ NOP ]
 483	 * 1:
 484	 */
 485	if (is_jcc32(insn)) {
 486		cc = insn->opcode.bytes[1] & 0xf;
 487		cc ^= 1; /* invert condition */
 488
 489		bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
 490		bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
 491
 492		/* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
 493		op = JMP32_INSN_OPCODE;
 494	}
 495
 496	/*
 497	 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
 498	 */
 499	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
 500		bytes[i++] = 0x0f;
 501		bytes[i++] = 0xae;
 502		bytes[i++] = 0xe8; /* LFENCE */
 503	}
 504
 505	ret = emit_indirect(op, reg, bytes + i);
 506	if (ret < 0)
 507		return ret;
 508	i += ret;
 509
 510	/*
 511	 * The compiler is supposed to EMIT an INT3 after every unconditional
 512	 * JMP instruction due to AMD BTC. However, if the compiler is too old
 513	 * or SLS isn't enabled, we still need an INT3 after indirect JMPs
 514	 * even on Intel.
 515	 */
 516	if (op == JMP32_INSN_OPCODE && i < insn->length)
 517		bytes[i++] = INT3_INSN_OPCODE;
 518
 519	for (; i < insn->length;)
 520		bytes[i++] = BYTES_NOP1;
 521
 522	return i;
 523}
 524
 525/*
 526 * Generated by 'objtool --retpoline'.
 527 */
 528void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
 529{
 530	s32 *s;
 531
 532	for (s = start; s < end; s++) {
 533		void *addr = (void *)s + *s;
 534		struct insn insn;
 535		int len, ret;
 536		u8 bytes[16];
 537		u8 op1, op2;
 538
 539		ret = insn_decode_kernel(&insn, addr);
 540		if (WARN_ON_ONCE(ret < 0))
 541			continue;
 542
 543		op1 = insn.opcode.bytes[0];
 544		op2 = insn.opcode.bytes[1];
 545
 546		switch (op1) {
 547		case CALL_INSN_OPCODE:
 548		case JMP32_INSN_OPCODE:
 549			break;
 550
 551		case 0x0f: /* escape */
 552			if (op2 >= 0x80 && op2 <= 0x8f)
 553				break;
 554			fallthrough;
 555		default:
 556			WARN_ON_ONCE(1);
 557			continue;
 558		}
 559
 560		DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
 561			addr, addr, insn.length,
 562			addr + insn.length + insn.immediate.value);
 563
 564		len = patch_retpoline(addr, &insn, bytes);
 565		if (len == insn.length) {
 566			optimize_nops(bytes, len);
 567			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
 568			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
 569			text_poke_early(addr, bytes, len);
 570		}
 571	}
 572}
 573
 574#ifdef CONFIG_RETHUNK
 
 575
 576#ifdef CONFIG_CALL_THUNKS
 577void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
 578#endif
 579
 580/*
 581 * Rewrite the compiler generated return thunk tail-calls.
 582 *
 583 * For example, convert:
 584 *
 585 *   JMP __x86_return_thunk
 586 *
 587 * into:
 588 *
 589 *   RET
 590 */
 591static int patch_return(void *addr, struct insn *insn, u8 *bytes)
 592{
 593	int i = 0;
 594
 595	if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
 596		if (x86_return_thunk == __x86_return_thunk)
 597			return -1;
 598
 599		i = JMP32_INSN_SIZE;
 600		__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
 601	} else {
 602		bytes[i++] = RET_INSN_OPCODE;
 603	}
 604
 605	for (; i < insn->length;)
 606		bytes[i++] = INT3_INSN_OPCODE;
 607	return i;
 608}
 609
 610void __init_or_module noinline apply_returns(s32 *start, s32 *end)
 611{
 612	s32 *s;
 613
 614	for (s = start; s < end; s++) {
 615		void *dest = NULL, *addr = (void *)s + *s;
 616		struct insn insn;
 617		int len, ret;
 618		u8 bytes[16];
 619		u8 op;
 620
 621		ret = insn_decode_kernel(&insn, addr);
 622		if (WARN_ON_ONCE(ret < 0))
 623			continue;
 624
 625		op = insn.opcode.bytes[0];
 626		if (op == JMP32_INSN_OPCODE)
 627			dest = addr + insn.length + insn.immediate.value;
 628
 629		if (__static_call_fixup(addr, op, dest) ||
 630		    WARN_ONCE(dest != &__x86_return_thunk,
 631			      "missing return thunk: %pS-%pS: %*ph",
 632			      addr, dest, 5, addr))
 633			continue;
 634
 635		DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
 636			addr, addr, insn.length,
 637			addr + insn.length + insn.immediate.value);
 638
 639		len = patch_return(addr, &insn, bytes);
 640		if (len == insn.length) {
 641			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
 642			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
 643			text_poke_early(addr, bytes, len);
 644		}
 645	}
 646}
 647#else
 648void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
 649#endif /* CONFIG_RETHUNK */
 650
 651#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
 652
 653void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
 654void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
 655
 656#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
 657
 658#ifdef CONFIG_X86_KERNEL_IBT
 659
 660static void poison_endbr(void *addr, bool warn)
 661{
 662	u32 endbr, poison = gen_endbr_poison();
 663
 664	if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
 665		return;
 666
 667	if (!is_endbr(endbr)) {
 668		WARN_ON_ONCE(warn);
 669		return;
 670	}
 671
 672	DPRINTK("ENDBR at: %pS (%px)", addr, addr);
 673
 674	/*
 675	 * When we have IBT, the lack of ENDBR will trigger #CP
 676	 */
 677	DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
 678	DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
 679	text_poke_early(addr, &poison, 4);
 680}
 681
 682/*
 683 * Generated by: objtool --ibt
 684 */
 685void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
 686{
 687	s32 *s;
 688
 689	for (s = start; s < end; s++) {
 690		void *addr = (void *)s + *s;
 691
 692		poison_endbr(addr, true);
 693		if (IS_ENABLED(CONFIG_FINEIBT))
 694			poison_endbr(addr - 16, false);
 695	}
 696}
 697
 698#else
 699
 700void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { }
 701
 702#endif /* CONFIG_X86_KERNEL_IBT */
 703
 704#ifdef CONFIG_FINEIBT
 705
 706enum cfi_mode {
 707	CFI_DEFAULT,
 708	CFI_OFF,
 709	CFI_KCFI,
 710	CFI_FINEIBT,
 711};
 712
 713static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
 714static bool cfi_rand __ro_after_init = true;
 715static u32  cfi_seed __ro_after_init;
 716
 717/*
 718 * Re-hash the CFI hash with a boot-time seed while making sure the result is
 719 * not a valid ENDBR instruction.
 720 */
 721static u32 cfi_rehash(u32 hash)
 722{
 723	hash ^= cfi_seed;
 724	while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
 725		bool lsb = hash & 1;
 726		hash >>= 1;
 727		if (lsb)
 728			hash ^= 0x80200003;
 729	}
 730	return hash;
 731}
 732
 733static __init int cfi_parse_cmdline(char *str)
 734{
 735	if (!str)
 736		return -EINVAL;
 737
 738	while (str) {
 739		char *next = strchr(str, ',');
 740		if (next) {
 741			*next = 0;
 742			next++;
 743		}
 744
 745		if (!strcmp(str, "auto")) {
 746			cfi_mode = CFI_DEFAULT;
 747		} else if (!strcmp(str, "off")) {
 748			cfi_mode = CFI_OFF;
 749			cfi_rand = false;
 750		} else if (!strcmp(str, "kcfi")) {
 751			cfi_mode = CFI_KCFI;
 752		} else if (!strcmp(str, "fineibt")) {
 753			cfi_mode = CFI_FINEIBT;
 754		} else if (!strcmp(str, "norand")) {
 755			cfi_rand = false;
 756		} else {
 757			pr_err("Ignoring unknown cfi option (%s).", str);
 758		}
 759
 760		str = next;
 761	}
 762
 763	return 0;
 764}
 765early_param("cfi", cfi_parse_cmdline);
 766
 767/*
 768 * kCFI						FineIBT
 769 *
 770 * __cfi_\func:					__cfi_\func:
 771 *	movl   $0x12345678,%eax		// 5	     endbr64			// 4
 772 *	nop					     subl   $0x12345678,%r10d   // 7
 773 *	nop					     jz     1f			// 2
 774 *	nop					     ud2			// 2
 775 *	nop					1:   nop			// 1
 776 *	nop
 777 *	nop
 778 *	nop
 779 *	nop
 780 *	nop
 781 *	nop
 782 *	nop
 783 *
 784 *
 785 * caller:					caller:
 786 *	movl	$(-0x12345678),%r10d	 // 6	     movl   $0x12345678,%r10d	// 6
 787 *	addl	$-15(%r11),%r10d	 // 4	     sub    $16,%r11		// 4
 788 *	je	1f			 // 2	     nop4			// 4
 789 *	ud2				 // 2
 790 * 1:	call	__x86_indirect_thunk_r11 // 5	     call   *%r11; nop2;	// 5
 791 *
 792 */
 793
 794asm(	".pushsection .rodata			\n"
 795	"fineibt_preamble_start:		\n"
 796	"	endbr64				\n"
 797	"	subl	$0x12345678, %r10d	\n"
 798	"	je	fineibt_preamble_end	\n"
 799	"	ud2				\n"
 800	"	nop				\n"
 801	"fineibt_preamble_end:			\n"
 802	".popsection\n"
 803);
 804
 805extern u8 fineibt_preamble_start[];
 806extern u8 fineibt_preamble_end[];
 807
 808#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
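/* Byte offset of the 0x12345678 hash immediate in the preamble: 4-byte ENDBR64 plus the three opcode/ModRM bytes of SUBL. */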
 809#define fineibt_preamble_hash 7
 810
 811asm(	".pushsection .rodata			\n"
 812	"fineibt_caller_start:			\n"
 813	"	movl	$0x12345678, %r10d	\n"
 814	"	sub	$16, %r11		\n"
 815	ASM_NOP4
 816	"fineibt_caller_end:			\n"
 817	".popsection				\n"
 818);
 819
 820extern u8 fineibt_caller_start[];
 821extern u8 fineibt_caller_end[];
 822
 823#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
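/* Byte offset of the hash immediate in the caller sequence: the REX.B prefix and MOV opcode (41 ba) come first. */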
 824#define fineibt_caller_hash 2
 825
 826#define fineibt_caller_jmp (fineibt_caller_size - 2)
 827
 828static u32 decode_preamble_hash(void *addr)
 829{
 830	u8 *p = addr;
 831
 832	/* b8 78 56 34 12          mov    $0x12345678,%eax */
 833	if (p[0] == 0xb8)
 834		return *(u32 *)(addr + 1);
 835
 836	return 0; /* invalid hash value */
 837}
 838
 839static u32 decode_caller_hash(void *addr)
 840{
 841	u8 *p = addr;
 842
 843	/* 41 ba 78 56 34 12       mov    $0x12345678,%r10d */
 844	if (p[0] == 0x41 && p[1] == 0xba)
 845		return -*(u32 *)(addr + 2);
 846
 847	/* eb 0c 78 56 34 12	   jmp.d8  +12 */
 848	if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
 849		return -*(u32 *)(addr + 2);
 850
 851	return 0; /* invalid hash value */
 852}
 853
 854/* .retpoline_sites */
 855static int cfi_disable_callers(s32 *start, s32 *end)
 856{
 857	/*
 858	 * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
 859	 * intact for later usage. Also see decode_caller_hash() and
 860	 * cfi_rewrite_callers().
 861	 */
 862	const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
 863	s32 *s;
 864
 865	for (s = start; s < end; s++) {
 866		void *addr = (void *)s + *s;
 867		u32 hash;
 868
 869		addr -= fineibt_caller_size;
 870		hash = decode_caller_hash(addr);
 871		if (!hash) /* nocfi callers */
 872			continue;
 873
 874		text_poke_early(addr, jmp, 2);
 875	}
 876
 877	return 0;
 878}
 879
 880static int cfi_enable_callers(s32 *start, s32 *end)
 881{
 882	/*
 883	 * Re-enable kCFI, undo what cfi_disable_callers() did.
 884	 */
 885	const u8 mov[] = { 0x41, 0xba };
 886	s32 *s;
 887
 888	for (s = start; s < end; s++) {
 889		void *addr = (void *)s + *s;
 890		u32 hash;
 891
 892		addr -= fineibt_caller_size;
 893		hash = decode_caller_hash(addr);
 894		if (!hash) /* nocfi callers */
 895			continue;
 896
 897		text_poke_early(addr, mov, 2);
 898	}
 899
 900	return 0;
 901}
 902
 903/* .cfi_sites */
 904static int cfi_rand_preamble(s32 *start, s32 *end)
 905{
 906	s32 *s;
 907
 908	for (s = start; s < end; s++) {
 909		void *addr = (void *)s + *s;
 910		u32 hash;
 911
 912		hash = decode_preamble_hash(addr);
 913		if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
 914			 addr, addr, 5, addr))
 915			return -EINVAL;
 916
 917		hash = cfi_rehash(hash);
 918		text_poke_early(addr + 1, &hash, 4);
 919	}
 920
 921	return 0;
 922}
 923
 924static int cfi_rewrite_preamble(s32 *start, s32 *end)
 925{
 926	s32 *s;
 927
 928	for (s = start; s < end; s++) {
 929		void *addr = (void *)s + *s;
 930		u32 hash;
 931
 932		hash = decode_preamble_hash(addr);
 933		if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
 934			 addr, addr, 5, addr))
 935			return -EINVAL;
 936
 937		text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
 938		WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
 939		text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
 940	}
 941
 942	return 0;
 943}
 944
 945/* .retpoline_sites */
 946static int cfi_rand_callers(s32 *start, s32 *end)
 947{
 948	s32 *s;
 949
 950	for (s = start; s < end; s++) {
 951		void *addr = (void *)s + *s;
 952		u32 hash;
 953
 954		addr -= fineibt_caller_size;
 955		hash = decode_caller_hash(addr);
 956		if (hash) {
 957			hash = -cfi_rehash(hash);
 958			text_poke_early(addr + 2, &hash, 4);
 959		}
 960	}
 961
 962	return 0;
 963}
 964
 965static int cfi_rewrite_callers(s32 *start, s32 *end)
 966{
 967	s32 *s;
 968
 969	for (s = start; s < end; s++) {
 970		void *addr = (void *)s + *s;
 971		u32 hash;
 972
 973		addr -= fineibt_caller_size;
 974		hash = decode_caller_hash(addr);
 975		if (hash) {
 976			text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
 977			WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
 978			text_poke_early(addr + fineibt_caller_hash, &hash, 4);
 979		}
 980		/* rely on apply_retpolines() */
 981	}
 982
 983	return 0;
 984}
 985
 986static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
 987			    s32 *start_cfi, s32 *end_cfi, bool builtin)
 988{
 989	int ret;
 990
 991	if (WARN_ONCE(fineibt_preamble_size != 16,
 992		      "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
 993		return;
 994
 995	if (cfi_mode == CFI_DEFAULT) {
 996		cfi_mode = CFI_KCFI;
 997		if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
 998			cfi_mode = CFI_FINEIBT;
 999	}
1000
1001	/*
1002	 * Rewrite the callers to not use the __cfi_ stubs, such that we might
1003	 * rewrite them. This disables all CFI. If this succeeds but any of the
1004	 * later stages fails, we're without CFI.
1005	 */
1006	ret = cfi_disable_callers(start_retpoline, end_retpoline);
1007	if (ret)
1008		goto err;
1009
1010	if (cfi_rand) {
1011		if (builtin)
1012			cfi_seed = get_random_u32();
1013
1014		ret = cfi_rand_preamble(start_cfi, end_cfi);
1015		if (ret)
1016			goto err;
1017
1018		ret = cfi_rand_callers(start_retpoline, end_retpoline);
1019		if (ret)
1020			goto err;
1021	}
1022
1023	switch (cfi_mode) {
1024	case CFI_OFF:
1025		if (builtin)
1026			pr_info("Disabling CFI\n");
1027		return;
1028
1029	case CFI_KCFI:
1030		ret = cfi_enable_callers(start_retpoline, end_retpoline);
1031		if (ret)
1032			goto err;
1033
1034		if (builtin)
1035			pr_info("Using kCFI\n");
1036		return;
1037
1038	case CFI_FINEIBT:
1039		ret = cfi_rewrite_preamble(start_cfi, end_cfi);
1040		if (ret)
1041			goto err;
1042
1043		ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
1044		if (ret)
1045			goto err;
1046
1047		if (builtin)
1048			pr_info("Using FineIBT CFI\n");
1049		return;
1050
1051	default:
1052		break;
1053	}
1054
1055err:
1056	pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
1057}
1058
1059#else
1060
1061static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1062			    s32 *start_cfi, s32 *end_cfi, bool builtin)
1063{
1064}
1065
1066#endif
1067
1068void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
1069		   s32 *start_cfi, s32 *end_cfi)
1070{
1071	return __apply_fineibt(start_retpoline, end_retpoline,
1072			       start_cfi, end_cfi,
1073			       /* .builtin = */ false);
1074}
1075
1076#ifdef CONFIG_SMP
1077static void alternatives_smp_lock(const s32 *start, const s32 *end,
1078				  u8 *text, u8 *text_end)
1079{
1080	const s32 *poff;
1081
 
1082	for (poff = start; poff < end; poff++) {
1083		u8 *ptr = (u8 *)poff + *poff;
1084
1085		if (!*poff || ptr < text || ptr >= text_end)
1086			continue;
1087		/* turn DS segment override prefix into lock prefix */
1088		if (*ptr == 0x3e)
1089			text_poke(ptr, ((unsigned char []){0xf0}), 1);
1090	}
 
1091}
1092
1093static void alternatives_smp_unlock(const s32 *start, const s32 *end,
1094				    u8 *text, u8 *text_end)
1095{
1096	const s32 *poff;
1097
1098	for (poff = start; poff < end; poff++) {
1099		u8 *ptr = (u8 *)poff + *poff;
1100
1101		if (!*poff || ptr < text || ptr >= text_end)
1102			continue;
1103		/* turn lock prefix into DS segment override prefix */
1104		if (*ptr == 0xf0)
1105			text_poke(ptr, ((unsigned char []){0x3E}), 1);
1106	}
 
1107}
1108
1109struct smp_alt_module {
1110	/* what is this ??? */
1111	struct module	*mod;
1112	char		*name;
1113
1114	/* ptrs to lock prefixes */
1115	const s32	*locks;
1116	const s32	*locks_end;
1117
1118	/* .text segment, needed to avoid patching init code ;) */
1119	u8		*text;
1120	u8		*text_end;
1121
1122	struct list_head next;
1123};
1124static LIST_HEAD(smp_alt_modules);
1125static bool uniproc_patched = false;	/* protected by text_mutex */
 
1126
1127void __init_or_module alternatives_smp_module_add(struct module *mod,
1128						  char *name,
1129						  void *locks, void *locks_end,
1130						  void *text,  void *text_end)
1131{
1132	struct smp_alt_module *smp;
1133
1134	mutex_lock(&text_mutex);
1135	if (!uniproc_patched)
1136		goto unlock;
1137
1138	if (num_possible_cpus() == 1)
1139		/* Don't bother remembering, we'll never have to undo it. */
1140		goto smp_unlock;
1141
1142	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
1143	if (NULL == smp)
1144		/* we'll run the (safe but slow) SMP code then ... */
1145		goto unlock;
1146
1147	smp->mod	= mod;
1148	smp->name	= name;
1149	smp->locks	= locks;
1150	smp->locks_end	= locks_end;
1151	smp->text	= text;
1152	smp->text_end	= text_end;
1153	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
1154		smp->locks, smp->locks_end,
1155		smp->text, smp->text_end, smp->name);
1156
 
1157	list_add_tail(&smp->next, &smp_alt_modules);
1158smp_unlock:
1159	alternatives_smp_unlock(locks, locks_end, text, text_end);
1160unlock:
1161	mutex_unlock(&text_mutex);
1162}
1163
1164void __init_or_module alternatives_smp_module_del(struct module *mod)
1165{
1166	struct smp_alt_module *item;
1167
1168	mutex_lock(&text_mutex);
1169	list_for_each_entry(item, &smp_alt_modules, next) {
1170		if (mod != item->mod)
1171			continue;
1172		list_del(&item->next);
1173		kfree(item);
1174		break;
1175	}
1176	mutex_unlock(&text_mutex);
1177}
1178
1179void alternatives_enable_smp(void)
 
1180{
1181	struct smp_alt_module *mod;
1182
1183	/* Why bother if there are no other CPUs? */
1184	BUG_ON(num_possible_cpus() == 1);
1185
1186	mutex_lock(&text_mutex);
1187
1188	if (uniproc_patched) {
1189		pr_info("switching to SMP code\n");
1190		BUG_ON(num_online_cpus() != 1);
1191		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
1192		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
1193		list_for_each_entry(mod, &smp_alt_modules, next)
1194			alternatives_smp_lock(mod->locks, mod->locks_end,
1195					      mod->text, mod->text_end);
1196		uniproc_patched = false;
1197	}
1198	mutex_unlock(&text_mutex);
 
1199}
1200
1201/*
1202 * Return 1 if the address range is reserved for SMP-alternatives.
1203 * Must hold text_mutex.
1204 */
1205int alternatives_text_reserved(void *start, void *end)
1206{
1207	struct smp_alt_module *mod;
1208	const s32 *poff;
1209	u8 *text_start = start;
1210	u8 *text_end = end;
1211
1212	lockdep_assert_held(&text_mutex);
1213
1214	list_for_each_entry(mod, &smp_alt_modules, next) {
1215		if (mod->text > text_end || mod->text_end < text_start)
1216			continue;
1217		for (poff = mod->locks; poff < mod->locks_end; poff++) {
1218			const u8 *ptr = (const u8 *)poff + *poff;
1219
1220			if (text_start <= ptr && text_end > ptr)
1221				return 1;
1222		}
1223	}
1224
1225	return 0;
1226}
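/*
 * Illustrative use only ('addr' is an arbitrary probe address): code that
 * plants software breakpoints is expected to check, under text_mutex, that
 * its target does not overlap an SMP-alternative site, roughly:
 *
 *	mutex_lock(&text_mutex);
 *	if (alternatives_text_reserved(addr, addr + 1))
 *		ret = -EBUSY;
 *	mutex_unlock(&text_mutex);
 */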
1227#endif /* CONFIG_SMP */
1228
1229#ifdef CONFIG_PARAVIRT
1230void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
1231				     struct paravirt_patch_site *end)
1232{
1233	struct paravirt_patch_site *p;
1234	char insn_buff[MAX_PATCH_LEN];
1235
1236	for (p = start; p < end; p++) {
1237		unsigned int used;
1238
1239		BUG_ON(p->len > MAX_PATCH_LEN);
1240		/* prep the buffer with the original instructions */
1241		memcpy(insn_buff, p->instr, p->len);
1242		used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
1243
1244		BUG_ON(used > p->len);
1245
1246		/* Pad the rest with nops */
1247		add_nops(insn_buff + used, p->len - used);
1248		text_poke_early(p->instr, insn_buff, p->len);
1249	}
1250}
1251extern struct paravirt_patch_site __start_parainstructions[],
1252	__stop_parainstructions[];
1253#endif	/* CONFIG_PARAVIRT */
1254
1255/*
1256 * Self-test for the INT3 based CALL emulation code.
1257 *
1258 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
1259 * properly and that there is a stack gap between the INT3 frame and the
1260 * previous context. Without this gap doing a virtual PUSH on the interrupted
1261 * stack would corrupt the INT3 IRET frame.
1262 *
1263 * See entry_{32,64}.S for more details.
1264 */
1265
1266/*
1267 * We define the int3_magic() function in assembly to control the calling
1268 * convention such that we can 'call' it from assembly.
1269 */
1270
1271extern void int3_magic(unsigned int *ptr); /* defined in asm */
1272
1273asm (
1274"	.pushsection	.init.text, \"ax\", @progbits\n"
1275"	.type		int3_magic, @function\n"
1276"int3_magic:\n"
1277	ANNOTATE_NOENDBR
1278"	movl	$1, (%" _ASM_ARG1 ")\n"
1279	ASM_RET
1280"	.size		int3_magic, .-int3_magic\n"
1281"	.popsection\n"
1282);
1283
1284extern void int3_selftest_ip(void); /* defined in asm below */
1285
1286static int __init
1287int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
1288{
1289	unsigned long selftest = (unsigned long)&int3_selftest_ip;
1290	struct die_args *args = data;
1291	struct pt_regs *regs = args->regs;
1292
1293	OPTIMIZER_HIDE_VAR(selftest);
1294
1295	if (!regs || user_mode(regs))
1296		return NOTIFY_DONE;
1297
1298	if (val != DIE_INT3)
1299		return NOTIFY_DONE;
1300
1301	if (regs->ip - INT3_INSN_SIZE != selftest)
1302		return NOTIFY_DONE;
1303
1304	int3_emulate_call(regs, (unsigned long)&int3_magic);
1305	return NOTIFY_STOP;
1306}
1307
1308/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
1309static noinline void __init int3_selftest(void)
1310{
1311	static __initdata struct notifier_block int3_exception_nb = {
1312		.notifier_call	= int3_exception_notify,
1313		.priority	= INT_MAX-1, /* last */
1314	};
1315	unsigned int val = 0;
1316
1317	BUG_ON(register_die_notifier(&int3_exception_nb));
1318
1319	/*
1320	 * Basically: int3_magic(&val); but really complicated :-)
1321	 *
1322	 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
1323	 * notifier above will emulate CALL for us.
1324	 */
1325	asm volatile ("int3_selftest_ip:\n\t"
1326		      ANNOTATE_NOENDBR
1327		      "    int3; nop; nop; nop; nop\n\t"
1328		      : ASM_CALL_CONSTRAINT
1329		      : __ASM_SEL_RAW(a, D) (&val)
1330		      : "memory");
1331
1332	BUG_ON(val != 1);
1333
1334	unregister_die_notifier(&int3_exception_nb);
1335}
1336
1337void __init alternative_instructions(void)
1338{
1339	int3_selftest();
1340
1341	/*
1342	 * The patching is not fully atomic, so try to avoid local
1343	 * interruptions that might execute the code that is about to be patched.
1344	 * Other CPUs are not running.
1345	 */
1346	stop_nmi();
1347
1348	/*
1349	 * Don't stop machine check exceptions while patching.
1350	 * MCEs only happen when something got corrupted and in this
1351	 * case we must do something about the corruption.
1352	 * Ignoring it is worse than an unlikely patching race.
1353	 * Also machine checks tend to be broadcast and if one CPU
1354	 * goes into machine check the others follow quickly, so we don't
1355	 * expect a machine check to cause undue problems during to code
1356	 * expect a machine check to cause undue problems during code
1357	 */
1358
1359	/*
1360	 * Paravirt patching and alternative patching can be combined to
1361	 * replace a function call with a short direct code sequence (e.g.
1362	 * by setting a constant return value instead of doing that in an
1363	 * external function).
1364	 * In order to make this work the following sequence is required:
1365	 * 1. set (artificial) features depending on used paravirt
1366	 *    functions which can later influence alternative patching
1367	 * 2. apply paravirt patching (generally replacing an indirect
1368	 *    function call with a direct one)
1369	 * 3. apply alternative patching (e.g. replacing a direct function
1370	 *    call with a custom code sequence)
1371	 * Doing paravirt patching after alternative patching would clobber
1372	 * the optimization of the custom code with a function call again.
1373	 */
1374	paravirt_set_cap();
1375
1376	/*
1377	 * First patch paravirt functions, such that we overwrite the indirect
1378	 * call with the direct call.
1379	 */
1380	apply_paravirt(__parainstructions, __parainstructions_end);
1381
1382	__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
1383			__cfi_sites, __cfi_sites_end, true);
1384
1385	/*
1386	 * Rewrite the retpolines, must be done before alternatives since
1387	 * those can rewrite the retpoline thunks.
1388	 */
1389	apply_retpolines(__retpoline_sites, __retpoline_sites_end);
1390	apply_returns(__return_sites, __return_sites_end);
1391
1392	/*
1393	 * Then patch alternatives, such that those paravirt calls that are in
1394	 * alternatives can be overwritten by their immediate fragments.
1395	 */
1396	apply_alternatives(__alt_instructions, __alt_instructions_end);
1397
1398	/*
1399	 * Now all calls are established. Apply the call thunks if
1400	 * required.
1401	 */
1402	callthunks_patch_builtin_calls();
1403
1404	apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
1405
1406#ifdef CONFIG_SMP
1407	/* Patch to UP if other CPUs are not imminent. */
1408	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
1409		uniproc_patched = true;
1410		alternatives_smp_module_add(NULL, "core kernel",
1411					    __smp_locks, __smp_locks_end,
1412					    _text, _etext);
1413	}
1414
1415	if (!uniproc_patched || num_possible_cpus() == 1) {
1416		free_init_pages("SMP alternatives",
1417				(unsigned long)__smp_locks,
1418				(unsigned long)__smp_locks_end);
1419	}
1420#endif
1421
1422	restart_nmi();
1423	alternatives_patched = 1;
1424}
1425
1426/**
1427 * text_poke_early - Update instructions on a live kernel at boot time
1428 * @addr: address to modify
1429 * @opcode: source of the copy
1430 * @len: length to copy
1431 *
1432 * When you use this code to patch more than one byte of an instruction
1433 * you need to make sure that other CPUs cannot execute this code in parallel.
1434 * Also no thread must be currently preempted in the middle of these
1435 * instructions. And on the local CPU you need to be protected against NMI or
1436 * MCE handlers seeing an inconsistent instruction while you patch.
1437 */
1438void __init_or_module text_poke_early(void *addr, const void *opcode,
1439				      size_t len)
1440{
1441	unsigned long flags;
1442
1443	if (boot_cpu_has(X86_FEATURE_NX) &&
1444	    is_module_text_address((unsigned long)addr)) {
1445		/*
1446		 * Modules text is marked initially as non-executable, so the
1447		 * code cannot be running and speculative code-fetches are
1448		 * prevented. Just change the code.
1449		 */
1450		memcpy(addr, opcode, len);
1451	} else {
1452		local_irq_save(flags);
1453		memcpy(addr, opcode, len);
1454		local_irq_restore(flags);
1455		sync_core();
1456
1457		/*
1458		 * Could also do a CLFLUSH here to speed up CPU recovery; but
1459		 * that causes hangs on some VIA CPUs.
1460		 */
1461	}
1462}
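/*
 * Illustration only (the bytes below are the usual 5-byte NOPL encoding, not
 * something defined by this function): boot-time patching of a 5-byte site
 * could look like
 *
 *	static const u8 nop5[] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
 *
 *	text_poke_early(instr, nop5, sizeof(nop5));
 *
 * which is roughly what apply_alternatives() does when it rewrites or pads
 * an alternative site before the other CPUs are up.
 */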
1463
1464typedef struct {
1465	struct mm_struct *mm;
1466} temp_mm_state_t;
1467
1468/*
1469	 * Using a temporary mm makes it possible to set temporary mappings that are
1470	 * not accessible by other CPUs. Such mappings are needed to perform sensitive
1471	 * memory writes that override the kernel memory protections (e.g., W^X),
1472	 * without exposing the temporary page-table mappings that are required for
1473	 * these write operations to other CPUs. Using a temporary mm also avoids TLB
1474	 * shootdowns when the mapping is torn down.
1475 *
1476 * Context: The temporary mm needs to be used exclusively by a single core. To
1477	 *          harden security, IRQs must be disabled while the temporary mm is
1478 *          loaded, thereby preventing interrupt handler bugs from overriding
1479 *          the kernel memory protection.
1480 */
1481static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1482{
1483	temp_mm_state_t temp_state;
1484
1485	lockdep_assert_irqs_disabled();
1486
1487	/*
1488	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1489	 * with a stale address space WITHOUT being in lazy mode after
1490	 * restoring the previous mm.
1491	 */
1492	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1493		leave_mm(smp_processor_id());
1494
1495	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1496	switch_mm_irqs_off(NULL, mm, current);
1497
1498	/*
1499	 * If breakpoints are enabled, disable them while the temporary mm is
1500	 * used. Userspace might set up watchpoints on addresses that are used
1501	 * in the temporary mm, which would lead to wrong signals being sent or
1502	 * crashes.
1503	 *
1504	 * Note that breakpoints are not disabled selectively, which also causes
1505	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
1506	 * undesirable, but still seems reasonable as the code that runs in the
1507	 * temporary mm should be short.
1508	 */
1509	if (hw_breakpoint_active())
1510		hw_breakpoint_disable();
1511
1512	return temp_state;
1513}
1514
1515static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
1516{
1517	lockdep_assert_irqs_disabled();
1518	switch_mm_irqs_off(NULL, prev_state.mm, current);
1519
1520	/*
1521	 * Restore the breakpoints if they were disabled before the temporary mm
1522	 * was loaded.
1523	 */
1524	if (hw_breakpoint_active())
1525		hw_breakpoint_restore();
1526}
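/*
 * The expected pairing, with IRQs disabled for the whole window; this is a
 * sketch of the pattern __text_poke() below follows:
 *
 *	local_irq_save(flags);
 *	prev = use_temporary_mm(poking_mm);
 *	... write through the temporary mapping at poking_addr ...
 *	unuse_temporary_mm(prev);
 *	local_irq_restore(flags);
 */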
1527
1528__ro_after_init struct mm_struct *poking_mm;
1529__ro_after_init unsigned long poking_addr;
1530
1531static void text_poke_memcpy(void *dst, const void *src, size_t len)
1532{
1533	memcpy(dst, src, len);
1534}
1535
1536static void text_poke_memset(void *dst, const void *src, size_t len)
1537{
1538	int c = *(const int *)src;
1539
1540	memset(dst, c, len);
1541}
1542
1543typedef void text_poke_f(void *dst, const void *src, size_t len);
1544
1545static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
1546{
1547	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
1548	struct page *pages[2] = {NULL};
1549	temp_mm_state_t prev;
1550	unsigned long flags;
1551	pte_t pte, *ptep;
1552	spinlock_t *ptl;
1553	pgprot_t pgprot;
1554
1555	/*
1556	 * While boot memory allocator is running we cannot use struct pages as
1557	 * they are not yet initialized. There is no way to recover.
1558	 */
1559	BUG_ON(!after_bootmem);
1560
1561	if (!core_kernel_text((unsigned long)addr)) {
1562		pages[0] = vmalloc_to_page(addr);
1563		if (cross_page_boundary)
1564			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
1565	} else {
1566		pages[0] = virt_to_page(addr);
1567		WARN_ON(!PageReserved(pages[0]));
1568		if (cross_page_boundary)
1569			pages[1] = virt_to_page(addr + PAGE_SIZE);
1570	}
1571	/*
1572	 * If something went wrong, crash and burn since recovery paths are not
1573	 * implemented.
1574	 */
1575	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
1576
1577	/*
1578	 * Map the page without the global bit, as TLB flushing is done with
1579	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
1580	 */
1581	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
1582
1583	/*
1584	 * The lock is not really needed, but it lets us avoid open-coding the
1585	 * page-table walk.
1585	 */
1586	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
1587
1588	/*
1589	 * This must not fail; preallocated in poking_init().
1590	 */
1591	VM_BUG_ON(!ptep);
1592
1593	local_irq_save(flags);
1594
1595	pte = mk_pte(pages[0], pgprot);
1596	set_pte_at(poking_mm, poking_addr, ptep, pte);
1597
1598	if (cross_page_boundary) {
1599		pte = mk_pte(pages[1], pgprot);
1600		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
1601	}
1602
1603	/*
1604	 * Loading the temporary mm behaves as a compiler barrier, which
1605	 * guarantees that the PTE will be set at the time memcpy() is done.
1606	 */
1607	prev = use_temporary_mm(poking_mm);
1608
1609	kasan_disable_current();
1610	func((u8 *)poking_addr + offset_in_page(addr), src, len);
1611	kasan_enable_current();
1612
1613	/*
1614	 * Ensure that the PTE is only cleared after the write instructions have
1615	 * been issued, by using a compiler barrier.
1616	 */
1617	barrier();
1618
1619	pte_clear(poking_mm, poking_addr, ptep);
1620	if (cross_page_boundary)
1621		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
1622
1623	/*
1624	 * Loading the previous page-table hierarchy requires a serializing
1625	 * instruction that already allows the core to see the updated version.
1626	 * Xen-PV is assumed to serialize execution in a similar manner.
1627	 */
1628	unuse_temporary_mm(prev);
1629
1630	/*
1631	 * Flushing the TLB might involve IPIs, which would require enabled
1632	 * IRQs, but not if the mm is no longer in use, as is the case at this point.
1633	 */
1634	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
1635			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
1636			   PAGE_SHIFT, false);
1637
1638	if (func == text_poke_memcpy) {
1639		/*
1640		 * If the text does not match what we just wrote then something is
1641		 * fundamentally screwy; there's nothing we can really do about that.
1642		 */
1643		BUG_ON(memcmp(addr, src, len));
1644	}
1645
1646	local_irq_restore(flags);
1647	pte_unmap_unlock(ptep, ptl);
1648	return addr;
1649}
1650
1651/**
1652 * text_poke - Update instructions on a live kernel
1653 * @addr: address to modify
1654 * @opcode: source of the copy
1655 * @len: length to copy
1656 *
1657 * Only atomic text poke/set should be allowed when not doing early patching.
1658 * It means the size must be writable atomically and the address must be aligned
1659 * in a way that permits an atomic write. It also makes sure we fit on a single
1660 * page.
1661 *
1662 * Note that the caller must ensure that if the modified code is part of a
1663 * module, the module would not be removed during poking. This can be achieved
1664 * by registering a module notifier, and ordering module removal and patching
1665	 * through a mutex.
1666 */
1667void *text_poke(void *addr, const void *opcode, size_t len)
1668{
1669	lockdep_assert_held(&text_mutex);
1670
1671	return __text_poke(text_poke_memcpy, addr, opcode, len);
1672}
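/*
 * Sketch of a typical caller (the single-byte write is arbitrary): take
 * text_mutex, poke, and, for cross-modifying callers, typically follow up
 * with text_poke_sync():
 *
 *	mutex_lock(&text_mutex);
 *	text_poke(addr, &byte, 1);
 *	text_poke_sync();
 *	mutex_unlock(&text_mutex);
 */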
1673
1674/**
1675 * text_poke_kgdb - Update instructions on a live kernel by kgdb
1676 * @addr: address to modify
1677 * @opcode: source of the copy
1678 * @len: length to copy
1679 *
1680 * Only atomic text poke/set should be allowed when not doing early patching.
1681 * It means the size must be writable atomically and the address must be aligned
1682 * in a way that permits an atomic write. It also makes sure we fit on a single
1683 * page.
1684 *
1685 * Context: should only be used by kgdb, which ensures no other core is running,
1686 *	    despite the fact it does not hold the text_mutex.
1687 */
1688void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
1689{
1690	return __text_poke(text_poke_memcpy, addr, opcode, len);
1691}
1692
1693void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
1694			    bool core_ok)
1695{
1696	unsigned long start = (unsigned long)addr;
1697	size_t patched = 0;
1698
1699	if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
1700		return NULL;
1701
1702	while (patched < len) {
1703		unsigned long ptr = start + patched;
1704		size_t s;
1705
1706		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1707
1708		__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
1709		patched += s;
1710	}
1711	return addr;
1712}
1713
1714/**
1715 * text_poke_copy - Copy instructions into (an unused part of) RX memory
1716 * @addr: address to modify
1717 * @opcode: source of the copy
1718 * @len: length to copy, could be more than 2x PAGE_SIZE
1719 *
1720 * Not safe against concurrent execution; useful for JITs to dump
1721 * new code blocks into unused regions of RX memory. Can be used in
1722 * conjunction with synchronize_rcu_tasks() to wait for existing
1723	 * execution to quiesce after having made sure no existing function
1724 * pointers are live.
1725 */
1726void *text_poke_copy(void *addr, const void *opcode, size_t len)
1727{
1728	mutex_lock(&text_mutex);
1729	addr = text_poke_copy_locked(addr, opcode, len, false);
1730	mutex_unlock(&text_mutex);
1731	return addr;
1732}
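/*
 * Sketch of the intended use (rx_dst, image and size are illustrative): a
 * JIT that generated 'image' into a temporary buffer copies it into the RX
 * region it reserved earlier:
 *
 *	if (!text_poke_copy(rx_dst, image, size))
 *		return -EINVAL;
 */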
1733
1734/**
1735 * text_poke_set - memset into (an unused part of) RX memory
1736 * @addr: address to modify
1737 * @c: the byte to fill the area with
1738 * @len: length to copy, could be more than 2x PAGE_SIZE
1739 *
1740 * This is useful to overwrite unused regions of RX memory with illegal
1741 * instructions.
1742 */
1743void *text_poke_set(void *addr, int c, size_t len)
1744{
1745	unsigned long start = (unsigned long)addr;
1746	size_t patched = 0;
1747
1748	if (WARN_ON_ONCE(core_kernel_text(start)))
1749		return NULL;
1750
1751	mutex_lock(&text_mutex);
1752	while (patched < len) {
1753		unsigned long ptr = start + patched;
1754		size_t s;
1755
1756		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1757
1758		__text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
1759		patched += s;
1760	}
1761	mutex_unlock(&text_mutex);
1762	return addr;
1763}
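/*
 * Sketch (rx_dst and size are illustrative): invalidating a no-longer-used
 * RX region by filling it with INT3 (0xcc), so stray execution traps instead
 * of running stale bytes:
 *
 *	text_poke_set(rx_dst, 0xcc, size);
 */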
1764
1765static void do_sync_core(void *info)
1766{
1767	sync_core();
1768}
1769
1770void text_poke_sync(void)
1771{
1772	on_each_cpu(do_sync_core, NULL, 1);
1773}
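/*
 * The IPI plus sync_core() serializes instruction execution on every online
 * CPU, which is what the cross-modifying-code sequence in
 * text_poke_bp_batch() below relies on between its patching steps.
 */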
1774
1775struct text_poke_loc {
1776	/* addr := _stext + rel_addr */
1777	s32 rel_addr;
1778	s32 disp;
1779	u8 len;
1780	u8 opcode;
1781	const u8 text[POKE_MAX_OPCODE_SIZE];
1782	/* see text_poke_bp_batch() */
1783	u8 old;
1784};
1785
1786struct bp_patching_desc {
1787	struct text_poke_loc *vec;
1788	int nr_entries;
1789	atomic_t refs;
1790};
1791
1792static struct bp_patching_desc bp_desc;
1793
1794static __always_inline
1795struct bp_patching_desc *try_get_desc(void)
1796{
1797	struct bp_patching_desc *desc = &bp_desc;
1798
1799	if (!arch_atomic_inc_not_zero(&desc->refs))
1800		return NULL;
1801
1802	return desc;
1803}
1804
1805static __always_inline void put_desc(void)
1806{
1807	struct bp_patching_desc *desc = &bp_desc;
1808
1809	smp_mb__before_atomic();
1810	arch_atomic_dec(&desc->refs);
1811}
1812
1813static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
1814{
1815	return _stext + tp->rel_addr;
1816}
1817
1818static __always_inline int patch_cmp(const void *key, const void *elt)
1819{
1820	struct text_poke_loc *tp = (struct text_poke_loc *) elt;
1821
1822	if (key < text_poke_addr(tp))
1823		return -1;
1824	if (key > text_poke_addr(tp))
1825		return 1;
1826	return 0;
1827}
1828
1829noinstr int poke_int3_handler(struct pt_regs *regs)
1830{
1831	struct bp_patching_desc *desc;
1832	struct text_poke_loc *tp;
1833	int ret = 0;
1834	void *ip;
1835
1836	if (user_mode(regs))
1837		return 0;
1838
1839	/*
1840	 * Having observed our INT3 instruction, we now must observe
1841	 * bp_desc with non-zero refcount:
1842	 *
1843	 *	bp_desc.refs = 1		INT3
1844	 *	WMB				RMB
1845	 *	write INT3			if (bp_desc.refs != 0)
1846	 */
1847	smp_rmb();
1848
1849	desc = try_get_desc();
1850	if (!desc)
1851		return 0;
1852
1853	/*
1854	 * Discount the INT3. See text_poke_bp_batch().
1855	 */
1856	ip = (void *) regs->ip - INT3_INSN_SIZE;
1857
1858	/*
1859	 * Skip the binary search if there is a single member in the vector.
1860	 */
1861	if (unlikely(desc->nr_entries > 1)) {
1862		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
1863				      sizeof(struct text_poke_loc),
1864				      patch_cmp);
1865		if (!tp)
1866			goto out_put;
1867	} else {
1868		tp = desc->vec;
1869		if (text_poke_addr(tp) != ip)
1870			goto out_put;
1871	}
1872
1873	ip += tp->len;
1874
1875	switch (tp->opcode) {
1876	case INT3_INSN_OPCODE:
1877		/*
1878		 * Someone poked an explicit INT3, they'll want to handle it,
1879		 * do not consume.
1880		 */
1881		goto out_put;
1882
1883	case RET_INSN_OPCODE:
1884		int3_emulate_ret(regs);
1885		break;
1886
1887	case CALL_INSN_OPCODE:
1888		int3_emulate_call(regs, (long)ip + tp->disp);
1889		break;
1890
1891	case JMP32_INSN_OPCODE:
1892	case JMP8_INSN_OPCODE:
1893		int3_emulate_jmp(regs, (long)ip + tp->disp);
1894		break;
1895
1896	default:
1897		BUG();
1898	}
1899
1900	ret = 1;
1901
1902	out_put:
1903	put_desc();
1904	return ret;
1905}
1906
1907#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
1908static struct text_poke_loc tp_vec[TP_VEC_MAX];
1909static int tp_vec_nr;
1910
1911/**
1912 * text_poke_bp_batch() -- update instructions on live kernel on SMP
1913 * @tp:			vector of instructions to patch
1914 * @nr_entries:		number of entries in the vector
1915 *
1916	 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
1917	 * We completely avoid stop_machine() here, and achieve the
1918	 * synchronization using the int3 breakpoint.
1919 *
1920 * The way it is done:
1921 *	- For each entry in the vector:
1922	 *		- add an int3 trap to the address that will be patched
1923 *	- sync cores
1924 *	- For each entry in the vector:
1925 *		- update all but the first byte of the patched range
1926 *	- sync cores
1927 *	- For each entry in the vector:
1928 *		- replace the first byte (int3) by the first byte of
1929 *		  replacing opcode
1930 *	- sync cores
1931 */
1932static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
1933{
1934	unsigned char int3 = INT3_INSN_OPCODE;
1935	unsigned int i;
1936	int do_sync;
1937
1938	lockdep_assert_held(&text_mutex);
1939
1940	bp_desc.vec = tp;
1941	bp_desc.nr_entries = nr_entries;
1942
1943	/*
1944	 * Corresponds to the implicit memory barrier in try_get_desc() to
1945	 * ensure reading a non-zero refcount provides up to date bp_desc data.
1946	 */
1947	atomic_set_release(&bp_desc.refs, 1);
1948
1949	/*
1950	 * Corresponding read barrier in int3 notifier for making sure the
1951	 * nr_entries and handler are correctly ordered wrt. patching.
1952	 */
1953	smp_wmb();
1954
1955	/*
1956	 * First step: add an int3 trap to the address that will be patched.
1957	 */
1958	for (i = 0; i < nr_entries; i++) {
1959		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
1960		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
1961	}
1962
1963	text_poke_sync();
1964
1965	/*
1966	 * Second step: update all but the first byte of the patched range.
1967	 */
1968	for (do_sync = 0, i = 0; i < nr_entries; i++) {
1969		u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
1970		int len = tp[i].len;
1971
1972		if (len - INT3_INSN_SIZE > 0) {
1973			memcpy(old + INT3_INSN_SIZE,
1974			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
1975			       len - INT3_INSN_SIZE);
1976			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
1977				  (const char *)tp[i].text + INT3_INSN_SIZE,
1978				  len - INT3_INSN_SIZE);
1979			do_sync++;
1980		}
1981
1982		/*
1983		 * Emit a perf event to record the text poke, primarily to
1984		 * support Intel PT decoding which must walk the executable code
1985		 * to reconstruct the trace. The flow up to here is:
1986		 *   - write INT3 byte
1987		 *   - IPI-SYNC
1988		 *   - write instruction tail
1989		 * At this point the actual control flow will be through the
1990		 * INT3 and handler and not hit the old or new instruction.
1991		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
1992		 * can still be decoded. Subsequently:
1993		 *   - emit RECORD_TEXT_POKE with the new instruction
1994		 *   - IPI-SYNC
1995		 *   - write first byte
1996		 *   - IPI-SYNC
1997		 * So before the text poke event timestamp, the decoder will see
1998		 * either the old instruction flow or FUP/TIP of INT3. After the
1999		 * text poke event timestamp, the decoder will see either the
2000		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
2001		 * use the timestamp as the point at which to modify the
2002		 * executable code.
2003		 * The old instruction is recorded so that the event can be
2004		 * processed forwards or backwards.
2005		 */
2006		perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
2007				     tp[i].text, len);
2008	}
2009
2010	if (do_sync) {
2011		/*
2012		 * According to Intel, this core syncing is very likely
2013		 * not necessary and we'd be safe even without it. But
2014		 * better safe than sorry (plus there's not only Intel).
2015		 */
2016		text_poke_sync();
2017	}
2018
2019	/*
2020	 * Third step: replace the first byte (int3) by the first byte of
2021	 * replacing opcode.
2022	 */
2023	for (do_sync = 0, i = 0; i < nr_entries; i++) {
2024		if (tp[i].text[0] == INT3_INSN_OPCODE)
2025			continue;
2026
2027		text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
2028		do_sync++;
2029	}
2030
2031	if (do_sync)
2032		text_poke_sync();
2033
2034	/*
2035	 * Remove and wait for refs to be zero.
2036	 */
2037	if (!atomic_dec_and_test(&bp_desc.refs))
2038		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
2039}
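/*
 * Worked example (illustrative bytes only): turning a 5-byte NOP at addr
 * into "call target" goes through:
 *
 *	initial:	0f 1f 44 00 00		nopl   0x0(%rax,%rax,1)
 *	step 1:		cc 1f 44 00 00		int3 + stale tail
 *	step 2:		cc xx xx xx xx		int3 + new rel32 tail
 *	step 3:		e8 xx xx xx xx		call   target
 *
 * with a text_poke_sync() between the steps; a CPU that hits the site in the
 * meantime traps into poke_int3_handler(), which emulates the new CALL.
 */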
2040
2041static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
2042			       const void *opcode, size_t len, const void *emulate)
2043{
2044	struct insn insn;
2045	int ret, i;
2046
2047	memcpy((void *)tp->text, opcode, len);
2048	if (!emulate)
2049		emulate = opcode;
2050
2051	ret = insn_decode_kernel(&insn, emulate);
2052	BUG_ON(ret < 0);
2053
2054	tp->rel_addr = addr - (void *)_stext;
2055	tp->len = len;
2056	tp->opcode = insn.opcode.bytes[0];
2057
2058	switch (tp->opcode) {
2059	case RET_INSN_OPCODE:
2060	case JMP32_INSN_OPCODE:
2061	case JMP8_INSN_OPCODE:
2062		/*
2063		 * Control flow instructions without implied execution of the
2064		 * next instruction can be padded with INT3.
2065		 */
2066		for (i = insn.length; i < len; i++)
2067			BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
2068		break;
2069
2070	default:
2071		BUG_ON(len != insn.length);
2072	}
2073
2074
2075	switch (tp->opcode) {
2076	case INT3_INSN_OPCODE:
2077	case RET_INSN_OPCODE:
2078		break;
2079
2080	case CALL_INSN_OPCODE:
2081	case JMP32_INSN_OPCODE:
2082	case JMP8_INSN_OPCODE:
2083		tp->disp = insn.immediate.value;
2084		break;
2085
2086	default: /* assume NOP */
2087		switch (len) {
2088		case 2: /* NOP2 -- emulate as JMP8+0 */
2089			BUG_ON(memcmp(emulate, x86_nops[len], len));
2090			tp->opcode = JMP8_INSN_OPCODE;
2091			tp->disp = 0;
2092			break;
2093
2094		case 5: /* NOP5 -- emulate as JMP32+0 */
2095			BUG_ON(memcmp(emulate, x86_nops[len], len));
2096			tp->opcode = JMP32_INSN_OPCODE;
2097			tp->disp = 0;
2098			break;
2099
2100		default: /* unknown instruction */
2101			BUG();
2102		}
2103		break;
2104	}
2105}
2106
2107/*
2108	 * We rely heavily on tp_vec being ordered; ensure this is so by flushing
2109 * early if needed.
2110 */
2111static bool tp_order_fail(void *addr)
2112{
2113	struct text_poke_loc *tp;
2114
2115	if (!tp_vec_nr)
2116		return false;
2117
2118	if (!addr) /* force */
2119		return true;
2120
2121	tp = &tp_vec[tp_vec_nr - 1];
2122	if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
2123		return true;
2124
2125	return false;
2126}
2127
2128static void text_poke_flush(void *addr)
2129{
2130	if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
2131		text_poke_bp_batch(tp_vec, tp_vec_nr);
2132		tp_vec_nr = 0;
2133	}
2134}
2135
2136void text_poke_finish(void)
2137{
2138	text_poke_flush(NULL);
2139}
2140
2141void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
2142{
2143	struct text_poke_loc *tp;
2144
2145	text_poke_flush(addr);
2146
2147	tp = &tp_vec[tp_vec_nr++];
2148	text_poke_loc_init(tp, addr, opcode, len, emulate);
2149}
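/*
 * Sketch of the batching pattern (for_each_patch_site() is a made-up
 * placeholder for whatever loop the caller has): queue the sites in
 * ascending address order, then flush once:
 *
 *	for_each_patch_site(site)
 *		text_poke_queue(site->addr, site->insn, site->len, NULL);
 *	text_poke_finish();
 */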
2150
2151/**
2152 * text_poke_bp() -- update instructions on live kernel on SMP
2153 * @addr:	address to patch
2154 * @opcode:	opcode of new instruction
2155 * @len:	length to copy
2156 * @emulate:	instruction to be emulated
2157 *
2158	 * Update a single instruction using an on-stack vector, avoiding
2159 * dynamically allocated memory. This function should be used when it is
2160 * not possible to allocate memory.
2161 */
2162void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
2163{
2164	struct text_poke_loc tp;
2165
2166	text_poke_loc_init(&tp, addr, opcode, len, emulate);
2167	text_poke_bp_batch(&tp, 1);
2168}
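/*
 * Sketch of a single-site update (addr and target are arbitrary): build a
 * 5-byte CALL and let the int3 machinery install it; text_mutex must be
 * held by the caller:
 *
 *	u8 insn[5] = { CALL_INSN_OPCODE, };
 *
 *	*(s32 *)(insn + 1) = (s32)((long)target - ((long)addr + 5));
 *	text_poke_bp(addr, insn, sizeof(insn), NULL);
 */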