arch/x86/kernel/alternative.c, v3.15
  1#define pr_fmt(fmt) "SMP alternatives: " fmt
  2
  3#include <linux/module.h>
  4#include <linux/sched.h>
  5#include <linux/mutex.h>
  6#include <linux/list.h>
  7#include <linux/stringify.h>
  8#include <linux/kprobes.h>
  9#include <linux/mm.h>
 10#include <linux/vmalloc.h>
 11#include <linux/memory.h>
 12#include <linux/stop_machine.h>
 13#include <linux/slab.h>
 14#include <linux/kdebug.h>
 15#include <asm/alternative.h>
 16#include <asm/sections.h>
 17#include <asm/pgtable.h>
 18#include <asm/mce.h>
 19#include <asm/nmi.h>
 20#include <asm/cacheflush.h>
 21#include <asm/tlbflush.h>
 22#include <asm/io.h>
 23#include <asm/fixmap.h>
 24
 25#define MAX_PATCH_LEN (255-1)
 26
 27static int __initdata_or_module debug_alternative;
 28
 29static int __init debug_alt(char *str)
 30{
 31	debug_alternative = 1;
 32	return 1;
 33}
 34__setup("debug-alternative", debug_alt);
 35
 36static int noreplace_smp;
 37
 38static int __init setup_noreplace_smp(char *str)
 39{
 40	noreplace_smp = 1;
 41	return 1;
 42}
 43__setup("noreplace-smp", setup_noreplace_smp);
 44
 45#ifdef CONFIG_PARAVIRT
 46static int __initdata_or_module noreplace_paravirt = 0;
 47
 48static int __init setup_noreplace_paravirt(char *str)
 49{
 50	noreplace_paravirt = 1;
 51	return 1;
 52}
 53__setup("noreplace-paravirt", setup_noreplace_paravirt);
 54#endif
 55
 56#define DPRINTK(fmt, ...)				\
 57do {							\
 58	if (debug_alternative)				\
 59		printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
 60} while (0)
 61
 62/*
 63 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
 64 * that correspond to that nop. Getting from one nop to the next, we
 65 * add to the array the offset that is equal to the sum of all sizes of
 66 * nops preceding the one we are after.
 67 *
 68 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
 69 * nice symmetry of sizes of the previous nops.
 70 */
 71#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
 72static const unsigned char intelnops[] =
 73{
 74	GENERIC_NOP1,
 75	GENERIC_NOP2,
 76	GENERIC_NOP3,
 77	GENERIC_NOP4,
 78	GENERIC_NOP5,
 79	GENERIC_NOP6,
 80	GENERIC_NOP7,
 81	GENERIC_NOP8,
 82	GENERIC_NOP5_ATOMIC
 83};
 84static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
 85{
 86	NULL,
 87	intelnops,
 88	intelnops + 1,
 89	intelnops + 1 + 2,
 90	intelnops + 1 + 2 + 3,
 91	intelnops + 1 + 2 + 3 + 4,
 92	intelnops + 1 + 2 + 3 + 4 + 5,
 93	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
 94	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 95	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 96};
 97#endif
 98
 99#ifdef K8_NOP1
100static const unsigned char k8nops[] =
101{
102	K8_NOP1,
103	K8_NOP2,
104	K8_NOP3,
105	K8_NOP4,
106	K8_NOP5,
107	K8_NOP6,
108	K8_NOP7,
109	K8_NOP8,
110	K8_NOP5_ATOMIC
111};
112static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
113{
114	NULL,
115	k8nops,
116	k8nops + 1,
117	k8nops + 1 + 2,
118	k8nops + 1 + 2 + 3,
119	k8nops + 1 + 2 + 3 + 4,
120	k8nops + 1 + 2 + 3 + 4 + 5,
121	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
122	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
123	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
124};
125#endif
126
127#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
128static const unsigned char k7nops[] =
129{
130	K7_NOP1,
131	K7_NOP2,
132	K7_NOP3,
133	K7_NOP4,
134	K7_NOP5,
135	K7_NOP6,
136	K7_NOP7,
137	K7_NOP8,
138	K7_NOP5_ATOMIC
139};
140static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
141{
142	NULL,
143	k7nops,
144	k7nops + 1,
145	k7nops + 1 + 2,
146	k7nops + 1 + 2 + 3,
147	k7nops + 1 + 2 + 3 + 4,
148	k7nops + 1 + 2 + 3 + 4 + 5,
149	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
150	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
151	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
152};
153#endif
154
155#ifdef P6_NOP1
156static const unsigned char p6nops[] =
157{
158	P6_NOP1,
159	P6_NOP2,
160	P6_NOP3,
161	P6_NOP4,
162	P6_NOP5,
163	P6_NOP6,
164	P6_NOP7,
165	P6_NOP8,
166	P6_NOP5_ATOMIC
167};
168static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
169{
170	NULL,
171	p6nops,
172	p6nops + 1,
173	p6nops + 1 + 2,
174	p6nops + 1 + 2 + 3,
175	p6nops + 1 + 2 + 3 + 4,
176	p6nops + 1 + 2 + 3 + 4 + 5,
177	p6nops + 1 + 2 + 3 + 4 + 5 + 6,
178	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
179	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
180};
181#endif
182
183/* Initialize these to a safe default */
184#ifdef CONFIG_X86_64
185const unsigned char * const *ideal_nops = p6_nops;
186#else
187const unsigned char * const *ideal_nops = intel_nops;
188#endif
189
190void __init arch_init_ideal_nops(void)
191{
192	switch (boot_cpu_data.x86_vendor) {
193	case X86_VENDOR_INTEL:
194		/*
195		 * Due to a decoder implementation quirk, some
196		 * specific Intel CPUs actually perform better with
197		 * the "k8_nops" than with the SDM-recommended NOPs.
198		 */
199		if (boot_cpu_data.x86 == 6 &&
200		    boot_cpu_data.x86_model >= 0x0f &&
201		    boot_cpu_data.x86_model != 0x1c &&
202		    boot_cpu_data.x86_model != 0x26 &&
203		    boot_cpu_data.x86_model != 0x27 &&
204		    boot_cpu_data.x86_model < 0x30) {
205			ideal_nops = k8_nops;
206		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
207			   ideal_nops = p6_nops;
208		} else {
209#ifdef CONFIG_X86_64
210			ideal_nops = k8_nops;
211#else
212			ideal_nops = intel_nops;
213#endif
214		}
215		break;
216	default:
217#ifdef CONFIG_X86_64
218		ideal_nops = k8_nops;
219#else
220		if (boot_cpu_has(X86_FEATURE_K8))
221			ideal_nops = k8_nops;
222		else if (boot_cpu_has(X86_FEATURE_K7))
223			ideal_nops = k7_nops;
224		else
225			ideal_nops = intel_nops;
226#endif
227	}
228}
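
An editor's note for reference (illustrative; <asm/nops.h> has the authoritative encodings): each table above maps a length to a single ideal NOP of that length.

	/*
	 * Example with the p6_nops table selected:
	 *   ideal_nops[1] = 90                 (nop)
	 *   ideal_nops[3] = 0f 1f 00           (nopl (%rax))
	 *   ideal_nops[5] = 0f 1f 44 00 00     (nopl 0x0(%rax,%rax,1))
	 */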
229
230/* Use this to add nops to a buffer, then text_poke the whole buffer. */
231static void __init_or_module add_nops(void *insns, unsigned int len)
232{
233	while (len > 0) {
234		unsigned int noplen = len;
235		if (noplen > ASM_NOP_MAX)
236			noplen = ASM_NOP_MAX;
237		memcpy(insns, ideal_nops[noplen], noplen);
238		insns += noplen;
239		len -= noplen;
240	}
241}
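
A worked illustration of the loop above (editor's addition): with ASM_NOP_MAX == 8, padding a 13-byte hole emits one 8-byte ideal NOP followed by one 5-byte ideal NOP.

	/*
	 * add_nops(buf, 13)  ->  memcpy(buf,     ideal_nops[8], 8);
	 *                        memcpy(buf + 8, ideal_nops[5], 5);
	 */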
242
243extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
244extern s32 __smp_locks[], __smp_locks_end[];
245void *text_poke_early(void *addr, const void *opcode, size_t len);
246
247/* Replace instructions with better alternatives for this CPU type.
248   This runs before SMP is initialized to avoid SMP problems with
249   self-modifying code. This implies that asymmetric systems where
250   APs have fewer capabilities than the boot processor are not handled.
251   Tough. Make sure you disable such features by hand. */
252
253void __init_or_module apply_alternatives(struct alt_instr *start,
254					 struct alt_instr *end)
255{
256	struct alt_instr *a;
257	u8 *instr, *replacement;
258	u8 insnbuf[MAX_PATCH_LEN];
259
260	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
261	/*
262	 * The scan order should be from start to end. A later scanned
263	 * alternative code can overwrite previously scanned alternative code.
264	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
265	 * patch code.
266	 *
267	 * So be careful if you want to change the scan order to any other
268	 * order.
269	 */
270	for (a = start; a < end; a++) {
271		instr = (u8 *)&a->instr_offset + a->instr_offset;
272		replacement = (u8 *)&a->repl_offset + a->repl_offset;
273		BUG_ON(a->replacementlen > a->instrlen);
274		BUG_ON(a->instrlen > sizeof(insnbuf));
275		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
276		if (!boot_cpu_has(a->cpuid))
277			continue;
278
279		memcpy(insnbuf, replacement, a->replacementlen);
280
281		/* 0xe8 is a relative CALL; fix the offset. */
282		if (*insnbuf == 0xe8 && a->replacementlen == 5)
283		    *(s32 *)(insnbuf + 1) += replacement - instr;
284
285		add_nops(insnbuf + a->replacementlen,
286			 a->instrlen - a->replacementlen);
287
288		text_poke_early(instr, insnbuf, a->instrlen);
289	}
290}
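
For context, a hedged sketch of what produces these alt_instr entries: the alternative() macro from <asm/alternative.h> places the original instruction in .text and records the replacement bytes and feature bit in separate sections. The example is modelled on the historical 32-bit mb() definition; note that in this kernel version the replacement must not be longer than the original instruction.

	/* Editor's sketch, not part of this file. */
	static inline void barrier_sketch(void)
	{
		/* 32-bit flavour: locked add by default, MFENCE once SSE2 is known */
		alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2);
	}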
291
292#ifdef CONFIG_SMP
293
294static void alternatives_smp_lock(const s32 *start, const s32 *end,
295				  u8 *text, u8 *text_end)
296{
297	const s32 *poff;
298
299	mutex_lock(&text_mutex);
300	for (poff = start; poff < end; poff++) {
301		u8 *ptr = (u8 *)poff + *poff;
302
303		if (!*poff || ptr < text || ptr >= text_end)
304			continue;
305		/* turn DS segment override prefix into lock prefix */
306		if (*ptr == 0x3e)
307			text_poke(ptr, ((unsigned char []){0xf0}), 1);
308	}
309	mutex_unlock(&text_mutex);
310}
311
312static void alternatives_smp_unlock(const s32 *start, const s32 *end,
313				    u8 *text, u8 *text_end)
314{
315	const s32 *poff;
316
317	mutex_lock(&text_mutex);
318	for (poff = start; poff < end; poff++) {
319		u8 *ptr = (u8 *)poff + *poff;
320
321		if (!*poff || ptr < text || ptr >= text_end)
322			continue;
323		/* turn lock prefix into DS segment override prefix */
324		if (*ptr == 0xf0)
325			text_poke(ptr, ((unsigned char []){0x3E}), 1);
326	}
327	mutex_unlock(&text_mutex);
328}
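
The s32 entries walked here are self-relative offsets recorded next to each LOCK prefix at build time. A rough sketch of how one is emitted (simplified from the LOCK_PREFIX machinery in <asm/alternative.h>):

	/* Editor's sketch only; the real kernel uses the LOCK_PREFIX macro. */
	static inline void locked_inc_sketch(int *v)
	{
		asm volatile(".pushsection .smp_locks,\"a\"\n\t"
			     ".balign 4\n\t"
			     ".long 671f - .\n\t"	/* offset from entry to the prefix */
			     ".popsection\n"
			     "671:\tlock; incl %0"
			     : "+m" (*v));
	}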
329
330struct smp_alt_module {
331	/* what is this ??? */
332	struct module	*mod;
333	char		*name;
334
335	/* ptrs to lock prefixes */
336	const s32	*locks;
337	const s32	*locks_end;
338
339	/* .text segment, needed to avoid patching init code ;) */
340	u8		*text;
341	u8		*text_end;
342
343	struct list_head next;
344};
345static LIST_HEAD(smp_alt_modules);
346static DEFINE_MUTEX(smp_alt);
347static bool uniproc_patched = false;	/* protected by smp_alt */
348
349void __init_or_module alternatives_smp_module_add(struct module *mod,
350						  char *name,
351						  void *locks, void *locks_end,
352						  void *text,  void *text_end)
353{
354	struct smp_alt_module *smp;
355
356	mutex_lock(&smp_alt);
357	if (!uniproc_patched)
358		goto unlock;
359
360	if (num_possible_cpus() == 1)
361		/* Don't bother remembering, we'll never have to undo it. */
362		goto smp_unlock;
363
364	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
365	if (NULL == smp)
366		/* we'll run the (safe but slow) SMP code then ... */
367		goto unlock;
368
369	smp->mod	= mod;
370	smp->name	= name;
371	smp->locks	= locks;
372	smp->locks_end	= locks_end;
373	smp->text	= text;
374	smp->text_end	= text_end;
375	DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
376		__func__, smp->locks, smp->locks_end,
377		smp->text, smp->text_end, smp->name);
378
379	list_add_tail(&smp->next, &smp_alt_modules);
380smp_unlock:
381	alternatives_smp_unlock(locks, locks_end, text, text_end);
382unlock:
383	mutex_unlock(&smp_alt);
384}
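
A hedged sketch of the typical caller: on module load, arch/x86/kernel/module.c registers the module's .smp_locks and .text bounds roughly as follows (the wrapper and its parameter names are the editor's, not the kernel's):

	/* Editor's sketch; section lookup omitted. */
	static void register_module_smp_locks(struct module *me,
					      void *locks, unsigned long locks_size,
					      void *text, unsigned long text_size)
	{
		alternatives_smp_module_add(me, me->name,
					    locks, locks + locks_size,
					    text, text + text_size);
	}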
385
386void __init_or_module alternatives_smp_module_del(struct module *mod)
387{
388	struct smp_alt_module *item;
389
390	mutex_lock(&smp_alt);
391	list_for_each_entry(item, &smp_alt_modules, next) {
392		if (mod != item->mod)
393			continue;
394		list_del(&item->next);
395		kfree(item);
396		break;
397	}
398	mutex_unlock(&smp_alt);
399}
400
401void alternatives_enable_smp(void)
402{
403	struct smp_alt_module *mod;
404
405	/* Why bother if there are no other CPUs? */
406	BUG_ON(num_possible_cpus() == 1);
407
408	mutex_lock(&smp_alt);
409
410	if (uniproc_patched) {
411		pr_info("switching to SMP code\n");
412		BUG_ON(num_online_cpus() != 1);
413		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
414		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
415		list_for_each_entry(mod, &smp_alt_modules, next)
416			alternatives_smp_lock(mod->locks, mod->locks_end,
417					      mod->text, mod->text_end);
418		uniproc_patched = false;
419	}
420	mutex_unlock(&smp_alt);
421}
422
423/* Return 1 if the address range is reserved for smp-alternatives */
424int alternatives_text_reserved(void *start, void *end)
425{
426	struct smp_alt_module *mod;
427	const s32 *poff;
428	u8 *text_start = start;
429	u8 *text_end = end;
430
431	list_for_each_entry(mod, &smp_alt_modules, next) {
432		if (mod->text > text_end || mod->text_end < text_start)
433			continue;
434		for (poff = mod->locks; poff < mod->locks_end; poff++) {
435			const u8 *ptr = (const u8 *)poff + *poff;
436
437			if (text_start <= ptr && text_end > ptr)
438				return 1;
439		}
440	}
441
442	return 0;
443}
444#endif
445
446#ifdef CONFIG_PARAVIRT
447void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
448				     struct paravirt_patch_site *end)
449{
450	struct paravirt_patch_site *p;
451	char insnbuf[MAX_PATCH_LEN];
452
453	if (noreplace_paravirt)
454		return;
455
456	for (p = start; p < end; p++) {
457		unsigned int used;
458
459		BUG_ON(p->len > MAX_PATCH_LEN);
460		/* prep the buffer with the original instructions */
461		memcpy(insnbuf, p->instr, p->len);
462		used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
463					 (unsigned long)p->instr, p->len);
464
465		BUG_ON(used > p->len);
466
467		/* Pad the rest with nops */
468		add_nops(insnbuf + used, p->len - used);
469		text_poke_early(p->instr, insnbuf, p->len);
470	}
471}
472extern struct paravirt_patch_site __start_parainstructions[],
473	__stop_parainstructions[];
474#endif	/* CONFIG_PARAVIRT */
475
476void __init alternative_instructions(void)
477{
478	/* The patching is not fully atomic, so try to avoid local interruptions
479	   that might execute the code being patched.
480	   Other CPUs are not running. */
481	stop_nmi();
482
483	/*
484	 * Don't stop machine check exceptions while patching.
485	 * MCEs only happen when something got corrupted and in this
486	 * case we must do something about the corruption.
487	 * Ignoring it is worse than an unlikely patching race.
488	 * Also machine checks tend to be broadcast and if one CPU
489	 * goes into machine check the others follow quickly, so we don't
490	 * expect a machine check to cause undue problems during code
491	 * patching.
492	 */
493
494	apply_alternatives(__alt_instructions, __alt_instructions_end);
495
496#ifdef CONFIG_SMP
497	/* Patch to UP if other cpus not imminent. */
498	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
499		uniproc_patched = true;
500		alternatives_smp_module_add(NULL, "core kernel",
501					    __smp_locks, __smp_locks_end,
502					    _text, _etext);
503	}
504
505	if (!uniproc_patched || num_possible_cpus() == 1)
506		free_init_pages("SMP alternatives",
507				(unsigned long)__smp_locks,
508				(unsigned long)__smp_locks_end);
509#endif
510
511	apply_paravirt(__parainstructions, __parainstructions_end);
512
513	restart_nmi();
514}
515
516/**
517 * text_poke_early - Update instructions on a live kernel at boot time
518 * @addr: address to modify
519 * @opcode: source of the copy
520 * @len: length to copy
521 *
522 * When you use this code to patch more than one byte of an instruction
523 * you need to make sure that other CPUs cannot execute this code in parallel.
524 * Also no thread must be currently preempted in the middle of these
525	 * instructions. And on the local CPU you need to be protected against NMI or MCE
526 * handlers seeing an inconsistent instruction while you patch.
527 */
528void *__init_or_module text_poke_early(void *addr, const void *opcode,
529					      size_t len)
530{
531	unsigned long flags;
532	local_irq_save(flags);
533	memcpy(addr, opcode, len);
534	sync_core();
535	local_irq_restore(flags);
536	/* Could also do a CLFLUSH here to speed up CPU recovery; but
537	   that causes hangs on some VIA CPUs. */
538	return addr;
539}
540
541/**
542 * text_poke - Update instructions on a live kernel
543 * @addr: address to modify
544 * @opcode: source of the copy
545 * @len: length to copy
546 *
547 * Only atomic text poke/set should be allowed when not doing early patching.
548 * It means the size must be writable atomically and the address must be aligned
549 * in a way that permits an atomic write. It also makes sure we fit on a single
550 * page.
551 *
552 * Note: Must be called under text_mutex.
553 */
554void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
555{
556	unsigned long flags;
557	char *vaddr;
558	struct page *pages[2];
559	int i;
560
561	if (!core_kernel_text((unsigned long)addr)) {
562		pages[0] = vmalloc_to_page(addr);
563		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
564	} else {
565		pages[0] = virt_to_page(addr);
566		WARN_ON(!PageReserved(pages[0]));
567		pages[1] = virt_to_page(addr + PAGE_SIZE);
568	}
569	BUG_ON(!pages[0]);
570	local_irq_save(flags);
571	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
572	if (pages[1])
573		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
574	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
575	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
576	clear_fixmap(FIX_TEXT_POKE0);
577	if (pages[1])
578		clear_fixmap(FIX_TEXT_POKE1);
579	local_flush_tlb();
580	sync_core();
581	/* Could also do a CLFLUSH here to speed up CPU recovery; but
582	   that causes hangs on some VIA CPUs. */
583	for (i = 0; i < len; i++)
584		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
585	local_irq_restore(flags);
586	return addr;
587}
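
A minimal usage sketch (editor's addition): runtime callers take text_mutex around the poke, as the comment above requires; the patched site below is hypothetical.

	/* Editor's sketch only. */
	static void poke_one_byte(void *site)
	{
		static const unsigned char nop = 0x90;	/* 1-byte NOP */

		mutex_lock(&text_mutex);
		text_poke(site, &nop, 1);
		mutex_unlock(&text_mutex);
	}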
588
589static void do_sync_core(void *info)
590{
591	sync_core();
592}
593
594static bool bp_patching_in_progress;
595static void *bp_int3_handler, *bp_int3_addr;
596
597int poke_int3_handler(struct pt_regs *regs)
598{
599	/* bp_patching_in_progress */
600	smp_rmb();
601
602	if (likely(!bp_patching_in_progress))
603		return 0;
604
605	if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
606		return 0;
607
608	/* set up the specified breakpoint handler */
609	regs->ip = (unsigned long) bp_int3_handler;
610
611	return 1;
612
613}
614
615/**
616 * text_poke_bp() -- update instructions on live kernel on SMP
617 * @addr:	address to patch
618 * @opcode:	opcode of new instruction
619 * @len:	length to copy
620 * @handler:	address to jump to when the temporary breakpoint is hit
621 *
622 * Modify multi-byte instruction by using int3 breakpoint on SMP.
623 * We completely avoid stop_machine() here, and achieve the
624 * synchronization using int3 breakpoint.
625 *
626 * The way it is done:
627 *	- add an int3 trap to the address that will be patched
628 *	- sync cores
629 *	- update all but the first byte of the patched range
630 *	- sync cores
631 *	- replace the first byte (int3) with the first byte of the
632 *	  replacement opcode
633 *	- sync cores
634 *
635 * Note: must be called under text_mutex.
636 */
637void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
638{
639	unsigned char int3 = 0xcc;
640
641	bp_int3_handler = handler;
642	bp_int3_addr = (u8 *)addr + sizeof(int3);
643	bp_patching_in_progress = true;
644	/*
645	 * Corresponding read barrier in int3 notifier for
646	 * making sure the in_progress flag is correctly ordered wrt.
647	 * patching
648	 */
649	smp_wmb();
650
651	text_poke(addr, &int3, sizeof(int3));
652
653	on_each_cpu(do_sync_core, NULL, 1);
654
655	if (len - sizeof(int3) > 0) {
656		/* patch all but the first byte */
657		text_poke((char *)addr + sizeof(int3),
658			  (const char *) opcode + sizeof(int3),
659			  len - sizeof(int3));
660		/*
661		 * According to Intel, this core syncing is very likely
662		 * not necessary and we'd be safe even without it. But
663		 * better safe than sorry (plus there's not only Intel).
664		 */
665		on_each_cpu(do_sync_core, NULL, 1);
666	}
667
668	/* patch the first byte */
669	text_poke(addr, opcode, sizeof(int3));
670
671	on_each_cpu(do_sync_core, NULL, 1);
672
673	bp_patching_in_progress = false;
674	smp_wmb();
675
676	return addr;
677}
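
A hedged usage sketch, modelled loosely on the jump-label code: turn a 5-byte NOP into a JMP. While the transition is in flight, stray hits on the int3 are diverted to the fall-through address, i.e. the site keeps behaving like the old NOP. Site and target are hypothetical.

	/* Editor's sketch only; not part of this file. */
	static void nop_to_jmp(void *site, void *target)
	{
		unsigned char insn[5] = { 0xe9, };	/* JMP rel32 */

		*(s32 *)&insn[1] = (s32)((long)target - ((long)site + 5));

		mutex_lock(&text_mutex);
		text_poke_bp(site, insn, sizeof(insn), (u8 *)site + 5);
		mutex_unlock(&text_mutex);
	}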
678
arch/x86/kernel/alternative.c, v4.10.11
  1#define pr_fmt(fmt) "SMP alternatives: " fmt
  2
  3#include <linux/module.h>
  4#include <linux/sched.h>
  5#include <linux/mutex.h>
  6#include <linux/list.h>
  7#include <linux/stringify.h>
  8#include <linux/mm.h>
  9#include <linux/vmalloc.h>
 10#include <linux/memory.h>
 11#include <linux/stop_machine.h>
 12#include <linux/slab.h>
 13#include <linux/kdebug.h>
 14#include <asm/text-patching.h>
 15#include <asm/alternative.h>
 16#include <asm/sections.h>
 17#include <asm/pgtable.h>
 18#include <asm/mce.h>
 19#include <asm/nmi.h>
 20#include <asm/cacheflush.h>
 21#include <asm/tlbflush.h>
 22#include <asm/io.h>
 23#include <asm/fixmap.h>
 24
 25int __read_mostly alternatives_patched;
 26
 27EXPORT_SYMBOL_GPL(alternatives_patched);
 28
 29#define MAX_PATCH_LEN (255-1)
 30
 31static int __initdata_or_module debug_alternative;
 32
 33static int __init debug_alt(char *str)
 34{
 35	debug_alternative = 1;
 36	return 1;
 37}
 38__setup("debug-alternative", debug_alt);
 39
 40static int noreplace_smp;
 41
 42static int __init setup_noreplace_smp(char *str)
 43{
 44	noreplace_smp = 1;
 45	return 1;
 46}
 47__setup("noreplace-smp", setup_noreplace_smp);
 48
 49#ifdef CONFIG_PARAVIRT
 50static int __initdata_or_module noreplace_paravirt = 0;
 51
 52static int __init setup_noreplace_paravirt(char *str)
 53{
 54	noreplace_paravirt = 1;
 55	return 1;
 56}
 57__setup("noreplace-paravirt", setup_noreplace_paravirt);
 58#endif
 59
 60#define DPRINTK(fmt, args...)						\
 61do {									\
 62	if (debug_alternative)						\
 63		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
 64} while (0)
 65
 66#define DUMP_BYTES(buf, len, fmt, args...)				\
 67do {									\
 68	if (unlikely(debug_alternative)) {				\
 69		int j;							\
 70									\
 71		if (!(len))						\
 72			break;						\
 73									\
 74		printk(KERN_DEBUG fmt, ##args);				\
 75		for (j = 0; j < (len) - 1; j++)				\
 76			printk(KERN_CONT "%02hhx ", buf[j]);		\
 77		printk(KERN_CONT "%02hhx\n", buf[j]);			\
 78	}								\
 79} while (0)
 80
 81/*
 82 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
 83 * that correspond to that nop. Getting from one nop to the next, we
 84 * add to the array the offset that is equal to the sum of all sizes of
 85 * nops preceding the one we are after.
 86 *
 87 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
 88 * nice symmetry of sizes of the previous nops.
 89 */
 90#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
 91static const unsigned char intelnops[] =
 92{
 93	GENERIC_NOP1,
 94	GENERIC_NOP2,
 95	GENERIC_NOP3,
 96	GENERIC_NOP4,
 97	GENERIC_NOP5,
 98	GENERIC_NOP6,
 99	GENERIC_NOP7,
100	GENERIC_NOP8,
101	GENERIC_NOP5_ATOMIC
102};
103static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
104{
105	NULL,
106	intelnops,
107	intelnops + 1,
108	intelnops + 1 + 2,
109	intelnops + 1 + 2 + 3,
110	intelnops + 1 + 2 + 3 + 4,
111	intelnops + 1 + 2 + 3 + 4 + 5,
112	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
113	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
114	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
115};
116#endif
117
118#ifdef K8_NOP1
119static const unsigned char k8nops[] =
120{
121	K8_NOP1,
122	K8_NOP2,
123	K8_NOP3,
124	K8_NOP4,
125	K8_NOP5,
126	K8_NOP6,
127	K8_NOP7,
128	K8_NOP8,
129	K8_NOP5_ATOMIC
130};
131static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
132{
133	NULL,
134	k8nops,
135	k8nops + 1,
136	k8nops + 1 + 2,
137	k8nops + 1 + 2 + 3,
138	k8nops + 1 + 2 + 3 + 4,
139	k8nops + 1 + 2 + 3 + 4 + 5,
140	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
141	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
142	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
143};
144#endif
145
146#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
147static const unsigned char k7nops[] =
148{
149	K7_NOP1,
150	K7_NOP2,
151	K7_NOP3,
152	K7_NOP4,
153	K7_NOP5,
154	K7_NOP6,
155	K7_NOP7,
156	K7_NOP8,
157	K7_NOP5_ATOMIC
158};
159static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
160{
161	NULL,
162	k7nops,
163	k7nops + 1,
164	k7nops + 1 + 2,
165	k7nops + 1 + 2 + 3,
166	k7nops + 1 + 2 + 3 + 4,
167	k7nops + 1 + 2 + 3 + 4 + 5,
168	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
169	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
170	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
171};
172#endif
173
174#ifdef P6_NOP1
175static const unsigned char p6nops[] =
176{
177	P6_NOP1,
178	P6_NOP2,
179	P6_NOP3,
180	P6_NOP4,
181	P6_NOP5,
182	P6_NOP6,
183	P6_NOP7,
184	P6_NOP8,
185	P6_NOP5_ATOMIC
186};
187static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
188{
189	NULL,
190	p6nops,
191	p6nops + 1,
192	p6nops + 1 + 2,
193	p6nops + 1 + 2 + 3,
194	p6nops + 1 + 2 + 3 + 4,
195	p6nops + 1 + 2 + 3 + 4 + 5,
196	p6nops + 1 + 2 + 3 + 4 + 5 + 6,
197	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
198	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
199};
200#endif
201
202/* Initialize these to a safe default */
203#ifdef CONFIG_X86_64
204const unsigned char * const *ideal_nops = p6_nops;
205#else
206const unsigned char * const *ideal_nops = intel_nops;
207#endif
208
209void __init arch_init_ideal_nops(void)
210{
211	switch (boot_cpu_data.x86_vendor) {
212	case X86_VENDOR_INTEL:
213		/*
214		 * Due to a decoder implementation quirk, some
215		 * specific Intel CPUs actually perform better with
216		 * the "k8_nops" than with the SDM-recommended NOPs.
217		 */
218		if (boot_cpu_data.x86 == 6 &&
219		    boot_cpu_data.x86_model >= 0x0f &&
220		    boot_cpu_data.x86_model != 0x1c &&
221		    boot_cpu_data.x86_model != 0x26 &&
222		    boot_cpu_data.x86_model != 0x27 &&
223		    boot_cpu_data.x86_model < 0x30) {
224			ideal_nops = k8_nops;
225		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
226			   ideal_nops = p6_nops;
227		} else {
228#ifdef CONFIG_X86_64
229			ideal_nops = k8_nops;
230#else
231			ideal_nops = intel_nops;
232#endif
233		}
234		break;
235
236	case X86_VENDOR_AMD:
237		if (boot_cpu_data.x86 > 0xf) {
238			ideal_nops = p6_nops;
239			return;
240		}
241
242		/* fall through */
243
244	default:
245#ifdef CONFIG_X86_64
246		ideal_nops = k8_nops;
247#else
248		if (boot_cpu_has(X86_FEATURE_K8))
249			ideal_nops = k8_nops;
250		else if (boot_cpu_has(X86_FEATURE_K7))
251			ideal_nops = k7_nops;
252		else
253			ideal_nops = intel_nops;
254#endif
255	}
256}
257
258/* Use this to add nops to a buffer, then text_poke the whole buffer. */
259static void __init_or_module add_nops(void *insns, unsigned int len)
260{
261	while (len > 0) {
262		unsigned int noplen = len;
263		if (noplen > ASM_NOP_MAX)
264			noplen = ASM_NOP_MAX;
265		memcpy(insns, ideal_nops[noplen], noplen);
266		insns += noplen;
267		len -= noplen;
268	}
269}
270
271extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
272extern s32 __smp_locks[], __smp_locks_end[];
273void *text_poke_early(void *addr, const void *opcode, size_t len);
274
275/*
276 * Are we looking at a near JMP with a 1- or 4-byte displacement?
277 */
278static inline bool is_jmp(const u8 opcode)
279{
280	return opcode == 0xeb || opcode == 0xe9;
281}
282
283static void __init_or_module
284recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
285{
286	u8 *next_rip, *tgt_rip;
287	s32 n_dspl, o_dspl;
288	int repl_len;
289
290	if (a->replacementlen != 5)
291		return;
292
293	o_dspl = *(s32 *)(insnbuf + 1);
294
295	/* next_rip of the replacement JMP */
296	next_rip = repl_insn + a->replacementlen;
297	/* target rip of the replacement JMP */
298	tgt_rip  = next_rip + o_dspl;
299	n_dspl = tgt_rip - orig_insn;
300
301	DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
302
303	if (tgt_rip - orig_insn >= 0) {
304		if (n_dspl - 2 <= 127)
305			goto two_byte_jmp;
306		else
307			goto five_byte_jmp;
308	/* negative offset */
309	} else {
310		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
311			goto two_byte_jmp;
312		else
313			goto five_byte_jmp;
314	}
315
316two_byte_jmp:
317	n_dspl -= 2;
318
319	insnbuf[0] = 0xeb;
320	insnbuf[1] = (s8)n_dspl;
321	add_nops(insnbuf + 2, 3);
322
323	repl_len = 2;
324	goto done;
325
326five_byte_jmp:
327	n_dspl -= 5;
328
329	insnbuf[0] = 0xe9;
330	*(s32 *)&insnbuf[1] = n_dspl;
331
332	repl_len = 5;
333
334done:
335
336	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
337		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
338}
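
A worked example of the displacement math above (editor's addition, all addresses hypothetical):

	/*
	 * The replacement JMP sits at 0x2000 with o_dspl = 0x3b, so it targets
	 * next_rip (0x2005) + 0x3b = 0x2040.  Relocated to orig_insn at 0x1000,
	 * n_dspl = 0x2040 - 0x1000 = 0x1040, which does not fit in a signed byte,
	 * so the five-byte form is kept: n_dspl -= 5 gives 0x103b and the bytes
	 * e9 3b 10 00 00 are written, still landing on 0x2040.
	 */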
339
340/*
341 * "noinline" to cause control flow change and thus invalidate I$ and
342 * cause refetch after modification.
343 */
344static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
345{
346	unsigned long flags;
347
348	if (instr[0] != 0x90)
349		return;
350
351	local_irq_save(flags);
352	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
353	local_irq_restore(flags);
354
355	DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
356		   instr, a->instrlen - a->padlen, a->padlen);
357}
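
Concretely (editor's illustration, assuming the p6_nops table): a site built from an alternative with an empty original instruction is pure padding, e.g. instrlen = padlen = 5 and five 0x90 bytes, so the guard above sees 0x90 and the padding is rewritten into one ideal long NOP.

	/*
	 * before:  90 90 90 90 90       (five 1-byte NOPs emitted at build time)
	 * after :  0f 1f 44 00 00       (ideal_nops[5], a single 5-byte NOPL)
	 */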
358
359/*
360 * Replace instructions with better alternatives for this CPU type. This runs
361 * before SMP is initialized to avoid SMP problems with self-modifying code.
362 * This implies that asymmetric systems where APs have fewer capabilities than
363 * the boot processor are not handled. Tough. Make sure you disable such
364 * features by hand.
365 *
366 * Marked "noinline" to cause a control flow change so that the insn cache
367 * refetches the changed I$ lines.
368 */
369void __init_or_module noinline apply_alternatives(struct alt_instr *start,
370						  struct alt_instr *end)
371{
372	struct alt_instr *a;
373	u8 *instr, *replacement;
374	u8 insnbuf[MAX_PATCH_LEN];
375
376	DPRINTK("alt table %p -> %p", start, end);
377	/*
378	 * The scan order should be from start to end. A later scanned
379	 * alternative code can overwrite previously scanned alternative code.
380	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
381	 * patch code.
382	 *
383	 * So be careful if you want to change the scan order to any other
384	 * order.
385	 */
386	for (a = start; a < end; a++) {
387		int insnbuf_sz = 0;
388
389		instr = (u8 *)&a->instr_offset + a->instr_offset;
390		replacement = (u8 *)&a->repl_offset + a->repl_offset;
391		BUG_ON(a->instrlen > sizeof(insnbuf));
392		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
393		if (!boot_cpu_has(a->cpuid)) {
394			if (a->padlen > 1)
395				optimize_nops(a, instr);
396
397			continue;
398		}
399
400		DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
401			a->cpuid >> 5,
402			a->cpuid & 0x1f,
403			instr, a->instrlen,
404			replacement, a->replacementlen, a->padlen);
405
406		DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
407		DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
408
409		memcpy(insnbuf, replacement, a->replacementlen);
410		insnbuf_sz = a->replacementlen;
411
412		/* 0xe8 is a relative CALL; fix the offset. */
413		if (*insnbuf == 0xe8 && a->replacementlen == 5) {
414			*(s32 *)(insnbuf + 1) += replacement - instr;
415			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
416				*(s32 *)(insnbuf + 1),
417				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
418		}
419
420		if (a->replacementlen && is_jmp(replacement[0]))
421			recompute_jump(a, instr, replacement, insnbuf);
422
423		if (a->instrlen > a->replacementlen) {
424			add_nops(insnbuf + a->replacementlen,
425				 a->instrlen - a->replacementlen);
426			insnbuf_sz += a->instrlen - a->replacementlen;
427		}
428		DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
429
430		text_poke_early(instr, insnbuf, insnbuf_sz);
431	}
432}
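
A worked example of the 0xe8 CALL fixup in the loop above (editor's addition, addresses hypothetical):

	/*
	 * The replacement "call f" sits at 0x2000 with rel32 = 0x100, targeting
	 * 0x2005 + 0x100 = 0x2105.  After the bytes are copied to instr at 0x1000,
	 * the displacement grows by (replacement - instr) = 0x1000, giving
	 * rel32 = 0x1100 and the same target: 0x1005 + 0x1100 = 0x2105.
	 */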
433
434#ifdef CONFIG_SMP
435static void alternatives_smp_lock(const s32 *start, const s32 *end,
436				  u8 *text, u8 *text_end)
437{
438	const s32 *poff;
439
440	mutex_lock(&text_mutex);
441	for (poff = start; poff < end; poff++) {
442		u8 *ptr = (u8 *)poff + *poff;
443
444		if (!*poff || ptr < text || ptr >= text_end)
445			continue;
446		/* turn DS segment override prefix into lock prefix */
447		if (*ptr == 0x3e)
448			text_poke(ptr, ((unsigned char []){0xf0}), 1);
449	}
450	mutex_unlock(&text_mutex);
451}
452
453static void alternatives_smp_unlock(const s32 *start, const s32 *end,
454				    u8 *text, u8 *text_end)
455{
456	const s32 *poff;
457
458	mutex_lock(&text_mutex);
459	for (poff = start; poff < end; poff++) {
460		u8 *ptr = (u8 *)poff + *poff;
461
462		if (!*poff || ptr < text || ptr >= text_end)
463			continue;
464		/* turn lock prefix into DS segment override prefix */
465		if (*ptr == 0xf0)
466			text_poke(ptr, ((unsigned char []){0x3E}), 1);
467	}
468	mutex_unlock(&text_mutex);
469}
470
471struct smp_alt_module {
472	/* what is this ??? */
473	struct module	*mod;
474	char		*name;
475
476	/* ptrs to lock prefixes */
477	const s32	*locks;
478	const s32	*locks_end;
479
480	/* .text segment, needed to avoid patching init code ;) */
481	u8		*text;
482	u8		*text_end;
483
484	struct list_head next;
485};
486static LIST_HEAD(smp_alt_modules);
487static DEFINE_MUTEX(smp_alt);
488static bool uniproc_patched = false;	/* protected by smp_alt */
489
490void __init_or_module alternatives_smp_module_add(struct module *mod,
491						  char *name,
492						  void *locks, void *locks_end,
493						  void *text,  void *text_end)
494{
495	struct smp_alt_module *smp;
496
497	mutex_lock(&smp_alt);
498	if (!uniproc_patched)
499		goto unlock;
500
501	if (num_possible_cpus() == 1)
502		/* Don't bother remembering, we'll never have to undo it. */
503		goto smp_unlock;
504
505	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
506	if (NULL == smp)
507		/* we'll run the (safe but slow) SMP code then ... */
508		goto unlock;
509
510	smp->mod	= mod;
511	smp->name	= name;
512	smp->locks	= locks;
513	smp->locks_end	= locks_end;
514	smp->text	= text;
515	smp->text_end	= text_end;
516	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
517		smp->locks, smp->locks_end,
518		smp->text, smp->text_end, smp->name);
519
520	list_add_tail(&smp->next, &smp_alt_modules);
521smp_unlock:
522	alternatives_smp_unlock(locks, locks_end, text, text_end);
523unlock:
524	mutex_unlock(&smp_alt);
525}
526
527void __init_or_module alternatives_smp_module_del(struct module *mod)
528{
529	struct smp_alt_module *item;
530
531	mutex_lock(&smp_alt);
532	list_for_each_entry(item, &smp_alt_modules, next) {
533		if (mod != item->mod)
534			continue;
535		list_del(&item->next);
536		kfree(item);
537		break;
538	}
539	mutex_unlock(&smp_alt);
540}
541
542void alternatives_enable_smp(void)
543{
544	struct smp_alt_module *mod;
545
546	/* Why bother if there are no other CPUs? */
547	BUG_ON(num_possible_cpus() == 1);
548
549	mutex_lock(&smp_alt);
550
551	if (uniproc_patched) {
552		pr_info("switching to SMP code\n");
553		BUG_ON(num_online_cpus() != 1);
554		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
555		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
556		list_for_each_entry(mod, &smp_alt_modules, next)
557			alternatives_smp_lock(mod->locks, mod->locks_end,
558					      mod->text, mod->text_end);
559		uniproc_patched = false;
560	}
561	mutex_unlock(&smp_alt);
562}
563
564/* Return 1 if the address range is reserved for smp-alternatives */
565int alternatives_text_reserved(void *start, void *end)
566{
567	struct smp_alt_module *mod;
568	const s32 *poff;
569	u8 *text_start = start;
570	u8 *text_end = end;
571
572	list_for_each_entry(mod, &smp_alt_modules, next) {
573		if (mod->text > text_end || mod->text_end < text_start)
574			continue;
575		for (poff = mod->locks; poff < mod->locks_end; poff++) {
576			const u8 *ptr = (const u8 *)poff + *poff;
577
578			if (text_start <= ptr && text_end > ptr)
579				return 1;
580		}
581	}
582
583	return 0;
584}
585#endif /* CONFIG_SMP */
586
587#ifdef CONFIG_PARAVIRT
588void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
589				     struct paravirt_patch_site *end)
590{
591	struct paravirt_patch_site *p;
592	char insnbuf[MAX_PATCH_LEN];
593
594	if (noreplace_paravirt)
595		return;
596
597	for (p = start; p < end; p++) {
598		unsigned int used;
599
600		BUG_ON(p->len > MAX_PATCH_LEN);
601		/* prep the buffer with the original instructions */
602		memcpy(insnbuf, p->instr, p->len);
603		used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
604					 (unsigned long)p->instr, p->len);
605
606		BUG_ON(used > p->len);
607
608		/* Pad the rest with nops */
609		add_nops(insnbuf + used, p->len - used);
610		text_poke_early(p->instr, insnbuf, p->len);
611	}
612}
613extern struct paravirt_patch_site __start_parainstructions[],
614	__stop_parainstructions[];
615#endif	/* CONFIG_PARAVIRT */
616
617void __init alternative_instructions(void)
618{
619	/* The patching is not fully atomic, so try to avoid local interruptions
620	   that might execute the code being patched.
621	   Other CPUs are not running. */
622	stop_nmi();
623
624	/*
625	 * Don't stop machine check exceptions while patching.
626	 * MCEs only happen when something got corrupted and in this
627	 * case we must do something about the corruption.
628	 * Ignoring it is worse than an unlikely patching race.
629	 * Also machine checks tend to be broadcast and if one CPU
630	 * goes into machine check the others follow quickly, so we don't
631	 * expect a machine check to cause undue problems during code
632	 * patching.
633	 */
634
635	apply_alternatives(__alt_instructions, __alt_instructions_end);
636
637#ifdef CONFIG_SMP
638	/* Patch to UP if other cpus not imminent. */
639	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
640		uniproc_patched = true;
641		alternatives_smp_module_add(NULL, "core kernel",
642					    __smp_locks, __smp_locks_end,
643					    _text, _etext);
644	}
645
646	if (!uniproc_patched || num_possible_cpus() == 1)
647		free_init_pages("SMP alternatives",
648				(unsigned long)__smp_locks,
649				(unsigned long)__smp_locks_end);
650#endif
651
652	apply_paravirt(__parainstructions, __parainstructions_end);
653
654	restart_nmi();
655	alternatives_patched = 1;
656}
657
658/**
659 * text_poke_early - Update instructions on a live kernel at boot time
660 * @addr: address to modify
661 * @opcode: source of the copy
662 * @len: length to copy
663 *
664 * When you use this code to patch more than one byte of an instruction
665 * you need to make sure that other CPUs cannot execute this code in parallel.
666 * Also no thread must be currently preempted in the middle of these
667	 * instructions. And on the local CPU you need to be protected against NMI or MCE
668 * handlers seeing an inconsistent instruction while you patch.
669 */
670void *__init_or_module text_poke_early(void *addr, const void *opcode,
671					      size_t len)
672{
673	unsigned long flags;
674	local_irq_save(flags);
675	memcpy(addr, opcode, len);
676	local_irq_restore(flags);
677	/* Could also do a CLFLUSH here to speed up CPU recovery; but
678	   that causes hangs on some VIA CPUs. */
679	return addr;
680}
681
682/**
683 * text_poke - Update instructions on a live kernel
684 * @addr: address to modify
685 * @opcode: source of the copy
686 * @len: length to copy
687 *
688 * Only atomic text poke/set should be allowed when not doing early patching.
689 * It means the size must be writable atomically and the address must be aligned
690 * in a way that permits an atomic write. It also makes sure we fit on a single
691 * page.
692 *
693 * Note: Must be called under text_mutex.
694 */
695void *text_poke(void *addr, const void *opcode, size_t len)
696{
697	unsigned long flags;
698	char *vaddr;
699	struct page *pages[2];
700	int i;
701
702	if (!core_kernel_text((unsigned long)addr)) {
703		pages[0] = vmalloc_to_page(addr);
704		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
705	} else {
706		pages[0] = virt_to_page(addr);
707		WARN_ON(!PageReserved(pages[0]));
708		pages[1] = virt_to_page(addr + PAGE_SIZE);
709	}
710	BUG_ON(!pages[0]);
711	local_irq_save(flags);
712	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
713	if (pages[1])
714		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
715	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
716	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
717	clear_fixmap(FIX_TEXT_POKE0);
718	if (pages[1])
719		clear_fixmap(FIX_TEXT_POKE1);
720	local_flush_tlb();
721	sync_core();
722	/* Could also do a CLFLUSH here to speed up CPU recovery; but
723	   that causes hangs on some VIA CPUs. */
724	for (i = 0; i < len; i++)
725		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
726	local_irq_restore(flags);
727	return addr;
728}
729
730static void do_sync_core(void *info)
731{
732	sync_core();
733}
734
735static bool bp_patching_in_progress;
736static void *bp_int3_handler, *bp_int3_addr;
737
738int poke_int3_handler(struct pt_regs *regs)
739{
740	/* bp_patching_in_progress */
741	smp_rmb();
742
743	if (likely(!bp_patching_in_progress))
744		return 0;
745
746	if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
747		return 0;
748
749	/* set up the specified breakpoint handler */
750	regs->ip = (unsigned long) bp_int3_handler;
751
752	return 1;
753
754}
755
756/**
757 * text_poke_bp() -- update instructions on live kernel on SMP
758 * @addr:	address to patch
759 * @opcode:	opcode of new instruction
760 * @len:	length to copy
761 * @handler:	address to jump to when the temporary breakpoint is hit
762 *
763 * Modify multi-byte instruction by using int3 breakpoint on SMP.
764 * We completely avoid stop_machine() here, and achieve the
765 * synchronization using int3 breakpoint.
766 *
767 * The way it is done:
768 *	- add an int3 trap to the address that will be patched
769 *	- sync cores
770 *	- update all but the first byte of the patched range
771 *	- sync cores
772 *	- replace the first byte (int3) with the first byte of the
773 *	  replacement opcode
774 *	- sync cores
775 *
776 * Note: must be called under text_mutex.
777 */
778void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
779{
780	unsigned char int3 = 0xcc;
781
782	bp_int3_handler = handler;
783	bp_int3_addr = (u8 *)addr + sizeof(int3);
784	bp_patching_in_progress = true;
785	/*
786	 * Corresponding read barrier in int3 notifier for
787	 * making sure the in_progress flag is correctly ordered wrt.
788	 * patching
789	 */
790	smp_wmb();
791
792	text_poke(addr, &int3, sizeof(int3));
793
794	on_each_cpu(do_sync_core, NULL, 1);
795
796	if (len - sizeof(int3) > 0) {
797		/* patch all but the first byte */
798		text_poke((char *)addr + sizeof(int3),
799			  (const char *) opcode + sizeof(int3),
800			  len - sizeof(int3));
801		/*
802		 * According to Intel, this core syncing is very likely
803		 * not necessary and we'd be safe even without it. But
804		 * better safe than sorry (plus there's not only Intel).
805		 */
806		on_each_cpu(do_sync_core, NULL, 1);
807	}
808
809	/* patch the first byte */
810	text_poke(addr, opcode, sizeof(int3));
811
812	on_each_cpu(do_sync_core, NULL, 1);
813
814	bp_patching_in_progress = false;
815	smp_wmb();
816
817	return addr;
818}
819