mce.c - arch/x86/kernel/cpu/mcheck/mce.c - Linux diff v4.17

   1/*
   2 * Machine check handler.
   3 *
   4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5 * Rest from unknown author(s).
   6 * 2004 Andi Kleen. Rewrote most of it.
   7 * Copyright 2008 Intel Corporation
   8 * Author: Andi Kleen
   9 */
  10
  11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13#include <linux/thread_info.h>
  14#include <linux/capability.h>
  15#include <linux/miscdevice.h>
  16#include <linux/ratelimit.h>
 
  17#include <linux/rcupdate.h>
  18#include <linux/kobject.h>
  19#include <linux/uaccess.h>
  20#include <linux/kdebug.h>
  21#include <linux/kernel.h>
  22#include <linux/percpu.h>
  23#include <linux/string.h>
  24#include <linux/device.h>
  25#include <linux/syscore_ops.h>
  26#include <linux/delay.h>
  27#include <linux/ctype.h>
  28#include <linux/sched.h>
  29#include <linux/sysfs.h>
  30#include <linux/types.h>
  31#include <linux/slab.h>
  32#include <linux/init.h>
  33#include <linux/kmod.h>
  34#include <linux/poll.h>
  35#include <linux/nmi.h>
  36#include <linux/cpu.h>
  37#include <linux/ras.h>
  38#include <linux/smp.h>
  39#include <linux/fs.h>
  40#include <linux/mm.h>
  41#include <linux/debugfs.h>
  42#include <linux/irq_work.h>
  43#include <linux/export.h>
  44#include <linux/jump_label.h>
  45
  46#include <asm/intel-family.h>
  47#include <asm/processor.h>
  48#include <asm/traps.h>
  49#include <asm/tlbflush.h>
  50#include <asm/mce.h>
  51#include <asm/msr.h>
  52#include <asm/reboot.h>
  53#include <asm/set_memory.h>
  54
  55#include "mce-internal.h"
  56
  57static DEFINE_MUTEX(mce_log_mutex);
  58
  59/* sysfs synchronization */
  60static DEFINE_MUTEX(mce_sysfs_mutex);
 
 
  61
  62#define CREATE_TRACE_POINTS
  63#include <trace/events/mce.h>
  64
  65#define SPINUNIT		100	/* 100ns */
 
 
 
 
 
 
  66
  67DEFINE_PER_CPU(unsigned, mce_exception_count);
  68
  69struct mce_bank *mce_banks __read_mostly;
  70struct mce_vendor_flags mce_flags __read_mostly;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  71
  72struct mca_config mca_cfg __read_mostly = {
  73	.bootlog  = -1,
  74	/*
  75	 * Tolerant levels:
  76	 * 0: always panic on uncorrected errors, log corrected errors
  77	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  78	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  79	 * 3: never panic or SIGBUS, log all errors (for testing only)
  80	 */
  81	.tolerant = 1,
  82	.monarch_timeout = -1
  83};
  84
  85static DEFINE_PER_CPU(struct mce, mces_seen);
  86static unsigned long mce_need_notify;
  87static int cpu_missing;
  88
  89/*
  90 * MCA banks polled by the period polling timer for corrected events.
  91 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
  92 */
  93DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  94	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  95};
  96
  97/*
  98 * MCA banks controlled through firmware first for corrected errors.
  99 * This is a global list of banks for which we won't enable CMCI and we
 100 * won't poll. Firmware controls these banks and is responsible for
 101 * reporting corrected errors through GHES. Uncorrected/recoverable
 102 * errors are still notified through a machine check.
 103 */
 104mce_banks_t mce_banks_ce_disabled;
 105
 106static struct work_struct mce_work;
 107static struct irq_work mce_irq_work;
 108
 109static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 110
 111#ifndef mce_unmap_kpfn
 112static void mce_unmap_kpfn(unsigned long pfn);
 113#endif
 114
 115/*
 116 * CPU/chipset specific EDAC code can register a notifier call here to print
 117 * MCE errors in a human-readable form.
 118 */
 119BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 120
 121/* Do initial initialization of a struct mce */
 122void mce_setup(struct mce *m)
 123{
 124	memset(m, 0, sizeof(struct mce));
 125	m->cpu = m->extcpu = smp_processor_id();
 
 126	/* We hope get_seconds stays lockless */
 127	m->time = get_seconds();
 128	m->cpuvendor = boot_cpu_data.x86_vendor;
 129	m->cpuid = cpuid_eax(1);
 130	m->socketid = cpu_data(m->extcpu).phys_proc_id;
 131	m->apicid = cpu_data(m->extcpu).initial_apicid;
 132	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 133
 134	if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
 135		rdmsrl(MSR_PPIN, m->ppin);
 136
 137	m->microcode = boot_cpu_data.microcode;
 138}
 139
 140DEFINE_PER_CPU(struct mce, injectm);
 141EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 142
 143void mce_log(struct mce *m)
 144{
 145	if (!mce_gen_pool_add(m))
 146		irq_work_queue(&mce_irq_work);
 147}
 148
 149void mce_inject_log(struct mce *m)
 150{
 151	mutex_lock(&mce_log_mutex);
 152	mce_log(m);
 153	mutex_unlock(&mce_log_mutex);
 154}
 155EXPORT_SYMBOL_GPL(mce_inject_log);
 156
 157static struct notifier_block mce_srao_nb;
 158
 159/*
 160 * We run the default notifier if we have only the SRAO, the first and the
 161 * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
 162 * notifiers registered on the chain.
 163 */
 164#define NUM_DEFAULT_NOTIFIERS	3
 165static atomic_t num_notifiers;
 166
 167void mce_register_decode_chain(struct notifier_block *nb)
 
 
 
 
 
 
 168{
 169	if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
 
 
 
 
 
 
 
 170		return;
 171
 172	atomic_inc(&num_notifiers);
 
 
 
 
 173
 174	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 175}
 176EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 177
 178void mce_unregister_decode_chain(struct notifier_block *nb)
 179{
 180	atomic_dec(&num_notifiers);
 181
 182	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 183}
 184EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 185
 186static inline u32 ctl_reg(int bank)
 187{
 188	return MSR_IA32_MCx_CTL(bank);
 189}
 190
 191static inline u32 status_reg(int bank)
 192{
 193	return MSR_IA32_MCx_STATUS(bank);
 194}
 195
 196static inline u32 addr_reg(int bank)
 197{
 198	return MSR_IA32_MCx_ADDR(bank);
 199}
 200
 201static inline u32 misc_reg(int bank)
 202{
 203	return MSR_IA32_MCx_MISC(bank);
 204}
 205
 206static inline u32 smca_ctl_reg(int bank)
 207{
 208	return MSR_AMD64_SMCA_MCx_CTL(bank);
 209}
 210
 211static inline u32 smca_status_reg(int bank)
 212{
 213	return MSR_AMD64_SMCA_MCx_STATUS(bank);
 
 
 
 
 
 
 
 
 
 
 214}
 215
 216static inline u32 smca_addr_reg(int bank)
 
 217{
 218	return MSR_AMD64_SMCA_MCx_ADDR(bank);
 
 219}
 
 220
 221static inline u32 smca_misc_reg(int bank)
 222{
 223	return MSR_AMD64_SMCA_MCx_MISC(bank);
 224}
 
 225
 226struct mca_msr_regs msr_ops = {
 227	.ctl	= ctl_reg,
 228	.status	= status_reg,
 229	.addr	= addr_reg,
 230	.misc	= misc_reg
 231};
 232
 233static void __print_mce(struct mce *m)
 234{
 235	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
 236		 m->extcpu,
 237		 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
 238		 m->mcgstatus, m->bank, m->status);
 239
 240	if (m->ip) {
 241		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 242			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 243			m->cs, m->ip);
 244
 245		if (m->cs == __KERNEL_CS)
 246			pr_cont("{%pS}", (void *)(unsigned long)m->ip);
 247		pr_cont("\n");
 248	}
 249
 250	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 251	if (m->addr)
 252		pr_cont("ADDR %llx ", m->addr);
 253	if (m->misc)
 254		pr_cont("MISC %llx ", m->misc);
 255
 256	if (mce_flags.smca) {
 257		if (m->synd)
 258			pr_cont("SYND %llx ", m->synd);
 259		if (m->ipid)
 260			pr_cont("IPID %llx ", m->ipid);
 261	}
 262
 263	pr_cont("\n");
 264	/*
 265	 * Note this output is parsed by external tools and old fields
 266	 * should not be changed.
 267	 */
 268	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 269		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 270		m->microcode);
 271}
 272
 273static void print_mce(struct mce *m)
 274{
 275	__print_mce(m);
 
 
 
 
 276
 277	if (m->cpuvendor != X86_VENDOR_AMD)
 278		pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 279}
 280
 281#define PANIC_TIMEOUT 5 /* 5 seconds */
 282
 283static atomic_t mce_panicked;
 284
 285static int fake_panic;
 286static atomic_t mce_fake_panicked;
 287
 288/* Panic in progress. Enable interrupts and wait for final IPI */
 289static void wait_for_panic(void)
 290{
 291	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 292
 293	preempt_disable();
 294	local_irq_enable();
 295	while (timeout-- > 0)
 296		udelay(1);
 297	if (panic_timeout == 0)
 298		panic_timeout = mca_cfg.panic_timeout;
 299	panic("Panicing machine check CPU died");
 300}
 301
 302static void mce_panic(const char *msg, struct mce *final, char *exp)
 303{
 304	int apei_err = 0;
 305	struct llist_node *pending;
 306	struct mce_evt_llist *l;
 307
 308	if (!fake_panic) {
 309		/*
 310		 * Make sure only one CPU runs in machine check panic
 311		 */
 312		if (atomic_inc_return(&mce_panicked) > 1)
 313			wait_for_panic();
 314		barrier();
 315
 316		bust_spinlocks(1);
 317		console_verbose();
 318	} else {
 319		/* Don't log too much for fake panic */
 320		if (atomic_inc_return(&mce_fake_panicked) > 1)
 321			return;
 322	}
 323	pending = mce_gen_pool_prepare_records();
 324	/* First print corrected ones that are still unlogged */
 325	llist_for_each_entry(l, pending, llnode) {
 326		struct mce *m = &l->mce;
 
 
 327		if (!(m->status & MCI_STATUS_UC)) {
 328			print_mce(m);
 329			if (!apei_err)
 330				apei_err = apei_write_mce(m);
 331		}
 332	}
 333	/* Now print uncorrected but with the final one last */
 334	llist_for_each_entry(l, pending, llnode) {
 335		struct mce *m = &l->mce;
 
 
 336		if (!(m->status & MCI_STATUS_UC))
 337			continue;
 338		if (!final || mce_cmp(m, final)) {
 339			print_mce(m);
 340			if (!apei_err)
 341				apei_err = apei_write_mce(m);
 342		}
 343	}
 344	if (final) {
 345		print_mce(final);
 346		if (!apei_err)
 347			apei_err = apei_write_mce(final);
 348	}
 349	if (cpu_missing)
 350		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 351	if (exp)
 352		pr_emerg(HW_ERR "Machine check: %s\n", exp);
 353	if (!fake_panic) {
 354		if (panic_timeout == 0)
 355			panic_timeout = mca_cfg.panic_timeout;
 356		panic(msg);
 357	} else
 358		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 359}
 360
 361/* Support code for software error injection */
 362
 363static int msr_to_offset(u32 msr)
 364{
 365	unsigned bank = __this_cpu_read(injectm.bank);
 366
 367	if (msr == mca_cfg.rip_msr)
 368		return offsetof(struct mce, ip);
 369	if (msr == msr_ops.status(bank))
 370		return offsetof(struct mce, status);
 371	if (msr == msr_ops.addr(bank))
 372		return offsetof(struct mce, addr);
 373	if (msr == msr_ops.misc(bank))
 374		return offsetof(struct mce, misc);
 375	if (msr == MSR_IA32_MCG_STATUS)
 376		return offsetof(struct mce, mcgstatus);
 377	return -1;
 378}
 379
 380/* MSR access wrappers used for error injection */
 381static u64 mce_rdmsrl(u32 msr)
 382{
 383	u64 v;
 384
 385	if (__this_cpu_read(injectm.finished)) {
 386		int offset = msr_to_offset(msr);
 387
 388		if (offset < 0)
 389			return 0;
 390		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 391	}
 392
 393	if (rdmsrl_safe(msr, &v)) {
 394		WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
 395		/*
 396		 * Return zero in case the access faulted. This should
 397		 * not happen normally but can happen if the CPU does
 398		 * something weird, or if the code is buggy.
 399		 */
 400		v = 0;
 401	}
 402
 403	return v;
 404}
 405
 406static void mce_wrmsrl(u32 msr, u64 v)
 407{
 408	if (__this_cpu_read(injectm.finished)) {
 409		int offset = msr_to_offset(msr);
 410
 411		if (offset >= 0)
 412			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 413		return;
 414	}
 415	wrmsrl(msr, v);
 416}
 417
 418/*
 419 * Collect all global (w.r.t. this processor) status about this machine
 420 * check into our "mce" struct so that we can use it later to assess
 421 * the severity of the problem as we read per-bank specific details.
 422 */
 423static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 424{
 425	mce_setup(m);
 426
 427	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 428	if (regs) {
 429		/*
 430		 * Get the address of the instruction at the time of
 431		 * the machine check error.
 432		 */
 433		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 434			m->ip = regs->ip;
 435			m->cs = regs->cs;
 436
 437			/*
 438			 * When in VM86 mode make the cs look like ring 3
 439			 * always. This is a lie, but it's better than passing
 440			 * the additional vm86 bit around everywhere.
 441			 */
 442			if (v8086_mode(regs))
 443				m->cs |= 3;
 444		}
 445		/* Use accurate RIP reporting if available. */
 446		if (mca_cfg.rip_msr)
 447			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 448	}
 449}
 450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 451int mce_available(struct cpuinfo_x86 *c)
 452{
 453	if (mca_cfg.disabled)
 454		return 0;
 455	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 456}
 457
 458static void mce_schedule_work(void)
 459{
 460	if (!mce_gen_pool_empty())
 461		schedule_work(&mce_work);
 
 
 
 462}
 463
 
 
 464static void mce_irq_work_cb(struct irq_work *entry)
 465{
 
 466	mce_schedule_work();
 467}
 468
 469static void mce_report_event(struct pt_regs *regs)
 470{
 471	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 472		mce_notify_irq();
 473		/*
 474		 * Triggering the work queue here is just an insurance
 475		 * policy in case the syscall exit notify handler
 476		 * doesn't run soon enough or ends up running on the
 477		 * wrong CPU (can happen when audit sleeps)
 478		 */
 479		mce_schedule_work();
 480		return;
 481	}
 482
 483	irq_work_queue(&mce_irq_work);
 484}
 485
 486/*
 487 * Check if the address reported by the CPU is in a format we can parse.
 488 * It would be possible to add code for most other cases, but all would
 489 * be somewhat complicated (e.g. segment offset would require an instruction
 490 * parser). So only support physical addresses up to page granuality for now.
 491 */
 492static int mce_usable_address(struct mce *m)
 493{
 494	if (!(m->status & MCI_STATUS_ADDRV))
 495		return 0;
 496
 497	/* Checks after this one are Intel-specific: */
 498	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
 499		return 1;
 500
 501	if (!(m->status & MCI_STATUS_MISCV))
 502		return 0;
 503
 504	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 505		return 0;
 506
 507	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 508		return 0;
 509
 510	return 1;
 511}
 512
 513bool mce_is_memory_error(struct mce *m)
 514{
 515	if (m->cpuvendor == X86_VENDOR_AMD) {
 516		return amd_mce_is_memory_error(m);
 517
 518	} else if (m->cpuvendor == X86_VENDOR_INTEL) {
 519		/*
 520		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 521		 *
 522		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 523		 * indicating a memory error. Bit 8 is used for indicating a
 524		 * cache hierarchy error. The combination of bit 2 and bit 3
 525		 * is used for indicating a `generic' cache hierarchy error
 526		 * But we can't just blindly check the above bits, because if
 527		 * bit 11 is set, then it is a bus/interconnect error - and
 528		 * either way the above bits just gives more detail on what
 529		 * bus/interconnect error happened. Note that bit 12 can be
 530		 * ignored, as it's the "filter" bit.
 531		 */
 532		return (m->status & 0xef80) == BIT(7) ||
 533		       (m->status & 0xef00) == BIT(8) ||
 534		       (m->status & 0xeffc) == 0xc;
 535	}
 536
 537	return false;
 538}
 539EXPORT_SYMBOL_GPL(mce_is_memory_error);
 540
 541static bool mce_is_correctable(struct mce *m)
 542{
 543	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
 544		return false;
 545
 546	if (m->status & MCI_STATUS_UC)
 547		return false;
 548
 549	return true;
 550}
 551
 552static bool cec_add_mce(struct mce *m)
 553{
 554	if (!m)
 555		return false;
 556
 557	/* We eat only correctable DRAM errors with usable addresses. */
 558	if (mce_is_memory_error(m) &&
 559	    mce_is_correctable(m)  &&
 560	    mce_usable_address(m))
 561		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
 562			return true;
 563
 564	return false;
 565}
 566
 567static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
 568			      void *data)
 569{
 570	struct mce *m = (struct mce *)data;
 571
 572	if (!m)
 573		return NOTIFY_DONE;
 574
 575	if (cec_add_mce(m))
 576		return NOTIFY_STOP;
 577
 578	/* Emit the trace record: */
 579	trace_mce_record(m);
 580
 581	set_bit(0, &mce_need_notify);
 582
 583	mce_notify_irq();
 584
 585	return NOTIFY_DONE;
 586}
 587
 588static struct notifier_block first_nb = {
 589	.notifier_call	= mce_first_notifier,
 590	.priority	= MCE_PRIO_FIRST,
 591};
 592
 593static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 594				void *data)
 595{
 596	struct mce *mce = (struct mce *)data;
 597	unsigned long pfn;
 598
 599	if (!mce)
 600		return NOTIFY_DONE;
 601
 602	if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
 603		pfn = mce->addr >> PAGE_SHIFT;
 604		if (!memory_failure(pfn, 0))
 605			mce_unmap_kpfn(pfn);
 606	}
 607
 608	return NOTIFY_OK;
 609}
 610static struct notifier_block mce_srao_nb = {
 611	.notifier_call	= srao_decode_notifier,
 612	.priority	= MCE_PRIO_SRAO,
 613};
 614
 615static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 616				void *data)
 617{
 618	struct mce *m = (struct mce *)data;
 619
 620	if (!m)
 621		return NOTIFY_DONE;
 622
 623	if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
 624		return NOTIFY_DONE;
 625
 626	__print_mce(m);
 627
 628	return NOTIFY_DONE;
 629}
 630
 631static struct notifier_block mce_default_nb = {
 632	.notifier_call	= mce_default_notifier,
 633	/* lowest prio, we want it to run last. */
 634	.priority	= MCE_PRIO_LOWEST,
 635};
 636
 637/*
 638 * Read ADDR and MISC registers.
 639 */
 640static void mce_read_aux(struct mce *m, int i)
 641{
 642	if (m->status & MCI_STATUS_MISCV)
 643		m->misc = mce_rdmsrl(msr_ops.misc(i));
 644
 645	if (m->status & MCI_STATUS_ADDRV) {
 646		m->addr = mce_rdmsrl(msr_ops.addr(i));
 647
 648		/*
 649		 * Mask the reported address by the reported granularity.
 650		 */
 651		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 652			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 653			m->addr >>= shift;
 654			m->addr <<= shift;
 655		}
 656
 657		/*
 658		 * Extract [55:<lsb>] where lsb is the least significant
 659		 * *valid* bit of the address bits.
 660		 */
 661		if (mce_flags.smca) {
 662			u8 lsb = (m->addr >> 56) & 0x3f;
 663
 664			m->addr &= GENMASK_ULL(55, lsb);
 665		}
 666	}
 667
 668	if (mce_flags.smca) {
 669		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
 670
 671		if (m->status & MCI_STATUS_SYNDV)
 672			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
 673	}
 674}
 675
 676DEFINE_PER_CPU(unsigned, mce_poll_count);
 677
 678/*
 679 * Poll for corrected events or events that happened before reset.
 680 * Those are just logged through /dev/mcelog.
 681 *
 682 * This is executed in standard interrupt context.
 683 *
 684 * Note: spec recommends to panic for fatal unsignalled
 685 * errors here. However this would be quite problematic --
 686 * we would need to reimplement the Monarch handling and
 687 * it would mess up the exclusion between exception handler
 688 * and poll hander -- * so we skip this for now.
 689 * These cases should not happen anyways, or only when the CPU
 690 * is already totally * confused. In this case it's likely it will
 691 * not fully execute the machine check handler either.
 692 */
 693bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 694{
 695	bool error_seen = false;
 696	struct mce m;
 697	int i;
 698
 699	this_cpu_inc(mce_poll_count);
 700
 701	mce_gather_info(&m, NULL);
 702
 703	if (flags & MCP_TIMESTAMP)
 704		m.tsc = rdtsc();
 705
 706	for (i = 0; i < mca_cfg.banks; i++) {
 707		if (!mce_banks[i].ctl || !test_bit(i, *b))
 708			continue;
 709
 710		m.misc = 0;
 711		m.addr = 0;
 712		m.bank = i;
 
 713
 714		barrier();
 715		m.status = mce_rdmsrl(msr_ops.status(i));
 716		if (!(m.status & MCI_STATUS_VAL))
 717			continue;
 718
 719		/*
 720		 * Uncorrected or signalled events are handled by the exception
 721		 * handler when it is enabled, so don't process those here.
 722		 *
 723		 * TBD do the same check for MCI_STATUS_EN here?
 724		 */
 725		if (!(flags & MCP_UC) &&
 726		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 727			continue;
 728
 729		error_seen = true;
 730
 731		mce_read_aux(&m, i);
 732
 733		m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 734
 735		/*
 736		 * Don't get the IP here because it's unlikely to
 737		 * have anything to do with the actual error location.
 738		 */
 739		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
 740			mce_log(&m);
 741		else if (mce_usable_address(&m)) {
 742			/*
 743			 * Although we skipped logging this, we still want
 744			 * to take action. Add to the pool so the registered
 745			 * notifiers will see it.
 746			 */
 747			if (!mce_gen_pool_add(&m))
 748				mce_schedule_work();
 749		}
 750
 751		/*
 752		 * Clear state for this bank.
 753		 */
 754		mce_wrmsrl(msr_ops.status(i), 0);
 755	}
 756
 757	/*
 758	 * Don't clear MCG_STATUS here because it's only defined for
 759	 * exceptions.
 760	 */
 761
 762	sync_core();
 763
 764	return error_seen;
 765}
 766EXPORT_SYMBOL_GPL(machine_check_poll);
 767
 768/*
 769 * Do a quick check if any of the events requires a panic.
 770 * This decides if we keep the events around or clear them.
 771 */
 772static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 773			  struct pt_regs *regs)
 774{
 775	int i, ret = 0;
 776	char *tmp;
 777
 778	for (i = 0; i < mca_cfg.banks; i++) {
 779		m->status = mce_rdmsrl(msr_ops.status(i));
 780		if (m->status & MCI_STATUS_VAL) {
 781			__set_bit(i, validp);
 782			if (quirk_no_way_out)
 783				quirk_no_way_out(i, m, regs);
 784		}
 785
 786		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 787			*msg = tmp;
 788			ret = 1;
 789		}
 790	}
 791	return ret;
 792}
 793
/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU initially spins until mce_executing reaches its own order number.
 */
 798static atomic_t mce_executing;
 799
 800/*
 801 * Defines order of CPUs on entry. First CPU becomes Monarch.
 802 */
 803static atomic_t mce_callin;
 804
 805/*
 806 * Check if a timeout waiting for other CPUs happened.
 807 */
 808static int mce_timed_out(u64 *t, const char *msg)
 809{
 810	/*
 811	 * The others already did panic for some reason.
 812	 * Bail out like in a timeout.
 813	 * rmb() to tell the compiler that system_state
 814	 * might have been modified by someone else.
 815	 */
 816	rmb();
 817	if (atomic_read(&mce_panicked))
 818		wait_for_panic();
 819	if (!mca_cfg.monarch_timeout)
 820		goto out;
 821	if ((s64)*t < SPINUNIT) {
 822		if (mca_cfg.tolerant <= 1)
 823			mce_panic(msg, NULL, NULL);
 
 
 824		cpu_missing = 1;
 825		return 1;
 826	}
 827	*t -= SPINUNIT;
 828out:
 829	touch_nmi_watchdog();
 830	return 0;
 831}
 832
 833/*
 834 * The Monarch's reign.  The Monarch is the CPU who entered
 835 * the machine check handler first. It waits for the others to
 836 * raise the exception too and then grades them. When any
 837 * error is fatal panic. Only then let the others continue.
 838 *
 839 * The other CPUs entering the MCE handler will be controlled by the
 840 * Monarch. They are called Subjects.
 841 *
 842 * This way we prevent any potential data corruption in a unrecoverable case
 843 * and also makes sure always all CPU's errors are examined.
 844 *
 845 * Also this detects the case of a machine check event coming from outer
 846 * space (not detected by any CPUs) In this case some external agent wants
 847 * us to shut down, so panic too.
 848 *
 849 * The other CPUs might still decide to panic if the handler happens
 850 * in a unrecoverable place, but in this case the system is in a semi-stable
 851 * state and won't corrupt anything by itself. It's ok to let the others
 852 * continue for a bit first.
 853 *
 854 * All the spin loops have timeouts; when a timeout happens a CPU
 855 * typically elects itself to be Monarch.
 856 */
 857static void mce_reign(void)
 858{
 859	int cpu;
 860	struct mce *m = NULL;
 861	int global_worst = 0;
 862	char *msg = NULL;
 863	char *nmsg = NULL;
 864
 865	/*
 866	 * This CPU is the Monarch and the other CPUs have run
 867	 * through their handlers.
 868	 * Grade the severity of the errors of all the CPUs.
 869	 */
 870	for_each_possible_cpu(cpu) {
 871		int severity = mce_severity(&per_cpu(mces_seen, cpu),
 872					    mca_cfg.tolerant,
 873					    &nmsg, true);
 874		if (severity > global_worst) {
 875			msg = nmsg;
 876			global_worst = severity;
 877			m = &per_cpu(mces_seen, cpu);
 878		}
 879	}
 880
 881	/*
 882	 * Cannot recover? Panic here then.
 883	 * This dumps all the mces in the log buffer and stops the
 884	 * other CPUs.
 885	 */
 886	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 887		mce_panic("Fatal machine check", m, msg);
 888
 889	/*
 890	 * For UC somewhere we let the CPU who detects it handle it.
 891	 * Also must let continue the others, otherwise the handling
 892	 * CPU could deadlock on a lock.
 893	 */
 894
 895	/*
 896	 * No machine check event found. Must be some external
 897	 * source or one CPU is hung. Panic.
 898	 */
 899	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 900		mce_panic("Fatal machine check from unknown source", NULL, NULL);
 901
 902	/*
 903	 * Now clear all the mces_seen so that they don't reappear on
 904	 * the next mce.
 905	 */
 906	for_each_possible_cpu(cpu)
 907		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 908}
 909
 910static atomic_t global_nwo;
 911
 912/*
 913 * Start of Monarch synchronization. This waits until all CPUs have
 914 * entered the exception handler and then determines if any of them
 915 * saw a fatal event that requires panic. Then it executes them
 916 * in the entry order.
 917 * TBD double check parallel CPU hotunplug
 918 */
 919static int mce_start(int *no_way_out)
 920{
 921	int order;
 922	int cpus = num_online_cpus();
 923	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 924
 925	if (!timeout)
 926		return -1;
 927
 928	atomic_add(*no_way_out, &global_nwo);
 929	/*
 930	 * Rely on the implied barrier below, such that global_nwo
 931	 * is updated before mce_callin.
 932	 */
 
 933	order = atomic_inc_return(&mce_callin);
 934
 935	/*
 936	 * Wait for everyone.
 937	 */
 938	while (atomic_read(&mce_callin) != cpus) {
 939		if (mce_timed_out(&timeout,
 940				  "Timeout: Not all CPUs entered broadcast exception handler")) {
 941			atomic_set(&global_nwo, 0);
 942			return -1;
 943		}
 944		ndelay(SPINUNIT);
 945	}
 946
 947	/*
 948	 * mce_callin should be read before global_nwo
 949	 */
 950	smp_rmb();
 951
 952	if (order == 1) {
 953		/*
 954		 * Monarch: Starts executing now, the others wait.
 955		 */
 956		atomic_set(&mce_executing, 1);
 957	} else {
 958		/*
 959		 * Subject: Now start the scanning loop one by one in
 960		 * the original callin order.
 961		 * This way when there are any shared banks it will be
 962		 * only seen by one CPU before cleared, avoiding duplicates.
 963		 */
 964		while (atomic_read(&mce_executing) < order) {
 965			if (mce_timed_out(&timeout,
 966					  "Timeout: Subject CPUs unable to finish machine check processing")) {
 967				atomic_set(&global_nwo, 0);
 968				return -1;
 969			}
 970			ndelay(SPINUNIT);
 971		}
 972	}
 973
 974	/*
 975	 * Cache the global no_way_out state.
 976	 */
 977	*no_way_out = atomic_read(&global_nwo);
 978
 979	return order;
 980}
 981
 982/*
 983 * Synchronize between CPUs after main scanning loop.
 984 * This invokes the bulk of the Monarch processing.
 985 */
 986static int mce_end(int order)
 987{
 988	int ret = -1;
 989	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 990
 991	if (!timeout)
 992		goto reset;
 993	if (order < 0)
 994		goto reset;
 995
 996	/*
 997	 * Allow others to run.
 998	 */
 999	atomic_inc(&mce_executing);
1000
1001	if (order == 1) {
1002		/* CHECKME: Can this race with a parallel hotplug? */
1003		int cpus = num_online_cpus();
1004
1005		/*
1006		 * Monarch: Wait for everyone to go through their scanning
1007		 * loops.
1008		 */
1009		while (atomic_read(&mce_executing) <= cpus) {
1010			if (mce_timed_out(&timeout,
1011					  "Timeout: Monarch CPU unable to finish machine check processing"))
1012				goto reset;
1013			ndelay(SPINUNIT);
1014		}
1015
1016		mce_reign();
1017		barrier();
1018		ret = 0;
1019	} else {
1020		/*
1021		 * Subject: Wait for Monarch to finish.
1022		 */
1023		while (atomic_read(&mce_executing) != 0) {
1024			if (mce_timed_out(&timeout,
1025					  "Timeout: Monarch CPU did not finish machine check processing"))
1026				goto reset;
1027			ndelay(SPINUNIT);
1028		}
1029
1030		/*
1031		 * Don't reset anything. That's done by the Monarch.
1032		 */
1033		return 0;
1034	}
1035
1036	/*
1037	 * Reset all global state.
1038	 */
1039reset:
1040	atomic_set(&global_nwo, 0);
1041	atomic_set(&mce_callin, 0);
1042	barrier();
1043
1044	/*
1045	 * Let others run again.
1046	 */
1047	atomic_set(&mce_executing, 0);
1048	return ret;
1049}
1050
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1051static void mce_clear_state(unsigned long *toclear)
1052{
1053	int i;
1054
1055	for (i = 0; i < mca_cfg.banks; i++) {
1056		if (test_bit(i, toclear))
1057			mce_wrmsrl(msr_ops.status(i), 0);
1058	}
1059}
1060
1061static int do_memory_failure(struct mce *m)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1062{
1063	int flags = MF_ACTION_REQUIRED;
1064	int ret;
1065
1066	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
1067	if (!(m->mcgstatus & MCG_STATUS_RIPV))
1068		flags |= MF_MUST_KILL;
1069	ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
1070	if (ret)
1071		pr_err("Memory error not recovered");
1072	else
1073		mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
1074	return ret;
 
1075}
1076
1077#ifndef mce_unmap_kpfn
1078static void mce_unmap_kpfn(unsigned long pfn)
1079{
1080	unsigned long decoy_addr;
1081
1082	/*
1083	 * Unmap this page from the kernel 1:1 mappings to make sure
1084	 * we don't log more errors because of speculative access to
1085	 * the page.
1086	 * We would like to just call:
1087	 *	set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
1088	 * but doing that would radically increase the odds of a
1089	 * speculative access to the poison page because we'd have
1090	 * the virtual address of the kernel 1:1 mapping sitting
1091	 * around in registers.
1092	 * Instead we get tricky.  We create a non-canonical address
1093	 * that looks just like the one we want, but has bit 63 flipped.
1094	 * This relies on set_memory_np() not checking whether we passed
1095	 * a legal address.
1096	 */
1097
1098	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
1099
1100	if (set_memory_np(decoy_addr, 1))
1101		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
 
1102}
1103#endif
1104
1105/*
1106 * The actual machine check handler. This only handles real
1107 * exceptions when something got corrupted coming in through int 18.
1108 *
1109 * This is executed in NMI context not subject to normal locking rules. This
1110 * implies that most kernel services cannot be safely used. Don't even
1111 * think about putting a printk in there!
1112 *
1113 * On Intel systems this is entered on all CPUs in parallel through
1114 * MCE broadcast. However some CPUs might be broken beyond repair,
1115 * so be always careful when synchronizing with others.
1116 */
1117void do_machine_check(struct pt_regs *regs, long error_code)
1118{
1119	struct mca_config *cfg = &mca_cfg;
1120	struct mce m, *final;
1121	int i;
1122	int worst = 0;
1123	int severity;
1124
1125	/*
1126	 * Establish sequential order between the CPUs entering the machine
1127	 * check handler.
1128	 */
1129	int order = -1;
1130	/*
1131	 * If no_way_out gets set, there is no safe way to recover from this
1132	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1133	 */
1134	int no_way_out = 0;
1135	/*
1136	 * If kill_it gets set, there might be a way to recover from this
1137	 * error.
1138	 */
1139	int kill_it = 0;
1140	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1141	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1142	char *msg = "Unknown";
1143
1144	/*
1145	 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1146	 * on Intel.
1147	 */
1148	int lmce = 1;
1149	int cpu = smp_processor_id();
1150
1151	/*
1152	 * Cases where we avoid rendezvous handler timeout:
1153	 * 1) If this CPU is offline.
1154	 *
1155	 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1156	 *  skip those CPUs which remain looping in the 1st kernel - see
1157	 *  crash_nmi_callback().
1158	 *
1159	 * Note: there still is a small window between kexec-ing and the new,
1160	 * kdump kernel establishing a new #MC handler where a broadcasted MCE
1161	 * might not get handled properly.
1162	 */
1163	if (cpu_is_offline(cpu) ||
1164	    (crashing_cpu != -1 && crashing_cpu != cpu)) {
1165		u64 mcgstatus;
1166
1167		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1168		if (mcgstatus & MCG_STATUS_RIPV) {
1169			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1170			return;
1171		}
1172	}
1173
1174	ist_enter(regs);
1175
1176	this_cpu_inc(mce_exception_count);
1177
1178	if (!cfg->banks)
1179		goto out;
1180
1181	mce_gather_info(&m, regs);
1182	m.tsc = rdtsc();
1183
1184	final = this_cpu_ptr(&mces_seen);
1185	*final = m;
1186
1187	memset(valid_banks, 0, sizeof(valid_banks));
1188	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1189
1190	barrier();
1191
1192	/*
1193	 * When no restart IP might need to kill or panic.
1194	 * Assume the worst for now, but if we find the
1195	 * severity is MCE_AR_SEVERITY we have other options.
1196	 */
1197	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1198		kill_it = 1;
1199
1200	/*
1201	 * Check if this MCE is signaled to only this logical processor,
1202	 * on Intel only.
1203	 */
1204	if (m.cpuvendor == X86_VENDOR_INTEL)
1205		lmce = m.mcgstatus & MCG_STATUS_LMCES;
1206
1207	/*
1208	 * Go through all banks in exclusion of the other CPUs. This way we
1209	 * don't report duplicated events on shared banks because the first one
1210	 * to see it will clear it. If this is a Local MCE, then no need to
1211	 * perform rendezvous.
1212	 */
1213	if (!lmce)
1214		order = mce_start(&no_way_out);
1215
1216	for (i = 0; i < cfg->banks; i++) {
1217		__clear_bit(i, toclear);
1218		if (!test_bit(i, valid_banks))
1219			continue;
1220		if (!mce_banks[i].ctl)
1221			continue;
1222
1223		m.misc = 0;
1224		m.addr = 0;
1225		m.bank = i;
1226
1227		m.status = mce_rdmsrl(msr_ops.status(i));
1228		if ((m.status & MCI_STATUS_VAL) == 0)
1229			continue;
1230
1231		/*
1232		 * Non uncorrected or non signaled errors are handled by
1233		 * machine_check_poll. Leave them alone, unless this panics.
1234		 */
1235		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1236			!no_way_out)
1237			continue;
1238
1239		/*
1240		 * Set taint even when machine check was not enabled.
1241		 */
1242		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1243
1244		severity = mce_severity(&m, cfg->tolerant, NULL, true);
1245
1246		/*
1247		 * When machine check was for corrected/deferred handler don't
1248		 * touch, unless we're panicing.
1249		 */
1250		if ((severity == MCE_KEEP_SEVERITY ||
1251		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
1252			continue;
1253		__set_bit(i, toclear);
1254		if (severity == MCE_NO_SEVERITY) {
1255			/*
1256			 * Machine check event was not enabled. Clear, but
1257			 * ignore.
1258			 */
1259			continue;
1260		}
1261
1262		mce_read_aux(&m, i);
1263
1264		/* assuming valid severity level != 0 */
1265		m.severity = severity;
 
 
 
 
 
 
 
1266
1267		mce_log(&m);
1268
1269		if (severity > worst) {
1270			*final = m;
1271			worst = severity;
1272		}
1273	}
1274
1275	/* mce_clear_state will clear *final, save locally for use later */
1276	m = *final;
1277
1278	if (!no_way_out)
1279		mce_clear_state(toclear);
1280
1281	/*
1282	 * Do most of the synchronization with other CPUs.
1283	 * When there's any problem use only local no_way_out state.
1284	 */
1285	if (!lmce) {
1286		if (mce_end(order) < 0)
1287			no_way_out = worst >= MCE_PANIC_SEVERITY;
1288	} else {
1289		/*
1290		 * Local MCE skipped calling mce_reign()
1291		 * If we found a fatal error, we need to panic here.
1292		 */
1293		 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1294			mce_panic("Machine check from unknown source",
1295				NULL, NULL);
1296	}
1297
1298	/*
1299	 * If tolerant is at an insane level we drop requests to kill
1300	 * processes and continue even when there is no way out.
1301	 */
1302	if (cfg->tolerant == 3)
1303		kill_it = 0;
1304	else if (no_way_out)
1305		mce_panic("Fatal machine check on current CPU", &m, msg);
 
 
 
 
 
 
 
 
 
1306
1307	if (worst > 0)
1308		mce_report_event(regs);
1309	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1310out:
 
1311	sync_core();
1312
1313	if (worst != MCE_AR_SEVERITY && !kill_it)
1314		goto out_ist;
1315
1316	/* Fault was in user mode and we need to take some action */
1317	if ((m.cs & 3) == 3) {
1318		ist_begin_non_atomic(regs);
1319		local_irq_enable();
1320
1321		if (kill_it || do_memory_failure(&m))
1322			force_sig(SIGBUS, current);
1323		local_irq_disable();
1324		ist_end_non_atomic();
1325	} else {
1326		if (!fixup_exception(regs, X86_TRAP_MC))
1327			mce_panic("Failed kernel mode recovery", &m, NULL);
1328	}
1329
1330out_ist:
1331	ist_exit(regs);
1332}
1333EXPORT_SYMBOL_GPL(do_machine_check);
1334
1335#ifndef CONFIG_MEMORY_FAILURE
1336int memory_failure(unsigned long pfn, int flags)
1337{
1338	/* mce_severity() should not hand us an ACTION_REQUIRED error */
1339	BUG_ON(flags & MF_ACTION_REQUIRED);
1340	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1341	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1342	       pfn);
1343
1344	return 0;
1345}
1346#endif
1347
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
/* Upper bound of the polling period; multiplied by HZ wherever it is used */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;

/* Interval the per-CPU timer will be re-armed with next */
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
/* Per-CPU polling timer; mce_timer_fn() is installed as its callback */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
1357
/* Default interval hook: hand the requested interval back untouched. */
static unsigned long mce_adjust_timer_default(unsigned long interval)
{
	return interval;
}

/* Interval adjustment hook; starts out as the pass-through default. */
static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1364
1365static void __start_timer(struct timer_list *t, unsigned long interval)
 
 
 
 
 
1366{
1367	unsigned long when = jiffies + interval;
1368	unsigned long flags;
1369
1370	local_irq_save(flags);
 
 
1371
1372	if (!timer_pending(t) || time_before(when, t->expires))
1373		mod_timer(t, round_jiffies(when));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1374
1375	local_irq_restore(flags);
 
 
 
1376}
 
1377
1378static void mce_timer_fn(struct timer_list *t)
1379{
1380	struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1381	unsigned long iv;
 
 
1382
1383	WARN_ON(cpu_t != t);
 
1384
1385	iv = __this_cpu_read(mce_next_interval);
 
 
 
1386
1387	if (mce_available(this_cpu_ptr(&cpu_info))) {
1388		machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1389
1390		if (mce_intel_cmci_poll()) {
1391			iv = mce_adjust_timer(iv);
1392			goto done;
1393		}
1394	}
1395
1396	/*
1397	 * Alert userspace if needed. If we logged an MCE, reduce the polling
1398	 * interval, otherwise increase the polling interval.
1399	 */
 
1400	if (mce_notify_irq())
1401		iv = max(iv / 2, (unsigned long) HZ/100);
1402	else
1403		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1404
1405done:
1406	__this_cpu_write(mce_next_interval, iv);
1407	__start_timer(t, iv);
1408}
1409
1410/*
1411 * Ensure that the timer is firing in @interval from now.
1412 */
1413void mce_timer_kick(unsigned long interval)
1414{
1415	struct timer_list *t = this_cpu_ptr(&mce_timer);
1416	unsigned long iv = __this_cpu_read(mce_next_interval);
1417
1418	__start_timer(t, interval);
1419
1420	if (interval < iv)
1421		__this_cpu_write(mce_next_interval, interval);
1422}
1423
1424/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1425static void mce_timer_delete_all(void)
1426{
1427	int cpu;
1428
1429	for_each_online_cpu(cpu)
1430		del_timer_sync(&per_cpu(mce_timer, cpu));
1431}
1432
 
 
 
 
 
 
 
1433/*
1434 * Notify the user(s) about new machine check events.
1435 * Can be called from interrupt context, but not from machine check/NMI
1436 * context.
1437 */
1438int mce_notify_irq(void)
1439{
1440	/* Not more than two messages every minute */
1441	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1442
1443	if (test_and_clear_bit(0, &mce_need_notify)) {
1444		mce_work_trigger();
 
 
 
 
 
 
 
 
 
1445
1446		if (__ratelimit(&ratelimit))
1447			pr_info(HW_ERR "Machine check events logged\n");
1448
1449		return 1;
1450	}
1451	return 0;
1452}
1453EXPORT_SYMBOL_GPL(mce_notify_irq);
1454
1455static int __mcheck_cpu_mce_banks_init(void)
1456{
1457	int i;
1458	u8 num_banks = mca_cfg.banks;
1459
1460	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1461	if (!mce_banks)
1462		return -ENOMEM;
1463
1464	for (i = 0; i < num_banks; i++) {
1465		struct mce_bank *b = &mce_banks[i];
1466
1467		b->ctl = -1ULL;
1468		b->init = 1;
1469	}
1470	return 0;
1471}
1472
1473/*
1474 * Initialize Machine Checks for a CPU.
1475 */
1476static int __mcheck_cpu_cap_init(void)
1477{
1478	unsigned b;
1479	u64 cap;
1480
1481	rdmsrl(MSR_IA32_MCG_CAP, cap);
1482
1483	b = cap & MCG_BANKCNT_MASK;
1484	if (!mca_cfg.banks)
1485		pr_info("CPU supports %d MCE banks\n", b);
1486
1487	if (b > MAX_NR_BANKS) {
1488		pr_warn("Using only %u machine check banks out of %u\n",
 
1489			MAX_NR_BANKS, b);
1490		b = MAX_NR_BANKS;
1491	}
1492
1493	/* Don't support asymmetric configurations today */
1494	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1495	mca_cfg.banks = b;
1496
1497	if (!mce_banks) {
1498		int err = __mcheck_cpu_mce_banks_init();
1499
1500		if (err)
1501			return err;
1502	}
1503
1504	/* Use accurate RIP reporting if available. */
1505	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1506		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1507
1508	if (cap & MCG_SER_P)
1509		mca_cfg.ser = 1;
1510
1511	return 0;
1512}
1513
1514static void __mcheck_cpu_init_generic(void)
1515{
1516	enum mcp_flags m_fl = 0;
1517	mce_banks_t all_banks;
1518	u64 cap;
1519
1520	if (!mca_cfg.bootlog)
1521		m_fl = MCP_DONTLOG;
1522
1523	/*
1524	 * Log the machine checks left over from the previous reset.
1525	 */
1526	bitmap_fill(all_banks, MAX_NR_BANKS);
1527	machine_check_poll(MCP_UC | m_fl, &all_banks);
1528
1529	cr4_set_bits(X86_CR4_MCE);
1530
1531	rdmsrl(MSR_IA32_MCG_CAP, cap);
1532	if (cap & MCG_CTL_P)
1533		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1534}
1535
1536static void __mcheck_cpu_init_clear_banks(void)
1537{
1538	int i;
1539
1540	for (i = 0; i < mca_cfg.banks; i++) {
1541		struct mce_bank *b = &mce_banks[i];
1542
1543		if (!b->init)
1544			continue;
1545		wrmsrl(msr_ops.ctl(i), b->ctl);
1546		wrmsrl(msr_ops.status(i), 0);
1547	}
1548}
1549
1550/*
1551 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1552 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1553 * Vol 3B Table 15-20). But this confuses both the code that determines
1554 * whether the machine check occurred in kernel or user mode, and also
1555 * the severity assessment code. Pretend that EIPV was set, and take the
1556 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1557 */
1558static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1559{
1560	if (bank != 0)
1561		return;
1562	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1563		return;
1564	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1565		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1566			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1567			  MCACOD)) !=
1568			 (MCI_STATUS_UC|MCI_STATUS_EN|
1569			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1570			  MCI_STATUS_AR|MCACOD_INSTR))
1571		return;
1572
1573	m->mcgstatus |= MCG_STATUS_EIPV;
1574	m->ip = regs->ip;
1575	m->cs = regs->cs;
1576}
1577
1578/* Add per CPU specific workarounds here */
1579static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1580{
1581	struct mca_config *cfg = &mca_cfg;
1582
1583	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1584		pr_info("unknown CPU type - not enabling MCE support\n");
1585		return -EOPNOTSUPP;
1586	}
1587
1588	/* This should be disabled by the BIOS, but isn't always */
1589	if (c->x86_vendor == X86_VENDOR_AMD) {
1590		if (c->x86 == 15 && cfg->banks > 4) {
1591			/*
1592			 * disable GART TBL walk error reporting, which
1593			 * trips off incorrectly with the IOMMU & 3ware
1594			 * & Cerberus:
1595			 */
1596			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1597		}
1598		if (c->x86 < 0x11 && cfg->bootlog < 0) {
1599			/*
1600			 * Lots of broken BIOS around that don't clear them
1601			 * by default and leave crap in there. Don't log:
1602			 */
1603			cfg->bootlog = 0;
1604		}
1605		/*
1606		 * Various K7s with broken bank 0 around. Always disable
1607		 * by default.
1608		 */
1609		if (c->x86 == 6 && cfg->banks > 0)
1610			mce_banks[0].ctl = 0;
1611
1612		/*
1613		 * overflow_recov is supported for F15h Models 00h-0fh
1614		 * even though we don't have a CPUID bit for it.
1615		 */
1616		if (c->x86 == 0x15 && c->x86_model <= 0xf)
1617			mce_flags.overflow_recov = 1;
1618
1619		/*
1620		 * Turn off MC4_MISC thresholding banks on those models since
1621		 * they're not supported there.
1622		 */
1623		if (c->x86 == 0x15 &&
1624		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1625			int i;
1626			u64 hwcr;
1627			bool need_toggle;
1628			u32 msrs[] = {
1629				0x00000413, /* MC4_MISC0 */
1630				0xc0000408, /* MC4_MISC1 */
1631			};
1632
1633			rdmsrl(MSR_K7_HWCR, hwcr);
1634
1635			/* McStatusWrEn has to be set */
1636			need_toggle = !(hwcr & BIT(18));
1637
1638			if (need_toggle)
1639				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1640
1641			/* Clear CntP bit safely */
1642			for (i = 0; i < ARRAY_SIZE(msrs); i++)
1643				msr_clear_bit(msrs[i], 62);
1644
1645			/* restore old settings */
1646			if (need_toggle)
1647				wrmsrl(MSR_K7_HWCR, hwcr);
1648		}
 
 
 
 
 
 
1649	}
1650
1651	if (c->x86_vendor == X86_VENDOR_INTEL) {
1652		/*
1653		 * SDM documents that on family 6 bank 0 should not be written
1654		 * because it aliases to another special BIOS controlled
1655		 * register.
1656		 * But it's not aliased anymore on model 0x1a+
1657		 * Don't ignore bank 0 completely because there could be a
1658		 * valid event later, merely don't write CTL0.
1659		 */
1660
1661		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1662			mce_banks[0].init = 0;
1663
1664		/*
1665		 * All newer Intel systems support MCE broadcasting. Enable
1666		 * synchronization with a one second timeout.
1667		 */
1668		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1669			cfg->monarch_timeout < 0)
1670			cfg->monarch_timeout = USEC_PER_SEC;
1671
1672		/*
1673		 * There are also broken BIOSes on some Pentium M and
1674		 * earlier systems:
1675		 */
1676		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1677			cfg->bootlog = 0;
1678
1679		if (c->x86 == 6 && c->x86_model == 45)
1680			quirk_no_way_out = quirk_sandybridge_ifu;
1681	}
1682	if (cfg->monarch_timeout < 0)
1683		cfg->monarch_timeout = 0;
1684	if (cfg->bootlog != 0)
1685		cfg->panic_timeout = 30;
1686
1687	return 0;
1688}
1689
1690static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1691{
1692	if (c->x86 != 5)
1693		return 0;
1694
1695	switch (c->x86_vendor) {
1696	case X86_VENDOR_INTEL:
1697		intel_p5_mcheck_init(c);
1698		return 1;
1699		break;
1700	case X86_VENDOR_CENTAUR:
1701		winchip_mcheck_init(c);
1702		return 1;
1703		break;
1704	default:
1705		return 0;
1706	}
1707
1708	return 0;
1709}
1710
1711/*
1712 * Init basic CPU features needed for early decoding of MCEs.
1713 */
1714static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1715{
1716	if (c->x86_vendor == X86_VENDOR_AMD) {
1717		mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1718		mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
1719		mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
1720
1721		if (mce_flags.smca) {
1722			msr_ops.ctl	= smca_ctl_reg;
1723			msr_ops.status	= smca_status_reg;
1724			msr_ops.addr	= smca_addr_reg;
1725			msr_ops.misc	= smca_misc_reg;
1726		}
1727	}
1728}
1729
1730static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1731{
1732	switch (c->x86_vendor) {
1733	case X86_VENDOR_INTEL:
1734		mce_intel_feature_init(c);
1735		mce_adjust_timer = cmci_intel_adjust_timer;
1736		break;
1737
1738	case X86_VENDOR_AMD: {
1739		mce_amd_feature_init(c);
1740		break;
1741		}
1742
1743	default:
1744		break;
1745	}
1746}
1747
1748static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1749{
1750	switch (c->x86_vendor) {
1751	case X86_VENDOR_INTEL:
1752		mce_intel_feature_clear(c);
1753		break;
1754	default:
1755		break;
1756	}
1757}
1758
1759static void mce_start_timer(struct timer_list *t)
1760{
 
1761	unsigned long iv = check_interval * HZ;
1762
1763	if (mca_cfg.ignore_ce || !iv)
1764		return;
1765
1766	this_cpu_write(mce_next_interval, iv);
1767	__start_timer(t, iv);
1768}
1769
1770static void __mcheck_cpu_setup_timer(void)
1771{
1772	struct timer_list *t = this_cpu_ptr(&mce_timer);
1773
1774	timer_setup(t, mce_timer_fn, TIMER_PINNED);
1775}
1776
1777static void __mcheck_cpu_init_timer(void)
1778{
1779	struct timer_list *t = this_cpu_ptr(&mce_timer);
1780
1781	timer_setup(t, mce_timer_fn, TIMER_PINNED);
1782	mce_start_timer(t);
 
 
 
1783}
1784
/*
 * Handle unconfigured int18 (should never happen): just log which CPU
 * received it.
 */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	int cpu = smp_processor_id();

	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", cpu);
}
1791
/*
 * Call the installed machine check handler for this CPU setup.
 * Starts out as unexpected_machine_check(); mcheck_cpu_init() replaces
 * it with do_machine_check().
 */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;
1795
1796dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
1797{
1798	machine_check_vector(regs, error_code);
1799}
1800
1801/*
1802 * Called for each booted CPU to set up machine checks.
1803 * Must be called with preempt off:
1804 */
1805void mcheck_cpu_init(struct cpuinfo_x86 *c)
1806{
1807	if (mca_cfg.disabled)
1808		return;
1809
1810	if (__mcheck_cpu_ancient_init(c))
1811		return;
1812
1813	if (!mce_available(c))
1814		return;
1815
1816	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1817		mca_cfg.disabled = 1;
1818		return;
1819	}
1820
1821	if (mce_gen_pool_init()) {
1822		mca_cfg.disabled = 1;
1823		pr_emerg("Couldn't allocate MCE records pool!\n");
1824		return;
1825	}
1826
1827	machine_check_vector = do_machine_check;
1828
1829	__mcheck_cpu_init_early(c);
1830	__mcheck_cpu_init_generic();
1831	__mcheck_cpu_init_vendor(c);
1832	__mcheck_cpu_init_clear_banks();
1833	__mcheck_cpu_setup_timer();
 
1834}
1835
1836/*
1837 * Called for each booted CPU to clear some machine checks opt-ins
1838 */
1839void mcheck_cpu_clear(struct cpuinfo_x86 *c)
 
 
 
 
 
1840{
1841	if (mca_cfg.disabled)
1842		return;
1843
1844	if (!mce_available(c))
1845		return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1846
1847	/*
1848	 * Possibly to clear general settings generic to x86
1849	 * __mcheck_cpu_clear_generic(c);
1850	 */
1851	__mcheck_cpu_clear_vendor(c);
 
 
 
1852
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1853}
1854
1855static void __mce_disable_bank(void *arg)
1856{
1857	int bank = *((int *)arg);
1858	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1859	cmci_disable_bank(bank);
 
 
 
1860}
1861
1862void mce_disable_bank(int bank)
 
1863{
1864	if (bank >= mca_cfg.banks) {
1865		pr_warn(FW_BUG
1866			"Ignoring request to disable invalid MCA bank %d.\n",
1867			bank);
1868		return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1869	}
1870	set_bit(bank, mce_banks_ce_disabled);
1871	on_each_cpu(__mce_disable_bank, &bank, 1);
1872}
1873
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1874/*
1875 * mce=off Disables machine check
1876 * mce=no_cmci Disables CMCI
1877 * mce=no_lmce Disables LMCE
1878 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1879 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1880 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1881 *	monarchtimeout is how long to wait for other CPUs on machine
1882 *	check, or 0 to not wait
1883 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
1884	and older.
1885 * mce=nobootlog Don't log MCEs from before booting.
1886 * mce=bios_cmci_threshold Don't program the CMCI threshold
1887 * mce=recovery force enable memcpy_mcsafe()
1888 */
1889static int __init mcheck_enable(char *str)
1890{
1891	struct mca_config *cfg = &mca_cfg;
1892
1893	if (*str == 0) {
1894		enable_p5_mce();
1895		return 1;
1896	}
1897	if (*str == '=')
1898		str++;
1899	if (!strcmp(str, "off"))
1900		cfg->disabled = 1;
1901	else if (!strcmp(str, "no_cmci"))
1902		cfg->cmci_disabled = true;
1903	else if (!strcmp(str, "no_lmce"))
1904		cfg->lmce_disabled = 1;
1905	else if (!strcmp(str, "dont_log_ce"))
1906		cfg->dont_log_ce = true;
1907	else if (!strcmp(str, "ignore_ce"))
1908		cfg->ignore_ce = true;
1909	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1910		cfg->bootlog = (str[0] == 'b');
1911	else if (!strcmp(str, "bios_cmci_threshold"))
1912		cfg->bios_cmci_threshold = 1;
1913	else if (!strcmp(str, "recovery"))
1914		cfg->recovery = 1;
1915	else if (isdigit(str[0])) {
1916		if (get_option(&str, &cfg->tolerant) == 2)
1917			get_option(&str, &(cfg->monarch_timeout));
 
 
 
1918	} else {
1919		pr_info("mce argument %s ignored. Please use /sys\n", str);
 
1920		return 0;
1921	}
1922	return 1;
1923}
1924__setup("mce", mcheck_enable);
1925
1926int __init mcheck_init(void)
1927{
1928	mcheck_intel_therm_init();
1929	mce_register_decode_chain(&first_nb);
1930	mce_register_decode_chain(&mce_srao_nb);
1931	mce_register_decode_chain(&mce_default_nb);
1932	mcheck_vendor_init_severity();
1933
1934	INIT_WORK(&mce_work, mce_gen_pool_process);
1935	init_irq_work(&mce_irq_work, mce_irq_work_cb);
1936
1937	return 0;
1938}
1939
1940/*
1941 * mce_syscore: PM support
1942 */
1943
1944/*
1945 * Disable machine checks on suspend and shutdown. We can't really handle
1946 * them later.
1947 */
1948static void mce_disable_error_reporting(void)
1949{
1950	int i;
1951
1952	for (i = 0; i < mca_cfg.banks; i++) {
1953		struct mce_bank *b = &mce_banks[i];
1954
1955		if (b->init)
1956			wrmsrl(msr_ops.ctl(i), 0);
1957	}
1958	return;
1959}
1960
1961static void vendor_disable_error_reporting(void)
1962{
1963	/*
1964	 * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
1965	 * Disabling them for just a single offlined CPU is bad, since it will
1966	 * inhibit reporting for all shared resources on the socket like the
1967	 * last level cache (LLC), the integrated memory controller (iMC), etc.
1968	 */
1969	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
1970	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1971		return;
1972
1973	mce_disable_error_reporting();
1974}
1975
/* Syscore suspend hook: turn error reporting off where allowed. */
static int mce_syscore_suspend(void)
{
	vendor_disable_error_reporting();

	return 0;
}
1981
/* Syscore shutdown hook: turn error reporting off where allowed. */
static void mce_syscore_shutdown(void)
{
	vendor_disable_error_reporting();
}
1986
1987/*
1988 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1989 * Only one CPU is active at this time, the others get re-added later using
1990 * CPU hotplug:
1991 */
1992static void mce_syscore_resume(void)
1993{
1994	__mcheck_cpu_init_generic();
1995	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
1996	__mcheck_cpu_init_clear_banks();
1997}
1998
/* Power management callbacks: suspend, shutdown and resume hooks */
static struct syscore_ops mce_syscore_ops = {
	.suspend	= mce_syscore_suspend,
	.shutdown	= mce_syscore_shutdown,
	.resume		= mce_syscore_resume,
};
2004
2005/*
2006 * mce_device: Sysfs support
2007 */
2008
2009static void mce_cpu_restart(void *data)
2010{
2011	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2012		return;
2013	__mcheck_cpu_init_generic();
2014	__mcheck_cpu_init_clear_banks();
2015	__mcheck_cpu_init_timer();
2016}
2017
2018/* Reinit MCEs after user configuration changes */
2019static void mce_restart(void)
2020{
2021	mce_timer_delete_all();
2022	on_each_cpu(mce_cpu_restart, NULL, 1);
2023}
2024
2025/* Toggle features for corrected errors */
2026static void mce_disable_cmci(void *data)
2027{
2028	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2029		return;
2030	cmci_clear();
2031}
2032
2033static void mce_enable_ce(void *all)
2034{
2035	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2036		return;
2037	cmci_reenable();
2038	cmci_recheck();
2039	if (all)
2040		__mcheck_cpu_init_timer();
2041}
2042
/* Bus that the per-CPU machinecheck devices are attached to */
static struct bus_type mce_subsys = {
	.name		= "machinecheck",
	.dev_name	= "machinecheck",
};

/* Per-CPU device pointer; NULL until mce_device_create() has succeeded */
DEFINE_PER_CPU(struct device *, mce_device);
2049
 
 
 
2050static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2051{
2052	return container_of(attr, struct mce_bank, attr);
2053}
2054
2055static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2056			 char *buf)
2057{
2058	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2059}
2060
2061static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2062			const char *buf, size_t size)
2063{
2064	u64 new;
2065
2066	if (kstrtou64(buf, 0, &new) < 0)
2067		return -EINVAL;
2068
2069	attr_to_bank(attr)->ctl = new;
2070	mce_restart();
2071
2072	return size;
2073}
2074
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2075static ssize_t set_ignore_ce(struct device *s,
2076			     struct device_attribute *attr,
2077			     const char *buf, size_t size)
2078{
2079	u64 new;
2080
2081	if (kstrtou64(buf, 0, &new) < 0)
2082		return -EINVAL;
2083
2084	mutex_lock(&mce_sysfs_mutex);
2085	if (mca_cfg.ignore_ce ^ !!new) {
2086		if (new) {
2087			/* disable ce features */
2088			mce_timer_delete_all();
2089			on_each_cpu(mce_disable_cmci, NULL, 1);
2090			mca_cfg.ignore_ce = true;
2091		} else {
2092			/* enable ce features */
2093			mca_cfg.ignore_ce = false;
2094			on_each_cpu(mce_enable_ce, (void *)1, 1);
2095		}
2096	}
2097	mutex_unlock(&mce_sysfs_mutex);
2098
2099	return size;
2100}
2101
2102static ssize_t set_cmci_disabled(struct device *s,
2103				 struct device_attribute *attr,
2104				 const char *buf, size_t size)
2105{
2106	u64 new;
2107
2108	if (kstrtou64(buf, 0, &new) < 0)
2109		return -EINVAL;
2110
2111	mutex_lock(&mce_sysfs_mutex);
2112	if (mca_cfg.cmci_disabled ^ !!new) {
2113		if (new) {
2114			/* disable cmci */
2115			on_each_cpu(mce_disable_cmci, NULL, 1);
2116			mca_cfg.cmci_disabled = true;
2117		} else {
2118			/* enable cmci */
2119			mca_cfg.cmci_disabled = false;
2120			on_each_cpu(mce_enable_ce, NULL, 1);
2121		}
2122	}
2123	mutex_unlock(&mce_sysfs_mutex);
2124
2125	return size;
2126}
2127
2128static ssize_t store_int_with_restart(struct device *s,
2129				      struct device_attribute *attr,
2130				      const char *buf, size_t size)
2131{
2132	unsigned long old_check_interval = check_interval;
2133	ssize_t ret = device_store_ulong(s, attr, buf, size);
2134
2135	if (check_interval == old_check_interval)
2136		return ret;
2137
2138	if (check_interval < 1)
2139		check_interval = 1;
2140
2141	mutex_lock(&mce_sysfs_mutex);
2142	mce_restart();
2143	mutex_unlock(&mce_sysfs_mutex);
2144
2145	return ret;
2146}
2147
/* Plain attributes backed directly by mca_cfg fields */
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

/*
 * Attributes with a custom store handler.
 * NOTE(review): check_interval is an unsigned long but is shown through
 * device_show_int - confirm this is intended.
 */
static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
	&mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
	&mca_cfg.cmci_disabled
};

/* NULL-terminated list of the attribute files created for every device */
static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
#ifdef CONFIG_X86_MCELOG_LEGACY
	&dev_attr_trigger,
#endif
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};

/* CPUs whose device was fully set up by mce_device_create() */
static cpumask_var_t mce_device_initialized;
2181
/* Device release callback: free the device allocated in mce_device_create(). */
static void mce_device_release(struct device *dev)
{
	kfree(dev);
}
2186
2187/* Per cpu device init. All of the cpus still share the same ctrl bank: */
2188static int mce_device_create(unsigned int cpu)
2189{
2190	struct device *dev;
2191	int err;
2192	int i, j;
2193
2194	if (!mce_available(&boot_cpu_data))
2195		return -EIO;
2196
2197	dev = per_cpu(mce_device, cpu);
2198	if (dev)
2199		return 0;
2200
2201	dev = kzalloc(sizeof *dev, GFP_KERNEL);
2202	if (!dev)
2203		return -ENOMEM;
2204	dev->id  = cpu;
2205	dev->bus = &mce_subsys;
2206	dev->release = &mce_device_release;
2207
2208	err = device_register(dev);
2209	if (err) {
2210		put_device(dev);
2211		return err;
2212	}
2213
2214	for (i = 0; mce_device_attrs[i]; i++) {
2215		err = device_create_file(dev, mce_device_attrs[i]);
2216		if (err)
2217			goto error;
2218	}
2219	for (j = 0; j < mca_cfg.banks; j++) {
2220		err = device_create_file(dev, &mce_banks[j].attr);
2221		if (err)
2222			goto error2;
2223	}
2224	cpumask_set_cpu(cpu, mce_device_initialized);
2225	per_cpu(mce_device, cpu) = dev;
2226
2227	return 0;
2228error2:
2229	while (--j >= 0)
2230		device_remove_file(dev, &mce_banks[j].attr);
2231error:
2232	while (--i >= 0)
2233		device_remove_file(dev, mce_device_attrs[i]);
2234
2235	device_unregister(dev);
2236
2237	return err;
2238}
2239
2240static void mce_device_remove(unsigned int cpu)
2241{
2242	struct device *dev = per_cpu(mce_device, cpu);
2243	int i;
2244
2245	if (!cpumask_test_cpu(cpu, mce_device_initialized))
2246		return;
2247
2248	for (i = 0; mce_device_attrs[i]; i++)
2249		device_remove_file(dev, mce_device_attrs[i]);
2250
2251	for (i = 0; i < mca_cfg.banks; i++)
2252		device_remove_file(dev, &mce_banks[i].attr);
2253
2254	device_unregister(dev);
2255	cpumask_clear_cpu(cpu, mce_device_initialized);
2256	per_cpu(mce_device, cpu) = NULL;
2257}
2258
2259/* Make sure there are no machine checks on offlined CPUs. */
2260static void mce_disable_cpu(void)
2261{
2262	if (!mce_available(raw_cpu_ptr(&cpu_info)))
 
 
 
2263		return;
2264
2265	if (!cpuhp_tasks_frozen)
2266		cmci_clear();
 
 
2267
2268	vendor_disable_error_reporting();
 
 
2269}
2270
2271static void mce_reenable_cpu(void)
2272{
 
2273	int i;
2274
2275	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2276		return;
2277
2278	if (!cpuhp_tasks_frozen)
2279		cmci_reenable();
2280	for (i = 0; i < mca_cfg.banks; i++) {
2281		struct mce_bank *b = &mce_banks[i];
2282
2283		if (b->init)
2284			wrmsrl(msr_ops.ctl(i), b->ctl);
2285	}
2286}
2287
2288static int mce_cpu_dead(unsigned int cpu)
2289{
2290	mce_intel_hcpu_update(cpu);
2291
2292	/* intentionally ignoring frozen here */
2293	if (!cpuhp_tasks_frozen)
2294		cmci_rediscover();
2295	return 0;
2296}
2297
2298static int mce_cpu_online(unsigned int cpu)
2299{
2300	struct timer_list *t = this_cpu_ptr(&mce_timer);
2301	int ret;
2302
2303	mce_device_create(cpu);
2304
2305	ret = mce_threshold_create_device(cpu);
2306	if (ret) {
 
 
 
 
 
 
 
2307		mce_device_remove(cpu);
2308		return ret;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2309	}
2310	mce_reenable_cpu();
2311	mce_start_timer(t);
2312	return 0;
2313}
2314
2315static int mce_cpu_pre_down(unsigned int cpu)
2316{
2317	struct timer_list *t = this_cpu_ptr(&mce_timer);
2318
2319	mce_disable_cpu();
2320	del_timer_sync(t);
2321	mce_threshold_remove_device(cpu);
2322	mce_device_remove(cpu);
2323	return 0;
2324}
2325
2326static __init void mce_init_banks(void)
2327{
2328	int i;
2329
2330	for (i = 0; i < mca_cfg.banks; i++) {
2331		struct mce_bank *b = &mce_banks[i];
2332		struct device_attribute *a = &b->attr;
2333
2334		sysfs_attr_init(&a->attr);
2335		a->attr.name	= b->attrname;
2336		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2337
2338		a->attr.mode	= 0644;
2339		a->show		= show_bank;
2340		a->store	= set_bank;
2341	}
2342}
2343
2344static __init int mcheck_init_device(void)
2345{
2346	int err;
 
2347
2348	/*
2349	 * Check if we have a spare virtual bit. This will only become
2350	 * a problem if/when we move beyond 5-level page tables.
2351	 */
2352	MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2353
2354	if (!mce_available(&boot_cpu_data)) {
2355		err = -EIO;
2356		goto err_out;
2357	}
2358
2359	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2360		err = -ENOMEM;
2361		goto err_out;
2362	}
2363
2364	mce_init_banks();
2365
2366	err = subsys_system_register(&mce_subsys, NULL);
2367	if (err)
2368		goto err_out_mem;
2369
2370	err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2371				mce_cpu_dead);
2372	if (err)
2373		goto err_out_mem;
2374
2375	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2376				mce_cpu_online, mce_cpu_pre_down);
2377	if (err < 0)
2378		goto err_out_online;
 
2379
2380	register_syscore_ops(&mce_syscore_ops);
 
2381
2382	return 0;
2383
2384err_out_online:
2385	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2386
2387err_out_mem:
2388	free_cpumask_var(mce_device_initialized);
2389
2390err_out:
2391	pr_err("Unable to init MCE device (rc: %d)\n", err);
2392
2393	return err;
2394}
2395device_initcall_sync(mcheck_init_device);
2396
2397/*
2398 * Old style boot options parsing. Only for compatibility.
2399 */
2400static int __init mcheck_disable(char *str)
2401{
2402	mca_cfg.disabled = 1;
2403	return 1;
2404}
2405__setup("nomce", mcheck_disable);
2406
2407#ifdef CONFIG_DEBUG_FS
2408struct dentry *mce_get_debugfs_dir(void)
2409{
2410	static struct dentry *dmce;
2411
2412	if (!dmce)
2413		dmce = debugfs_create_dir("mce", NULL);
2414
2415	return dmce;
2416}
2417
2418static void mce_reset(void)
2419{
2420	cpu_missing = 0;
2421	atomic_set(&mce_fake_panicked, 0);
2422	atomic_set(&mce_executing, 0);
2423	atomic_set(&mce_callin, 0);
2424	atomic_set(&global_nwo, 0);
2425}
2426
2427static int fake_panic_get(void *data, u64 *val)
2428{
2429	*val = fake_panic;
2430	return 0;
2431}
2432
2433static int fake_panic_set(void *data, u64 val)
2434{
2435	mce_reset();
2436	fake_panic = val;
2437	return 0;
2438}
2439
2440DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2441			fake_panic_set, "%llu\n");
2442
2443static int __init mcheck_debugfs_init(void)
2444{
2445	struct dentry *dmce, *ffake_panic;
2446
2447	dmce = mce_get_debugfs_dir();
2448	if (!dmce)
2449		return -ENOMEM;
2450	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2451					  &fake_panic_fops);
2452	if (!ffake_panic)
2453		return -ENOMEM;
2454
2455	return 0;
2456}
2457#else
/* Stub for builds without CONFIG_DEBUG_FS: creates nothing and reports -EINVAL. */
2458static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2459#endif
2460
2461DEFINE_STATIC_KEY_FALSE(mcsafe_key);
2462EXPORT_SYMBOL_GPL(mcsafe_key);
2463
2464static int __init mcheck_late_init(void)
2465{
2466	if (mca_cfg.recovery)
2467		static_branch_inc(&mcsafe_key);
2468
2469	mcheck_debugfs_init();
2470	cec_init();
2471
2472	/*
2473	 * Flush out everything that has been logged during early boot, now that
2474	 * everything has been initialized (workqueues, decoders, ...).
2475	 */
2476	mce_schedule_work();
2477
2478	return 0;
2479}
2480late_initcall(mcheck_late_init);

   1/*
   2 * Machine check handler.
   3 *
   4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5 * Rest from unknown author(s).
   6 * 2004 Andi Kleen. Rewrote most of it.
   7 * Copyright 2008 Intel Corporation
   8 * Author: Andi Kleen
   9 */
 
 
 
  10#include <linux/thread_info.h>
  11#include <linux/capability.h>
  12#include <linux/miscdevice.h>
  13#include <linux/ratelimit.h>
  14#include <linux/kallsyms.h>
  15#include <linux/rcupdate.h>
  16#include <linux/kobject.h>
  17#include <linux/uaccess.h>
  18#include <linux/kdebug.h>
  19#include <linux/kernel.h>
  20#include <linux/percpu.h>
  21#include <linux/string.h>
  22#include <linux/device.h>
  23#include <linux/syscore_ops.h>
  24#include <linux/delay.h>
  25#include <linux/ctype.h>
  26#include <linux/sched.h>
  27#include <linux/sysfs.h>
  28#include <linux/types.h>
  29#include <linux/slab.h>
  30#include <linux/init.h>
  31#include <linux/kmod.h>
  32#include <linux/poll.h>
  33#include <linux/nmi.h>
  34#include <linux/cpu.h>
 
  35#include <linux/smp.h>
  36#include <linux/fs.h>
  37#include <linux/mm.h>
  38#include <linux/debugfs.h>
  39#include <linux/irq_work.h>
  40#include <linux/export.h>
 
  41
 
  42#include <asm/processor.h>
 
 
  43#include <asm/mce.h>
  44#include <asm/msr.h>
 
 
  45
  46#include "mce-internal.h"
  47
  48static DEFINE_MUTEX(mce_chrdev_read_mutex);
  49
  50#define rcu_dereference_check_mce(p) \
  51	rcu_dereference_index_check((p), \
  52			      rcu_read_lock_sched_held() || \
  53			      lockdep_is_held(&mce_chrdev_read_mutex))
  54
  55#define CREATE_TRACE_POINTS
  56#include <trace/events/mce.h>
  57
  58int mce_disabled __read_mostly;
  59
  60#define MISC_MCELOG_MINOR	227
  61
  62#define SPINUNIT 100	/* 100ns */
  63
  64atomic_t mce_entry;
  65
  66DEFINE_PER_CPU(unsigned, mce_exception_count);
  67
  68/*
  69 * Tolerant levels:
  70 *   0: always panic on uncorrected errors, log corrected errors
  71 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
  72 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
  73 *   3: never panic or SIGBUS, log all errors (for testing only)
  74 */
  75static int			tolerant		__read_mostly = 1;
  76static int			banks			__read_mostly;
  77static int			rip_msr			__read_mostly;
  78static int			mce_bootlog		__read_mostly = -1;
  79static int			monarch_timeout		__read_mostly = -1;
  80static int			mce_panic_timeout	__read_mostly;
  81static int			mce_dont_log_ce		__read_mostly;
  82int				mce_cmci_disabled	__read_mostly;
  83int				mce_ignore_ce		__read_mostly;
  84int				mce_ser			__read_mostly;
  85
  86struct mce_bank                *mce_banks		__read_mostly;
  87
  88/* User mode helper program triggered by machine check event */
  89static unsigned long		mce_need_notify;
  90static char			mce_helper[128];
  91static char			*mce_helper_argv[2] = { mce_helper, NULL };
  92
  93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 
 
 
 
 
 
 
 
 
 
 
  94
  95static DEFINE_PER_CPU(struct mce, mces_seen);
  96static int			cpu_missing;
 
  97
  98/* MCA banks polled by the period polling timer for corrected events */
 
 
 
  99DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 100	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 101};
 102
 103static DEFINE_PER_CPU(struct work_struct, mce_work);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 104
 105/*
 106 * CPU/chipset specific EDAC code can register a notifier call here to print
 107 * MCE errors in a human-readable form.
 108 */
 109ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 110
 111/* Do initial initialization of a struct mce */
 112void mce_setup(struct mce *m)
 113{
 114	memset(m, 0, sizeof(struct mce));
 115	m->cpu = m->extcpu = smp_processor_id();
 116	rdtscll(m->tsc);
 117	/* We hope get_seconds stays lockless */
 118	m->time = get_seconds();
 119	m->cpuvendor = boot_cpu_data.x86_vendor;
 120	m->cpuid = cpuid_eax(1);
 121	m->socketid = cpu_data(m->extcpu).phys_proc_id;
 122	m->apicid = cpu_data(m->extcpu).initial_apicid;
 123	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 
 
 
 
 
 124}
 125
 126DEFINE_PER_CPU(struct mce, injectm);
 127EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 129/*
 130 * Lockless MCE logging infrastructure.
 131 * This avoids deadlocks on printk locks without having to break locks. Also
 132 * separate MCEs from kernel messages to avoid bogus bug reports.
 133 */
 
 
 134
 135static struct mce_log mcelog = {
 136	.signature	= MCE_LOG_SIGNATURE,
 137	.len		= MCE_LOG_LEN,
 138	.recordlen	= sizeof(struct mce),
 139};
 140
/*
 * Record one machine check event.
 *
 * Emits the trace record and runs the decoder notifier chain first; if a
 * notifier returns NOTIFY_STOP the event is not stored in mcelog at all.
 * Otherwise a free slot is claimed without taking a lock (cmpxchg on
 * mcelog.next), *mce is copied into it and the slot is marked finished.
 * If no slot below MCE_LOG_LEN is free, MCE_OVERFLOW is set in mcelog.flags
 * and the event is dropped.
 *
 * On the successful path the caller's mce->finished ends up as 1 and bit 0
 * of mce_need_notify is set.
 */
 141void mce_log(struct mce *mce)
 142{
 143	unsigned next, entry;
 144	int ret = 0;
 145
 146	/* Emit the trace record: */
 147	trace_mce_record(mce);
 148
 149	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
 150	if (ret == NOTIFY_STOP)
 151		return;
 152
	/* The copy made below must not look complete until it really is. */
 153	mce->finished = 0;
 154	wmb();
 155	for (;;) {
 156		entry = rcu_dereference_check_mce(mcelog.next);
 157		for (;;) {
 158
 159			/*
 160			 * When the buffer fills up discard new entries.
 161			 * Assume that the earlier errors are the more
 162			 * interesting ones:
 163			 */
 164			if (entry >= MCE_LOG_LEN) {
 165				set_bit(MCE_OVERFLOW,
 166					(unsigned long *)&mcelog.flags);
 167				return;
 168			}
 169			/* Old left over entry. Skip: */
 170			if (mcelog.entry[entry].finished) {
 171				entry++;
 172				continue;
 173			}
 174			break;
 175		}
 176		smp_rmb();
 177		next = entry + 1;
		/*
		 * Claim slot 'entry' by moving mcelog.next past it. If
		 * mcelog.next no longer holds 'entry', start over from the
		 * current value.
		 */
 178		if (cmpxchg(&mcelog.next, entry, next) == entry)
 179			break;
 180	}
 181	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 182	wmb();
	/* Publish the slot only after its contents have been written. */
 183	mcelog.entry[entry].finished = 1;
 184	wmb();
 185
 186	mce->finished = 1;
 187	set_bit(0, &mce_need_notify);
 188}
 
 189
/*
 * Feed every record currently in mcelog to the decoder notifier chain and
 * then empty the buffer.
 *
 * Works in rounds: records [prev, next) are handed to the chain and zeroed,
 * then mcelog.next is reset to 0 with cmpxchg. If mcelog.next moved in the
 * meantime, the cmpxchg fails and the newly added range is processed in
 * another round.
 */
 190static void drain_mcelog_buffer(void)
 191{
 192	unsigned int next, i, prev = 0;
 193
 194	next = ACCESS_ONCE(mcelog.next);
 
 
 195
 196	do {
 197		struct mce *m;
 
 
 198
 199		/* drain what was logged during boot */
 200		for (i = prev; i < next; i++) {
 201			unsigned long start = jiffies;
 202			unsigned retries = 1;
 203
 204			m = &mcelog.entry[i];
 
 
 
 205
			/*
			 * Wait for the record to be marked finished; 'retries'
			 * grows as jiffies advance and the wait is abandoned
			 * once it reaches 4.
			 */
 206			while (!m->finished) {
 207				if (time_after_eq(jiffies, start + 2*retries))
 208					retries++;
 
 209
 210				cpu_relax();
 
 
 
 211
 212				if (!m->finished && retries >= 4) {
 213					pr_err("MCE: skipping error being logged currently!\n");
 214					break;
 215				}
 216			}
 217			smp_rmb();
			/*
			 * NOTE(review): this call is also reached after the
			 * "skipping" break above, so an unfinished record is
			 * still passed to the chain - confirm this is intended.
			 */
 218			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 219		}
 220
 221		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
 222		prev = next;
		/* Reset the write index only if it is still where this round stopped. */
 223		next = cmpxchg(&mcelog.next, prev, 0);
 224	} while (next != prev);
 225}
 226
 227
 228void mce_register_decode_chain(struct notifier_block *nb)
 229{
 230	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
 231	drain_mcelog_buffer();
 232}
 233EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 234
 235void mce_unregister_decode_chain(struct notifier_block *nb)
 236{
 237	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 238}
 239EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 240
 241static void print_mce(struct mce *m)
 
 
 
 
 
 
 
 242{
 243	int ret = 0;
 244
 245	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 246	       m->extcpu, m->mcgstatus, m->bank, m->status);
 247
 248	if (m->ip) {
 249		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 250			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 251				m->cs, m->ip);
 252
 253		if (m->cs == __KERNEL_CS)
 254			print_symbol("{%s}", m->ip);
 255		pr_cont("\n");
 256	}
 257
 258	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 259	if (m->addr)
 260		pr_cont("ADDR %llx ", m->addr);
 261	if (m->misc)
 262		pr_cont("MISC %llx ", m->misc);
 263
 
 
 
 
 
 
 
 264	pr_cont("\n");
 265	/*
 266	 * Note this output is parsed by external tools and old fields
 267	 * should not be changed.
 268	 */
 269	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 270		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 271		cpu_data(m->extcpu).microcode);
 
 272
 273	/*
 274	 * Print out human-readable details about the MCE error,
 275	 * (if the CPU has an implementation for that)
 276	 */
 277	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 278	if (ret == NOTIFY_STOP)
 279		return;
 280
 281	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 
 282}
 283
 284#define PANIC_TIMEOUT 5 /* 5 seconds */
 285
 286static atomic_t mce_paniced;
 287
 288static int fake_panic;
 289static atomic_t mce_fake_paniced;
 290
 291/* Panic in progress. Enable interrupts and wait for final IPI */
 292static void wait_for_panic(void)
 293{
 294	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 295
 296	preempt_disable();
 297	local_irq_enable();
 298	while (timeout-- > 0)
 299		udelay(1);
 300	if (panic_timeout == 0)
 301		panic_timeout = mce_panic_timeout;
 302	panic("Panicing machine check CPU died");
 303}
 304
 305static void mce_panic(char *msg, struct mce *final, char *exp)
 306{
 307	int i, apei_err = 0;
 
 
 308
 309	if (!fake_panic) {
 310		/*
 311		 * Make sure only one CPU runs in machine check panic
 312		 */
 313		if (atomic_inc_return(&mce_paniced) > 1)
 314			wait_for_panic();
 315		barrier();
 316
 317		bust_spinlocks(1);
 318		console_verbose();
 319	} else {
 320		/* Don't log too much for fake panic */
 321		if (atomic_inc_return(&mce_fake_paniced) > 1)
 322			return;
 323	}
 
 324	/* First print corrected ones that are still unlogged */
 325	for (i = 0; i < MCE_LOG_LEN; i++) {
 326		struct mce *m = &mcelog.entry[i];
 327		if (!(m->status & MCI_STATUS_VAL))
 328			continue;
 329		if (!(m->status & MCI_STATUS_UC)) {
 330			print_mce(m);
 331			if (!apei_err)
 332				apei_err = apei_write_mce(m);
 333		}
 334	}
 335	/* Now print uncorrected but with the final one last */
 336	for (i = 0; i < MCE_LOG_LEN; i++) {
 337		struct mce *m = &mcelog.entry[i];
 338		if (!(m->status & MCI_STATUS_VAL))
 339			continue;
 340		if (!(m->status & MCI_STATUS_UC))
 341			continue;
 342		if (!final || memcmp(m, final, sizeof(struct mce))) {
 343			print_mce(m);
 344			if (!apei_err)
 345				apei_err = apei_write_mce(m);
 346		}
 347	}
 348	if (final) {
 349		print_mce(final);
 350		if (!apei_err)
 351			apei_err = apei_write_mce(final);
 352	}
 353	if (cpu_missing)
 354		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 355	if (exp)
 356		pr_emerg(HW_ERR "Machine check: %s\n", exp);
 357	if (!fake_panic) {
 358		if (panic_timeout == 0)
 359			panic_timeout = mce_panic_timeout;
 360		panic(msg);
 361	} else
 362		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 363}
 364
 365/* Support code for software error injection */
 366
 367static int msr_to_offset(u32 msr)
 368{
 369	unsigned bank = __this_cpu_read(injectm.bank);
 370
 371	if (msr == rip_msr)
 372		return offsetof(struct mce, ip);
 373	if (msr == MSR_IA32_MCx_STATUS(bank))
 374		return offsetof(struct mce, status);
 375	if (msr == MSR_IA32_MCx_ADDR(bank))
 376		return offsetof(struct mce, addr);
 377	if (msr == MSR_IA32_MCx_MISC(bank))
 378		return offsetof(struct mce, misc);
 379	if (msr == MSR_IA32_MCG_STATUS)
 380		return offsetof(struct mce, mcgstatus);
 381	return -1;
 382}
 383
 384/* MSR access wrappers used for error injection */
 385static u64 mce_rdmsrl(u32 msr)
 386{
 387	u64 v;
 388
 389	if (__this_cpu_read(injectm.finished)) {
 390		int offset = msr_to_offset(msr);
 391
 392		if (offset < 0)
 393			return 0;
 394		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
 395	}
 396
 397	if (rdmsrl_safe(msr, &v)) {
 398		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 399		/*
 400		 * Return zero in case the access faulted. This should
 401		 * not happen normally but can happen if the CPU does
 402		 * something weird, or if the code is buggy.
 403		 */
 404		v = 0;
 405	}
 406
 407	return v;
 408}
 409
 410static void mce_wrmsrl(u32 msr, u64 v)
 411{
 412	if (__this_cpu_read(injectm.finished)) {
 413		int offset = msr_to_offset(msr);
 414
 415		if (offset >= 0)
 416			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
 417		return;
 418	}
 419	wrmsrl(msr, v);
 420}
 421
 422/*
 423 * Collect all global (w.r.t. this processor) status about this machine
 424 * check into our "mce" struct so that we can use it later to assess
 425 * the severity of the problem as we read per-bank specific details.
 426 */
 427static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 428{
 429	mce_setup(m);
 430
 431	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 432	if (regs) {
 433		/*
 434		 * Get the address of the instruction at the time of
 435		 * the machine check error.
 436		 */
 437		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 438			m->ip = regs->ip;
 439			m->cs = regs->cs;
 440
 441			/*
 442			 * When in VM86 mode make the cs look like ring 3
 443			 * always. This is a lie, but it's better than passing
 444			 * the additional vm86 bit around everywhere.
 445			 */
 446			if (v8086_mode(regs))
 447				m->cs |= 3;
 448		}
 449		/* Use accurate RIP reporting if available. */
 450		if (rip_msr)
 451			m->ip = mce_rdmsrl(rip_msr);
 452	}
 453}
 454
 455/*
 456 * Simple lockless ring to communicate PFNs from the exception handler with the
 457 * process context work function. This is vastly simplified because there's
 458 * only a single reader and a single writer.
 459 */
 460#define MCE_RING_SIZE 16	/* we use one entry less */
 461
/* Ring of PFNs; start == end means the ring is empty. */
 462struct mce_ring {
	/* Index of the next entry to read; advanced by mce_ring_get(). */
 463	unsigned short start;
	/* Index of the next free slot; advanced by mce_ring_add(). */
 464	unsigned short end;
	/* PFN storage, indexed modulo MCE_RING_SIZE. */
 465	unsigned long ring[MCE_RING_SIZE];
 466};
 467static DEFINE_PER_CPU(struct mce_ring, mce_ring);
 468
 469/* Runs with CPU affinity in workqueue */
 470static int mce_ring_empty(void)
 471{
 472	struct mce_ring *r = &__get_cpu_var(mce_ring);
 473
 474	return r->start == r->end;
 475}
 476
 477static int mce_ring_get(unsigned long *pfn)
 478{
 479	struct mce_ring *r;
 480	int ret = 0;
 481
 482	*pfn = 0;
 483	get_cpu();
 484	r = &__get_cpu_var(mce_ring);
 485	if (r->start == r->end)
 486		goto out;
 487	*pfn = r->ring[r->start];
 488	r->start = (r->start + 1) % MCE_RING_SIZE;
 489	ret = 1;
 490out:
 491	put_cpu();
 492	return ret;
 493}
 494
 495/* Always runs in MCE context with preempt off */
 496static int mce_ring_add(unsigned long pfn)
 497{
 498	struct mce_ring *r = &__get_cpu_var(mce_ring);
 499	unsigned next;
 500
 501	next = (r->end + 1) % MCE_RING_SIZE;
 502	if (next == r->start)
 503		return -1;
 504	r->ring[r->end] = pfn;
 505	wmb();
 506	r->end = next;
 507	return 0;
 508}
 509
 510int mce_available(struct cpuinfo_x86 *c)
 511{
 512	if (mce_disabled)
 513		return 0;
 514	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 515}
 516
 517static void mce_schedule_work(void)
 518{
 519	if (!mce_ring_empty()) {
 520		struct work_struct *work = &__get_cpu_var(mce_work);
 521		if (!work_pending(work))
 522			schedule_work(work);
 523	}
 524}
 525
 526DEFINE_PER_CPU(struct irq_work, mce_irq_work);
 527
 528static void mce_irq_work_cb(struct irq_work *entry)
 529{
 530	mce_notify_irq();
 531	mce_schedule_work();
 532}
 533
 534static void mce_report_event(struct pt_regs *regs)
 535{
 536	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 537		mce_notify_irq();
 538		/*
 539		 * Triggering the work queue here is just an insurance
 540		 * policy in case the syscall exit notify handler
 541		 * doesn't run soon enough or ends up running on the
 542		 * wrong CPU (can happen when audit sleeps)
 543		 */
 544		mce_schedule_work();
 545		return;
 546	}
 547
 548	irq_work_queue(&__get_cpu_var(mce_irq_work));
 549}
 550
 551/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 552 * Read ADDR and MISC registers.
 553 */
 554static void mce_read_aux(struct mce *m, int i)
 555{
 556	if (m->status & MCI_STATUS_MISCV)
 557		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
 
 558	if (m->status & MCI_STATUS_ADDRV) {
 559		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
 560
 561		/*
 562		 * Mask the reported address by the reported granularity.
 563		 */
 564		if (mce_ser && (m->status & MCI_STATUS_MISCV)) {
 565			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 566			m->addr >>= shift;
 567			m->addr <<= shift;
 568		}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 569	}
 570}
 571
 572DEFINE_PER_CPU(unsigned, mce_poll_count);
 573
 574/*
 575 * Poll for corrected events or events that happened before reset.
 576 * Those are just logged through /dev/mcelog.
 577 *
 578 * This is executed in standard interrupt context.
 579 *
 580 * Note: spec recommends to panic for fatal unsignalled
 581 * errors here. However this would be quite problematic --
 582 * we would need to reimplement the Monarch handling and
 583 * it would mess up the exclusion between exception handler
 584 * and poll handler -- so we skip this for now.
 585 * These cases should not happen anyways, or only when the CPU
 586 * is already totally confused. In this case it's likely it will
 587 * not fully execute the machine check handler either.
 588 */
 589void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 590{
 
 591	struct mce m;
 592	int i;
 593
 594	this_cpu_inc(mce_poll_count);
 595
 596	mce_gather_info(&m, NULL);
 597
 598	for (i = 0; i < banks; i++) {
 
 
 
 599		if (!mce_banks[i].ctl || !test_bit(i, *b))
 600			continue;
 601
 602		m.misc = 0;
 603		m.addr = 0;
 604		m.bank = i;
 605		m.tsc = 0;
 606
 607		barrier();
 608		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 609		if (!(m.status & MCI_STATUS_VAL))
 610			continue;
 611
 612		/*
 613		 * Uncorrected or signalled events are handled by the exception
 614		 * handler when it is enabled, so don't process those here.
 615		 *
 616		 * TBD do the same check for MCI_STATUS_EN here?
 617		 */
 618		if (!(flags & MCP_UC) &&
 619		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 620			continue;
 621
 
 
 622		mce_read_aux(&m, i);
 623
 624		if (!(flags & MCP_TIMESTAMP))
 625			m.tsc = 0;
 626		/*
 627		 * Don't get the IP here because it's unlikely to
 628		 * have anything to do with the actual error location.
 629		 */
 630		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
 631			mce_log(&m);
 
 
 
 
 
 
 
 
 
 632
 633		/*
 634		 * Clear state for this bank.
 635		 */
 636		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 637	}
 638
 639	/*
 640	 * Don't clear MCG_STATUS here because it's only defined for
 641	 * exceptions.
 642	 */
 643
 644	sync_core();
 
 
 645}
 646EXPORT_SYMBOL_GPL(machine_check_poll);
 647
 648/*
 649 * Do a quick check if any of the events requires a panic.
 650 * This decides if we keep the events around or clear them.
 651 */
 652static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
 
 653{
 654	int i, ret = 0;
 
 655
 656	for (i = 0; i < banks; i++) {
 657		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 658		if (m->status & MCI_STATUS_VAL)
 659			__set_bit(i, validp);
 660		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
 
 
 
 
 
 661			ret = 1;
 
 662	}
 663	return ret;
 664}
 665
 666/*
 667 * Variable to establish order between CPUs while scanning.
 668 * Each CPU spins initially until executing is equal its number.
 669 */
 670static atomic_t mce_executing;
 671
 672/*
 673 * Defines order of CPUs on entry. First CPU becomes Monarch.
 674 */
 675static atomic_t mce_callin;
 676
 677/*
 678 * Check if a timeout waiting for other CPUs happened.
 679 */
 680static int mce_timed_out(u64 *t)
 681{
 682	/*
 683	 * The others already did panic for some reason.
 684	 * Bail out like in a timeout.
 685	 * rmb() to tell the compiler that system_state
 686	 * might have been modified by someone else.
 687	 */
 688	rmb();
 689	if (atomic_read(&mce_paniced))
 690		wait_for_panic();
 691	if (!monarch_timeout)
 692		goto out;
 693	if ((s64)*t < SPINUNIT) {
 694		/* CHECKME: Make panic default for 1 too? */
 695		if (tolerant < 1)
 696			mce_panic("Timeout synchronizing machine check over CPUs",
 697				  NULL, NULL);
 698		cpu_missing = 1;
 699		return 1;
 700	}
 701	*t -= SPINUNIT;
 702out:
 703	touch_nmi_watchdog();
 704	return 0;
 705}
 706
 707/*
 708 * The Monarch's reign.  The Monarch is the CPU who entered
 709 * the machine check handler first. It waits for the others to
 710 * raise the exception too and then grades them. When any
 711 * error is fatal panic. Only then let the others continue.
 712 *
 713 * The other CPUs entering the MCE handler will be controlled by the
 714 * Monarch. They are called Subjects.
 715 *
 716 * This way we prevent any potential data corruption in an unrecoverable
 717 * case and also make sure that all CPUs' errors are always examined.
 718 *
 719 * Also this detects the case of a machine check event coming from outer
 720 * space (not detected by any CPUs) In this case some external agent wants
 721 * us to shut down, so panic too.
 722 *
 723 * The other CPUs might still decide to panic if the handler happens
 724 * in an unrecoverable place, but in this case the system is in a semi-stable
 725 * state and won't corrupt anything by itself. It's ok to let the others
 726 * continue for a bit first.
 727 *
 728 * All the spin loops have timeouts; when a timeout happens a CPU
 729 * typically elects itself to be Monarch.
 730 */
 731static void mce_reign(void)
 732{
 733	int cpu;
 734	struct mce *m = NULL;
 735	int global_worst = 0;
 736	char *msg = NULL;
 737	char *nmsg = NULL;
 738
 739	/*
 740	 * This CPU is the Monarch and the other CPUs have run
 741	 * through their handlers.
 742	 * Grade the severity of the errors of all the CPUs.
 743	 */
 744	for_each_possible_cpu(cpu) {
 745		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
 746					    &nmsg);
 
 747		if (severity > global_worst) {
 748			msg = nmsg;
 749			global_worst = severity;
 750			m = &per_cpu(mces_seen, cpu);
 751		}
 752	}
 753
 754	/*
 755	 * Cannot recover? Panic here then.
 756	 * This dumps all the mces in the log buffer and stops the
 757	 * other CPUs.
 758	 */
 759	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
 760		mce_panic("Fatal Machine check", m, msg);
 761
 762	/*
 763	 * For UC somewhere we let the CPU who detects it handle it.
 764	 * Also must let continue the others, otherwise the handling
 765	 * CPU could deadlock on a lock.
 766	 */
 767
 768	/*
 769	 * No machine check event found. Must be some external
 770	 * source or one CPU is hung. Panic.
 771	 */
 772	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
 773		mce_panic("Machine check from unknown source", NULL, NULL);
 774
 775	/*
 776	 * Now clear all the mces_seen so that they don't reappear on
 777	 * the next mce.
 778	 */
 779	for_each_possible_cpu(cpu)
 780		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 781}
 782
 783static atomic_t global_nwo;
 784
 785/*
 786 * Start of Monarch synchronization. This waits until all CPUs have
 787 * entered the exception handler and then determines if any of them
 788 * saw a fatal event that requires panic. Then it executes them
 789 * in the entry order.
 790 * TBD double check parallel CPU hotunplug
 791 */
/*
 * @no_way_out: in: this CPU's own verdict, added into global_nwo;
 *              out: the value of global_nwo read once this CPU's turn has
 *              come.
 *
 * Returns this CPU's call-in order (1 for the first CPU to call in, which
 * acts as Monarch), or -1 when monarch_timeout is 0 or a wait timed out.
 * On a timeout global_nwo is reset to 0 before returning.
 */
 792static int mce_start(int *no_way_out)
 793{
 794	int order;
 795	int cpus = num_online_cpus();
 796	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
 797
 798	if (!timeout)
 799		return -1;
 800
 801	atomic_add(*no_way_out, &global_nwo);
 802	/*
 803	 * global_nwo should be updated before mce_callin
 
 804	 */
 805	smp_wmb();
 806	order = atomic_inc_return(&mce_callin);
 807
 808	/*
 809	 * Wait for everyone.
 810	 */
 811	while (atomic_read(&mce_callin) != cpus) {
 812		if (mce_timed_out(&timeout)) {
 
 813			atomic_set(&global_nwo, 0);
 814			return -1;
 815		}
 816		ndelay(SPINUNIT);
 817	}
 818
 819	/*
 820	 * mce_callin should be read before global_nwo
 821	 */
 822	smp_rmb();
 823
 824	if (order == 1) {
 825		/*
 826		 * Monarch: Starts executing now, the others wait.
 827		 */
 828		atomic_set(&mce_executing, 1);
 829	} else {
 830		/*
 831		 * Subject: Now start the scanning loop one by one in
 832		 * the original callin order.
 833		 * This way when there are any shared banks it will be
 834		 * only seen by one CPU before cleared, avoiding duplicates.
 835		 */
 836		while (atomic_read(&mce_executing) < order) {
 837			if (mce_timed_out(&timeout)) {
 
 838				atomic_set(&global_nwo, 0);
 839				return -1;
 840			}
 841			ndelay(SPINUNIT);
 842		}
 843	}
 844
 845	/*
 846	 * Cache the global no_way_out state.
 847	 */
 848	*no_way_out = atomic_read(&global_nwo);
 849
 850	return order;
 851}
 852
 853/*
 854 * Synchronize between CPUs after main scanning loop.
 855 * This invokes the bulk of the Monarch processing.
 856 */
/*
 * @order: this CPU's call-in order (presumably the value mce_start()
 *         returned - confirm against the caller); a negative value skips
 *         the synchronization.
 *
 * Returns 0 when the synchronization completed: for the Monarch (order 1)
 * after mce_reign() has run, for a Subject once mce_executing has dropped
 * back to 0. Returns -1 when monarch_timeout is 0, @order is negative, or a
 * wait timed out.
 *
 * Every path except the Subject's successful return resets global_nwo,
 * mce_callin and mce_executing to 0.
 */
 857static int mce_end(int order)
 858{
 859	int ret = -1;
 860	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
 861
 862	if (!timeout)
 863		goto reset;
 864	if (order < 0)
 865		goto reset;
 866
 867	/*
 868	 * Allow others to run.
 869	 */
 870	atomic_inc(&mce_executing);
 871
 872	if (order == 1) {
 873		/* CHECKME: Can this race with a parallel hotplug? */
 874		int cpus = num_online_cpus();
 875
 876		/*
 877		 * Monarch: Wait for everyone to go through their scanning
 878		 * loops.
 879		 */
 880		while (atomic_read(&mce_executing) <= cpus) {
 881			if (mce_timed_out(&timeout))
 
 882				goto reset;
 883			ndelay(SPINUNIT);
 884		}
 885
 886		mce_reign();
 887		barrier();
 888		ret = 0;
 889	} else {
 890		/*
 891		 * Subject: Wait for Monarch to finish.
 892		 */
 893		while (atomic_read(&mce_executing) != 0) {
 894			if (mce_timed_out(&timeout))
 
 895				goto reset;
 896			ndelay(SPINUNIT);
 897		}
 898
 899		/*
 900		 * Don't reset anything. That's done by the Monarch.
 901		 */
 902		return 0;
 903	}
 904
 905	/*
 906	 * Reset all global state.
 907	 */
 908reset:
 909	atomic_set(&global_nwo, 0);
 910	atomic_set(&mce_callin, 0);
 911	barrier();
 912
 913	/*
 914	 * Let others run again.
 915	 */
 916	atomic_set(&mce_executing, 0);
 917	return ret;
 918}
 919
 920/*
 921 * Check if the address reported by the CPU is in a format we can parse.
 922 * It would be possible to add code for most other cases, but all would
 923 * be somewhat complicated (e.g. segment offset would require an instruction
 924 * parser). So only support physical addresses up to page granularity for now.
 925 */
 926static int mce_usable_address(struct mce *m)
 927{
 928	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 929		return 0;
 930	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 931		return 0;
 932	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 933		return 0;
 934	return 1;
 935}
 936
 937static void mce_clear_state(unsigned long *toclear)
 938{
 939	int i;
 940
 941	for (i = 0; i < banks; i++) {
 942		if (test_bit(i, toclear))
 943			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 944	}
 945}
 946
 947/*
 948 * Need to save faulting physical address associated with a process
 949 * in the machine check handler some place where we can grab it back
 950 * later in mce_notify_process()
 951 */
 952#define	MCE_INFO_MAX	16
 953
 954struct mce_info {
 955	atomic_t		inuse;
 956	struct task_struct	*t;
 957	__u64			paddr;
 958	int			restartable;
 959} mce_info[MCE_INFO_MAX];
 960
 961static void mce_save_info(__u64 addr, int c)
 962{
 963	struct mce_info *mi;
 
 964
 965	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
 966		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
 967			mi->t = current;
 968			mi->paddr = addr;
 969			mi->restartable = c;
 970			return;
 971		}
 972	}
 973
 974	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
 975}
 976
 977static struct mce_info *mce_find_info(void)
 
 978{
 979	struct mce_info *mi;
 980
 981	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
 982		if (atomic_read(&mi->inuse) && mi->t == current)
 983			return mi;
 984	return NULL;
 985}
 
 
 
 
 
 
 
 
 
 
 
 
 986
 987static void mce_clear_info(struct mce_info *mi)
 988{
 989	atomic_set(&mi->inuse, 0);
 990}
 
 991
 992/*
 993 * The actual machine check handler. This only handles real
 994 * exceptions when something got corrupted coming in through int 18.
 995 *
 996 * This is executed in NMI context not subject to normal locking rules. This
 997 * implies that most kernel services cannot be safely used. Don't even
 998 * think about putting a printk in there!
 999 *
1000 * On Intel systems this is entered on all CPUs in parallel through
1001 * MCE broadcast. However some CPUs might be broken beyond repair,
1002 * so be always careful when synchronizing with others.
1003 */
1004void do_machine_check(struct pt_regs *regs, long error_code)
1005{
 
1006	struct mce m, *final;
1007	int i;
1008	int worst = 0;
1009	int severity;
 
1010	/*
1011	 * Establish sequential order between the CPUs entering the machine
1012	 * check handler.
1013	 */
1014	int order;
1015	/*
1016	 * If no_way_out gets set, there is no safe way to recover from this
1017	 * MCE.  If tolerant is cranked up, we'll try anyway.
1018	 */
1019	int no_way_out = 0;
1020	/*
1021	 * If kill_it gets set, there might be a way to recover from this
1022	 * error.
1023	 */
1024	int kill_it = 0;
1025	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1026	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1027	char *msg = "Unknown";
1028
1029	atomic_inc(&mce_entry);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1030
1031	this_cpu_inc(mce_exception_count);
1032
1033	if (!banks)
1034		goto out;
1035
1036	mce_gather_info(&m, regs);
 
1037
1038	final = &__get_cpu_var(mces_seen);
1039	*final = m;
1040
1041	memset(valid_banks, 0, sizeof(valid_banks));
1042	no_way_out = mce_no_way_out(&m, &msg, valid_banks);
1043
1044	barrier();
1045
1046	/*
1047	 * When no restart IP might need to kill or panic.
1048	 * Assume the worst for now, but if we find the
1049	 * severity is MCE_AR_SEVERITY we have other options.
1050	 */
1051	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1052		kill_it = 1;
1053
1054	/*
1055	 * Go through all the banks in exclusion of the other CPUs.
1056	 * This way we don't report duplicated events on shared banks
1057	 * because the first one to see it will clear it.
 
 
 
 
 
 
 
 
1058	 */
1059	order = mce_start(&no_way_out);
1060	for (i = 0; i < banks; i++) {
 
 
1061		__clear_bit(i, toclear);
1062		if (!test_bit(i, valid_banks))
1063			continue;
1064		if (!mce_banks[i].ctl)
1065			continue;
1066
1067		m.misc = 0;
1068		m.addr = 0;
1069		m.bank = i;
1070
1071		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1072		if ((m.status & MCI_STATUS_VAL) == 0)
1073			continue;
1074
1075		/*
1076		 * Non uncorrected or non signaled errors are handled by
1077		 * machine_check_poll. Leave them alone, unless this panics.
1078		 */
1079		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1080			!no_way_out)
1081			continue;
1082
1083		/*
1084		 * Set taint even when machine check was not enabled.
1085		 */
1086		add_taint(TAINT_MACHINE_CHECK);
1087
1088		severity = mce_severity(&m, tolerant, NULL);
1089
1090		/*
1091		 * When machine check was for corrected handler don't touch,
1092		 * unless we're panicing.
1093		 */
1094		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
 
1095			continue;
1096		__set_bit(i, toclear);
1097		if (severity == MCE_NO_SEVERITY) {
1098			/*
1099			 * Machine check event was not enabled. Clear, but
1100			 * ignore.
1101			 */
1102			continue;
1103		}
1104
1105		mce_read_aux(&m, i);
1106
1107		/*
1108		 * Action optional error. Queue address for later processing.
1109		 * When the ring overflows we just ignore the AO error.
1110		 * RED-PEN add some logging mechanism when
1111		 * usable_address or mce_add_ring fails.
1112		 * RED-PEN don't ignore overflow for tolerant == 0
1113		 */
1114		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1115			mce_ring_add(m.addr >> PAGE_SHIFT);
1116
1117		mce_log(&m);
1118
1119		if (severity > worst) {
1120			*final = m;
1121			worst = severity;
1122		}
1123	}
1124
1125	/* mce_clear_state will clear *final, save locally for use later */
1126	m = *final;
1127
1128	if (!no_way_out)
1129		mce_clear_state(toclear);
1130
1131	/*
1132	 * Do most of the synchronization with other CPUs.
1133	 * When there's any problem use only local no_way_out state.
1134	 */
1135	if (mce_end(order) < 0)
1136		no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 
 
 
 
 
 
 
 
 
1137
1138	/*
1139	 * At insane "tolerant" levels we take no action. Otherwise
1140	 * we only die if we have no other choice. For less serious
1141	 * issues we try to recover, or limit damage to the current
1142	 * process.
1143	 */
1144	if (tolerant < 3) {
1145		if (no_way_out)
1146			mce_panic("Fatal machine check on current CPU", &m, msg);
1147		if (worst == MCE_AR_SEVERITY) {
1148			/* schedule action before return to userland */
1149			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
1150			set_thread_flag(TIF_MCE_NOTIFY);
1151		} else if (kill_it) {
1152			force_sig(SIGBUS, current);
1153		}
1154	}
1155
1156	if (worst > 0)
1157		mce_report_event(regs);
1158	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1159out:
1160	atomic_dec(&mce_entry);
1161	sync_core();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1162}
1163EXPORT_SYMBOL_GPL(do_machine_check);
1164
1165#ifndef CONFIG_MEMORY_FAILURE
1166int memory_failure(unsigned long pfn, int vector, int flags)
1167{
1168	/* mce_severity() should not hand us an ACTION_REQUIRED error */
1169	BUG_ON(flags & MF_ACTION_REQUIRED);
1170	printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
1171		"Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
 
1172
1173	return 0;
1174}
1175#endif
1176
1177/*
1178 * Called in process context that interrupted by MCE and marked with
1179 * TIF_MCE_NOTIFY, just before returning to erroneous userland.
1180 * This code is allowed to sleep.
1181 * Attempt possible recovery such as calling the high level VM handler to
1182 * process any corrupted pages, and kill/signal current process if required.
1183 * Action required errors are handled here.
1184 */
1185void mce_notify_process(void)
1186{
1187	unsigned long pfn;
1188	struct mce_info *mi = mce_find_info();
1189	int flags = MF_ACTION_REQUIRED;
1190
1191	if (!mi)
1192		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1193	pfn = mi->paddr >> PAGE_SHIFT;
1194
1195	clear_thread_flag(TIF_MCE_NOTIFY);
 
 
 
1196
1197	pr_err("Uncorrected hardware memory error in user-access at %llx",
1198		 mi->paddr);
1199	/*
1200	 * We must call memory_failure() here even if the current process is
1201	 * doomed. We still need to mark the page as poisoned and alert any
1202	 * other users of the page.
1203	 */
1204	if (!mi->restartable)
1205		flags |= MF_MUST_KILL;
1206	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1207		pr_err("Memory error not recovered");
1208		force_sig(SIGBUS, current);
1209	}
1210	mce_clear_info(mi);
1211}
1212
1213/*
1214 * Action optional processing happens here (picking up
1215 * from the list of faulting pages that do_machine_check()
1216 * placed into the "ring").
1217 */
1218static void mce_process_work(struct work_struct *dummy)
1219{
1220	unsigned long pfn;
 
1221
1222	while (mce_ring_get(&pfn))
1223		memory_failure(pfn, MCE_VECTOR, 0);
1224}
1225
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * To be called by the thermal interrupt once the event has been processed
 * and it has been decided that the event should be logged further.
 *
 * @status is stored in the 'status' field of 'struct mce'; historically it
 * has been the register value of the MSR_IA32_THERMAL_STATUS (Intel) msr.
 * The record is tagged with bank MCE_THERMAL_BANK.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce record;

	mce_setup(&record);
	record.bank = MCE_THERMAL_BANK;
	record.status = status;

	mce_log(&record);
}
#endif /* CONFIG_X86_MCE_INTEL */
1250
1251/*
1252 * Periodic polling timer for "silent" machine check errors.  If the
1253 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1254 * errors, poll 2x slower (up to check_interval seconds).
1255 */
1256static unsigned long check_interval = 5 * 60; /* 5 minutes */
1257
1258static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1259static DEFINE_PER_CPU(struct timer_list, mce_timer);
1260
1261static void mce_timer_fn(unsigned long data)
1262{
1263	struct timer_list *t = &__get_cpu_var(mce_timer);
1264	unsigned long iv;
1265
1266	WARN_ON(smp_processor_id() != data);
 
1267
1268	if (mce_available(__this_cpu_ptr(&cpu_info))) {
1269		machine_check_poll(MCP_TIMESTAMP,
1270				&__get_cpu_var(mce_poll_banks));
 
1271	}
1272
1273	/*
1274	 * Alert userspace if needed.  If we logged an MCE, reduce the
1275	 * polling interval, otherwise increase the polling interval.
1276	 */
1277	iv = __this_cpu_read(mce_next_interval);
1278	if (mce_notify_irq())
1279		iv = max(iv / 2, (unsigned long) HZ/100);
1280	else
1281		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
 
 
1282	__this_cpu_write(mce_next_interval, iv);
 
 
1283
1284	t->expires = jiffies + iv;
1285	add_timer_on(t, smp_processor_id());
 
 
 
 
 
 
 
 
 
 
1286}
1287
1288/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1289static void mce_timer_delete_all(void)
1290{
1291	int cpu;
1292
1293	for_each_online_cpu(cpu)
1294		del_timer_sync(&per_cpu(mce_timer, cpu));
1295}
1296
1297static void mce_do_trigger(struct work_struct *work)
1298{
1299	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1300}
1301
1302static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1303
1304/*
1305 * Notify the user(s) about new machine check events.
1306 * Can be called from interrupt context, but not from machine check/NMI
1307 * context.
1308 */
1309int mce_notify_irq(void)
1310{
1311	/* Not more than two messages every minute */
1312	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1313
1314	if (test_and_clear_bit(0, &mce_need_notify)) {
1315		/* wake processes polling /dev/mcelog */
1316		wake_up_interruptible(&mce_chrdev_wait);
1317
1318		/*
1319		 * There is no risk of missing notifications because
1320		 * work_pending is always cleared before the function is
1321		 * executed.
1322		 */
1323		if (mce_helper[0] && !work_pending(&mce_trigger_work))
1324			schedule_work(&mce_trigger_work);
1325
1326		if (__ratelimit(&ratelimit))
1327			pr_info(HW_ERR "Machine check events logged\n");
1328
1329		return 1;
1330	}
1331	return 0;
1332}
1333EXPORT_SYMBOL_GPL(mce_notify_irq);
1334
1335static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1336{
1337	int i;
 
1338
1339	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1340	if (!mce_banks)
1341		return -ENOMEM;
1342	for (i = 0; i < banks; i++) {
 
1343		struct mce_bank *b = &mce_banks[i];
1344
1345		b->ctl = -1ULL;
1346		b->init = 1;
1347	}
1348	return 0;
1349}
1350
1351/*
1352 * Initialize Machine Checks for a CPU.
1353 */
1354static int __cpuinit __mcheck_cpu_cap_init(void)
1355{
1356	unsigned b;
1357	u64 cap;
1358
1359	rdmsrl(MSR_IA32_MCG_CAP, cap);
1360
1361	b = cap & MCG_BANKCNT_MASK;
1362	if (!banks)
1363		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1364
1365	if (b > MAX_NR_BANKS) {
1366		printk(KERN_WARNING
1367		       "MCE: Using only %u machine check banks out of %u\n",
1368			MAX_NR_BANKS, b);
1369		b = MAX_NR_BANKS;
1370	}
1371
1372	/* Don't support asymmetric configurations today */
1373	WARN_ON(banks != 0 && b != banks);
1374	banks = b;
 
1375	if (!mce_banks) {
1376		int err = __mcheck_cpu_mce_banks_init();
1377
1378		if (err)
1379			return err;
1380	}
1381
1382	/* Use accurate RIP reporting if available. */
1383	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1384		rip_msr = MSR_IA32_MCG_EIP;
1385
1386	if (cap & MCG_SER_P)
1387		mce_ser = 1;
1388
1389	return 0;
1390}
1391
1392static void __mcheck_cpu_init_generic(void)
1393{
 
1394	mce_banks_t all_banks;
1395	u64 cap;
1396	int i;
 
 
1397
1398	/*
1399	 * Log the machine checks left over from the previous reset.
1400	 */
1401	bitmap_fill(all_banks, MAX_NR_BANKS);
1402	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1403
1404	set_in_cr4(X86_CR4_MCE);
1405
1406	rdmsrl(MSR_IA32_MCG_CAP, cap);
1407	if (cap & MCG_CTL_P)
1408		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
1409
1410	for (i = 0; i < banks; i++) {
 
 
 
 
1411		struct mce_bank *b = &mce_banks[i];
1412
1413		if (!b->init)
1414			continue;
1415		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1416		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1417	}
1418}
1419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1420/* Add per CPU specific workarounds here */
1421static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1422{
 
 
1423	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1424		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
1425		return -EOPNOTSUPP;
1426	}
1427
1428	/* This should be disabled by the BIOS, but isn't always */
1429	if (c->x86_vendor == X86_VENDOR_AMD) {
1430		if (c->x86 == 15 && banks > 4) {
1431			/*
1432			 * disable GART TBL walk error reporting, which
1433			 * trips off incorrectly with the IOMMU & 3ware
1434			 * & Cerberus:
1435			 */
1436			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1437		}
1438		if (c->x86 <= 17 && mce_bootlog < 0) {
1439			/*
1440			 * Lots of broken BIOS around that don't clear them
1441			 * by default and leave crap in there. Don't log:
1442			 */
1443			mce_bootlog = 0;
1444		}
1445		/*
1446		 * Various K7s with broken bank 0 around. Always disable
1447		 * by default.
1448		 */
1449		 if (c->x86 == 6 && banks > 0)
1450			mce_banks[0].ctl = 0;
1451
1452		 /*
1453		  * Turn off MC4_MISC thresholding banks on those models since
1454		  * they're not supported there.
1455		  */
1456		 if (c->x86 == 0x15 &&
1457		     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1458			 int i;
1459			 u64 val, hwcr;
1460			 bool need_toggle;
1461			 u32 msrs[] = {
 
 
 
 
 
 
 
1462				0x00000413, /* MC4_MISC0 */
1463				0xc0000408, /* MC4_MISC1 */
1464			 };
1465
1466			 rdmsrl(MSR_K7_HWCR, hwcr);
1467
1468			 /* McStatusWrEn has to be set */
1469			 need_toggle = !(hwcr & BIT(18));
1470
1471			 if (need_toggle)
1472				 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1473
1474			 for (i = 0; i < ARRAY_SIZE(msrs); i++) {
1475				 rdmsrl(msrs[i], val);
1476
1477				 /* CntP bit set? */
1478				 if (val & BIT_64(62)) {
1479					val &= ~BIT_64(62);
1480					wrmsrl(msrs[i], val);
1481				 }
1482			 }
1483
1484			 /* restore old settings */
1485			 if (need_toggle)
1486				 wrmsrl(MSR_K7_HWCR, hwcr);
1487		 }
1488	}
1489
1490	if (c->x86_vendor == X86_VENDOR_INTEL) {
1491		/*
1492		 * SDM documents that on family 6 bank 0 should not be written
1493		 * because it aliases to another special BIOS controlled
1494		 * register.
1495		 * But it's not aliased anymore on model 0x1a+
1496		 * Don't ignore bank 0 completely because there could be a
1497		 * valid event later, merely don't write CTL0.
1498		 */
1499
1500		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1501			mce_banks[0].init = 0;
1502
1503		/*
1504		 * All newer Intel systems support MCE broadcasting. Enable
1505		 * synchronization with a one second timeout.
1506		 */
1507		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1508			monarch_timeout < 0)
1509			monarch_timeout = USEC_PER_SEC;
1510
1511		/*
1512		 * There are also broken BIOSes on some Pentium M and
1513		 * earlier systems:
1514		 */
1515		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1516			mce_bootlog = 0;
 
 
 
1517	}
1518	if (monarch_timeout < 0)
1519		monarch_timeout = 0;
1520	if (mce_bootlog != 0)
1521		mce_panic_timeout = 30;
1522
1523	return 0;
1524}
1525
1526static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1527{
1528	if (c->x86 != 5)
1529		return 0;
1530
1531	switch (c->x86_vendor) {
1532	case X86_VENDOR_INTEL:
1533		intel_p5_mcheck_init(c);
1534		return 1;
1535		break;
1536	case X86_VENDOR_CENTAUR:
1537		winchip_mcheck_init(c);
1538		return 1;
1539		break;
 
 
1540	}
1541
1542	return 0;
1543}
1544
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1545static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1546{
1547	switch (c->x86_vendor) {
1548	case X86_VENDOR_INTEL:
1549		mce_intel_feature_init(c);
 
1550		break;
1551	case X86_VENDOR_AMD:
 
1552		mce_amd_feature_init(c);
1553		break;
 
 
 
 
 
 
 
 
 
 
 
 
 
1554	default:
1555		break;
1556	}
1557}
1558
1559static void __mcheck_cpu_init_timer(void)
1560{
1561	struct timer_list *t = &__get_cpu_var(mce_timer);
1562	unsigned long iv = check_interval * HZ;
1563
1564	setup_timer(t, mce_timer_fn, smp_processor_id());
 
 
 
 
 
 
 
 
 
 
 
 
1565
1566	if (mce_ignore_ce)
1567		return;
 
1568
1569	__this_cpu_write(mce_next_interval, iv);
1570	if (!iv)
1571		return;
1572	t->expires = round_jiffies(jiffies + iv);
1573	add_timer_on(t, smp_processor_id());
1574}
1575
1576/* Handle unconfigured int18 (should never happen) */
1577static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1578{
1579	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1580	       smp_processor_id());
1581}
1582
1583/* Call the installed machine check handler for this CPU setup. */
1584void (*machine_check_vector)(struct pt_regs *, long error_code) =
1585						unexpected_machine_check;
1586
 
 
 
 
 
1587/*
1588 * Called for each booted CPU to set up machine checks.
1589 * Must be called with preempt off:
1590 */
1591void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1592{
1593	if (mce_disabled)
1594		return;
1595
1596	if (__mcheck_cpu_ancient_init(c))
1597		return;
1598
1599	if (!mce_available(c))
1600		return;
1601
1602	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1603		mce_disabled = 1;
 
 
 
 
 
 
1604		return;
1605	}
1606
1607	machine_check_vector = do_machine_check;
1608
 
1609	__mcheck_cpu_init_generic();
1610	__mcheck_cpu_init_vendor(c);
1611	__mcheck_cpu_init_timer();
1612	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1613	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1614}
1615
1616/*
1617 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1618 */
1619
/* Protects the two open-state variables below */
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
/* Number of times the device is currently open */
static int mce_chrdev_open_count;
/* Non-zero while the device is held open exclusively */
static int mce_chrdev_open_exclu;
1623
1624static int mce_chrdev_open(struct inode *inode, struct file *file)
1625{
1626	spin_lock(&mce_chrdev_state_lock);
 
1627
1628	if (mce_chrdev_open_exclu ||
1629	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1630		spin_unlock(&mce_chrdev_state_lock);
1631
1632		return -EBUSY;
1633	}
1634
1635	if (file->f_flags & O_EXCL)
1636		mce_chrdev_open_exclu = 1;
1637	mce_chrdev_open_count++;
1638
1639	spin_unlock(&mce_chrdev_state_lock);
1640
1641	return nonseekable_open(inode, file);
1642}
1643
1644static int mce_chrdev_release(struct inode *inode, struct file *file)
1645{
1646	spin_lock(&mce_chrdev_state_lock);
1647
1648	mce_chrdev_open_count--;
1649	mce_chrdev_open_exclu = 0;
1650
1651	spin_unlock(&mce_chrdev_state_lock);
1652
1653	return 0;
1654}
1655
/*
 * Store the current TSC of the executing CPU in its slot of the array
 * passed in @data (indexed by CPU number).
 */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = data;
	int cpu = smp_processor_id();

	rdtscll(cpu_tsc[cpu]);
}
1662
1663static int mce_apei_read_done;
1664
1665/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1666static int __mce_read_apei(char __user **ubuf, size_t usize)
1667{
1668	int rc;
1669	u64 record_id;
1670	struct mce m;
1671
1672	if (usize < sizeof(struct mce))
1673		return -EINVAL;
1674
1675	rc = apei_read_mce(&m, &record_id);
1676	/* Error or no more MCE record */
1677	if (rc <= 0) {
1678		mce_apei_read_done = 1;
1679		/*
1680		 * When ERST is disabled, mce_chrdev_read() should return
1681		 * "no record" instead of "no device."
1682		 */
1683		if (rc == -ENODEV)
1684			return 0;
1685		return rc;
1686	}
1687	rc = -EFAULT;
1688	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1689		return rc;
1690	/*
1691	 * In fact, we should have cleared the record after that has
1692	 * been flushed to the disk or sent to network in
1693	 * /sbin/mcelog, but we have no interface to support that now,
1694	 * so just clear it to avoid duplication.
1695	 */
1696	rc = apei_clear_mce(record_id);
1697	if (rc) {
1698		mce_apei_read_done = 1;
1699		return rc;
1700	}
1701	*ubuf += sizeof(struct mce);
1702
1703	return 0;
1704}
1705
1706static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1707				size_t usize, loff_t *off)
1708{
1709	char __user *buf = ubuf;
1710	unsigned long *cpu_tsc;
1711	unsigned prev, next;
1712	int i, err;
1713
1714	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1715	if (!cpu_tsc)
1716		return -ENOMEM;
1717
1718	mutex_lock(&mce_chrdev_read_mutex);
1719
1720	if (!mce_apei_read_done) {
1721		err = __mce_read_apei(&buf, usize);
1722		if (err || buf != ubuf)
1723			goto out;
1724	}
1725
1726	next = rcu_dereference_check_mce(mcelog.next);
1727
1728	/* Only supports full reads right now */
1729	err = -EINVAL;
1730	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1731		goto out;
1732
1733	err = 0;
1734	prev = 0;
1735	do {
1736		for (i = prev; i < next; i++) {
1737			unsigned long start = jiffies;
1738			struct mce *m = &mcelog.entry[i];
1739
1740			while (!m->finished) {
1741				if (time_after_eq(jiffies, start + 2)) {
1742					memset(m, 0, sizeof(*m));
1743					goto timeout;
1744				}
1745				cpu_relax();
1746			}
1747			smp_rmb();
1748			err |= copy_to_user(buf, m, sizeof(*m));
1749			buf += sizeof(*m);
1750timeout:
1751			;
1752		}
1753
1754		memset(mcelog.entry + prev, 0,
1755		       (next - prev) * sizeof(struct mce));
1756		prev = next;
1757		next = cmpxchg(&mcelog.next, prev, 0);
1758	} while (next != prev);
1759
1760	synchronize_sched();
1761
1762	/*
1763	 * Collect entries that were still getting written before the
1764	 * synchronize.
1765	 */
1766	on_each_cpu(collect_tscs, cpu_tsc, 1);
1767
1768	for (i = next; i < MCE_LOG_LEN; i++) {
1769		struct mce *m = &mcelog.entry[i];
1770
1771		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1772			err |= copy_to_user(buf, m, sizeof(*m));
1773			smp_rmb();
1774			buf += sizeof(*m);
1775			memset(m, 0, sizeof(*m));
1776		}
1777	}
1778
1779	if (err)
1780		err = -EFAULT;
1781
1782out:
1783	mutex_unlock(&mce_chrdev_read_mutex);
1784	kfree(cpu_tsc);
1785
1786	return err ? err : buf - ubuf;
1787}
1788
1789static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1790{
1791	poll_wait(file, &mce_chrdev_wait, wait);
1792	if (rcu_access_index(mcelog.next))
1793		return POLLIN | POLLRDNORM;
1794	if (!mce_apei_read_done && apei_check_mce())
1795		return POLLIN | POLLRDNORM;
1796	return 0;
1797}
1798
1799static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1800				unsigned long arg)
1801{
1802	int __user *p = (int __user *)arg;
1803
1804	if (!capable(CAP_SYS_ADMIN))
1805		return -EPERM;
1806
1807	switch (cmd) {
1808	case MCE_GET_RECORD_LEN:
1809		return put_user(sizeof(struct mce), p);
1810	case MCE_GET_LOG_LEN:
1811		return put_user(MCE_LOG_LEN, p);
1812	case MCE_GETCLEAR_FLAGS: {
1813		unsigned flags;
1814
1815		do {
1816			flags = mcelog.flags;
1817		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1818
1819		return put_user(flags, p);
1820	}
1821	default:
1822		return -ENOTTY;
1823	}
 
 
1824}
1825
1826static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1827			    size_t usize, loff_t *off);
1828
1829void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1830			     const char __user *ubuf,
1831			     size_t usize, loff_t *off))
1832{
1833	mce_write = fn;
1834}
1835EXPORT_SYMBOL_GPL(register_mce_write_callback);
1836
1837ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1838			 size_t usize, loff_t *off)
1839{
1840	if (mce_write)
1841		return mce_write(filp, ubuf, usize, off);
1842	else
1843		return -EINVAL;
1844}
1845
1846static const struct file_operations mce_chrdev_ops = {
1847	.open			= mce_chrdev_open,
1848	.release		= mce_chrdev_release,
1849	.read			= mce_chrdev_read,
1850	.write			= mce_chrdev_write,
1851	.poll			= mce_chrdev_poll,
1852	.unlocked_ioctl		= mce_chrdev_ioctl,
1853	.llseek			= no_llseek,
1854};
1855
1856static struct miscdevice mce_chrdev_device = {
1857	MISC_MCELOG_MINOR,
1858	"mcelog",
1859	&mce_chrdev_ops,
1860};
1861
1862/*
1863 * mce=off Disables machine check
1864 * mce=no_cmci Disables CMCI
 
1865 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1866 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1867 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1868 *	monarchtimeout is how long to wait for other CPUs on machine
1869 *	check, or 0 to not wait
1870 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 
1871 * mce=nobootlog Don't log MCEs from before booting.
 
 
1872 */
1873static int __init mcheck_enable(char *str)
1874{
 
 
1875	if (*str == 0) {
1876		enable_p5_mce();
1877		return 1;
1878	}
1879	if (*str == '=')
1880		str++;
1881	if (!strcmp(str, "off"))
1882		mce_disabled = 1;
1883	else if (!strcmp(str, "no_cmci"))
1884		mce_cmci_disabled = 1;
 
 
1885	else if (!strcmp(str, "dont_log_ce"))
1886		mce_dont_log_ce = 1;
1887	else if (!strcmp(str, "ignore_ce"))
1888		mce_ignore_ce = 1;
1889	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1890		mce_bootlog = (str[0] == 'b');
 
 
 
 
1891	else if (isdigit(str[0])) {
1892		get_option(&str, &tolerant);
1893		if (*str == ',') {
1894			++str;
1895			get_option(&str, &monarch_timeout);
1896		}
1897	} else {
1898		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1899		       str);
1900		return 0;
1901	}
1902	return 1;
1903}
1904__setup("mce", mcheck_enable);
1905
1906int __init mcheck_init(void)
1907{
1908	mcheck_intel_therm_init();
 
 
 
 
 
 
 
1909
1910	return 0;
1911}
1912
1913/*
1914 * mce_syscore: PM support
1915 */
1916
1917/*
1918 * Disable machine checks on suspend and shutdown. We can't really handle
1919 * them later.
1920 */
1921static int mce_disable_error_reporting(void)
1922{
1923	int i;
1924
1925	for (i = 0; i < banks; i++) {
1926		struct mce_bank *b = &mce_banks[i];
1927
1928		if (b->init)
1929			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1930	}
1931	return 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1932}
1933
/* syscore suspend hook: turn error reporting off and pass on the result. */
static int mce_syscore_suspend(void)
{
	return mce_disable_error_reporting();
}
1938
/* syscore shutdown hook: turn error reporting off; the result is not used. */
static void mce_syscore_shutdown(void)
{
	mce_disable_error_reporting();
}
1943
1944/*
1945 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1946 * Only one CPU is active at this time, the others get re-added later using
1947 * CPU hotplug:
1948 */
1949static void mce_syscore_resume(void)
1950{
1951	__mcheck_cpu_init_generic();
1952	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
 
1953}
1954
1955static struct syscore_ops mce_syscore_ops = {
1956	.suspend	= mce_syscore_suspend,
1957	.shutdown	= mce_syscore_shutdown,
1958	.resume		= mce_syscore_resume,
1959};
1960
1961/*
1962 * mce_device: Sysfs support
1963 */
1964
1965static void mce_cpu_restart(void *data)
1966{
1967	if (!mce_available(__this_cpu_ptr(&cpu_info)))
1968		return;
1969	__mcheck_cpu_init_generic();
 
1970	__mcheck_cpu_init_timer();
1971}
1972
1973/* Reinit MCEs after user configuration changes */
1974static void mce_restart(void)
1975{
1976	mce_timer_delete_all();
1977	on_each_cpu(mce_cpu_restart, NULL, 1);
1978}
1979
1980/* Toggle features for corrected errors */
1981static void mce_disable_cmci(void *data)
1982{
1983	if (!mce_available(__this_cpu_ptr(&cpu_info)))
1984		return;
1985	cmci_clear();
1986}
1987
1988static void mce_enable_ce(void *all)
1989{
1990	if (!mce_available(__this_cpu_ptr(&cpu_info)))
1991		return;
1992	cmci_reenable();
1993	cmci_recheck();
1994	if (all)
1995		__mcheck_cpu_init_timer();
1996}
1997
1998static struct bus_type mce_subsys = {
1999	.name		= "machinecheck",
2000	.dev_name	= "machinecheck",
2001};
2002
2003DEFINE_PER_CPU(struct device *, mce_device);
2004
2005__cpuinitdata
2006void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2007
2008static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2009{
2010	return container_of(attr, struct mce_bank, attr);
2011}
2012
2013static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2014			 char *buf)
2015{
2016	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2017}
2018
2019static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2020			const char *buf, size_t size)
2021{
2022	u64 new;
2023
2024	if (strict_strtoull(buf, 0, &new) < 0)
2025		return -EINVAL;
2026
2027	attr_to_bank(attr)->ctl = new;
2028	mce_restart();
2029
2030	return size;
2031}
2032
2033static ssize_t
2034show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2035{
2036	strcpy(buf, mce_helper);
2037	strcat(buf, "\n");
2038	return strlen(mce_helper) + 1;
2039}
2040
2041static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2042				const char *buf, size_t siz)
2043{
2044	char *p;
2045
2046	strncpy(mce_helper, buf, sizeof(mce_helper));
2047	mce_helper[sizeof(mce_helper)-1] = 0;
2048	p = strchr(mce_helper, '\n');
2049
2050	if (p)
2051		*p = 0;
2052
2053	return strlen(mce_helper) + !!p;
2054}
2055
2056static ssize_t set_ignore_ce(struct device *s,
2057			     struct device_attribute *attr,
2058			     const char *buf, size_t size)
2059{
2060	u64 new;
2061
2062	if (strict_strtoull(buf, 0, &new) < 0)
2063		return -EINVAL;
2064
2065	if (mce_ignore_ce ^ !!new) {
 
2066		if (new) {
2067			/* disable ce features */
2068			mce_timer_delete_all();
2069			on_each_cpu(mce_disable_cmci, NULL, 1);
2070			mce_ignore_ce = 1;
2071		} else {
2072			/* enable ce features */
2073			mce_ignore_ce = 0;
2074			on_each_cpu(mce_enable_ce, (void *)1, 1);
2075		}
2076	}
 
 
2077	return size;
2078}
2079
2080static ssize_t set_cmci_disabled(struct device *s,
2081				 struct device_attribute *attr,
2082				 const char *buf, size_t size)
2083{
2084	u64 new;
2085
2086	if (strict_strtoull(buf, 0, &new) < 0)
2087		return -EINVAL;
2088
2089	if (mce_cmci_disabled ^ !!new) {
 
2090		if (new) {
2091			/* disable cmci */
2092			on_each_cpu(mce_disable_cmci, NULL, 1);
2093			mce_cmci_disabled = 1;
2094		} else {
2095			/* enable cmci */
2096			mce_cmci_disabled = 0;
2097			on_each_cpu(mce_enable_ce, NULL, 1);
2098		}
2099	}
 
 
2100	return size;
2101}
2102
2103static ssize_t store_int_with_restart(struct device *s,
2104				      struct device_attribute *attr,
2105				      const char *buf, size_t size)
2106{
2107	ssize_t ret = device_store_int(s, attr, buf, size);
 
 
 
 
 
 
 
 
 
2108	mce_restart();
 
 
2109	return ret;
2110}
2111
2112static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2113static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
2114static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
2115static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
2116
2117static struct dev_ext_attribute dev_attr_check_interval = {
2118	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2119	&check_interval
2120};
2121
2122static struct dev_ext_attribute dev_attr_ignore_ce = {
2123	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
2124	&mce_ignore_ce
2125};
2126
2127static struct dev_ext_attribute dev_attr_cmci_disabled = {
2128	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
2129	&mce_cmci_disabled
2130};
2131
2132static struct device_attribute *mce_device_attrs[] = {
2133	&dev_attr_tolerant.attr,
2134	&dev_attr_check_interval.attr,
 
2135	&dev_attr_trigger,
 
2136	&dev_attr_monarch_timeout.attr,
2137	&dev_attr_dont_log_ce.attr,
2138	&dev_attr_ignore_ce.attr,
2139	&dev_attr_cmci_disabled.attr,
2140	NULL
2141};
2142
2143static cpumask_var_t mce_device_initialized;
2144
2145static void mce_device_release(struct device *dev)
2146{
2147	kfree(dev);
2148}
2149
2150/* Per cpu device init. All of the cpus still share the same ctrl bank: */
2151static __cpuinit int mce_device_create(unsigned int cpu)
2152{
2153	struct device *dev;
2154	int err;
2155	int i, j;
2156
2157	if (!mce_available(&boot_cpu_data))
2158		return -EIO;
2159
 
 
 
 
2160	dev = kzalloc(sizeof *dev, GFP_KERNEL);
2161	if (!dev)
2162		return -ENOMEM;
2163	dev->id  = cpu;
2164	dev->bus = &mce_subsys;
2165	dev->release = &mce_device_release;
2166
2167	err = device_register(dev);
2168	if (err)
 
2169		return err;
 
2170
2171	for (i = 0; mce_device_attrs[i]; i++) {
2172		err = device_create_file(dev, mce_device_attrs[i]);
2173		if (err)
2174			goto error;
2175	}
2176	for (j = 0; j < banks; j++) {
2177		err = device_create_file(dev, &mce_banks[j].attr);
2178		if (err)
2179			goto error2;
2180	}
2181	cpumask_set_cpu(cpu, mce_device_initialized);
2182	per_cpu(mce_device, cpu) = dev;
2183
2184	return 0;
2185error2:
2186	while (--j >= 0)
2187		device_remove_file(dev, &mce_banks[j].attr);
2188error:
2189	while (--i >= 0)
2190		device_remove_file(dev, mce_device_attrs[i]);
2191
2192	device_unregister(dev);
2193
2194	return err;
2195}
2196
2197static __cpuinit void mce_device_remove(unsigned int cpu)
2198{
2199	struct device *dev = per_cpu(mce_device, cpu);
2200	int i;
2201
2202	if (!cpumask_test_cpu(cpu, mce_device_initialized))
2203		return;
2204
2205	for (i = 0; mce_device_attrs[i]; i++)
2206		device_remove_file(dev, mce_device_attrs[i]);
2207
2208	for (i = 0; i < banks; i++)
2209		device_remove_file(dev, &mce_banks[i].attr);
2210
2211	device_unregister(dev);
2212	cpumask_clear_cpu(cpu, mce_device_initialized);
2213	per_cpu(mce_device, cpu) = NULL;
2214}
2215
2216/* Make sure there are no machine checks on offlined CPUs. */
2217static void __cpuinit mce_disable_cpu(void *h)
2218{
2219	unsigned long action = *(unsigned long *)h;
2220	int i;
2221
2222	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2223		return;
2224
2225	if (!(action & CPU_TASKS_FROZEN))
2226		cmci_clear();
2227	for (i = 0; i < banks; i++) {
2228		struct mce_bank *b = &mce_banks[i];
2229
2230		if (b->init)
2231			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2232	}
2233}
2234
2235static void __cpuinit mce_reenable_cpu(void *h)
2236{
2237	unsigned long action = *(unsigned long *)h;
2238	int i;
2239
2240	if (!mce_available(__this_cpu_ptr(&cpu_info)))
2241		return;
2242
2243	if (!(action & CPU_TASKS_FROZEN))
2244		cmci_reenable();
2245	for (i = 0; i < banks; i++) {
2246		struct mce_bank *b = &mce_banks[i];
2247
2248		if (b->init)
2249			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2250	}
2251}
2252
2253/* Get notified when a cpu comes on/off. Be hotplug friendly. */
2254static int __cpuinit
2255mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 
 
 
 
 
 
 
2256{
2257	unsigned int cpu = (unsigned long)hcpu;
2258	struct timer_list *t = &per_cpu(mce_timer, cpu);
2259
2260	switch (action) {
2261	case CPU_ONLINE:
2262	case CPU_ONLINE_FROZEN:
2263		mce_device_create(cpu);
2264		if (threshold_cpu_callback)
2265			threshold_cpu_callback(action, cpu);
2266		break;
2267	case CPU_DEAD:
2268	case CPU_DEAD_FROZEN:
2269		if (threshold_cpu_callback)
2270			threshold_cpu_callback(action, cpu);
2271		mce_device_remove(cpu);
2272		break;
2273	case CPU_DOWN_PREPARE:
2274	case CPU_DOWN_PREPARE_FROZEN:
2275		del_timer_sync(t);
2276		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2277		break;
2278	case CPU_DOWN_FAILED:
2279	case CPU_DOWN_FAILED_FROZEN:
2280		if (!mce_ignore_ce && check_interval) {
2281			t->expires = round_jiffies(jiffies +
2282					per_cpu(mce_next_interval, cpu));
2283			add_timer_on(t, cpu);
2284		}
2285		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2286		break;
2287	case CPU_POST_DEAD:
2288		/* intentionally ignoring frozen here */
2289		cmci_rediscover(cpu);
2290		break;
2291	}
2292	return NOTIFY_OK;
 
 
2293}
2294
2295static struct notifier_block mce_cpu_notifier __cpuinitdata = {
2296	.notifier_call = mce_cpu_callback,
2297};
 
 
 
 
 
 
 
2298
2299static __init void mce_init_banks(void)
2300{
2301	int i;
2302
2303	for (i = 0; i < banks; i++) {
2304		struct mce_bank *b = &mce_banks[i];
2305		struct device_attribute *a = &b->attr;
2306
2307		sysfs_attr_init(&a->attr);
2308		a->attr.name	= b->attrname;
2309		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2310
2311		a->attr.mode	= 0644;
2312		a->show		= show_bank;
2313		a->store	= set_bank;
2314	}
2315}
2316
2317static __init int mcheck_init_device(void)
2318{
2319	int err;
2320	int i = 0;
2321
2322	if (!mce_available(&boot_cpu_data))
2323		return -EIO;
 
 
 
 
 
 
 
 
2324
2325	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
 
 
 
2326
2327	mce_init_banks();
2328
2329	err = subsys_system_register(&mce_subsys, NULL);
2330	if (err)
2331		return err;
 
 
 
 
 
2332
2333	for_each_online_cpu(i) {
2334		err = mce_device_create(i);
2335		if (err)
2336			return err;
2337	}
2338
2339	register_syscore_ops(&mce_syscore_ops);
2340	register_hotcpu_notifier(&mce_cpu_notifier);
2341
2342	/* register character device /dev/mcelog */
2343	misc_register(&mce_chrdev_device);
 
 
 
 
 
 
 
 
2344
2345	return err;
2346}
2347device_initcall(mcheck_init_device);
2348
2349/*
2350 * Old style boot options parsing. Only for compatibility.
2351 */
2352static int __init mcheck_disable(char *str)
2353{
2354	mce_disabled = 1;
2355	return 1;
2356}
2357__setup("nomce", mcheck_disable);
2358
2359#ifdef CONFIG_DEBUG_FS
2360struct dentry *mce_get_debugfs_dir(void)
2361{
2362	static struct dentry *dmce;
2363
2364	if (!dmce)
2365		dmce = debugfs_create_dir("mce", NULL);
2366
2367	return dmce;
2368}
2369
2370static void mce_reset(void)
2371{
2372	cpu_missing = 0;
2373	atomic_set(&mce_fake_paniced, 0);
2374	atomic_set(&mce_executing, 0);
2375	atomic_set(&mce_callin, 0);
2376	atomic_set(&global_nwo, 0);
2377}
2378
2379static int fake_panic_get(void *data, u64 *val)
2380{
2381	*val = fake_panic;
2382	return 0;
2383}
2384
2385static int fake_panic_set(void *data, u64 val)
2386{
2387	mce_reset();
2388	fake_panic = val;
2389	return 0;
2390}
2391
2392DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2393			fake_panic_set, "%llu\n");
2394
2395static int __init mcheck_debugfs_init(void)
2396{
2397	struct dentry *dmce, *ffake_panic;
2398
2399	dmce = mce_get_debugfs_dir();
2400	if (!dmce)
2401		return -ENOMEM;
2402	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2403					  &fake_panic_fops);
2404	if (!ffake_panic)
2405		return -ENOMEM;
2406
2407	return 0;
2408}
2409late_initcall(mcheck_debugfs_init);
 
2410#endif