Linux Audio

Check our new training course

Loading...
v6.8
   1// SPDX-License-Identifier: GPL-2.0-only
   2#include <linux/module.h>
   3#include <linux/slab.h>
   4
   5#include <asm/cpu.h>
   6
   7#include "mce_amd.h"
   8
/* Per-family decode callbacks; filled in at init time by mce_amd_init(). */
static struct amd_decoder_ops fam_ops;

/* Mask for the extended error code field; widened for some families in init. */
static u8 xec_mask	 = 0xf;

/*
 * Optional callback, registered by an EDAC driver, used to further decode
 * DRAM ECC errors (see decode_mc4_mce() and decode_smca_error()).
 */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
  15void amd_register_ecc_decoder(void (*f)(int, struct mce *))
  16{
  17	decode_dram_ecc = f;
  18}
  19EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
  20
  21void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
  22{
  23	if (decode_dram_ecc) {
  24		WARN_ON(decode_dram_ecc != f);
  25
  26		decode_dram_ecc = NULL;
  27	}
  28}
  29EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
  30
/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */

/* transaction type (TT field) */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level (LL field) */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type (RRRR field) */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor (PP field); exported for EDAC drivers */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout (T field) */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o (II field) */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type (UU field) */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
/*
 * MC1 (instruction fetch) extended error code descriptions for F15h.
 * The table is packed: undefined xec values are skipped, so lookups in
 * f15h_mc1_mce() apply fixed offsets.
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 010 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};

/* MC2 extended error code descriptions for F15h; indexed from xec 0x4 up. */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};

/* MC4 (northbridge) extended error code descriptions; see decode_mc4_mce(). */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};

/* MC5 (execution unit) extended error code descriptions. */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};

/* MC6 extended error code descriptions; see decode_mc6_mce(). */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};
 145
 146static bool f12h_mc0_mce(u16 ec, u8 xec)
 147{
 148	bool ret = false;
 149
 150	if (MEM_ERROR(ec)) {
 151		u8 ll = LL(ec);
 152		ret = true;
 153
 154		if (ll == LL_L2)
 155			pr_cont("during L1 linefill from L2.\n");
 156		else if (ll == LL_L1)
 157			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
 158		else
 159			ret = false;
 160	}
 161	return ret;
 162}
 163
 164static bool f10h_mc0_mce(u16 ec, u8 xec)
 165{
 166	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
 167		pr_cont("during data scrub.\n");
 168		return true;
 169	}
 170	return f12h_mc0_mce(ec, xec);
 171}
 172
 173static bool k8_mc0_mce(u16 ec, u8 xec)
 174{
 175	if (BUS_ERROR(ec)) {
 176		pr_cont("during system linefill.\n");
 177		return true;
 178	}
 179
 180	return f10h_mc0_mce(ec, xec);
 181}
 182
/*
 * MC0 (data cache) decoder for the small-core ("cat") families; installed
 * for family 0x14 and 0x16 in mce_amd_init().  Returns true if the
 * (ec, xec) signature was recognized and a message printed.
 */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4	 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* Only L1 data-cache memory errors are defined here. */
		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		/* Bus errors must target mem or IO at level LG. */
		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}
 234
/*
 * F15h MC0 (data cache) decoder: memory errors are keyed on the extended
 * error code, bus errors distinguish system reads from internal error
 * conditions, and internal errors up to 0x1f are hardware asserts.
 */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}
 287
 288static void decode_mc0_mce(struct mce *m)
 289{
 290	u16 ec = EC(m->status);
 291	u8 xec = XEC(m->status, xec_mask);
 292
 293	pr_emerg(HW_ERR "MC0 Error: ");
 294
 295	/* TLB error signatures are the same across families */
 296	if (TLB_ERROR(ec)) {
 297		if (TT(ec) == TT_DATA) {
 298			pr_cont("%s TLB %s.\n", LL_MSG(ec),
 299				((xec == 2) ? "locked miss"
 300					    : (xec ? "multimatch" : "parity")));
 301			return;
 302		}
 303	} else if (fam_ops.mc0_mce(ec, xec))
 304		;
 305	else
 306		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
 307}
 308
 309static bool k8_mc1_mce(u16 ec, u8 xec)
 310{
 311	u8 ll	 = LL(ec);
 312	bool ret = true;
 313
 314	if (!MEM_ERROR(ec))
 315		return false;
 316
 317	if (ll == 0x2)
 318		pr_cont("during a linefill from L2.\n");
 319	else if (ll == 0x1) {
 320		switch (R4(ec)) {
 321		case R4_IRD:
 322			pr_cont("Parity error during data load.\n");
 323			break;
 324
 325		case R4_EVICT:
 326			pr_cont("Copyback Parity/Victim error.\n");
 327			break;
 328
 329		case R4_SNOOP:
 330			pr_cont("Tag Snoop error.\n");
 331			break;
 332
 333		default:
 334			ret = false;
 335			break;
 336		}
 337	} else
 338		ret = false;
 339
 340	return ret;
 341}
 342
 343static bool cat_mc1_mce(u16 ec, u8 xec)
 344{
 345	u8 r4    = R4(ec);
 346	bool ret = true;
 347
 348	if (!MEM_ERROR(ec))
 349		return false;
 350
 351	if (TT(ec) != TT_INSTR)
 352		return false;
 353
 354	if (r4 == R4_IRD)
 355		pr_cont("Data/tag array parity error for a tag hit.\n");
 356	else if (r4 == R4_SNOOP)
 357		pr_cont("Tag error during snoop/victimization.\n");
 358	else if (xec == 0x0)
 359		pr_cont("Tag parity error from victim castout.\n");
 360	else if (xec == 0x2)
 361		pr_cont("Microcode patch RAM parity error.\n");
 362	else
 363		ret = false;
 364
 
 
 
 
 
 
 
 365	return ret;
 366}
 367
/*
 * F15h MC1 (instruction fetch) decoder.  The extended error code indexes
 * f15h_mc1_mce_desc[], which is packed: gaps in the xec space (0xb, 0xc,
 * 0xe, 0xf) are skipped, hence the fixed offsets below.
 */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		/* 0xb/0xc are undefined, so 0xd maps to table index 0xb. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		/* 0xe/0xf are also undefined: 0x10 maps to index 0xc. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}
 397
 398static void decode_mc1_mce(struct mce *m)
 399{
 400	u16 ec = EC(m->status);
 401	u8 xec = XEC(m->status, xec_mask);
 402
 403	pr_emerg(HW_ERR "MC1 Error: ");
 404
 405	if (TLB_ERROR(ec))
 406		pr_cont("%s TLB %s.\n", LL_MSG(ec),
 407			(xec ? "multimatch" : "parity error"));
 408	else if (BUS_ERROR(ec)) {
 409		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
 410
 411		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
 412	} else if (INT_ERROR(ec)) {
 413		if (xec <= 0x3f)
 414			pr_cont("Hardware Assert.\n");
 415		else
 416			goto wrong_mc1_mce;
 417	} else if (fam_ops.mc1_mce(ec, xec))
 418		;
 419	else
 420		goto wrong_mc1_mce;
 421
 422	return;
 423
 424wrong_mc1_mce:
 425	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
 426}
 427
/*
 * K8 MC2 (bus unit / L2) decoder.  xec selects the buffer involved;
 * xec == 0 needs the error-type bits to disambiguate TLB/bus/memory cases.
 */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			/* R4 >= 0x7 are copyback transactions, <= 0x1 L2 accesses. */
			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}
 463
/*
 * F15h MC2 (combined unit) decoder covering TLB, bus, memory and
 * internal error signatures.
 */
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		/*
		 * NOTE(review): for xec > 2 the record is flagged as
		 * unrecognized, yet the NB-read message below is still
		 * printed — intentional per the existing control flow.
		 */
		if (xec > 2)
			ret = false;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			/* 0xd..0xf are undefined: 0x10 maps to index 9. */
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}
 502
/*
 * F16h MC2 (L2 cache) decoder: memory errors only, keyed on xec ranges,
 * with R4 refining which structure/transaction was involved.
 */
static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN)   ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
			((r4 == R4_GEN)   ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD)  ? "Hit"  :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}
 543
 544static void decode_mc2_mce(struct mce *m)
 545{
 546	u16 ec = EC(m->status);
 547	u8 xec = XEC(m->status, xec_mask);
 548
 549	pr_emerg(HW_ERR "MC2 Error: ");
 550
 551	if (!fam_ops.mc2_mce(ec, xec))
 552		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
 553}
 554
 555static void decode_mc3_mce(struct mce *m)
 556{
 557	u16 ec = EC(m->status);
 558	u8 xec = XEC(m->status, xec_mask);
 559
 560	if (boot_cpu_data.x86 >= 0x14) {
 561		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
 562			 " please report on LKML.\n");
 563		return;
 564	}
 565
 566	pr_emerg(HW_ERR "MC3 Error");
 567
 568	if (xec == 0x0) {
 569		u8 r4 = R4(ec);
 570
 571		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
 572			goto wrong_mc3_mce;
 573
 574		pr_cont(" during %s.\n", R4_MSG(ec));
 575	} else
 576		goto wrong_mc3_mce;
 577
 578	return;
 579
 580 wrong_mc3_mce:
 581	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
 582}
 583
/*
 * MC4 (northbridge) decoder.  The extended error code is masked with
 * 0x1f here (not the family-wide xec_mask) since NB xec is 5 bits wide.
 */
static void decode_mc4_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	int node_id = topology_die_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (fam == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			/* Chain to the registered EDAC DRAM ECC decoder. */
			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		if (fam == 0x15 || fam == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		/* 0x1c..0x1f map to the last four mc4_mce_desc[] entries. */
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

 wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}
 641
/* MC5 (execution unit) decoder; this bank is not decoded on K8/F11h. */
static void decode_mc5_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (fam == 0xf || fam == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	/* Entries 0x0 and 0xc are full sentences; the rest get " parity error". */
	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

 wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}
 
 673
 674static void decode_mc6_mce(struct mce *m)
 675{
 
 676	u8 xec = XEC(m->status, xec_mask);
 677
 678	pr_emerg(HW_ERR "MC6 Error: ");
 679
 680	if (xec > 0x5)
 681		goto wrong_mc6_mce;
 682
 683	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
 684	return;
 685
 686 wrong_mc6_mce:
 687	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 688}
 689
/* Human-readable names for SMCA bank types, indexed by enum smca_bank_types. */
static const char * const smca_long_names[] = {
	[SMCA_LS ... SMCA_LS_V2]	= "Load Store Unit",
	[SMCA_IF]			= "Instruction Fetch Unit",
	[SMCA_L2_CACHE]			= "L2 Cache",
	[SMCA_DE]			= "Decode Unit",
	[SMCA_RESERVED]			= "Reserved",
	[SMCA_EX]			= "Execution Unit",
	[SMCA_FP]			= "Floating Point Unit",
	[SMCA_L3_CACHE]			= "L3 Cache",
	[SMCA_CS ... SMCA_CS_V2]	= "Coherent Slave",
	[SMCA_PIE]			= "Power, Interrupts, etc.",

	/* UMC v2 is separate because both of them can exist in a single system. */
	[SMCA_UMC]			= "Unified Memory Controller",
	[SMCA_UMC_V2]			= "Unified Memory Controller v2",
	[SMCA_PB]			= "Parameter Block",
	[SMCA_PSP ... SMCA_PSP_V2]	= "Platform Security Processor",
	[SMCA_SMU ... SMCA_SMU_V2]	= "System Management Unit",
	[SMCA_MP5]			= "Microprocessor 5 Unit",
	[SMCA_MPDMA]			= "MPDMA Unit",
	[SMCA_NBIO]			= "Northbridge IO Unit",
	[SMCA_PCIE ... SMCA_PCIE_V2]	= "PCI Express Unit",
	[SMCA_XGMI_PCS]			= "Ext Global Memory Interconnect PCS Unit",
	[SMCA_NBIF]			= "NBIF Unit",
	[SMCA_SHUB]			= "System Hub Unit",
	[SMCA_SATA]			= "SATA Unit",
	[SMCA_USB]			= "USB Unit",
	[SMCA_GMI_PCS]			= "Global Memory Interconnect PCS Unit",
	[SMCA_XGMI_PHY]			= "Ext Global Memory Interconnect PHY Unit",
	[SMCA_WAFL_PHY]			= "WAFL PHY Unit",
	[SMCA_GMI_PHY]			= "Global Memory Interconnect PHY Unit",
};
 722
 723static const char *smca_get_long_name(enum smca_bank_types t)
 724{
 725	if (t >= N_SMCA_BANK_TYPES)
 726		return NULL;
 727
 728	return smca_long_names[t];
 
 729}
 730
/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
	u8 xec = XEC(m->status, xec_mask);

	/* Unknown bank type: nothing useful to report. */
	if (bank_type >= N_SMCA_BANK_TYPES)
		return;

	if (bank_type == SMCA_RESERVED) {
		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
		return;
	}

	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);

	/* Chain UMC-bank errors with xec == 0 to the registered DRAM ECC decoder. */
	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
	    xec == 0 && decode_dram_ecc)
		decode_dram_ecc(topology_die_id(m->extcpu), m);
}
 751
/*
 * Print the decoded fields of the architectural (low 16 bits) error code:
 * cache level, transaction type or mem/io, memory transaction, and — for
 * bus errors — participation and timeout.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}
 775
 776static const char *decode_error_status(struct mce *m)
 
 
 
 777{
 778	if (m->status & MCI_STATUS_UC) {
 779		if (m->status & MCI_STATUS_PCC)
 780			return "System Fatal error.";
 781		if (m->mcgstatus & MCG_STATUS_RIPV)
 782			return "Uncorrected, software restartable error.";
 783		return "Uncorrected, software containable error.";
 784	}
 785
 786	if (m->status & MCI_STATUS_DEFERRED)
 787		return "Deferred error, no action required.";
 
 
 
 788
 789	return "Corrected error, no action required.";
 790}
 791
/*
 * MCE decode-chain notifier: pretty-print an AMD/Hygon machine check
 * record, then dispatch to the per-bank (or SMCA) decoder.  Marks the
 * record as handled by EDAC and returns NOTIFY_OK.
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int fam = x86_family(m->cpuid);
	int ecc;

	/* Record already claimed via MCE_HANDLED_CEC; nothing to do here. */
	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		/* Only show TCC when the bank's CONFIG MSR reports MCAX mode. */
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	/* SMCA systems: print IPID/syndrome and use the SMCA decoder. */
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}
 
 908
/* Hooked onto the MCE decode chain at EDAC priority. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
	.priority	= MCE_PRIO_EDAC,
};
 913
/*
 * Init: bail on non-AMD/Hygon or virtualized systems, then select the
 * per-family decoder callbacks and the extended-error-code mask before
 * registering on the MCE decode chain.
 */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD &&
	    c->x86_vendor != X86_VENDOR_HYGON)
		return -ENODEV;

	/* No in-kernel decoding when running as a guest. */
	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
		return -ENODEV;

	/* SMCA systems decode via decode_smca_error(); no fam_ops needed. */
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		xec_mask = 0x3f;
		goto out;
	}

	switch (c->x86) {
	case 0xf:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops.mc0_mce = f10h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops.mc0_mce = f12h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops.mc0_mce = f15h_mc0_mce;
		fam_ops.mc1_mce = f15h_mc1_mce;
		fam_ops.mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
	case 0x18:
		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
		return -EINVAL;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		return -EINVAL;
	}

out:
	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);
 994
#ifdef MODULE
/* Module unload: detach from the MCE decode notifier chain. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
v3.1
 
  1#include <linux/module.h>
  2#include <linux/slab.h>
  3
 
 
  4#include "mce_amd.h"
  5
  6static struct amd_decoder_ops *fam_ops;
  7
  8static u8 xec_mask	 = 0xf;
  9static u8 nb_err_cpumask = 0xf;
 10
 11static bool report_gart_errors;
 12static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
 13
 14void amd_report_gart_errors(bool v)
 15{
 16	report_gart_errors = v;
 17}
 18EXPORT_SYMBOL_GPL(amd_report_gart_errors);
 19
 20void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
 21{
 22	nb_bus_decoder = f;
 23}
 24EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
 25
 26void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
 27{
 28	if (nb_bus_decoder) {
 29		WARN_ON(nb_bus_decoder != f);
 30
 31		nb_bus_decoder = NULL;
 32	}
 33}
 34EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
 35
 36/*
 37 * string representation for the different MCA reported error types, see F3x48
 38 * or MSR0000_0411.
 39 */
 40
 41/* transaction type */
 42const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
 43EXPORT_SYMBOL_GPL(tt_msgs);
 44
 45/* cache level */
 46const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
 47EXPORT_SYMBOL_GPL(ll_msgs);
 48
 49/* memory transaction type */
 50const char *rrrr_msgs[] = {
 51       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
 52};
 53EXPORT_SYMBOL_GPL(rrrr_msgs);
 54
 55/* participating processor */
 56const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
 57EXPORT_SYMBOL_GPL(pp_msgs);
 58
 59/* request timeout */
 60const char *to_msgs[] = { "no timeout",	"timed out" };
 61EXPORT_SYMBOL_GPL(to_msgs);
 62
 63/* memory or i/o */
 64const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
 65EXPORT_SYMBOL_GPL(ii_msgs);
 66
 67static const char *f10h_nb_mce_desc[] = {
 68	"HT link data error",
 69	"Protocol error (link, L3, probe filter, etc.)",
 70	"Parity error in NB-internal arrays",
 71	"Link Retry due to IO link transmission error",
 72	"L3 ECC data cache error",
 73	"ECC error in L3 cache tag",
 74	"L3 LRU parity bits error",
 75	"ECC Error in the Probe Filter directory"
 76};
 77
 78static const char * const f15h_ic_mce_desc[] = {
 79	"UC during a demand linefill from L2",
 80	"Parity error during data load from IC",
 81	"Parity error for IC valid bit",
 82	"Main tag parity error",
 83	"Parity error in prediction queue",
 84	"PFB data/address parity error",
 85	"Parity error in the branch status reg",
 86	"PFB promotion address error",
 87	"Tag error during probe/victimization",
 88	"Parity error for IC probe tag valid bit",
 89	"PFB non-cacheable bit parity error",
 90	"PFB valid bit parity error",			/* xec = 0xd */
 91	"patch RAM",					/* xec = 010 */
 92	"uop queue",
 93	"insn buffer",
 94	"predecode buffer",
 95	"fetch address FIFO"
 
 96};
 97
 98static const char * const f15h_cu_mce_desc[] = {
 99	"Fill ECC error on data fills",			/* xec = 0x4 */
100	"Fill parity error on insn fills",
101	"Prefetcher request FIFO parity error",
102	"PRQ address parity error",
103	"PRQ data parity error",
104	"WCC Tag ECC error",
105	"WCC Data ECC error",
106	"WCB Data parity error",
107	"VB Data/ECC error",
108	"L2 Tag ECC error",				/* xec = 0x10 */
109	"Hard L2 Tag ECC error",
110	"Multiple hits on L2 tag",
111	"XAB parity error",
112	"PRB address parity error"
113};
114
115static const char * const fr_ex_mce_desc[] = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116	"CPU Watchdog timer expire",
117	"Wakeup array dest tag",
118	"AG payload array",
119	"EX payload array",
120	"IDRF array",
121	"Retire dispatch queue",
122	"Mapper checkpoint array",
123	"Physical register file EX0 port",
124	"Physical register file EX1 port",
125	"Physical register file AG0 port",
126	"Physical register file AG1 port",
127	"Flag register file",
128	"DE correctable error could not be corrected"
 
129};
130
131static bool f12h_dc_mce(u16 ec, u8 xec)
 
 
 
 
 
 
 
 
 
132{
133	bool ret = false;
134
135	if (MEM_ERROR(ec)) {
136		u8 ll = LL(ec);
137		ret = true;
138
139		if (ll == LL_L2)
140			pr_cont("during L1 linefill from L2.\n");
141		else if (ll == LL_L1)
142			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
143		else
144			ret = false;
145	}
146	return ret;
147}
148
149static bool f10h_dc_mce(u16 ec, u8 xec)
150{
151	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
152		pr_cont("during data scrub.\n");
153		return true;
154	}
155	return f12h_dc_mce(ec, xec);
156}
157
158static bool k8_dc_mce(u16 ec, u8 xec)
159{
160	if (BUS_ERROR(ec)) {
161		pr_cont("during system linefill.\n");
162		return true;
163	}
164
165	return f10h_dc_mce(ec, xec);
166}
167
168static bool f14h_dc_mce(u16 ec, u8 xec)
169{
170	u8 r4	 = R4(ec);
171	bool ret = true;
172
173	if (MEM_ERROR(ec)) {
174
175		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
176			return false;
177
178		switch (r4) {
179		case R4_DRD:
180		case R4_DWR:
181			pr_cont("Data/Tag parity error due to %s.\n",
182				(r4 == R4_DRD ? "load/hw prf" : "store"));
183			break;
184		case R4_EVICT:
185			pr_cont("Copyback parity error on a tag miss.\n");
186			break;
187		case R4_SNOOP:
188			pr_cont("Tag parity error during snoop.\n");
189			break;
190		default:
191			ret = false;
192		}
193	} else if (BUS_ERROR(ec)) {
194
195		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
196			return false;
197
198		pr_cont("System read data error on a ");
199
200		switch (r4) {
201		case R4_RD:
202			pr_cont("TLB reload.\n");
203			break;
204		case R4_DWR:
205			pr_cont("store.\n");
206			break;
207		case R4_DRD:
208			pr_cont("load.\n");
209			break;
210		default:
211			ret = false;
212		}
213	} else {
214		ret = false;
215	}
216
217	return ret;
218}
219
220static bool f15h_dc_mce(u16 ec, u8 xec)
221{
222	bool ret = true;
223
224	if (MEM_ERROR(ec)) {
225
226		switch (xec) {
227		case 0x0:
228			pr_cont("Data Array access error.\n");
229			break;
230
231		case 0x1:
232			pr_cont("UC error during a linefill from L2/NB.\n");
233			break;
234
235		case 0x2:
236		case 0x11:
237			pr_cont("STQ access error.\n");
238			break;
239
240		case 0x3:
241			pr_cont("SCB access error.\n");
242			break;
243
244		case 0x10:
245			pr_cont("Tag error.\n");
246			break;
247
248		case 0x12:
249			pr_cont("LDQ access error.\n");
250			break;
251
252		default:
253			ret = false;
254		}
255	} else if (BUS_ERROR(ec)) {
256
257		if (!xec)
258			pr_cont("during system linefill.\n");
 
 
 
 
 
259		else
260			pr_cont(" Internal %s condition.\n",
261				((xec == 1) ? "livelock" : "deadlock"));
262	} else
263		ret = false;
264
265	return ret;
266}
267
268static void amd_decode_dc_mce(struct mce *m)
269{
270	u16 ec = EC(m->status);
271	u8 xec = XEC(m->status, xec_mask);
272
273	pr_emerg(HW_ERR "Data Cache Error: ");
274
275	/* TLB error signatures are the same across families */
276	if (TLB_ERROR(ec)) {
277		if (TT(ec) == TT_DATA) {
278			pr_cont("%s TLB %s.\n", LL_MSG(ec),
279				((xec == 2) ? "locked miss"
280					    : (xec ? "multimatch" : "parity")));
281			return;
282		}
283	} else if (fam_ops->dc_mce(ec, xec))
284		;
285	else
286		pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
287}
288
289static bool k8_ic_mce(u16 ec, u8 xec)
290{
291	u8 ll	 = LL(ec);
292	bool ret = true;
293
294	if (!MEM_ERROR(ec))
295		return false;
296
297	if (ll == 0x2)
298		pr_cont("during a linefill from L2.\n");
299	else if (ll == 0x1) {
300		switch (R4(ec)) {
301		case R4_IRD:
302			pr_cont("Parity error during data load.\n");
303			break;
304
305		case R4_EVICT:
306			pr_cont("Copyback Parity/Victim error.\n");
307			break;
308
309		case R4_SNOOP:
310			pr_cont("Tag Snoop error.\n");
311			break;
312
313		default:
314			ret = false;
315			break;
316		}
317	} else
318		ret = false;
319
320	return ret;
321}
322
323static bool f14h_ic_mce(u16 ec, u8 xec)
324{
325	u8 r4    = R4(ec);
326	bool ret = true;
327
328	if (MEM_ERROR(ec)) {
329		if (TT(ec) != 0 || LL(ec) != 1)
330			ret = false;
 
 
 
 
 
 
 
 
 
 
 
 
 
331
332		if (r4 == R4_IRD)
333			pr_cont("Data/tag array parity error for a tag hit.\n");
334		else if (r4 == R4_SNOOP)
335			pr_cont("Tag error during snoop/victimization.\n");
336		else
337			ret = false;
338	}
339	return ret;
340}
341
342static bool f15h_ic_mce(u16 ec, u8 xec)
343{
344	bool ret = true;
345
346	if (!MEM_ERROR(ec))
347		return false;
348
349	switch (xec) {
350	case 0x0 ... 0xa:
351		pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
352		break;
353
354	case 0xd:
355		pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
 
 
 
 
356		break;
357
358	case 0x10 ... 0x14:
359		pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
360		break;
361
362	default:
363		ret = false;
364	}
365	return ret;
366}
367
368static void amd_decode_ic_mce(struct mce *m)
369{
370	u16 ec = EC(m->status);
371	u8 xec = XEC(m->status, xec_mask);
372
373	pr_emerg(HW_ERR "Instruction Cache Error: ");
374
375	if (TLB_ERROR(ec))
376		pr_cont("%s TLB %s.\n", LL_MSG(ec),
377			(xec ? "multimatch" : "parity error"));
378	else if (BUS_ERROR(ec)) {
379		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
380
381		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
382	} else if (fam_ops->ic_mce(ec, xec))
 
 
 
 
 
383		;
384	else
385		pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
 
 
 
 
 
386}
387
388static void amd_decode_bu_mce(struct mce *m)
389{
390	u16 ec = EC(m->status);
391	u8 xec = XEC(m->status, xec_mask);
392
393	pr_emerg(HW_ERR "Bus Unit Error");
394
395	if (xec == 0x1)
396		pr_cont(" in the write data buffers.\n");
397	else if (xec == 0x3)
398		pr_cont(" in the victim data buffers.\n");
399	else if (xec == 0x2 && MEM_ERROR(ec))
400		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
401	else if (xec == 0x0) {
402		if (TLB_ERROR(ec))
403			pr_cont(": %s error in a Page Descriptor Cache or "
404				"Guest TLB.\n", TT_MSG(ec));
405		else if (BUS_ERROR(ec))
406			pr_cont(": %s/ECC error in data read from NB: %s.\n",
407				R4_MSG(ec), PP_MSG(ec));
408		else if (MEM_ERROR(ec)) {
409			u8 r4 = R4(ec);
410
411			if (r4 >= 0x7)
412				pr_cont(": %s error during data copyback.\n",
413					R4_MSG(ec));
414			else if (r4 <= 0x1)
415				pr_cont(": %s parity/ECC error during data "
416					"access from L2.\n", R4_MSG(ec));
417			else
418				goto wrong_bu_mce;
419		} else
420			goto wrong_bu_mce;
421	} else
422		goto wrong_bu_mce;
423
424	return;
425
426wrong_bu_mce:
427	pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
428}
429
430static void amd_decode_cu_mce(struct mce *m)
431{
432	u16 ec = EC(m->status);
433	u8 xec = XEC(m->status, xec_mask);
434
435	pr_emerg(HW_ERR "Combined Unit Error: ");
436
437	if (TLB_ERROR(ec)) {
438		if (xec == 0x0)
439			pr_cont("Data parity TLB read error.\n");
440		else if (xec == 0x1)
441			pr_cont("Poison data provided for TLB fill.\n");
442		else
443			goto wrong_cu_mce;
444	} else if (BUS_ERROR(ec)) {
445		if (xec > 2)
446			goto wrong_cu_mce;
447
448		pr_cont("Error during attempted NB data read.\n");
449	} else if (MEM_ERROR(ec)) {
450		switch (xec) {
451		case 0x4 ... 0xc:
452			pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]);
453			break;
454
455		case 0x10 ... 0x14:
456			pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]);
457			break;
458
459		default:
460			goto wrong_cu_mce;
461		}
 
 
 
 
 
462	}
463
464	return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
466wrong_cu_mce:
467	pr_emerg(HW_ERR "Corrupted CU MCE info?\n");
468}
469
470static void amd_decode_ls_mce(struct mce *m)
471{
472	u16 ec = EC(m->status);
473	u8 xec = XEC(m->status, xec_mask);
474
475	if (boot_cpu_data.x86 >= 0x14) {
476		pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
477			 " please report on LKML.\n");
478		return;
479	}
480
481	pr_emerg(HW_ERR "Load Store Error");
482
483	if (xec == 0x0) {
484		u8 r4 = R4(ec);
485
486		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
487			goto wrong_ls_mce;
488
489		pr_cont(" during %s.\n", R4_MSG(ec));
490	} else
491		goto wrong_ls_mce;
492
493	return;
494
495wrong_ls_mce:
496	pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
497}
498
499static bool k8_nb_mce(u16 ec, u8 xec)
500{
501	bool ret = true;
 
 
 
 
 
 
502
503	switch (xec) {
504	case 0x1:
505		pr_cont("CRC error detected on HT link.\n");
506		break;
507
508	case 0x5:
509		pr_cont("Invalid GART PTE entry during GART table walk.\n");
510		break;
 
 
511
512	case 0x6:
513		pr_cont("Unsupported atomic RMW received from an IO link.\n");
514		break;
515
516	case 0x0:
517	case 0x8:
518		if (boot_cpu_data.x86 == 0x11)
519			return false;
520
521		pr_cont("DRAM ECC error detected on the NB.\n");
522		break;
523
524	case 0xd:
525		pr_cont("Parity error on the DRAM addr/ctl signals.\n");
526		break;
527
528	default:
529		ret = false;
530		break;
531	}
532
533	return ret;
534}
535
536static bool f10h_nb_mce(u16 ec, u8 xec)
537{
538	bool ret = true;
539	u8 offset = 0;
540
541	if (k8_nb_mce(ec, xec))
542		return true;
543
544	switch(xec) {
545	case 0xa ... 0xc:
546		offset = 10;
547		break;
548
549	case 0xe:
550		offset = 11;
551		break;
552
553	case 0xf:
554		if (TLB_ERROR(ec))
555			pr_cont("GART Table Walk data error.\n");
556		else if (BUS_ERROR(ec))
557			pr_cont("DMA Exclusion Vector Table Walk error.\n");
558		else
559			ret = false;
560
561		goto out;
562		break;
563
564	case 0x19:
565		if (boot_cpu_data.x86 == 0x15)
566			pr_cont("Compute Unit Data Error.\n");
567		else
568			ret = false;
569
570		goto out;
571		break;
572
573	case 0x1c ... 0x1f:
574		offset = 24;
575		break;
576
577	default:
578		ret = false;
579
580		goto out;
581		break;
582	}
583
584	pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
 
585
586out:
587	return ret;
588}
589
/* Families without decodable NB signatures (F12h, F14h) use this stub. */
static bool nb_noop_mce(u16 ec, u8 xec)
{
	return false;
}
 
 
 
 
594
/*
 * Decode a Northbridge (bank 4) MCE: report the node and, when the
 * hardware recorded one, the core associated with the error; print the
 * extended-error-code message and hand DRAM ECC errors on to the
 * registered bus decoder.
 */
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	u16 ec   = EC(m->status);
	u8 xec   = XEC(m->status, 0x1f);	/* NB always uses a 5-bit xec mask */
	u32 nbsh = (u32)(m->status >> 32);	/* high half of MCi_STATUS */
	int core = -1;				/* -1: no core information */

	pr_emerg(HW_ERR "Northbridge Error (node %d", node_id);

	/* F10h, revD can disable ErrCpu[3:0] through ErrCpuVal */
	if (c->x86 == 0x10 && c->x86_model > 7) {
		if (nbsh & NBSH_ERR_CPU_VAL)
			core = nbsh & nb_err_cpumask;
	} else {
		u8 assoc_cpus = nbsh & nb_err_cpumask;

		/* Report the highest-numbered core flagged in the mask. */
		if (assoc_cpus > 0)
			core = fls(assoc_cpus) - 1;
	}

	if (core >= 0)
		pr_cont(", core %d): ", core);
	else
		pr_cont("): ");

	/* These signatures are decoded the same way on all families. */
	switch (xec) {
	case 0x2:
		pr_cont("Sync error (sync packets on HT link detected).\n");
		return;

	case 0x3:
		pr_cont("HT Master abort.\n");
		return;

	case 0x4:
		pr_cont("HT Target abort.\n");
		return;

	case 0x7:
		pr_cont("NB Watchdog timeout.\n");
		return;

	case 0x9:
		pr_cont("SVM DMA Exclusion Vector error.\n");
		return;

	default:
		break;
	}

	if (!fam_ops->nb_mce(ec, xec))
		goto wrong_nb_mce;

	/* Pass DRAM ECC errors (xec 0x0/0x8) to the registered decoder. */
	if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x15)
		if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
			nb_bus_decoder(node_id, m, nbcfg);

	return;

wrong_nb_mce:
	pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
}
658EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
659
660static void amd_decode_fr_mce(struct mce *m)
661{
662	struct cpuinfo_x86 *c = &boot_cpu_data;
663	u8 xec = XEC(m->status, xec_mask);
664
665	if (c->x86 == 0xf || c->x86 == 0x11)
666		goto wrong_fr_mce;
 
 
667
668	if (c->x86 != 0x15 && xec != 0x0)
669		goto wrong_fr_mce;
670
671	pr_emerg(HW_ERR "%s Error: ",
672		 (c->x86 == 0x15 ? "Execution Unit" : "FIROB"));
 
673
674	if (xec == 0x0 || xec == 0xc)
675		pr_cont("%s.\n", fr_ex_mce_desc[xec]);
676	else if (xec < 0xd)
677		pr_cont("%s parity error.\n", fr_ex_mce_desc[xec]);
678	else
679		goto wrong_fr_mce;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
680
681	return;
 
 
 
682
683wrong_fr_mce:
684	pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
685}
686
687static void amd_decode_fp_mce(struct mce *m)
 
688{
 
689	u8 xec = XEC(m->status, xec_mask);
690
691	pr_emerg(HW_ERR "Floating Point Unit Error: ");
 
692
693	switch (xec) {
694	case 0x1:
695		pr_cont("Free List");
696		break;
697
698	case 0x2:
699		pr_cont("Physical Register File");
700		break;
701
702	case 0x3:
703		pr_cont("Retire Queue");
704		break;
705
706	case 0x4:
707		pr_cont("Scheduler table");
708		break;
709
710	case 0x5:
711		pr_cont("Status Register File");
712		break;
713
714	default:
715		goto wrong_fp_mce;
716		break;
717	}
718
719	pr_cont(" parity error.\n");
720
721	return;
722
723wrong_fp_mce:
724	pr_emerg(HW_ERR "Corrupted FP MCE info?\n");
 
725}
726
727static inline void amd_decode_err_code(u16 ec)
728{
 
 
 
 
729
730	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
731
732	if (BUS_ERROR(ec))
733		pr_cont(", mem/io: %s", II_MSG(ec));
734	else
735		pr_cont(", tx: %s", TT_MSG(ec));
736
737	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
738		pr_cont(", mem-tx: %s", R4_MSG(ec));
739
740		if (BUS_ERROR(ec))
741			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
742	}
743
744	pr_cont("\n");
745}
746
747/*
748 * Filter out unwanted MCE signatures here.
749 */
750static bool amd_filter_mce(struct mce *m)
751{
752	u8 xec = (m->status >> 16) & 0x1f;
 
 
 
 
 
 
753
754	/*
755	 * NB GART TLB error reporting is disabled by default.
756	 */
757	if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
758		return true;
759
760	return false;
761}
762
/*
 * Notifier callback: decode an AMD MCE for the console log.
 *
 * Prints the raw MCi_STATUS flags, dispatches to the per-bank decoder
 * (DC/IC/BU-or-CU/LS/NB/FR/FP) and finally decodes the generic error
 * code in the low 16 status bits.  Always returns NOTIFY_STOP so no
 * further notifier re-decodes the same event.
 */
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct cpuinfo_x86 *c = &boot_cpu_data;
	int node, ecc;

	/* Drop uninteresting signatures (e.g. disabled GART reporting). */
	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "MC%d_STATUS[%s|%s|%s|%s|%s",
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));

	/* F15h adds Deferred and Poison status bits. */
	if (c->x86 == 0x15)
		pr_cont("|%s|%s",
			((m->status & BIT_64(44)) ? "Deferred" : "-"),
			((m->status & BIT_64(43)) ? "Poison"   : "-"));

	/* do the two ECC bits (status bits 46:45, i.e. 14:13 of the high half) together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	/* The bank number identifies the hardware unit that reported. */
	switch (m->bank) {
	case 0:
		amd_decode_dc_mce(m);
		break;

	case 1:
		amd_decode_ic_mce(m);
		break;

	case 2:
		/* Bank 2 is the Combined Unit on F15h, the Bus Unit before. */
		if (c->x86 == 0x15)
			amd_decode_cu_mce(m);
		else
			amd_decode_bu_mce(m);
		break;

	case 3:
		amd_decode_ls_mce(m);
		break;

	case 4:
		node = amd_get_nb_id(m->extcpu);
		amd_decode_nb_mce(node, m, 0);
		break;

	case 5:
		amd_decode_fr_mce(m);
		break;

	case 6:
		amd_decode_fp_mce(m);
		break;

	default:
		break;
	}

	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
834EXPORT_SYMBOL_GPL(amd_decode_mce);
835
/* Hook amd_decode_mce() into the x86 MCE decoder notifier chain. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};
839
840static int __init mce_amd_init(void)
841{
842	struct cpuinfo_x86 *c = &boot_cpu_data;
843
844	if (c->x86_vendor != X86_VENDOR_AMD)
845		return 0;
 
846
847	if ((c->x86 < 0xf || c->x86 > 0x12) &&
848	    (c->x86 != 0x14 || c->x86_model > 0xf) &&
849	    (c->x86 != 0x15 || c->x86_model > 0xf))
850		return 0;
851
852	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
853	if (!fam_ops)
854		return -ENOMEM;
855
856	switch (c->x86) {
857	case 0xf:
858		fam_ops->dc_mce = k8_dc_mce;
859		fam_ops->ic_mce = k8_ic_mce;
860		fam_ops->nb_mce = k8_nb_mce;
861		break;
862
863	case 0x10:
864		fam_ops->dc_mce = f10h_dc_mce;
865		fam_ops->ic_mce = k8_ic_mce;
866		fam_ops->nb_mce = f10h_nb_mce;
867		break;
868
869	case 0x11:
870		fam_ops->dc_mce = k8_dc_mce;
871		fam_ops->ic_mce = k8_ic_mce;
872		fam_ops->nb_mce = f10h_nb_mce;
873		break;
874
875	case 0x12:
876		fam_ops->dc_mce = f12h_dc_mce;
877		fam_ops->ic_mce = k8_ic_mce;
878		fam_ops->nb_mce = nb_noop_mce;
879		break;
880
881	case 0x14:
882		nb_err_cpumask  = 0x3;
883		fam_ops->dc_mce = f14h_dc_mce;
884		fam_ops->ic_mce = f14h_ic_mce;
885		fam_ops->nb_mce = nb_noop_mce;
886		break;
887
888	case 0x15:
 
 
 
 
 
 
 
 
889		xec_mask = 0x1f;
890		fam_ops->dc_mce = f15h_dc_mce;
891		fam_ops->ic_mce = f15h_ic_mce;
892		fam_ops->nb_mce = f10h_nb_mce;
893		break;
894
 
 
 
 
 
895	default:
896		printk(KERN_WARNING "Huh? What family is that: %d?!\n", c->x86);
897		kfree(fam_ops);
898		return -EINVAL;
899	}
900
 
901	pr_info("MCE: In-kernel MCE decoding enabled.\n");
902
903	atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
904
905	return 0;
906}
907early_initcall(mce_amd_init);
908
909#ifdef MODULE
/* Module teardown: stop receiving MCEs, then free the per-family ops. */
static void __exit mce_amd_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
	kfree(fam_ops);
}
915
916MODULE_DESCRIPTION("AMD MCE decoder");
917MODULE_ALIAS("edac-mce-amd");
918MODULE_LICENSE("GPL");
919module_exit(mce_amd_exit);
920#endif