1// SPDX-License-Identifier: GPL-2.0-only
2#include <linux/module.h>
3#include <linux/slab.h>
4
5#include <asm/cpu.h>
6
7#include "mce_amd.h"
8
9static struct amd_decoder_ops fam_ops;
10
11static u8 xec_mask = 0xf;
12
13static void (*decode_dram_ecc)(int node_id, struct mce *m);
14
/* Register the DRAM ECC decoder callback (supplied by a DRAM EDAC driver). */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
20
/*
 * Unregister the DRAM ECC decoder.  Warns if a different callback than the
 * registered one is passed in, but clears the hook either way.
 */
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (decode_dram_ecc) {
		WARN_ON(decode_dram_ecc != f);

		decode_dram_ecc = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
30
31/*
32 * string representation for the different MCA reported error types, see F3x48
33 * or MSR0000_0411.
34 */
35
36/* transaction type */
37static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
38
39/* cache level */
40static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
41
42/* memory transaction type */
43static const char * const rrrr_msgs[] = {
44 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
45};
46
47/* participating processor */
48const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
49EXPORT_SYMBOL_GPL(pp_msgs);
50
51/* request timeout */
52static const char * const to_msgs[] = { "no timeout", "timed out" };
53
54/* memory or i/o */
55static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
56
57/* internal error type */
58static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
59
60static const char * const f15h_mc1_mce_desc[] = {
61 "UC during a demand linefill from L2",
62 "Parity error during data load from IC",
63 "Parity error for IC valid bit",
64 "Main tag parity error",
65 "Parity error in prediction queue",
66 "PFB data/address parity error",
67 "Parity error in the branch status reg",
68 "PFB promotion address error",
69 "Tag error during probe/victimization",
70 "Parity error for IC probe tag valid bit",
71 "PFB non-cacheable bit parity error",
72 "PFB valid bit parity error", /* xec = 0xd */
73 "Microcode Patch Buffer", /* xec = 010 */
74 "uop queue",
75 "insn buffer",
76 "predecode buffer",
77 "fetch address FIFO",
78 "dispatch uop queue"
79};
80
81static const char * const f15h_mc2_mce_desc[] = {
82 "Fill ECC error on data fills", /* xec = 0x4 */
83 "Fill parity error on insn fills",
84 "Prefetcher request FIFO parity error",
85 "PRQ address parity error",
86 "PRQ data parity error",
87 "WCC Tag ECC error",
88 "WCC Data ECC error",
89 "WCB Data parity error",
90 "VB Data ECC or parity error",
91 "L2 Tag ECC error", /* xec = 0x10 */
92 "Hard L2 Tag ECC error",
93 "Multiple hits on L2 tag",
94 "XAB parity error",
95 "PRB address parity error"
96};
97
98static const char * const mc4_mce_desc[] = {
99 "DRAM ECC error detected on the NB",
100 "CRC error detected on HT link",
101 "Link-defined sync error packets detected on HT link",
102 "HT Master abort",
103 "HT Target abort",
104 "Invalid GART PTE entry during GART table walk",
105 "Unsupported atomic RMW received from an IO link",
106 "Watchdog timeout due to lack of progress",
107 "DRAM ECC error detected on the NB",
108 "SVM DMA Exclusion Vector error",
109 "HT data error detected on link",
110 "Protocol error (link, L3, probe filter)",
111 "NB internal arrays parity error",
112 "DRAM addr/ctl signals parity error",
113 "IO link transmission error",
114 "L3 data cache ECC error", /* xec = 0x1c */
115 "L3 cache tag error",
116 "L3 LRU parity bits error",
117 "ECC Error in the Probe Filter directory"
118};
119
120static const char * const mc5_mce_desc[] = {
121 "CPU Watchdog timer expire",
122 "Wakeup array dest tag",
123 "AG payload array",
124 "EX payload array",
125 "IDRF array",
126 "Retire dispatch queue",
127 "Mapper checkpoint array",
128 "Physical register file EX0 port",
129 "Physical register file EX1 port",
130 "Physical register file AG0 port",
131 "Physical register file AG1 port",
132 "Flag register file",
133 "DE error occurred",
134 "Retire status queue"
135};
136
137static const char * const mc6_mce_desc[] = {
138 "Hardware Assertion",
139 "Free List",
140 "Physical Register File",
141 "Retire Queue",
142 "Scheduler table",
143 "Status Register File",
144};
145
146static bool f12h_mc0_mce(u16 ec, u8 xec)
147{
148 bool ret = false;
149
150 if (MEM_ERROR(ec)) {
151 u8 ll = LL(ec);
152 ret = true;
153
154 if (ll == LL_L2)
155 pr_cont("during L1 linefill from L2.\n");
156 else if (ll == LL_L1)
157 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
158 else
159 ret = false;
160 }
161 return ret;
162}
163
164static bool f10h_mc0_mce(u16 ec, u8 xec)
165{
166 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
167 pr_cont("during data scrub.\n");
168 return true;
169 }
170 return f12h_mc0_mce(ec, xec);
171}
172
173static bool k8_mc0_mce(u16 ec, u8 xec)
174{
175 if (BUS_ERROR(ec)) {
176 pr_cont("during system linefill.\n");
177 return true;
178 }
179
180 return f10h_mc0_mce(ec, xec);
181}
182
/*
 * Decode an MC0 (data cache) error on the "cat" cores (Fam14h/Fam16h).
 * Returns true when the error signature was recognized and printed.
 */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* DC memory errors are only defined for DATA transactions at L1. */
		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		/* Bus errors must target mem or IO at the "generic" cache level. */
		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}
234
/*
 * Decode an MC0 (data cache) error on Fam15h.  Memory errors map the
 * extended error code to a specific DC substructure; bus and internal
 * errors have fixed signatures of their own.
 */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		/* Internal errors: only hardware asserts (xec 0x0-0x1f) are defined. */
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}
287
/* Top-level MC0 (data cache) bank decoder: TLB first, then per-family. */
static void decode_mc0_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC0 Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
				: (xec ? "multimatch" : "parity")));
			return;
		}
	} else if (fam_ops.mc0_mce(ec, xec))
		;	/* decoded by the per-family handler */
	else
		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
}
308
309static bool k8_mc1_mce(u16 ec, u8 xec)
310{
311 u8 ll = LL(ec);
312 bool ret = true;
313
314 if (!MEM_ERROR(ec))
315 return false;
316
317 if (ll == 0x2)
318 pr_cont("during a linefill from L2.\n");
319 else if (ll == 0x1) {
320 switch (R4(ec)) {
321 case R4_IRD:
322 pr_cont("Parity error during data load.\n");
323 break;
324
325 case R4_EVICT:
326 pr_cont("Copyback Parity/Victim error.\n");
327 break;
328
329 case R4_SNOOP:
330 pr_cont("Tag Snoop error.\n");
331 break;
332
333 default:
334 ret = false;
335 break;
336 }
337 } else
338 ret = false;
339
340 return ret;
341}
342
343static bool cat_mc1_mce(u16 ec, u8 xec)
344{
345 u8 r4 = R4(ec);
346 bool ret = true;
347
348 if (!MEM_ERROR(ec))
349 return false;
350
351 if (TT(ec) != TT_INSTR)
352 return false;
353
354 if (r4 == R4_IRD)
355 pr_cont("Data/tag array parity error for a tag hit.\n");
356 else if (r4 == R4_SNOOP)
357 pr_cont("Tag error during snoop/victimization.\n");
358 else if (xec == 0x0)
359 pr_cont("Tag parity error from victim castout.\n");
360 else if (xec == 0x2)
361 pr_cont("Microcode patch RAM parity error.\n");
362 else
363 ret = false;
364
365 return ret;
366}
367
/*
 * Decode an MC1 (instruction cache) error on Fam15h.  The xec space is
 * sparse, so some case ranges index f15h_mc1_mce_desc[] with an offset
 * that skips the unused codes.
 */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		/* xec 0xb/0xc are undefined, hence the -2 offset. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		/* skips the 0xb/0xc and 0xe/0xf gaps: -4 offset. */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}
397
/* Top-level MC1 (instruction cache) bank decoder. */
static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		/* K8 (Fam0Fh) uses status bit 58 to flag a system linefill. */
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops.mc1_mce(ec, xec))
		;	/* decoded by the per-family handler */
	else
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}
427
/*
 * Decode an MC2 (bus unit / L2) error on K8-class parts.  The extended
 * error code selects the affected buffer; xec == 0 needs the low error
 * code to disambiguate TLB vs. bus vs. memory signatures.
 */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			/* r4 >= 0x7: eviction/snoop; r4 <= 0x1: generic/read */
			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}
463
/*
 * Decode an MC2 (combined unit / L2) error on Fam15h.  Memory errors
 * index f15h_mc2_mce_desc[]: xec 0x4-0xc map directly (base 0x4) and
 * xec 0x10-0x14 continue at table entry 9 (hence the -0x7 offset).
 */
static bool f15h_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			ret = false;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			ret = false;

		/* printed even for unknown xec; the caller then flags corruption */
		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
			break;

		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
			break;

		default:
			ret = false;
		}
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;
	}

	return ret;
}
502
/*
 * Decode an MC2 (L2 cache) error on Fam16h.  Only memory errors are
 * defined; the memory transaction type (r4) qualifies which L2
 * substructure and access kind was affected.
 */
static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN)   ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
			((r4 == R4_GEN)   ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD)  ? "Hit"  :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}
543
544static void decode_mc2_mce(struct mce *m)
545{
546 u16 ec = EC(m->status);
547 u8 xec = XEC(m->status, xec_mask);
548
549 pr_emerg(HW_ERR "MC2 Error: ");
550
551 if (!fam_ops.mc2_mce(ec, xec))
552 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
553}
554
/*
 * Decode an MC3 bank error.  MC3 exists only on pre-Fam14h parts; the
 * only defined signatures are bus-error DRD/DWR with xec == 0.
 */
static void decode_mc3_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (boot_cpu_data.x86 >= 0x14) {
		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "MC3 Error");

	if (xec == 0x0) {
		u8 r4 = R4(ec);

		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
			goto wrong_mc3_mce;

		pr_cont(" during %s.\n", R4_MSG(ec));
	} else
		goto wrong_mc3_mce;

	return;

 wrong_mc3_mce:
	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
}
583
/*
 * Decode an MC4 (northbridge) error.  DRAM ECC errors (xec 0x0/0x8) are
 * forwarded to the registered DRAM ECC decoder; xec 0x1c-0x1f index the
 * tail of mc4_mce_desc[] via an offset of 13 (0x1c - 13 == entry 15).
 */
static void decode_mc4_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	int node_id = topology_die_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);	/* NB always uses a 5-bit xec */
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (fam == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (decode_dram_ecc)
				decode_dram_ecc(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		/* only Fam15h/Fam16h report compute unit data errors here */
		if (fam == 0x15 || fam == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

 wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}
641
/*
 * Decode an MC5 (execution unit) error.  MC5 does not exist on Fam0Fh
 * or Fam11h.  xec 0x0 and 0xc are full messages in mc5_mce_desc[];
 * the other entries name the array that took a parity error.
 */
static void decode_mc5_mce(struct mce *m)
{
	unsigned int fam = x86_family(m->cpuid);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (fam == 0xf || fam == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

 wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}
673
674static void decode_mc6_mce(struct mce *m)
675{
676 u8 xec = XEC(m->status, xec_mask);
677
678 pr_emerg(HW_ERR "MC6 Error: ");
679
680 if (xec > 0x5)
681 goto wrong_mc6_mce;
682
683 pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
684 return;
685
686 wrong_mc6_mce:
687 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
688}
689
690static const char * const smca_long_names[] = {
691 [SMCA_LS ... SMCA_LS_V2] = "Load Store Unit",
692 [SMCA_IF] = "Instruction Fetch Unit",
693 [SMCA_L2_CACHE] = "L2 Cache",
694 [SMCA_DE] = "Decode Unit",
695 [SMCA_RESERVED] = "Reserved",
696 [SMCA_EX] = "Execution Unit",
697 [SMCA_FP] = "Floating Point Unit",
698 [SMCA_L3_CACHE] = "L3 Cache",
699 [SMCA_CS ... SMCA_CS_V2] = "Coherent Slave",
700 [SMCA_PIE] = "Power, Interrupts, etc.",
701
702 /* UMC v2 is separate because both of them can exist in a single system. */
703 [SMCA_UMC] = "Unified Memory Controller",
704 [SMCA_UMC_V2] = "Unified Memory Controller v2",
705 [SMCA_PB] = "Parameter Block",
706 [SMCA_PSP ... SMCA_PSP_V2] = "Platform Security Processor",
707 [SMCA_SMU ... SMCA_SMU_V2] = "System Management Unit",
708 [SMCA_MP5] = "Microprocessor 5 Unit",
709 [SMCA_MPDMA] = "MPDMA Unit",
710 [SMCA_NBIO] = "Northbridge IO Unit",
711 [SMCA_PCIE ... SMCA_PCIE_V2] = "PCI Express Unit",
712 [SMCA_XGMI_PCS] = "Ext Global Memory Interconnect PCS Unit",
713 [SMCA_NBIF] = "NBIF Unit",
714 [SMCA_SHUB] = "System Hub Unit",
715 [SMCA_SATA] = "SATA Unit",
716 [SMCA_USB] = "USB Unit",
717 [SMCA_GMI_PCS] = "Global Memory Interconnect PCS Unit",
718 [SMCA_XGMI_PHY] = "Ext Global Memory Interconnect PHY Unit",
719 [SMCA_WAFL_PHY] = "WAFL PHY Unit",
720 [SMCA_GMI_PHY] = "Global Memory Interconnect PHY Unit",
721};
722
723static const char *smca_get_long_name(enum smca_bank_types t)
724{
725 if (t >= N_SMCA_BANK_TYPES)
726 return NULL;
727
728 return smca_long_names[t];
729}
730
731/* Decode errors according to Scalable MCA specification */
/* Decode errors according to Scalable MCA specification */
static void decode_smca_error(struct mce *m)
{
	enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
	u8 xec = XEC(m->status, xec_mask);

	if (bank_type >= N_SMCA_BANK_TYPES)
		return;

	if (bank_type == SMCA_RESERVED) {
		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
		return;
	}

	pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec);

	/* xec == 0 on a UMC bank is a DRAM ECC error: hand off to the DRAM decoder. */
	if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
	    xec == 0 && decode_dram_ecc)
		decode_dram_ecc(topology_die_id(m->extcpu), m);
}
751
/*
 * Pretty-print the generic (low 16-bit) MCA error code fields: cache
 * level, transaction type, and for bus errors the participation and
 * timeout qualifiers.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}
775
776static const char *decode_error_status(struct mce *m)
777{
778 if (m->status & MCI_STATUS_UC) {
779 if (m->status & MCI_STATUS_PCC)
780 return "System Fatal error.";
781 if (m->mcgstatus & MCG_STATUS_RIPV)
782 return "Uncorrected, software restartable error.";
783 return "Uncorrected, software containable error.";
784 }
785
786 if (m->status & MCI_STATUS_DEFERRED)
787 return "Deferred error, no action required.";
788
789 return "Corrected error, no action required.";
790}
791
/*
 * MCE decode-chain notifier callback: pretty-print an AMD MCE record.
 * Prints severity, the MCi_STATUS flag summary, address/PPIN, then
 * either the SMCA decode or the legacy per-bank decode, and finally the
 * generic error-code fields.  Returns NOTIFY_DONE if the CEC already
 * handled the record, NOTIFY_OK otherwise.
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int fam = x86_family(m->cpuid);
	int ecc;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
		((m->status & MCI_STATUS_UC) ? "UE" :
		 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
		((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		/* TCC is only meaningful when the bank is in MCAX mode. */
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	/* SMCA parts take a different decode path entirely. */
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}
908
909static struct notifier_block amd_mce_dec_nb = {
910 .notifier_call = amd_decode_mce,
911 .priority = MCE_PRIO_EDAC,
912};
913
/*
 * Module init: select the per-family decoder callbacks and xec mask,
 * then hook into the MCE decode chain.  Bails out on non-AMD/Hygon
 * CPUs, under hypervisors, and on families without decode support.
 */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD &&
	    c->x86_vendor != X86_VENDOR_HYGON)
		return -ENODEV;

	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
		return -ENODEV;

	/* SMCA parts use decode_smca_error(); no fam_ops needed. */
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		xec_mask = 0x3f;
		goto out;
	}

	switch (c->x86) {
	case 0xf:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops.mc0_mce = f10h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops.mc0_mce = k8_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops.mc0_mce = f12h_mc0_mce;
		fam_ops.mc1_mce = k8_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		/* Fam15h model 0x60 has a wider extended error code field. */
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops.mc0_mce = f15h_mc0_mce;
		fam_ops.mc1_mce = f15h_mc1_mce;
		fam_ops.mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops.mc0_mce = cat_mc0_mce;
		fam_ops.mc1_mce = cat_mc1_mce;
		fam_ops.mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
	case 0x18:
		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
		return -EINVAL;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		return -EINVAL;
	}

out:
	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;
}
994
995#ifdef MODULE
996static void __exit mce_amd_exit(void)
997{
998 mce_unregister_decode_chain(&amd_mce_dec_nb);
999}
1000
1001MODULE_DESCRIPTION("AMD MCE decoder");
1002MODULE_ALIAS("edac-mce-amd");
1003MODULE_LICENSE("GPL");
1004module_exit(mce_amd_exit);
1005#endif
1#include <linux/module.h>
2#include <linux/slab.h>
3
4#include "mce_amd.h"
5
6static struct amd_decoder_ops *fam_ops;
7
8static u8 xec_mask = 0xf;
9static u8 nb_err_cpumask = 0xf;
10
11static bool report_gart_errors;
12static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
13
14void amd_report_gart_errors(bool v)
15{
16 report_gart_errors = v;
17}
18EXPORT_SYMBOL_GPL(amd_report_gart_errors);
19
20void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
21{
22 nb_bus_decoder = f;
23}
24EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
25
26void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
27{
28 if (nb_bus_decoder) {
29 WARN_ON(nb_bus_decoder != f);
30
31 nb_bus_decoder = NULL;
32 }
33}
34EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
35
36/*
37 * string representation for the different MCA reported error types, see F3x48
38 * or MSR0000_0411.
39 */
40
41/* transaction type */
42const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
43EXPORT_SYMBOL_GPL(tt_msgs);
44
45/* cache level */
46const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
47EXPORT_SYMBOL_GPL(ll_msgs);
48
49/* memory transaction type */
50const char *rrrr_msgs[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
52};
53EXPORT_SYMBOL_GPL(rrrr_msgs);
54
55/* participating processor */
56const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
57EXPORT_SYMBOL_GPL(pp_msgs);
58
59/* request timeout */
60const char *to_msgs[] = { "no timeout", "timed out" };
61EXPORT_SYMBOL_GPL(to_msgs);
62
63/* memory or i/o */
64const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
65EXPORT_SYMBOL_GPL(ii_msgs);
66
67static const char *f10h_nb_mce_desc[] = {
68 "HT link data error",
69 "Protocol error (link, L3, probe filter, etc.)",
70 "Parity error in NB-internal arrays",
71 "Link Retry due to IO link transmission error",
72 "L3 ECC data cache error",
73 "ECC error in L3 cache tag",
74 "L3 LRU parity bits error",
75 "ECC Error in the Probe Filter directory"
76};
77
78static const char * const f15h_ic_mce_desc[] = {
79 "UC during a demand linefill from L2",
80 "Parity error during data load from IC",
81 "Parity error for IC valid bit",
82 "Main tag parity error",
83 "Parity error in prediction queue",
84 "PFB data/address parity error",
85 "Parity error in the branch status reg",
86 "PFB promotion address error",
87 "Tag error during probe/victimization",
88 "Parity error for IC probe tag valid bit",
89 "PFB non-cacheable bit parity error",
90 "PFB valid bit parity error", /* xec = 0xd */
91 "patch RAM", /* xec = 010 */
92 "uop queue",
93 "insn buffer",
94 "predecode buffer",
95 "fetch address FIFO"
96};
97
98static const char * const f15h_cu_mce_desc[] = {
99 "Fill ECC error on data fills", /* xec = 0x4 */
100 "Fill parity error on insn fills",
101 "Prefetcher request FIFO parity error",
102 "PRQ address parity error",
103 "PRQ data parity error",
104 "WCC Tag ECC error",
105 "WCC Data ECC error",
106 "WCB Data parity error",
107 "VB Data/ECC error",
108 "L2 Tag ECC error", /* xec = 0x10 */
109 "Hard L2 Tag ECC error",
110 "Multiple hits on L2 tag",
111 "XAB parity error",
112 "PRB address parity error"
113};
114
115static const char * const fr_ex_mce_desc[] = {
116 "CPU Watchdog timer expire",
117 "Wakeup array dest tag",
118 "AG payload array",
119 "EX payload array",
120 "IDRF array",
121 "Retire dispatch queue",
122 "Mapper checkpoint array",
123 "Physical register file EX0 port",
124 "Physical register file EX1 port",
125 "Physical register file AG0 port",
126 "Physical register file AG1 port",
127 "Flag register file",
128 "DE correctable error could not be corrected"
129};
130
131static bool f12h_dc_mce(u16 ec, u8 xec)
132{
133 bool ret = false;
134
135 if (MEM_ERROR(ec)) {
136 u8 ll = LL(ec);
137 ret = true;
138
139 if (ll == LL_L2)
140 pr_cont("during L1 linefill from L2.\n");
141 else if (ll == LL_L1)
142 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
143 else
144 ret = false;
145 }
146 return ret;
147}
148
149static bool f10h_dc_mce(u16 ec, u8 xec)
150{
151 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
152 pr_cont("during data scrub.\n");
153 return true;
154 }
155 return f12h_dc_mce(ec, xec);
156}
157
158static bool k8_dc_mce(u16 ec, u8 xec)
159{
160 if (BUS_ERROR(ec)) {
161 pr_cont("during system linefill.\n");
162 return true;
163 }
164
165 return f10h_dc_mce(ec, xec);
166}
167
168static bool f14h_dc_mce(u16 ec, u8 xec)
169{
170 u8 r4 = R4(ec);
171 bool ret = true;
172
173 if (MEM_ERROR(ec)) {
174
175 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
176 return false;
177
178 switch (r4) {
179 case R4_DRD:
180 case R4_DWR:
181 pr_cont("Data/Tag parity error due to %s.\n",
182 (r4 == R4_DRD ? "load/hw prf" : "store"));
183 break;
184 case R4_EVICT:
185 pr_cont("Copyback parity error on a tag miss.\n");
186 break;
187 case R4_SNOOP:
188 pr_cont("Tag parity error during snoop.\n");
189 break;
190 default:
191 ret = false;
192 }
193 } else if (BUS_ERROR(ec)) {
194
195 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
196 return false;
197
198 pr_cont("System read data error on a ");
199
200 switch (r4) {
201 case R4_RD:
202 pr_cont("TLB reload.\n");
203 break;
204 case R4_DWR:
205 pr_cont("store.\n");
206 break;
207 case R4_DRD:
208 pr_cont("load.\n");
209 break;
210 default:
211 ret = false;
212 }
213 } else {
214 ret = false;
215 }
216
217 return ret;
218}
219
220static bool f15h_dc_mce(u16 ec, u8 xec)
221{
222 bool ret = true;
223
224 if (MEM_ERROR(ec)) {
225
226 switch (xec) {
227 case 0x0:
228 pr_cont("Data Array access error.\n");
229 break;
230
231 case 0x1:
232 pr_cont("UC error during a linefill from L2/NB.\n");
233 break;
234
235 case 0x2:
236 case 0x11:
237 pr_cont("STQ access error.\n");
238 break;
239
240 case 0x3:
241 pr_cont("SCB access error.\n");
242 break;
243
244 case 0x10:
245 pr_cont("Tag error.\n");
246 break;
247
248 case 0x12:
249 pr_cont("LDQ access error.\n");
250 break;
251
252 default:
253 ret = false;
254 }
255 } else if (BUS_ERROR(ec)) {
256
257 if (!xec)
258 pr_cont("during system linefill.\n");
259 else
260 pr_cont(" Internal %s condition.\n",
261 ((xec == 1) ? "livelock" : "deadlock"));
262 } else
263 ret = false;
264
265 return ret;
266}
267
/* Top-level data cache bank decoder: TLB first, then per-family. */
static void amd_decode_dc_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "Data Cache Error: ");

	/* TLB error signatures are the same across families */
	if (TLB_ERROR(ec)) {
		if (TT(ec) == TT_DATA) {
			pr_cont("%s TLB %s.\n", LL_MSG(ec),
				((xec == 2) ? "locked miss"
				: (xec ? "multimatch" : "parity")));
			return;
		}
	} else if (fam_ops->dc_mce(ec, xec))
		;	/* decoded by the per-family handler */
	else
		pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
}
288
289static bool k8_ic_mce(u16 ec, u8 xec)
290{
291 u8 ll = LL(ec);
292 bool ret = true;
293
294 if (!MEM_ERROR(ec))
295 return false;
296
297 if (ll == 0x2)
298 pr_cont("during a linefill from L2.\n");
299 else if (ll == 0x1) {
300 switch (R4(ec)) {
301 case R4_IRD:
302 pr_cont("Parity error during data load.\n");
303 break;
304
305 case R4_EVICT:
306 pr_cont("Copyback Parity/Victim error.\n");
307 break;
308
309 case R4_SNOOP:
310 pr_cont("Tag Snoop error.\n");
311 break;
312
313 default:
314 ret = false;
315 break;
316 }
317 } else
318 ret = false;
319
320 return ret;
321}
322
323static bool f14h_ic_mce(u16 ec, u8 xec)
324{
325 u8 r4 = R4(ec);
326 bool ret = true;
327
328 if (MEM_ERROR(ec)) {
329 if (TT(ec) != 0 || LL(ec) != 1)
330 ret = false;
331
332 if (r4 == R4_IRD)
333 pr_cont("Data/tag array parity error for a tag hit.\n");
334 else if (r4 == R4_SNOOP)
335 pr_cont("Tag error during snoop/victimization.\n");
336 else
337 ret = false;
338 }
339 return ret;
340}
341
/*
 * Decode an IC (instruction cache) error on Fam15h.  The xec space is
 * sparse, so some case ranges index f15h_ic_mce_desc[] with an offset
 * that skips the unused codes.
 */
static bool f15h_ic_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
		break;

	case 0xd:
		/* xec 0xb/0xc are undefined, hence the -2 offset. */
		pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
		break;

	case 0x10 ... 0x14:
		/* skips the 0xb/0xc and 0xe/0xf gaps: -4 offset. */
		pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}
367
368static void amd_decode_ic_mce(struct mce *m)
369{
370 u16 ec = EC(m->status);
371 u8 xec = XEC(m->status, xec_mask);
372
373 pr_emerg(HW_ERR "Instruction Cache Error: ");
374
375 if (TLB_ERROR(ec))
376 pr_cont("%s TLB %s.\n", LL_MSG(ec),
377 (xec ? "multimatch" : "parity error"));
378 else if (BUS_ERROR(ec)) {
379 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
380
381 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
382 } else if (fam_ops->ic_mce(ec, xec))
383 ;
384 else
385 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
386}
387
388static void amd_decode_bu_mce(struct mce *m)
389{
390 u16 ec = EC(m->status);
391 u8 xec = XEC(m->status, xec_mask);
392
393 pr_emerg(HW_ERR "Bus Unit Error");
394
395 if (xec == 0x1)
396 pr_cont(" in the write data buffers.\n");
397 else if (xec == 0x3)
398 pr_cont(" in the victim data buffers.\n");
399 else if (xec == 0x2 && MEM_ERROR(ec))
400 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
401 else if (xec == 0x0) {
402 if (TLB_ERROR(ec))
403 pr_cont(": %s error in a Page Descriptor Cache or "
404 "Guest TLB.\n", TT_MSG(ec));
405 else if (BUS_ERROR(ec))
406 pr_cont(": %s/ECC error in data read from NB: %s.\n",
407 R4_MSG(ec), PP_MSG(ec));
408 else if (MEM_ERROR(ec)) {
409 u8 r4 = R4(ec);
410
411 if (r4 >= 0x7)
412 pr_cont(": %s error during data copyback.\n",
413 R4_MSG(ec));
414 else if (r4 <= 0x1)
415 pr_cont(": %s parity/ECC error during data "
416 "access from L2.\n", R4_MSG(ec));
417 else
418 goto wrong_bu_mce;
419 } else
420 goto wrong_bu_mce;
421 } else
422 goto wrong_bu_mce;
423
424 return;
425
426wrong_bu_mce:
427 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
428}
429
/*
 * Decode a Combined Unit error (MCA bank 2 on family 0x15 — see the bank
 * dispatch in amd_decode_mce()).
 *
 * NOTE(review): if none of the TLB/bus/memory error classes match, the
 * function returns after printing only the "Combined Unit Error: " prefix,
 * with no newline and no "corrupted" message — confirm whether that silent
 * fall-through is intended.
 */
static void amd_decode_cu_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "Combined Unit Error: ");

	if (TLB_ERROR(ec)) {
		if (xec == 0x0)
			pr_cont("Data parity TLB read error.\n");
		else if (xec == 0x1)
			pr_cont("Poison data provided for TLB fill.\n");
		else
			goto wrong_cu_mce;
	} else if (BUS_ERROR(ec)) {
		if (xec > 2)
			goto wrong_cu_mce;

		pr_cont("Error during attempted NB data read.\n");
	} else if (MEM_ERROR(ec)) {
		switch (xec) {
		/* xec 0x4-0xc index the description table directly */
		case 0x4 ... 0xc:
			pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]);
			break;

		/* xec 0x10-0x14 continue the table after the 0xd-0xf gap */
		case 0x10 ... 0x14:
			pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]);
			break;

		default:
			goto wrong_cu_mce;
		}
	}

	return;

wrong_cu_mce:
	pr_emerg(HW_ERR "Corrupted CU MCE info?\n");
}
469
470static void amd_decode_ls_mce(struct mce *m)
471{
472 u16 ec = EC(m->status);
473 u8 xec = XEC(m->status, xec_mask);
474
475 if (boot_cpu_data.x86 >= 0x14) {
476 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
477 " please report on LKML.\n");
478 return;
479 }
480
481 pr_emerg(HW_ERR "Load Store Error");
482
483 if (xec == 0x0) {
484 u8 r4 = R4(ec);
485
486 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
487 goto wrong_ls_mce;
488
489 pr_cont(" during %s.\n", R4_MSG(ec));
490 } else
491 goto wrong_ls_mce;
492
493 return;
494
495wrong_ls_mce:
496 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
497}
498
499static bool k8_nb_mce(u16 ec, u8 xec)
500{
501 bool ret = true;
502
503 switch (xec) {
504 case 0x1:
505 pr_cont("CRC error detected on HT link.\n");
506 break;
507
508 case 0x5:
509 pr_cont("Invalid GART PTE entry during GART table walk.\n");
510 break;
511
512 case 0x6:
513 pr_cont("Unsupported atomic RMW received from an IO link.\n");
514 break;
515
516 case 0x0:
517 case 0x8:
518 if (boot_cpu_data.x86 == 0x11)
519 return false;
520
521 pr_cont("DRAM ECC error detected on the NB.\n");
522 break;
523
524 case 0xd:
525 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
526 break;
527
528 default:
529 ret = false;
530 break;
531 }
532
533 return ret;
534}
535
536static bool f10h_nb_mce(u16 ec, u8 xec)
537{
538 bool ret = true;
539 u8 offset = 0;
540
541 if (k8_nb_mce(ec, xec))
542 return true;
543
544 switch(xec) {
545 case 0xa ... 0xc:
546 offset = 10;
547 break;
548
549 case 0xe:
550 offset = 11;
551 break;
552
553 case 0xf:
554 if (TLB_ERROR(ec))
555 pr_cont("GART Table Walk data error.\n");
556 else if (BUS_ERROR(ec))
557 pr_cont("DMA Exclusion Vector Table Walk error.\n");
558 else
559 ret = false;
560
561 goto out;
562 break;
563
564 case 0x19:
565 if (boot_cpu_data.x86 == 0x15)
566 pr_cont("Compute Unit Data Error.\n");
567 else
568 ret = false;
569
570 goto out;
571 break;
572
573 case 0x1c ... 0x1f:
574 offset = 24;
575 break;
576
577 default:
578 ret = false;
579
580 goto out;
581 break;
582 }
583
584 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
585
586out:
587 return ret;
588}
589
/* Stub for families without decodable NB signatures: never claim the error. */
static bool nb_noop_mce(u16 ec, u8 xec)
{
	return false;
}
594
/*
 * Decode a Northbridge (MCA bank 4) error: report the node and, when the
 * hardware flags one, the associated core; print a description of the
 * error; and hand DRAM ECC errors (xec 0x0/0x8) to the registered bus
 * decoder on families 0xf, 0x10 and 0x15.
 */
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);	/* NB extended error code is 5 bits */
	u32 nbsh = (u32)(m->status >> 32);	/* high half of MCi_STATUS */
	int core = -1;				/* -1: no core attribution */

	pr_emerg(HW_ERR "Northbridge Error (node %d", node_id);

	/* F10h, revD can disable ErrCpu[3:0] through ErrCpuVal */
	if (c->x86 == 0x10 && c->x86_model > 7) {
		if (nbsh & NBSH_ERR_CPU_VAL)
			core = nbsh & nb_err_cpumask;
	} else {
		u8 assoc_cpus = nbsh & nb_err_cpumask;

		/* report the highest-numbered core flagged in the mask */
		if (assoc_cpus > 0)
			core = fls(assoc_cpus) - 1;
	}

	if (core >= 0)
		pr_cont(", core %d): ", core);
	else
		pr_cont("): ");

	/* signatures common to all families are handled right here */
	switch (xec) {
	case 0x2:
		pr_cont("Sync error (sync packets on HT link detected).\n");
		return;

	case 0x3:
		pr_cont("HT Master abort.\n");
		return;

	case 0x4:
		pr_cont("HT Target abort.\n");
		return;

	case 0x7:
		pr_cont("NB Watchdog timeout.\n");
		return;

	case 0x9:
		pr_cont("SVM DMA Exclusion Vector error.\n");
		return;

	default:
		break;
	}

	if (!fam_ops->nb_mce(ec, xec))
		goto wrong_nb_mce;

	if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x15)
		if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
			nb_bus_decoder(node_id, m, nbcfg);

	return;

wrong_nb_mce:
	pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
659
/*
 * Decode an MCA bank 5 error: "FIROB" on pre-family-0x15 parts,
 * "Execution Unit" on family 0x15.
 */
static void amd_decode_fr_mce(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	u8 xec = XEC(m->status, xec_mask);

	/* K8 and family 0x11 have no decodable bank 5 signatures */
	if (c->x86 == 0xf || c->x86 == 0x11)
		goto wrong_fr_mce;

	/* non-zero extended error codes are only valid on family 0x15 */
	if (c->x86 != 0x15 && xec != 0x0)
		goto wrong_fr_mce;

	pr_emerg(HW_ERR "%s Error: ",
		(c->x86 == 0x15 ? "Execution Unit" : "FIROB"));

	/* xec 0x0 and 0xc carry complete descriptions; 0x1-0xb are parity */
	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", fr_ex_mce_desc[xec]);
	else if (xec < 0xd)
		pr_cont("%s parity error.\n", fr_ex_mce_desc[xec]);
	else
		goto wrong_fr_mce;

	return;

wrong_fr_mce:
	pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
}
686
687static void amd_decode_fp_mce(struct mce *m)
688{
689 u8 xec = XEC(m->status, xec_mask);
690
691 pr_emerg(HW_ERR "Floating Point Unit Error: ");
692
693 switch (xec) {
694 case 0x1:
695 pr_cont("Free List");
696 break;
697
698 case 0x2:
699 pr_cont("Physical Register File");
700 break;
701
702 case 0x3:
703 pr_cont("Retire Queue");
704 break;
705
706 case 0x4:
707 pr_cont("Scheduler table");
708 break;
709
710 case 0x5:
711 pr_cont("Status Register File");
712 break;
713
714 default:
715 goto wrong_fp_mce;
716 break;
717 }
718
719 pr_cont(" parity error.\n");
720
721 return;
722
723wrong_fp_mce:
724 pr_emerg(HW_ERR "Corrupted FP MCE info?\n");
725}
726
727static inline void amd_decode_err_code(u16 ec)
728{
729
730 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
731
732 if (BUS_ERROR(ec))
733 pr_cont(", mem/io: %s", II_MSG(ec));
734 else
735 pr_cont(", tx: %s", TT_MSG(ec));
736
737 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
738 pr_cont(", mem-tx: %s", R4_MSG(ec));
739
740 if (BUS_ERROR(ec))
741 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
742 }
743
744 pr_cont("\n");
745}
746
747/*
748 * Filter out unwanted MCE signatures here.
749 */
750static bool amd_filter_mce(struct mce *m)
751{
752 u8 xec = (m->status >> 16) & 0x1f;
753
754 /*
755 * NB GART TLB error reporting is disabled by default.
756 */
757 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
758 return true;
759
760 return false;
761}
762
/*
 * Notifier callback: decode one MCE record.  Prints the raw status flags,
 * dispatches on the MCA bank number to the bank-specific decoder, then
 * prints the generic error-code fields.  Always returns NOTIFY_STOP so no
 * further notifier in the chain re-decodes the record.
 */
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct cpuinfo_x86 *c = &boot_cpu_data;
	int node, ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "MC%d_STATUS[%s|%s|%s|%s|%s",
		m->bank,
		((m->status & MCI_STATUS_OVER) ? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)   ? "UE"    : "CE"),
		((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"),
		((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));

	/* family 0x15 adds Deferred (bit 44) and Poison (bit 43) flags */
	if (c->x86 == 0x15)
		pr_cont("|%s|%s",
			((m->status & BIT_64(44)) ? "Deferred" : "-"),
			((m->status & BIT_64(43)) ? "Poison"   : "-"));

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);


	/* dispatch on the MCA bank that reported the error */
	switch (m->bank) {
	case 0:
		amd_decode_dc_mce(m);
		break;

	case 1:
		amd_decode_ic_mce(m);
		break;

	case 2:
		/* bank 2 is the Combined Unit on family 0x15, Bus Unit before */
		if (c->x86 == 0x15)
			amd_decode_cu_mce(m);
		else
			amd_decode_bu_mce(m);
		break;

	case 3:
		amd_decode_ls_mce(m);
		break;

	case 4:
		node = amd_get_nb_id(m->extcpu);
		amd_decode_nb_mce(node, m, 0);
		break;

	case 5:
		amd_decode_fr_mce(m);
		break;

	case 6:
		amd_decode_fp_mce(m);
		break;

	default:
		break;
	}

	/* low 16 bits hold the generic, family-independent error code */
	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
EXPORT_SYMBOL_GPL(amd_decode_mce);
835
/* Hook amd_decode_mce() into the x86 MCE decoder notifier chain. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};
839
/*
 * Set up in-kernel MCE decoding: check that we run on a supported AMD
 * family, install the per-family decoder callbacks and register on the
 * MCE decoder notifier chain.
 */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD)
		return 0;

	/* supported: families 0xf-0x12, plus 0x14/0x15 up to model 0xf */
	if ((c->x86 < 0xf || c->x86 > 0x12) &&
	    (c->x86 != 0x14 || c->x86_model > 0xf) &&
	    (c->x86 != 0x15 || c->x86_model > 0xf))
		return 0;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	/* select the per-family decoder callbacks */
	switch (c->x86) {
	case 0xf:
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = k8_nb_mce;
		break;

	case 0x10:
		fam_ops->dc_mce = f10h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;
		break;

	case 0x11:
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;
		break;

	case 0x12:
		fam_ops->dc_mce = f12h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;
		break;

	case 0x14:
		/* family 0x14 reports at most two cores per node */
		nb_err_cpumask  = 0x3;
		fam_ops->dc_mce = f14h_dc_mce;
		fam_ops->ic_mce = f14h_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;
		break;

	case 0x15:
		/* family 0x15 uses a 5-bit extended error code */
		xec_mask = 0x1f;
		fam_ops->dc_mce = f15h_dc_mce;
		fam_ops->ic_mce = f15h_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;
		break;

	default:
		printk(KERN_WARNING "Huh? What family is that: %d?!\n", c->x86);
		kfree(fam_ops);
		return -EINVAL;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);
908
/* Teardown is only reachable when built as a module. */
#ifdef MODULE
static void __exit mce_amd_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif