mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-23 00:39:29 +07:00
68627a697c
Currently, bank 4 is reserved on Fam17h, so we chose not to initialize bank 4 in the smca_banks array. This means that when we check if a bank is initialized, like during boot or resume, we will see that bank 4 is not initialized and try to initialize it. This will cause a call trace, when resuming from suspend, due to rdmsr_*on_cpu() calls in the init path. The rdmsr_*on_cpu() calls issue an IPI but we're running with interrupts disabled. This triggers: WARNING: CPU: 0 PID: 11523 at kernel/smp.c:291 smp_call_function_single+0xdc/0xe0 ... Reserved banks will be read-as-zero, so their MCA_IPID register will be zero. So, like the smca_banks array, the threshold_banks array will not have an entry for a reserved bank since all its MCA_MISC* registers will be zero. Enumerate a "Reserved" bank type that matches on a HWID_MCATYPE of 0,0. Use the "Reserved" type when checking if a bank is reserved. It's possible that other bank numbers may be reserved on future systems. Don't try to find the block address on reserved banks. Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com> Signed-off-by: Borislav Petkov <bp@suse.de> Cc: <stable@vger.kernel.org> # 4.14.x Cc: Borislav Petkov <bp@alien8.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tony Luck <tony.luck@intel.com> Cc: linux-edac <linux-edac@vger.kernel.org> Link: http://lkml.kernel.org/r/20180221101900.10326-7-bp@alien8.de Signed-off-by: Ingo Molnar <mingo@kernel.org>
1153 lines
24 KiB
C
1153 lines
24 KiB
C
#include <linux/module.h>
|
|
#include <linux/slab.h>
|
|
|
|
#include <asm/cpu.h>
|
|
|
|
#include "mce_amd.h"
|
|
|
|
static struct amd_decoder_ops *fam_ops;
|
|
|
|
static u8 xec_mask = 0xf;
|
|
|
|
static bool report_gart_errors;
|
|
static void (*decode_dram_ecc)(int node_id, struct mce *m);
|
|
|
|
void amd_report_gart_errors(bool v)
|
|
{
|
|
report_gart_errors = v;
|
|
}
|
|
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
|
|
|
|
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
|
|
{
|
|
decode_dram_ecc = f;
|
|
}
|
|
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
|
|
|
|
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
|
|
{
|
|
if (decode_dram_ecc) {
|
|
WARN_ON(decode_dram_ecc != f);
|
|
|
|
decode_dram_ecc = NULL;
|
|
}
|
|
}
|
|
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
|
|
|
|
/*
|
|
* string representation for the different MCA reported error types, see F3x48
|
|
* or MSR0000_0411.
|
|
*/
|
|
|
|
/* transaction type */
|
|
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
|
|
|
|
/* cache level */
|
|
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
|
|
|
|
/* memory transaction type */
|
|
static const char * const rrrr_msgs[] = {
|
|
"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
|
|
};
|
|
|
|
/* participating processor */
|
|
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
|
|
EXPORT_SYMBOL_GPL(pp_msgs);
|
|
|
|
/* request timeout */
|
|
static const char * const to_msgs[] = { "no timeout", "timed out" };
|
|
|
|
/* memory or i/o */
|
|
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
|
|
|
|
/* internal error type */
|
|
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
|
|
|
|
static const char * const f15h_mc1_mce_desc[] = {
|
|
"UC during a demand linefill from L2",
|
|
"Parity error during data load from IC",
|
|
"Parity error for IC valid bit",
|
|
"Main tag parity error",
|
|
"Parity error in prediction queue",
|
|
"PFB data/address parity error",
|
|
"Parity error in the branch status reg",
|
|
"PFB promotion address error",
|
|
"Tag error during probe/victimization",
|
|
"Parity error for IC probe tag valid bit",
|
|
"PFB non-cacheable bit parity error",
|
|
"PFB valid bit parity error", /* xec = 0xd */
|
|
"Microcode Patch Buffer", /* xec = 010 */
|
|
"uop queue",
|
|
"insn buffer",
|
|
"predecode buffer",
|
|
"fetch address FIFO",
|
|
"dispatch uop queue"
|
|
};
|
|
|
|
static const char * const f15h_mc2_mce_desc[] = {
|
|
"Fill ECC error on data fills", /* xec = 0x4 */
|
|
"Fill parity error on insn fills",
|
|
"Prefetcher request FIFO parity error",
|
|
"PRQ address parity error",
|
|
"PRQ data parity error",
|
|
"WCC Tag ECC error",
|
|
"WCC Data ECC error",
|
|
"WCB Data parity error",
|
|
"VB Data ECC or parity error",
|
|
"L2 Tag ECC error", /* xec = 0x10 */
|
|
"Hard L2 Tag ECC error",
|
|
"Multiple hits on L2 tag",
|
|
"XAB parity error",
|
|
"PRB address parity error"
|
|
};
|
|
|
|
static const char * const mc4_mce_desc[] = {
|
|
"DRAM ECC error detected on the NB",
|
|
"CRC error detected on HT link",
|
|
"Link-defined sync error packets detected on HT link",
|
|
"HT Master abort",
|
|
"HT Target abort",
|
|
"Invalid GART PTE entry during GART table walk",
|
|
"Unsupported atomic RMW received from an IO link",
|
|
"Watchdog timeout due to lack of progress",
|
|
"DRAM ECC error detected on the NB",
|
|
"SVM DMA Exclusion Vector error",
|
|
"HT data error detected on link",
|
|
"Protocol error (link, L3, probe filter)",
|
|
"NB internal arrays parity error",
|
|
"DRAM addr/ctl signals parity error",
|
|
"IO link transmission error",
|
|
"L3 data cache ECC error", /* xec = 0x1c */
|
|
"L3 cache tag error",
|
|
"L3 LRU parity bits error",
|
|
"ECC Error in the Probe Filter directory"
|
|
};
|
|
|
|
static const char * const mc5_mce_desc[] = {
|
|
"CPU Watchdog timer expire",
|
|
"Wakeup array dest tag",
|
|
"AG payload array",
|
|
"EX payload array",
|
|
"IDRF array",
|
|
"Retire dispatch queue",
|
|
"Mapper checkpoint array",
|
|
"Physical register file EX0 port",
|
|
"Physical register file EX1 port",
|
|
"Physical register file AG0 port",
|
|
"Physical register file AG1 port",
|
|
"Flag register file",
|
|
"DE error occurred",
|
|
"Retire status queue"
|
|
};
|
|
|
|
static const char * const mc6_mce_desc[] = {
|
|
"Hardware Assertion",
|
|
"Free List",
|
|
"Physical Register File",
|
|
"Retire Queue",
|
|
"Scheduler table",
|
|
"Status Register File",
|
|
};
|
|
|
|
/* Scalable MCA error strings */
|
|
static const char * const smca_ls_mce_desc[] = {
|
|
"Load queue parity",
|
|
"Store queue parity",
|
|
"Miss address buffer payload parity",
|
|
"L1 TLB parity",
|
|
"Reserved",
|
|
"DC tag error type 6",
|
|
"DC tag error type 1",
|
|
"Internal error type 1",
|
|
"Internal error type 2",
|
|
"Sys Read data error thread 0",
|
|
"Sys read data error thread 1",
|
|
"DC tag error type 2",
|
|
"DC data error type 1 (poison consumption)",
|
|
"DC data error type 2",
|
|
"DC data error type 3",
|
|
"DC tag error type 4",
|
|
"L2 TLB parity",
|
|
"PDC parity error",
|
|
"DC tag error type 3",
|
|
"DC tag error type 5",
|
|
"L2 fill data error",
|
|
};
|
|
|
|
static const char * const smca_if_mce_desc[] = {
|
|
"microtag probe port parity error",
|
|
"IC microtag or full tag multi-hit error",
|
|
"IC full tag parity",
|
|
"IC data array parity",
|
|
"Decoupling queue phys addr parity error",
|
|
"L0 ITLB parity error",
|
|
"L1 ITLB parity error",
|
|
"L2 ITLB parity error",
|
|
"BPQ snoop parity on Thread 0",
|
|
"BPQ snoop parity on Thread 1",
|
|
"L1 BTB multi-match error",
|
|
"L2 BTB multi-match error",
|
|
"L2 Cache Response Poison error",
|
|
"System Read Data error",
|
|
};
|
|
|
|
static const char * const smca_l2_mce_desc[] = {
|
|
"L2M tag multi-way-hit error",
|
|
"L2M tag ECC error",
|
|
"L2M data ECC error",
|
|
"HW assert",
|
|
};
|
|
|
|
static const char * const smca_de_mce_desc[] = {
|
|
"uop cache tag parity error",
|
|
"uop cache data parity error",
|
|
"Insn buffer parity error",
|
|
"uop queue parity error",
|
|
"Insn dispatch queue parity error",
|
|
"Fetch address FIFO parity",
|
|
"Patch RAM data parity",
|
|
"Patch RAM sequencer parity",
|
|
"uop buffer parity"
|
|
};
|
|
|
|
static const char * const smca_ex_mce_desc[] = {
|
|
"Watchdog timeout error",
|
|
"Phy register file parity",
|
|
"Flag register file parity",
|
|
"Immediate displacement register file parity",
|
|
"Address generator payload parity",
|
|
"EX payload parity",
|
|
"Checkpoint queue parity",
|
|
"Retire dispatch queue parity",
|
|
"Retire status queue parity error",
|
|
"Scheduling queue parity error",
|
|
"Branch buffer queue parity error",
|
|
};
|
|
|
|
static const char * const smca_fp_mce_desc[] = {
|
|
"Physical register file parity",
|
|
"Freelist parity error",
|
|
"Schedule queue parity",
|
|
"NSQ parity error",
|
|
"Retire queue parity",
|
|
"Status register file parity",
|
|
"Hardware assertion",
|
|
};
|
|
|
|
static const char * const smca_l3_mce_desc[] = {
|
|
"Shadow tag macro ECC error",
|
|
"Shadow tag macro multi-way-hit error",
|
|
"L3M tag ECC error",
|
|
"L3M tag multi-way-hit error",
|
|
"L3M data ECC error",
|
|
"XI parity, L3 fill done channel error",
|
|
"L3 victim queue parity",
|
|
"L3 HW assert",
|
|
};
|
|
|
|
static const char * const smca_cs_mce_desc[] = {
|
|
"Illegal request from transport layer",
|
|
"Address violation",
|
|
"Security violation",
|
|
"Illegal response from transport layer",
|
|
"Unexpected response",
|
|
"Parity error on incoming request or probe response data",
|
|
"Parity error on incoming read response data",
|
|
"Atomic request parity",
|
|
"ECC error on probe filter access",
|
|
};
|
|
|
|
static const char * const smca_pie_mce_desc[] = {
|
|
"HW assert",
|
|
"Internal PIE register security violation",
|
|
"Error on GMI link",
|
|
"Poison data written to internal PIE register",
|
|
};
|
|
|
|
static const char * const smca_umc_mce_desc[] = {
|
|
"DRAM ECC error",
|
|
"Data poison error on DRAM",
|
|
"SDP parity error",
|
|
"Advanced peripheral bus error",
|
|
"Command/address parity error",
|
|
"Write data CRC error",
|
|
};
|
|
|
|
static const char * const smca_pb_mce_desc[] = {
|
|
"Parameter Block RAM ECC error",
|
|
};
|
|
|
|
static const char * const smca_psp_mce_desc[] = {
|
|
"PSP RAM ECC or parity error",
|
|
};
|
|
|
|
static const char * const smca_smu_mce_desc[] = {
|
|
"SMU RAM ECC or parity error",
|
|
};
|
|
|
|
struct smca_mce_desc {
|
|
const char * const *descs;
|
|
unsigned int num_descs;
|
|
};
|
|
|
|
static struct smca_mce_desc smca_mce_descs[] = {
|
|
[SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
|
|
[SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
|
|
[SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
|
|
[SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
|
|
[SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
|
|
[SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
|
|
[SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
|
|
[SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
|
|
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
|
|
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
|
|
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
|
|
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
|
|
[SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
|
|
};
|
|
|
|
static bool f12h_mc0_mce(u16 ec, u8 xec)
|
|
{
|
|
bool ret = false;
|
|
|
|
if (MEM_ERROR(ec)) {
|
|
u8 ll = LL(ec);
|
|
ret = true;
|
|
|
|
if (ll == LL_L2)
|
|
pr_cont("during L1 linefill from L2.\n");
|
|
else if (ll == LL_L1)
|
|
pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
|
|
else
|
|
ret = false;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static bool f10h_mc0_mce(u16 ec, u8 xec)
|
|
{
|
|
if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
|
|
pr_cont("during data scrub.\n");
|
|
return true;
|
|
}
|
|
return f12h_mc0_mce(ec, xec);
|
|
}
|
|
|
|
static bool k8_mc0_mce(u16 ec, u8 xec)
|
|
{
|
|
if (BUS_ERROR(ec)) {
|
|
pr_cont("during system linefill.\n");
|
|
return true;
|
|
}
|
|
|
|
return f10h_mc0_mce(ec, xec);
|
|
}
|
|
|
|
static bool cat_mc0_mce(u16 ec, u8 xec)
|
|
{
|
|
u8 r4 = R4(ec);
|
|
bool ret = true;
|
|
|
|
if (MEM_ERROR(ec)) {
|
|
|
|
if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
|
|
return false;
|
|
|
|
switch (r4) {
|
|
case R4_DRD:
|
|
case R4_DWR:
|
|
pr_cont("Data/Tag parity error due to %s.\n",
|
|
(r4 == R4_DRD ? "load/hw prf" : "store"));
|
|
break;
|
|
case R4_EVICT:
|
|
pr_cont("Copyback parity error on a tag miss.\n");
|
|
break;
|
|
case R4_SNOOP:
|
|
pr_cont("Tag parity error during snoop.\n");
|
|
break;
|
|
default:
|
|
ret = false;
|
|
}
|
|
} else if (BUS_ERROR(ec)) {
|
|
|
|
if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
|
|
return false;
|
|
|
|
pr_cont("System read data error on a ");
|
|
|
|
switch (r4) {
|
|
case R4_RD:
|
|
pr_cont("TLB reload.\n");
|
|
break;
|
|
case R4_DWR:
|
|
pr_cont("store.\n");
|
|
break;
|
|
case R4_DRD:
|
|
pr_cont("load.\n");
|
|
break;
|
|
default:
|
|
ret = false;
|
|
}
|
|
} else {
|
|
ret = false;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static bool f15h_mc0_mce(u16 ec, u8 xec)
|
|
{
|
|
bool ret = true;
|
|
|
|
if (MEM_ERROR(ec)) {
|
|
|
|
switch (xec) {
|
|
case 0x0:
|
|
pr_cont("Data Array access error.\n");
|
|
break;
|
|
|
|
case 0x1:
|
|
pr_cont("UC error during a linefill from L2/NB.\n");
|
|
break;
|
|
|
|
case 0x2:
|
|
case 0x11:
|
|
pr_cont("STQ access error.\n");
|
|
break;
|
|
|
|
case 0x3:
|
|
pr_cont("SCB access error.\n");
|
|
break;
|
|
|
|
case 0x10:
|
|
pr_cont("Tag error.\n");
|
|
break;
|
|
|
|
case 0x12:
|
|
pr_cont("LDQ access error.\n");
|
|
break;
|
|
|
|
default:
|
|
ret = false;
|
|
}
|
|
} else if (BUS_ERROR(ec)) {
|
|
|
|
if (!xec)
|
|
pr_cont("System Read Data Error.\n");
|
|
else
|
|
pr_cont(" Internal error condition type %d.\n", xec);
|
|
} else if (INT_ERROR(ec)) {
|
|
if (xec <= 0x1f)
|
|
pr_cont("Hardware Assert.\n");
|
|
else
|
|
ret = false;
|
|
|
|
} else
|
|
ret = false;
|
|
|
|
return ret;
|
|
}
|
|
|
|
static void decode_mc0_mce(struct mce *m)
|
|
{
|
|
u16 ec = EC(m->status);
|
|
u8 xec = XEC(m->status, xec_mask);
|
|
|
|
pr_emerg(HW_ERR "MC0 Error: ");
|
|
|
|
/* TLB error signatures are the same across families */
|
|
if (TLB_ERROR(ec)) {
|
|
if (TT(ec) == TT_DATA) {
|
|
pr_cont("%s TLB %s.\n", LL_MSG(ec),
|
|
((xec == 2) ? "locked miss"
|
|
: (xec ? "multimatch" : "parity")));
|
|
return;
|
|
}
|
|
} else if (fam_ops->mc0_mce(ec, xec))
|
|
;
|
|
else
|
|
pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
|
|
}
|
|
|
|
static bool k8_mc1_mce(u16 ec, u8 xec)
|
|
{
|
|
u8 ll = LL(ec);
|
|
bool ret = true;
|
|
|
|
if (!MEM_ERROR(ec))
|
|
return false;
|
|
|
|
if (ll == 0x2)
|
|
pr_cont("during a linefill from L2.\n");
|
|
else if (ll == 0x1) {
|
|
switch (R4(ec)) {
|
|
case R4_IRD:
|
|
pr_cont("Parity error during data load.\n");
|
|
break;
|
|
|
|
case R4_EVICT:
|
|
pr_cont("Copyback Parity/Victim error.\n");
|
|
break;
|
|
|
|
case R4_SNOOP:
|
|
pr_cont("Tag Snoop error.\n");
|
|
break;
|
|
|
|
default:
|
|
ret = false;
|
|
break;
|
|
}
|
|
} else
|
|
ret = false;
|
|
|
|
return ret;
|
|
}
|
|
|
|
static bool cat_mc1_mce(u16 ec, u8 xec)
|
|
{
|
|
u8 r4 = R4(ec);
|
|
bool ret = true;
|
|
|
|
if (!MEM_ERROR(ec))
|
|
return false;
|
|
|
|
if (TT(ec) != TT_INSTR)
|
|
return false;
|
|
|
|
if (r4 == R4_IRD)
|
|
pr_cont("Data/tag array parity error for a tag hit.\n");
|
|
else if (r4 == R4_SNOOP)
|
|
pr_cont("Tag error during snoop/victimization.\n");
|
|
else if (xec == 0x0)
|
|
pr_cont("Tag parity error from victim castout.\n");
|
|
else if (xec == 0x2)
|
|
pr_cont("Microcode patch RAM parity error.\n");
|
|
else
|
|
ret = false;
|
|
|
|
return ret;
|
|
}
|
|
|
|
static bool f15h_mc1_mce(u16 ec, u8 xec)
|
|
{
|
|
bool ret = true;
|
|
|
|
if (!MEM_ERROR(ec))
|
|
return false;
|
|
|
|
switch (xec) {
|
|
case 0x0 ... 0xa:
|
|
pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
|
|
break;
|
|
|
|
case 0xd:
|
|
pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
|
|
break;
|
|
|
|
case 0x10:
|
|
pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
|
|
break;
|
|
|
|
case 0x11 ... 0x15:
|
|
pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
|
|
break;
|
|
|
|
default:
|
|
ret = false;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static void decode_mc1_mce(struct mce *m)
|
|
{
|
|
u16 ec = EC(m->status);
|
|
u8 xec = XEC(m->status, xec_mask);
|
|
|
|
pr_emerg(HW_ERR "MC1 Error: ");
|
|
|
|
if (TLB_ERROR(ec))
|
|
pr_cont("%s TLB %s.\n", LL_MSG(ec),
|
|
(xec ? "multimatch" : "parity error"));
|
|
else if (BUS_ERROR(ec)) {
|
|
bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
|
|
|
|
pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
|
|
} else if (INT_ERROR(ec)) {
|
|
if (xec <= 0x3f)
|
|
pr_cont("Hardware Assert.\n");
|
|
else
|
|
goto wrong_mc1_mce;
|
|
} else if (fam_ops->mc1_mce(ec, xec))
|
|
;
|
|
else
|
|
goto wrong_mc1_mce;
|
|
|
|
return;
|
|
|
|
wrong_mc1_mce:
|
|
pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
|
|
}
|
|
|
|
static bool k8_mc2_mce(u16 ec, u8 xec)
|
|
{
|
|
bool ret = true;
|
|
|
|
if (xec == 0x1)
|
|
pr_cont(" in the write data buffers.\n");
|
|
else if (xec == 0x3)
|
|
pr_cont(" in the victim data buffers.\n");
|
|
else if (xec == 0x2 && MEM_ERROR(ec))
|
|
pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
|
|
else if (xec == 0x0) {
|
|
if (TLB_ERROR(ec))
|
|
pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
|
|
TT_MSG(ec));
|
|
else if (BUS_ERROR(ec))
|
|
pr_cont(": %s/ECC error in data read from NB: %s.\n",
|
|
R4_MSG(ec), PP_MSG(ec));
|
|
else if (MEM_ERROR(ec)) {
|
|
u8 r4 = R4(ec);
|
|
|
|
if (r4 >= 0x7)
|
|
pr_cont(": %s error during data copyback.\n",
|
|
R4_MSG(ec));
|
|
else if (r4 <= 0x1)
|
|
pr_cont(": %s parity/ECC error during data "
|
|
"access from L2.\n", R4_MSG(ec));
|
|
else
|
|
ret = false;
|
|
} else
|
|
ret = false;
|
|
} else
|
|
ret = false;
|
|
|
|
return ret;
|
|
}
|
|
|
|
static bool f15h_mc2_mce(u16 ec, u8 xec)
|
|
{
|
|
bool ret = true;
|
|
|
|
if (TLB_ERROR(ec)) {
|
|
if (xec == 0x0)
|
|
pr_cont("Data parity TLB read error.\n");
|
|
else if (xec == 0x1)
|
|
pr_cont("Poison data provided for TLB fill.\n");
|
|
else
|
|
ret = false;
|
|
} else if (BUS_ERROR(ec)) {
|
|
if (xec > 2)
|
|
ret = false;
|
|
|
|
pr_cont("Error during attempted NB data read.\n");
|
|
} else if (MEM_ERROR(ec)) {
|
|
switch (xec) {
|
|
case 0x4 ... 0xc:
|
|
pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
|
|
break;
|
|
|
|
case 0x10 ... 0x14:
|
|
pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
|
|
break;
|
|
|
|
default:
|
|
ret = false;
|
|
}
|
|
} else if (INT_ERROR(ec)) {
|
|
if (xec <= 0x3f)
|
|
pr_cont("Hardware Assert.\n");
|
|
else
|
|
ret = false;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static bool f16h_mc2_mce(u16 ec, u8 xec)
|
|
{
|
|
u8 r4 = R4(ec);
|
|
|
|
if (!MEM_ERROR(ec))
|
|
return false;
|
|
|
|
switch (xec) {
|
|
case 0x04 ... 0x05:
|
|
pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
|
|
break;
|
|
|
|
case 0x09 ... 0x0b:
|
|
case 0x0d ... 0x0f:
|
|
pr_cont("ECC error in L2 tag (%s).\n",
|
|
((r4 == R4_GEN) ? "BankReq" :
|
|
((r4 == R4_SNOOP) ? "Prb" : "Fill")));
|
|
break;
|
|
|
|
case 0x10 ... 0x19:
|
|
case 0x1b:
|
|
pr_cont("ECC error in L2 data array (%s).\n",
|
|
(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
|
|
((r4 == R4_GEN) ? "Attr" :
|
|
((r4 == R4_EVICT) ? "Vict" : "Fill"))));
|
|
break;
|
|
|
|
case 0x1c ... 0x1d:
|
|
case 0x1f:
|
|
pr_cont("Parity error in L2 attribute bits (%s).\n",
|
|
((r4 == R4_RD) ? "Hit" :
|
|
((r4 == R4_GEN) ? "Attr" : "Fill")));
|
|
break;
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static void decode_mc2_mce(struct mce *m)
|
|
{
|
|
u16 ec = EC(m->status);
|
|
u8 xec = XEC(m->status, xec_mask);
|
|
|
|
pr_emerg(HW_ERR "MC2 Error: ");
|
|
|
|
if (!fam_ops->mc2_mce(ec, xec))
|
|
pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
|
|
}
|
|
|
|
static void decode_mc3_mce(struct mce *m)
|
|
{
|
|
u16 ec = EC(m->status);
|
|
u8 xec = XEC(m->status, xec_mask);
|
|
|
|
if (boot_cpu_data.x86 >= 0x14) {
|
|
pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
|
|
" please report on LKML.\n");
|
|
return;
|
|
}
|
|
|
|
pr_emerg(HW_ERR "MC3 Error");
|
|
|
|
if (xec == 0x0) {
|
|
u8 r4 = R4(ec);
|
|
|
|
if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
|
|
goto wrong_mc3_mce;
|
|
|
|
pr_cont(" during %s.\n", R4_MSG(ec));
|
|
} else
|
|
goto wrong_mc3_mce;
|
|
|
|
return;
|
|
|
|
wrong_mc3_mce:
|
|
pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
|
|
}
|
|
|
|
static void decode_mc4_mce(struct mce *m)
|
|
{
|
|
unsigned int fam = x86_family(m->cpuid);
|
|
int node_id = amd_get_nb_id(m->extcpu);
|
|
u16 ec = EC(m->status);
|
|
u8 xec = XEC(m->status, 0x1f);
|
|
u8 offset = 0;
|
|
|
|
pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
|
|
|
|
switch (xec) {
|
|
case 0x0 ... 0xe:
|
|
|
|
/* special handling for DRAM ECCs */
|
|
if (xec == 0x0 || xec == 0x8) {
|
|
/* no ECCs on F11h */
|
|
if (fam == 0x11)
|
|
goto wrong_mc4_mce;
|
|
|
|
pr_cont("%s.\n", mc4_mce_desc[xec]);
|
|
|
|
if (decode_dram_ecc)
|
|
decode_dram_ecc(node_id, m);
|
|
return;
|
|
}
|
|
break;
|
|
|
|
case 0xf:
|
|
if (TLB_ERROR(ec))
|
|
pr_cont("GART Table Walk data error.\n");
|
|
else if (BUS_ERROR(ec))
|
|
pr_cont("DMA Exclusion Vector Table Walk error.\n");
|
|
else
|
|
goto wrong_mc4_mce;
|
|
return;
|
|
|
|
case 0x19:
|
|
if (fam == 0x15 || fam == 0x16)
|
|
pr_cont("Compute Unit Data Error.\n");
|
|
else
|
|
goto wrong_mc4_mce;
|
|
return;
|
|
|
|
case 0x1c ... 0x1f:
|
|
offset = 13;
|
|
break;
|
|
|
|
default:
|
|
goto wrong_mc4_mce;
|
|
}
|
|
|
|
pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
|
|
return;
|
|
|
|
wrong_mc4_mce:
|
|
pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
|
|
}
|
|
|
|
static void decode_mc5_mce(struct mce *m)
|
|
{
|
|
unsigned int fam = x86_family(m->cpuid);
|
|
u16 ec = EC(m->status);
|
|
u8 xec = XEC(m->status, xec_mask);
|
|
|
|
if (fam == 0xf || fam == 0x11)
|
|
goto wrong_mc5_mce;
|
|
|
|
pr_emerg(HW_ERR "MC5 Error: ");
|
|
|
|
if (INT_ERROR(ec)) {
|
|
if (xec <= 0x1f) {
|
|
pr_cont("Hardware Assert.\n");
|
|
return;
|
|
} else
|
|
goto wrong_mc5_mce;
|
|
}
|
|
|
|
if (xec == 0x0 || xec == 0xc)
|
|
pr_cont("%s.\n", mc5_mce_desc[xec]);
|
|
else if (xec <= 0xd)
|
|
pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
|
|
else
|
|
goto wrong_mc5_mce;
|
|
|
|
return;
|
|
|
|
wrong_mc5_mce:
|
|
pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
|
|
}
|
|
|
|
static void decode_mc6_mce(struct mce *m)
|
|
{
|
|
u8 xec = XEC(m->status, xec_mask);
|
|
|
|
pr_emerg(HW_ERR "MC6 Error: ");
|
|
|
|
if (xec > 0x5)
|
|
goto wrong_mc6_mce;
|
|
|
|
pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
|
|
return;
|
|
|
|
wrong_mc6_mce:
|
|
pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
|
|
}
|
|
|
|
/* Decode errors according to Scalable MCA specification */
|
|
static void decode_smca_error(struct mce *m)
|
|
{
|
|
struct smca_hwid *hwid;
|
|
enum smca_bank_types bank_type;
|
|
const char *ip_name;
|
|
u8 xec = XEC(m->status, xec_mask);
|
|
|
|
if (m->bank >= ARRAY_SIZE(smca_banks))
|
|
return;
|
|
|
|
hwid = smca_banks[m->bank].hwid;
|
|
if (!hwid)
|
|
return;
|
|
|
|
bank_type = hwid->bank_type;
|
|
|
|
if (bank_type == SMCA_RESERVED) {
|
|
pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
|
|
return;
|
|
}
|
|
|
|
ip_name = smca_get_long_name(bank_type);
|
|
|
|
pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
|
|
|
|
/* Only print the decode of valid error codes */
|
|
if (xec < smca_mce_descs[bank_type].num_descs &&
|
|
(hwid->xec_bitmap & BIT_ULL(xec))) {
|
|
pr_emerg(HW_ERR "%s Error: ", ip_name);
|
|
pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
|
|
}
|
|
|
|
if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
|
|
decode_dram_ecc(cpu_to_node(m->extcpu), m);
|
|
}
|
|
|
|
static inline void amd_decode_err_code(u16 ec)
|
|
{
|
|
if (INT_ERROR(ec)) {
|
|
pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
|
|
return;
|
|
}
|
|
|
|
pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
|
|
|
|
if (BUS_ERROR(ec))
|
|
pr_cont(", mem/io: %s", II_MSG(ec));
|
|
else
|
|
pr_cont(", tx: %s", TT_MSG(ec));
|
|
|
|
if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
|
|
pr_cont(", mem-tx: %s", R4_MSG(ec));
|
|
|
|
if (BUS_ERROR(ec))
|
|
pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
|
|
}
|
|
|
|
pr_cont("\n");
|
|
}
|
|
|
|
/*
|
|
* Filter out unwanted MCE signatures here.
|
|
*/
|
|
static bool amd_filter_mce(struct mce *m)
|
|
{
|
|
/*
|
|
* NB GART TLB error reporting is disabled by default.
|
|
*/
|
|
if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
static const char *decode_error_status(struct mce *m)
|
|
{
|
|
if (m->status & MCI_STATUS_UC) {
|
|
if (m->status & MCI_STATUS_PCC)
|
|
return "System Fatal error.";
|
|
if (m->mcgstatus & MCG_STATUS_RIPV)
|
|
return "Uncorrected, software restartable error.";
|
|
return "Uncorrected, software containable error.";
|
|
}
|
|
|
|
if (m->status & MCI_STATUS_DEFERRED)
|
|
return "Deferred error, no action required.";
|
|
|
|
return "Corrected error, no action required.";
|
|
}
|
|
|
|
static int
|
|
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
|
|
{
|
|
struct mce *m = (struct mce *)data;
|
|
unsigned int fam = x86_family(m->cpuid);
|
|
int ecc;
|
|
|
|
if (amd_filter_mce(m))
|
|
return NOTIFY_STOP;
|
|
|
|
pr_emerg(HW_ERR "%s\n", decode_error_status(m));
|
|
|
|
pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
|
|
m->extcpu,
|
|
fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
|
|
m->bank,
|
|
((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
|
|
((m->status & MCI_STATUS_UC) ? "UE" :
|
|
(m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
|
|
((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
|
|
((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
|
|
((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
|
|
|
|
if (fam >= 0x15) {
|
|
pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
|
|
|
|
/* F15h, bank4, bit 43 is part of McaStatSubCache. */
|
|
if (fam != 0x15 || m->bank != 4)
|
|
pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
|
|
}
|
|
|
|
if (boot_cpu_has(X86_FEATURE_SMCA)) {
|
|
u32 low, high;
|
|
u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
|
|
|
|
pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
|
|
|
|
if (!rdmsr_safe(addr, &low, &high) &&
|
|
(low & MCI_CONFIG_MCAX))
|
|
pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
|
|
}
|
|
|
|
/* do the two bits[14:13] together */
|
|
ecc = (m->status >> 45) & 0x3;
|
|
if (ecc)
|
|
pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
|
|
|
|
pr_cont("]: 0x%016llx\n", m->status);
|
|
|
|
if (m->status & MCI_STATUS_ADDRV)
|
|
pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
|
|
|
|
if (boot_cpu_has(X86_FEATURE_SMCA)) {
|
|
pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
|
|
|
|
if (m->status & MCI_STATUS_SYNDV)
|
|
pr_cont(", Syndrome: 0x%016llx", m->synd);
|
|
|
|
pr_cont("\n");
|
|
|
|
decode_smca_error(m);
|
|
goto err_code;
|
|
}
|
|
|
|
if (m->tsc)
|
|
pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
|
|
|
|
if (!fam_ops)
|
|
goto err_code;
|
|
|
|
switch (m->bank) {
|
|
case 0:
|
|
decode_mc0_mce(m);
|
|
break;
|
|
|
|
case 1:
|
|
decode_mc1_mce(m);
|
|
break;
|
|
|
|
case 2:
|
|
decode_mc2_mce(m);
|
|
break;
|
|
|
|
case 3:
|
|
decode_mc3_mce(m);
|
|
break;
|
|
|
|
case 4:
|
|
decode_mc4_mce(m);
|
|
break;
|
|
|
|
case 5:
|
|
decode_mc5_mce(m);
|
|
break;
|
|
|
|
case 6:
|
|
decode_mc6_mce(m);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
err_code:
|
|
amd_decode_err_code(m->status & 0xffff);
|
|
|
|
return NOTIFY_STOP;
|
|
}
|
|
|
|
static struct notifier_block amd_mce_dec_nb = {
|
|
.notifier_call = amd_decode_mce,
|
|
.priority = MCE_PRIO_EDAC,
|
|
};
|
|
|
|
static int __init mce_amd_init(void)
|
|
{
|
|
struct cpuinfo_x86 *c = &boot_cpu_data;
|
|
|
|
if (c->x86_vendor != X86_VENDOR_AMD)
|
|
return -ENODEV;
|
|
|
|
fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
|
|
if (!fam_ops)
|
|
return -ENOMEM;
|
|
|
|
switch (c->x86) {
|
|
case 0xf:
|
|
fam_ops->mc0_mce = k8_mc0_mce;
|
|
fam_ops->mc1_mce = k8_mc1_mce;
|
|
fam_ops->mc2_mce = k8_mc2_mce;
|
|
break;
|
|
|
|
case 0x10:
|
|
fam_ops->mc0_mce = f10h_mc0_mce;
|
|
fam_ops->mc1_mce = k8_mc1_mce;
|
|
fam_ops->mc2_mce = k8_mc2_mce;
|
|
break;
|
|
|
|
case 0x11:
|
|
fam_ops->mc0_mce = k8_mc0_mce;
|
|
fam_ops->mc1_mce = k8_mc1_mce;
|
|
fam_ops->mc2_mce = k8_mc2_mce;
|
|
break;
|
|
|
|
case 0x12:
|
|
fam_ops->mc0_mce = f12h_mc0_mce;
|
|
fam_ops->mc1_mce = k8_mc1_mce;
|
|
fam_ops->mc2_mce = k8_mc2_mce;
|
|
break;
|
|
|
|
case 0x14:
|
|
fam_ops->mc0_mce = cat_mc0_mce;
|
|
fam_ops->mc1_mce = cat_mc1_mce;
|
|
fam_ops->mc2_mce = k8_mc2_mce;
|
|
break;
|
|
|
|
case 0x15:
|
|
xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
|
|
|
|
fam_ops->mc0_mce = f15h_mc0_mce;
|
|
fam_ops->mc1_mce = f15h_mc1_mce;
|
|
fam_ops->mc2_mce = f15h_mc2_mce;
|
|
break;
|
|
|
|
case 0x16:
|
|
xec_mask = 0x1f;
|
|
fam_ops->mc0_mce = cat_mc0_mce;
|
|
fam_ops->mc1_mce = cat_mc1_mce;
|
|
fam_ops->mc2_mce = f16h_mc2_mce;
|
|
break;
|
|
|
|
case 0x17:
|
|
xec_mask = 0x3f;
|
|
if (!boot_cpu_has(X86_FEATURE_SMCA)) {
|
|
printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
|
|
goto err_out;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
|
|
goto err_out;
|
|
}
|
|
|
|
pr_info("MCE: In-kernel MCE decoding enabled.\n");
|
|
|
|
mce_register_decode_chain(&amd_mce_dec_nb);
|
|
|
|
return 0;
|
|
|
|
err_out:
|
|
kfree(fam_ops);
|
|
fam_ops = NULL;
|
|
return -EINVAL;
|
|
}
|
|
early_initcall(mce_amd_init);
|
|
|
|
#ifdef MODULE
|
|
static void __exit mce_amd_exit(void)
|
|
{
|
|
mce_unregister_decode_chain(&amd_mce_dec_nb);
|
|
kfree(fam_ops);
|
|
}
|
|
|
|
MODULE_DESCRIPTION("AMD MCE decoder");
|
|
MODULE_ALIAS("edac-mce-amd");
|
|
MODULE_LICENSE("GPL");
|
|
module_exit(mce_amd_exit);
|
|
#endif
|