From b70ef01016850de87b9a28a6af19fed8801df076 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 25 Jun 2009 19:32:38 +0200 Subject: [PATCH 01/15] EDAC: move MCE error descriptions to EDAC core This is in preparation of adding AMD-specific MCE decoding functionality to the EDAC core. The error decoding macros originate from the AMD64 EDAC driver albeit in a simplified and cleaned up version here. While at it, add macros to generate the error description strings and use them in the error type decoders directly which removes a bunch of code and makes the decoding functions much more readable. Also, fix strings and shorten macro names. Remove superfluous htlink_msgs. Signed-off-by: Borislav Petkov --- drivers/edac/Makefile | 6 +- drivers/edac/amd64_edac.c | 140 ++++++++++-------- drivers/edac/amd64_edac.h | 17 +-- ...{amd64_edac_err_types.c => edac_mce_amd.c} | 78 ++-------- drivers/edac/edac_mce_amd.h | 29 ++++ 5 files changed, 122 insertions(+), 148 deletions(-) rename drivers/edac/{amd64_edac_err_types.c => edac_mce_amd.c} (61%) create mode 100644 drivers/edac/edac_mce_amd.h diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 98aa4a7db412..cfa033ce53a7 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -17,6 +17,10 @@ ifdef CONFIG_PCI edac_core-objs += edac_pci.o edac_pci_sysfs.o endif +ifdef CONFIG_CPU_SUP_AMD +edac_core-objs += edac_mce_amd.o +endif + obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o obj-$(CONFIG_EDAC_CPC925) += cpc925_edac.o obj-$(CONFIG_EDAC_I5000) += i5000_edac.o @@ -32,7 +36,7 @@ obj-$(CONFIG_EDAC_X38) += x38_edac.o obj-$(CONFIG_EDAC_I82860) += i82860_edac.o obj-$(CONFIG_EDAC_R82600) += r82600_edac.o -amd64_edac_mod-y := amd64_edac_err_types.o amd64_edac.o +amd64_edac_mod-y := amd64_edac.o amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e2a10bcba7a1..b9e84bc91766 
100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -18,6 +18,63 @@ struct amd64_pvt; static struct mem_ctl_info *mci_lookup[MAX_NUMNODES]; static struct amd64_pvt *pvt_lookup[MAX_NUMNODES]; +/* + * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only + * for DDR2 DRAM mapping. + */ +u32 revf_quad_ddr2_shift[] = { + 0, /* 0000b NULL DIMM (128mb) */ + 28, /* 0001b 256mb */ + 29, /* 0010b 512mb */ + 29, /* 0011b 512mb */ + 29, /* 0100b 512mb */ + 30, /* 0101b 1gb */ + 30, /* 0110b 1gb */ + 31, /* 0111b 2gb */ + 31, /* 1000b 2gb */ + 32, /* 1001b 4gb */ + 32, /* 1010b 4gb */ + 33, /* 1011b 8gb */ + 0, /* 1100b future */ + 0, /* 1101b future */ + 0, /* 1110b future */ + 0 /* 1111b future */ +}; + +/* + * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing + * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- + * or higher value'. + * + *FIXME: Produce a better mapping/linearisation. + */ + +struct scrubrate scrubrates[] = { + { 0x01, 1600000000UL}, + { 0x02, 800000000UL}, + { 0x03, 400000000UL}, + { 0x04, 200000000UL}, + { 0x05, 100000000UL}, + { 0x06, 50000000UL}, + { 0x07, 25000000UL}, + { 0x08, 12284069UL}, + { 0x09, 6274509UL}, + { 0x0A, 3121951UL}, + { 0x0B, 1560975UL}, + { 0x0C, 781440UL}, + { 0x0D, 390720UL}, + { 0x0E, 195300UL}, + { 0x0F, 97650UL}, + { 0x10, 48854UL}, + { 0x11, 24427UL}, + { 0x12, 12213UL}, + { 0x13, 6101UL}, + { 0x14, 3051UL}, + { 0x15, 1523UL}, + { 0x16, 761UL}, + { 0x00, 0UL}, /* scrubbing off */ +}; + /* * Memory scrubber control interface. For K8, memory scrubbing is handled by * hardware and can involve L2 cache, dcache as well as the main memory. 
With @@ -1101,8 +1158,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u32 page, offset; /* Extract the syndrome parts and form a 16-bit syndrome */ - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* CHIPKILL enabled */ if (info->nbcfg & K8_NBCFG_CHIPKILL) { @@ -1701,8 +1758,8 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, if (csrow >= 0) { error_address_to_page_and_offset(sys_addr, &page, &offset); - syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8; - syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh); + syndrome = HIGH_SYNDROME(info->nbsl) << 8; + syndrome |= LOW_SYNDROME(info->nbsh); /* * Is CHIPKILL on? If so, then we can attempt to use the @@ -2155,36 +2212,22 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code; - u32 ec_tt; /* error code transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "GART TLB event: transaction type(%s), " - "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]); + "cache level(%s)\n", TT_MSG(ec), LL_MSG(ec)); } static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code; - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_tt; /* error code transaction type (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - err_code = EXTRACT_ERROR_CODE(info->nbsl); - ec_ll = EXTRACT_LL_CODE(err_code); - ec_tt = EXTRACT_TT_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "cache hierarchy error: 
memory transaction type(%s), " "transaction type(%s), cache level(%s)\n", - rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]); + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } @@ -2264,21 +2307,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, static void amd64_decode_bus_error(struct mem_ctl_info *mci, struct amd64_error_info_regs *info) { - u32 err_code, ext_ec; - u32 ec_pp; /* error code participating processor (2p) */ - u32 ec_to; /* error code timed out (1b) */ - u32 ec_rrrr; /* error code memory transaction (4b) */ - u32 ec_ii; /* error code memory or I/O (2b) */ - u32 ec_ll; /* error code cache level (2b) */ - - ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl); - err_code = EXTRACT_ERROR_CODE(info->nbsl); - - ec_ll = EXTRACT_LL_CODE(err_code); - ec_ii = EXTRACT_II_CODE(err_code); - ec_rrrr = EXTRACT_RRRR_CODE(err_code); - ec_to = EXTRACT_TO_CODE(err_code); - ec_pp = EXTRACT_PP_CODE(err_code); + u32 ec = ERROR_CODE(info->nbsl); + u32 xec = EXT_ERROR_CODE(info->nbsl); amd64_mc_printk(mci, KERN_ERR, "BUS ERROR:\n" @@ -2286,20 +2316,17 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, " participating processor(%s)\n" " memory transaction type(%s)\n" " cache level(%s) Error Found by: %s\n", - to_msgs[ec_to], - ii_msgs[ec_ii], - pp_msgs[ec_pp], - rrrr_msgs[ec_rrrr], - ll_msgs[ec_ll], + TO_MSG(ec), II_MSG(ec), PP_MSG(ec), RRRR_MSG(ec), LL_MSG(ec), (info->nbsh & K8_NBSH_ERR_SCRUBER) ? 
"Scrubber" : "Normal Operation"); - /* If this was an 'observed' error, early out */ - if (ec_pp == K8_NBSL_PP_OBS) - return; /* We aren't the node involved */ + + /* Bail early out if this was an 'observed' error */ + if (PP(ec) == K8_NBSL_PP_OBS) + return; /* Parse out the extended error code for ECC events */ - switch (ext_ec) { + switch (xec) { /* F10 changed to one Extended ECC error code */ case F10_NBSL_EXT_ERR_RES: /* Reserved field */ case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */ @@ -2379,7 +2406,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci, (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False"); - err_code = EXTRACT_ERROR_CODE(regs->nbsl); + err_code = ERROR_CODE(regs->nbsl); /* Determine which error type: * 1) GART errors - non-fatal, developmental events @@ -2387,7 +2414,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci, * 3) BUS errors * 4) Unknown error */ - if (TEST_TLB_ERROR(err_code)) { + if (TLB_ERROR(err_code)) { /* * GART errors are intended to help graphics driver developers * to detect bad GART PTEs. 
It is recommended by AMD to disable @@ -2411,10 +2438,10 @@ int amd64_process_error_info(struct mem_ctl_info *mci, debugf1("GART TLB error\n"); amd64_decode_gart_tlb_error(mci, info); - } else if (TEST_MEM_ERROR(err_code)) { + } else if (MEM_ERROR(err_code)) { debugf1("Memory/Cache error\n"); amd64_decode_mem_cache_error(mci, info); - } else if (TEST_BUS_ERROR(err_code)) { + } else if (BUS_ERROR(err_code)) { debugf1("Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, info); } else { @@ -2424,21 +2451,10 @@ int amd64_process_error_info(struct mem_ctl_info *mci, err_code); } - ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl); + ext_ec = EXT_ERROR_CODE(regs->nbsl); amd64_mc_printk(mci, KERN_ERR, "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]); - if (((ext_ec >= F10_NBSL_EXT_ERR_CRC && - ext_ec <= F10_NBSL_EXT_ERR_TGT) || - (ext_ec == F10_NBSL_EXT_ERR_RMW)) && - EXTRACT_LDT_LINK(info->nbsh)) { - - amd64_mc_printk(mci, KERN_ERR, - "Error on hypertransport link: %s\n", - htlink_msgs[ - EXTRACT_LDT_LINK(info->nbsh)]); - } - /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index ba73015af8e4..1ddef8d15d52 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -72,6 +72,7 @@ #include #include #include "edac_core.h" +#include "edac_mce_amd.h" #define amd64_printk(level, fmt, arg...) 
\ edac_printk(level, "amd64", fmt, ##arg) @@ -303,9 +304,6 @@ enum { #define K8_NBSL 0x48 -#define EXTRACT_HIGH_SYNDROME(x) (((x) >> 24) & 0xff) -#define EXTRACT_EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) - /* Family F10h: Normalized Extended Error Codes */ #define F10_NBSL_EXT_ERR_RES 0x0 #define F10_NBSL_EXT_ERR_CRC 0x1 @@ -348,17 +346,6 @@ enum { #define K8_NBSL_EXT_ERR_CHIPKILL_ECC 0x8 #define K8_NBSL_EXT_ERR_DRAM_PARITY 0xD -#define EXTRACT_ERROR_CODE(x) ((x) & 0xffff) -#define TEST_TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) -#define TEST_MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) -#define TEST_BUS_ERROR(x) (((x) & 0xF800) == 0x0800) -#define EXTRACT_TT_CODE(x) (((x) >> 2) & 0x3) -#define EXTRACT_II_CODE(x) (((x) >> 2) & 0x3) -#define EXTRACT_LL_CODE(x) (((x) >> 0) & 0x3) -#define EXTRACT_RRRR_CODE(x) (((x) >> 4) & 0xf) -#define EXTRACT_TO_CODE(x) (((x) >> 8) & 0x1) -#define EXTRACT_PP_CODE(x) (((x) >> 9) & 0x3) - /* * The following are for BUS type errors AFTER values have been normalized by * shifting right @@ -386,9 +373,7 @@ enum { #define K8_NBSH_CORE1 BIT(1) #define K8_NBSH_CORE0 BIT(0) -#define EXTRACT_LDT_LINK(x) (((x) >> 4) & 0x7) #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) -#define EXTRACT_LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define K8_NBEAL 0x50 diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/edac_mce_amd.c similarity index 61% rename from drivers/edac/amd64_edac_err_types.c rename to drivers/edac/edac_mce_amd.c index f212ff12a9d8..cf8465450b32 100644 --- a/drivers/edac/amd64_edac_err_types.c +++ b/drivers/edac/edac_mce_amd.c @@ -1,61 +1,5 @@ -#include "amd64_edac.h" - -/* - * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only - * for DDR2 DRAM mapping. 
- */ -u32 revf_quad_ddr2_shift[] = { - 0, /* 0000b NULL DIMM (128mb) */ - 28, /* 0001b 256mb */ - 29, /* 0010b 512mb */ - 29, /* 0011b 512mb */ - 29, /* 0100b 512mb */ - 30, /* 0101b 1gb */ - 30, /* 0110b 1gb */ - 31, /* 0111b 2gb */ - 31, /* 1000b 2gb */ - 32, /* 1001b 4gb */ - 32, /* 1010b 4gb */ - 33, /* 1011b 8gb */ - 0, /* 1100b future */ - 0, /* 1101b future */ - 0, /* 1110b future */ - 0 /* 1111b future */ -}; - -/* - * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing - * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching- - * or higher value'. - * - *FIXME: Produce a better mapping/linearisation. - */ - -struct scrubrate scrubrates[] = { - { 0x01, 1600000000UL}, - { 0x02, 800000000UL}, - { 0x03, 400000000UL}, - { 0x04, 200000000UL}, - { 0x05, 100000000UL}, - { 0x06, 50000000UL}, - { 0x07, 25000000UL}, - { 0x08, 12284069UL}, - { 0x09, 6274509UL}, - { 0x0A, 3121951UL}, - { 0x0B, 1560975UL}, - { 0x0C, 781440UL}, - { 0x0D, 390720UL}, - { 0x0E, 195300UL}, - { 0x0F, 97650UL}, - { 0x10, 48854UL}, - { 0x11, 24427UL}, - { 0x12, 12213UL}, - { 0x13, 6101UL}, - { 0x14, 3051UL}, - { 0x15, 1523UL}, - { 0x16, 761UL}, - { 0x00, 0UL}, /* scrubbing off */ -}; +#include +#include "edac_mce_amd.h" /* * string representation for the different MCA reported error types, see F3x48 @@ -67,6 +11,7 @@ const char *tt_msgs[] = { /* transaction type */ "generic", "reserved" }; +EXPORT_SYMBOL_GPL(tt_msgs); const char *ll_msgs[] = { /* cache level */ "L0", @@ -74,6 +19,7 @@ const char *ll_msgs[] = { /* cache level */ "L2", "L3/generic" }; +EXPORT_SYMBOL_GPL(ll_msgs); const char *rrrr_msgs[] = { "generic", @@ -93,6 +39,7 @@ const char *rrrr_msgs[] = { "reserved RRRR= 14", "reserved RRRR= 15" }; +EXPORT_SYMBOL_GPL(rrrr_msgs); const char *pp_msgs[] = { /* participating processor */ "local node originated (SRC)", @@ -100,11 +47,13 @@ const char *pp_msgs[] = { /* participating processor */ "local node observed as 3rd party (OBS)", "generic" }; 
+EXPORT_SYMBOL_GPL(pp_msgs); const char *to_msgs[] = { "no timeout", "timed out" }; +EXPORT_SYMBOL_GPL(to_msgs); const char *ii_msgs[] = { /* memory or i/o */ "mem access", @@ -112,6 +61,7 @@ const char *ii_msgs[] = { /* memory or i/o */ "i/o access", "generic" }; +EXPORT_SYMBOL_GPL(ii_msgs); /* Map the 5 bits of Extended Error code to the string table. */ const char *ext_msgs[] = { /* extended error */ @@ -148,14 +98,4 @@ const char *ext_msgs[] = { /* extended error */ "L3 Cache LRU error", /* 1_1110b */ "Res 0x1FF error" /* 1_1111b */ }; - -const char *htlink_msgs[] = { - "none", - "1", - "2", - "1 2", - "3", - "1 3", - "2 3", - "1 2 3" -}; +EXPORT_SYMBOL_GPL(ext_msgs); diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h new file mode 100644 index 000000000000..81f9dcf9990a --- /dev/null +++ b/drivers/edac/edac_mce_amd.h @@ -0,0 +1,29 @@ +#define ERROR_CODE(x) ((x) & 0xffff) +#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) +#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) +#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) + +#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) +#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) +#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800) + +#define TT(x) (((x) >> 2) & 0x3) +#define TT_MSG(x) tt_msgs[TT(x)] +#define II(x) (((x) >> 2) & 0x3) +#define II_MSG(x) ii_msgs[II(x)] +#define LL(x) (((x) >> 0) & 0x3) +#define LL_MSG(x) ll_msgs[LL(x)] +#define RRRR(x) (((x) >> 4) & 0xf) +#define RRRR_MSG(x) rrrr_msgs[RRRR(x)] +#define TO(x) (((x) >> 8) & 0x1) +#define TO_MSG(x) to_msgs[TO(x)] +#define PP(x) (((x) >> 9) & 0x3) +#define PP_MSG(x) pp_msgs[PP(x)] + +extern const char *tt_msgs[]; +extern const char *ll_msgs[]; +extern const char *rrrr_msgs[]; +extern const char *pp_msgs[]; +extern const char *to_msgs[]; +extern const char *ii_msgs[]; +extern const char *ext_msgs[]; From 1c43f2e24d059913bce58887f1d6e4267aaed284 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 15:47:51 +0200 Subject: [PATCH 02/15] EDAC: 
beef up ErrorCodeExt error signatures Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 71 +++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index cf8465450b32..918567e8cfd5 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -63,39 +63,42 @@ const char *ii_msgs[] = { /* memory or i/o */ }; EXPORT_SYMBOL_GPL(ii_msgs); -/* Map the 5 bits of Extended Error code to the string table. */ -const char *ext_msgs[] = { /* extended error */ - "K8 ECC error/F10 reserved", /* 0_0000b */ - "CRC error", /* 0_0001b */ - "sync error", /* 0_0010b */ - "mst abort", /* 0_0011b */ - "tgt abort", /* 0_0100b */ - "GART error", /* 0_0101b */ - "RMW error", /* 0_0110b */ - "Wdog timer error", /* 0_0111b */ - "F10-ECC/K8-Chipkill error", /* 0_1000b */ - "DEV Error", /* 0_1001b */ - "Link Data error", /* 0_1010b */ - "Link or L3 Protocol error", /* 0_1011b */ - "NB Array error", /* 0_1100b */ - "DRAM Parity error", /* 0_1101b */ - "Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */ - "Res 0x0ff error", /* 0_1111b */ - "Res 0x100 error", /* 1_0000b */ - "Res 0x101 error", /* 1_0001b */ - "Res 0x102 error", /* 1_0010b */ - "Res 0x103 error", /* 1_0011b */ - "Res 0x104 error", /* 1_0100b */ - "Res 0x105 error", /* 1_0101b */ - "Res 0x106 error", /* 1_0110b */ - "Res 0x107 error", /* 1_0111b */ - "Res 0x108 error", /* 1_1000b */ - "Res 0x109 error", /* 1_1001b */ - "Res 0x10A error", /* 1_1010b */ - "Res 0x10B error", /* 1_1011b */ - "L3 Cache Data error", /* 1_1100b */ - "L3 CacheTag error", /* 1_1101b */ - "L3 Cache LRU error", /* 1_1110b */ - "Res 0x1FF error" /* 1_1111b */ +/* + * Map the 4 or 5 (family-specific) bits of Extended Error code to the + * string table. 
+ */ +const char *ext_msgs[] = { + "K8 ECC error", /* 0_0000b */ + "CRC error on link", /* 0_0001b */ + "Sync error packets on link", /* 0_0010b */ + "Master Abort during link operation", /* 0_0011b */ + "Target Abort during link operation", /* 0_0100b */ + "Invalid GART PTE entry during table walk", /* 0_0101b */ + "Unsupported atomic RMW command received", /* 0_0110b */ + "WDT error: NB transaction timeout", /* 0_0111b */ + "ECC/ChipKill ECC error", /* 0_1000b */ + "SVM DEV Error", /* 0_1001b */ + "Link Data error", /* 0_1010b */ + "Link/L3/Probe Filter Protocol error", /* 0_1011b */ + "NB Internal Arrays Parity error", /* 0_1100b */ + "DRAM Address/Control Parity error", /* 0_1101b */ + "Link Transmission error", /* 0_1110b */ + "GART/DEV Table Walk Data error" /* 0_1111b */ + "Res 0x100 error", /* 1_0000b */ + "Res 0x101 error", /* 1_0001b */ + "Res 0x102 error", /* 1_0010b */ + "Res 0x103 error", /* 1_0011b */ + "Res 0x104 error", /* 1_0100b */ + "Res 0x105 error", /* 1_0101b */ + "Res 0x106 error", /* 1_0110b */ + "Res 0x107 error", /* 1_0111b */ + "Res 0x108 error", /* 1_1000b */ + "Res 0x109 error", /* 1_1001b */ + "Res 0x10A error", /* 1_1010b */ + "Res 0x10B error", /* 1_1011b */ + "ECC error in L3 Cache Data", /* 1_1100b */ + "L3 Cache Tag error", /* 1_1101b */ + "L3 Cache LRU Parity error", /* 1_1110b */ + "Probe Filter error" /* 1_1111b */ }; EXPORT_SYMBOL_GPL(ext_msgs); From ef44cc4c2245d3c43f3c11c7bff6239852eef498 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 14:45:48 +0200 Subject: [PATCH 03/15] amd64_edac: cleanup amd64_process_error_info * mv amd64_error_info_regs -> err_regs * remove redundant info ptr Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 44 ++++++++++++++++++--------------------- drivers/edac/amd64_edac.h | 10 ++++----- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index b9e84bc91766..c9b88d829701 100644 --- 
a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -750,7 +750,7 @@ static void find_csrow_limits(struct mem_ctl_info *mci, int csrow, * specific. */ static u64 extract_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt = mci->pvt_info; @@ -1106,7 +1106,7 @@ static int k8_early_channel_count(struct amd64_pvt *pvt) /* extract the ERROR ADDRESS for the K8 CPUs */ static u64 k8_get_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { return (((u64) (info->nbeah & 0xff)) << 32) + (info->nbeal & ~0x03); @@ -1149,7 +1149,7 @@ static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram) } static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 SystemAddress) { struct mem_ctl_info *src_mci; @@ -1368,7 +1368,7 @@ static void amd64_teardown(struct amd64_pvt *pvt) } static u64 f10_get_error_address(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { return (((u64) (info->nbeah & 0xffff)) << 32) + (info->nbeal & ~0x01); @@ -1745,7 +1745,7 @@ static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, * The @sys_addr is usually an error address received from the hardware. 
*/ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 sys_addr) { struct amd64_pvt *pvt = mci->pvt_info; @@ -2102,7 +2102,7 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome) * - 0: if no valid error is indicated */ static int amd64_get_error_info_regs(struct mem_ctl_info *mci, - struct amd64_error_info_regs *regs) + struct err_regs *regs) { struct amd64_pvt *pvt; struct pci_dev *misc_f3_ctl; @@ -2151,10 +2151,10 @@ static int amd64_get_error_info_regs(struct mem_ctl_info *mci, * - 0: if no error is found */ static int amd64_get_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt; - struct amd64_error_info_regs regs; + struct err_regs regs; pvt = mci->pvt_info; @@ -2210,7 +2210,7 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, } static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); @@ -2220,7 +2220,7 @@ static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, } static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); @@ -2236,7 +2236,7 @@ static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, * ADDRESS and process. 
*/ static void amd64_handle_ce(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { struct amd64_pvt *pvt = mci->pvt_info; u64 SystemAddress; @@ -2259,7 +2259,7 @@ static void amd64_handle_ce(struct mem_ctl_info *mci, /* Handle any Un-correctable Errors (UEs) */ static void amd64_handle_ue(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { int csrow; u64 SystemAddress; @@ -2305,7 +2305,7 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); @@ -2356,22 +2356,18 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, } int amd64_process_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *regs, int handle_errors) { struct amd64_pvt *pvt; - struct amd64_error_info_regs *regs; u32 err_code, ext_ec; int gart_tlb_error = 0; pvt = mci->pvt_info; - /* If caller doesn't want us to process the error, return */ if (!handle_errors) return 1; - regs = info; - debugf1("NorthBridge ERROR: mci(0x%p)\n", mci); debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n", pvt->mc_node_id, regs->nbeah, regs->nbeal); @@ -2437,13 +2433,13 @@ int amd64_process_error_info(struct mem_ctl_info *mci, gart_tlb_error = 1; debugf1("GART TLB error\n"); - amd64_decode_gart_tlb_error(mci, info); + amd64_decode_gart_tlb_error(mci, regs); } else if (MEM_ERROR(err_code)) { debugf1("Memory/Cache error\n"); - amd64_decode_mem_cache_error(mci, info); + amd64_decode_mem_cache_error(mci, regs); } else if (BUS_ERROR(err_code)) { debugf1("Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, info); + amd64_decode_bus_error(mci, regs); } else { /* shouldn't reach here! 
*/ amd64_mc_printk(mci, KERN_WARNING, @@ -2480,10 +2476,10 @@ EXPORT_SYMBOL_GPL(amd64_process_error_info); */ static void amd64_check(struct mem_ctl_info *mci) { - struct amd64_error_info_regs info; + struct err_regs regs; - if (amd64_get_error_info(mci, &info)) - amd64_process_error_info(mci, &info, 1); + if (amd64_get_error_info(mci, ®s)) + amd64_process_error_info(mci, ®s, 1); } /* diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 1ddef8d15d52..bde8f78551f9 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -449,7 +449,7 @@ enum amd64_chipset_families { * * Depends on entry into the modules */ -struct amd64_error_info_regs { +struct err_regs { u32 nbcfg; u32 nbsh; u32 nbsl; @@ -527,7 +527,7 @@ struct amd64_pvt { u32 online_spare; /* On-Line spare Reg */ /* temp storage for when input is received from sysfs */ - struct amd64_error_info_regs ctl_error_info; + struct err_regs ctl_error_info; /* place to store error injection parameters prior to issue */ struct error_injection injection; @@ -586,11 +586,11 @@ struct low_ops { int (*early_channel_count)(struct amd64_pvt *pvt); u64 (*get_error_address)(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info); + struct err_regs *info); void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram); void (*read_dram_ctl_register)(struct amd64_pvt *pvt); void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, u64 SystemAddr); int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map); }; @@ -623,7 +623,7 @@ static inline struct low_ops *family_ops(int index) #define F11_MIN_SCRUB_RATE_BITS 0x6 int amd64_process_error_info(struct mem_ctl_info *mci, - struct amd64_error_info_regs *info, + struct err_regs *info, int handle_errors); int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); From 5110dbdeab546268dda2e4c6a83448639b2fc5ae Mon Sep 17 00:00:00 2001 
From: Borislav Petkov Date: Thu, 25 Jun 2009 19:51:04 +0200 Subject: [PATCH 04/15] amd64_edac: cleanup/complete NB MCE decoding * don't dump info which mcheck already does * update to newest BKDG * mv amd64_process_error_info -> amd64_decode_nb_mce * shorten error struct names * remove redundant info ptr in amd64_process_error_info * remove unused ErrorCodeExt[19:16] (MCx_STATUS) defines Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 127 +++++++++++++--------------------- drivers/edac/amd64_edac.h | 26 ++----- drivers/edac/amd64_edac_dbg.c | 2 +- drivers/edac/edac_mce_amd.h | 2 + 4 files changed, 57 insertions(+), 100 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index c9b88d829701..5af87d44c80c 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2355,62 +2355,47 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, "Error Overflow set"); } -int amd64_process_error_info(struct mem_ctl_info *mci, - struct err_regs *regs, - int handle_errors) +void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, + int handle_errors) { - struct amd64_pvt *pvt; - u32 err_code, ext_ec; - int gart_tlb_error = 0; - - pvt = mci->pvt_info; + struct amd64_pvt *pvt = mci->pvt_info; + int ecc; + u32 ec = ERROR_CODE(regs->nbsl); + u32 xec = EXT_ERROR_CODE(regs->nbsl); if (!handle_errors) - return 1; + return; - debugf1("NorthBridge ERROR: mci(0x%p)\n", mci); - debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n", - pvt->mc_node_id, regs->nbeah, regs->nbeal); - debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n", - regs->nbsh, regs->nbsl); - debugf1(" Valid Error=%s Overflow=%s\n", - (regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False", - (regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False"); - debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n", - (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ? - "True" : "False", - (regs->nbsh & K8_NBSH_ERR_ENABLE) ? 
- "True" : "False"); - debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n", - (regs->nbsh & K8_NBSH_MISC_ERR_VALID) ? - "True" : "False", - (regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ? - "True" : "False", - (regs->nbsh & K8_NBSH_PCC) ? - "True" : "False"); - debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n", - (regs->nbsh & K8_NBSH_CECC) ? - "True" : "False", - (regs->nbsh & K8_NBSH_UECC) ? - "True" : "False", - (regs->nbsh & K8_NBSH_ERR_SCRUBER) ? - "True" : "False"); - debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n", - (regs->nbsh & K8_NBSH_CORE0) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE1) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE2) ? "True" : "False", - (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False"); + pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id); - - err_code = ERROR_CODE(regs->nbsl); - - /* Determine which error type: - * 1) GART errors - non-fatal, developmental events - * 2) MEMORY errors - * 3) BUS errors - * 4) Unknown error + /* + * F10h, revD can disable ErrCpu[3:0] so check that first and also the + * value encoding has changed so interpret those differently */ - if (TLB_ERROR(err_code)) { + if ((boot_cpu_data.x86 == 0x10) && + (boot_cpu_data.x86_model > 8)) { + if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) + pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); + } else { + pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); + } + + pr_emerg(" Error: %sorrected", + ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); + pr_cont(", Report Error: %s", + ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); + pr_cont(", MiscV: %svalid, CPU context corrupt: %s", + ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), + ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); + + /* do the two bits[14:13] together */ + ecc = regs->nbsh & (0x3 << 13); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + if (TLB_ERROR(ec)) { /* * GART errors are intended to help graphics driver developers * to detect bad GART PTEs. 
It is recommended by AMD to disable @@ -2423,52 +2408,34 @@ int amd64_process_error_info(struct mem_ctl_info *mci, * [1] section 13.10.1 on BIOS and Kernel Developers Guide for * AMD NPT family 0Fh processors */ - if (report_gart_errors == 0) - return 1; + if (!report_gart_errors) + return; - /* - * Only if GART error reporting is requested should we generate - * any logs. - */ - gart_tlb_error = 1; - - debugf1("GART TLB error\n"); + pr_emerg("GART TLB error\n"); amd64_decode_gart_tlb_error(mci, regs); - } else if (MEM_ERROR(err_code)) { - debugf1("Memory/Cache error\n"); + } else if (MEM_ERROR(ec)) { + pr_emerg("Memory/Cache error\n"); amd64_decode_mem_cache_error(mci, regs); - } else if (BUS_ERROR(err_code)) { - debugf1("Bus (Link/DRAM) error\n"); + } else if (BUS_ERROR(ec)) { + pr_emerg("Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, regs); } else { /* shouldn't reach here! */ amd64_mc_printk(mci, KERN_WARNING, "%s(): unknown MCE error 0x%x\n", __func__, - err_code); + ec); } - ext_ec = EXT_ERROR_CODE(regs->nbsl); - amd64_mc_printk(mci, KERN_ERR, - "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]); + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. * If it was a GART error, skip that process. 
*/ - if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) { - amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n"); - if (!gart_tlb_error) - edac_mc_handle_ue_no_info(mci, "UE bit is set\n"); - } - - if (regs->nbsh & K8_NBSH_PCC) - amd64_mc_printk(mci, KERN_CRIT, - "PCC (processor context corrupt) set\n"); - - return 1; + if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) + edac_mc_handle_ue_no_info(mci, "UE bit is set"); } -EXPORT_SYMBOL_GPL(amd64_process_error_info); /* * The main polling 'check' function, called FROM the edac core to perform the @@ -2479,7 +2446,7 @@ static void amd64_check(struct mem_ctl_info *mci) struct err_regs regs; if (amd64_get_error_info(mci, ®s)) - amd64_process_error_info(mci, ®s, 1); + amd64_decode_nb_mce(mci, ®s, 1); } /* diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index bde8f78551f9..ecab0c9fd14e 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -306,16 +306,7 @@ enum { /* Family F10h: Normalized Extended Error Codes */ #define F10_NBSL_EXT_ERR_RES 0x0 -#define F10_NBSL_EXT_ERR_CRC 0x1 -#define F10_NBSL_EXT_ERR_SYNC 0x2 -#define F10_NBSL_EXT_ERR_MST 0x3 -#define F10_NBSL_EXT_ERR_TGT 0x4 -#define F10_NBSL_EXT_ERR_GART 0x5 -#define F10_NBSL_EXT_ERR_RMW 0x6 -#define F10_NBSL_EXT_ERR_WDT 0x7 #define F10_NBSL_EXT_ERR_ECC 0x8 -#define F10_NBSL_EXT_ERR_DEV 0x9 -#define F10_NBSL_EXT_ERR_LINK_DATA 0xA /* Next two are overloaded values */ #define F10_NBSL_EXT_ERR_LINK_PROTO 0xB @@ -360,18 +351,15 @@ enum { #define K8_NBSH_VALID_BIT BIT(31) #define K8_NBSH_OVERFLOW BIT(30) -#define K8_NBSH_UNCORRECTED_ERR BIT(29) -#define K8_NBSH_ERR_ENABLE BIT(28) -#define K8_NBSH_MISC_ERR_VALID BIT(27) +#define K8_NBSH_UC_ERR BIT(29) +#define K8_NBSH_ERR_EN BIT(28) +#define K8_NBSH_MISCV BIT(27) #define K8_NBSH_VALID_ERROR_ADDR BIT(26) #define K8_NBSH_PCC BIT(25) +#define K8_NBSH_ERR_CPU_VAL BIT(24) #define K8_NBSH_CECC BIT(14) #define K8_NBSH_UECC BIT(13) #define K8_NBSH_ERR_SCRUBER BIT(8) -#define K8_NBSH_CORE3 
BIT(3) -#define K8_NBSH_CORE2 BIT(2) -#define K8_NBSH_CORE1 BIT(1) -#define K8_NBSH_CORE0 BIT(0) #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) @@ -622,8 +610,8 @@ static inline struct low_ops *family_ops(int index) #define F10_MIN_SCRUB_RATE_BITS 0x5 #define F11_MIN_SCRUB_RATE_BITS 0x6 -int amd64_process_error_info(struct mem_ctl_info *mci, - struct err_regs *info, - int handle_errors); +void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info, + int handle_errors); + int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index 0a41b248a4ad..bcb4e2eba3dc 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c @@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, /* Process the Mapping request */ /* TODO: Add race prevention */ - amd64_process_error_info(mci, &pvt->ctl_error_info, 1); + amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1); return count; } diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 81f9dcf9990a..39971cdabb51 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -1,5 +1,7 @@ #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) +#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] + #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) From b7225e4fc19ce27a594cb2b868ef151bf82f8f93 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 16:05:53 +0200 Subject: [PATCH 05/15] amd64_edac: remove memory and GART TLB error decoders Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 5af87d44c80c..75842f08db83 100644 --- a/drivers/edac/amd64_edac.c +++ 
b/drivers/edac/amd64_edac.c @@ -2209,28 +2209,6 @@ static int amd64_get_error_info(struct mem_ctl_info *mci, return 1; } -static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci, - struct err_regs *info) -{ - u32 ec = ERROR_CODE(info->nbsl); - - amd64_mc_printk(mci, KERN_ERR, - "GART TLB event: transaction type(%s), " - "cache level(%s)\n", TT_MSG(ec), LL_MSG(ec)); -} - -static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci, - struct err_regs *info) -{ - u32 ec = ERROR_CODE(info->nbsl); - - amd64_mc_printk(mci, KERN_ERR, - "cache hierarchy error: memory transaction type(%s), " - "transaction type(%s), cache level(%s)\n", - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); -} - - /* * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR * ADDRESS and process. @@ -2411,19 +2389,19 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, if (!report_gart_errors) return; - pr_emerg("GART TLB error\n"); - amd64_decode_gart_tlb_error(mci, regs); + pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + TT_MSG(ec), LL_MSG(ec)); } else if (MEM_ERROR(ec)) { - pr_emerg("Memory/Cache error\n"); - amd64_decode_mem_cache_error(mci, regs); + pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," + " Cache Level: %s", + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { - pr_emerg("Bus (Link/DRAM) error\n"); + pr_emerg(" Bus (Link/DRAM) error\n"); amd64_decode_bus_error(mci, regs); } else { /* shouldn't reach here! 
*/ amd64_mc_printk(mci, KERN_WARNING, - "%s(): unknown MCE error 0x%x\n", __func__, - ec); + "%s(): unknown MCE error 0x%x\n", __func__, ec); } pr_emerg("%s.\n", EXT_ERR_MSG(xec)); From ecaf5606de65cdd04de5f526185fe28fb0df654e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 23 Jul 2009 16:32:01 +0200 Subject: [PATCH 06/15] amd64_edac: cleanup amd64_decode_bus_error Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 75842f08db83..82f48ee90f11 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2283,42 +2283,26 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct err_regs *info) + struct err_regs *info, int ecc_type) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); - amd64_mc_printk(mci, KERN_ERR, - "BUS ERROR:\n" - " time-out(%s) mem or i/o(%s)\n" - " participating processor(%s)\n" - " memory transaction type(%s)\n" - " cache level(%s) Error Found by: %s\n", - TO_MSG(ec), II_MSG(ec), PP_MSG(ec), RRRR_MSG(ec), LL_MSG(ec), - (info->nbsh & K8_NBSH_ERR_SCRUBER) ? 
- "Scrubber" : "Normal Operation"); + pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n", + RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); /* Bail early out if this was an 'observed' error */ if (PP(ec) == K8_NBSL_PP_OBS) return; - /* Parse out the extended error code for ECC events */ - switch (xec) { - /* F10 changed to one Extended ECC error code */ - case F10_NBSL_EXT_ERR_RES: /* Reserved field */ - case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */ - break; - - default: - amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error " - "handling for this error\n"); + /* Do only ECC errors */ + if (xec && xec != F10_NBSL_EXT_ERR_ECC) return; - } - if (info->nbsh & K8_NBSH_CECC) + if (ecc_type == 2) amd64_handle_ce(mci, info); - else if (info->nbsh & K8_NBSH_UECC) + else if (ecc_type == 1) amd64_handle_ue(mci, info); /* @@ -2329,8 +2313,7 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, * catastrophic. */ if (info->nbsh & K8_NBSH_OVERFLOW) - edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR - "Error Overflow set"); + edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, @@ -2397,7 +2380,7 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { pr_emerg(" Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, regs); + amd64_decode_bus_error(mci, regs, ecc); } else { /* shouldn't reach here! */ amd64_mc_printk(mci, KERN_WARNING, From 549d042df240dfb4203bab40ad44f9336751b7d6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 24 Jul 2009 13:51:42 +0200 Subject: [PATCH 07/15] x86, mce: pass mce info to EDAC for decoding Move NB decoder along with required defines to EDAC MCE core. Add registration routines for further decoding of the MCE info in the AMD64 EDAC module. 
CC: Andi Kleen Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 7 ++ drivers/edac/amd64_edac.c | 98 +++++++------------------- drivers/edac/amd64_edac.h | 36 ---------- drivers/edac/amd64_edac_dbg.c | 2 +- drivers/edac/edac_mce_amd.c | 115 +++++++++++++++++++++++++++++++ drivers/edac/edac_mce_amd.h | 38 ++++++++++ 6 files changed, 185 insertions(+), 111 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 01213048f62f..b82866f6adf5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -183,6 +183,11 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +void __weak decode_mce(struct mce *m) +{ + return; +} + static void print_mce(struct mce *m) { printk(KERN_EMERG @@ -205,6 +210,8 @@ static void print_mce(struct mce *m) printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + + decode_mce(m); } static void print_mce_head(void) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 82f48ee90f11..2080b1e2e8a2 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2282,8 +2282,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } } -static void amd64_decode_bus_error(struct mem_ctl_info *mci, - struct err_regs *info, int ecc_type) +static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, + struct err_regs *info, int ecc_type) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); @@ -2316,86 +2316,23 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci, edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } -void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs, - int handle_errors) +void amd64_decode_bus_error(int node_id, struct err_regs *regs, + int ecc_type) { - struct amd64_pvt *pvt = mci->pvt_info; - int ecc; - u32 ec = ERROR_CODE(regs->nbsl); - u32 xec = 
EXT_ERROR_CODE(regs->nbsl); + struct mem_ctl_info *mci = mci_lookup[node_id]; - if (!handle_errors) - return; - - pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id); - - /* - * F10h, revD can disable ErrCpu[3:0] so check that first and also the - * value encoding has changed so interpret those differently - */ - if ((boot_cpu_data.x86 == 0x10) && - (boot_cpu_data.x86_model > 8)) { - if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); - } else { - pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); - } - - pr_emerg(" Error: %sorrected", - ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); - pr_cont(", Report Error: %s", - ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); - pr_cont(", MiscV: %svalid, CPU context corrupt: %s", - ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), - ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); - - /* do the two bits[14:13] together */ - ecc = regs->nbsh & (0x3 << 13); - if (ecc) - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); - - pr_cont("\n"); - - if (TLB_ERROR(ec)) { - /* - * GART errors are intended to help graphics driver developers - * to detect bad GART PTEs. It is recommended by AMD to disable - * GART table walk error reporting by default[1] (currently - * being disabled in mce_cpu_quirks()) and according to the - * comment in mce_cpu_quirks(), such GART errors can be - * incorrectly triggered. We may see these errors anyway and - * unless requested by the user, they won't be reported. 
- * - * [1] section 13.10.1 on BIOS and Kernel Developers Guide for - * AMD NPT family 0Fh processors - */ - if (!report_gart_errors) - return; - - pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", - TT_MSG(ec), LL_MSG(ec)); - } else if (MEM_ERROR(ec)) { - pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," - " Cache Level: %s", - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); - } else if (BUS_ERROR(ec)) { - pr_emerg(" Bus (Link/DRAM) error\n"); - amd64_decode_bus_error(mci, regs, ecc); - } else { - /* shouldn't reach here! */ - amd64_mc_printk(mci, KERN_WARNING, - "%s(): unknown MCE error 0x%x\n", __func__, ec); - } - - pr_emerg("%s.\n", EXT_ERR_MSG(xec)); + __amd64_decode_bus_error(mci, regs, ecc_type); /* * Check the UE bit of the NB status high register, if set generate some * logs. If NOT a GART error, then process the event as a NO-INFO event. * If it was a GART error, skip that process. + * + * FIXME: this should go somewhere else, if at all. */ if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) edac_mc_handle_ue_no_info(mci, "UE bit is set"); + } /* @@ -2406,8 +2343,10 @@ static void amd64_check(struct mem_ctl_info *mci) { struct err_regs regs; - if (amd64_get_error_info(mci, &regs)) - amd64_decode_nb_mce(mci, &regs, 1); + if (amd64_get_error_info(mci, &regs)) { + struct amd64_pvt *pvt = mci->pvt_info; + amd_decode_nb_mce(pvt->mc_node_id, &regs, 1); + } } /* @@ -3103,6 +3042,13 @@ static int amd64_init_2nd_stage(struct amd64_pvt *pvt) mci_lookup[node_id] = mci; pvt_lookup[node_id] = NULL; + + /* register stuff with EDAC MCE */ + if (report_gart_errors) + amd_report_gart_errors(true); + + amd_register_ecc_decoder(amd64_decode_bus_error); + return 0; err_add_mc: @@ -3169,6 +3115,10 @@ static void __devexit amd64_remove_one_instance(struct pci_dev *pdev) mci_lookup[pvt->mc_node_id] = NULL; + /* unregister from EDAC MCE */ + amd_report_gart_errors(false); + amd_unregister_ecc_decoder(amd64_decode_bus_error); + /* Free the EDAC CORE resources */
edac_mc_free(mci); } diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index ecab0c9fd14e..8ea07e2715dc 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -346,24 +346,8 @@ enum { #define K8_NBSL_PP_OBS 0x2 #define K8_NBSL_PP_GENERIC 0x3 - -#define K8_NBSH 0x4C - -#define K8_NBSH_VALID_BIT BIT(31) -#define K8_NBSH_OVERFLOW BIT(30) -#define K8_NBSH_UC_ERR BIT(29) -#define K8_NBSH_ERR_EN BIT(28) -#define K8_NBSH_MISCV BIT(27) -#define K8_NBSH_VALID_ERROR_ADDR BIT(26) -#define K8_NBSH_PCC BIT(25) -#define K8_NBSH_ERR_CPU_VAL BIT(24) -#define K8_NBSH_CECC BIT(14) -#define K8_NBSH_UECC BIT(13) -#define K8_NBSH_ERR_SCRUBER BIT(8) - #define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF) - #define K8_NBEAL 0x50 #define K8_NBEAH 0x54 #define K8_SCRCTRL 0x58 @@ -428,23 +412,6 @@ enum amd64_chipset_families { F11_CPUS, }; -/* - * Structure to hold: - * - * 1) dynamically read status and error address HW registers - * 2) sysfs entered values - * 3) MCE values - * - * Depends on entry into the modules - */ -struct err_regs { - u32 nbcfg; - u32 nbsh; - u32 nbsl; - u32 nbeah; - u32 nbeal; -}; - /* Error injection control structure */ struct error_injection { u32 section; @@ -610,8 +577,5 @@ static inline struct low_ops *family_ops(int index) #define F10_MIN_SCRUB_RATE_BITS 0x5 #define F11_MIN_SCRUB_RATE_BITS 0x6 -void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info, - int handle_errors); - int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index bcb4e2eba3dc..59cf2cf6e11e 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c @@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, /* Process the Mapping request */ /* TODO: Add race prevention */ - amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1); + amd_decode_nb_mce(pvt->mc_node_id, 
&pvt->ctl_error_info, 1); return count; } diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 918567e8cfd5..444c2cc4472d 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -1,6 +1,31 @@ #include #include "edac_mce_amd.h" +static bool report_gart_errors; +static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type); + +void amd_report_gart_errors(bool v) +{ + report_gart_errors = v; +} +EXPORT_SYMBOL_GPL(amd_report_gart_errors); + +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)) +{ + nb_bus_decoder = f; +} +EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); + +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)) +{ + if (nb_bus_decoder) { + WARN_ON(nb_bus_decoder != f); + + nb_bus_decoder = NULL; + } +} +EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); + /* * string representation for the different MCA reported error types, see F3x48 * or MSR0000_0411. @@ -102,3 +127,93 @@ const char *ext_msgs[] = { "Probe Filter error" /* 1_1111b */ }; EXPORT_SYMBOL_GPL(ext_msgs); + +void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) +{ + int ecc; + u32 ec = ERROR_CODE(regs->nbsl); + u32 xec = EXT_ERROR_CODE(regs->nbsl); + + if (!handle_errors) + return; + + pr_emerg(" Northbridge Error, node %d", node_id); + + /* + * F10h, revD can disable ErrCpu[3:0] so check that first and also the + * value encoding has changed so interpret those differently + */ + if ((boot_cpu_data.x86 == 0x10) && + (boot_cpu_data.x86_model > 8)) { + if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) + pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); + } else { + pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); + } + + pr_emerg(" Error: %sorrected", + ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); + pr_cont(", Report Error: %s", + ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); + pr_cont(", MiscV: %svalid, CPU context corrupt: %s", + ((regs->nbsh & K8_NBSH_MISCV) ? 
"" : "In"), + ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); + + /* do the two bits[14:13] together */ + ecc = regs->nbsh & (0x3 << 13); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + if (TLB_ERROR(ec)) { + /* + * GART errors are intended to help graphics driver developers + * to detect bad GART PTEs. It is recommended by AMD to disable + * GART table walk error reporting by default[1] (currently + * being disabled in mce_cpu_quirks()) and according to the + * comment in mce_cpu_quirks(), such GART errors can be + * incorrectly triggered. We may see these errors anyway and + * unless requested by the user, they won't be reported. + * + * [1] section 13.10.1 on BIOS and Kernel Developers Guide for + * AMD NPT family 0Fh processors + */ + if (!report_gart_errors) + return; + + pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + TT_MSG(ec), LL_MSG(ec)); + } else if (MEM_ERROR(ec)) { + pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s," + " Cache Level: %s", + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); + } else if (BUS_ERROR(ec)) { + pr_emerg(" Bus (Link/DRAM) error\n"); + if (nb_bus_decoder) + nb_bus_decoder(node_id, regs, ecc); + } else { + /* shouldn't reach here! 
*/ + pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); + } + + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); +} +EXPORT_SYMBOL_GPL(amd_decode_nb_mce); + +void decode_mce(struct mce *m) +{ + struct err_regs regs; + int node; + + if (m->bank != 4) + return; + + regs.nbsl = (u32) m->status; + regs.nbsh = (u32)(m->status >> 32); + regs.nbeal = (u32) m->addr; + regs.nbeah = (u32)(m->addr >> 32); + node = topology_cpu_node_id(m->extcpu); + + amd_decode_nb_mce(node, &regs, 1); +} diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 39971cdabb51..9114dc62782b 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -1,3 +1,8 @@ +#ifndef _EDAC_MCE_AMD_H +#define _EDAC_MCE_AMD_H + +#include + #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) #define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] @@ -22,6 +27,20 @@ #define PP(x) (((x) >> 9) & 0x3) #define PP_MSG(x) pp_msgs[PP(x)] +#define K8_NBSH 0x4C + +#define K8_NBSH_VALID_BIT BIT(31) +#define K8_NBSH_OVERFLOW BIT(30) +#define K8_NBSH_UC_ERR BIT(29) +#define K8_NBSH_ERR_EN BIT(28) +#define K8_NBSH_MISCV BIT(27) +#define K8_NBSH_VALID_ERROR_ADDR BIT(26) +#define K8_NBSH_PCC BIT(25) +#define K8_NBSH_ERR_CPU_VAL BIT(24) +#define K8_NBSH_CECC BIT(14) +#define K8_NBSH_UECC BIT(13) +#define K8_NBSH_ERR_SCRUBER BIT(8) + extern const char *tt_msgs[]; extern const char *ll_msgs[]; extern const char *rrrr_msgs[]; @@ -29,3 +48,22 @@ extern const char *pp_msgs[]; extern const char *to_msgs[]; extern const char *ii_msgs[]; extern const char *ext_msgs[]; + +/* + * relevant NB regs + */ +struct err_regs { + u32 nbcfg; + u32 nbsh; + u32 nbsl; + u32 nbeah; + u32 nbeal; +}; + + +void amd_report_gart_errors(bool); +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)); +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)); +void amd_decode_nb_mce(int, struct err_regs *, int); + +#endif /* _EDAC_MCE_AMD_H */ From
b69b29de65fe4078b125acc9dea34be82f7c362c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 27 Jul 2009 16:21:14 +0200 Subject: [PATCH 08/15] EDAC, AMD: carve out MCi_STATUS decoding The MCi_STATUS registers have most field definitions in common so decode them in the general path. Do not pass ecc_type along and compute it in __amd64_decode_bus_error instead. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 8 +++--- drivers/edac/edac_mce_amd.c | 57 ++++++++++++++++++------------------- drivers/edac/edac_mce_amd.h | 4 +-- 3 files changed, 34 insertions(+), 35 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 2080b1e2e8a2..c81ca2cf8dc7 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2283,10 +2283,11 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, } static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, - struct err_regs *info, int ecc_type) + struct err_regs *info) { u32 ec = ERROR_CODE(info->nbsl); u32 xec = EXT_ERROR_CODE(info->nbsl); + int ecc_type = info->nbsh & (0x3 << 13); pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n", RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); @@ -2316,12 +2317,11 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow"); } -void amd64_decode_bus_error(int node_id, struct err_regs *regs, - int ecc_type) +void amd64_decode_bus_error(int node_id, struct err_regs *regs) { struct mem_ctl_info *mci = mci_lookup[node_id]; - __amd64_decode_bus_error(mci, regs, ecc_type); + __amd64_decode_bus_error(mci, regs); /* * Check the UE bit of the NB status high register, if set generate some diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 444c2cc4472d..0ba92d65db43 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -2,7 +2,7 @@ #include "edac_mce_amd.h" static bool report_gart_errors; 
-static void (*nb_bus_decoder)(int node_id, struct err_regs *regs, int ecc_type); +static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); void amd_report_gart_errors(bool v) { @@ -10,13 +10,13 @@ void amd_report_gart_errors(bool v) } EXPORT_SYMBOL_GPL(amd_report_gart_errors); -void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)) +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)) { nb_bus_decoder = f; } EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); -void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)) +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)) { if (nb_bus_decoder) { WARN_ON(nb_bus_decoder != f); @@ -130,7 +130,6 @@ EXPORT_SYMBOL_GPL(ext_msgs); void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { - int ecc; u32 ec = ERROR_CODE(regs->nbsl); u32 xec = EXT_ERROR_CODE(regs->nbsl); @@ -151,21 +150,6 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); } - pr_emerg(" Error: %sorrected", - ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C")); - pr_cont(", Report Error: %s", - ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no")); - pr_cont(", MiscV: %svalid, CPU context corrupt: %s", - ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"), - ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no")); - - /* do the two bits[14:13] together */ - ecc = regs->nbsh & (0x3 << 13); - if (ecc) - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); - - pr_cont("\n"); - if (TLB_ERROR(ec)) { /* * GART errors are intended to help graphics driver developers @@ -191,7 +175,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) } else if (BUS_ERROR(ec)) { pr_emerg(" Bus (Link/DRAM) error\n"); if (nb_bus_decoder) - nb_bus_decoder(node_id, regs, ecc); + nb_bus_decoder(node_id, regs); } else { /* shouldn't reach here! 
*/ pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); @@ -204,16 +188,31 @@ EXPORT_SYMBOL_GPL(amd_decode_nb_mce); void decode_mce(struct mce *m) { struct err_regs regs; - int node; + int node, ecc; - if (m->bank != 4) - return; + pr_emerg("MC%d_STATUS:\n", m->bank); - regs.nbsl = (u32) m->status; - regs.nbsh = (u32)(m->status >> 32); - regs.nbeal = (u32) m->addr; - regs.nbeah = (u32)(m->addr >> 32); - node = topology_cpu_node_id(m->extcpu); + pr_emerg(" Error: %sorrected, Report: %s, MiscV: %svalid, " + "CPU context corrupt: %s", + ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), + ((m->status & MCI_STATUS_EN) ? "yes" : "no"), + ((m->status & MCI_STATUS_MISCV) ? "" : "in"), + ((m->status & MCI_STATUS_PCC) ? "yes" : "no")); - amd_decode_nb_mce(node, &regs, 1); + /* do the two bits[14:13] together */ + ecc = m->status & (3ULL << 45); + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + if (m->bank == 4) { + regs.nbsl = (u32) m->status; + regs.nbsh = (u32)(m->status >> 32); + regs.nbeal = (u32) m->addr; + regs.nbeah = (u32)(m->addr >> 32); + node = per_cpu(cpu_llc_id, m->extcpu); + + amd_decode_nb_mce(node, &regs, 1); + } } diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 9114dc62782b..df23ee065f79 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -62,8 +62,8 @@ struct err_regs { void amd_report_gart_errors(bool); -void amd_register_ecc_decoder(void (*f)(int, struct err_regs *, int)); -void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *, int)); +void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); +void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); void amd_decode_nb_mce(int, struct err_regs *, int); #endif /* _EDAC_MCE_AMD_H */ From d93cc222adf3532ddb442648f8db00c15d1dc4c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 10:56:15 +0200 Subject: [PATCH 09/15] EDAC, AMD: carve out decoding of MCi_STATUS ErrorCode This
is the MCE error code from the MCi_STATUS banks, bits [15:0] which describe what type of error was encountered: GART TLB, Memory or Bus error. The semantics of those bits are identical across all MCE banks so decode those separately, irrespectively of MCE type. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 4 ---- drivers/edac/edac_mce_amd.c | 37 ++++++++++++++++++++++--------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index c81ca2cf8dc7..173dc4a84166 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2289,10 +2289,6 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, u32 xec = EXT_ERROR_CODE(info->nbsl); int ecc_type = info->nbsh & (0x3 << 13); - pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n", - RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); - - /* Bail early out if this was an 'observed' error */ if (PP(ec) == K8_NBSL_PP_OBS) return; diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 0ba92d65db43..81f812eb3aea 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -150,6 +150,16 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf))); } + + pr_emerg("%s.\n", EXT_ERR_MSG(xec)); + + if (BUS_ERROR(ec) && nb_bus_decoder) + nb_bus_decoder(node_id, regs); +} +EXPORT_SYMBOL_GPL(amd_decode_nb_mce); + +static inline void amd_decode_err_code(unsigned int ec) +{ if (TLB_ERROR(ec)) { /* * GART errors are intended to help graphics driver developers @@ -166,33 +176,28 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) if (!report_gart_errors) return; - pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n", + pr_emerg(" Transaction: %s, Cache Level %s\n", TT_MSG(ec), LL_MSG(ec)); } else if (MEM_ERROR(ec)) { - pr_emerg(" Memory/Cache error, 
Transaction: %s, Type: %s," - " Cache Level: %s", + pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s", RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { - pr_emerg(" Bus (Link/DRAM) error\n"); - if (nb_bus_decoder) - nb_bus_decoder(node_id, regs); - } else { - /* shouldn't reach here! */ - pr_warning("%s: unknown MCE error 0x%x\n", __func__, ec); - } - - pr_emerg("%s.\n", EXT_ERR_MSG(xec)); + pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, " + "Participating Processor: %s\n", + RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), + PP_MSG(ec)); + } else + pr_warning("Huh? Unknown MCE error 0x%x\n", ec); } -EXPORT_SYMBOL_GPL(amd_decode_nb_mce); void decode_mce(struct mce *m) { struct err_regs regs; int node, ecc; - pr_emerg("MC%d_STATUS:\n", m->bank); + pr_emerg("MC%d_STATUS: ", m->bank); - pr_emerg(" Error: %sorrected, Report: %s, MiscV: %svalid, " + pr_cont("%sorrected error, report: %s, MiscV: %svalid, " "CPU context corrupt: %s", ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), ((m->status & MCI_STATUS_EN) ? "yes" : "no"), @@ -206,6 +211,8 @@ void decode_mce(struct mce *m) pr_cont("\n"); + amd_decode_err_code(m->status & 0xffff); + if (m->bank == 4) { regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); From 51966241360874e85d1e4d93c9fcdd2ef917b0fb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 13:50:43 +0200 Subject: [PATCH 10/15] EDAC, AMD: decode data cache MCEs Those get reported in MC0_STATUS, see Table 92, F10h BKDG (31116, rev. 3.28) for more details. 
Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 56 +++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 81f812eb3aea..fe8ccebd9672 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -128,6 +128,49 @@ const char *ext_msgs[] = { }; EXPORT_SYMBOL_GPL(ext_msgs); +static void amd_decode_dc_mce(u64 mc0_status) +{ + u32 ec = mc0_status & 0xffff; + u32 xec = (mc0_status >> 16) & 0xf; + + pr_emerg(" Data Cache Error"); + + if (xec == 1 && TLB_ERROR(ec)) + pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); + else if (xec == 0) { + if (mc0_status & (1ULL << 40)) + pr_cont(" during Data Scrub.\n"); + else if (TLB_ERROR(ec)) + pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); + else if (MEM_ERROR(ec)) { + u8 ll = ec & 0x3; + u8 tt = (ec >> 2) & 0x3; + u8 rrrr = (ec >> 4) & 0xf; + + /* see F10h BKDG (31116), Table 92. */ + if (ll == 0x1) { + if (tt != 0x1) + goto wrong_dc_mce; + + pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec)); + + } else if (ll == 0x2 && rrrr == 0x3) + pr_cont(" during L1 linefill from L2.\n"); + else + goto wrong_dc_mce; + } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf) + pr_cont(" during system linefill.\n"); + else + goto wrong_dc_mce; + } else + goto wrong_dc_mce; + + return; + +wrong_dc_mce: + pr_warning("Corrupted DC MCE info?\n"); +} + void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { u32 ec = ERROR_CODE(regs->nbsl); @@ -211,9 +254,12 @@ void decode_mce(struct mce *m) pr_cont("\n"); - amd_decode_err_code(m->status & 0xffff); + switch (m->bank) { + case 0: + amd_decode_dc_mce(m->status); + break; - if (m->bank == 4) { + case 4: regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); regs.nbeal = (u32) m->addr; @@ -221,5 +267,11 @@ void decode_mce(struct mce *m) node = per_cpu(cpu_llc_id, m->extcpu); amd_decode_nb_mce(node, &regs, 1); + break; + + default: + break; }
+ + amd_decode_err_code(m->status & 0xffff); } From ab5535e70fb35b8046b6ace50259fe212e074a4f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:06:26 +0200 Subject: [PATCH 11/15] EDAC, AMD: decode instruction cache MCEs See Fam10h BKDG (31116, rev. 3.28), Table 95 Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 61 +++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index fe8ccebd9672..b30a8306b143 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -171,6 +171,63 @@ static void amd_decode_dc_mce(u64 mc0_status) pr_warning("Corrupted DC MCE info?\n"); } +static void amd_decode_ic_mce(u64 mc1_status) +{ + u32 ec = mc1_status & 0xffff; + u32 xec = (mc1_status >> 16) & 0xf; + + pr_emerg(" Instruction Cache Error"); + + if (xec == 1 && TLB_ERROR(ec)) + pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); + else if (xec == 0) { + if (TLB_ERROR(ec)) + pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); + else if (BUS_ERROR(ec)) { + if (boot_cpu_data.x86 == 0xf && + (mc1_status & (1ULL << 58))) + pr_cont(" during system linefill.\n"); + else + pr_cont(" during attempted NB data read.\n"); + } else if (MEM_ERROR(ec)) { + u8 ll = ec & 0x3; + u8 rrrr = (ec >> 4) & 0xf; + + if (ll == 0x2) + pr_cont(" during a linefill from L2.\n"); + else if (ll == 0x1) { + + switch (rrrr) { + case 0x5: + pr_cont(": Parity error during " + "data load.\n"); + break; + + case 0x7: + pr_cont(": Copyback Parity/Victim" + " error.\n"); + break; + + case 0x8: + pr_cont(": Tag Snoop error.\n"); + break; + + default: + goto wrong_ic_mce; + break; + } + } + } else + goto wrong_ic_mce; + } else + goto wrong_ic_mce; + + return; + +wrong_ic_mce: + pr_warning("Corrupted IC MCE info?\n"); +} + void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { u32 ec = ERROR_CODE(regs->nbsl); @@ -259,6 +316,10 @@ void decode_mce(struct mce *m) 
amd_decode_dc_mce(m->status); break; + case 1: + amd_decode_ic_mce(m->status); + break; + case 4: regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); From 56cad2d6fb832a876ab8bda4b01e5d0722dc754b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:14:24 +0200 Subject: [PATCH 12/15] EDAC, AMD: decode bus unit MCEs ... according to Table 69, Fam10h BKDG (31116, rev. 3.28). Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index b30a8306b143..e1f32c36248d 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -228,6 +228,48 @@ static void amd_decode_ic_mce(u64 mc1_status) pr_warning("Corrupted IC MCE info?\n"); } +static void amd_decode_bu_mce(u64 mc2_status) +{ + u32 ec = mc2_status & 0xffff; + u32 xec = (mc2_status >> 16) & 0xf; + + pr_emerg(" Bus Unit Error"); + + if (xec == 0x1) + pr_cont(" in the write data buffers.\n"); + else if (xec == 0x3) + pr_cont(" in the victim data buffers.\n"); + else if (xec == 0x2 && MEM_ERROR(ec)) + pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec)); + else if (xec == 0x0) { + if (TLB_ERROR(ec)) + pr_cont(": %s error in a Page Descriptor Cache or " + "Guest TLB.\n", TT_MSG(ec)); + else if (BUS_ERROR(ec)) + pr_cont(": %s/ECC error in data read from NB: %s.\n", + RRRR_MSG(ec), PP_MSG(ec)); + else if (MEM_ERROR(ec)) { + u8 rrrr = (ec >> 4) & 0xf; + + if (rrrr >= 0x7) + pr_cont(": %s error during data copyback.\n", + RRRR_MSG(ec)); + else if (rrrr <= 0x1) + pr_cont(": %s parity/ECC error during data " + "access from L2.\n", RRRR_MSG(ec)); + else + goto wrong_bu_mce; + } else + goto wrong_bu_mce; + } else + goto wrong_bu_mce; + + return; + +wrong_bu_mce: + pr_warning("Corrupted BU MCE info?\n"); +} + void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { u32 ec = ERROR_CODE(regs->nbsl); @@ 
-320,6 +362,10 @@ void decode_mce(struct mce *m) amd_decode_ic_mce(m->status); break; + case 2: + amd_decode_bu_mce(m->status); + break; + case 4: regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); From f9350efd6f37ef60d2334739edb76ef1f8ee0183 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:17:30 +0200 Subject: [PATCH 13/15] EDAC, AMD: decode load store MCEs See Fam10h BKDG (31116, rev. 3.28), Table 100. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index e1f32c36248d..228482855362 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -270,6 +270,27 @@ static void amd_decode_bu_mce(u64 mc2_status) pr_warning("Corrupted BU MCE info?\n"); } +static void amd_decode_ls_mce(u64 mc3_status) +{ + u32 ec = mc3_status & 0xffff; + u32 xec = (mc3_status >> 16) & 0xf; + + pr_emerg(" Load Store Error"); + + if (xec == 0x0) { + u8 rrrr = (ec >> 4) & 0xf; + + if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4)) + goto wrong_ls_mce; + + pr_cont(" during %s.\n", RRRR_MSG(ec)); + } + return; + +wrong_ls_mce: + pr_warning("Corrupted LS MCE info?\n"); +} + void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) { u32 ec = ERROR_CODE(regs->nbsl); @@ -366,6 +387,10 @@ void decode_mce(struct mce *m) amd_decode_bu_mce(m->status); break; + case 3: + amd_decode_ls_mce(m->status); + break; + case 4: regs.nbsl = (u32) m->status; regs.nbsh = (u32)(m->status >> 32); From 53bd5fedca7d0c28b35b02cab5f4e27bf8d7fabe Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:20:46 +0200 Subject: [PATCH 14/15] EDAC, AMD: decode FR MCEs See Fam10h BKDG (31116, rev. 3.28), Table 101. 
Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 228482855362..c8ca7136dacc 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -321,6 +321,15 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) } EXPORT_SYMBOL_GPL(amd_decode_nb_mce); +static void amd_decode_fr_mce(u64 mc5_status) +{ + /* we have only one error signature so match all fields at once. */ + if ((mc5_status & 0xffff) == 0x0f0f) + pr_emerg(" FR Error: CPU Watchdog timer expire.\n"); + else + pr_warning("Corrupted FR MCE info?\n"); +} + static inline void amd_decode_err_code(unsigned int ec) { if (TLB_ERROR(ec)) { @@ -401,6 +410,10 @@ void decode_mce(struct mce *m) amd_decode_nb_mce(node, &regs, 1); break; + case 5: + amd_decode_fr_mce(m->status); + break; + default: break; } From 22223c9b417be5fd0ab2cf9ad17eb7bd1e19f7b9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 28 Jul 2009 14:47:10 +0200 Subject: [PATCH 15/15] x86, mce: do not compile mcelog message on AMD Now that decoding is done in-kernel, suppress mcelog message part. CC: Andi Kleen Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b82866f6adf5..9bfe9d2ea615 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -222,7 +222,10 @@ static void print_mce_head(void) static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" - "Run through mcelog --ascii to decode and contact your hardware vendor\n"); +#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD)) + "Run through mcelog --ascii to decode and contact your hardware vendor\n" +#endif + ); } #define PANIC_TIMEOUT 5 /* 5 seconds */