linux_dsm_epyc7002/drivers/edac/dmc520_edac.c

657 lines
16 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* EDAC driver for DMC-520 memory controller.
*
* The driver supports 10 interrupt lines,
* though only dram_ecc_errc and dram_ecc_errd are currently handled.
*
* Authors: Rui Zhao <ruizhao@microsoft.com>
* Lei Wang <lewan@microsoft.com>
* Shiping Ji <shji@microsoft.com>
*/
#include <linux/bitfield.h>
#include <linux/edac.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "edac_mc.h"
/* DMC-520 registers */
#define REG_OFFSET_FEATURE_CONFIG 0x130
#define REG_OFFSET_ECC_ERRC_COUNT_31_00 0x158
#define REG_OFFSET_ECC_ERRC_COUNT_63_32 0x15C
#define REG_OFFSET_ECC_ERRD_COUNT_31_00 0x160
#define REG_OFFSET_ECC_ERRD_COUNT_63_32 0x164
#define REG_OFFSET_INTERRUPT_CONTROL 0x500
#define REG_OFFSET_INTERRUPT_CLR 0x508
#define REG_OFFSET_INTERRUPT_STATUS 0x510
#define REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_31_00 0x528
#define REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_63_32 0x52C
#define REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_31_00 0x530
#define REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_63_32 0x534
#define REG_OFFSET_ADDRESS_CONTROL_NOW 0x1010
#define REG_OFFSET_MEMORY_TYPE_NOW 0x1128
#define REG_OFFSET_SCRUB_CONTROL0_NOW 0x1170
#define REG_OFFSET_FORMAT_CONTROL 0x18
/* DMC-520 types, masks and bitfields */
#define RAM_ECC_INT_CE_BIT BIT(0)
#define RAM_ECC_INT_UE_BIT BIT(1)
#define DRAM_ECC_INT_CE_BIT BIT(2)
#define DRAM_ECC_INT_UE_BIT BIT(3)
#define FAILED_ACCESS_INT_BIT BIT(4)
#define FAILED_PROG_INT_BIT BIT(5)
#define LINK_ERR_INT_BIT BIT(6)
#define TEMPERATURE_EVENT_INT_BIT BIT(7)
#define ARCH_FSM_INT_BIT BIT(8)
#define PHY_REQUEST_INT_BIT BIT(9)
#define MEMORY_WIDTH_MASK GENMASK(1, 0)
#define SCRUB_TRIGGER0_NEXT_MASK GENMASK(1, 0)
#define REG_FIELD_DRAM_ECC_ENABLED GENMASK(1, 0)
#define REG_FIELD_MEMORY_TYPE GENMASK(2, 0)
#define REG_FIELD_DEVICE_WIDTH GENMASK(9, 8)
#define REG_FIELD_ADDRESS_CONTROL_COL GENMASK(2, 0)
#define REG_FIELD_ADDRESS_CONTROL_ROW GENMASK(10, 8)
#define REG_FIELD_ADDRESS_CONTROL_BANK GENMASK(18, 16)
#define REG_FIELD_ADDRESS_CONTROL_RANK GENMASK(25, 24)
#define REG_FIELD_ERR_INFO_LOW_VALID BIT(0)
#define REG_FIELD_ERR_INFO_LOW_COL GENMASK(10, 1)
#define REG_FIELD_ERR_INFO_LOW_ROW GENMASK(28, 11)
#define REG_FIELD_ERR_INFO_LOW_RANK GENMASK(31, 29)
#define REG_FIELD_ERR_INFO_HIGH_BANK GENMASK(3, 0)
#define REG_FIELD_ERR_INFO_HIGH_VALID BIT(31)
#define DRAM_ADDRESS_CONTROL_MIN_COL_BITS 8
#define DRAM_ADDRESS_CONTROL_MIN_ROW_BITS 11
#define DMC520_SCRUB_TRIGGER_ERR_DETECT 2
#define DMC520_SCRUB_TRIGGER_IDLE 3
/* Driver settings */
/*
* The max-length message would be: "rank:7 bank:15 row:262143 col:1023".
* Max length is 34. Using a 40-size buffer is enough.
*/
#define DMC520_MSG_BUF_SIZE 40
#define EDAC_MOD_NAME "dmc520-edac"
#define EDAC_CTL_NAME "dmc520"
/* the data bus width for the attached memory chips. */
enum dmc520_mem_width {
MEM_WIDTH_X32 = 2,
MEM_WIDTH_X64 = 3
};
/* memory type */
enum dmc520_mem_type {
MEM_TYPE_DDR3 = 1,
MEM_TYPE_DDR4 = 2
};
/* memory device width */
enum dmc520_dev_width {
DEV_WIDTH_X4 = 0,
DEV_WIDTH_X8 = 1,
DEV_WIDTH_X16 = 2
};
struct ecc_error_info {
u32 col;
u32 row;
u32 bank;
u32 rank;
};
/* The interrupt config */
struct dmc520_irq_config {
char *name;
int mask;
};
/* The interrupt mappings */
static struct dmc520_irq_config dmc520_irq_configs[] = {
{
.name = "ram_ecc_errc",
.mask = RAM_ECC_INT_CE_BIT
},
{
.name = "ram_ecc_errd",
.mask = RAM_ECC_INT_UE_BIT
},
{
.name = "dram_ecc_errc",
.mask = DRAM_ECC_INT_CE_BIT
},
{
.name = "dram_ecc_errd",
.mask = DRAM_ECC_INT_UE_BIT
},
{
.name = "failed_access",
.mask = FAILED_ACCESS_INT_BIT
},
{
.name = "failed_prog",
.mask = FAILED_PROG_INT_BIT
},
{
.name = "link_err",
.mask = LINK_ERR_INT_BIT
},
{
.name = "temperature_event",
.mask = TEMPERATURE_EVENT_INT_BIT
},
{
.name = "arch_fsm",
.mask = ARCH_FSM_INT_BIT
},
{
.name = "phy_request",
.mask = PHY_REQUEST_INT_BIT
}
};
#define NUMBER_OF_IRQS ARRAY_SIZE(dmc520_irq_configs)
/*
* The EDAC driver private data.
* error_lock is to protect concurrent writes to the mci->error_desc through
* edac_mc_handle_error().
*/
struct dmc520_edac {
void __iomem *reg_base;
spinlock_t error_lock;
u32 mem_width_in_bytes;
int irqs[NUMBER_OF_IRQS];
int masks[NUMBER_OF_IRQS];
};
static int dmc520_mc_idx;
static u32 dmc520_read_reg(struct dmc520_edac *pvt, u32 offset)
{
return readl(pvt->reg_base + offset);
}
static void dmc520_write_reg(struct dmc520_edac *pvt, u32 val, u32 offset)
{
writel(val, pvt->reg_base + offset);
}
static u32 dmc520_calc_dram_ecc_error(u32 value)
{
u32 total = 0;
/* Each rank's error counter takes one byte. */
while (value > 0) {
total += (value & 0xFF);
value >>= 8;
}
return total;
}
static u32 dmc520_get_dram_ecc_error_count(struct dmc520_edac *pvt,
bool is_ce)
{
u32 reg_offset_low, reg_offset_high;
u32 err_low, err_high;
u32 err_count;
reg_offset_low = is_ce ? REG_OFFSET_ECC_ERRC_COUNT_31_00 :
REG_OFFSET_ECC_ERRD_COUNT_31_00;
reg_offset_high = is_ce ? REG_OFFSET_ECC_ERRC_COUNT_63_32 :
REG_OFFSET_ECC_ERRD_COUNT_63_32;
err_low = dmc520_read_reg(pvt, reg_offset_low);
err_high = dmc520_read_reg(pvt, reg_offset_high);
/* Reset error counters */
dmc520_write_reg(pvt, 0, reg_offset_low);
dmc520_write_reg(pvt, 0, reg_offset_high);
err_count = dmc520_calc_dram_ecc_error(err_low) +
dmc520_calc_dram_ecc_error(err_high);
return err_count;
}
static void dmc520_get_dram_ecc_error_info(struct dmc520_edac *pvt,
bool is_ce,
struct ecc_error_info *info)
{
u32 reg_offset_low, reg_offset_high;
u32 reg_val_low, reg_val_high;
bool valid;
reg_offset_low = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_31_00 :
REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_31_00;
reg_offset_high = is_ce ? REG_OFFSET_DRAM_ECC_ERRC_INT_INFO_63_32 :
REG_OFFSET_DRAM_ECC_ERRD_INT_INFO_63_32;
reg_val_low = dmc520_read_reg(pvt, reg_offset_low);
reg_val_high = dmc520_read_reg(pvt, reg_offset_high);
valid = (FIELD_GET(REG_FIELD_ERR_INFO_LOW_VALID, reg_val_low) != 0) &&
(FIELD_GET(REG_FIELD_ERR_INFO_HIGH_VALID, reg_val_high) != 0);
if (valid) {
info->col = FIELD_GET(REG_FIELD_ERR_INFO_LOW_COL, reg_val_low);
info->row = FIELD_GET(REG_FIELD_ERR_INFO_LOW_ROW, reg_val_low);
info->rank = FIELD_GET(REG_FIELD_ERR_INFO_LOW_RANK, reg_val_low);
info->bank = FIELD_GET(REG_FIELD_ERR_INFO_HIGH_BANK, reg_val_high);
} else {
memset(info, 0, sizeof(*info));
}
}
static bool dmc520_is_ecc_enabled(void __iomem *reg_base)
{
u32 reg_val = readl(reg_base + REG_OFFSET_FEATURE_CONFIG);
return FIELD_GET(REG_FIELD_DRAM_ECC_ENABLED, reg_val);
}
static enum scrub_type dmc520_get_scrub_type(struct dmc520_edac *pvt)
{
enum scrub_type type = SCRUB_NONE;
u32 reg_val, scrub_cfg;
reg_val = dmc520_read_reg(pvt, REG_OFFSET_SCRUB_CONTROL0_NOW);
scrub_cfg = FIELD_GET(SCRUB_TRIGGER0_NEXT_MASK, reg_val);
if (scrub_cfg == DMC520_SCRUB_TRIGGER_ERR_DETECT ||
scrub_cfg == DMC520_SCRUB_TRIGGER_IDLE)
type = SCRUB_HW_PROG;
return type;
}
/* Get the memory data bus width, in number of bytes. */
static u32 dmc520_get_memory_width(struct dmc520_edac *pvt)
{
enum dmc520_mem_width mem_width_field;
u32 mem_width_in_bytes = 0;
u32 reg_val;
reg_val = dmc520_read_reg(pvt, REG_OFFSET_FORMAT_CONTROL);
mem_width_field = FIELD_GET(MEMORY_WIDTH_MASK, reg_val);
if (mem_width_field == MEM_WIDTH_X32)
mem_width_in_bytes = 4;
else if (mem_width_field == MEM_WIDTH_X64)
mem_width_in_bytes = 8;
return mem_width_in_bytes;
}
static enum mem_type dmc520_get_mtype(struct dmc520_edac *pvt)
{
enum mem_type mt = MEM_UNKNOWN;
enum dmc520_mem_type type;
u32 reg_val;
reg_val = dmc520_read_reg(pvt, REG_OFFSET_MEMORY_TYPE_NOW);
type = FIELD_GET(REG_FIELD_MEMORY_TYPE, reg_val);
switch (type) {
case MEM_TYPE_DDR3:
mt = MEM_DDR3;
break;
case MEM_TYPE_DDR4:
mt = MEM_DDR4;
break;
}
return mt;
}
static enum dev_type dmc520_get_dtype(struct dmc520_edac *pvt)
{
enum dmc520_dev_width device_width;
enum dev_type dt = DEV_UNKNOWN;
u32 reg_val;
reg_val = dmc520_read_reg(pvt, REG_OFFSET_MEMORY_TYPE_NOW);
device_width = FIELD_GET(REG_FIELD_DEVICE_WIDTH, reg_val);
switch (device_width) {
case DEV_WIDTH_X4:
dt = DEV_X4;
break;
case DEV_WIDTH_X8:
dt = DEV_X8;
break;
case DEV_WIDTH_X16:
dt = DEV_X16;
break;
}
return dt;
}
static u32 dmc520_get_rank_count(void __iomem *reg_base)
{
u32 reg_val, rank_bits;
reg_val = readl(reg_base + REG_OFFSET_ADDRESS_CONTROL_NOW);
rank_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_RANK, reg_val);
return BIT(rank_bits);
}
static u64 dmc520_get_rank_size(struct dmc520_edac *pvt)
{
u32 reg_val, col_bits, row_bits, bank_bits;
reg_val = dmc520_read_reg(pvt, REG_OFFSET_ADDRESS_CONTROL_NOW);
col_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_COL, reg_val) +
DRAM_ADDRESS_CONTROL_MIN_COL_BITS;
row_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_ROW, reg_val) +
DRAM_ADDRESS_CONTROL_MIN_ROW_BITS;
bank_bits = FIELD_GET(REG_FIELD_ADDRESS_CONTROL_BANK, reg_val);
return (u64)pvt->mem_width_in_bytes << (col_bits + row_bits + bank_bits);
}
static void dmc520_handle_dram_ecc_errors(struct mem_ctl_info *mci,
bool is_ce)
{
struct dmc520_edac *pvt = mci->pvt_info;
char message[DMC520_MSG_BUF_SIZE];
struct ecc_error_info info;
u32 cnt;
dmc520_get_dram_ecc_error_info(pvt, is_ce, &info);
cnt = dmc520_get_dram_ecc_error_count(pvt, is_ce);
if (!cnt)
return;
snprintf(message, ARRAY_SIZE(message),
"rank:%d bank:%d row:%d col:%d",
info.rank, info.bank,
info.row, info.col);
spin_lock(&pvt->error_lock);
edac_mc_handle_error((is_ce ? HW_EVENT_ERR_CORRECTED :
HW_EVENT_ERR_UNCORRECTED),
mci, cnt, 0, 0, 0, info.rank, -1, -1,
message, "");
spin_unlock(&pvt->error_lock);
}
static irqreturn_t dmc520_edac_dram_ecc_isr(int irq, struct mem_ctl_info *mci,
bool is_ce)
{
struct dmc520_edac *pvt = mci->pvt_info;
u32 i_mask;
i_mask = is_ce ? DRAM_ECC_INT_CE_BIT : DRAM_ECC_INT_UE_BIT;
dmc520_handle_dram_ecc_errors(mci, is_ce);
dmc520_write_reg(pvt, i_mask, REG_OFFSET_INTERRUPT_CLR);
return IRQ_HANDLED;
}
static irqreturn_t dmc520_edac_dram_all_isr(int irq, struct mem_ctl_info *mci,
u32 irq_mask)
{
struct dmc520_edac *pvt = mci->pvt_info;
irqreturn_t irq_ret = IRQ_NONE;
u32 status;
status = dmc520_read_reg(pvt, REG_OFFSET_INTERRUPT_STATUS);
if ((irq_mask & DRAM_ECC_INT_CE_BIT) &&
(status & DRAM_ECC_INT_CE_BIT))
irq_ret = dmc520_edac_dram_ecc_isr(irq, mci, true);
if ((irq_mask & DRAM_ECC_INT_UE_BIT) &&
(status & DRAM_ECC_INT_UE_BIT))
irq_ret = dmc520_edac_dram_ecc_isr(irq, mci, false);
return irq_ret;
}
static irqreturn_t dmc520_isr(int irq, void *data)
{
struct mem_ctl_info *mci = data;
struct dmc520_edac *pvt = mci->pvt_info;
u32 mask = 0;
int idx;
for (idx = 0; idx < NUMBER_OF_IRQS; idx++) {
if (pvt->irqs[idx] == irq) {
mask = pvt->masks[idx];
break;
}
}
return dmc520_edac_dram_all_isr(irq, mci, mask);
}
static void dmc520_init_csrow(struct mem_ctl_info *mci)
{
struct dmc520_edac *pvt = mci->pvt_info;
struct csrow_info *csi;
struct dimm_info *dimm;
u32 pages_per_rank;
enum dev_type dt;
enum mem_type mt;
int row, ch;
u64 rs;
dt = dmc520_get_dtype(pvt);
mt = dmc520_get_mtype(pvt);
rs = dmc520_get_rank_size(pvt);
pages_per_rank = rs >> PAGE_SHIFT;
for (row = 0; row < mci->nr_csrows; row++) {
csi = mci->csrows[row];
for (ch = 0; ch < csi->nr_channels; ch++) {
dimm = csi->channels[ch]->dimm;
dimm->grain = pvt->mem_width_in_bytes;
dimm->dtype = dt;
dimm->mtype = mt;
dimm->edac_mode = EDAC_FLAG_SECDED;
dimm->nr_pages = pages_per_rank / csi->nr_channels;
}
}
}
static int dmc520_edac_probe(struct platform_device *pdev)
{
bool registered[NUMBER_OF_IRQS] = { false };
int irqs[NUMBER_OF_IRQS] = { -ENXIO };
int masks[NUMBER_OF_IRQS] = { 0 };
struct edac_mc_layer layers[1];
struct dmc520_edac *pvt = NULL;
struct mem_ctl_info *mci;
void __iomem *reg_base;
u32 irq_mask_all = 0;
struct resource *res;
struct device *dev;
int ret, idx, irq;
u32 reg_val;
/* Parse the device node */
dev = &pdev->dev;
for (idx = 0; idx < NUMBER_OF_IRQS; idx++) {
irq = platform_get_irq_byname(pdev, dmc520_irq_configs[idx].name);
irqs[idx] = irq;
masks[idx] = dmc520_irq_configs[idx].mask;
if (irq >= 0) {
irq_mask_all |= dmc520_irq_configs[idx].mask;
edac_dbg(0, "Discovered %s, irq: %d.\n", dmc520_irq_configs[idx].name, irq);
}
}
if (!irq_mask_all) {
edac_printk(KERN_ERR, EDAC_MOD_NAME,
"At least one valid interrupt line is expected.\n");
return -EINVAL;
}
/* Initialize dmc520 edac */
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
reg_base = devm_ioremap_resource(dev, res);
if (IS_ERR(reg_base))
return PTR_ERR(reg_base);
if (!dmc520_is_ecc_enabled(reg_base))
return -ENXIO;
layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
layers[0].size = dmc520_get_rank_count(reg_base);
layers[0].is_virt_csrow = true;
mci = edac_mc_alloc(dmc520_mc_idx++, ARRAY_SIZE(layers), layers, sizeof(*pvt));
if (!mci) {
edac_printk(KERN_ERR, EDAC_MOD_NAME,
"Failed to allocate memory for mc instance\n");
ret = -ENOMEM;
goto err;
}
pvt = mci->pvt_info;
pvt->reg_base = reg_base;
spin_lock_init(&pvt->error_lock);
memcpy(pvt->irqs, irqs, sizeof(irqs));
memcpy(pvt->masks, masks, sizeof(masks));
platform_set_drvdata(pdev, mci);
mci->pdev = dev;
mci->mtype_cap = MEM_FLAG_DDR3 | MEM_FLAG_DDR4;
mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED;
mci->edac_cap = EDAC_FLAG_SECDED;
mci->scrub_cap = SCRUB_FLAG_HW_SRC;
mci->scrub_mode = dmc520_get_scrub_type(pvt);
mci->ctl_name = EDAC_CTL_NAME;
mci->dev_name = dev_name(mci->pdev);
mci->mod_name = EDAC_MOD_NAME;
edac_op_state = EDAC_OPSTATE_INT;
pvt->mem_width_in_bytes = dmc520_get_memory_width(pvt);
dmc520_init_csrow(mci);
/* Clear interrupts, not affecting other unrelated interrupts */
reg_val = dmc520_read_reg(pvt, REG_OFFSET_INTERRUPT_CONTROL);
dmc520_write_reg(pvt, reg_val & (~irq_mask_all),
REG_OFFSET_INTERRUPT_CONTROL);
dmc520_write_reg(pvt, irq_mask_all, REG_OFFSET_INTERRUPT_CLR);
for (idx = 0; idx < NUMBER_OF_IRQS; idx++) {
irq = irqs[idx];
if (irq >= 0) {
ret = devm_request_irq(&pdev->dev, irq,
dmc520_isr, IRQF_SHARED,
dev_name(&pdev->dev), mci);
if (ret < 0) {
edac_printk(KERN_ERR, EDAC_MC,
"Failed to request irq %d\n", irq);
goto err;
}
registered[idx] = true;
}
}
/* Reset DRAM CE/UE counters */
if (irq_mask_all & DRAM_ECC_INT_CE_BIT)
dmc520_get_dram_ecc_error_count(pvt, true);
if (irq_mask_all & DRAM_ECC_INT_UE_BIT)
dmc520_get_dram_ecc_error_count(pvt, false);
ret = edac_mc_add_mc(mci);
if (ret) {
edac_printk(KERN_ERR, EDAC_MOD_NAME,
"Failed to register with EDAC core\n");
goto err;
}
/* Enable interrupts, not affecting other unrelated interrupts */
dmc520_write_reg(pvt, reg_val | irq_mask_all,
REG_OFFSET_INTERRUPT_CONTROL);
return 0;
err:
for (idx = 0; idx < NUMBER_OF_IRQS; idx++) {
if (registered[idx])
devm_free_irq(&pdev->dev, pvt->irqs[idx], mci);
}
if (mci)
edac_mc_free(mci);
return ret;
}
static int dmc520_edac_remove(struct platform_device *pdev)
{
u32 reg_val, idx, irq_mask_all = 0;
struct mem_ctl_info *mci;
struct dmc520_edac *pvt;
mci = platform_get_drvdata(pdev);
pvt = mci->pvt_info;
/* Disable interrupts */
reg_val = dmc520_read_reg(pvt, REG_OFFSET_INTERRUPT_CONTROL);
dmc520_write_reg(pvt, reg_val & (~irq_mask_all),
REG_OFFSET_INTERRUPT_CONTROL);
/* free irq's */
for (idx = 0; idx < NUMBER_OF_IRQS; idx++) {
if (pvt->irqs[idx] >= 0) {
irq_mask_all |= pvt->masks[idx];
devm_free_irq(&pdev->dev, pvt->irqs[idx], mci);
}
}
edac_mc_del_mc(&pdev->dev);
edac_mc_free(mci);
return 0;
}
static const struct of_device_id dmc520_edac_driver_id[] = {
{ .compatible = "arm,dmc-520", },
{ /* end of table */ }
};
MODULE_DEVICE_TABLE(of, dmc520_edac_driver_id);
static struct platform_driver dmc520_edac_driver = {
.driver = {
.name = "dmc520",
.of_match_table = dmc520_edac_driver_id,
},
.probe = dmc520_edac_probe,
.remove = dmc520_edac_remove
};
module_platform_driver(dmc520_edac_driver);
MODULE_AUTHOR("Rui Zhao <ruizhao@microsoft.com>");
MODULE_AUTHOR("Lei Wang <lewan@microsoft.com>");
MODULE_AUTHOR("Shiping Ji <shji@microsoft.com>");
MODULE_DESCRIPTION("DMC-520 ECC driver");
MODULE_LICENSE("GPL v2");