From 4cf781c24c3bc8cc50f8013143aa20b26e9217e8 Mon Sep 17 00:00:00 2001 From: John Clements Date: Wed, 11 Dec 2019 10:18:55 +0800 Subject: [PATCH] drm/amdgpu: Added RAS UMC error query support for Arcturus Updated UMC 6.1 function set to support UMC 6.1.1 and 6.1.2 devices Reviewed-by: Alex Deucher Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 9 +++- drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 78 ++++++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/umc_v6_1.h | 3 +- 3 files changed, 74 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index a208b2883c03..53dfc82ca171 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -708,11 +708,18 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev) adev->umc.funcs = &umc_v6_0_funcs; break; case CHIP_VEGA20: + adev->umc.max_ras_err_cnt_per_query = UMC_V6_1_TOTAL_CHANNEL_NUM; + adev->umc.channel_inst_num = UMC_V6_1_CHANNEL_INSTANCE_NUM; + adev->umc.umc_inst_num = UMC_V6_1_UMC_INSTANCE_NUM; + adev->umc.channel_offs = UMC_V6_1_PER_CHANNEL_OFFSET_VG20; + adev->umc.channel_idx_tbl = &umc_v6_1_channel_idx_tbl[0][0]; + adev->umc.funcs = &umc_v6_1_funcs; + break; case CHIP_ARCTURUS: adev->umc.max_ras_err_cnt_per_query = UMC_V6_1_TOTAL_CHANNEL_NUM; adev->umc.channel_inst_num = UMC_V6_1_CHANNEL_INSTANCE_NUM; adev->umc.umc_inst_num = UMC_V6_1_UMC_INSTANCE_NUM; - adev->umc.channel_offs = UMC_V6_1_PER_CHANNEL_OFFSET; + adev->umc.channel_offs = UMC_V6_1_PER_CHANNEL_OFFSET_ARCT; adev->umc.channel_idx_tbl = &umc_v6_1_channel_idx_tbl[0][0]; adev->umc.funcs = &umc_v6_1_funcs; break; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c index 47c4b96b14d1..515eb50cd0f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c @@ -31,6 +31,14 @@ #define smnMCA_UMC0_MCUMC_ADDRT0 0x50f10 +/* UMC 6_1_2 register offsets */ +#define mmUMCCH0_0_EccErrCntSel_ARCT 0x0360 +#define mmUMCCH0_0_EccErrCntSel_ARCT_BASE_IDX 1 +#define mmUMCCH0_0_EccErrCnt_ARCT 0x0361 +#define mmUMCCH0_0_EccErrCnt_ARCT_BASE_IDX 1 +#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT 0x03c2 +#define mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT_BASE_IDX 1 + /* * (addr / 256) * 8192, the higher 26 bits in ErrorAddr * is the index of 8KB block @@ -95,12 +103,25 @@ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, uint64_t mc_umc_status; uint32_t mc_umc_status_addr; - ecc_err_cnt_sel_addr = - SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); - ecc_err_cnt_addr = - SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt); - mc_umc_status_addr = - SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); + if (adev->asic_type == CHIP_ARCTURUS) { + /* UMC 6_1_2 registers */ + + ecc_err_cnt_sel_addr = + SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); + ecc_err_cnt_addr = + SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT); + mc_umc_status_addr = + SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); + } else { + /* UMC 6_1_1 registers */ + + ecc_err_cnt_sel_addr = + SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); + ecc_err_cnt_addr = + SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt); + mc_umc_status_addr = + SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); + } /* select the lower chip and check the error count */ ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset); @@ -141,8 +162,17 @@ static void umc_v6_1_querry_uncorrectable_error_count(struct amdgpu_device *adev uint64_t mc_umc_status; uint32_t mc_umc_status_addr; - mc_umc_status_addr = - SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); + if (adev->asic_type == CHIP_ARCTURUS) { + /* UMC 6_1_2 registers */ + + mc_umc_status_addr = + SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); + } else { + /* UMC 6_1_1 registers */ + + mc_umc_status_addr = + SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); + } /* check the MCUMC_STATUS */ mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset); @@ -179,8 +209,17 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, uint64_t mc_umc_status, err_addr, retired_page; struct eeprom_table_record *err_rec; - mc_umc_status_addr = - SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); + if (adev->asic_type == CHIP_ARCTURUS) { + /* UMC 6_1_2 registers */ + + mc_umc_status_addr = + SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT); + } else { + /* UMC 6_1_1 registers */ + + mc_umc_status_addr = + SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); + } /* skip error address process if -ENOMEM */ if (!err_data->err_addr) { @@ -241,10 +280,21 @@ static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev, uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; uint32_t ecc_err_cnt_addr; - ecc_err_cnt_sel_addr = - SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); - ecc_err_cnt_addr = - SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt); + if (adev->asic_type == CHIP_ARCTURUS) { + /* UMC 6_1_2 registers */ + + ecc_err_cnt_sel_addr = + SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT); + ecc_err_cnt_addr = + SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT); + } else { + /* UMC 6_1_1 registers */ + + ecc_err_cnt_sel_addr = + SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel); + ecc_err_cnt_addr = + SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt); + } /* select the lower chip and check the error count */ ecc_err_cnt_sel = RREG32(ecc_err_cnt_sel_addr + umc_reg_offset); diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.h b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.h index dab9cbd292c5..0ce1d323cfdd 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.h @@ -35,7 +35,8 @@ /* total channel instances in one umc block */ #define UMC_V6_1_TOTAL_CHANNEL_NUM (UMC_V6_1_CHANNEL_INSTANCE_NUM * UMC_V6_1_UMC_INSTANCE_NUM) /* UMC regiser per channel offset */ -#define UMC_V6_1_PER_CHANNEL_OFFSET 0x800 +#define UMC_V6_1_PER_CHANNEL_OFFSET_VG20 0x800 +#define UMC_V6_1_PER_CHANNEL_OFFSET_ARCT 0x400 /* EccErrCnt max value */ #define UMC_V6_1_CE_CNT_MAX 0xffff