mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-21 20:25:15 +07:00
drm/amdgpu: add helper funcs to detect PCS error
Starting with Vega20, the hardware supports run-time detection and reporting of XGMI/WAFL PCS RAS errors. Add helper functions that walk through every type of RAS error and report any that are detected. Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Guchun Chen <guchun.chen@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
15a1fbdcfb
commit
18f36157f2
@ -26,7 +26,12 @@
|
||||
#include "amdgpu_xgmi.h"
|
||||
#include "amdgpu_smu.h"
|
||||
#include "amdgpu_ras.h"
|
||||
#include "soc15.h"
|
||||
#include "df/df_3_6_offset.h"
|
||||
#include "xgmi/xgmi_4_0_0_smn.h"
|
||||
#include "xgmi/xgmi_4_0_0_sh_mask.h"
|
||||
#include "wafl/wafl2_4_0_0_smn.h"
|
||||
#include "wafl/wafl2_4_0_0_sh_mask.h"
|
||||
|
||||
static DEFINE_MUTEX(xgmi_mutex);
|
||||
|
||||
@ -36,6 +41,94 @@ static DEFINE_MUTEX(xgmi_mutex);
|
||||
static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
|
||||
static unsigned hive_count = 0;
|
||||
|
||||
static const int xgmi_pcs_err_status_reg_vg20[] = {
|
||||
smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
|
||||
smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
|
||||
};
|
||||
|
||||
static const int wafl_pcs_err_status_reg_vg20[] = {
|
||||
smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
|
||||
smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
|
||||
};
|
||||
|
||||
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
|
||||
{"XGMI PCS DataLossErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
|
||||
{"XGMI PCS TrainingErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
|
||||
{"XGMI PCS CRCErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
|
||||
{"XGMI PCS BERExceededErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
|
||||
{"XGMI PCS TxMetaDataErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
|
||||
{"XGMI PCS ReplayBufParityErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
|
||||
{"XGMI PCS DataParityErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
|
||||
{"XGMI PCS ReplayFifoOverflowErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
|
||||
{"XGMI PCS ReplayFifoUnderflowErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
|
||||
{"XGMI PCS ElasticFifoOverflowErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
|
||||
{"XGMI PCS DeskewErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
|
||||
{"XGMI PCS DataStartupLimitErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
|
||||
{"XGMI PCS FCInitTimeoutErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
|
||||
{"XGMI PCS RecoveryTimeoutErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
|
||||
{"XGMI PCS ReadySerialTimeoutErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
|
||||
{"XGMI PCS ReadySerialAttemptErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
|
||||
{"XGMI PCS RecoveryAttemptErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
|
||||
{"XGMI PCS RecoveryRelockAttemptErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
|
||||
};
|
||||
|
||||
static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
|
||||
{"WAFL PCS DataLossErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
|
||||
{"WAFL PCS TrainingErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
|
||||
{"WAFL PCS CRCErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
|
||||
{"WAFL PCS BERExceededErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
|
||||
{"WAFL PCS TxMetaDataErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
|
||||
{"WAFL PCS ReplayBufParityErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
|
||||
{"WAFL PCS DataParityErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
|
||||
{"WAFL PCS ReplayFifoOverflowErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
|
||||
{"WAFL PCS ReplayFifoUnderflowErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
|
||||
{"WAFL PCS ElasticFifoOverflowErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
|
||||
{"WAFL PCS DeskewErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
|
||||
{"WAFL PCS DataStartupLimitErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
|
||||
{"WAFL PCS FCInitTimeoutErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
|
||||
{"WAFL PCS RecoveryTimeoutErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
|
||||
{"WAFL PCS ReadySerialTimeoutErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
|
||||
{"WAFL PCS ReadySerialAttemptErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
|
||||
{"WAFL PCS RecoveryAttemptErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
|
||||
{"WAFL PCS RecoveryRelockAttemptErr",
|
||||
SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
|
||||
};
|
||||
|
||||
void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
|
||||
{
|
||||
return &hive->device_list;
|
||||
@ -560,3 +653,83 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
|
||||
|
||||
return addr + dram_base_addr;
|
||||
}
|
||||
|
||||
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
|
||||
uint32_t value,
|
||||
uint32_t *ue_count,
|
||||
uint32_t *ce_count,
|
||||
bool is_xgmi_pcs)
|
||||
{
|
||||
int i;
|
||||
int ue_cnt;
|
||||
|
||||
if (is_xgmi_pcs) {
|
||||
/* query xgmi pcs error status,
|
||||
* only ue is supported */
|
||||
for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) {
|
||||
ue_cnt = (value &
|
||||
xgmi_pcs_ras_fields[i].pcs_err_mask) >>
|
||||
xgmi_pcs_ras_fields[i].pcs_err_shift;
|
||||
if (ue_cnt) {
|
||||
dev_info(adev->dev, "%s detected\n",
|
||||
xgmi_pcs_ras_fields[i].err_name);
|
||||
*ue_count += ue_cnt;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* query wafl pcs error status,
|
||||
* only ue is supported */
|
||||
for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
|
||||
ue_cnt = (value &
|
||||
wafl_pcs_ras_fields[i].pcs_err_mask) >>
|
||||
wafl_pcs_ras_fields[i].pcs_err_shift;
|
||||
if (ue_cnt) {
|
||||
dev_info(adev->dev, "%s detected\n",
|
||||
wafl_pcs_ras_fields[i].err_name);
|
||||
*ue_count += ue_cnt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
|
||||
void *ras_error_status)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
int i;
|
||||
uint32_t data;
|
||||
uint32_t ue_cnt = 0, ce_cnt = 0;
|
||||
|
||||
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
|
||||
return -EINVAL;
|
||||
|
||||
err_data->ue_count = 0;
|
||||
err_data->ce_count = 0;
|
||||
|
||||
switch (adev->asic_type) {
|
||||
case CHIP_VEGA20:
|
||||
default:
|
||||
/* check xgmi pcs error */
|
||||
for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
|
||||
data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
|
||||
if (data)
|
||||
amdgpu_xgmi_query_pcs_error_status(adev,
|
||||
data, &ue_cnt, &ce_cnt, true);
|
||||
}
|
||||
/* check wafl pcs error */
|
||||
for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
|
||||
data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
|
||||
if (data)
|
||||
amdgpu_xgmi_query_pcs_error_status(adev,
|
||||
data, &ue_cnt, &ce_cnt, false);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
err_data->ue_count += ue_cnt;
|
||||
err_data->ce_count += ce_cnt;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -37,6 +37,12 @@ struct amdgpu_hive_info {
|
||||
struct task_barrier tb;
|
||||
};
|
||||
|
||||
/*
 * Describes one error field of a PCS error-status register.
 */
struct amdgpu_pcs_ras_field {
	/* message printed (followed by " detected") when the field is set */
	const char *err_name;
	/* mask ANDed with the register value to isolate the field */
	uint32_t pcs_err_mask;
	/* right shift applied after masking to obtain the field value */
	uint32_t pcs_err_shift;
};
|
||||
|
||||
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
|
||||
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
|
||||
int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
|
||||
@ -48,6 +54,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev);
|
||||
void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev);
|
||||
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
|
||||
uint64_t addr);
|
||||
int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
|
||||
void *ras_error_status);
|
||||
|
||||
static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
|
||||
struct amdgpu_device *bo_adev)
|
||||
|
Loading…
Reference in New Issue
Block a user