mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-24 05:29:39 +07:00
drm/amdgpu: schedule ras recovery when reaching bad page threshold(v2)
Once the bad page saved to eeprom reaches the configured threshold, ras recovery will be issued to notify user. v2: Fix spelling typo. Signed-off-by: Guchun Chen <guchun.chen@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
35cd2cdadb
commit
9c06f91ff2
@ -394,8 +394,10 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
||||
int i, ret = 0;
|
||||
struct i2c_msg *msgs, *msg;
|
||||
unsigned char *buffs, *buff;
|
||||
bool sched_ras_recovery = false;
|
||||
struct eeprom_table_record *record;
|
||||
struct amdgpu_device *adev = to_amdgpu_device(control);
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
|
||||
if (adev->asic_type != CHIP_VEGA20 && adev->asic_type != CHIP_ARCTURUS)
|
||||
return 0;
|
||||
@ -413,11 +415,30 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
||||
goto free_buff;
|
||||
}
|
||||
|
||||
/*
|
||||
* If saved bad pages number exceeds the bad page threshold for
|
||||
* the whole VRAM, update table header to mark the BAD GPU tag
|
||||
* and schedule one ras recovery after eeprom write is done,
|
||||
* this can avoid the missing for latest records.
|
||||
*
|
||||
* This new header will be picked up and checked in the bootup
|
||||
* by ras recovery, which may break bootup process to notify
|
||||
* user this GPU is in bad state and to retire such GPU for
|
||||
* further check.
|
||||
*/
|
||||
if (write && (amdgpu_bad_page_threshold != 0) &&
|
||||
((control->num_recs + num) >= ras->bad_page_cnt_threshold)) {
|
||||
dev_warn(adev->dev,
|
||||
"Saved bad pages(%d) reaches threshold value(%d).\n",
|
||||
control->num_recs + num, ras->bad_page_cnt_threshold);
|
||||
control->tbl_hdr.header = EEPROM_TABLE_HDR_BAD;
|
||||
sched_ras_recovery = true;
|
||||
}
|
||||
|
||||
/* In case of overflow just start from beginning to not lose newest records */
|
||||
if (write && (control->next_addr + EEPROM_TABLE_RECORD_SIZE * num > EEPROM_SIZE_BYTES))
|
||||
control->next_addr = EEPROM_RECORD_START;
|
||||
|
||||
|
||||
/*
|
||||
* TODO Currently makes EEPROM writes for each record, this creates
|
||||
* internal fragmentation. Optimized the code to do full page write of
|
||||
@ -493,6 +514,20 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
||||
__update_tbl_checksum(control, records, num, old_hdr_byte_sum);
|
||||
|
||||
__update_table_header(control, buffs);
|
||||
|
||||
if (sched_ras_recovery) {
|
||||
/*
|
||||
* Before scheduling ras recovery, assert the related
|
||||
* flag first, which shall bypass common bad page
|
||||
* reservation execution in amdgpu_ras_reset_gpu.
|
||||
*/
|
||||
amdgpu_ras_get_context(adev)->flags |=
|
||||
AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV;
|
||||
|
||||
dev_warn(adev->dev, "Conduct ras recovery due to bad "
|
||||
"page threshold reached.\n");
|
||||
amdgpu_ras_reset_gpu(adev);
|
||||
}
|
||||
} else if (!__validate_tbl_checksum(control, records, num)) {
|
||||
DRM_WARN("EEPROM Table checksum mismatch!");
|
||||
/* TODO Uncomment when EEPROM read/write is relliable */
|
||||
|
Loading…
Reference in New Issue
Block a user