mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-18 05:36:11 +07:00
drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place
ras recovery_init should be called after ttm init, and bad page reservation should be done before gpu reset, since i2c may be unstable during gpu reset. Add cleanup for recovery_init and recovery_fini. v2: add more comments and prints; remove cancel_work_sync in recovery_init. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Guchun Chen <guchun.chen@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
87d2b92f1e
commit
1a6fc071e1
@ -3630,11 +3630,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
list_for_each_entry(tmp_adev, device_list_handle,
|
|
||||||
gmc.xgmi.head) {
|
|
||||||
amdgpu_ras_reserve_bad_pages(tmp_adev);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1493,16 +1493,17 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||||
{
|
{
|
||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
struct ras_err_handler_data **data = &con->eh_data;
|
struct ras_err_handler_data **data = &con->eh_data;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
*data = kmalloc(sizeof(**data),
|
*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
|
||||||
GFP_KERNEL|__GFP_ZERO);
|
if (!*data) {
|
||||||
if (!*data)
|
ret = -ENOMEM;
|
||||||
return -ENOMEM;
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
mutex_init(&con->recovery_lock);
|
mutex_init(&con->recovery_lock);
|
||||||
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
|
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
|
||||||
@ -1511,18 +1512,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
|||||||
|
|
||||||
ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
|
ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
goto free;
|
||||||
|
|
||||||
if (adev->psp.ras.ras->eeprom_control.num_recs) {
|
if (adev->psp.ras.ras->eeprom_control.num_recs) {
|
||||||
ret = amdgpu_ras_load_bad_pages(adev);
|
ret = amdgpu_ras_load_bad_pages(adev);
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
goto free;
|
||||||
ret = amdgpu_ras_reserve_bad_pages(adev);
|
ret = amdgpu_ras_reserve_bad_pages(adev);
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
goto release;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
release:
|
||||||
|
amdgpu_ras_release_bad_pages(adev);
|
||||||
|
free:
|
||||||
|
con->eh_data = NULL;
|
||||||
|
kfree((*data)->bps);
|
||||||
|
kfree((*data)->bps_bo);
|
||||||
|
kfree(*data);
|
||||||
|
out:
|
||||||
|
DRM_WARN("Failed to initialize ras recovery!\n");
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
|
static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
|
||||||
@ -1530,12 +1543,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
|
|||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
struct ras_err_handler_data *data = con->eh_data;
|
struct ras_err_handler_data *data = con->eh_data;
|
||||||
|
|
||||||
|
/* recovery_init failed to init it, fini is useless */
|
||||||
|
if (!data)
|
||||||
|
return 0;
|
||||||
|
|
||||||
cancel_work_sync(&con->recovery_work);
|
cancel_work_sync(&con->recovery_work);
|
||||||
amdgpu_ras_release_bad_pages(adev);
|
amdgpu_ras_release_bad_pages(adev);
|
||||||
|
|
||||||
mutex_lock(&con->recovery_lock);
|
mutex_lock(&con->recovery_lock);
|
||||||
con->eh_data = NULL;
|
con->eh_data = NULL;
|
||||||
kfree(data->bps);
|
kfree(data->bps);
|
||||||
|
kfree(data->bps_bo);
|
||||||
kfree(data);
|
kfree(data);
|
||||||
mutex_unlock(&con->recovery_lock);
|
mutex_unlock(&con->recovery_lock);
|
||||||
|
|
||||||
@ -1627,9 +1645,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (amdgpu_ras_recovery_init(adev))
|
|
||||||
goto recovery_out;
|
|
||||||
|
|
||||||
amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
|
amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
|
||||||
|
|
||||||
if (amdgpu_ras_fs_init(adev))
|
if (amdgpu_ras_fs_init(adev))
|
||||||
@ -1644,8 +1659,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
|
|||||||
con->hw_supported, con->supported);
|
con->hw_supported, con->supported);
|
||||||
return 0;
|
return 0;
|
||||||
fs_out:
|
fs_out:
|
||||||
amdgpu_ras_recovery_fini(adev);
|
|
||||||
recovery_out:
|
|
||||||
amdgpu_ras_set_context(adev, NULL);
|
amdgpu_ras_set_context(adev, NULL);
|
||||||
kfree(con);
|
kfree(con);
|
||||||
|
|
||||||
|
@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
|
|||||||
return ras && (ras->supported & (1 << block));
|
return ras && (ras->supported & (1 << block));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
|
||||||
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
|
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
|
||||||
unsigned int block);
|
unsigned int block);
|
||||||
|
|
||||||
@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
|
|||||||
{
|
{
|
||||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||||
|
|
||||||
|
/* save bad page to eeprom before gpu reset,
|
||||||
|
* i2c may be unstable in gpu reset
|
||||||
|
*/
|
||||||
|
amdgpu_ras_reserve_bad_pages(adev);
|
||||||
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
|
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
|
||||||
schedule_work(&ras->recovery_work);
|
schedule_work(&ras->recovery_work);
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -54,6 +54,7 @@
|
|||||||
#include "amdgpu_trace.h"
|
#include "amdgpu_trace.h"
|
||||||
#include "amdgpu_amdkfd.h"
|
#include "amdgpu_amdkfd.h"
|
||||||
#include "amdgpu_sdma.h"
|
#include "amdgpu_sdma.h"
|
||||||
|
#include "amdgpu_ras.h"
|
||||||
#include "bif/bif_4_1_d.h"
|
#include "bif/bif_4_1_d.h"
|
||||||
|
|
||||||
static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
|
static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
|
||||||
@ -1777,6 +1778,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
|
|||||||
adev->gmc.visible_vram_size);
|
adev->gmc.visible_vram_size);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* retired pages will be loaded from eeprom and reserved here,
|
||||||
|
* it should be called after ttm init since new bo may be created,
|
||||||
|
* recovery_init may fail, but it can free all resources allocated by
|
||||||
|
* itself and its failure should not stop amdgpu init process.
|
||||||
|
*
|
||||||
|
* Note: theoretically, this should be called before all vram allocations
|
||||||
|
* to protect retired page from abusing
|
||||||
|
*/
|
||||||
|
amdgpu_ras_recovery_init(adev);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
*The reserved vram for firmware must be pinned to the specified
|
*The reserved vram for firmware must be pinned to the specified
|
||||||
*place on the VRAM, so reserve it early.
|
*place on the VRAM, so reserve it early.
|
||||||
|
Loading…
Reference in New Issue
Block a user