mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-24 14:19:51 +07:00
drm/amdgpu: Fix repeatly flr issue
Only for no job running test case need to do recover in flr notification. For having job in mirror list, then let guest driver to hit job timeout, and then do recover. Signed-off-by: jqdeng <Emily.Deng@amd.com> Acked-by: Nirmoy Das <nirmoy.das@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
5ce99853a6
commit
9a1cddd637
@ -1133,6 +1133,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
|
||||
#define amdgpu_inc_vram_lost(adev) atomic_inc(&((adev)->vram_lost_counter));
|
||||
|
||||
/* Common functions */
|
||||
bool amdgpu_device_has_job_running(struct amdgpu_device *adev);
|
||||
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
|
||||
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
struct amdgpu_job* job);
|
||||
|
@ -3922,6 +3922,34 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_has_job_running - check if there is any job in mirror list
|
||||
*
|
||||
* @adev: amdgpu device pointer
|
||||
*
|
||||
* check if there is any job in mirror list
|
||||
*/
|
||||
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
|
||||
{
|
||||
int i;
|
||||
struct drm_sched_job *job;
|
||||
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
struct amdgpu_ring *ring = adev->rings[i];
|
||||
|
||||
if (!ring || !ring->sched.thread)
|
||||
continue;
|
||||
|
||||
spin_lock(&ring->sched.job_list_lock);
|
||||
job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
|
||||
struct drm_sched_job, node);
|
||||
spin_unlock(&ring->sched.job_list_lock);
|
||||
if (job)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_should_recover_gpu - check if we should try GPU recovery
|
||||
*
|
||||
|
@ -268,7 +268,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
|
||||
|
||||
/* Trigger recovery for world switch failure if no TDR */
|
||||
if (amdgpu_device_should_recover_gpu(adev)
|
||||
&& adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)
|
||||
&& (amdgpu_device_has_job_running(adev) || adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT))
|
||||
amdgpu_device_gpu_recover(adev, NULL);
|
||||
}
|
||||
|
||||
|
@ -289,7 +289,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
|
||||
|
||||
/* Trigger recovery for world switch failure if no TDR */
|
||||
if (amdgpu_device_should_recover_gpu(adev)
|
||||
&& (adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
|
||||
&& (amdgpu_device_has_job_running(adev) ||
|
||||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
|
||||
adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
|
||||
adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
|
||||
adev->video_timeout == MAX_SCHEDULE_TIMEOUT))
|
||||
|
Loading…
Reference in New Issue
Block a user