mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-19 07:16:20 +07:00
drm/amdgpu:implement SRIOV gpu_reset (v2)
implement SRIOV gpu_reset for future use. it wil be called from: 1) job timeout 2) privl access or instruction error interrupt 3) hypervisor detect VF hang v2: agd: rebase on upstream Signed-off-by: Monk Liu <Monk.Liu@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
596c67d076
commit
a90ad3c2af
@ -1607,6 +1607,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int amdgpu_sriov_resume_early(struct amdgpu_device *adev)
|
||||
{
|
||||
int i, r;
|
||||
|
||||
for (i = 0; i < adev->num_ip_blocks; i++) {
|
||||
if (!adev->ip_blocks[i].status.valid)
|
||||
continue;
|
||||
|
||||
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
|
||||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
|
||||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
|
||||
r = adev->ip_blocks[i].version->funcs->resume(adev);
|
||||
|
||||
if (r) {
|
||||
DRM_ERROR("resume of IP block <%s> failed %d\n",
|
||||
adev->ip_blocks[i].version->funcs->name, r);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int amdgpu_sriov_resume_late(struct amdgpu_device *adev)
|
||||
{
|
||||
int i, r;
|
||||
|
||||
for (i = 0; i < adev->num_ip_blocks; i++) {
|
||||
if (!adev->ip_blocks[i].status.valid)
|
||||
continue;
|
||||
|
||||
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
|
||||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
|
||||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH )
|
||||
continue;
|
||||
|
||||
r = adev->ip_blocks[i].version->funcs->resume(adev);
|
||||
if (r) {
|
||||
DRM_ERROR("resume of IP block <%s> failed %d\n",
|
||||
adev->ip_blocks[i].version->funcs->name, r);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int amdgpu_resume(struct amdgpu_device *adev)
|
||||
{
|
||||
int i, r;
|
||||
@ -2285,6 +2332,113 @@ static int amdgpu_recover_vram_from_shadow(struct amdgpu_device *adev,
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_sriov_gpu_reset - reset the asic
|
||||
*
|
||||
* @adev: amdgpu device pointer
|
||||
* @voluntary: if this reset is requested by guest.
|
||||
* (true means by guest and false means by HYPERVISOR )
|
||||
*
|
||||
* Attempt the reset the GPU if it has hung (all asics).
|
||||
* for SRIOV case.
|
||||
* Returns 0 for success or an error on failure.
|
||||
*/
|
||||
int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
|
||||
{
|
||||
int i, r = 0;
|
||||
int resched;
|
||||
struct amdgpu_bo *bo, *tmp;
|
||||
struct amdgpu_ring *ring;
|
||||
struct dma_fence *fence = NULL, *next = NULL;
|
||||
|
||||
atomic_inc(&adev->gpu_reset_counter);
|
||||
|
||||
/* block TTM */
|
||||
resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
|
||||
|
||||
/* block scheduler */
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
ring = adev->rings[i];
|
||||
|
||||
if (!ring || !ring->sched.thread)
|
||||
continue;
|
||||
|
||||
kthread_park(ring->sched.thread);
|
||||
amd_sched_hw_job_reset(&ring->sched);
|
||||
}
|
||||
|
||||
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
|
||||
amdgpu_fence_driver_force_completion(adev);
|
||||
|
||||
/* request to take full control of GPU before re-initialization */
|
||||
if (voluntary)
|
||||
amdgpu_virt_reset_gpu(adev);
|
||||
else
|
||||
amdgpu_virt_request_full_gpu(adev, true);
|
||||
|
||||
|
||||
/* Resume IP prior to SMC */
|
||||
amdgpu_sriov_resume_early(adev);
|
||||
|
||||
/* we need recover gart prior to run SMC/CP/SDMA resume */
|
||||
amdgpu_ttm_recover_gart(adev);
|
||||
|
||||
/* now we are okay to resume SMC/CP/SDMA */
|
||||
amdgpu_sriov_resume_late(adev);
|
||||
|
||||
amdgpu_irq_gpu_reset_resume_helper(adev);
|
||||
|
||||
if (amdgpu_ib_ring_tests(adev))
|
||||
dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
|
||||
|
||||
/* release full control of GPU after ib test */
|
||||
amdgpu_virt_release_full_gpu(adev, true);
|
||||
|
||||
DRM_INFO("recover vram bo from shadow\n");
|
||||
|
||||
ring = adev->mman.buffer_funcs_ring;
|
||||
mutex_lock(&adev->shadow_list_lock);
|
||||
list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
|
||||
amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
|
||||
if (fence) {
|
||||
r = dma_fence_wait(fence, false);
|
||||
if (r) {
|
||||
WARN(r, "recovery from shadow isn't completed\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
dma_fence_put(fence);
|
||||
fence = next;
|
||||
}
|
||||
mutex_unlock(&adev->shadow_list_lock);
|
||||
|
||||
if (fence) {
|
||||
r = dma_fence_wait(fence, false);
|
||||
if (r)
|
||||
WARN(r, "recovery from shadow isn't completed\n");
|
||||
}
|
||||
dma_fence_put(fence);
|
||||
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
struct amdgpu_ring *ring = adev->rings[i];
|
||||
if (!ring || !ring->sched.thread)
|
||||
continue;
|
||||
|
||||
amd_sched_job_recovery(&ring->sched);
|
||||
kthread_unpark(ring->sched.thread);
|
||||
}
|
||||
|
||||
drm_helper_resume_force_mode(adev->ddev);
|
||||
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
|
||||
if (r) {
|
||||
/* bad news, how to tell it to userspace ? */
|
||||
dev_info(adev->dev, "GPU reset failed\n");
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_gpu_reset - reset the asic
|
||||
*
|
||||
@ -2300,7 +2454,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
|
||||
bool need_full_reset;
|
||||
|
||||
if (amdgpu_sriov_vf(adev))
|
||||
return 0;
|
||||
return amdgpu_sriov_gpu_reset(adev, true);
|
||||
|
||||
if (!amdgpu_check_soft_reset(adev)) {
|
||||
DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
|
||||
|
@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
|
||||
int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
|
||||
int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
|
||||
int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
|
||||
int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user