mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-09 16:56:44 +07:00
habanalabs: soft-reset device if context-switch fails
This patch fixes a bug in the driver: if the TPC or MME remains non-IDLE even after all the command submissions are done (due to a user bug or a malicious user), then future command submissions will fail in the context-switch stage and the driver will remain in a "stuck" mode. The fix is to soft-reset the device if the context-switch fails, because the device should be IDLE during a context-switch. If it is not IDLE, then something is wrong and we should reset the compute engines. Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
parent
efaa281219
commit
af5f7eea45
@ -622,13 +622,15 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
|
||||
"Failed to switch to context %d, rejecting CS! %d\n",
|
||||
ctx->asid, rc);
|
||||
/*
|
||||
* If we timedout, we need to soft-reset because
|
||||
* QMAN is probably stuck. However, we can't
|
||||
* call to reset here directly because of
|
||||
* deadlock, so need to do it at the very end
|
||||
* of this function
|
||||
* If we timedout, or if the device is not IDLE
|
||||
* while we want to do context-switch (-EBUSY),
|
||||
* we need to soft-reset because QMAN is
|
||||
* probably stuck. However, we can't call to
|
||||
* reset here directly because of deadlock, so
|
||||
* need to do it at the very end of this
|
||||
* function
|
||||
*/
|
||||
if (rc == -ETIMEDOUT)
|
||||
if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
|
||||
need_soft_reset = true;
|
||||
mutex_unlock(&hpriv->restore_phase_mutex);
|
||||
goto out;
|
||||
@ -706,7 +708,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
|
||||
args->out.seq = cs_seq;
|
||||
}
|
||||
|
||||
if ((rc == -ETIMEDOUT) && (need_soft_reset))
|
||||
if (((rc == -ETIMEDOUT) || (rc == -EBUSY)) && (need_soft_reset))
|
||||
hl_device_reset(hdev, false, false);
|
||||
|
||||
return rc;
|
||||
|
@ -3138,7 +3138,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
|
||||
if (!hdev->asic_funcs->is_device_idle(hdev)) {
|
||||
dev_err_ratelimited(hdev->dev,
|
||||
"Can't send KMD job on QMAN0 if device is not idle\n");
|
||||
return -EFAULT;
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL,
|
||||
|
Loading…
Reference in New Issue
Block a user