mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-17 11:26:11 +07:00
7c6e68c777
Problem: Under certain conditions, when some IP blocks take a RAS error, we can get into a situation where a GPU reset is not possible due to issues in RAS in SMU/PSP. Temporary fix until a proper solution in PSP/SMU is ready: When an uncorrectable error happens, the DF will unconditionally broadcast error event packets to all its clients/slaves upon receiving a fatal error event and freeze all its outbound queues, and the err_event_athub interrupt will be triggered. In such a case we use this interrupt to issue a GPU reset. The GPU reset code is modified for this case to avoid a HW reset: it only stops the schedulers, detaches the fences of all in-progress and not-yet-scheduled jobs, sets an error code on them and signals them. It also rejects any new incoming job submissions from user space. All this is done to notify the applications of the problem. v2: Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev. Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c. Remove print param from amdgpu_ras_query_error_count. v3: Update based on previous bug fixing patch to properly call amdgpu_amdkfd_pre_reset for other XGMI hive members. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
83 lines
3.0 KiB
C
83 lines
3.0 KiB
C
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
|
|
#ifndef __AMDGPU_JOB_H__
|
|
#define __AMDGPU_JOB_H__
|
|
|
|
/*
 * Job status flag bits. Each flag occupies its own bit so they can be
 * OR-ed together in a single 32-bit status word.
 * NOTE(review): presumably stored in amdgpu_job::preamble_status and
 * amdgpu_job::preemption_status - confirm against the users of these flags.
 */

/* bit set means command submit involves a preamble IB */
#define AMDGPU_PREAMBLE_IB_PRESENT          (1 << 0)
/* bit set means preamble IB is first presented in belonging context */
#define AMDGPU_PREAMBLE_IB_PRESENT_FIRST    (1 << 1)
/* bit set means context switch occurred */
#define AMDGPU_HAVE_CTX_SWITCH              (1 << 2)
/* bit set means IB is preempted */
#define AMDGPU_IB_PREEMPTED                 (1 << 3)
|
|
|
|
/*
 * to_amdgpu_job - get the enclosing struct amdgpu_job from a pointer to its
 * embedded scheduler job.
 * @sched_job: pointer to the "base" member of a struct amdgpu_job
 *
 * Expands to container_of() on the "base" member, so the pointer passed in
 * must really point at the base field of an amdgpu_job.
 */
#define to_amdgpu_job(sched_job)		\
	container_of((sched_job), struct amdgpu_job, base)
|
|
|
|
/*
 * AMDGPU_JOB_GET_VMID - VMID of @job, or 0 when @job is a NULL pointer.
 * @job is expanded twice, so pass an expression without side effects.
 */
#define AMDGPU_JOB_GET_VMID(job) (!(job) ? 0 : (job)->vmid)
|
|
|
|
/* Forward declaration only; nothing else in this header refers to this type. */
struct amdgpu_fence;
|
|
|
|
struct amdgpu_job {
|
|
struct drm_sched_job base;
|
|
struct amdgpu_vm *vm;
|
|
struct amdgpu_sync sync;
|
|
struct amdgpu_sync sched_sync;
|
|
struct amdgpu_ib *ibs;
|
|
struct dma_fence *fence; /* the hw fence */
|
|
uint32_t preamble_status;
|
|
uint32_t preemption_status;
|
|
uint32_t num_ibs;
|
|
void *owner;
|
|
bool vm_needs_flush;
|
|
uint64_t vm_pd_addr;
|
|
unsigned vmid;
|
|
unsigned pasid;
|
|
uint32_t gds_base, gds_size;
|
|
uint32_t gws_base, gws_size;
|
|
uint32_t oa_base, oa_size;
|
|
uint32_t vram_lost_counter;
|
|
|
|
/* user fence handling */
|
|
uint64_t uf_addr;
|
|
uint64_t uf_sequence;
|
|
|
|
};
|
|
|
|
/*
 * Job allocation. Both functions return an int status and take @job as a
 * pointer-to-pointer.
 * NOTE(review): the implementations are not part of this header; presumably
 * the new job is stored in *job and the return value is 0 on success or a
 * negative error code - confirm against the definitions.
 */
int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
		     struct amdgpu_job **job, struct amdgpu_vm *vm);
int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
			     struct amdgpu_job **job);

/*
 * Job teardown and submission.
 * NOTE(review): amdgpu_job_submit() and amdgpu_job_submit_direct() each take
 * a struct dma_fence ** argument, presumably an output for the resulting
 * fence - confirm against the definitions.
 */
void amdgpu_job_free_resources(struct amdgpu_job *job);
void amdgpu_job_free(struct amdgpu_job *job);
int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
		      void *owner, struct dma_fence **f);
int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
			     struct dma_fence **fence);

/*
 * Operates on all jobs of the given scheduler.
 * NOTE(review): exact semantics are not visible in this header; see the
 * definition before relying on any particular behaviour.
 */
void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);
|
|
|
|
#endif
|