mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-14 00:36:53 +07:00
drm/radeon: handle lockup in delayed work, v5
v5 (chk): complete rework: start the lockup check when the first fence is emitted, stop it when the last fence is signalled, make it work correctly with GPU resets, and clean up radeon_fence_wait_seq.
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
9bb39ff43e
commit
0bfa4b4126
@ -350,6 +350,7 @@ extern void evergreen_tiling_fields(unsigned tiling_flags, unsigned *bankw,
|
|||||||
* Fences.
|
* Fences.
|
||||||
*/
|
*/
|
||||||
struct radeon_fence_driver {
|
struct radeon_fence_driver {
|
||||||
|
struct radeon_device *rdev;
|
||||||
uint32_t scratch_reg;
|
uint32_t scratch_reg;
|
||||||
uint64_t gpu_addr;
|
uint64_t gpu_addr;
|
||||||
volatile uint32_t *cpu_addr;
|
volatile uint32_t *cpu_addr;
|
||||||
@ -357,6 +358,7 @@ struct radeon_fence_driver {
|
|||||||
uint64_t sync_seq[RADEON_NUM_RINGS];
|
uint64_t sync_seq[RADEON_NUM_RINGS];
|
||||||
atomic64_t last_seq;
|
atomic64_t last_seq;
|
||||||
bool initialized;
|
bool initialized;
|
||||||
|
struct delayed_work lockup_work;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct radeon_fence {
|
struct radeon_fence {
|
||||||
|
@ -97,6 +97,25 @@ static u32 radeon_fence_read(struct radeon_device *rdev, int ring)
|
|||||||
return seq;
|
return seq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* radeon_fence_schedule_check - schedule lockup check
|
||||||
|
*
|
||||||
|
* @rdev: radeon_device pointer
|
||||||
|
* @ring: ring index we should work with
|
||||||
|
*
|
||||||
|
* Queues a delayed work item to check for lockups.
|
||||||
|
*/
|
||||||
|
static void radeon_fence_schedule_check(struct radeon_device *rdev, int ring)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Do not reset the timer here with mod_delayed_work,
|
||||||
|
* this can livelock in an interaction with TTM delayed destroy.
|
||||||
|
*/
|
||||||
|
queue_delayed_work(system_power_efficient_wq,
|
||||||
|
&rdev->fence_drv[ring].lockup_work,
|
||||||
|
RADEON_FENCE_JIFFIES_TIMEOUT);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* radeon_fence_emit - emit a fence on the requested ring
|
* radeon_fence_emit - emit a fence on the requested ring
|
||||||
*
|
*
|
||||||
@ -122,19 +141,21 @@ int radeon_fence_emit(struct radeon_device *rdev,
|
|||||||
(*fence)->ring = ring;
|
(*fence)->ring = ring;
|
||||||
radeon_fence_ring_emit(rdev, ring, *fence);
|
radeon_fence_ring_emit(rdev, ring, *fence);
|
||||||
trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
|
trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
|
||||||
|
radeon_fence_schedule_check(rdev, ring);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* radeon_fence_process - process a fence
|
* radeon_fence_activity - check for fence activity
|
||||||
*
|
*
|
||||||
* @rdev: radeon_device pointer
|
* @rdev: radeon_device pointer
|
||||||
* @ring: ring index the fence is associated with
|
* @ring: ring index the fence is associated with
|
||||||
*
|
*
|
||||||
* Checks the current fence value and wakes the fence queue
|
* Checks the current fence value and calculates the last
|
||||||
* if the sequence number has increased (all asics).
|
* signalled fence value. Returns true if activity occurred
|
||||||
|
* on the ring, and the fence_queue should be woken up.
|
||||||
*/
|
*/
|
||||||
void radeon_fence_process(struct radeon_device *rdev, int ring)
|
static bool radeon_fence_activity(struct radeon_device *rdev, int ring)
|
||||||
{
|
{
|
||||||
uint64_t seq, last_seq, last_emitted;
|
uint64_t seq, last_seq, last_emitted;
|
||||||
unsigned count_loop = 0;
|
unsigned count_loop = 0;
|
||||||
@ -190,7 +211,67 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
|
|||||||
}
|
}
|
||||||
} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
|
} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
|
||||||
|
|
||||||
if (wake)
|
if (seq < last_emitted)
|
||||||
|
radeon_fence_schedule_check(rdev, ring);
|
||||||
|
|
||||||
|
return wake;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* radeon_fence_check_lockup - check for hardware lockup
|
||||||
|
*
|
||||||
|
* @work: delayed work item
|
||||||
|
*
|
||||||
|
* Checks for fence activity and if there is none probe
|
||||||
|
* the hardware to see if a lockup occurred.
|
||||||
|
*/
|
||||||
|
static void radeon_fence_check_lockup(struct work_struct *work)
|
||||||
|
{
|
||||||
|
struct radeon_fence_driver *fence_drv;
|
||||||
|
struct radeon_device *rdev;
|
||||||
|
int ring;
|
||||||
|
|
||||||
|
fence_drv = container_of(work, struct radeon_fence_driver,
|
||||||
|
lockup_work.work);
|
||||||
|
rdev = fence_drv->rdev;
|
||||||
|
ring = fence_drv - &rdev->fence_drv[0];
|
||||||
|
|
||||||
|
if (!down_read_trylock(&rdev->exclusive_lock)) {
|
||||||
|
/* just reschedule the check if a reset is going on */
|
||||||
|
radeon_fence_schedule_check(rdev, ring);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (radeon_fence_activity(rdev, ring))
|
||||||
|
wake_up_all(&rdev->fence_queue);
|
||||||
|
|
||||||
|
else if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
|
||||||
|
|
||||||
|
/* good news we believe it's a lockup */
|
||||||
|
dev_warn(rdev->dev, "GPU lockup (current fence id "
|
||||||
|
"0x%016llx last fence id 0x%016llx on ring %d)\n",
|
||||||
|
(uint64_t)atomic64_read(&fence_drv->last_seq),
|
||||||
|
fence_drv->sync_seq[ring], ring);
|
||||||
|
|
||||||
|
/* remember that we need a reset */
|
||||||
|
rdev->needs_reset = true;
|
||||||
|
wake_up_all(&rdev->fence_queue);
|
||||||
|
}
|
||||||
|
up_read(&rdev->exclusive_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* radeon_fence_process - process a fence
|
||||||
|
*
|
||||||
|
* @rdev: radeon_device pointer
|
||||||
|
* @ring: ring index the fence is associated with
|
||||||
|
*
|
||||||
|
* Checks the current fence value and wakes the fence queue
|
||||||
|
* if the sequence number has increased (all asics).
|
||||||
|
*/
|
||||||
|
void radeon_fence_process(struct radeon_device *rdev, int ring)
|
||||||
|
{
|
||||||
|
if (radeon_fence_activity(rdev, ring))
|
||||||
wake_up_all(&rdev->fence_queue);
|
wake_up_all(&rdev->fence_queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -300,86 +381,43 @@ static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
|
|||||||
static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
|
static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
|
||||||
bool intr)
|
bool intr)
|
||||||
{
|
{
|
||||||
uint64_t last_seq[RADEON_NUM_RINGS];
|
long r;
|
||||||
bool signaled;
|
int i;
|
||||||
int i, r;
|
|
||||||
|
|
||||||
while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
|
if (radeon_fence_any_seq_signaled(rdev, target_seq))
|
||||||
|
return 0;
|
||||||
|
|
||||||
/* Save current sequence values, used to check for GPU lockups */
|
/* enable IRQs and tracing */
|
||||||
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
|
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
|
||||||
if (!target_seq[i])
|
if (!target_seq[i])
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
last_seq[i] = atomic64_read(&rdev->fence_drv[i].last_seq);
|
trace_radeon_fence_wait_begin(rdev->ddev, i, target_seq[i]);
|
||||||
trace_radeon_fence_wait_begin(rdev->ddev, i, target_seq[i]);
|
radeon_irq_kms_sw_irq_get(rdev, i);
|
||||||
radeon_irq_kms_sw_irq_get(rdev, i);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (intr) {
|
|
||||||
r = wait_event_interruptible_timeout(rdev->fence_queue, (
|
|
||||||
(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
|
|
||||||
|| rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
|
|
||||||
} else {
|
|
||||||
r = wait_event_timeout(rdev->fence_queue, (
|
|
||||||
(signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
|
|
||||||
|| rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
|
|
||||||
if (!target_seq[i])
|
|
||||||
continue;
|
|
||||||
|
|
||||||
radeon_irq_kms_sw_irq_put(rdev, i);
|
|
||||||
trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (unlikely(r < 0))
|
|
||||||
return r;
|
|
||||||
|
|
||||||
if (unlikely(!signaled)) {
|
|
||||||
if (rdev->needs_reset)
|
|
||||||
return -EDEADLK;
|
|
||||||
|
|
||||||
/* we were interrupted for some reason and fence
|
|
||||||
* isn't signaled yet, resume waiting */
|
|
||||||
if (r)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
|
|
||||||
if (!target_seq[i])
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i != RADEON_NUM_RINGS)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
|
|
||||||
if (!target_seq[i])
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i < RADEON_NUM_RINGS) {
|
|
||||||
/* good news we believe it's a lockup */
|
|
||||||
dev_warn(rdev->dev, "GPU lockup (waiting for "
|
|
||||||
"0x%016llx last fence id 0x%016llx on"
|
|
||||||
" ring %d)\n",
|
|
||||||
target_seq[i], last_seq[i], i);
|
|
||||||
|
|
||||||
/* remember that we need a reset */
|
|
||||||
rdev->needs_reset = true;
|
|
||||||
wake_up_all(&rdev->fence_queue);
|
|
||||||
return -EDEADLK;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return 0;
|
|
||||||
|
if (intr) {
|
||||||
|
r = wait_event_interruptible_timeout(rdev->fence_queue, (
|
||||||
|
radeon_fence_any_seq_signaled(rdev, target_seq)
|
||||||
|
|| rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
|
||||||
|
} else {
|
||||||
|
r = wait_event_timeout(rdev->fence_queue, (
|
||||||
|
radeon_fence_any_seq_signaled(rdev, target_seq)
|
||||||
|
|| rdev->needs_reset), MAX_SCHEDULE_TIMEOUT);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rdev->needs_reset)
|
||||||
|
r = -EDEADLK;
|
||||||
|
|
||||||
|
for (i = 0; i < RADEON_NUM_RINGS; ++i) {
|
||||||
|
if (!target_seq[i])
|
||||||
|
continue;
|
||||||
|
|
||||||
|
radeon_irq_kms_sw_irq_put(rdev, i);
|
||||||
|
trace_radeon_fence_wait_end(rdev->ddev, i, target_seq[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return r < 0 ? r : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -711,6 +749,9 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
|
|||||||
rdev->fence_drv[ring].sync_seq[i] = 0;
|
rdev->fence_drv[ring].sync_seq[i] = 0;
|
||||||
atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
|
atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
|
||||||
rdev->fence_drv[ring].initialized = false;
|
rdev->fence_drv[ring].initialized = false;
|
||||||
|
INIT_DELAYED_WORK(&rdev->fence_drv[ring].lockup_work,
|
||||||
|
radeon_fence_check_lockup);
|
||||||
|
rdev->fence_drv[ring].rdev = rdev;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -760,6 +801,7 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
|
|||||||
/* no need to trigger GPU reset as we are unloading */
|
/* no need to trigger GPU reset as we are unloading */
|
||||||
radeon_fence_driver_force_completion(rdev, ring);
|
radeon_fence_driver_force_completion(rdev, ring);
|
||||||
}
|
}
|
||||||
|
cancel_delayed_work_sync(&rdev->fence_drv[ring].lockup_work);
|
||||||
wake_up_all(&rdev->fence_queue);
|
wake_up_all(&rdev->fence_queue);
|
||||||
radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
|
radeon_scratch_free(rdev, rdev->fence_drv[ring].scratch_reg);
|
||||||
rdev->fence_drv[ring].initialized = false;
|
rdev->fence_drv[ring].initialized = false;
|
||||||
@ -778,8 +820,10 @@ void radeon_fence_driver_fini(struct radeon_device *rdev)
|
|||||||
*/
|
*/
|
||||||
void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring)
|
void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring)
|
||||||
{
|
{
|
||||||
if (rdev->fence_drv[ring].initialized)
|
if (rdev->fence_drv[ring].initialized) {
|
||||||
radeon_fence_write(rdev, rdev->fence_drv[ring].sync_seq[ring], ring);
|
radeon_fence_write(rdev, rdev->fence_drv[ring].sync_seq[ring], ring);
|
||||||
|
cancel_delayed_work_sync(&rdev->fence_drv[ring].lockup_work);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user