mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-18 12:56:18 +07:00
drm/radeon: rework gpu lockup detection and processing
Previously multiple rings could trigger multiple GPU resets at the same time. Signed-off-by: Christian König <deathsimple@vodafone.de> Reviewed-by: Jerome Glisse <jglisse@redhat.com> Signed-off-by: Dave Airlie <airlied@redhat.com>
This commit is contained in:
parent
7bd560e885
commit
36abacaed3
@ -255,8 +255,7 @@ struct radeon_fence_driver {
|
||||
volatile uint32_t *cpu_addr;
|
||||
atomic_t seq;
|
||||
uint32_t last_seq;
|
||||
unsigned long last_jiffies;
|
||||
unsigned long last_timeout;
|
||||
unsigned long last_activity;
|
||||
wait_queue_head_t queue;
|
||||
struct list_head created;
|
||||
struct list_head emitted;
|
||||
|
@ -74,6 +74,10 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence)
|
||||
radeon_fence_ring_emit(rdev, fence->ring, fence);
|
||||
trace_radeon_fence_emit(rdev->ddev, fence->seq);
|
||||
fence->emitted = true;
|
||||
/* are we the first fence on a previusly idle ring? */
|
||||
if (list_empty(&rdev->fence_drv[fence->ring].emitted)) {
|
||||
rdev->fence_drv[fence->ring].last_activity = jiffies;
|
||||
}
|
||||
list_move_tail(&fence->list, &rdev->fence_drv[fence->ring].emitted);
|
||||
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
|
||||
return 0;
|
||||
@ -85,34 +89,14 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev, int ring)
|
||||
struct list_head *i, *n;
|
||||
uint32_t seq;
|
||||
bool wake = false;
|
||||
unsigned long cjiffies;
|
||||
|
||||
seq = radeon_fence_read(rdev, ring);
|
||||
if (seq != rdev->fence_drv[ring].last_seq) {
|
||||
rdev->fence_drv[ring].last_seq = seq;
|
||||
rdev->fence_drv[ring].last_jiffies = jiffies;
|
||||
rdev->fence_drv[ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
|
||||
} else {
|
||||
cjiffies = jiffies;
|
||||
if (time_after(cjiffies, rdev->fence_drv[ring].last_jiffies)) {
|
||||
cjiffies -= rdev->fence_drv[ring].last_jiffies;
|
||||
if (time_after(rdev->fence_drv[ring].last_timeout, cjiffies)) {
|
||||
/* update the timeout */
|
||||
rdev->fence_drv[ring].last_timeout -= cjiffies;
|
||||
} else {
|
||||
/* the 500ms timeout is elapsed we should test
|
||||
* for GPU lockup
|
||||
*/
|
||||
rdev->fence_drv[ring].last_timeout = 1;
|
||||
}
|
||||
} else {
|
||||
/* wrap around update last jiffies, we will just wait
|
||||
* a little longer
|
||||
*/
|
||||
rdev->fence_drv[ring].last_jiffies = cjiffies;
|
||||
}
|
||||
if (seq == rdev->fence_drv[ring].last_seq)
|
||||
return false;
|
||||
}
|
||||
|
||||
rdev->fence_drv[ring].last_seq = seq;
|
||||
rdev->fence_drv[ring].last_activity = jiffies;
|
||||
|
||||
n = NULL;
|
||||
list_for_each(i, &rdev->fence_drv[ring].emitted) {
|
||||
fence = list_entry(i, struct radeon_fence, list);
|
||||
@ -207,66 +191,84 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
|
||||
struct radeon_device *rdev;
|
||||
unsigned long irq_flags, timeout;
|
||||
u32 seq;
|
||||
int r;
|
||||
int i, r;
|
||||
bool signaled;
|
||||
|
||||
if (fence == NULL) {
|
||||
WARN(1, "Querying an invalid fence : %p !\n", fence);
|
||||
return 0;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
rdev = fence->rdev;
|
||||
if (radeon_fence_signaled(fence)) {
|
||||
return 0;
|
||||
}
|
||||
timeout = rdev->fence_drv[fence->ring].last_timeout;
|
||||
retry:
|
||||
/* save current sequence used to check for GPU lockup */
|
||||
seq = rdev->fence_drv[fence->ring].last_seq;
|
||||
trace_radeon_fence_wait_begin(rdev->ddev, seq);
|
||||
if (intr) {
|
||||
signaled = radeon_fence_signaled(fence);
|
||||
while (!signaled) {
|
||||
read_lock_irqsave(&rdev->fence_lock, irq_flags);
|
||||
timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
|
||||
if (time_after(rdev->fence_drv[fence->ring].last_activity, timeout)) {
|
||||
/* the normal case, timeout is somewhere before last_activity */
|
||||
timeout = rdev->fence_drv[fence->ring].last_activity - timeout;
|
||||
} else {
|
||||
/* either jiffies wrapped around, or no fence was signaled in the last 500ms
|
||||
* anyway we will just wait for the minimum amount and then check for a lockup */
|
||||
timeout = 1;
|
||||
}
|
||||
/* save current sequence value used to check for GPU lockups */
|
||||
seq = rdev->fence_drv[fence->ring].last_seq;
|
||||
read_unlock_irqrestore(&rdev->fence_lock, irq_flags);
|
||||
|
||||
trace_radeon_fence_wait_begin(rdev->ddev, seq);
|
||||
radeon_irq_kms_sw_irq_get(rdev, fence->ring);
|
||||
r = wait_event_interruptible_timeout(rdev->fence_drv[fence->ring].queue,
|
||||
radeon_fence_signaled(fence), timeout);
|
||||
if (intr) {
|
||||
r = wait_event_interruptible_timeout(
|
||||
rdev->fence_drv[fence->ring].queue,
|
||||
(signaled = radeon_fence_signaled(fence)), timeout);
|
||||
} else {
|
||||
r = wait_event_timeout(
|
||||
rdev->fence_drv[fence->ring].queue,
|
||||
(signaled = radeon_fence_signaled(fence)), timeout);
|
||||
}
|
||||
radeon_irq_kms_sw_irq_put(rdev, fence->ring);
|
||||
if (unlikely(r < 0)) {
|
||||
return r;
|
||||
}
|
||||
} else {
|
||||
radeon_irq_kms_sw_irq_get(rdev, fence->ring);
|
||||
r = wait_event_timeout(rdev->fence_drv[fence->ring].queue,
|
||||
radeon_fence_signaled(fence), timeout);
|
||||
radeon_irq_kms_sw_irq_put(rdev, fence->ring);
|
||||
}
|
||||
trace_radeon_fence_wait_end(rdev->ddev, seq);
|
||||
if (unlikely(!radeon_fence_signaled(fence))) {
|
||||
/* we were interrupted for some reason and fence isn't
|
||||
* isn't signaled yet, resume wait
|
||||
*/
|
||||
if (r) {
|
||||
timeout = r;
|
||||
goto retry;
|
||||
}
|
||||
/* don't protect read access to rdev->fence_drv[t].last_seq
|
||||
* if we experiencing a lockup the value doesn't change
|
||||
*/
|
||||
if (seq == rdev->fence_drv[fence->ring].last_seq &&
|
||||
radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
|
||||
trace_radeon_fence_wait_end(rdev->ddev, seq);
|
||||
|
||||
/* good news we believe it's a lockup */
|
||||
printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n",
|
||||
fence->seq, seq);
|
||||
if (unlikely(!signaled)) {
|
||||
/* we were interrupted for some reason and fence
|
||||
* isn't signaled yet, resume waiting */
|
||||
if (r) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* mark the ring as not ready any more */
|
||||
rdev->ring[fence->ring].ready = false;
|
||||
r = radeon_gpu_reset(rdev);
|
||||
if (r)
|
||||
return r;
|
||||
write_lock_irqsave(&rdev->fence_lock, irq_flags);
|
||||
/* check if sequence value has changed since last_activity */
|
||||
if (seq != rdev->fence_drv[fence->ring].last_seq) {
|
||||
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* change sequence value on all rings, so nobody else things there is a lockup */
|
||||
for (i = 0; i < RADEON_NUM_RINGS; ++i)
|
||||
rdev->fence_drv[i].last_seq -= 0x10000;
|
||||
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
|
||||
|
||||
if (radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
|
||||
|
||||
/* good news we believe it's a lockup */
|
||||
printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n",
|
||||
fence->seq, seq);
|
||||
|
||||
/* mark the ring as not ready any more */
|
||||
rdev->ring[fence->ring].ready = false;
|
||||
r = radeon_gpu_reset(rdev);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
write_lock_irqsave(&rdev->fence_lock, irq_flags);
|
||||
rdev->fence_drv[fence->ring].last_activity = jiffies;
|
||||
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
|
||||
}
|
||||
}
|
||||
timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
|
||||
write_lock_irqsave(&rdev->fence_lock, irq_flags);
|
||||
rdev->fence_drv[fence->ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
|
||||
rdev->fence_drv[fence->ring].last_jiffies = jiffies;
|
||||
write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
|
||||
goto retry;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user