diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index da21c843fed8..7541c6f961b3 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -355,6 +355,7 @@ __create_hw_context(struct drm_i915_private *dev_priv, struct i915_gem_context *ctx; unsigned int n; int ret; + int i; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (ctx == NULL) @@ -407,6 +408,9 @@ __create_hw_context(struct drm_i915_private *dev_priv, ctx->desc_template = default_desc_template(dev_priv, dev_priv->mm.aliasing_ppgtt); + for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++) + ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES; + return ctx; err_pid: diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 071108d34ae0..dc6c58f38cfa 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -209,10 +209,11 @@ struct i915_gem_context { */ atomic_t active_count; -#define CONTEXT_SCORE_GUILTY 10 -#define CONTEXT_SCORE_BAN_THRESHOLD 40 - /** ban_score: Accumulated score of all hangs caused by this context. */ - atomic_t ban_score; + /** + * @hang_timestamp: The last time(s) this context caused a GPU hang + */ + unsigned long hang_timestamp[2]; +#define CONTEXT_FAST_HANG_JIFFIES (120 * HZ) /* 3 hangs within 120s? Banned! */ /** remap_slice: Bitmask of cache lines that need remapping */ u8 remap_slice; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 9a65341fec09..3f6eddb6f6de 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -434,11 +434,6 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m, ee->instdone.row[slice][subslice]); } -static const char *bannable(const struct drm_i915_error_context *ctx) -{ - return ctx->bannable ? "" : " (unbannable)"; -} - static void error_print_request(struct drm_i915_error_state_buf *m, const char *prefix, const struct drm_i915_error_request *erq, @@ -447,9 +442,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m, if (!erq->seqno) return; - err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", - prefix, erq->pid, erq->ban_score, - erq->context, erq->seqno, + err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", + prefix, erq->pid, erq->context, erq->seqno, test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &erq->flags) ? "!" : "", test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, @@ -463,10 +457,9 @@ static void error_print_context(struct drm_i915_error_state_buf *m, const char *header, const struct drm_i915_error_context *ctx) { - err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n", + err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, guilty %d active %d\n", header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id, - ctx->sched_attr.priority, ctx->ban_score, bannable(ctx), - ctx->guilty, ctx->active); + ctx->sched_attr.priority, ctx->guilty, ctx->active); } static void error_print_engine(struct drm_i915_error_state_buf *m, @@ -688,12 +681,10 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, if (!error->engine[i].context.pid) continue; - err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n", + err_printf(m, "Active process (on ring %s): %s [%d]\n", engine_name(m->i915, i), error->engine[i].context.comm, - error->engine[i].context.pid, - error->engine[i].context.ban_score, - bannable(&error->engine[i].context)); + error->engine[i].context.pid); } err_printf(m, "Reset count: %u\n", error->reset_count); err_printf(m, "Suspend count: %u\n", error->suspend_count); @@ -779,13 +770,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, if (obj) { err_puts(m, m->i915->engine[i]->name); if (ee->context.pid) - err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)", + err_printf(m, " (submitted by %s [%d], ctx %d [%d])", ee->context.comm, ee->context.pid, ee->context.handle, - ee->context.hw_id, - ee->context.ban_score, - bannable(&ee->context)); + ee->context.hw_id); err_printf(m, " --- gtt_offset = 0x%08x %08x\n", upper_32_bits(obj->gtt_offset), lower_32_bits(obj->gtt_offset)); @@ -1301,8 +1290,6 @@ static void record_request(struct i915_request *request, erq->flags = request->fence.flags; erq->context = ctx->hw_id; erq->sched_attr = request->sched.attr; - erq->ban_score = atomic_read(&ctx->ban_score); - erq->seqno = request->global_seqno; erq->jiffies = request->emitted_jiffies; erq->start = i915_ggtt_offset(request->ring->vma); erq->head = request->head; @@ -1396,8 +1383,6 @@ static void record_context(struct drm_i915_error_context *e, e->handle = ctx->user_handle; e->hw_id = ctx->hw_id; e->sched_attr = ctx->sched; - e->ban_score = atomic_read(&ctx->ban_score); - e->bannable = i915_gem_context_is_bannable(ctx); e->guilty = atomic_read(&ctx->guilty_count); e->active = atomic_read(&ctx->active_count); } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index afa3adb28f02..94eaf8ab9051 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -122,10 +122,8 @@ struct i915_gpu_state { pid_t pid; u32 handle; u32 hw_id; - int ban_score; int active; int guilty; - bool bannable; struct i915_sched_attr sched_attr; } context; @@ -149,7 +147,6 @@ struct i915_gpu_state { long jiffies; pid_t pid; u32 context; - int ban_score; u32 seqno; u32 start; u32 head; diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 5ab4e1c01618..a61e3a4fc9dc 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -290,8 +290,6 @@ static void i915_request_retire(struct i915_request *request) i915_request_remove_from_client(request); - /* Retirement decays the ban score as it is a sign of ctx progress */ - atomic_dec_if_positive(&request->gem_context->ban_score); intel_context_unpin(request->hw_context); __retire_engine_upto(request->engine, request); diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c index 034b654be74c..c234feb5fdf5 100644 --- a/drivers/gpu/drm/i915/i915_reset.c +++ b/drivers/gpu/drm/i915/i915_reset.c @@ -59,24 +59,29 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv, static bool context_mark_guilty(struct i915_gem_context *ctx) { - unsigned int score; - bool banned, bannable; + unsigned long prev_hang; + bool banned; + int i; atomic_inc(&ctx->guilty_count); - bannable = i915_gem_context_is_bannable(ctx); - score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score); - banned = (!i915_gem_context_is_recoverable(ctx) || - score >= CONTEXT_SCORE_BAN_THRESHOLD); - - /* Cool contexts don't accumulate client ban score */ - if (!bannable) + /* Cool contexts are too cool to be banned! (Used for reset testing.) */ + if (!i915_gem_context_is_bannable(ctx)) return false; + /* Record the timestamp for the last N hangs */ + prev_hang = ctx->hang_timestamp[0]; + for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) + ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; + ctx->hang_timestamp[i] = jiffies; + + /* If we have hung N+1 times in rapid succession, we ban the context! */ + banned = !i915_gem_context_is_recoverable(ctx); + if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) + banned = true; if (banned) { - DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n", - ctx->name, atomic_read(&ctx->guilty_count), - score); + DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n", + ctx->name, atomic_read(&ctx->guilty_count)); i915_gem_context_set_banned(ctx); }