linux_dsm_epyc7002/drivers/gpu/drm/i915/gt/intel_reset.h

78 lines
2.0 KiB
C
Raw Normal View History

/*
* SPDX-License-Identifier: MIT
*
* Copyright © 2008-2018 Intel Corporation
*/
#ifndef I915_RESET_H
#define I915_RESET_H
#include <linux/compiler.h>
#include <linux/types.h>
drm/i915: Revoke mmaps and prevent access to fence registers across reset Previously, we were able to rely on the recursive properties of struct_mutex to allow us to serialise revoking mmaps and reacquiring the FENCE registers with them being clobbered over a global device reset. I then proceeded to throw out the baby with the bath water in order to pursue a struct_mutex-less reset. Perusing LWN for alternative strategies, the dilemma on how to serialise access to a global resource on one side was answered by https://lwn.net/Articles/202847/ -- Sleepable RCU: 1 int readside(void) { 2 int idx; 3 rcu_read_lock(); 4 if (nomoresrcu) { 5 rcu_read_unlock(); 6 return -EINVAL; 7 } 8 idx = srcu_read_lock(&ss); 9 rcu_read_unlock(); 10 /* SRCU read-side critical section. */ 11 srcu_read_unlock(&ss, idx); 12 return 0; 13 } 14 15 void cleanup(void) 16 { 17 nomoresrcu = 1; 18 synchronize_rcu(); 19 synchronize_srcu(&ss); 20 cleanup_srcu_struct(&ss); 21 } No more worrying about stop_machine, just an uber-complex mutex, optimised for reads, with the overhead pushed to the rare reset path. However, we do run the risk of a deadlock as we allocate underneath the SRCU read lock, and the allocation may require a GPU reset, causing a dependency cycle via the in-flight requests. We resolve that by declaring the driver wedged and cancelling all in-flight rendering. v2: Use expedited rcu barriers to match our earlier timing characteristics. v3: Try to annotate locking contexts for sparse v4: Reduce selftest lock duration to avoid a reset deadlock with fences v5: s/srcu/reset_backoff_srcu/ v6: Remove more stale comments Testcase: igt/gem_mmap_gtt/hang Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk
2019-02-08 22:37:03 +07:00
#include <linux/srcu.h>
#include "intel_engine_types.h"
#include "intel_reset_types.h"
struct drm_i915_private;
struct i915_request;
struct intel_engine_cs;
struct intel_gt;
struct intel_guc;
void intel_gt_init_reset(struct intel_gt *gt);
void intel_gt_fini_reset(struct intel_gt *gt);
__printf(4, 5)
void intel_gt_handle_error(struct intel_gt *gt,
intel_engine_mask_t engine_mask,
unsigned long flags,
const char *fmt, ...);
#define I915_ERROR_CAPTURE BIT(0)
void intel_gt_reset(struct intel_gt *gt,
intel_engine_mask_t stalled_mask,
const char *reason);
int intel_engine_reset(struct intel_engine_cs *engine,
const char *reason);
void __i915_request_reset(struct i915_request *rq, bool guilty);
drm/i915: Revoke mmaps and prevent access to fence registers across reset Previously, we were able to rely on the recursive properties of struct_mutex to allow us to serialise revoking mmaps and reacquiring the FENCE registers with them being clobbered over a global device reset. I then proceeded to throw out the baby with the bath water in order to pursue a struct_mutex-less reset. Perusing LWN for alternative strategies, the dilemma on how to serialise access to a global resource on one side was answered by https://lwn.net/Articles/202847/ -- Sleepable RCU: 1 int readside(void) { 2 int idx; 3 rcu_read_lock(); 4 if (nomoresrcu) { 5 rcu_read_unlock(); 6 return -EINVAL; 7 } 8 idx = srcu_read_lock(&ss); 9 rcu_read_unlock(); 10 /* SRCU read-side critical section. */ 11 srcu_read_unlock(&ss, idx); 12 return 0; 13 } 14 15 void cleanup(void) 16 { 17 nomoresrcu = 1; 18 synchronize_rcu(); 19 synchronize_srcu(&ss); 20 cleanup_srcu_struct(&ss); 21 } No more worrying about stop_machine, just an uber-complex mutex, optimised for reads, with the overhead pushed to the rare reset path. However, we do run the risk of a deadlock as we allocate underneath the SRCU read lock, and the allocation may require a GPU reset, causing a dependency cycle via the in-flight requests. We resolve that by declaring the driver wedged and cancelling all in-flight rendering. v2: Use expedited rcu barriers to match our earlier timing characteristics. v3: Try to annotate locking contexts for sparse v4: Reduce selftest lock duration to avoid a reset deadlock with fences v5: s/srcu/reset_backoff_srcu/ v6: Remove more stale comments Testcase: igt/gem_mmap_gtt/hang Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk
2019-02-08 22:37:03 +07:00
int __must_check intel_gt_reset_trylock(struct intel_gt *gt);
void intel_gt_reset_unlock(struct intel_gt *gt, int tag);
void intel_gt_set_wedged(struct intel_gt *gt);
bool intel_gt_unset_wedged(struct intel_gt *gt);
int intel_gt_terminally_wedged(struct intel_gt *gt);
int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask);
int intel_reset_guc(struct intel_gt *gt);
struct intel_wedge_me {
struct delayed_work work;
struct intel_gt *gt;
const char *name;
};
void __intel_init_wedge(struct intel_wedge_me *w,
struct intel_gt *gt,
long timeout,
const char *name);
void __intel_fini_wedge(struct intel_wedge_me *w);
#define intel_wedge_on_timeout(W, GT, TIMEOUT) \
for (__intel_init_wedge((W), (GT), (TIMEOUT), __func__); \
(W)->gt; \
__intel_fini_wedge((W)))
static inline bool __intel_reset_failed(const struct intel_reset *reset)
{
return unlikely(test_bit(I915_WEDGED, &reset->flags));
}
bool intel_has_gpu_reset(struct drm_i915_private *i915);
bool intel_has_reset_engine(struct drm_i915_private *i915);
#endif /* I915_RESET_H */