linux_dsm_epyc7002/drivers/gpu/drm/i915/gt/intel_gt.c

640 lines
15 KiB
C
Raw Normal View History

// SPDX-License-Identifier: MIT
/*
* Copyright © 2019 Intel Corporation
*/
#include "debugfs_gt.h"
#include "i915_drv.h"
#include "intel_context.h"
#include "intel_gt.h"
#include "intel_gt_buffer_pool.h"
#include "intel_gt_clock_utils.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_mocs.h"
#include "intel_rc6.h"
#include "intel_renderstate.h"
#include "intel_rps.h"
#include "intel_uncore.h"
#include "intel_pm.h"
drm/i915/gt: Keep a no-frills swappable copy of the default context state We need to keep the default context state around to instantiate new contexts (aka golden rendercontext), and we also keep it pinned while the engine is active so that we can quickly reset a hanging context. However, the default contexts are large enough to merit keeping in swappable memory as opposed to kernel memory, so we store them inside shmemfs. Currently, we use the normal GEM objects to create the default context image, but we can throw away all but the shmemfs file. This greatly simplifies the tricky power management code which wants to run underneath the normal GT locking, and we definitely do not want to use any high level objects that may appear to recurse back into the GT. Though perhaps the primary advantage of the complex GEM object is that we aggressively cache the mapping, but here we are recreating the vm_area everytime time we unpark. At the worst, we add a lightweight cache, but first find a microbenchmark that is impacted. Having started to create some utility functions to make working with shmemfs objects easier, we can start putting them to wider use, where GEM objects are overkill, such as storing persistent error state. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Ramalingam C <ramalingam.c@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Matthew Auld <matthew.auld@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200429172429.6054-1-chris@chris-wilson.co.uk
2020-04-30 00:24:29 +07:00
#include "shmem_utils.h"
void intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915)
{
gt->i915 = i915;
gt->uncore = &i915->uncore;
spin_lock_init(&gt->irq_lock);
INIT_LIST_HEAD(&gt->closed_vma);
spin_lock_init(&gt->closed_lock);
intel_gt_init_buffer_pool(gt);
intel_gt_init_reset(gt);
intel_gt_init_requests(gt);
intel_gt_init_timelines(gt);
intel_gt_pm_init_early(gt);
drm/i915/gt: Always track callers to intel_rps_mark_interactive() During startup, we may find ourselves in an interesting position where we haven't fully enabled RPS before the display starts trying to use it. This may lead to an imbalance in our "interactive" counter: <3>[ 4.813326] intel_rps_mark_interactive:652 GEM_BUG_ON(!rps->power.interactive) <4>[ 4.813396] ------------[ cut here ]------------ <2>[ 4.813398] kernel BUG at drivers/gpu/drm/i915/gt/intel_rps.c:652! <4>[ 4.813430] invalid opcode: 0000 [#1] PREEMPT SMP PTI <4>[ 4.813438] CPU: 1 PID: 18 Comm: kworker/1:0H Not tainted 5.4.0-rc5-CI-CI_DRM_7209+ #1 <4>[ 4.813447] Hardware name: /NUC7i5BNB, BIOS BNKBL357.86A.0054.2017.1025.1822 10/25/2017 <4>[ 4.813525] Workqueue: events_highpri intel_atomic_cleanup_work [i915] <4>[ 4.813589] RIP: 0010:intel_rps_mark_interactive+0xb3/0xc0 [i915] <4>[ 4.813597] Code: bc 3f de e0 48 8b 35 84 2e 24 00 49 c7 c0 f3 d4 4e a0 b9 8c 02 00 00 48 c7 c2 80 9c 48 a0 48 c7 c7 3e 73 34 a0 e8 8d 3b e5 e0 <0f> 0b 90 66 2e 0f 1f 84 00 00 00 00 00 80 bf c0 00 00 00 00 74 32 <4>[ 4.813616] RSP: 0018:ffffc900000efe00 EFLAGS: 00010286 <4>[ 4.813623] RAX: 000000000000000e RBX: ffff8882583cc7f0 RCX: 0000000000000000 <4>[ 4.813631] RDX: 0000000000000001 RSI: 0000000000000008 RDI: ffff888275969c00 <4>[ 4.813639] RBP: 0000000000000000 R08: 0000000000000008 R09: ffff888275ace000 <4>[ 4.813646] R10: ffffc900000efe00 R11: ffff888275969c00 R12: ffff8882583cc8d8 <4>[ 4.813654] R13: ffff888276abce00 R14: 0000000000000000 R15: ffff88825e878860 <4>[ 4.813662] FS: 0000000000000000(0000) GS:ffff888276a80000(0000) knlGS:0000000000000000 <4>[ 4.813672] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4>[ 4.813678] CR2: 00007f051d5ca0a8 CR3: 0000000262f48001 CR4: 00000000003606e0 <4>[ 4.813686] Call Trace: <4>[ 4.813755] intel_cleanup_plane_fb+0x4e/0x60 [i915] <4>[ 4.813764] drm_atomic_helper_cleanup_planes+0x4d/0x70 <4>[ 4.813833] intel_atomic_cleanup_work+0x15/0x80 [i915] <4>[ 4.813842] process_one_work+0x26a/0x620 <4>[ 4.813850] worker_thread+0x37/0x380 <4>[ 4.813857] ? process_one_work+0x620/0x620 <4>[ 4.813864] kthread+0x119/0x130 <4>[ 4.813870] ? kthread_park+0x80/0x80 <4>[ 4.813878] ret_from_fork+0x3a/0x50 <4>[ 4.813887] Modules linked in: i915(+) mei_hdcp x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul btusb btrtl btbcm btintel snd_hda_intel snd_intel_nhlt snd_hda_codec bluetooth snd_hwdep snd_hda_core ghash_clmulni_intel snd_pcm e1000e ecdh_generic ecc ptp pps_core mei_me mei prime_numbers <4>[ 4.813934] ---[ end trace c13289af88174ffc ]--- The solution employed is to not worry about RPS state and keep the tally of the interactive counter separate. When we do enable RPS, we will then take the display activity into account. Fixes: 3e7abf814193 ("drm/i915: Extract GT render power state management") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Andi Shyti <andi.shyti@intel.com> Acked-by: Andi Shyti <andi.shyti@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191030103827.2413-1-chris@chris-wilson.co.uk
2019-10-30 17:38:23 +07:00
intel_rps_init_early(&gt->rps);
intel_uc_init_early(&gt->uc);
}
void intel_gt_init_hw_early(struct intel_gt *gt, struct i915_ggtt *ggtt)
{
gt->ggtt = ggtt;
}
static void init_unused_ring(struct intel_gt *gt, u32 base)
{
struct intel_uncore *uncore = gt->uncore;
intel_uncore_write(uncore, RING_CTL(base), 0);
intel_uncore_write(uncore, RING_HEAD(base), 0);
intel_uncore_write(uncore, RING_TAIL(base), 0);
intel_uncore_write(uncore, RING_START(base), 0);
}
static void init_unused_rings(struct intel_gt *gt)
{
struct drm_i915_private *i915 = gt->i915;
if (IS_I830(i915)) {
init_unused_ring(gt, PRB1_BASE);
init_unused_ring(gt, SRB0_BASE);
init_unused_ring(gt, SRB1_BASE);
init_unused_ring(gt, SRB2_BASE);
init_unused_ring(gt, SRB3_BASE);
} else if (IS_GEN(i915, 2)) {
init_unused_ring(gt, SRB0_BASE);
init_unused_ring(gt, SRB1_BASE);
} else if (IS_GEN(i915, 3)) {
init_unused_ring(gt, PRB1_BASE);
init_unused_ring(gt, PRB2_BASE);
}
}
int intel_gt_init_hw(struct intel_gt *gt)
{
struct drm_i915_private *i915 = gt->i915;
struct intel_uncore *uncore = gt->uncore;
int ret;
gt->last_init_time = ktime_get();
/* Double layer security blanket, see i915_gem_init() */
intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
if (HAS_EDRAM(i915) && INTEL_GEN(i915) < 9)
intel_uncore_rmw(uncore, HSW_IDICR, 0, IDIHASHMSK(0xf));
if (IS_HASWELL(i915))
intel_uncore_write(uncore,
MI_PREDICATE_RESULT_2,
IS_HSW_GT3(i915) ?
LOWER_SLICE_ENABLED : LOWER_SLICE_DISABLED);
/* Apply the GT workarounds... */
intel_gt_apply_workarounds(gt);
/* ...and determine whether they are sticking. */
intel_gt_verify_workarounds(gt, "init");
intel_gt_init_swizzling(gt);
/*
* At least 830 can leave some of the unused rings
* "active" (ie. head != tail) after resume which
* will prevent c3 entry. Makes sure all unused rings
* are totally idle.
*/
init_unused_rings(gt);
ret = i915_ppgtt_init_hw(gt);
if (ret) {
DRM_ERROR("Enabling PPGTT failed (%d)\n", ret);
goto out;
}
/* We can't enable contexts until all firmware is loaded */
ret = intel_uc_init_hw(&gt->uc);
if (ret) {
i915_probe_error(i915, "Enabling uc failed (%d)\n", ret);
goto out;
}
intel_mocs_init(gt);
out:
intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);
return ret;
}
static void rmw_set(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
{
intel_uncore_rmw(uncore, reg, 0, set);
}
static void rmw_clear(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
{
intel_uncore_rmw(uncore, reg, clr, 0);
}
static void clear_register(struct intel_uncore *uncore, i915_reg_t reg)
{
intel_uncore_rmw(uncore, reg, 0, 0);
}
static void gen8_clear_engine_error_register(struct intel_engine_cs *engine)
{
GEN6_RING_FAULT_REG_RMW(engine, RING_FAULT_VALID, 0);
GEN6_RING_FAULT_REG_POSTING_READ(engine);
}
void
intel_gt_clear_error_registers(struct intel_gt *gt,
intel_engine_mask_t engine_mask)
{
struct drm_i915_private *i915 = gt->i915;
struct intel_uncore *uncore = gt->uncore;
u32 eir;
if (!IS_GEN(i915, 2))
clear_register(uncore, PGTBL_ER);
if (INTEL_GEN(i915) < 4)
clear_register(uncore, IPEIR(RENDER_RING_BASE));
else
clear_register(uncore, IPEIR_I965);
clear_register(uncore, EIR);
eir = intel_uncore_read(uncore, EIR);
if (eir) {
/*
* some errors might have become stuck,
* mask them.
*/
DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
rmw_set(uncore, EMR, eir);
intel_uncore_write(uncore, GEN2_IIR,
I915_MASTER_ERROR_INTERRUPT);
}
if (INTEL_GEN(i915) >= 12) {
rmw_clear(uncore, GEN12_RING_FAULT_REG, RING_FAULT_VALID);
intel_uncore_posting_read(uncore, GEN12_RING_FAULT_REG);
} else if (INTEL_GEN(i915) >= 8) {
rmw_clear(uncore, GEN8_RING_FAULT_REG, RING_FAULT_VALID);
intel_uncore_posting_read(uncore, GEN8_RING_FAULT_REG);
} else if (INTEL_GEN(i915) >= 6) {
struct intel_engine_cs *engine;
enum intel_engine_id id;
for_each_engine_masked(engine, gt, engine_mask, id)
gen8_clear_engine_error_register(engine);
}
}
static void gen6_check_faults(struct intel_gt *gt)
{
struct intel_engine_cs *engine;
enum intel_engine_id id;
u32 fault;
for_each_engine(engine, gt, id) {
fault = GEN6_RING_FAULT_REG_READ(engine);
if (fault & RING_FAULT_VALID) {
drm_dbg(&engine->i915->drm, "Unexpected fault\n"
"\tAddr: 0x%08lx\n"
"\tAddress space: %s\n"
"\tSource ID: %d\n"
"\tType: %d\n",
fault & PAGE_MASK,
fault & RING_FAULT_GTTSEL_MASK ?
"GGTT" : "PPGTT",
RING_FAULT_SRCID(fault),
RING_FAULT_FAULT_TYPE(fault));
}
}
}
static void gen8_check_faults(struct intel_gt *gt)
{
struct intel_uncore *uncore = gt->uncore;
i915_reg_t fault_reg, fault_data0_reg, fault_data1_reg;
u32 fault;
if (INTEL_GEN(gt->i915) >= 12) {
fault_reg = GEN12_RING_FAULT_REG;
fault_data0_reg = GEN12_FAULT_TLB_DATA0;
fault_data1_reg = GEN12_FAULT_TLB_DATA1;
} else {
fault_reg = GEN8_RING_FAULT_REG;
fault_data0_reg = GEN8_FAULT_TLB_DATA0;
fault_data1_reg = GEN8_FAULT_TLB_DATA1;
}
fault = intel_uncore_read(uncore, fault_reg);
if (fault & RING_FAULT_VALID) {
u32 fault_data0, fault_data1;
u64 fault_addr;
fault_data0 = intel_uncore_read(uncore, fault_data0_reg);
fault_data1 = intel_uncore_read(uncore, fault_data1_reg);
fault_addr = ((u64)(fault_data1 & FAULT_VA_HIGH_BITS) << 44) |
((u64)fault_data0 << 12);
drm_dbg(&uncore->i915->drm, "Unexpected fault\n"
"\tAddr: 0x%08x_%08x\n"
"\tAddress space: %s\n"
"\tEngine ID: %d\n"
"\tSource ID: %d\n"
"\tType: %d\n",
upper_32_bits(fault_addr), lower_32_bits(fault_addr),
fault_data1 & FAULT_GTT_SEL ? "GGTT" : "PPGTT",
GEN8_RING_FAULT_ENGINE_ID(fault),
RING_FAULT_SRCID(fault),
RING_FAULT_FAULT_TYPE(fault));
}
}
void intel_gt_check_and_clear_faults(struct intel_gt *gt)
{
struct drm_i915_private *i915 = gt->i915;
/* From GEN8 onwards we only have one 'All Engine Fault Register' */
if (INTEL_GEN(i915) >= 8)
gen8_check_faults(gt);
else if (INTEL_GEN(i915) >= 6)
gen6_check_faults(gt);
else
return;
intel_gt_clear_error_registers(gt, ALL_ENGINES);
}
void intel_gt_flush_ggtt_writes(struct intel_gt *gt)
{
struct intel_uncore *uncore = gt->uncore;
intel_wakeref_t wakeref;
/*
* No actual flushing is required for the GTT write domain for reads
* from the GTT domain. Writes to it "immediately" go to main memory
* as far as we know, so there's no chipset flush. It also doesn't
* land in the GPU render cache.
*
* However, we do have to enforce the order so that all writes through
* the GTT land before any writes to the device, such as updates to
* the GATT itself.
*
* We also have to wait a bit for the writes to land from the GTT.
* An uncached read (i.e. mmio) seems to be ideal for the round-trip
* timing. This issue has only been observed when switching quickly
* between GTT writes and CPU reads from inside the kernel on recent hw,
* and it appears to only affect discrete GTT blocks (i.e. on LLC
* system agents we cannot reproduce this behaviour, until Cannonlake
* that was!).
*/
wmb();
if (INTEL_INFO(gt->i915)->has_coherent_ggtt)
return;
intel_gt_chipset_flush(gt);
with_intel_runtime_pm_if_in_use(uncore->rpm, wakeref) {
drm/i915: Pull i915_vma_pin under the vm->mutex Replace the struct_mutex requirement for pinning the i915_vma with the local vm->mutex instead. Note that the vm->mutex is tainted by the shrinker (we require unbinding from inside fs-reclaim) and so we cannot allocate while holding that mutex. Instead we have to preallocate workers to do allocate and apply the PTE updates after we have we reserved their slot in the drm_mm (using fences to order the PTE writes with the GPU work and with later unbind). In adding the asynchronous vma binding, one subtle requirement is to avoid coupling the binding fence into the backing object->resv. That is the asynchronous binding only applies to the vma timeline itself and not to the pages as that is a more global timeline (the binding of one vma does not need to be ordered with another vma, nor does the implicit GEM fencing depend on a vma, only on writes to the backing store). Keeping the vma binding distinct from the backing store timelines is verified by a number of async gem_exec_fence and gem_exec_schedule tests. The way we do this is quite simple, we keep the fence for the vma binding separate and only wait on it as required, and never add it to the obj->resv itself. Another consequence in reducing the locking around the vma is the destruction of the vma is no longer globally serialised by struct_mutex. A natural solution would be to add a kref to i915_vma, but that requires decoupling the reference cycles, possibly by introducing a new i915_mm_pages object that is own by both obj->mm and vma->pages. However, we have not taken that route due to the overshadowing lmem/ttm discussions, and instead play a series of complicated games with trylocks to (hopefully) ensure that only one destruction path is called! v2: Add some commentary, and some helpers to reduce patch churn. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191004134015.13204-4-chris@chris-wilson.co.uk
2019-10-04 20:39:58 +07:00
unsigned long flags;
drm/i915: Pull i915_vma_pin under the vm->mutex Replace the struct_mutex requirement for pinning the i915_vma with the local vm->mutex instead. Note that the vm->mutex is tainted by the shrinker (we require unbinding from inside fs-reclaim) and so we cannot allocate while holding that mutex. Instead we have to preallocate workers to do allocate and apply the PTE updates after we have we reserved their slot in the drm_mm (using fences to order the PTE writes with the GPU work and with later unbind). In adding the asynchronous vma binding, one subtle requirement is to avoid coupling the binding fence into the backing object->resv. That is the asynchronous binding only applies to the vma timeline itself and not to the pages as that is a more global timeline (the binding of one vma does not need to be ordered with another vma, nor does the implicit GEM fencing depend on a vma, only on writes to the backing store). Keeping the vma binding distinct from the backing store timelines is verified by a number of async gem_exec_fence and gem_exec_schedule tests. The way we do this is quite simple, we keep the fence for the vma binding separate and only wait on it as required, and never add it to the obj->resv itself. Another consequence in reducing the locking around the vma is the destruction of the vma is no longer globally serialised by struct_mutex. A natural solution would be to add a kref to i915_vma, but that requires decoupling the reference cycles, possibly by introducing a new i915_mm_pages object that is own by both obj->mm and vma->pages. However, we have not taken that route due to the overshadowing lmem/ttm discussions, and instead play a series of complicated games with trylocks to (hopefully) ensure that only one destruction path is called! v2: Add some commentary, and some helpers to reduce patch churn. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191004134015.13204-4-chris@chris-wilson.co.uk
2019-10-04 20:39:58 +07:00
spin_lock_irqsave(&uncore->lock, flags);
intel_uncore_posting_read_fw(uncore,
RING_HEAD(RENDER_RING_BASE));
drm/i915: Pull i915_vma_pin under the vm->mutex Replace the struct_mutex requirement for pinning the i915_vma with the local vm->mutex instead. Note that the vm->mutex is tainted by the shrinker (we require unbinding from inside fs-reclaim) and so we cannot allocate while holding that mutex. Instead we have to preallocate workers to do allocate and apply the PTE updates after we have we reserved their slot in the drm_mm (using fences to order the PTE writes with the GPU work and with later unbind). In adding the asynchronous vma binding, one subtle requirement is to avoid coupling the binding fence into the backing object->resv. That is the asynchronous binding only applies to the vma timeline itself and not to the pages as that is a more global timeline (the binding of one vma does not need to be ordered with another vma, nor does the implicit GEM fencing depend on a vma, only on writes to the backing store). Keeping the vma binding distinct from the backing store timelines is verified by a number of async gem_exec_fence and gem_exec_schedule tests. The way we do this is quite simple, we keep the fence for the vma binding separate and only wait on it as required, and never add it to the obj->resv itself. Another consequence in reducing the locking around the vma is the destruction of the vma is no longer globally serialised by struct_mutex. A natural solution would be to add a kref to i915_vma, but that requires decoupling the reference cycles, possibly by introducing a new i915_mm_pages object that is own by both obj->mm and vma->pages. However, we have not taken that route due to the overshadowing lmem/ttm discussions, and instead play a series of complicated games with trylocks to (hopefully) ensure that only one destruction path is called! v2: Add some commentary, and some helpers to reduce patch churn. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191004134015.13204-4-chris@chris-wilson.co.uk
2019-10-04 20:39:58 +07:00
spin_unlock_irqrestore(&uncore->lock, flags);
}
}
void intel_gt_chipset_flush(struct intel_gt *gt)
{
wmb();
if (INTEL_GEN(gt->i915) < 6)
intel_gtt_chipset_flush();
}
void intel_gt_driver_register(struct intel_gt *gt)
{
intel_rps_driver_register(&gt->rps);
debugfs_gt_register(gt);
}
static int intel_gt_init_scratch(struct intel_gt *gt, unsigned int size)
{
struct drm_i915_private *i915 = gt->i915;
struct drm_i915_gem_object *obj;
struct i915_vma *vma;
int ret;
obj = i915_gem_object_create_stolen(i915, size);
if (IS_ERR(obj))
obj = i915_gem_object_create_internal(i915, size);
if (IS_ERR(obj)) {
DRM_ERROR("Failed to allocate scratch page\n");
return PTR_ERR(obj);
}
vma = i915_vma_instance(obj, &gt->ggtt->vm, NULL);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto err_unref;
}
ret = i915_ggtt_pin(vma, 0, PIN_HIGH);
if (ret)
goto err_unref;
gt->scratch = i915_vma_make_unshrinkable(vma);
return 0;
err_unref:
i915_gem_object_put(obj);
return ret;
}
static void intel_gt_fini_scratch(struct intel_gt *gt)
{
i915_vma_unpin_and_release(&gt->scratch, 0);
}
static struct i915_address_space *kernel_vm(struct intel_gt *gt)
{
if (INTEL_PPGTT(gt->i915) > INTEL_PPGTT_ALIASING)
return &i915_ppgtt_create(gt)->vm;
else
return i915_vm_get(&gt->ggtt->vm);
}
static int __engines_record_defaults(struct intel_gt *gt)
{
struct i915_request *requests[I915_NUM_ENGINES] = {};
struct intel_engine_cs *engine;
enum intel_engine_id id;
int err = 0;
/*
* As we reset the gpu during very early sanitisation, the current
* register state on the GPU should reflect its defaults values.
* We load a context onto the hw (with restore-inhibit), then switch
* over to a second context to save that default register state. We
* can then prime every new context with that state so they all start
* from the same default HW values.
*/
for_each_engine(engine, gt, id) {
struct intel_renderstate so;
struct intel_context *ce;
struct i915_request *rq;
/* We must be able to switch to something! */
GEM_BUG_ON(!engine->kernel_context);
err = intel_renderstate_init(&so, engine);
if (err)
goto out;
ce = intel_context_create(engine);
if (IS_ERR(ce)) {
err = PTR_ERR(ce);
goto out;
}
rq = intel_context_create_request(ce);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
intel_context_put(ce);
goto out;
}
err = intel_engine_emit_ctx_wa(rq);
if (err)
goto err_rq;
err = intel_renderstate_emit(&so, rq);
if (err)
goto err_rq;
err_rq:
requests[id] = i915_request_get(rq);
i915_request_add(rq);
intel_renderstate_fini(&so);
if (err)
goto out;
}
/* Flush the default context image to memory, and enable powersaving. */
if (intel_gt_wait_for_idle(gt, I915_GEM_IDLE_TIMEOUT) == -ETIME) {
err = -EIO;
goto out;
}
for (id = 0; id < ARRAY_SIZE(requests); id++) {
struct i915_request *rq;
drm/i915/gt: Keep a no-frills swappable copy of the default context state We need to keep the default context state around to instantiate new contexts (aka golden rendercontext), and we also keep it pinned while the engine is active so that we can quickly reset a hanging context. However, the default contexts are large enough to merit keeping in swappable memory as opposed to kernel memory, so we store them inside shmemfs. Currently, we use the normal GEM objects to create the default context image, but we can throw away all but the shmemfs file. This greatly simplifies the tricky power management code which wants to run underneath the normal GT locking, and we definitely do not want to use any high level objects that may appear to recurse back into the GT. Though perhaps the primary advantage of the complex GEM object is that we aggressively cache the mapping, but here we are recreating the vm_area everytime time we unpark. At the worst, we add a lightweight cache, but first find a microbenchmark that is impacted. Having started to create some utility functions to make working with shmemfs objects easier, we can start putting them to wider use, where GEM objects are overkill, such as storing persistent error state. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Ramalingam C <ramalingam.c@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Matthew Auld <matthew.auld@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200429172429.6054-1-chris@chris-wilson.co.uk
2020-04-30 00:24:29 +07:00
struct file *state;
rq = requests[id];
if (!rq)
continue;
if (rq->fence.error) {
err = -EIO;
goto out;
}
GEM_BUG_ON(!test_bit(CONTEXT_ALLOC_BIT, &rq->context->flags));
drm/i915/gt: Keep a no-frills swappable copy of the default context state We need to keep the default context state around to instantiate new contexts (aka golden rendercontext), and we also keep it pinned while the engine is active so that we can quickly reset a hanging context. However, the default contexts are large enough to merit keeping in swappable memory as opposed to kernel memory, so we store them inside shmemfs. Currently, we use the normal GEM objects to create the default context image, but we can throw away all but the shmemfs file. This greatly simplifies the tricky power management code which wants to run underneath the normal GT locking, and we definitely do not want to use any high level objects that may appear to recurse back into the GT. Though perhaps the primary advantage of the complex GEM object is that we aggressively cache the mapping, but here we are recreating the vm_area everytime time we unpark. At the worst, we add a lightweight cache, but first find a microbenchmark that is impacted. Having started to create some utility functions to make working with shmemfs objects easier, we can start putting them to wider use, where GEM objects are overkill, such as storing persistent error state. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Ramalingam C <ramalingam.c@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Matthew Auld <matthew.auld@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200429172429.6054-1-chris@chris-wilson.co.uk
2020-04-30 00:24:29 +07:00
if (!rq->context->state)
continue;
drm/i915/gt: Keep a no-frills swappable copy of the default context state We need to keep the default context state around to instantiate new contexts (aka golden rendercontext), and we also keep it pinned while the engine is active so that we can quickly reset a hanging context. However, the default contexts are large enough to merit keeping in swappable memory as opposed to kernel memory, so we store them inside shmemfs. Currently, we use the normal GEM objects to create the default context image, but we can throw away all but the shmemfs file. This greatly simplifies the tricky power management code which wants to run underneath the normal GT locking, and we definitely do not want to use any high level objects that may appear to recurse back into the GT. Though perhaps the primary advantage of the complex GEM object is that we aggressively cache the mapping, but here we are recreating the vm_area everytime time we unpark. At the worst, we add a lightweight cache, but first find a microbenchmark that is impacted. Having started to create some utility functions to make working with shmemfs objects easier, we can start putting them to wider use, where GEM objects are overkill, such as storing persistent error state. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Ramalingam C <ramalingam.c@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Matthew Auld <matthew.auld@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200429172429.6054-1-chris@chris-wilson.co.uk
2020-04-30 00:24:29 +07:00
/* Keep a copy of the state's backing pages; free the obj */
state = shmem_create_from_object(rq->context->state->obj);
if (IS_ERR(state)) {
err = PTR_ERR(state);
goto out;
}
drm/i915/gt: Keep a no-frills swappable copy of the default context state We need to keep the default context state around to instantiate new contexts (aka golden rendercontext), and we also keep it pinned while the engine is active so that we can quickly reset a hanging context. However, the default contexts are large enough to merit keeping in swappable memory as opposed to kernel memory, so we store them inside shmemfs. Currently, we use the normal GEM objects to create the default context image, but we can throw away all but the shmemfs file. This greatly simplifies the tricky power management code which wants to run underneath the normal GT locking, and we definitely do not want to use any high level objects that may appear to recurse back into the GT. Though perhaps the primary advantage of the complex GEM object is that we aggressively cache the mapping, but here we are recreating the vm_area everytime time we unpark. At the worst, we add a lightweight cache, but first find a microbenchmark that is impacted. Having started to create some utility functions to make working with shmemfs objects easier, we can start putting them to wider use, where GEM objects are overkill, such as storing persistent error state. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Ramalingam C <ramalingam.c@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Matthew Auld <matthew.auld@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200429172429.6054-1-chris@chris-wilson.co.uk
2020-04-30 00:24:29 +07:00
rq->engine->default_state = state;
}
out:
/*
* If we have to abandon now, we expect the engines to be idle
* and ready to be torn-down. The quickest way we can accomplish
* this is by declaring ourselves wedged.
*/
if (err)
intel_gt_set_wedged(gt);
for (id = 0; id < ARRAY_SIZE(requests); id++) {
struct intel_context *ce;
struct i915_request *rq;
rq = requests[id];
if (!rq)
continue;
ce = rq->context;
i915_request_put(rq);
intel_context_put(ce);
}
return err;
}
static int __engines_verify_workarounds(struct intel_gt *gt)
{
struct intel_engine_cs *engine;
enum intel_engine_id id;
int err = 0;
if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
return 0;
for_each_engine(engine, gt, id) {
if (intel_engine_verify_workarounds(engine, "load"))
err = -EIO;
}
/* Flush and restore the kernel context for safety */
if (intel_gt_wait_for_idle(gt, I915_GEM_IDLE_TIMEOUT) == -ETIME)
err = -EIO;
return err;
}
static void __intel_gt_disable(struct intel_gt *gt)
{
intel_gt_set_wedged_on_init(gt);
intel_gt_suspend_prepare(gt);
intel_gt_suspend_late(gt);
GEM_BUG_ON(intel_gt_pm_is_awake(gt));
}
int intel_gt_init(struct intel_gt *gt)
{
int err;
err = i915_inject_probe_error(gt->i915, -ENODEV);
if (err)
return err;
/*
* This is just a security blanket to placate dragons.
* On some systems, we very sporadically observe that the first TLBs
* used by the CS may be stale, despite us poking the TLB reset. If
* we hold the forcewake during initialisation these problems
* just magically go away.
*/
intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
intel_gt_init_clock_frequency(gt);
err = intel_gt_init_scratch(gt, IS_GEN(gt->i915, 2) ? SZ_256K : SZ_4K);
if (err)
goto out_fw;
intel_gt_pm_init(gt);
gt->vm = kernel_vm(gt);
if (!gt->vm) {
err = -ENOMEM;
goto err_pm;
}
err = intel_engines_init(gt);
if (err)
goto err_engines;
err = intel_uc_init(&gt->uc);
if (err)
goto err_engines;
err = intel_gt_resume(gt);
if (err)
goto err_uc_init;
err = __engines_record_defaults(gt);
if (err)
goto err_gt;
err = __engines_verify_workarounds(gt);
if (err)
goto err_gt;
err = i915_inject_probe_error(gt->i915, -EIO);
if (err)
goto err_gt;
goto out_fw;
err_gt:
__intel_gt_disable(gt);
intel_uc_fini_hw(&gt->uc);
err_uc_init:
intel_uc_fini(&gt->uc);
err_engines:
intel_engines_release(gt);
i915_vm_put(fetch_and_zero(&gt->vm));
err_pm:
intel_gt_pm_fini(gt);
intel_gt_fini_scratch(gt);
out_fw:
if (err)
intel_gt_set_wedged_on_init(gt);
intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
return err;
}
void intel_gt_driver_remove(struct intel_gt *gt)
{
__intel_gt_disable(gt);
intel_uc_driver_remove(&gt->uc);
intel_engines_release(gt);
}
void intel_gt_driver_unregister(struct intel_gt *gt)
{
intel_rps_driver_unregister(&gt->rps);
/*
* Upon unregistering the device to prevent any new users, cancel
* all in-flight requests so that we can quickly unbind the active
* resources.
*/
intel_gt_set_wedged(gt);
}
void intel_gt_driver_release(struct intel_gt *gt)
{
struct i915_address_space *vm;
vm = fetch_and_zero(&gt->vm);
if (vm) /* FIXME being called twice on error paths :( */
i915_vm_put(vm);
intel_gt_pm_fini(gt);
intel_gt_fini_scratch(gt);
intel_gt_fini_buffer_pool(gt);
}
void intel_gt_driver_late_release(struct intel_gt *gt)
{
/* We need to wait for inflight RCU frees to release their grip */
rcu_barrier();
intel_uc_driver_late_release(&gt->uc);
intel_gt_fini_requests(gt);
intel_gt_fini_reset(gt);
intel_gt_fini_timelines(gt);
intel_engines_free(gt);
}