linux_dsm_epyc7002/drivers/gpu/drm/i915/gt/intel_timeline.c

588 lines
14 KiB
C
Raw Normal View History

/*
* SPDX-License-Identifier: MIT
*
* Copyright © 2016-2018 Intel Corporation
*/
#include "i915_drv.h"
#include "i915_active.h"
#include "i915_syncmap.h"
#include "intel_gt.h"
#include "intel_ring.h"
#include "intel_timeline.h"
#define ptr_set_bit(ptr, bit) ((typeof(ptr))((unsigned long)(ptr) | BIT(bit)))
#define ptr_test_bit(ptr, bit) ((unsigned long)(ptr) & BIT(bit))
#define CACHELINE_BITS 6
#define CACHELINE_FREE CACHELINE_BITS
struct intel_timeline_hwsp {
struct intel_gt *gt;
struct intel_gt_timelines *gt_timelines;
struct list_head free_link;
struct i915_vma *vma;
u64 free_bitmap;
};
static struct i915_vma *__hwsp_alloc(struct intel_gt *gt)
{
struct drm_i915_private *i915 = gt->i915;
struct drm_i915_gem_object *obj;
struct i915_vma *vma;
obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
if (IS_ERR(obj))
return ERR_CAST(obj);
i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC);
vma = i915_vma_instance(obj, &gt->ggtt->vm, NULL);
if (IS_ERR(vma))
i915_gem_object_put(obj);
return vma;
}
static struct i915_vma *
hwsp_alloc(struct intel_timeline *timeline, unsigned int *cacheline)
{
struct intel_gt_timelines *gt = &timeline->gt->timelines;
struct intel_timeline_hwsp *hwsp;
BUILD_BUG_ON(BITS_PER_TYPE(u64) * CACHELINE_BYTES > PAGE_SIZE);
spin_lock_irq(&gt->hwsp_lock);
/* hwsp_free_list only contains HWSP that have available cachelines */
hwsp = list_first_entry_or_null(&gt->hwsp_free_list,
typeof(*hwsp), free_link);
if (!hwsp) {
struct i915_vma *vma;
spin_unlock_irq(&gt->hwsp_lock);
hwsp = kmalloc(sizeof(*hwsp), GFP_KERNEL);
if (!hwsp)
return ERR_PTR(-ENOMEM);
vma = __hwsp_alloc(timeline->gt);
if (IS_ERR(vma)) {
kfree(hwsp);
return vma;
}
vma->private = hwsp;
hwsp->gt = timeline->gt;
hwsp->vma = vma;
hwsp->free_bitmap = ~0ull;
hwsp->gt_timelines = gt;
spin_lock_irq(&gt->hwsp_lock);
list_add(&hwsp->free_link, &gt->hwsp_free_list);
}
GEM_BUG_ON(!hwsp->free_bitmap);
*cacheline = __ffs64(hwsp->free_bitmap);
hwsp->free_bitmap &= ~BIT_ULL(*cacheline);
if (!hwsp->free_bitmap)
list_del(&hwsp->free_link);
spin_unlock_irq(&gt->hwsp_lock);
GEM_BUG_ON(hwsp->vma->private != hwsp);
return hwsp->vma;
}
static void __idle_hwsp_free(struct intel_timeline_hwsp *hwsp, int cacheline)
{
struct intel_gt_timelines *gt = hwsp->gt_timelines;
unsigned long flags;
spin_lock_irqsave(&gt->hwsp_lock, flags);
/* As a cacheline becomes available, publish the HWSP on the freelist */
if (!hwsp->free_bitmap)
list_add_tail(&hwsp->free_link, &gt->hwsp_free_list);
GEM_BUG_ON(cacheline >= BITS_PER_TYPE(hwsp->free_bitmap));
hwsp->free_bitmap |= BIT_ULL(cacheline);
/* And if no one is left using it, give the page back to the system */
if (hwsp->free_bitmap == ~0ull) {
i915_vma_put(hwsp->vma);
list_del(&hwsp->free_link);
kfree(hwsp);
}
spin_unlock_irqrestore(&gt->hwsp_lock, flags);
}
drm/i915/gt: Mark timeline->cacheline as destroyed after rcu grace period Since we take advantage of RCU for some i915_active objects, like the intel_timeline_cacheline, we need to delay the i915_active_fini until after the RCU grace period and we perform the kfree -- that is until after all RCU protected readers. <3> [108.204873] ODEBUG: assert_init not available (active state 0) object type: i915_active hint: __cacheline_active+0x0/0x80 [i915] <4> [108.207377] WARNING: CPU: 3 PID: 2342 at lib/debugobjects.c:488 debug_print_object+0x67/0x90 <4> [108.207400] Modules linked in: vgem snd_hda_codec_hdmi x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul snd_hda_intel ghash_clmulni_intel snd_intel_dspcfg snd_hda_codec ax88179_178a snd_hwdep usbnet btusb snd_hda_core btrtl mii btbcm btintel snd_pcm bluetooth ecdh_generic ecc i915 i2c_hid pinctrl_sunrisepoint pinctrl_intel intel_lpss_pci prime_numbers <4> [108.207587] CPU: 3 PID: 2342 Comm: gem_exec_parall Tainted: G U 5.6.0-rc6-CI-Patchwork_17047+ #1 <4> [108.207609] Hardware name: Google Soraka/Soraka, BIOS MrChromebox-4.10 08/25/2019 <4> [108.207639] RIP: 0010:debug_print_object+0x67/0x90 <4> [108.207668] Code: 83 c2 01 8b 4b 14 4c 8b 45 00 89 15 87 d2 8a 02 8b 53 10 4c 89 e6 48 c7 c7 38 2b 32 82 48 8b 14 d5 80 2f 07 82 e8 49 d5 b7 ff <0f> 0b 5b 83 05 c3 f6 22 01 01 5d 41 5c c3 83 05 b8 f6 22 01 01 c3 <4> [108.207692] RSP: 0018:ffffc90000e7f890 EFLAGS: 00010282 <4> [108.207723] RAX: 0000000000000000 RBX: ffffc90000e7f8b0 RCX: 0000000000000001 <4> [108.207747] RDX: 0000000080000001 RSI: ffff88817ada8cb8 RDI: 00000000ffffffff <4> [108.207770] RBP: ffffffffa0341cc0 R08: ffff88816b5a8948 R09: 0000000000000000 <4> [108.207792] R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82322d54 <4> [108.207814] R13: ffffffffa0341cc0 R14: ffffffff83df9568 R15: ffff88816064f400 <4> [108.207839] FS: 00007f437d753700(0000) GS:ffff88817ad80000(0000) knlGS:0000000000000000 <4> [108.207863] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4> [108.207887] CR2: 00007f2ad1fb5000 CR3: 00000001725d8004 CR4: 00000000003606e0 <4> [108.207907] Call Trace: <4> [108.207959] debug_object_assert_init+0x15c/0x180 <4> [108.208475] ? i915_active_acquire_if_busy+0x10/0x50 [i915] <4> [108.208513] ? rcu_read_lock_held+0x4d/0x60 <4> [108.208970] i915_active_acquire_if_busy+0x10/0x50 [i915] <4> [108.209380] intel_timeline_read_hwsp+0x81/0x540 [i915] <4> [108.210262] __emit_semaphore_wait+0x45/0x1b0 [i915] <4> [108.210726] ? i915_request_await_dma_fence+0x143/0x560 [i915] <4> [108.211156] i915_request_await_dma_fence+0x28a/0x560 [i915] <4> [108.211633] i915_request_await_object+0x24a/0x3f0 [i915] <4> [108.212102] eb_submit.isra.47+0x58f/0x920 [i915] <4> [108.212622] i915_gem_do_execbuffer+0x1706/0x2c70 [i915] <4> [108.213071] ? i915_gem_execbuffer2_ioctl+0xc0/0x470 [i915] Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Matthew Auld <matthew.auld@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200323092841.22240-1-chris@chris-wilson.co.uk
2020-03-23 16:28:34 +07:00
static void __rcu_cacheline_free(struct rcu_head *rcu)
{
struct intel_timeline_cacheline *cl =
container_of(rcu, typeof(*cl), rcu);
i915_active_fini(&cl->active);
kfree(cl);
}
static void __idle_cacheline_free(struct intel_timeline_cacheline *cl)
{
GEM_BUG_ON(!i915_active_is_idle(&cl->active));
i915_gem_object_unpin_map(cl->hwsp->vma->obj);
i915_vma_put(cl->hwsp->vma);
__idle_hwsp_free(cl->hwsp, ptr_unmask_bits(cl->vaddr, CACHELINE_BITS));
drm/i915/gt: Mark timeline->cacheline as destroyed after rcu grace period Since we take advantage of RCU for some i915_active objects, like the intel_timeline_cacheline, we need to delay the i915_active_fini until after the RCU grace period and we perform the kfree -- that is until after all RCU protected readers. <3> [108.204873] ODEBUG: assert_init not available (active state 0) object type: i915_active hint: __cacheline_active+0x0/0x80 [i915] <4> [108.207377] WARNING: CPU: 3 PID: 2342 at lib/debugobjects.c:488 debug_print_object+0x67/0x90 <4> [108.207400] Modules linked in: vgem snd_hda_codec_hdmi x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul snd_hda_intel ghash_clmulni_intel snd_intel_dspcfg snd_hda_codec ax88179_178a snd_hwdep usbnet btusb snd_hda_core btrtl mii btbcm btintel snd_pcm bluetooth ecdh_generic ecc i915 i2c_hid pinctrl_sunrisepoint pinctrl_intel intel_lpss_pci prime_numbers <4> [108.207587] CPU: 3 PID: 2342 Comm: gem_exec_parall Tainted: G U 5.6.0-rc6-CI-Patchwork_17047+ #1 <4> [108.207609] Hardware name: Google Soraka/Soraka, BIOS MrChromebox-4.10 08/25/2019 <4> [108.207639] RIP: 0010:debug_print_object+0x67/0x90 <4> [108.207668] Code: 83 c2 01 8b 4b 14 4c 8b 45 00 89 15 87 d2 8a 02 8b 53 10 4c 89 e6 48 c7 c7 38 2b 32 82 48 8b 14 d5 80 2f 07 82 e8 49 d5 b7 ff <0f> 0b 5b 83 05 c3 f6 22 01 01 5d 41 5c c3 83 05 b8 f6 22 01 01 c3 <4> [108.207692] RSP: 0018:ffffc90000e7f890 EFLAGS: 00010282 <4> [108.207723] RAX: 0000000000000000 RBX: ffffc90000e7f8b0 RCX: 0000000000000001 <4> [108.207747] RDX: 0000000080000001 RSI: ffff88817ada8cb8 RDI: 00000000ffffffff <4> [108.207770] RBP: ffffffffa0341cc0 R08: ffff88816b5a8948 R09: 0000000000000000 <4> [108.207792] R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82322d54 <4> [108.207814] R13: ffffffffa0341cc0 R14: ffffffff83df9568 R15: ffff88816064f400 <4> [108.207839] FS: 00007f437d753700(0000) GS:ffff88817ad80000(0000) knlGS:0000000000000000 <4> [108.207863] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4> [108.207887] CR2: 00007f2ad1fb5000 CR3: 00000001725d8004 CR4: 00000000003606e0 <4> [108.207907] Call Trace: <4> [108.207959] debug_object_assert_init+0x15c/0x180 <4> [108.208475] ? i915_active_acquire_if_busy+0x10/0x50 [i915] <4> [108.208513] ? rcu_read_lock_held+0x4d/0x60 <4> [108.208970] i915_active_acquire_if_busy+0x10/0x50 [i915] <4> [108.209380] intel_timeline_read_hwsp+0x81/0x540 [i915] <4> [108.210262] __emit_semaphore_wait+0x45/0x1b0 [i915] <4> [108.210726] ? i915_request_await_dma_fence+0x143/0x560 [i915] <4> [108.211156] i915_request_await_dma_fence+0x28a/0x560 [i915] <4> [108.211633] i915_request_await_object+0x24a/0x3f0 [i915] <4> [108.212102] eb_submit.isra.47+0x58f/0x920 [i915] <4> [108.212622] i915_gem_do_execbuffer+0x1706/0x2c70 [i915] <4> [108.213071] ? i915_gem_execbuffer2_ioctl+0xc0/0x470 [i915] Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Matthew Auld <matthew.auld@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200323092841.22240-1-chris@chris-wilson.co.uk
2020-03-23 16:28:34 +07:00
call_rcu(&cl->rcu, __rcu_cacheline_free);
}
__i915_active_call
static void __cacheline_retire(struct i915_active *active)
{
struct intel_timeline_cacheline *cl =
container_of(active, typeof(*cl), active);
i915_vma_unpin(cl->hwsp->vma);
if (ptr_test_bit(cl->vaddr, CACHELINE_FREE))
__idle_cacheline_free(cl);
}
static int __cacheline_active(struct i915_active *active)
{
struct intel_timeline_cacheline *cl =
container_of(active, typeof(*cl), active);
__i915_vma_pin(cl->hwsp->vma);
return 0;
}
static struct intel_timeline_cacheline *
cacheline_alloc(struct intel_timeline_hwsp *hwsp, unsigned int cacheline)
{
struct intel_timeline_cacheline *cl;
void *vaddr;
GEM_BUG_ON(cacheline >= BIT(CACHELINE_BITS));
cl = kmalloc(sizeof(*cl), GFP_KERNEL);
if (!cl)
return ERR_PTR(-ENOMEM);
vaddr = i915_gem_object_pin_map(hwsp->vma->obj, I915_MAP_WB);
if (IS_ERR(vaddr)) {
kfree(cl);
return ERR_CAST(vaddr);
}
i915_vma_get(hwsp->vma);
cl->hwsp = hwsp;
cl->vaddr = page_pack_bits(vaddr, cacheline);
i915_active_init(&cl->active, __cacheline_active, __cacheline_retire);
return cl;
}
static void cacheline_acquire(struct intel_timeline_cacheline *cl)
{
if (cl)
i915_active_acquire(&cl->active);
}
static void cacheline_release(struct intel_timeline_cacheline *cl)
{
if (cl)
i915_active_release(&cl->active);
}
static void cacheline_free(struct intel_timeline_cacheline *cl)
{
if (!i915_active_acquire_if_busy(&cl->active)) {
__idle_cacheline_free(cl);
return;
}
GEM_BUG_ON(ptr_test_bit(cl->vaddr, CACHELINE_FREE));
cl->vaddr = ptr_set_bit(cl->vaddr, CACHELINE_FREE);
i915_active_release(&cl->active);
}
int intel_timeline_init(struct intel_timeline *timeline,
struct intel_gt *gt,
struct i915_vma *hwsp)
{
void *vaddr;
kref_init(&timeline->kref);
atomic_set(&timeline->pin_count, 0);
timeline->gt = gt;
timeline->has_initial_breadcrumb = !hwsp;
timeline->hwsp_cacheline = NULL;
if (!hwsp) {
struct intel_timeline_cacheline *cl;
unsigned int cacheline;
hwsp = hwsp_alloc(timeline, &cacheline);
if (IS_ERR(hwsp))
return PTR_ERR(hwsp);
cl = cacheline_alloc(hwsp->private, cacheline);
if (IS_ERR(cl)) {
__idle_hwsp_free(hwsp->private, cacheline);
return PTR_ERR(cl);
}
timeline->hwsp_cacheline = cl;
timeline->hwsp_offset = cacheline * CACHELINE_BYTES;
vaddr = page_mask_bits(cl->vaddr);
} else {
timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB);
if (IS_ERR(vaddr))
return PTR_ERR(vaddr);
}
timeline->hwsp_seqno =
memset(vaddr + timeline->hwsp_offset, 0, CACHELINE_BYTES);
timeline->hwsp_ggtt = i915_vma_get(hwsp);
GEM_BUG_ON(timeline->hwsp_offset >= hwsp->size);
timeline->fence_context = dma_fence_context_alloc(1);
mutex_init(&timeline->mutex);
drm/i915: Serialise i915_active_fence_set() with itself The expected downside to commit 58b4c1a07ada ("drm/i915: Reduce nested prepare_remote_context() to a trylock") was that it would need to return -EAGAIN to userspace in order to resolve potential mutex inversion. Such an unsightly round trip is unnecessary if we could atomically insert a barrier into the i915_active_fence, so make it happen. Currently, we use the timeline->mutex (or some other named outer lock) to order insertion into the i915_active_fence (and so individual nodes of i915_active). Inside __i915_active_fence_set, we only need then serialise with the interrupt handler in order to claim the timeline for ourselves. However, if we remove the outer lock, we need to ensure the order is intact between not only multiple threads trying to insert themselves into the timeline, but also with the interrupt handler completing the previous occupant. We use xchg() on insert so that we have an ordered sequence of insertions (and each caller knows the previous fence on which to wait, preserving the chain of all fences in the timeline), but we then have to cmpxchg() in the interrupt handler to avoid overwriting the new occupant. The only nasty side-effect is having to temporarily strip off the RCU-annotations to apply the atomic operations, otherwise the rules are much more conventional! Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=112402 Fixes: 58b4c1a07ada ("drm/i915: Reduce nested prepare_remote_context() to a trylock") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191127134527.3438410-1-chris@chris-wilson.co.uk
2019-11-27 20:45:27 +07:00
INIT_ACTIVE_FENCE(&timeline->last_request);
INIT_LIST_HEAD(&timeline->requests);
i915_syncmap_init(&timeline->sync);
return 0;
}
void intel_gt_init_timelines(struct intel_gt *gt)
{
struct intel_gt_timelines *timelines = &gt->timelines;
spin_lock_init(&timelines->lock);
INIT_LIST_HEAD(&timelines->active_list);
spin_lock_init(&timelines->hwsp_lock);
INIT_LIST_HEAD(&timelines->hwsp_free_list);
}
void intel_timeline_fini(struct intel_timeline *timeline)
{
GEM_BUG_ON(atomic_read(&timeline->pin_count));
GEM_BUG_ON(!list_empty(&timeline->requests));
drm/i915/gt: Schedule request retirement when timeline idles The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") is that it disables RC6 while Skylake (and friends) is active, and we do not consider the GPU idle until all outstanding requests have been retired and the engine switched over to the kernel context. If userspace is idle, this task falls onto our background idle worker, which only runs roughly once a second, meaning that userspace has to have been idle for a couple of seconds before we enable RC6 again. Naturally, this causes us to consume considerably more energy than before as powersaving is effectively disabled while a display server (here's looking at you Xorg) is running. As execlists will get a completion event as each context is completed, we can use this interrupt to queue a retire worker bound to this engine to cleanup idle timelines. We will then immediately notice the idle engine (without userspace intervention or the aid of the background retire worker) and start parking the GPU. Thus during light workloads, we will do much more work to idle the GPU faster... Hopefully with commensurate power saving! v2: Watch context completions and only look at those local to the engine when retiring to reduce the amount of excess work we perform. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=112315 References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA") References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191125105858.1718307-3-chris@chris-wilson.co.uk
2019-11-25 17:58:58 +07:00
GEM_BUG_ON(timeline->retire);
if (timeline->hwsp_cacheline)
cacheline_free(timeline->hwsp_cacheline);
else
i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
i915_vma_put(timeline->hwsp_ggtt);
}
struct intel_timeline *
intel_timeline_create(struct intel_gt *gt, struct i915_vma *global_hwsp)
{
struct intel_timeline *timeline;
int err;
timeline = kzalloc(sizeof(*timeline), GFP_KERNEL);
if (!timeline)
return ERR_PTR(-ENOMEM);
err = intel_timeline_init(timeline, gt, global_hwsp);
if (err) {
kfree(timeline);
return ERR_PTR(err);
}
return timeline;
}
int intel_timeline_pin(struct intel_timeline *tl)
{
int err;
if (atomic_add_unless(&tl->pin_count, 1, 0))
return 0;
err = i915_ggtt_pin(tl->hwsp_ggtt, 0, PIN_HIGH);
if (err)
return err;
tl->hwsp_offset =
i915_ggtt_offset(tl->hwsp_ggtt) +
offset_in_page(tl->hwsp_offset);
cacheline_acquire(tl->hwsp_cacheline);
if (atomic_fetch_inc(&tl->pin_count)) {
cacheline_release(tl->hwsp_cacheline);
__i915_vma_unpin(tl->hwsp_ggtt);
}
return 0;
}
void intel_timeline_enter(struct intel_timeline *tl)
{
struct intel_gt_timelines *timelines = &tl->gt->timelines;
drm/i915/gt: Close race between engine_park and intel_gt_retire_requests The general concept was that intel_timeline.active_count was locked by the intel_timeline.mutex. The exception was for power management, where the engine->kernel_context->timeline could be manipulated under the global wakeref.mutex. This was quite solid, as we always manipulated the timeline only while we held an engine wakeref. And then we started retiring requests outside of struct_mutex, only using the timelines.active_list and the timeline->mutex. There we started manipulating intel_timeline.active_count outside of an engine wakeref, and so introduced a race between __engine_park() and intel_gt_retire_requests(), a race that could result in the engine->kernel_context not being added to the active timelines and so losing requests, which caused us to keep the system permanently powered up [and unloadable]. The race would be easy to close if we could take the engine wakeref for the timeline before we retire -- except timelines are not bound to any engine and so we would need to keep all active engines awake. The alternative is to guard intel_timeline_enter/intel_timeline_exit for use outside of the timeline->mutex. Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191120165514.3955081-1-chris@chris-wilson.co.uk
2019-11-20 23:55:13 +07:00
/*
* Pretend we are serialised by the timeline->mutex.
*
* While generally true, there are a few exceptions to the rule
* for the engine->kernel_context being used to manage power
* transitions. As the engine_park may be called from under any
* timeline, it uses the power mutex as a global serialisation
* lock to prevent any other request entering its timeline.
*
* The rule is generally tl->mutex, otherwise engine->wakeref.mutex.
*
* However, intel_gt_retire_request() does not know which engine
* it is retiring along and so cannot partake in the engine-pm
* barrier, and there we use the tl->active_count as a means to
* pin the timeline in the active_list while the locks are dropped.
* Ergo, as that is outside of the engine-pm barrier, we need to
* use atomic to manipulate tl->active_count.
*/
lockdep_assert_held(&tl->mutex);
drm/i915/gt: Close race between engine_park and intel_gt_retire_requests The general concept was that intel_timeline.active_count was locked by the intel_timeline.mutex. The exception was for power management, where the engine->kernel_context->timeline could be manipulated under the global wakeref.mutex. This was quite solid, as we always manipulated the timeline only while we held an engine wakeref. And then we started retiring requests outside of struct_mutex, only using the timelines.active_list and the timeline->mutex. There we started manipulating intel_timeline.active_count outside of an engine wakeref, and so introduced a race between __engine_park() and intel_gt_retire_requests(), a race that could result in the engine->kernel_context not being added to the active timelines and so losing requests, which caused us to keep the system permanently powered up [and unloadable]. The race would be easy to close if we could take the engine wakeref for the timeline before we retire -- except timelines are not bound to any engine and so we would need to keep all active engines awake. The alternative is to guard intel_timeline_enter/intel_timeline_exit for use outside of the timeline->mutex. Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191120165514.3955081-1-chris@chris-wilson.co.uk
2019-11-20 23:55:13 +07:00
if (atomic_add_unless(&tl->active_count, 1, 0))
return;
spin_lock(&timelines->lock);
drm/i915/gt: Close race between engine_park and intel_gt_retire_requests The general concept was that intel_timeline.active_count was locked by the intel_timeline.mutex. The exception was for power management, where the engine->kernel_context->timeline could be manipulated under the global wakeref.mutex. This was quite solid, as we always manipulated the timeline only while we held an engine wakeref. And then we started retiring requests outside of struct_mutex, only using the timelines.active_list and the timeline->mutex. There we started manipulating intel_timeline.active_count outside of an engine wakeref, and so introduced a race between __engine_park() and intel_gt_retire_requests(), a race that could result in the engine->kernel_context not being added to the active timelines and so losing requests, which caused us to keep the system permanently powered up [and unloadable]. The race would be easy to close if we could take the engine wakeref for the timeline before we retire -- except timelines are not bound to any engine and so we would need to keep all active engines awake. The alternative is to guard intel_timeline_enter/intel_timeline_exit for use outside of the timeline->mutex. Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191120165514.3955081-1-chris@chris-wilson.co.uk
2019-11-20 23:55:13 +07:00
if (!atomic_fetch_inc(&tl->active_count))
list_add_tail(&tl->link, &timelines->active_list);
spin_unlock(&timelines->lock);
}
void intel_timeline_exit(struct intel_timeline *tl)
{
struct intel_gt_timelines *timelines = &tl->gt->timelines;
drm/i915/gt: Close race between engine_park and intel_gt_retire_requests The general concept was that intel_timeline.active_count was locked by the intel_timeline.mutex. The exception was for power management, where the engine->kernel_context->timeline could be manipulated under the global wakeref.mutex. This was quite solid, as we always manipulated the timeline only while we held an engine wakeref. And then we started retiring requests outside of struct_mutex, only using the timelines.active_list and the timeline->mutex. There we started manipulating intel_timeline.active_count outside of an engine wakeref, and so introduced a race between __engine_park() and intel_gt_retire_requests(), a race that could result in the engine->kernel_context not being added to the active timelines and so losing requests, which caused us to keep the system permanently powered up [and unloadable]. The race would be easy to close if we could take the engine wakeref for the timeline before we retire -- except timelines are not bound to any engine and so we would need to keep all active engines awake. The alternative is to guard intel_timeline_enter/intel_timeline_exit for use outside of the timeline->mutex. Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191120165514.3955081-1-chris@chris-wilson.co.uk
2019-11-20 23:55:13 +07:00
/* See intel_timeline_enter() */
lockdep_assert_held(&tl->mutex);
drm/i915/gt: Close race between engine_park and intel_gt_retire_requests The general concept was that intel_timeline.active_count was locked by the intel_timeline.mutex. The exception was for power management, where the engine->kernel_context->timeline could be manipulated under the global wakeref.mutex. This was quite solid, as we always manipulated the timeline only while we held an engine wakeref. And then we started retiring requests outside of struct_mutex, only using the timelines.active_list and the timeline->mutex. There we started manipulating intel_timeline.active_count outside of an engine wakeref, and so introduced a race between __engine_park() and intel_gt_retire_requests(), a race that could result in the engine->kernel_context not being added to the active timelines and so losing requests, which caused us to keep the system permanently powered up [and unloadable]. The race would be easy to close if we could take the engine wakeref for the timeline before we retire -- except timelines are not bound to any engine and so we would need to keep all active engines awake. The alternative is to guard intel_timeline_enter/intel_timeline_exit for use outside of the timeline->mutex. Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191120165514.3955081-1-chris@chris-wilson.co.uk
2019-11-20 23:55:13 +07:00
GEM_BUG_ON(!atomic_read(&tl->active_count));
if (atomic_add_unless(&tl->active_count, -1, 1))
return;
spin_lock(&timelines->lock);
drm/i915/gt: Close race between engine_park and intel_gt_retire_requests The general concept was that intel_timeline.active_count was locked by the intel_timeline.mutex. The exception was for power management, where the engine->kernel_context->timeline could be manipulated under the global wakeref.mutex. This was quite solid, as we always manipulated the timeline only while we held an engine wakeref. And then we started retiring requests outside of struct_mutex, only using the timelines.active_list and the timeline->mutex. There we started manipulating intel_timeline.active_count outside of an engine wakeref, and so introduced a race between __engine_park() and intel_gt_retire_requests(), a race that could result in the engine->kernel_context not being added to the active timelines and so losing requests, which caused us to keep the system permanently powered up [and unloadable]. The race would be easy to close if we could take the engine wakeref for the timeline before we retire -- except timelines are not bound to any engine and so we would need to keep all active engines awake. The alternative is to guard intel_timeline_enter/intel_timeline_exit for use outside of the timeline->mutex. Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Matthew Auld <matthew.auld@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191120165514.3955081-1-chris@chris-wilson.co.uk
2019-11-20 23:55:13 +07:00
if (atomic_dec_and_test(&tl->active_count))
list_del(&tl->link);
spin_unlock(&timelines->lock);
/*
* Since this timeline is idle, all bariers upon which we were waiting
* must also be complete and so we can discard the last used barriers
* without loss of information.
*/
i915_syncmap_free(&tl->sync);
}
static u32 timeline_advance(struct intel_timeline *tl)
{
GEM_BUG_ON(!atomic_read(&tl->pin_count));
GEM_BUG_ON(tl->seqno & tl->has_initial_breadcrumb);
return tl->seqno += 1 + tl->has_initial_breadcrumb;
}
static void timeline_rollback(struct intel_timeline *tl)
{
tl->seqno -= 1 + tl->has_initial_breadcrumb;
}
static noinline int
__intel_timeline_get_seqno(struct intel_timeline *tl,
struct i915_request *rq,
u32 *seqno)
{
struct intel_timeline_cacheline *cl;
unsigned int cacheline;
struct i915_vma *vma;
void *vaddr;
int err;
might_lock(&tl->gt->ggtt->vm.mutex);
/*
* If there is an outstanding GPU reference to this cacheline,
* such as it being sampled by a HW semaphore on another timeline,
* we cannot wraparound our seqno value (the HW semaphore does
* a strict greater-than-or-equals compare, not i915_seqno_passed).
* So if the cacheline is still busy, we must detach ourselves
* from it and leave it inflight alongside its users.
*
* However, if nobody is watching and we can guarantee that nobody
* will, we could simply reuse the same cacheline.
*
* if (i915_active_request_is_signaled(&tl->last_request) &&
* i915_active_is_signaled(&tl->hwsp_cacheline->active))
* return 0;
*
* That seems unlikely for a busy timeline that needed to wrap in
* the first place, so just replace the cacheline.
*/
vma = hwsp_alloc(tl, &cacheline);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
goto err_rollback;
}
err = i915_ggtt_pin(vma, 0, PIN_HIGH);
if (err) {
__idle_hwsp_free(vma->private, cacheline);
goto err_rollback;
}
cl = cacheline_alloc(vma->private, cacheline);
if (IS_ERR(cl)) {
err = PTR_ERR(cl);
__idle_hwsp_free(vma->private, cacheline);
goto err_unpin;
}
GEM_BUG_ON(cl->hwsp->vma != vma);
/*
* Attach the old cacheline to the current request, so that we only
* free it after the current request is retired, which ensures that
* all writes into the cacheline from previous requests are complete.
*/
err = i915_active_ref(&tl->hwsp_cacheline->active, tl, &rq->fence);
if (err)
goto err_cacheline;
cacheline_release(tl->hwsp_cacheline); /* ownership now xfered to rq */
cacheline_free(tl->hwsp_cacheline);
i915_vma_unpin(tl->hwsp_ggtt); /* binding kept alive by old cacheline */
i915_vma_put(tl->hwsp_ggtt);
tl->hwsp_ggtt = i915_vma_get(vma);
vaddr = page_mask_bits(cl->vaddr);
tl->hwsp_offset = cacheline * CACHELINE_BYTES;
tl->hwsp_seqno =
memset(vaddr + tl->hwsp_offset, 0, CACHELINE_BYTES);
tl->hwsp_offset += i915_ggtt_offset(vma);
cacheline_acquire(cl);
tl->hwsp_cacheline = cl;
*seqno = timeline_advance(tl);
GEM_BUG_ON(i915_seqno_passed(*tl->hwsp_seqno, *seqno));
return 0;
err_cacheline:
cacheline_free(cl);
err_unpin:
i915_vma_unpin(vma);
err_rollback:
timeline_rollback(tl);
return err;
}
int intel_timeline_get_seqno(struct intel_timeline *tl,
struct i915_request *rq,
u32 *seqno)
{
*seqno = timeline_advance(tl);
/* Replace the HWSP on wraparound for HW semaphores */
if (unlikely(!*seqno && tl->hwsp_cacheline))
return __intel_timeline_get_seqno(tl, rq, seqno);
return 0;
}
static int cacheline_ref(struct intel_timeline_cacheline *cl,
struct i915_request *rq)
{
drm/i915: Mark i915_request.timeline as a volatile, rcu pointer The request->timeline is only valid until the request is retired (i.e. before it is completed). Upon retiring the request, the context may be unpinned and freed, and along with it the timeline may be freed. We therefore need to be very careful when chasing rq->timeline that the pointer does not disappear beneath us. The vast majority of users are in a protected context, either during request construction or retirement, where the timeline->mutex is held and the timeline cannot disappear. It is those few off the beaten path (where we access a second timeline) that need extra scrutiny -- to be added in the next patch after first adding the warnings about dangerous access. One complication, where we cannot use the timeline->mutex itself, is during request submission onto hardware (under spinlocks). Here, we want to check on the timeline to finalize the breadcrumb, and so we need to impose a second rule to ensure that the request->timeline is indeed valid. As we are submitting the request, it's context and timeline must be pinned, as it will be used by the hardware. Since it is pinned, we know the request->timeline must still be valid, and we cannot submit the idle barrier until after we release the engine->active.lock, ergo while submitting and holding that spinlock, a second thread cannot release the timeline. v2: Don't be lazy inside selftests; hold the timeline->mutex for as long as we need it, and tidy up acquiring the timeline with a bit of refactoring (i915_active_add_request) Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190919111912.21631-1-chris@chris-wilson.co.uk
2019-09-19 18:19:10 +07:00
return i915_active_add_request(&cl->active, rq);
}
int intel_timeline_read_hwsp(struct i915_request *from,
struct i915_request *to,
u32 *hwsp)
{
struct intel_timeline_cacheline *cl;
int err;
GEM_BUG_ON(!rcu_access_pointer(from->hwsp_cacheline));
rcu_read_lock();
cl = rcu_dereference(from->hwsp_cacheline);
if (unlikely(!i915_active_acquire_if_busy(&cl->active)))
goto unlock; /* seqno wrapped and completed! */
if (unlikely(i915_request_completed(from)))
goto release;
rcu_read_unlock();
err = cacheline_ref(cl, to);
if (err)
goto out;
*hwsp = i915_ggtt_offset(cl->hwsp->vma) +
ptr_unmask_bits(cl->vaddr, CACHELINE_BITS) * CACHELINE_BYTES;
out:
i915_active_release(&cl->active);
return err;
release:
i915_active_release(&cl->active);
unlock:
rcu_read_unlock();
return 1;
}
void intel_timeline_unpin(struct intel_timeline *tl)
{
GEM_BUG_ON(!atomic_read(&tl->pin_count));
if (!atomic_dec_and_test(&tl->pin_count))
return;
cacheline_release(tl->hwsp_cacheline);
__i915_vma_unpin(tl->hwsp_ggtt);
}
void __intel_timeline_free(struct kref *kref)
{
struct intel_timeline *timeline =
container_of(kref, typeof(*timeline), kref);
intel_timeline_fini(timeline);
drm/i915: Mark i915_request.timeline as a volatile, rcu pointer The request->timeline is only valid until the request is retired (i.e. before it is completed). Upon retiring the request, the context may be unpinned and freed, and along with it the timeline may be freed. We therefore need to be very careful when chasing rq->timeline that the pointer does not disappear beneath us. The vast majority of users are in a protected context, either during request construction or retirement, where the timeline->mutex is held and the timeline cannot disappear. It is those few off the beaten path (where we access a second timeline) that need extra scrutiny -- to be added in the next patch after first adding the warnings about dangerous access. One complication, where we cannot use the timeline->mutex itself, is during request submission onto hardware (under spinlocks). Here, we want to check on the timeline to finalize the breadcrumb, and so we need to impose a second rule to ensure that the request->timeline is indeed valid. As we are submitting the request, it's context and timeline must be pinned, as it will be used by the hardware. Since it is pinned, we know the request->timeline must still be valid, and we cannot submit the idle barrier until after we release the engine->active.lock, ergo while submitting and holding that spinlock, a second thread cannot release the timeline. v2: Don't be lazy inside selftests; hold the timeline->mutex for as long as we need it, and tidy up acquiring the timeline with a bit of refactoring (i915_active_add_request) Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190919111912.21631-1-chris@chris-wilson.co.uk
2019-09-19 18:19:10 +07:00
kfree_rcu(timeline, rcu);
}
void intel_gt_fini_timelines(struct intel_gt *gt)
{
struct intel_gt_timelines *timelines = &gt->timelines;
GEM_BUG_ON(!list_empty(&timelines->active_list));
GEM_BUG_ON(!list_empty(&timelines->hwsp_free_list));
}
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "gt/selftests/mock_timeline.c"
#include "gt/selftest_timeline.c"
#endif