linux_dsm_epyc7002/drivers/gpu/drm/i915/i915_debugfs.c


/*
* Copyright © 2008 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
* Keith Packard <keithp@keithp.com>
*
*/
#include <linux/sched/mm.h>
#include <linux/sort.h>
#include <drm/drm_debugfs.h>
#include "gem/i915_gem_context.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_gt_requests.h"
#include "gt/intel_reset.h"
#include "gt/intel_rc6.h"
#include "gt/intel_rps.h"
#include "i915_debugfs.h"
#include "i915_debugfs_params.h"
#include "i915_irq.h"
#include "i915_trace.h"
#include "intel_pm.h"
#include "intel_sideband.h"
static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
{
return to_i915(node->minor->dev);
}
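/*
 * i915_capabilities: dump the PCH type, static and runtime device info,
 * driver caps and a snapshot of the current module parameters.
 */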
static int i915_capabilities(struct seq_file *m, void *data)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct drm_printer p = drm_seq_file_printer(m);
seq_printf(m, "pch: %d\n", INTEL_PCH_TYPE(i915));
intel_device_info_print_static(INTEL_INFO(i915), &p);
intel_device_info_print_runtime(RUNTIME_INFO(i915), &p);
intel_driver_caps_print(&i915->caps, &p);
kernel_param_lock(THIS_MODULE);
i915_params_dump(&i915_modparams, &p);
kernel_param_unlock(THIS_MODULE);
return 0;
}
static char get_tiling_flag(struct drm_i915_gem_object *obj)
{
switch (i915_gem_object_get_tiling(obj)) {
default:
case I915_TILING_NONE: return ' ';
case I915_TILING_X: return 'X';
case I915_TILING_Y: return 'Y';
}
}
static char get_global_flag(struct drm_i915_gem_object *obj)
{
return READ_ONCE(obj->userfault_count) ? 'g' : ' ';
}
static char get_pin_mapped_flag(struct drm_i915_gem_object *obj)
{
return obj->mm.mapping ? 'M' : ' ';
}
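/*
 * Render a GTT page-size mask as a short human-readable string. The
 * common single-bit cases return a constant; mixed masks are formatted
 * into the caller-supplied buffer.
 */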
static const char *
stringify_page_sizes(unsigned int page_sizes, char *buf, size_t len)
{
size_t x = 0;
switch (page_sizes) {
case 0:
return "";
case I915_GTT_PAGE_SIZE_4K:
return "4K";
case I915_GTT_PAGE_SIZE_64K:
return "64K";
case I915_GTT_PAGE_SIZE_2M:
return "2M";
default:
if (!buf)
return "M";
if (page_sizes & I915_GTT_PAGE_SIZE_2M)
x += snprintf(buf + x, len - x, "2M, ");
if (page_sizes & I915_GTT_PAGE_SIZE_64K)
x += snprintf(buf + x, len - x, "64K, ");
if (page_sizes & I915_GTT_PAGE_SIZE_4K)
x += snprintf(buf + x, len - x, "4K, ");
buf[x-2] = '\0';
return buf;
}
}
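/*
 * Print a one-line summary of a GEM object: flags, size, domains, cache
 * level, each bound VMA (with its GGTT view, if any), pin count, stolen
 * offset and the last engine to have written to it.
 */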
void
i915_debugfs_describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
{
struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
struct intel_engine_cs *engine;
struct i915_vma *vma;
int pin_count = 0;
seq_printf(m, "%pK: %c%c%c %8zdKiB %02x %02x %s%s%s",
&obj->base,
get_tiling_flag(obj),
get_global_flag(obj),
get_pin_mapped_flag(obj),
obj->base.size / 1024,
obj->read_domains,
obj->write_domain,
i915_cache_level_str(dev_priv, obj->cache_level),
obj->mm.dirty ? " dirty" : "",
obj->mm.madv == I915_MADV_DONTNEED ? " purgeable" : "");
if (obj->base.name)
seq_printf(m, " (name: %d)", obj->base.name);
spin_lock(&obj->vma.lock);
list_for_each_entry(vma, &obj->vma.list, obj_link) {
if (!drm_mm_node_allocated(&vma->node))
continue;
spin_unlock(&obj->vma.lock);
if (i915_vma_is_pinned(vma))
pin_count++;
seq_printf(m, " (%sgtt offset: %08llx, size: %08llx, pages: %s",
i915_vma_is_ggtt(vma) ? "g" : "pp",
vma->node.start, vma->node.size,
stringify_page_sizes(vma->page_sizes.gtt, NULL, 0));
if (i915_vma_is_ggtt(vma)) {
switch (vma->ggtt_view.type) {
case I915_GGTT_VIEW_NORMAL:
seq_puts(m, ", normal");
break;
case I915_GGTT_VIEW_PARTIAL:
seq_printf(m, ", partial [%08llx+%x]",
vma->ggtt_view.partial.offset << PAGE_SHIFT,
vma->ggtt_view.partial.size << PAGE_SHIFT);
break;
case I915_GGTT_VIEW_ROTATED:
seq_printf(m, ", rotated [(%ux%u, stride=%u, offset=%u), (%ux%u, stride=%u, offset=%u)]",
vma->ggtt_view.rotated.plane[0].width,
vma->ggtt_view.rotated.plane[0].height,
vma->ggtt_view.rotated.plane[0].stride,
vma->ggtt_view.rotated.plane[0].offset,
vma->ggtt_view.rotated.plane[1].width,
vma->ggtt_view.rotated.plane[1].height,
vma->ggtt_view.rotated.plane[1].stride,
vma->ggtt_view.rotated.plane[1].offset);
break;
case I915_GGTT_VIEW_REMAPPED:
seq_printf(m, ", remapped [(%ux%u, stride=%u, offset=%u), (%ux%u, stride=%u, offset=%u)]",
vma->ggtt_view.remapped.plane[0].width,
vma->ggtt_view.remapped.plane[0].height,
vma->ggtt_view.remapped.plane[0].stride,
vma->ggtt_view.remapped.plane[0].offset,
vma->ggtt_view.remapped.plane[1].width,
vma->ggtt_view.remapped.plane[1].height,
vma->ggtt_view.remapped.plane[1].stride,
vma->ggtt_view.remapped.plane[1].offset);
break;
default:
MISSING_CASE(vma->ggtt_view.type);
break;
}
}
if (vma->fence)
seq_printf(m, " , fence: %d", vma->fence->id);
seq_puts(m, ")");
spin_lock(&obj->vma.lock);
}
spin_unlock(&obj->vma.lock);
seq_printf(m, " (pinned x %d)", pin_count);
if (obj->stolen)
seq_printf(m, " (stolen: %08llx)", obj->stolen->start);
if (i915_gem_object_is_framebuffer(obj))
seq_printf(m, " (fb)");
engine = i915_gem_object_last_write_engine(obj);
if (engine)
seq_printf(m, " (%s)", engine->name);
}
struct file_stats {
struct i915_address_space *vm;
unsigned long count;
u64 total;
u64 active, inactive;
u64 closed;
};
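/*
 * per_file_stats: accumulate per-object memory usage into a file_stats.
 * Used both as an idr_for_each() callback and called directly. A
 * reference is taken on the object (and its vma list walked under
 * obj->vma.lock) so the walk is safe against concurrent frees; objects
 * already at zero refcount are skipped.
 */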
static int per_file_stats(int id, void *ptr, void *data)
{
struct drm_i915_gem_object *obj = ptr;
struct file_stats *stats = data;
struct i915_vma *vma;
if (!kref_get_unless_zero(&obj->base.refcount))
return 0;
stats->count++;
stats->total += obj->base.size;
spin_lock(&obj->vma.lock);
if (!stats->vm) {
for_each_ggtt_vma(vma, obj) {
if (!drm_mm_node_allocated(&vma->node))
continue;
if (i915_vma_is_active(vma))
stats->active += vma->node.size;
else
stats->inactive += vma->node.size;
if (i915_vma_is_closed(vma))
stats->closed += vma->node.size;
}
} else {
struct rb_node *p = obj->vma.tree.rb_node;
while (p) {
long cmp;
vma = rb_entry(p, typeof(*vma), obj_node);
cmp = i915_vma_compare(vma, stats->vm, NULL);
if (cmp == 0) {
if (drm_mm_node_allocated(&vma->node)) {
if (i915_vma_is_active(vma))
stats->active += vma->node.size;
else
stats->inactive += vma->node.size;
if (i915_vma_is_closed(vma))
stats->closed += vma->node.size;
}
break;
}
if (cmp < 0)
p = p->rb_right;
else
p = p->rb_left;
}
}
spin_unlock(&obj->vma.lock);
i915_gem_object_put(obj);
return 0;
}
#define print_file_stats(m, name, stats) do { \
if (stats.count) \
seq_printf(m, "%s: %lu objects, %llu bytes (%llu active, %llu inactive, %llu closed)\n", \
name, \
stats.count, \
stats.total, \
stats.active, \
stats.inactive, \
stats.closed); \
} while (0)
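/*
 * Walk the list of user contexts, summing the memory pinned by their
 * engine state and rings into kstats, and the objects reachable from
 * each client's file into per-client stats. A reference is taken on the
 * context (and contexts.lock dropped) while each one is processed.
 */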
static void print_context_stats(struct seq_file *m,
struct drm_i915_private *i915)
{
struct file_stats kstats = {};
struct i915_gem_context *ctx, *cn;
spin_lock(&i915->gem.contexts.lock);
list_for_each_entry_safe(ctx, cn, &i915->gem.contexts.list, link) {
struct i915_gem_engines_iter it;
struct intel_context *ce;
if (!kref_get_unless_zero(&ctx->ref))
continue;
spin_unlock(&i915->gem.contexts.lock);
for_each_gem_engine(ce,
i915_gem_context_lock_engines(ctx), it) {
if (intel_context_pin_if_active(ce)) {
rcu_read_lock();
if (ce->state)
per_file_stats(0,
ce->state->obj, &kstats);
per_file_stats(0, ce->ring->vma->obj, &kstats);
rcu_read_unlock();
intel_context_unpin(ce);
}
}
i915_gem_context_unlock_engines(ctx);
if (!IS_ERR_OR_NULL(ctx->file_priv)) {
struct file_stats stats = {
.vm = rcu_access_pointer(ctx->vm),
};
struct drm_file *file = ctx->file_priv->file;
struct task_struct *task;
char name[80];
rcu_read_lock();
idr_for_each(&file->object_idr, per_file_stats, &stats);
rcu_read_unlock();
rcu_read_lock();
task = pid_task(ctx->pid ?: file->pid, PIDTYPE_PID);
snprintf(name, sizeof(name), "%s",
task ? task->comm : "<unknown>");
rcu_read_unlock();
print_file_stats(m, name, stats);
}
spin_lock(&i915->gem.contexts.lock);
list_safe_reset_next(ctx, cn, link);
i915_gem_context_put(ctx);
}
spin_unlock(&i915->gem.contexts.lock);
print_file_stats(m, "[k]contexts", kstats);
}
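/*
 * i915_gem_objects: report the shrinkable/free object counts, the total
 * and available size of each memory region, and the per-context stats
 * gathered above.
 */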
static int i915_gem_object_info(struct seq_file *m, void *data)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_memory_region *mr;
enum intel_region_id id;
seq_printf(m, "%u shrinkable [%u free] objects, %llu bytes\n",
i915->mm.shrink_count,
atomic_read(&i915->mm.free_count),
i915->mm.shrink_memory);
for_each_memory_region(mr, i915, id)
seq_printf(m, "%s: total:%pa, available:%pa bytes\n",
mr->name, &mr->total, &mr->avail);
seq_putc(m, '\n');
print_context_stats(m, i915);
return 0;
}
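/*
 * Dump the gen8+ display-engine interrupt registers: per-pipe
 * IMR/IIR/IER (skipping pipes whose power well is off), plus the port,
 * misc and PCU sets.
 */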
static void gen8_display_interrupt_info(struct seq_file *m)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
enum pipe pipe;
for_each_pipe(dev_priv, pipe) {
enum intel_display_power_domain power_domain;
intel_wakeref_t wakeref;
power_domain = POWER_DOMAIN_PIPE(pipe);
wakeref = intel_display_power_get_if_enabled(dev_priv,
power_domain);
if (!wakeref) {
seq_printf(m, "Pipe %c power disabled\n",
pipe_name(pipe));
continue;
}
seq_printf(m, "Pipe %c IMR:\t%08x\n",
pipe_name(pipe),
I915_READ(GEN8_DE_PIPE_IMR(pipe)));
seq_printf(m, "Pipe %c IIR:\t%08x\n",
pipe_name(pipe),
I915_READ(GEN8_DE_PIPE_IIR(pipe)));
seq_printf(m, "Pipe %c IER:\t%08x\n",
pipe_name(pipe),
I915_READ(GEN8_DE_PIPE_IER(pipe)));
intel_display_power_put(dev_priv, power_domain, wakeref);
}
seq_printf(m, "Display Engine port interrupt mask:\t%08x\n",
I915_READ(GEN8_DE_PORT_IMR));
seq_printf(m, "Display Engine port interrupt identity:\t%08x\n",
I915_READ(GEN8_DE_PORT_IIR));
seq_printf(m, "Display Engine port interrupt enable:\t%08x\n",
I915_READ(GEN8_DE_PORT_IER));
seq_printf(m, "Display Engine misc interrupt mask:\t%08x\n",
I915_READ(GEN8_DE_MISC_IMR));
seq_printf(m, "Display Engine misc interrupt identity:\t%08x\n",
I915_READ(GEN8_DE_MISC_IIR));
seq_printf(m, "Display Engine misc interrupt enable:\t%08x\n",
I915_READ(GEN8_DE_MISC_IER));
seq_printf(m, "PCU interrupt mask:\t%08x\n",
I915_READ(GEN8_PCU_IMR));
seq_printf(m, "PCU interrupt identity:\t%08x\n",
I915_READ(GEN8_PCU_IIR));
seq_printf(m, "PCU interrupt enable:\t%08x\n",
I915_READ(GEN8_PCU_IER));
}
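/*
 * i915_interrupt_info: dump the platform-appropriate interrupt
 * registers, branching on CHV/VLV, gen11+, gen8+ and older PCH-split
 * vs. gmch platforms, plus the per-engine interrupt masks on gen6+.
 */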
static int i915_interrupt_info(struct seq_file *m, void *data)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_engine_cs *engine;
intel_wakeref_t wakeref;
int i, pipe;
wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm);
if (IS_CHERRYVIEW(dev_priv)) {
intel_wakeref_t pref;
seq_printf(m, "Master Interrupt Control:\t%08x\n",
I915_READ(GEN8_MASTER_IRQ));
seq_printf(m, "Display IER:\t%08x\n",
I915_READ(VLV_IER));
seq_printf(m, "Display IIR:\t%08x\n",
I915_READ(VLV_IIR));
seq_printf(m, "Display IIR_RW:\t%08x\n",
I915_READ(VLV_IIR_RW));
seq_printf(m, "Display IMR:\t%08x\n",
I915_READ(VLV_IMR));
for_each_pipe(dev_priv, pipe) {
enum intel_display_power_domain power_domain;
power_domain = POWER_DOMAIN_PIPE(pipe);
pref = intel_display_power_get_if_enabled(dev_priv,
power_domain);
if (!pref) {
seq_printf(m, "Pipe %c power disabled\n",
pipe_name(pipe));
continue;
}
seq_printf(m, "Pipe %c stat:\t%08x\n",
pipe_name(pipe),
I915_READ(PIPESTAT(pipe)));
intel_display_power_put(dev_priv, power_domain, pref);
}
pref = intel_display_power_get(dev_priv, POWER_DOMAIN_INIT);
seq_printf(m, "Port hotplug:\t%08x\n",
I915_READ(PORT_HOTPLUG_EN));
seq_printf(m, "DPFLIPSTAT:\t%08x\n",
I915_READ(VLV_DPFLIPSTAT));
seq_printf(m, "DPINVGTT:\t%08x\n",
I915_READ(DPINVGTT));
intel_display_power_put(dev_priv, POWER_DOMAIN_INIT, pref);
for (i = 0; i < 4; i++) {
seq_printf(m, "GT Interrupt IMR %d:\t%08x\n",
i, I915_READ(GEN8_GT_IMR(i)));
seq_printf(m, "GT Interrupt IIR %d:\t%08x\n",
i, I915_READ(GEN8_GT_IIR(i)));
seq_printf(m, "GT Interrupt IER %d:\t%08x\n",
i, I915_READ(GEN8_GT_IER(i)));
}
seq_printf(m, "PCU interrupt mask:\t%08x\n",
I915_READ(GEN8_PCU_IMR));
seq_printf(m, "PCU interrupt identity:\t%08x\n",
I915_READ(GEN8_PCU_IIR));
seq_printf(m, "PCU interrupt enable:\t%08x\n",
I915_READ(GEN8_PCU_IER));
} else if (INTEL_GEN(dev_priv) >= 11) {
seq_printf(m, "Master Interrupt Control: %08x\n",
I915_READ(GEN11_GFX_MSTR_IRQ));
seq_printf(m, "Render/Copy Intr Enable: %08x\n",
I915_READ(GEN11_RENDER_COPY_INTR_ENABLE));
seq_printf(m, "VCS/VECS Intr Enable: %08x\n",
I915_READ(GEN11_VCS_VECS_INTR_ENABLE));
seq_printf(m, "GUC/SG Intr Enable:\t %08x\n",
I915_READ(GEN11_GUC_SG_INTR_ENABLE));
seq_printf(m, "GPM/WGBOXPERF Intr Enable: %08x\n",
I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE));
seq_printf(m, "Crypto Intr Enable:\t %08x\n",
I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE));
seq_printf(m, "GUnit/CSME Intr Enable:\t %08x\n",
I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE));
seq_printf(m, "Display Interrupt Control:\t%08x\n",
I915_READ(GEN11_DISPLAY_INT_CTL));
gen8_display_interrupt_info(m);
} else if (INTEL_GEN(dev_priv) >= 8) {
seq_printf(m, "Master Interrupt Control:\t%08x\n",
I915_READ(GEN8_MASTER_IRQ));
for (i = 0; i < 4; i++) {
seq_printf(m, "GT Interrupt IMR %d:\t%08x\n",
i, I915_READ(GEN8_GT_IMR(i)));
seq_printf(m, "GT Interrupt IIR %d:\t%08x\n",
i, I915_READ(GEN8_GT_IIR(i)));
seq_printf(m, "GT Interrupt IER %d:\t%08x\n",
i, I915_READ(GEN8_GT_IER(i)));
}
gen8_display_interrupt_info(m);
} else if (IS_VALLEYVIEW(dev_priv)) {
intel_wakeref_t pref;
seq_printf(m, "Display IER:\t%08x\n",
I915_READ(VLV_IER));
seq_printf(m, "Display IIR:\t%08x\n",
I915_READ(VLV_IIR));
seq_printf(m, "Display IIR_RW:\t%08x\n",
I915_READ(VLV_IIR_RW));
seq_printf(m, "Display IMR:\t%08x\n",
I915_READ(VLV_IMR));
for_each_pipe(dev_priv, pipe) {
enum intel_display_power_domain power_domain;
power_domain = POWER_DOMAIN_PIPE(pipe);
pref = intel_display_power_get_if_enabled(dev_priv,
power_domain);
if (!pref) {
seq_printf(m, "Pipe %c power disabled\n",
pipe_name(pipe));
continue;
}
seq_printf(m, "Pipe %c stat:\t%08x\n",
pipe_name(pipe),
I915_READ(PIPESTAT(pipe)));
intel_display_power_put(dev_priv, power_domain, pref);
}
seq_printf(m, "Master IER:\t%08x\n",
I915_READ(VLV_MASTER_IER));
seq_printf(m, "Render IER:\t%08x\n",
I915_READ(GTIER));
seq_printf(m, "Render IIR:\t%08x\n",
I915_READ(GTIIR));
seq_printf(m, "Render IMR:\t%08x\n",
I915_READ(GTIMR));
seq_printf(m, "PM IER:\t\t%08x\n",
I915_READ(GEN6_PMIER));
seq_printf(m, "PM IIR:\t\t%08x\n",
I915_READ(GEN6_PMIIR));
seq_printf(m, "PM IMR:\t\t%08x\n",
I915_READ(GEN6_PMIMR));
pref = intel_display_power_get(dev_priv, POWER_DOMAIN_INIT);
seq_printf(m, "Port hotplug:\t%08x\n",
I915_READ(PORT_HOTPLUG_EN));
seq_printf(m, "DPFLIPSTAT:\t%08x\n",
I915_READ(VLV_DPFLIPSTAT));
seq_printf(m, "DPINVGTT:\t%08x\n",
I915_READ(DPINVGTT));
intel_display_power_put(dev_priv, POWER_DOMAIN_INIT, pref);
} else if (!HAS_PCH_SPLIT(dev_priv)) {
seq_printf(m, "Interrupt enable: %08x\n",
I915_READ(GEN2_IER));
seq_printf(m, "Interrupt identity: %08x\n",
I915_READ(GEN2_IIR));
seq_printf(m, "Interrupt mask: %08x\n",
I915_READ(GEN2_IMR));
for_each_pipe(dev_priv, pipe)
seq_printf(m, "Pipe %c stat: %08x\n",
pipe_name(pipe),
I915_READ(PIPESTAT(pipe)));
} else {
seq_printf(m, "North Display Interrupt enable: %08x\n",
I915_READ(DEIER));
seq_printf(m, "North Display Interrupt identity: %08x\n",
I915_READ(DEIIR));
seq_printf(m, "North Display Interrupt mask: %08x\n",
I915_READ(DEIMR));
seq_printf(m, "South Display Interrupt enable: %08x\n",
I915_READ(SDEIER));
seq_printf(m, "South Display Interrupt identity: %08x\n",
I915_READ(SDEIIR));
seq_printf(m, "South Display Interrupt mask: %08x\n",
I915_READ(SDEIMR));
seq_printf(m, "Graphics Interrupt enable: %08x\n",
I915_READ(GTIER));
seq_printf(m, "Graphics Interrupt identity: %08x\n",
I915_READ(GTIIR));
seq_printf(m, "Graphics Interrupt mask: %08x\n",
I915_READ(GTIMR));
}
if (INTEL_GEN(dev_priv) >= 11) {
seq_printf(m, "RCS Intr Mask:\t %08x\n",
I915_READ(GEN11_RCS0_RSVD_INTR_MASK));
seq_printf(m, "BCS Intr Mask:\t %08x\n",
I915_READ(GEN11_BCS_RSVD_INTR_MASK));
seq_printf(m, "VCS0/VCS1 Intr Mask:\t %08x\n",
I915_READ(GEN11_VCS0_VCS1_INTR_MASK));
seq_printf(m, "VCS2/VCS3 Intr Mask:\t %08x\n",
I915_READ(GEN11_VCS2_VCS3_INTR_MASK));
seq_printf(m, "VECS0/VECS1 Intr Mask:\t %08x\n",
I915_READ(GEN11_VECS0_VECS1_INTR_MASK));
seq_printf(m, "GUC/SG Intr Mask:\t %08x\n",
I915_READ(GEN11_GUC_SG_INTR_MASK));
seq_printf(m, "GPM/WGBOXPERF Intr Mask: %08x\n",
I915_READ(GEN11_GPM_WGBOXPERF_INTR_MASK));
seq_printf(m, "Crypto Intr Mask:\t %08x\n",
I915_READ(GEN11_CRYPTO_RSVD_INTR_MASK));
seq_printf(m, "Gunit/CSME Intr Mask:\t %08x\n",
I915_READ(GEN11_GUNIT_CSME_INTR_MASK));
} else if (INTEL_GEN(dev_priv) >= 6) {
for_each_uabi_engine(engine, dev_priv) {
seq_printf(m,
"Graphics Interrupt mask (%s): %08x\n",
engine->name, ENGINE_READ(engine, RING_IMR));
}
}
intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref);
return 0;
}
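/*
 * List every GGTT fence register together with its pin count and, when
 * one is bound, a description of the object occupying it.
 */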
static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
unsigned int i;
seq_printf(m, "Total fences = %d\n", i915->ggtt.num_fences);
rcu_read_lock();
for (i = 0; i < i915->ggtt.num_fences; i++) {
struct i915_fence_reg *reg = &i915->ggtt.fence_regs[i];
struct i915_vma *vma = reg->vma;
seq_printf(m, "Fence %d, pin count = %d, object = ",
i, atomic_read(&reg->pin_count));
if (!vma)
seq_puts(m, "unused");
else
i915_debugfs_describe_obj(m, vma->obj);
seq_putc(m, '\n');
}
rcu_read_unlock();
return 0;
}
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
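/*
 * GPU error state is exposed through two debugfs files sharing the read
 * and release hooks below: i915_gpu_info captures a fresh coredump on
 * open, while i915_error_state shows the last recorded error and can be
 * cleared by writing to it.
 */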
static ssize_t gpu_state_read(struct file *file, char __user *ubuf,
size_t count, loff_t *pos)
{
struct i915_gpu_coredump *error;
ssize_t ret;
void *buf;
error = file->private_data;
if (!error)
return 0;
/* Bounce buffer required because of kernfs __user API convenience. */
buf = kmalloc(count, GFP_KERNEL);
if (!buf)
return -ENOMEM;
ret = i915_gpu_coredump_copy_to_buffer(error, buf, *pos, count);
if (ret <= 0)
goto out;
if (!copy_to_user(ubuf, buf, ret))
*pos += ret;
else
ret = -EFAULT;
out:
kfree(buf);
return ret;
}
static int gpu_state_release(struct inode *inode, struct file *file)
{
i915_gpu_coredump_put(file->private_data);
return 0;
}
static int i915_gpu_info_open(struct inode *inode, struct file *file)
{
struct drm_i915_private *i915 = inode->i_private;
struct i915_gpu_coredump *gpu;
intel_wakeref_t wakeref;
gpu = NULL;
with_intel_runtime_pm(&i915->runtime_pm, wakeref)
gpu = i915_gpu_coredump(i915);
if (IS_ERR(gpu))
return PTR_ERR(gpu);
file->private_data = gpu;
return 0;
}
static const struct file_operations i915_gpu_info_fops = {
.owner = THIS_MODULE,
.open = i915_gpu_info_open,
.read = gpu_state_read,
.llseek = default_llseek,
.release = gpu_state_release,
};
static ssize_t
i915_error_state_write(struct file *filp,
const char __user *ubuf,
size_t cnt,
loff_t *ppos)
{
struct i915_gpu_coredump *error = filp->private_data;
if (!error)
return 0;
drm_dbg(&error->i915->drm, "Resetting error state\n");
i915_reset_error_state(error->i915);
return cnt;
}
static int i915_error_state_open(struct inode *inode, struct file *file)
{
struct i915_gpu_coredump *error;
error = i915_first_error_state(inode->i_private);
if (IS_ERR(error))
return PTR_ERR(error);
file->private_data = error;
return 0;
}
static const struct file_operations i915_error_state_fops = {
.owner = THIS_MODULE,
.open = i915_error_state_open,
.read = gpu_state_read,
.write = i915_error_state_write,
.llseek = default_llseek,
.release = gpu_state_release,
};
#endif
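/*
 * i915_frequency_info: report GPU frequency/RPS state. Ironlake uses
 * the MEMSWCTL/MEMSTAT registers, VLV/CHV read the punit frequency
 * status, and gen6+ dumps the full RPS register set (thresholds,
 * up/down EI counters, interrupt masks) alongside the derived
 * frequencies.
 */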
static int i915_frequency_info(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_uncore *uncore = &dev_priv->uncore;
struct intel_rps *rps = &dev_priv->gt.rps;
intel_wakeref_t wakeref;
int ret = 0;
wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm);
if (IS_GEN(dev_priv, 5)) {
u16 rgvswctl = intel_uncore_read16(uncore, MEMSWCTL);
u16 rgvstat = intel_uncore_read16(uncore, MEMSTAT_ILK);
seq_printf(m, "Requested P-state: %d\n", (rgvswctl >> 8) & 0xf);
seq_printf(m, "Requested VID: %d\n", rgvswctl & 0x3f);
seq_printf(m, "Current VID: %d\n", (rgvstat & MEMSTAT_VID_MASK) >>
MEMSTAT_VID_SHIFT);
seq_printf(m, "Current P-state: %d\n",
(rgvstat & MEMSTAT_PSTATE_MASK) >> MEMSTAT_PSTATE_SHIFT);
} else if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) {
u32 rpmodectl, freq_sts;
rpmodectl = I915_READ(GEN6_RP_CONTROL);
seq_printf(m, "Video Turbo Mode: %s\n",
yesno(rpmodectl & GEN6_RP_MEDIA_TURBO));
seq_printf(m, "HW control enabled: %s\n",
yesno(rpmodectl & GEN6_RP_ENABLE));
seq_printf(m, "SW control enabled: %s\n",
yesno((rpmodectl & GEN6_RP_MEDIA_MODE_MASK) ==
GEN6_RP_MEDIA_SW_MODE));
vlv_punit_get(dev_priv);
freq_sts = vlv_punit_read(dev_priv, PUNIT_REG_GPU_FREQ_STS);
vlv_punit_put(dev_priv);
seq_printf(m, "PUNIT_REG_GPU_FREQ_STS: 0x%08x\n", freq_sts);
seq_printf(m, "DDR freq: %d MHz\n", dev_priv->mem_freq);
seq_printf(m, "actual GPU freq: %d MHz\n",
intel_gpu_freq(rps, (freq_sts >> 8) & 0xff));
seq_printf(m, "current GPU freq: %d MHz\n",
intel_gpu_freq(rps, rps->cur_freq));
seq_printf(m, "max GPU freq: %d MHz\n",
intel_gpu_freq(rps, rps->max_freq));
seq_printf(m, "min GPU freq: %d MHz\n",
intel_gpu_freq(rps, rps->min_freq));
seq_printf(m, "idle GPU freq: %d MHz\n",
intel_gpu_freq(rps, rps->idle_freq));
seq_printf(m,
"efficient (RPe) frequency: %d MHz\n",
intel_gpu_freq(rps, rps->efficient_freq));
} else if (INTEL_GEN(dev_priv) >= 6) {
u32 rp_state_limits;
u32 gt_perf_status;
u32 rp_state_cap;
u32 rpmodectl, rpinclimit, rpdeclimit;
u32 rpstat, cagf, reqf;
u32 rpupei, rpcurup, rpprevup;
u32 rpdownei, rpcurdown, rpprevdown;
u32 pm_ier, pm_imr, pm_isr, pm_iir, pm_mask;
int max_freq;
rp_state_limits = I915_READ(GEN6_RP_STATE_LIMITS);
if (IS_GEN9_LP(dev_priv)) {
rp_state_cap = I915_READ(BXT_RP_STATE_CAP);
gt_perf_status = I915_READ(BXT_GT_PERF_STATUS);
} else {
rp_state_cap = I915_READ(GEN6_RP_STATE_CAP);
gt_perf_status = I915_READ(GEN6_GT_PERF_STATUS);
}
/* RPSTAT1 is in the GT power well */
intel_uncore_forcewake_get(&dev_priv->uncore, FORCEWAKE_ALL);
reqf = I915_READ(GEN6_RPNSWREQ);
if (INTEL_GEN(dev_priv) >= 9)
reqf >>= 23;
else {
reqf &= ~GEN6_TURBO_DISABLE;
if (IS_HASWELL(dev_priv) || IS_BROADWELL(dev_priv))
reqf >>= 24;
else
reqf >>= 25;
}
reqf = intel_gpu_freq(rps, reqf);
rpmodectl = I915_READ(GEN6_RP_CONTROL);
rpinclimit = I915_READ(GEN6_RP_UP_THRESHOLD);
rpdeclimit = I915_READ(GEN6_RP_DOWN_THRESHOLD);
rpstat = I915_READ(GEN6_RPSTAT1);
rpupei = I915_READ(GEN6_RP_CUR_UP_EI) & GEN6_CURICONT_MASK;
rpcurup = I915_READ(GEN6_RP_CUR_UP) & GEN6_CURBSYTAVG_MASK;
rpprevup = I915_READ(GEN6_RP_PREV_UP) & GEN6_CURBSYTAVG_MASK;
rpdownei = I915_READ(GEN6_RP_CUR_DOWN_EI) & GEN6_CURIAVG_MASK;
rpcurdown = I915_READ(GEN6_RP_CUR_DOWN) & GEN6_CURBSYTAVG_MASK;
rpprevdown = I915_READ(GEN6_RP_PREV_DOWN) & GEN6_CURBSYTAVG_MASK;
cagf = intel_rps_read_actual_frequency(rps);
intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
if (INTEL_GEN(dev_priv) >= 11) {
pm_ier = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE);
pm_imr = I915_READ(GEN11_GPM_WGBOXPERF_INTR_MASK);
/*
* The equivalent to the PM ISR & IIR cannot be read
* without affecting the current state of the system
*/
pm_isr = 0;
pm_iir = 0;
} else if (INTEL_GEN(dev_priv) >= 8) {
pm_ier = I915_READ(GEN8_GT_IER(2));
pm_imr = I915_READ(GEN8_GT_IMR(2));
pm_isr = I915_READ(GEN8_GT_ISR(2));
pm_iir = I915_READ(GEN8_GT_IIR(2));
} else {
pm_ier = I915_READ(GEN6_PMIER);
pm_imr = I915_READ(GEN6_PMIMR);
pm_isr = I915_READ(GEN6_PMISR);
pm_iir = I915_READ(GEN6_PMIIR);
}
pm_mask = I915_READ(GEN6_PMINTRMSK);
seq_printf(m, "Video Turbo Mode: %s\n",
yesno(rpmodectl & GEN6_RP_MEDIA_TURBO));
seq_printf(m, "HW control enabled: %s\n",
yesno(rpmodectl & GEN6_RP_ENABLE));
seq_printf(m, "SW control enabled: %s\n",
yesno((rpmodectl & GEN6_RP_MEDIA_MODE_MASK) ==
GEN6_RP_MEDIA_SW_MODE));
seq_printf(m, "PM IER=0x%08x IMR=0x%08x, MASK=0x%08x\n",
pm_ier, pm_imr, pm_mask);
if (INTEL_GEN(dev_priv) <= 10)
seq_printf(m, "PM ISR=0x%08x IIR=0x%08x\n",
pm_isr, pm_iir);
seq_printf(m, "pm_intrmsk_mbz: 0x%08x\n",
rps->pm_intrmsk_mbz);
seq_printf(m, "GT_PERF_STATUS: 0x%08x\n", gt_perf_status);
seq_printf(m, "Render p-state ratio: %d\n",
(gt_perf_status & (INTEL_GEN(dev_priv) >= 9 ? 0x1ff00 : 0xff00)) >> 8);
seq_printf(m, "Render p-state VID: %d\n",
gt_perf_status & 0xff);
seq_printf(m, "Render p-state limit: %d\n",
rp_state_limits & 0xff);
seq_printf(m, "RPSTAT1: 0x%08x\n", rpstat);
seq_printf(m, "RPMODECTL: 0x%08x\n", rpmodectl);
seq_printf(m, "RPINCLIMIT: 0x%08x\n", rpinclimit);
seq_printf(m, "RPDECLIMIT: 0x%08x\n", rpdeclimit);
seq_printf(m, "RPNSWREQ: %dMHz\n", reqf);
seq_printf(m, "CAGF: %dMHz\n", cagf);
seq_printf(m, "RP CUR UP EI: %d (%dus)\n",
rpupei, GT_PM_INTERVAL_TO_US(dev_priv, rpupei));
seq_printf(m, "RP CUR UP: %d (%dus)\n",
rpcurup, GT_PM_INTERVAL_TO_US(dev_priv, rpcurup));
seq_printf(m, "RP PREV UP: %d (%dus)\n",
rpprevup, GT_PM_INTERVAL_TO_US(dev_priv, rpprevup));
seq_printf(m, "Up threshold: %d%%\n",
rps->power.up_threshold);
seq_printf(m, "RP CUR DOWN EI: %d (%dus)\n",
rpdownei, GT_PM_INTERVAL_TO_US(dev_priv, rpdownei));
seq_printf(m, "RP CUR DOWN: %d (%dus)\n",
rpcurdown, GT_PM_INTERVAL_TO_US(dev_priv, rpcurdown));
seq_printf(m, "RP PREV DOWN: %d (%dus)\n",
rpprevdown, GT_PM_INTERVAL_TO_US(dev_priv, rpprevdown));
seq_printf(m, "Down threshold: %d%%\n",
rps->power.down_threshold);
max_freq = (IS_GEN9_LP(dev_priv) ? rp_state_cap >> 0 :
rp_state_cap >> 16) & 0xff;
max_freq *= (IS_GEN9_BC(dev_priv) ||
INTEL_GEN(dev_priv) >= 10 ? GEN9_FREQ_SCALER : 1);
seq_printf(m, "Lowest (RPN) frequency: %dMHz\n",
intel_gpu_freq(rps, max_freq));
max_freq = (rp_state_cap & 0xff00) >> 8;
max_freq *= (IS_GEN9_BC(dev_priv) ||
INTEL_GEN(dev_priv) >= 10 ? GEN9_FREQ_SCALER : 1);
seq_printf(m, "Nominal (RP1) frequency: %dMHz\n",
intel_gpu_freq(rps, max_freq));
max_freq = (IS_GEN9_LP(dev_priv) ? rp_state_cap >> 16 :
rp_state_cap >> 0) & 0xff;
max_freq *= (IS_GEN9_BC(dev_priv) ||
INTEL_GEN(dev_priv) >= 10 ? GEN9_FREQ_SCALER : 1);
seq_printf(m, "Max non-overclocked (RP0) frequency: %dMHz\n",
intel_gpu_freq(rps, max_freq));
seq_printf(m, "Max overclocked frequency: %dMHz\n",
intel_gpu_freq(rps, rps->max_freq));
seq_printf(m, "Current freq: %d MHz\n",
intel_gpu_freq(rps, rps->cur_freq));
seq_printf(m, "Actual freq: %d MHz\n", cagf);
seq_printf(m, "Idle freq: %d MHz\n",
intel_gpu_freq(rps, rps->idle_freq));
seq_printf(m, "Min freq: %d MHz\n",
intel_gpu_freq(rps, rps->min_freq));
seq_printf(m, "Boost freq: %d MHz\n",
intel_gpu_freq(rps, rps->boost_freq));
seq_printf(m, "Max freq: %d MHz\n",
intel_gpu_freq(rps, rps->max_freq));
seq_printf(m,
"efficient (RPe) frequency: %d MHz\n",
intel_gpu_freq(rps, rps->efficient_freq));
} else {
seq_puts(m, "no P-state info available\n");
}
seq_printf(m, "Current CD clock frequency: %d kHz\n", dev_priv->cdclk.hw.cdclk);
seq_printf(m, "Max CD clock frequency: %d kHz\n", dev_priv->max_cdclk_freq);
seq_printf(m, "Max pixel clock frequency: %d kHz\n", dev_priv->max_dotclk_freq);
intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref);
return ret;
}
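/*
 * i915_ring_freq_table: for each GPU frequency in the [min, max] range,
 * ask pcode (GEN6_PCODE_READ_MIN_FREQ_TABLE) for the matching effective
 * CPU and ring frequencies. Only meaningful on platforms with an LLC.
 */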
static int i915_ring_freq_table(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_rps *rps = &dev_priv->gt.rps;
unsigned int max_gpu_freq, min_gpu_freq;
intel_wakeref_t wakeref;
int gpu_freq, ia_freq;
if (!HAS_LLC(dev_priv))
return -ENODEV;
min_gpu_freq = rps->min_freq;
max_gpu_freq = rps->max_freq;
if (IS_GEN9_BC(dev_priv) || INTEL_GEN(dev_priv) >= 10) {
/* Convert GT frequency to 50 MHz units */
min_gpu_freq /= GEN9_FREQ_SCALER;
max_gpu_freq /= GEN9_FREQ_SCALER;
}
seq_puts(m, "GPU freq (MHz)\tEffective CPU freq (MHz)\tEffective Ring freq (MHz)\n");
wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm);
for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) {
ia_freq = gpu_freq;
sandybridge_pcode_read(dev_priv,
GEN6_PCODE_READ_MIN_FREQ_TABLE,
&ia_freq, NULL);
seq_printf(m, "%d\t\t%d\t\t\t\t%d\n",
intel_gpu_freq(rps,
(gpu_freq *
(IS_GEN9_BC(dev_priv) ||
INTEL_GEN(dev_priv) >= 10 ?
GEN9_FREQ_SCALER : 1))),
((ia_freq >> 0) & 0xff) * 100,
((ia_freq >> 8) & 0xff) * 100);
}
intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref);
return 0;
}
static void describe_ctx_ring(struct seq_file *m, struct intel_ring *ring)
{
seq_printf(m, " (ringbuffer, space: %d, head: %u, tail: %u, emit: %u)",
ring->space, ring->head, ring->tail, ring->emit);
}
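/*
 * i915_context_status: walk the global context list. Each context is
 * kept alive with kref_get_unless_zero() so the contexts lock can be
 * dropped while its engines and ring state are printed.
 */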
static int i915_context_status(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct i915_gem_context *ctx, *cn;
spin_lock(&i915->gem.contexts.lock);
list_for_each_entry_safe(ctx, cn, &i915->gem.contexts.list, link) {
struct i915_gem_engines_iter it;
struct intel_context *ce;
if (!kref_get_unless_zero(&ctx->ref))
continue;
spin_unlock(&i915->gem.contexts.lock);
seq_puts(m, "HW context ");
if (ctx->pid) {
struct task_struct *task;
task = get_pid_task(ctx->pid, PIDTYPE_PID);
if (task) {
seq_printf(m, "(%s [%d]) ",
task->comm, task->pid);
put_task_struct(task);
}
} else if (IS_ERR(ctx->file_priv)) {
seq_puts(m, "(deleted) ");
} else {
seq_puts(m, "(kernel) ");
}
seq_putc(m, ctx->remap_slice ? 'R' : 'r');
seq_putc(m, '\n');
for_each_gem_engine(ce,
i915_gem_context_lock_engines(ctx), it) {
if (intel_context_pin_if_active(ce)) {
seq_printf(m, "%s: ", ce->engine->name);
if (ce->state)
i915_debugfs_describe_obj(m, ce->state->obj);
describe_ctx_ring(m, ce->ring);
seq_putc(m, '\n');
intel_context_unpin(ce);
}
}
i915_gem_context_unlock_engines(ctx);
seq_putc(m, '\n');
spin_lock(&i915->gem.contexts.lock);
list_safe_reset_next(ctx, cn, link);
i915_gem_context_put(ctx);
}
spin_unlock(&i915->gem.contexts.lock);
return 0;
}
static const char *swizzle_string(unsigned swizzle)
{
switch (swizzle) {
case I915_BIT_6_SWIZZLE_NONE:
return "none";
case I915_BIT_6_SWIZZLE_9:
return "bit9";
case I915_BIT_6_SWIZZLE_9_10:
return "bit9/bit10";
case I915_BIT_6_SWIZZLE_9_11:
return "bit9/bit11";
case I915_BIT_6_SWIZZLE_9_10_11:
return "bit9/bit10/bit11";
case I915_BIT_6_SWIZZLE_9_17:
return "bit9/bit17";
case I915_BIT_6_SWIZZLE_9_10_17:
return "bit9/bit10/bit17";
case I915_BIT_6_SWIZZLE_UNKNOWN:
return "unknown";
}
return "bug";
}
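/*
 * i915_swizzle_info: report the detected bit-6 swizzling mode for X and
 * Y tiling, together with the relevant memory controller and arbiter
 * registers for this generation.
 */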
static int i915_swizzle_info(struct seq_file *m, void *data)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_uncore *uncore = &dev_priv->uncore;
intel_wakeref_t wakeref;
wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm);
seq_printf(m, "bit6 swizzle for X-tiling = %s\n",
swizzle_string(dev_priv->ggtt.bit_6_swizzle_x));
seq_printf(m, "bit6 swizzle for Y-tiling = %s\n",
swizzle_string(dev_priv->ggtt.bit_6_swizzle_y));
if (IS_GEN_RANGE(dev_priv, 3, 4)) {
seq_printf(m, "DDC = 0x%08x\n",
intel_uncore_read(uncore, DCC));
seq_printf(m, "DDC2 = 0x%08x\n",
intel_uncore_read(uncore, DCC2));
seq_printf(m, "C0DRB3 = 0x%04x\n",
intel_uncore_read16(uncore, C0DRB3));
seq_printf(m, "C1DRB3 = 0x%04x\n",
intel_uncore_read16(uncore, C1DRB3));
} else if (INTEL_GEN(dev_priv) >= 6) {
seq_printf(m, "MAD_DIMM_C0 = 0x%08x\n",
intel_uncore_read(uncore, MAD_DIMM_C0));
seq_printf(m, "MAD_DIMM_C1 = 0x%08x\n",
intel_uncore_read(uncore, MAD_DIMM_C1));
seq_printf(m, "MAD_DIMM_C2 = 0x%08x\n",
intel_uncore_read(uncore, MAD_DIMM_C2));
seq_printf(m, "TILECTL = 0x%08x\n",
intel_uncore_read(uncore, TILECTL));
if (INTEL_GEN(dev_priv) >= 8)
seq_printf(m, "GAMTARBMODE = 0x%08x\n",
intel_uncore_read(uncore, GAMTARBMODE));
else
seq_printf(m, "ARB_MODE = 0x%08x\n",
intel_uncore_read(uncore, ARB_MODE));
seq_printf(m, "DISP_ARB_CTL = 0x%08x\n",
intel_uncore_read(uncore, DISP_ARB_CTL));
}
if (dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES)
seq_puts(m, "L-shaped memory detected\n");
intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref);
return 0;
}
static const char *rps_power_to_str(unsigned int power)
{
static const char * const strings[] = {
[LOW_POWER] = "low power",
[BETWEEN] = "mixed",
[HIGH_POWER] = "high power",
};
if (power >= ARRAY_SIZE(strings) || !strings[power])
return "unknown";
return strings[power];
}
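/*
 * i915_rps_boost_info: summarise the RPS state (current, min/max and
 * boost frequencies, outstanding waitboosts) and, while the GT is awake,
 * the up/down autotuning counters for the current power window.
 */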
static int i915_rps_boost_info(struct seq_file *m, void *data)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_rps *rps = &dev_priv->gt.rps;
seq_printf(m, "RPS enabled? %d\n", rps->enabled);
seq_printf(m, "GPU busy? %s\n", yesno(dev_priv->gt.awake));
seq_printf(m, "Boosts outstanding? %d\n",
atomic_read(&rps->num_waiters));
seq_printf(m, "Interactive? %d\n", READ_ONCE(rps->power.interactive));
seq_printf(m, "Frequency requested %d, actual %d\n",
intel_gpu_freq(rps, rps->cur_freq),
intel_rps_read_actual_frequency(rps));
seq_printf(m, " min hard:%d, soft:%d; max soft:%d, hard:%d\n",
intel_gpu_freq(rps, rps->min_freq),
intel_gpu_freq(rps, rps->min_freq_softlimit),
intel_gpu_freq(rps, rps->max_freq_softlimit),
intel_gpu_freq(rps, rps->max_freq));
seq_printf(m, " idle:%d, efficient:%d, boost:%d\n",
intel_gpu_freq(rps, rps->idle_freq),
intel_gpu_freq(rps, rps->efficient_freq),
intel_gpu_freq(rps, rps->boost_freq));
seq_printf(m, "Wait boosts: %d\n", atomic_read(&rps->boosts));
if (INTEL_GEN(dev_priv) >= 6 && rps->enabled && dev_priv->gt.awake) {
u32 rpup, rpupei;
u32 rpdown, rpdownei;
intel_uncore_forcewake_get(&dev_priv->uncore, FORCEWAKE_ALL);
rpup = I915_READ_FW(GEN6_RP_CUR_UP) & GEN6_RP_EI_MASK;
rpupei = I915_READ_FW(GEN6_RP_CUR_UP_EI) & GEN6_RP_EI_MASK;
rpdown = I915_READ_FW(GEN6_RP_CUR_DOWN) & GEN6_RP_EI_MASK;
rpdownei = I915_READ_FW(GEN6_RP_CUR_DOWN_EI) & GEN6_RP_EI_MASK;
intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
seq_printf(m, "\nRPS Autotuning (current \"%s\" window):\n",
rps_power_to_str(rps->power.mode));
seq_printf(m, " Avg. up: %d%% [above threshold? %d%%]\n",
rpup && rpupei ? 100 * rpup / rpupei : 0,
rps->power.up_threshold);
seq_printf(m, " Avg. down: %d%% [below threshold? %d%%]\n",
rpdown && rpdownei ? 100 * rpdown / rpdownei : 0,
rps->power.down_threshold);
} else {
seq_puts(m, "\nRPS Autotuning inactive\n");
}
return 0;
}
static int i915_llc(struct seq_file *m, void *data)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
const bool edram = INTEL_GEN(dev_priv) > 8;
seq_printf(m, "LLC: %s\n", yesno(HAS_LLC(dev_priv)));
seq_printf(m, "%s: %uMB\n", edram ? "eDRAM" : "eLLC",
dev_priv->edram_size_mb);
return 0;
}
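/*
 * i915_runtime_pm_status: report whether runtime PM is currently active
 * (i.e. the power domains hold no wakeref), along with IRQ state, the
 * PCI power state and, when CONFIG_PM is set, the usage count.
 */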
static int i915_runtime_pm_status(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct pci_dev *pdev = dev_priv->drm.pdev;
if (!HAS_RUNTIME_PM(dev_priv))
seq_puts(m, "Runtime power management not supported\n");
seq_printf(m, "Runtime power status: %s\n",
enableddisabled(!dev_priv->power_domains.wakeref));
seq_printf(m, "GPU idle: %s\n", yesno(!dev_priv->gt.awake));
seq_printf(m, "IRQs disabled: %s\n",
yesno(!intel_irqs_enabled(dev_priv)));
#ifdef CONFIG_PM
seq_printf(m, "Usage count: %d\n",
atomic_read(&dev_priv->drm.dev->power.usage_count));
#else
seq_printf(m, "Device Power Management (CONFIG_PM) disabled\n");
#endif
seq_printf(m, "PCI device power state: %s [%d]\n",
pci_power_name(pdev->current_state),
pdev->current_state);
if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_RUNTIME_PM)) {
struct drm_printer p = drm_seq_file_printer(m);
print_intel_runtime_pm_wakeref(&dev_priv->runtime_pm, &p);
}
return 0;
}
static int i915_engine_info(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_engine_cs *engine;
intel_wakeref_t wakeref;
struct drm_printer p;
wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm);
seq_printf(m, "GT awake? %s [%d]\n",
yesno(dev_priv->gt.awake),
atomic_read(&dev_priv->gt.wakeref.count));
seq_printf(m, "CS timestamp frequency: %u kHz\n",
RUNTIME_INFO(dev_priv)->cs_timestamp_frequency_khz);
p = drm_seq_file_printer(m);
for_each_uabi_engine(engine, dev_priv)
intel_engine_dump(engine, &p, "%s\n", engine->name);
intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref);
return 0;
}
static int i915_rcs_topology(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct drm_printer p = drm_seq_file_printer(m);
intel_device_info_print_topology(&RUNTIME_INFO(dev_priv)->sseu, &p);
return 0;
}
static int i915_shrinker_info(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
seq_printf(m, "seeks = %d\n", i915->mm.shrinker.seeks);
seq_printf(m, "batch = %lu\n", i915->mm.shrinker.batch);
return 0;
}
static int i915_wa_registers(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_engine_cs *engine;
for_each_uabi_engine(engine, i915) {
const struct i915_wa_list *wal = &engine->ctx_wa_list;
const struct i915_wa *wa;
unsigned int count;
count = wal->count;
if (!count)
continue;
seq_printf(m, "%s: Workarounds applied: %u\n",
engine->name, count);
for (wa = wal->list; count--; wa++)
seq_printf(m, "0x%X: 0x%08X, mask: 0x%08X\n",
i915_mmio_reg_offset(wa->reg),
wa->set, wa->clr);
seq_printf(m, "\n");
}
return 0;
}
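/*
 * i915_wedged: reading returns 1 if the GT is terminally wedged and 0
 * otherwise; writing an engine mask forces error handling (with capture)
 * for those engines, after waiting for any reset already in progress.
 */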
static int
i915_wedged_get(void *data, u64 *val)
{
struct drm_i915_private *i915 = data;
int ret = intel_gt_terminally_wedged(&i915->gt);
switch (ret) {
case -EIO:
*val = 1;
return 0;
case 0:
*val = 0;
return 0;
default:
return ret;
}
}
static int
i915_wedged_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
/* Flush any previous reset before applying for a new one */
wait_event(i915->gt.reset.queue,
!test_bit(I915_RESET_BACKOFF, &i915->gt.reset.flags));
intel_gt_handle_error(&i915->gt, val, I915_ERROR_CAPTURE,
"Manually set wedged engine mask = %llx", val);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set,
"%llu\n");
static int
i915_perf_noa_delay_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
const u32 clk = RUNTIME_INFO(i915)->cs_timestamp_frequency_khz;
/*
* This would lead to infinite waits as we're doing timestamp
* difference on the CS with only 32bits.
*/
if (val > mul_u32_u32(U32_MAX, clk))
return -EINVAL;
atomic64_set(&i915->perf.noa_programming_delay, val);
return 0;
}
static int
i915_perf_noa_delay_get(void *data, u64 *val)
{
struct drm_i915_private *i915 = data;
*val = atomic64_read(&i915->perf.noa_programming_delay);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
i915_perf_noa_delay_get,
i915_perf_noa_delay_set,
"%llu\n");
#define DROP_UNBOUND BIT(0)
#define DROP_BOUND BIT(1)
#define DROP_RETIRE BIT(2)
#define DROP_ACTIVE BIT(3)
#define DROP_FREED BIT(4)
#define DROP_SHRINK_ALL BIT(5)
#define DROP_IDLE BIT(6)
#define DROP_RESET_ACTIVE BIT(7)
#define DROP_RESET_SEQNO BIT(8)
#define DROP_RCU BIT(9)
#define DROP_ALL (DROP_UNBOUND | \
DROP_BOUND | \
DROP_RETIRE | \
DROP_ACTIVE | \
DROP_FREED | \
DROP_SHRINK_ALL |\
DROP_IDLE | \
DROP_RESET_ACTIVE | \
DROP_RESET_SEQNO | \
DROP_RCU)
static int
i915_drop_caches_get(void *data, u64 *val)
{
*val = DROP_ALL;
return 0;
}
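/*
 * GT-level part of drop_caches: optionally wedge a GT that fails to
 * idle, retire requests, wait for idle (and for the GT power domain to
 * park when DROP_IDLE is set), then recover a wedged GT if
 * DROP_RESET_ACTIVE asked for it.
 */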
static int
gt_drop_caches(struct intel_gt *gt, u64 val)
{
int ret;
if (val & DROP_RESET_ACTIVE &&
wait_for(intel_engines_are_idle(gt), I915_IDLE_ENGINES_TIMEOUT))
intel_gt_set_wedged(gt);
if (val & DROP_RETIRE)
intel_gt_retire_requests(gt);
if (val & (DROP_IDLE | DROP_ACTIVE)) {
ret = intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;
}
if (val & DROP_IDLE) {
ret = intel_gt_pm_wait_for_idle(gt);
if (ret)
return ret;
}
if (val & DROP_RESET_ACTIVE && intel_gt_terminally_wedged(gt))
intel_gt_handle_error(gt, ALL_ENGINES, 0, NULL);
return 0;
}
static int
i915_drop_caches_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
int ret;
DRM_DEBUG("Dropping caches: 0x%08llx [0x%08llx]\n",
val, val & DROP_ALL);
ret = gt_drop_caches(&i915->gt, val);
if (ret)
return ret;
fs_reclaim_acquire(GFP_KERNEL);
if (val & DROP_BOUND)
i915_gem_shrink(i915, LONG_MAX, NULL, I915_SHRINK_BOUND);
if (val & DROP_UNBOUND)
i915_gem_shrink(i915, LONG_MAX, NULL, I915_SHRINK_UNBOUND);
if (val & DROP_SHRINK_ALL)
i915_gem_shrink_all(i915);
fs_reclaim_release(GFP_KERNEL);
if (val & DROP_RCU)
rcu_barrier();
if (val & DROP_FREED)
i915_gem_drain_freed_objects(i915);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_drop_caches_fops,
i915_drop_caches_get, i915_drop_caches_set,
"0x%08llx\n");
static int
i915_cache_sharing_get(void *data, u64 *val)
{
struct drm_i915_private *dev_priv = data;
intel_wakeref_t wakeref;
u32 snpcr = 0;
if (!(IS_GEN_RANGE(dev_priv, 6, 7)))
return -ENODEV;
with_intel_runtime_pm(&dev_priv->runtime_pm, wakeref)
snpcr = I915_READ(GEN6_MBCUNIT_SNPCR);
*val = (snpcr & GEN6_MBC_SNPCR_MASK) >> GEN6_MBC_SNPCR_SHIFT;
return 0;
}
static int
i915_cache_sharing_set(void *data, u64 val)
{
struct drm_i915_private *dev_priv = data;
intel_wakeref_t wakeref;
if (!(IS_GEN_RANGE(dev_priv, 6, 7)))
return -ENODEV;
if (val > 3)
return -EINVAL;
drm_dbg(&dev_priv->drm,
"Manually setting uncore sharing to %llu\n", val);
with_intel_runtime_pm(&dev_priv->runtime_pm, wakeref) {
u32 snpcr;
/* Update the cache sharing policy here as well */
snpcr = I915_READ(GEN6_MBCUNIT_SNPCR);
snpcr &= ~GEN6_MBC_SNPCR_MASK;
snpcr |= val << GEN6_MBC_SNPCR_SHIFT;
I915_WRITE(GEN6_MBCUNIT_SNPCR, snpcr);
}
return 0;
}
static void
intel_sseu_copy_subslices(const struct sseu_dev_info *sseu, int slice,
u8 *to_mask)
{
int offset = slice * sseu->ss_stride;
memcpy(&to_mask[offset], &sseu->subslice_mask[offset], sseu->ss_stride);
}
DEFINE_SIMPLE_ATTRIBUTE(i915_cache_sharing_fops,
i915_cache_sharing_get, i915_cache_sharing_set,
"%llu\n");
static void cherryview_sseu_device_status(struct drm_i915_private *dev_priv,
struct sseu_dev_info *sseu)
{
#define SS_MAX 2
const int ss_max = SS_MAX;
u32 sig1[SS_MAX], sig2[SS_MAX];
int ss;
sig1[0] = I915_READ(CHV_POWER_SS0_SIG1);
sig1[1] = I915_READ(CHV_POWER_SS1_SIG1);
sig2[0] = I915_READ(CHV_POWER_SS0_SIG2);
sig2[1] = I915_READ(CHV_POWER_SS1_SIG2);
for (ss = 0; ss < ss_max; ss++) {
unsigned int eu_cnt;
if (sig1[ss] & CHV_SS_PG_ENABLE)
/* skip disabled subslice */
continue;
sseu->slice_mask = BIT(0);
sseu->subslice_mask[0] |= BIT(ss);
eu_cnt = ((sig1[ss] & CHV_EU08_PG_ENABLE) ? 0 : 2) +
((sig1[ss] & CHV_EU19_PG_ENABLE) ? 0 : 2) +
((sig1[ss] & CHV_EU210_PG_ENABLE) ? 0 : 2) +
((sig2[ss] & CHV_EU311_PG_ENABLE) ? 0 : 2);
sseu->eu_total += eu_cnt;
sseu->eu_per_subslice = max_t(unsigned int,
sseu->eu_per_subslice, eu_cnt);
}
#undef SS_MAX
}
static void gen10_sseu_device_status(struct drm_i915_private *dev_priv,
struct sseu_dev_info *sseu)
{
#define SS_MAX 6
const struct intel_runtime_info *info = RUNTIME_INFO(dev_priv);
u32 s_reg[SS_MAX], eu_reg[2 * SS_MAX], eu_mask[2];
int s, ss;
for (s = 0; s < info->sseu.max_slices; s++) {
/*
 * FIXME: Valid SS Mask respects the spec and reads only valid bits
 * for those registers, excluding reserved, although this seems wrong
 * because it would leave many subslices without ACK.
 */
s_reg[s] = I915_READ(GEN10_SLICE_PGCTL_ACK(s)) &
GEN10_PGCTL_VALID_SS_MASK(s);
eu_reg[2 * s] = I915_READ(GEN10_SS01_EU_PGCTL_ACK(s));
eu_reg[2 * s + 1] = I915_READ(GEN10_SS23_EU_PGCTL_ACK(s));
}
eu_mask[0] = GEN9_PGCTL_SSA_EU08_ACK |
GEN9_PGCTL_SSA_EU19_ACK |
GEN9_PGCTL_SSA_EU210_ACK |
GEN9_PGCTL_SSA_EU311_ACK;
eu_mask[1] = GEN9_PGCTL_SSB_EU08_ACK |
GEN9_PGCTL_SSB_EU19_ACK |
GEN9_PGCTL_SSB_EU210_ACK |
GEN9_PGCTL_SSB_EU311_ACK;
for (s = 0; s < info->sseu.max_slices; s++) {
if ((s_reg[s] & GEN9_PGCTL_SLICE_ACK) == 0)
/* skip disabled slice */
continue;
sseu->slice_mask |= BIT(s);
intel_sseu_copy_subslices(&info->sseu, s, sseu->subslice_mask);
for (ss = 0; ss < info->sseu.max_subslices; ss++) {
unsigned int eu_cnt;
if (info->sseu.has_subslice_pg &&
!(s_reg[s] & (GEN9_PGCTL_SS_ACK(ss))))
/* skip disabled subslice */
continue;
eu_cnt = 2 * hweight32(eu_reg[2 * s + ss / 2] &
eu_mask[ss % 2]);
sseu->eu_total += eu_cnt;
sseu->eu_per_subslice = max_t(unsigned int,
sseu->eu_per_subslice,
eu_cnt);
}
}
#undef SS_MAX
}
static void gen9_sseu_device_status(struct drm_i915_private *dev_priv,
struct sseu_dev_info *sseu)
{
#define SS_MAX 3
const struct intel_runtime_info *info = RUNTIME_INFO(dev_priv);
u32 s_reg[SS_MAX], eu_reg[2 * SS_MAX], eu_mask[2];
int s, ss;
for (s = 0; s < info->sseu.max_slices; s++) {
s_reg[s] = I915_READ(GEN9_SLICE_PGCTL_ACK(s));
eu_reg[2*s] = I915_READ(GEN9_SS01_EU_PGCTL_ACK(s));
eu_reg[2*s + 1] = I915_READ(GEN9_SS23_EU_PGCTL_ACK(s));
}
eu_mask[0] = GEN9_PGCTL_SSA_EU08_ACK |
GEN9_PGCTL_SSA_EU19_ACK |
GEN9_PGCTL_SSA_EU210_ACK |
GEN9_PGCTL_SSA_EU311_ACK;
eu_mask[1] = GEN9_PGCTL_SSB_EU08_ACK |
GEN9_PGCTL_SSB_EU19_ACK |
GEN9_PGCTL_SSB_EU210_ACK |
GEN9_PGCTL_SSB_EU311_ACK;
for (s = 0; s < info->sseu.max_slices; s++) {
if ((s_reg[s] & GEN9_PGCTL_SLICE_ACK) == 0)
/* skip disabled slice */
continue;
sseu->slice_mask |= BIT(s);
if (IS_GEN9_BC(dev_priv))
intel_sseu_copy_subslices(&info->sseu, s,
sseu->subslice_mask);
for (ss = 0; ss < info->sseu.max_subslices; ss++) {
unsigned int eu_cnt;
u8 ss_idx = s * info->sseu.ss_stride +
ss / BITS_PER_BYTE;
if (IS_GEN9_LP(dev_priv)) {
if (!(s_reg[s] & (GEN9_PGCTL_SS_ACK(ss))))
/* skip disabled subslice */
continue;
sseu->subslice_mask[ss_idx] |=
BIT(ss % BITS_PER_BYTE);
}
eu_cnt = 2 * hweight32(eu_reg[2*s + ss/2] &
eu_mask[ss%2]);
sseu->eu_total += eu_cnt;
sseu->eu_per_subslice = max_t(unsigned int,
sseu->eu_per_subslice,
eu_cnt);
}
}
#undef SS_MAX
}
static void bdw_sseu_device_status(struct drm_i915_private *dev_priv,
struct sseu_dev_info *sseu)
{
const struct intel_runtime_info *info = RUNTIME_INFO(dev_priv);
u32 slice_info = I915_READ(GEN8_GT_SLICE_INFO);
int s;
sseu->slice_mask = slice_info & GEN8_LSLICESTAT_MASK;
if (sseu->slice_mask) {
sseu->eu_per_subslice = info->sseu.eu_per_subslice;
for (s = 0; s < fls(sseu->slice_mask); s++)
intel_sseu_copy_subslices(&info->sseu, s,
sseu->subslice_mask);
sseu->eu_total = sseu->eu_per_subslice *
intel_sseu_subslice_total(sseu);
/* subtract fused off EU(s) from enabled slice(s) */
for (s = 0; s < fls(sseu->slice_mask); s++) {
u8 subslice_7eu = info->sseu.subslice_7eu[s];
sseu->eu_total -= hweight8(subslice_7eu);
}
}
}
static void i915_print_sseu_info(struct seq_file *m, bool is_available_info,
const struct sseu_dev_info *sseu)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
const char *type = is_available_info ? "Available" : "Enabled";
int s;
seq_printf(m, " %s Slice Mask: %04x\n", type,
sseu->slice_mask);
seq_printf(m, " %s Slice Total: %u\n", type,
hweight8(sseu->slice_mask));
seq_printf(m, " %s Subslice Total: %u\n", type,
intel_sseu_subslice_total(sseu));
for (s = 0; s < fls(sseu->slice_mask); s++) {
seq_printf(m, " %s Slice%i subslices: %u\n", type,
s, intel_sseu_subslices_per_slice(sseu, s));
}
seq_printf(m, " %s EU Total: %u\n", type,
sseu->eu_total);
seq_printf(m, " %s EU Per Subslice: %u\n", type,
sseu->eu_per_subslice);
if (!is_available_info)
return;
seq_printf(m, " Has Pooled EU: %s\n", yesno(HAS_POOLED_EU(dev_priv)));
if (HAS_POOLED_EU(dev_priv))
seq_printf(m, " Min EU in pool: %u\n", sseu->min_eu_in_pool);
seq_printf(m, " Has Slice Power Gating: %s\n",
yesno(sseu->has_slice_pg));
seq_printf(m, " Has Subslice Power Gating: %s\n",
yesno(sseu->has_subslice_pg));
seq_printf(m, " Has EU Power Gating: %s\n",
yesno(sseu->has_eu_pg));
}
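/*
 * i915_sseu_status: print the static SSEU topology from the runtime info
 * followed by a live snapshot gathered with the per-platform helpers
 * above (gen8+ only).
 */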
static int i915_sseu_status(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
const struct intel_runtime_info *info = RUNTIME_INFO(dev_priv);
struct sseu_dev_info sseu;
intel_wakeref_t wakeref;
if (INTEL_GEN(dev_priv) < 8)
return -ENODEV;
seq_puts(m, "SSEU Device Info\n");
i915_print_sseu_info(m, true, &info->sseu);
seq_puts(m, "SSEU Device Status\n");
memset(&sseu, 0, sizeof(sseu));
intel_sseu_set_info(&sseu, info->sseu.max_slices,
info->sseu.max_subslices,
info->sseu.max_eus_per_subslice);
with_intel_runtime_pm(&dev_priv->runtime_pm, wakeref) {
if (IS_CHERRYVIEW(dev_priv))
cherryview_sseu_device_status(dev_priv, &sseu);
else if (IS_BROADWELL(dev_priv))
bdw_sseu_device_status(dev_priv, &sseu);
else if (IS_GEN(dev_priv, 9))
gen9_sseu_device_status(dev_priv, &sseu);
else if (INTEL_GEN(dev_priv) >= 10)
gen10_sseu_device_status(dev_priv, &sseu);
}
i915_print_sseu_info(m, false, &sseu);
return 0;
}
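/*
 * i915_forcewake_user: while this file is held open we keep a GT wakeref
 * and (on gen6+) a user forcewake reference so that registers remain
 * awake for manual inspection.
 */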
static int i915_forcewake_open(struct inode *inode, struct file *file)
{
struct drm_i915_private *i915 = inode->i_private;
struct intel_gt *gt = &i915->gt;
atomic_inc(&gt->user_wakeref);
intel_gt_pm_get(gt);
if (INTEL_GEN(i915) >= 6)
intel_uncore_forcewake_user_get(gt->uncore);
return 0;
}
static int i915_forcewake_release(struct inode *inode, struct file *file)
{
struct drm_i915_private *i915 = inode->i_private;
struct intel_gt *gt = &i915->gt;
if (INTEL_GEN(i915) >= 6)
intel_uncore_forcewake_user_put(&i915->uncore);
intel_gt_pm_put(gt);
atomic_dec(&gt->user_wakeref);
return 0;
}
static const struct file_operations i915_forcewake_fops = {
.owner = THIS_MODULE,
.open = i915_forcewake_open,
.release = i915_forcewake_release,
};
static const struct drm_info_list i915_debugfs_list[] = {
{"i915_capabilities", i915_capabilities, 0},
{"i915_gem_objects", i915_gem_object_info, 0},
{"i915_gem_fence_regs", i915_gem_fence_regs_info, 0},
{"i915_gem_interrupt", i915_interrupt_info, 0},
{"i915_frequency_info", i915_frequency_info, 0},
{"i915_ring_freq_table", i915_ring_freq_table, 0},
{"i915_context_status", i915_context_status, 0},
{"i915_swizzle_info", i915_swizzle_info, 0},
{"i915_llc", i915_llc, 0},
{"i915_runtime_pm_status", i915_runtime_pm_status, 0},
{"i915_engine_info", i915_engine_info, 0},
{"i915_rcs_topology", i915_rcs_topology, 0},
{"i915_shrinker_info", i915_shrinker_info, 0},
{"i915_wa_registers", i915_wa_registers, 0},
{"i915_sseu_status", i915_sseu_status, 0},
{"i915_rps_boost_info", i915_rps_boost_info, 0},
};
#define I915_DEBUGFS_ENTRIES ARRAY_SIZE(i915_debugfs_list)
static const struct i915_debugfs_files {
const char *name;
const struct file_operations *fops;
} i915_debugfs_files[] = {
{"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
{"i915_wedged", &i915_wedged_fops},
{"i915_cache_sharing", &i915_cache_sharing_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops},
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
{"i915_error_state", &i915_error_state_fops},
{"i915_gpu_info", &i915_gpu_info_fops},
#endif
};
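/*
 * Register all i915 debugfs entries: the parameter entries, the
 * i915_forcewake_user file, the writable files in i915_debugfs_files[]
 * and the read-only nodes in i915_debugfs_list[].
 */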
int i915_debugfs_register(struct drm_i915_private *dev_priv)
{
struct drm_minor *minor = dev_priv->drm.primary;
int i;
i915_debugfs_params(dev_priv);
debugfs_create_file("i915_forcewake_user", S_IRUSR, minor->debugfs_root,
to_i915(minor->dev), &i915_forcewake_fops);
for (i = 0; i < ARRAY_SIZE(i915_debugfs_files); i++) {
debugfs_create_file(i915_debugfs_files[i].name,
S_IRUGO | S_IWUSR,
minor->debugfs_root,
to_i915(minor->dev),
i915_debugfs_files[i].fops);
}
return drm_debugfs_create_files(i915_debugfs_list,
I915_DEBUGFS_ENTRIES,
minor->debugfs_root, minor);
}