linux_dsm_epyc7002/drivers/gpu/drm/i915/display/intel_overlay.c

1489 lines
37 KiB
C
Raw Normal View History

/*
* Copyright © 2009
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors:
* Daniel Vetter <daniel@ffwll.ch>
*
* Derived from Xorg ddx, xf86-video-intel, src/i830_video.c
*/
drm: Split out drm_probe_helper.h Having the probe helper stuff (which pretty much everyone needs) in the drm_crtc_helper.h file (which atomic drivers should never need) is confusing. Split them out. To make sure I actually achieved the goal here I went through all drivers. And indeed, all atomic drivers are now free of drm_crtc_helper.h includes. v2: Make it compile. There was so much compile fail on arm drivers that I figured I'll better not include any of the acks on v1. v3: Massive rebase because i915 has lost a lot of drmP.h includes, but not all: Through drm_crtc_helper.h > drm_modeset_helper.h -> drmP.h there was still one, which this patch largely removes. Which means rolling out lots more includes all over. This will also conflict with ongoing drmP.h cleanup by others I expect. v3: Rebase on top of atomic bochs. v4: Review from Laurent for bridge/rcar/omap/shmob/core bits: - (re)move some of the added includes, use the better include files in other places (all suggested from Laurent adopted unchanged). - sort alphabetically v5: Actually try to sort them, and while at it, sort all the ones I touch. v6: Rebase onto i915 changes. v7: Rebase once more. Acked-by: Harry Wentland <harry.wentland@amd.com> Acked-by: Sam Ravnborg <sam@ravnborg.org> Cc: Sam Ravnborg <sam@ravnborg.org> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com> Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com> Acked-by: Benjamin Gaignard <benjamin.gaignard@linaro.org> Acked-by: Jani Nikula <jani.nikula@intel.com> Acked-by: Neil Armstrong <narmstrong@baylibre.com> Acked-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> Acked-by: CK Hu <ck.hu@mediatek.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Acked-by: Sam Ravnborg <sam@ravnborg.org> Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com> Acked-by: Liviu Dudau <liviu.dudau@arm.com> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Cc: linux-arm-kernel@lists.infradead.org Cc: virtualization@lists.linux-foundation.org Cc: etnaviv@lists.freedesktop.org Cc: linux-samsung-soc@vger.kernel.org Cc: intel-gfx@lists.freedesktop.org Cc: linux-mediatek@lists.infradead.org Cc: linux-amlogic@lists.infradead.org Cc: linux-arm-msm@vger.kernel.org Cc: freedreno@lists.freedesktop.org Cc: nouveau@lists.freedesktop.org Cc: spice-devel@lists.freedesktop.org Cc: amd-gfx@lists.freedesktop.org Cc: linux-renesas-soc@vger.kernel.org Cc: linux-rockchip@lists.infradead.org Cc: linux-stm32@st-md-mailman.stormreply.com Cc: linux-tegra@vger.kernel.org Cc: xen-devel@lists.xen.org Link: https://patchwork.freedesktop.org/patch/msgid/20190117210334.13234-1-daniel.vetter@ffwll.ch
2019-01-18 04:03:34 +07:00
#include <drm/drm_fourcc.h>
#include "gem/i915_gem_pm.h"
#include "gt/intel_ring.h"
#include "i915_drv.h"
#include "i915_reg.h"
#include "intel_display_types.h"
#include "intel_frontbuffer.h"
#include "intel_overlay.h"
/* Limits for overlay size. According to intel doc, the real limits are:
* Y width: 4095, UV width (planar): 2047, Y height: 2047,
* UV width (planar): * 1023. But the xorg thinks 2048 for height and width. Use
* the mininum of both. */
#define IMAGE_MAX_WIDTH 2048
#define IMAGE_MAX_HEIGHT 2046 /* 2 * 1023 */
/* on 830 and 845 these large limits result in the card hanging */
#define IMAGE_MAX_WIDTH_LEGACY 1024
#define IMAGE_MAX_HEIGHT_LEGACY 1088
/* overlay register definitions */
/* OCMD register */
#define OCMD_TILED_SURFACE (0x1<<19)
#define OCMD_MIRROR_MASK (0x3<<17)
#define OCMD_MIRROR_MODE (0x3<<17)
#define OCMD_MIRROR_HORIZONTAL (0x1<<17)
#define OCMD_MIRROR_VERTICAL (0x2<<17)
#define OCMD_MIRROR_BOTH (0x3<<17)
#define OCMD_BYTEORDER_MASK (0x3<<14) /* zero for YUYV or FOURCC YUY2 */
#define OCMD_UV_SWAP (0x1<<14) /* YVYU */
#define OCMD_Y_SWAP (0x2<<14) /* UYVY or FOURCC UYVY */
#define OCMD_Y_AND_UV_SWAP (0x3<<14) /* VYUY */
#define OCMD_SOURCE_FORMAT_MASK (0xf<<10)
#define OCMD_RGB_888 (0x1<<10) /* not in i965 Intel docs */
#define OCMD_RGB_555 (0x2<<10) /* not in i965 Intel docs */
#define OCMD_RGB_565 (0x3<<10) /* not in i965 Intel docs */
#define OCMD_YUV_422_PACKED (0x8<<10)
#define OCMD_YUV_411_PACKED (0x9<<10) /* not in i965 Intel docs */
#define OCMD_YUV_420_PLANAR (0xc<<10)
#define OCMD_YUV_422_PLANAR (0xd<<10)
#define OCMD_YUV_410_PLANAR (0xe<<10) /* also 411 */
#define OCMD_TVSYNCFLIP_PARITY (0x1<<9)
#define OCMD_TVSYNCFLIP_ENABLE (0x1<<7)
#define OCMD_BUF_TYPE_MASK (0x1<<5)
#define OCMD_BUF_TYPE_FRAME (0x0<<5)
#define OCMD_BUF_TYPE_FIELD (0x1<<5)
#define OCMD_TEST_MODE (0x1<<4)
#define OCMD_BUFFER_SELECT (0x3<<2)
#define OCMD_BUFFER0 (0x0<<2)
#define OCMD_BUFFER1 (0x1<<2)
#define OCMD_FIELD_SELECT (0x1<<2)
#define OCMD_FIELD0 (0x0<<1)
#define OCMD_FIELD1 (0x1<<1)
#define OCMD_ENABLE (0x1<<0)
/* OCONFIG register */
#define OCONF_PIPE_MASK (0x1<<18)
#define OCONF_PIPE_A (0x0<<18)
#define OCONF_PIPE_B (0x1<<18)
#define OCONF_GAMMA2_ENABLE (0x1<<16)
#define OCONF_CSC_MODE_BT601 (0x0<<5)
#define OCONF_CSC_MODE_BT709 (0x1<<5)
#define OCONF_CSC_BYPASS (0x1<<4)
#define OCONF_CC_OUT_8BIT (0x1<<3)
#define OCONF_TEST_MODE (0x1<<2)
#define OCONF_THREE_LINE_BUFFER (0x1<<0)
#define OCONF_TWO_LINE_BUFFER (0x0<<0)
/* DCLRKM (dst-key) register */
#define DST_KEY_ENABLE (0x1<<31)
#define CLK_RGB24_MASK 0x0
#define CLK_RGB16_MASK 0x070307
#define CLK_RGB15_MASK 0x070707
#define CLK_RGB8I_MASK 0xffffff
#define RGB16_TO_COLORKEY(c) \
(((c & 0xF800) << 8) | ((c & 0x07E0) << 5) | ((c & 0x001F) << 3))
#define RGB15_TO_COLORKEY(c) \
(((c & 0x7c00) << 9) | ((c & 0x03E0) << 6) | ((c & 0x001F) << 3))
/* overlay flip addr flag */
#define OFC_UPDATE 0x1
/* polyphase filter coefficients */
#define N_HORIZ_Y_TAPS 5
#define N_VERT_Y_TAPS 3
#define N_HORIZ_UV_TAPS 3
#define N_VERT_UV_TAPS 3
#define N_PHASES 17
#define MAX_TAPS 5
/* memory bufferd overlay registers */
struct overlay_registers {
u32 OBUF_0Y;
u32 OBUF_1Y;
u32 OBUF_0U;
u32 OBUF_0V;
u32 OBUF_1U;
u32 OBUF_1V;
u32 OSTRIDE;
u32 YRGB_VPH;
u32 UV_VPH;
u32 HORZ_PH;
u32 INIT_PHS;
u32 DWINPOS;
u32 DWINSZ;
u32 SWIDTH;
u32 SWIDTHSW;
u32 SHEIGHT;
u32 YRGBSCALE;
u32 UVSCALE;
u32 OCLRC0;
u32 OCLRC1;
u32 DCLRKV;
u32 DCLRKM;
u32 SCLRKVH;
u32 SCLRKVL;
u32 SCLRKEN;
u32 OCONFIG;
u32 OCMD;
u32 RESERVED1; /* 0x6C */
u32 OSTART_0Y;
u32 OSTART_1Y;
u32 OSTART_0U;
u32 OSTART_0V;
u32 OSTART_1U;
u32 OSTART_1V;
u32 OTILEOFF_0Y;
u32 OTILEOFF_1Y;
u32 OTILEOFF_0U;
u32 OTILEOFF_0V;
u32 OTILEOFF_1U;
u32 OTILEOFF_1V;
u32 FASTHSCALE; /* 0xA0 */
u32 UVSCALEV; /* 0xA4 */
u32 RESERVEDC[(0x200 - 0xA8) / 4]; /* 0xA8 - 0x1FC */
u16 Y_VCOEFS[N_VERT_Y_TAPS * N_PHASES]; /* 0x200 */
u16 RESERVEDD[0x100 / 2 - N_VERT_Y_TAPS * N_PHASES];
u16 Y_HCOEFS[N_HORIZ_Y_TAPS * N_PHASES]; /* 0x300 */
u16 RESERVEDE[0x200 / 2 - N_HORIZ_Y_TAPS * N_PHASES];
u16 UV_VCOEFS[N_VERT_UV_TAPS * N_PHASES]; /* 0x500 */
u16 RESERVEDF[0x100 / 2 - N_VERT_UV_TAPS * N_PHASES];
u16 UV_HCOEFS[N_HORIZ_UV_TAPS * N_PHASES]; /* 0x600 */
u16 RESERVEDG[0x100 / 2 - N_HORIZ_UV_TAPS * N_PHASES];
};
struct intel_overlay {
struct drm_i915_private *i915;
struct intel_context *context;
struct intel_crtc *crtc;
struct i915_vma *vma;
struct i915_vma *old_vma;
bool active;
bool pfit_active;
u32 pfit_vscale_ratio; /* shifted-point number, (1<<12) == 1.0 */
u32 color_key:24;
u32 color_key_enabled:1;
u32 brightness, contrast, saturation;
u32 old_xscale, old_yscale;
/* register access */
struct drm_i915_gem_object *reg_bo;
struct overlay_registers __iomem *regs;
u32 flip_addr;
/* flip handling */
struct i915_active last_flip;
void (*flip_complete)(struct intel_overlay *ovl);
};
static void i830_overlay_clock_gating(struct drm_i915_private *dev_priv,
bool enable)
{
struct pci_dev *pdev = dev_priv->drm.pdev;
u8 val;
/* WA_OVERLAY_CLKGATE:alm */
if (enable)
intel_de_write(dev_priv, DSPCLK_GATE_D, 0);
else
intel_de_write(dev_priv, DSPCLK_GATE_D,
OVRUNIT_CLOCK_GATE_DISABLE);
/* WA_DISABLE_L2CACHE_CLOCK_GATING:alm */
pci_bus_read_config_byte(pdev->bus,
PCI_DEVFN(0, 0), I830_CLOCK_GATE, &val);
if (enable)
val &= ~I830_L2_CACHE_CLOCK_GATE_DISABLE;
else
val |= I830_L2_CACHE_CLOCK_GATE_DISABLE;
pci_bus_write_config_byte(pdev->bus,
PCI_DEVFN(0, 0), I830_CLOCK_GATE, val);
}
static struct i915_request *
alloc_request(struct intel_overlay *overlay, void (*fn)(struct intel_overlay *))
{
struct i915_request *rq;
int err;
overlay->flip_complete = fn;
rq = i915_request_create(overlay->context);
if (IS_ERR(rq))
return rq;
drm/i915: Mark i915_request.timeline as a volatile, rcu pointer The request->timeline is only valid until the request is retired (i.e. before it is completed). Upon retiring the request, the context may be unpinned and freed, and along with it the timeline may be freed. We therefore need to be very careful when chasing rq->timeline that the pointer does not disappear beneath us. The vast majority of users are in a protected context, either during request construction or retirement, where the timeline->mutex is held and the timeline cannot disappear. It is those few off the beaten path (where we access a second timeline) that need extra scrutiny -- to be added in the next patch after first adding the warnings about dangerous access. One complication, where we cannot use the timeline->mutex itself, is during request submission onto hardware (under spinlocks). Here, we want to check on the timeline to finalize the breadcrumb, and so we need to impose a second rule to ensure that the request->timeline is indeed valid. As we are submitting the request, it's context and timeline must be pinned, as it will be used by the hardware. Since it is pinned, we know the request->timeline must still be valid, and we cannot submit the idle barrier until after we release the engine->active.lock, ergo while submitting and holding that spinlock, a second thread cannot release the timeline. v2: Don't be lazy inside selftests; hold the timeline->mutex for as long as we need it, and tidy up acquiring the timeline with a bit of refactoring (i915_active_add_request) Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190919111912.21631-1-chris@chris-wilson.co.uk
2019-09-19 18:19:10 +07:00
err = i915_active_add_request(&overlay->last_flip, rq);
if (err) {
i915_request_add(rq);
return ERR_PTR(err);
}
return rq;
}
/* overlay needs to be disable in OCMD reg */
static int intel_overlay_on(struct intel_overlay *overlay)
{
struct drm_i915_private *dev_priv = overlay->i915;
struct i915_request *rq;
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
u32 *cs;
drm_WARN_ON(&dev_priv->drm, overlay->active);
rq = alloc_request(overlay, NULL);
if (IS_ERR(rq))
return PTR_ERR(rq);
cs = intel_ring_begin(rq, 4);
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
if (IS_ERR(cs)) {
i915_request_add(rq);
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
return PTR_ERR(cs);
}
overlay->active = true;
if (IS_I830(dev_priv))
i830_overlay_clock_gating(dev_priv, false);
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
*cs++ = MI_OVERLAY_FLIP | MI_OVERLAY_ON;
*cs++ = overlay->flip_addr | OFC_UPDATE;
*cs++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
*cs++ = MI_NOOP;
intel_ring_advance(rq, cs);
i915_request_add(rq);
return i915_active_wait(&overlay->last_flip);
}
static void intel_overlay_flip_prepare(struct intel_overlay *overlay,
struct i915_vma *vma)
{
enum pipe pipe = overlay->crtc->pipe;
struct intel_frontbuffer *from = NULL, *to = NULL;
drm_WARN_ON(&overlay->i915->drm, overlay->old_vma);
if (overlay->vma)
from = intel_frontbuffer_get(overlay->vma->obj);
if (vma)
to = intel_frontbuffer_get(vma->obj);
intel_frontbuffer_track(from, to, INTEL_FRONTBUFFER_OVERLAY(pipe));
if (to)
intel_frontbuffer_put(to);
if (from)
intel_frontbuffer_put(from);
intel_frontbuffer_flip_prepare(overlay->i915,
INTEL_FRONTBUFFER_OVERLAY(pipe));
overlay->old_vma = overlay->vma;
if (vma)
overlay->vma = i915_vma_get(vma);
else
overlay->vma = NULL;
}
/* overlay needs to be enabled in OCMD reg */
static int intel_overlay_continue(struct intel_overlay *overlay,
struct i915_vma *vma,
bool load_polyphase_filter)
{
struct drm_i915_private *dev_priv = overlay->i915;
struct i915_request *rq;
u32 flip_addr = overlay->flip_addr;
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
u32 tmp, *cs;
drm_WARN_ON(&dev_priv->drm, !overlay->active);
if (load_polyphase_filter)
flip_addr |= OFC_UPDATE;
/* check for underruns */
tmp = intel_de_read(dev_priv, DOVSTA);
if (tmp & (1 << 17))
drm_dbg(&dev_priv->drm, "overlay underrun, DOVSTA: %x\n", tmp);
rq = alloc_request(overlay, NULL);
if (IS_ERR(rq))
return PTR_ERR(rq);
cs = intel_ring_begin(rq, 2);
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
if (IS_ERR(cs)) {
i915_request_add(rq);
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
return PTR_ERR(cs);
}
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
*cs++ = MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE;
*cs++ = flip_addr;
intel_ring_advance(rq, cs);
intel_overlay_flip_prepare(overlay, vma);
i915_request_add(rq);
drm/i915: i915_add_request must not fail The i915_add_request() function is called to keep track of work that has been written to the ring buffer. It adds epilogue commands to track progress (seqno updates and such), moves the request structure onto the right list and other such house keeping tasks. However, the work itself has already been written to the ring and will get executed whether or not the add request call succeeds. So no matter what goes wrong, there isn't a whole lot of point in failing the call. At the moment, this is fine(ish). If the add request does bail early on and not do the housekeeping, the request will still float around in the ring->outstanding_lazy_request field and be picked up next time. It means multiple pieces of work will be tagged as the same request and driver can't actually wait for the first piece of work until something else has been submitted. But it all sort of hangs together. This patch series is all about removing the OLR and guaranteeing that each piece of work gets its own personal request. That means that there is no more 'hoovering up of forgotten requests'. If the request does not get tracked then it will be leaked. Thus the add request call _must_ not fail. The previous patch should have already ensured that it _will_ not fail by removing the potential for running out of ring space. This patch enforces the rule by actually removing the early exit paths and the return code. Note that if something does manage to fail and the epilogue commands don't get written to the ring, the driver will still hang together. The request will be added to the tracking lists. And as in the old case, any subsequent work will generate a new seqno which will suffice for marking the old one as complete. v2: Improved WARNings (Tomas Elf review request). For: VIZ-5115 Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Tomas Elf <tomas.elf@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-05-29 23:43:24 +07:00
return 0;
}
static void intel_overlay_release_old_vma(struct intel_overlay *overlay)
{
struct i915_vma *vma;
vma = fetch_and_zero(&overlay->old_vma);
if (drm_WARN_ON(&overlay->i915->drm, !vma))
return;
intel_frontbuffer_flip_complete(overlay->i915,
INTEL_FRONTBUFFER_OVERLAY(overlay->crtc->pipe));
i915_gem_object_unpin_from_display_plane(vma);
i915_vma_put(vma);
}
static void
intel_overlay_release_old_vid_tail(struct intel_overlay *overlay)
{
intel_overlay_release_old_vma(overlay);
}
static void intel_overlay_off_tail(struct intel_overlay *overlay)
{
struct drm_i915_private *dev_priv = overlay->i915;
intel_overlay_release_old_vma(overlay);
overlay->crtc->overlay = NULL;
overlay->crtc = NULL;
overlay->active = false;
if (IS_I830(dev_priv))
i830_overlay_clock_gating(dev_priv, true);
}
static void
intel_overlay_last_flip_retire(struct i915_active *active)
{
struct intel_overlay *overlay =
container_of(active, typeof(*overlay), last_flip);
if (overlay->flip_complete)
overlay->flip_complete(overlay);
}
/* overlay needs to be disabled in OCMD reg */
static int intel_overlay_off(struct intel_overlay *overlay)
{
struct i915_request *rq;
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
u32 *cs, flip_addr = overlay->flip_addr;
drm_WARN_ON(&overlay->i915->drm, !overlay->active);
/* According to intel docs the overlay hw may hang (when switching
* off) without loading the filter coeffs. It is however unclear whether
* this applies to the disabling of the overlay or to the switching off
* of the hw. Do it in both cases */
flip_addr |= OFC_UPDATE;
rq = alloc_request(overlay, intel_overlay_off_tail);
if (IS_ERR(rq))
return PTR_ERR(rq);
cs = intel_ring_begin(rq, 6);
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
if (IS_ERR(cs)) {
i915_request_add(rq);
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
return PTR_ERR(cs);
}
/* wait for overlay to go idle */
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
*cs++ = MI_OVERLAY_FLIP | MI_OVERLAY_CONTINUE;
*cs++ = flip_addr;
*cs++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
/* turn overlay off */
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
*cs++ = MI_OVERLAY_FLIP | MI_OVERLAY_OFF;
*cs++ = flip_addr;
*cs++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
intel_ring_advance(rq, cs);
intel_overlay_flip_prepare(overlay, NULL);
i915_request_add(rq);
return i915_active_wait(&overlay->last_flip);
}
/* recover from an interruption due to a signal
* We have to be careful not to repeat work forever an make forward progess. */
static int intel_overlay_recover_from_interrupt(struct intel_overlay *overlay)
{
return i915_active_wait(&overlay->last_flip);
}
/* Wait for pending overlay flip and release old frame.
* Needs to be called before the overlay register are changed
* via intel_overlay_(un)map_regs
*/
static int intel_overlay_release_old_vid(struct intel_overlay *overlay)
{
struct drm_i915_private *dev_priv = overlay->i915;
struct i915_request *rq;
drm/i915: Emit to ringbuffer directly This removes the usage of intel_ring_emit in favour of directly writing to the ring buffer. intel_ring_emit was preventing the compiler for optimising fetch and increment of the current ring buffer pointer and therefore generating very verbose code for every write. It had no useful purpose since all ringbuffer operations are started and ended with intel_ring_begin and intel_ring_advance respectively, with no bail out in the middle possible, so it is fine to increment the tail in intel_ring_begin and let the code manage the pointer itself. Useless instruction removal amounts to approximately two and half kilobytes of saved text on my build. Not sure if this has any measurable performance implications but executing a ton of useless instructions on fast paths cannot be good. v2: * Change return from intel_ring_begin to error pointer by popular demand. * Move tail increment to intel_ring_advance to enable some error checking. v3: * Move tail advance back into intel_ring_begin. * Rebase and tidy. v4: * Complete rebase after a few months since v3. v5: * Remove unecessary cast and fix !debug compile. (Chris Wilson) v6: * Make intel_ring_offset take request as well. * Fix recording of request postfix plus a sprinkle of asserts. (Chris Wilson) v7: * Use intel_ring_offset to get the postfix. (Chris Wilson) * Convert GVT code as well. v8: * Rename *out++ to *cs++. v9: * Fix GVT out to cs conversion in GVT. v10: * Rebase for new intel_ring_begin in selftests. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Zhi Wang <zhi.a.wang@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
2017-02-14 18:32:42 +07:00
u32 *cs;
/*
* Only wait if there is actually an old frame to release to
* guarantee forward progress.
*/
if (!overlay->old_vma)
return 0;
if (!(intel_de_read(dev_priv, GEN2_ISR) & I915_OVERLAY_PLANE_FLIP_PENDING_INTERRUPT)) {
intel_overlay_release_old_vid_tail(overlay);
return 0;
}
rq = alloc_request(overlay, intel_overlay_release_old_vid_tail);
if (IS_ERR(rq))
return PTR_ERR(rq);
cs = intel_ring_begin(rq, 2);
if (IS_ERR(cs)) {
i915_request_add(rq);
return PTR_ERR(cs);
}
*cs++ = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_OVERLAY_FLIP;
*cs++ = MI_NOOP;
intel_ring_advance(rq, cs);
i915_request_add(rq);
return i915_active_wait(&overlay->last_flip);
}
void intel_overlay_reset(struct drm_i915_private *dev_priv)
{
struct intel_overlay *overlay = dev_priv->overlay;
if (!overlay)
return;
overlay->old_xscale = 0;
overlay->old_yscale = 0;
overlay->crtc = NULL;
overlay->active = false;
}
static int packed_depth_bytes(u32 format)
{
switch (format & I915_OVERLAY_DEPTH_MASK) {
case I915_OVERLAY_YUV422:
return 4;
case I915_OVERLAY_YUV411:
/* return 6; not implemented */
default:
return -EINVAL;
}
}
static int packed_width_bytes(u32 format, short width)
{
switch (format & I915_OVERLAY_DEPTH_MASK) {
case I915_OVERLAY_YUV422:
return width << 1;
default:
return -EINVAL;
}
}
static int uv_hsubsampling(u32 format)
{
switch (format & I915_OVERLAY_DEPTH_MASK) {
case I915_OVERLAY_YUV422:
case I915_OVERLAY_YUV420:
return 2;
case I915_OVERLAY_YUV411:
case I915_OVERLAY_YUV410:
return 4;
default:
return -EINVAL;
}
}
static int uv_vsubsampling(u32 format)
{
switch (format & I915_OVERLAY_DEPTH_MASK) {
case I915_OVERLAY_YUV420:
case I915_OVERLAY_YUV410:
return 2;
case I915_OVERLAY_YUV422:
case I915_OVERLAY_YUV411:
return 1;
default:
return -EINVAL;
}
}
static u32 calc_swidthsw(struct drm_i915_private *dev_priv, u32 offset, u32 width)
{
u32 sw;
if (IS_GEN(dev_priv, 2))
sw = ALIGN((offset & 31) + width, 32);
else
sw = ALIGN((offset & 63) + width, 64);
if (sw == 0)
return 0;
return (sw - 32) >> 3;
}
static const u16 y_static_hcoeffs[N_PHASES][N_HORIZ_Y_TAPS] = {
[ 0] = { 0x3000, 0xb4a0, 0x1930, 0x1920, 0xb4a0, },
[ 1] = { 0x3000, 0xb500, 0x19d0, 0x1880, 0xb440, },
[ 2] = { 0x3000, 0xb540, 0x1a88, 0x2f80, 0xb3e0, },
[ 3] = { 0x3000, 0xb580, 0x1b30, 0x2e20, 0xb380, },
[ 4] = { 0x3000, 0xb5c0, 0x1bd8, 0x2cc0, 0xb320, },
[ 5] = { 0x3020, 0xb5e0, 0x1c60, 0x2b80, 0xb2c0, },
[ 6] = { 0x3020, 0xb5e0, 0x1cf8, 0x2a20, 0xb260, },
[ 7] = { 0x3020, 0xb5e0, 0x1d80, 0x28e0, 0xb200, },
[ 8] = { 0x3020, 0xb5c0, 0x1e08, 0x3f40, 0xb1c0, },
[ 9] = { 0x3020, 0xb580, 0x1e78, 0x3ce0, 0xb160, },
[10] = { 0x3040, 0xb520, 0x1ed8, 0x3aa0, 0xb120, },
[11] = { 0x3040, 0xb4a0, 0x1f30, 0x3880, 0xb0e0, },
[12] = { 0x3040, 0xb400, 0x1f78, 0x3680, 0xb0a0, },
[13] = { 0x3020, 0xb340, 0x1fb8, 0x34a0, 0xb060, },
[14] = { 0x3020, 0xb240, 0x1fe0, 0x32e0, 0xb040, },
[15] = { 0x3020, 0xb140, 0x1ff8, 0x3160, 0xb020, },
[16] = { 0xb000, 0x3000, 0x0800, 0x3000, 0xb000, },
};
static const u16 uv_static_hcoeffs[N_PHASES][N_HORIZ_UV_TAPS] = {
[ 0] = { 0x3000, 0x1800, 0x1800, },
[ 1] = { 0xb000, 0x18d0, 0x2e60, },
[ 2] = { 0xb000, 0x1990, 0x2ce0, },
[ 3] = { 0xb020, 0x1a68, 0x2b40, },
[ 4] = { 0xb040, 0x1b20, 0x29e0, },
[ 5] = { 0xb060, 0x1bd8, 0x2880, },
[ 6] = { 0xb080, 0x1c88, 0x3e60, },
[ 7] = { 0xb0a0, 0x1d28, 0x3c00, },
[ 8] = { 0xb0c0, 0x1db8, 0x39e0, },
[ 9] = { 0xb0e0, 0x1e40, 0x37e0, },
[10] = { 0xb100, 0x1eb8, 0x3620, },
[11] = { 0xb100, 0x1f18, 0x34a0, },
[12] = { 0xb100, 0x1f68, 0x3360, },
[13] = { 0xb0e0, 0x1fa8, 0x3240, },
[14] = { 0xb0c0, 0x1fe0, 0x3140, },
[15] = { 0xb060, 0x1ff0, 0x30a0, },
[16] = { 0x3000, 0x0800, 0x3000, },
};
static void update_polyphase_filter(struct overlay_registers __iomem *regs)
{
memcpy_toio(regs->Y_HCOEFS, y_static_hcoeffs, sizeof(y_static_hcoeffs));
memcpy_toio(regs->UV_HCOEFS, uv_static_hcoeffs,
sizeof(uv_static_hcoeffs));
}
static bool update_scaling_factors(struct intel_overlay *overlay,
struct overlay_registers __iomem *regs,
struct drm_intel_overlay_put_image *params)
{
/* fixed point with a 12 bit shift */
u32 xscale, yscale, xscale_UV, yscale_UV;
#define FP_SHIFT 12
#define FRACT_MASK 0xfff
bool scale_changed = false;
int uv_hscale = uv_hsubsampling(params->flags);
int uv_vscale = uv_vsubsampling(params->flags);
if (params->dst_width > 1)
xscale = ((params->src_scan_width - 1) << FP_SHIFT) /
params->dst_width;
else
xscale = 1 << FP_SHIFT;
if (params->dst_height > 1)
yscale = ((params->src_scan_height - 1) << FP_SHIFT) /
params->dst_height;
else
yscale = 1 << FP_SHIFT;
/*if (params->format & I915_OVERLAY_YUV_PLANAR) {*/
xscale_UV = xscale/uv_hscale;
yscale_UV = yscale/uv_vscale;
/* make the Y scale to UV scale ratio an exact multiply */
xscale = xscale_UV * uv_hscale;
yscale = yscale_UV * uv_vscale;
/*} else {
xscale_UV = 0;
yscale_UV = 0;
}*/
if (xscale != overlay->old_xscale || yscale != overlay->old_yscale)
scale_changed = true;
overlay->old_xscale = xscale;
overlay->old_yscale = yscale;
iowrite32(((yscale & FRACT_MASK) << 20) |
((xscale >> FP_SHIFT) << 16) |
((xscale & FRACT_MASK) << 3),
&regs->YRGBSCALE);
iowrite32(((yscale_UV & FRACT_MASK) << 20) |
((xscale_UV >> FP_SHIFT) << 16) |
((xscale_UV & FRACT_MASK) << 3),
&regs->UVSCALE);
iowrite32((((yscale >> FP_SHIFT) << 16) |
((yscale_UV >> FP_SHIFT) << 0)),
&regs->UVSCALEV);
if (scale_changed)
update_polyphase_filter(regs);
return scale_changed;
}
static void update_colorkey(struct intel_overlay *overlay,
struct overlay_registers __iomem *regs)
{
const struct intel_plane_state *state =
to_intel_plane_state(overlay->crtc->base.primary->state);
u32 key = overlay->color_key;
u32 format = 0;
u32 flags = 0;
if (overlay->color_key_enabled)
flags |= DST_KEY_ENABLE;
if (state->uapi.visible)
format = state->hw.fb->format->format;
switch (format) {
case DRM_FORMAT_C8:
key = 0;
flags |= CLK_RGB8I_MASK;
break;
case DRM_FORMAT_XRGB1555:
key = RGB15_TO_COLORKEY(key);
flags |= CLK_RGB15_MASK;
break;
case DRM_FORMAT_RGB565:
key = RGB16_TO_COLORKEY(key);
flags |= CLK_RGB16_MASK;
break;
default:
flags |= CLK_RGB24_MASK;
break;
}
iowrite32(key, &regs->DCLRKV);
iowrite32(flags, &regs->DCLRKM);
}
static u32 overlay_cmd_reg(struct drm_intel_overlay_put_image *params)
{
u32 cmd = OCMD_ENABLE | OCMD_BUF_TYPE_FRAME | OCMD_BUFFER0;
if (params->flags & I915_OVERLAY_YUV_PLANAR) {
switch (params->flags & I915_OVERLAY_DEPTH_MASK) {
case I915_OVERLAY_YUV422:
cmd |= OCMD_YUV_422_PLANAR;
break;
case I915_OVERLAY_YUV420:
cmd |= OCMD_YUV_420_PLANAR;
break;
case I915_OVERLAY_YUV411:
case I915_OVERLAY_YUV410:
cmd |= OCMD_YUV_410_PLANAR;
break;
}
} else { /* YUV packed */
switch (params->flags & I915_OVERLAY_DEPTH_MASK) {
case I915_OVERLAY_YUV422:
cmd |= OCMD_YUV_422_PACKED;
break;
case I915_OVERLAY_YUV411:
cmd |= OCMD_YUV_411_PACKED;
break;
}
switch (params->flags & I915_OVERLAY_SWAP_MASK) {
case I915_OVERLAY_NO_SWAP:
break;
case I915_OVERLAY_UV_SWAP:
cmd |= OCMD_UV_SWAP;
break;
case I915_OVERLAY_Y_SWAP:
cmd |= OCMD_Y_SWAP;
break;
case I915_OVERLAY_Y_AND_UV_SWAP:
cmd |= OCMD_Y_AND_UV_SWAP;
break;
}
}
return cmd;
}
static int intel_overlay_do_put_image(struct intel_overlay *overlay,
struct drm_i915_gem_object *new_bo,
struct drm_intel_overlay_put_image *params)
{
struct overlay_registers __iomem *regs = overlay->regs;
struct drm_i915_private *dev_priv = overlay->i915;
u32 swidth, swidthsw, sheight, ostride;
drm/i915: Introduce accurate frontbuffer tracking So from just a quick look we seem to have enough information to accurately figure out whether a given gem bo is used as a frontbuffer and where exactly: We have obj->pin_count as a first check with no false negatives and only negligible false positives. And then we can just walk the modeset objects and figure out where exactly a buffer is used as scanout. Except that we can't due to locking order: If we already hold dev->struct_mutex we can't acquire any modeset locks, so could potential chase freed pointers and other evil stuff. So we need something else. For that introduce a new set of bits obj->frontbuffer_bits to track where a buffer object is used. That we can then chase without grabbing any modeset locks. Of course the consumers of this (DRRS, PSR, FBC, ...) still need to be able to do their magic both when called from modeset and from gem code. But that can be easily achieved by adding locks for these specific subsystems which always nest within either kms or gem locking. This patch just adds the relevant update code to all places. Note that if we ever support multi-planar scanout targets then we need one frontbuffer tracking bit per attachment point that we expose to userspace. v2: - Fix more oopsen. Oops. - WARN if we leak obj->frontbuffer_bits when freeing a gem buffer. Fix the bugs this brought to light. - s/update_frontbuffer_bits/update_fb_bits/. More consistent with the fb tracking functions (fb for gem object, frontbuffer for raw bits). And the function name was way too long. v3: Size obj->frontbuffer_bits correctly so that all pipes fit in. v4: Don't update fb bits in set_base on failure. Noticed by Chris. v5: s/i915_gem_update_fb_bits/i915_gem_track_fb/ Also remove a few local enum pipe variables which are now no longer needed to make the function arguments no drop over the 80 char limit. Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-19 04:28:09 +07:00
enum pipe pipe = overlay->crtc->pipe;
bool scale_changed = false;
struct i915_vma *vma;
int ret, tmp_width;
drm_WARN_ON(&dev_priv->drm,
!drm_modeset_is_locked(&dev_priv->drm.mode_config.connection_mutex));
ret = intel_overlay_release_old_vid(overlay);
if (ret != 0)
return ret;
drm/i915: More surgically unbreak the modeset vs reset deadlock There's no reason to entirely wedge the gpu, for the minimal deadlock bugfix we only need to unbreak/decouple the atomic commit from the gpu reset. The simplest way to fix that is by replacing the unconditional fence wait a the top of commit_tail by a wait which completes either when the fences are done (normal case, or when a reset doesn't need to touch the display state). Or when the gpu reset needs to force-unblock all pending modeset states. The lesser source of deadlocks is when we try to pin a new framebuffer and run into a stall. There's a bunch of places this can happen, like eviction, changing the caching mode, acquiring a fence on older platforms. And we can't just break the depency loop and keep going, the only way would be to break out and restart. But the problem with that approach is that we must stall for the reset to complete before we grab any locks, and with the atomic infrastructure that's a bit tricky. The only place is the ioctl code, and we don't want to insert code into e.g. the BUSY ioctl. Hence for that problem just create a critical section, and if any code is in there, wedge the GPU. For the steady-state this should never be a problem. Note that in both cases TDR itself keeps working, so from a userspace pov this trickery isn't observable. Users themselvs might spot a short glitch while the rendering is catching up again, but that's still better than pre-TDR where we've thrown away all the rendering, including innocent batches. Also, this fixes the regression TDR introduced of making gpu resets deadlock-prone when we do need to touch the display. One thing I noticed is that gpu_error.flags seems to use both our own wait-queue in gpu_error.wait_queue, and the generic wait_on_bit facilities. Not entirely sure why this inconsistency exists, I just picked one style. A possible future avenue could be to insert the gpu reset in-between ongoing modeset changes, which would avoid the momentary glitch. But that's a lot more work to implement in the atomic commit machinery, and given that we only need this for pre-g4x hw, of questionable utility just for the sake of polishing gpu reset even more on those old boxes. It might be useful for other features though. v2: Rebase onto 4.13 with a s/wait_queue_t/struct wait_queue_entry/. v3: Really emabarrassing fixup, I checked the wrong bit and broke the unbreak/wakeup logic. v4: Also handle deadlocks in pin_to_display. v5: Review from Michel: - Fixup the BUILD_BUG_ON - Don't forget about the overlay Cc: Michel Thierry <michel.thierry@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v2) Cc: Michel Thierry <michel.thierry@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20170808080828.23650-3-daniel.vetter@ffwll.ch Reviewed-by: Michel Thierry <michel.thierry@intel.com>
2017-08-08 15:08:28 +07:00
atomic_inc(&dev_priv->gpu_error.pending_fb_pin);
drm/i915: Move the policy for placement of the GGTT vma into the caller Currently we make the unilateral decision inside i915_gem_object_pin_to_display() where the VMA should resided (inside the fence and mappable region or above?). This is not our decision to make as it impacts on how the display engine can use the resulting scanout object, and it would rather instruct us where to place the VMA so that it can enable the features it wants. As such, make the pin flags an argument to i915_gem_object_pin_to_display() and control them from intel_pin_and_fence_fb_obj() Whilst taking control of the mapping for ourselves, start tracking how we use it to avoid trying to free a fence we never claimed: <3>[ 227.151869] GEM_BUG_ON(vma->fence->pin_count <= 0) <4>[ 227.152064] ------------[ cut here ]------------ <2>[ 227.152068] kernel BUG at drivers/gpu/drm/i915/i915_vma.h:391! <4>[ 227.152084] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI <0>[ 227.152092] Dumping ftrace buffer: <0>[ 227.152099] (ftrace buffer empty) <4>[ 227.152102] Modules linked in: i915 snd_hda_codec_analog snd_hda_codec_generic coretemp snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm lpc_ich e1000e mei_me mei prime_numbers <4>[ 227.152131] CPU: 1 PID: 1587 Comm: kworker/u16:49 Tainted: G U 4.16.0-rc1-gbab67b2f6177-kasan_7+ #1 <4>[ 227.152134] Hardware name: Dell Inc. OptiPlex 755 /0PU052, BIOS A08 02/19/2008 <4>[ 227.152236] Workqueue: events_unbound intel_atomic_commit_work [i915] <4>[ 227.152292] RIP: 0010:intel_unpin_fb_vma+0x23a/0x2a0 [i915] <4>[ 227.152295] RSP: 0018:ffff88005aad7b68 EFLAGS: 00010286 <4>[ 227.152300] RAX: 0000000000000026 RBX: ffff88005c359580 RCX: 0000000000000000 <4>[ 227.152304] RDX: 0000000000000026 RSI: ffffffff8707d840 RDI: ffffed000b55af63 <4>[ 227.152307] RBP: ffff880056817e58 R08: 0000000000000001 R09: 0000000000000000 <4>[ 227.152311] R10: ffff88005aad7b88 R11: 0000000000000000 R12: ffff8800568184d0 <4>[ 227.152314] R13: ffff880065b5ab08 R14: 0000000000000000 R15: dffffc0000000000 <4>[ 227.152318] FS: 0000000000000000(0000) GS:ffff88006ac40000(0000) knlGS:0000000000000000 <4>[ 227.152322] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4>[ 227.152325] CR2: 00007f5fb25550a8 CR3: 0000000068c78000 CR4: 00000000000006e0 <4>[ 227.152328] Call Trace: <4>[ 227.152385] intel_cleanup_plane_fb+0x6b/0xd0 [i915] <4>[ 227.152395] drm_atomic_helper_cleanup_planes+0x166/0x280 <4>[ 227.152452] intel_atomic_commit_tail+0x159d/0x3380 [i915] <4>[ 227.152463] ? process_one_work+0x66e/0x1460 <4>[ 227.152516] ? skl_update_crtcs+0x9c0/0x9c0 [i915] <4>[ 227.152523] ? lock_acquire+0x13d/0x390 <4>[ 227.152527] ? lock_acquire+0x13d/0x390 <4>[ 227.152534] process_one_work+0x71a/0x1460 <4>[ 227.152540] ? __schedule+0x815/0x1e20 <4>[ 227.152547] ? pwq_dec_nr_in_flight+0x2b0/0x2b0 <4>[ 227.152553] ? _raw_spin_lock_irq+0xa/0x40 <4>[ 227.152559] worker_thread+0xdf/0xf60 <4>[ 227.152569] ? process_one_work+0x1460/0x1460 <4>[ 227.152573] kthread+0x2cf/0x3c0 <4>[ 227.152578] ? _kthread_create_on_node+0xa0/0xa0 <4>[ 227.152583] ret_from_fork+0x3a/0x50 <4>[ 227.152591] Code: c6 00 11 86 c0 48 c7 c7 e0 bd 85 c0 e8 60 e7 a9 c4 0f ff e9 1f fe ff ff 48 c7 c6 40 10 86 c0 48 c7 c7 e0 ca 85 c0 e8 2b 95 bd c4 <0f> 0b 48 89 ef e8 4c 44 e8 c4 e9 ef fd ff ff e8 42 44 e8 c4 e9 <1>[ 227.152720] RIP: intel_unpin_fb_vma+0x23a/0x2a0 [i915] RSP: ffff88005aad7b68 v2: i915_vma_pin_fence() is a no-op if a fence isn't required, so check vma->fence as well. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20180220134208.24988-2-chris@chris-wilson.co.uk
2018-02-20 20:42:06 +07:00
vma = i915_gem_object_pin_to_display_plane(new_bo,
0, NULL, PIN_MAPPABLE);
drm/i915: More surgically unbreak the modeset vs reset deadlock There's no reason to entirely wedge the gpu, for the minimal deadlock bugfix we only need to unbreak/decouple the atomic commit from the gpu reset. The simplest way to fix that is by replacing the unconditional fence wait a the top of commit_tail by a wait which completes either when the fences are done (normal case, or when a reset doesn't need to touch the display state). Or when the gpu reset needs to force-unblock all pending modeset states. The lesser source of deadlocks is when we try to pin a new framebuffer and run into a stall. There's a bunch of places this can happen, like eviction, changing the caching mode, acquiring a fence on older platforms. And we can't just break the depency loop and keep going, the only way would be to break out and restart. But the problem with that approach is that we must stall for the reset to complete before we grab any locks, and with the atomic infrastructure that's a bit tricky. The only place is the ioctl code, and we don't want to insert code into e.g. the BUSY ioctl. Hence for that problem just create a critical section, and if any code is in there, wedge the GPU. For the steady-state this should never be a problem. Note that in both cases TDR itself keeps working, so from a userspace pov this trickery isn't observable. Users themselvs might spot a short glitch while the rendering is catching up again, but that's still better than pre-TDR where we've thrown away all the rendering, including innocent batches. Also, this fixes the regression TDR introduced of making gpu resets deadlock-prone when we do need to touch the display. One thing I noticed is that gpu_error.flags seems to use both our own wait-queue in gpu_error.wait_queue, and the generic wait_on_bit facilities. Not entirely sure why this inconsistency exists, I just picked one style. A possible future avenue could be to insert the gpu reset in-between ongoing modeset changes, which would avoid the momentary glitch. But that's a lot more work to implement in the atomic commit machinery, and given that we only need this for pre-g4x hw, of questionable utility just for the sake of polishing gpu reset even more on those old boxes. It might be useful for other features though. v2: Rebase onto 4.13 with a s/wait_queue_t/struct wait_queue_entry/. v3: Really emabarrassing fixup, I checked the wrong bit and broke the unbreak/wakeup logic. v4: Also handle deadlocks in pin_to_display. v5: Review from Michel: - Fixup the BUILD_BUG_ON - Don't forget about the overlay Cc: Michel Thierry <michel.thierry@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v2) Cc: Michel Thierry <michel.thierry@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20170808080828.23650-3-daniel.vetter@ffwll.ch Reviewed-by: Michel Thierry <michel.thierry@intel.com>
2017-08-08 15:08:28 +07:00
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out_pin_section;
}
i915_gem_object_flush_frontbuffer(new_bo, ORIGIN_DIRTYFB);
if (!overlay->active) {
u32 oconfig;
oconfig = OCONF_CC_OUT_8BIT;
if (IS_GEN(dev_priv, 4))
oconfig |= OCONF_CSC_MODE_BT709;
drm/i915: Introduce accurate frontbuffer tracking So from just a quick look we seem to have enough information to accurately figure out whether a given gem bo is used as a frontbuffer and where exactly: We have obj->pin_count as a first check with no false negatives and only negligible false positives. And then we can just walk the modeset objects and figure out where exactly a buffer is used as scanout. Except that we can't due to locking order: If we already hold dev->struct_mutex we can't acquire any modeset locks, so could potential chase freed pointers and other evil stuff. So we need something else. For that introduce a new set of bits obj->frontbuffer_bits to track where a buffer object is used. That we can then chase without grabbing any modeset locks. Of course the consumers of this (DRRS, PSR, FBC, ...) still need to be able to do their magic both when called from modeset and from gem code. But that can be easily achieved by adding locks for these specific subsystems which always nest within either kms or gem locking. This patch just adds the relevant update code to all places. Note that if we ever support multi-planar scanout targets then we need one frontbuffer tracking bit per attachment point that we expose to userspace. v2: - Fix more oopsen. Oops. - WARN if we leak obj->frontbuffer_bits when freeing a gem buffer. Fix the bugs this brought to light. - s/update_frontbuffer_bits/update_fb_bits/. More consistent with the fb tracking functions (fb for gem object, frontbuffer for raw bits). And the function name was way too long. v3: Size obj->frontbuffer_bits correctly so that all pipes fit in. v4: Don't update fb bits in set_base on failure. Noticed by Chris. v5: s/i915_gem_update_fb_bits/i915_gem_track_fb/ Also remove a few local enum pipe variables which are now no longer needed to make the function arguments no drop over the 80 char limit. Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-06-19 04:28:09 +07:00
oconfig |= pipe == 0 ?
OCONF_PIPE_A : OCONF_PIPE_B;
iowrite32(oconfig, &regs->OCONFIG);
ret = intel_overlay_on(overlay);
if (ret != 0)
goto out_unpin;
}
iowrite32(params->dst_y << 16 | params->dst_x, &regs->DWINPOS);
iowrite32(params->dst_height << 16 | params->dst_width, &regs->DWINSZ);
if (params->flags & I915_OVERLAY_YUV_PACKED)
tmp_width = packed_width_bytes(params->flags,
params->src_width);
else
tmp_width = params->src_width;
swidth = params->src_width;
swidthsw = calc_swidthsw(dev_priv, params->offset_Y, tmp_width);
sheight = params->src_height;
iowrite32(i915_ggtt_offset(vma) + params->offset_Y, &regs->OBUF_0Y);
ostride = params->stride_Y;
if (params->flags & I915_OVERLAY_YUV_PLANAR) {
int uv_hscale = uv_hsubsampling(params->flags);
int uv_vscale = uv_vsubsampling(params->flags);
u32 tmp_U, tmp_V;
swidth |= (params->src_width / uv_hscale) << 16;
sheight |= (params->src_height / uv_vscale) << 16;
tmp_U = calc_swidthsw(dev_priv, params->offset_U,
params->src_width / uv_hscale);
tmp_V = calc_swidthsw(dev_priv, params->offset_V,
params->src_width / uv_hscale);
swidthsw |= max(tmp_U, tmp_V) << 16;
iowrite32(i915_ggtt_offset(vma) + params->offset_U,
&regs->OBUF_0U);
iowrite32(i915_ggtt_offset(vma) + params->offset_V,
&regs->OBUF_0V);
ostride |= params->stride_UV << 16;
}
iowrite32(swidth, &regs->SWIDTH);
iowrite32(swidthsw, &regs->SWIDTHSW);
iowrite32(sheight, &regs->SHEIGHT);
iowrite32(ostride, &regs->OSTRIDE);
scale_changed = update_scaling_factors(overlay, regs, params);
update_colorkey(overlay, regs);
iowrite32(overlay_cmd_reg(params), &regs->OCMD);
ret = intel_overlay_continue(overlay, vma, scale_changed);
if (ret)
goto out_unpin;
return 0;
out_unpin:
i915_gem_object_unpin_from_display_plane(vma);
drm/i915: More surgically unbreak the modeset vs reset deadlock There's no reason to entirely wedge the gpu, for the minimal deadlock bugfix we only need to unbreak/decouple the atomic commit from the gpu reset. The simplest way to fix that is by replacing the unconditional fence wait a the top of commit_tail by a wait which completes either when the fences are done (normal case, or when a reset doesn't need to touch the display state). Or when the gpu reset needs to force-unblock all pending modeset states. The lesser source of deadlocks is when we try to pin a new framebuffer and run into a stall. There's a bunch of places this can happen, like eviction, changing the caching mode, acquiring a fence on older platforms. And we can't just break the depency loop and keep going, the only way would be to break out and restart. But the problem with that approach is that we must stall for the reset to complete before we grab any locks, and with the atomic infrastructure that's a bit tricky. The only place is the ioctl code, and we don't want to insert code into e.g. the BUSY ioctl. Hence for that problem just create a critical section, and if any code is in there, wedge the GPU. For the steady-state this should never be a problem. Note that in both cases TDR itself keeps working, so from a userspace pov this trickery isn't observable. Users themselvs might spot a short glitch while the rendering is catching up again, but that's still better than pre-TDR where we've thrown away all the rendering, including innocent batches. Also, this fixes the regression TDR introduced of making gpu resets deadlock-prone when we do need to touch the display. One thing I noticed is that gpu_error.flags seems to use both our own wait-queue in gpu_error.wait_queue, and the generic wait_on_bit facilities. Not entirely sure why this inconsistency exists, I just picked one style. A possible future avenue could be to insert the gpu reset in-between ongoing modeset changes, which would avoid the momentary glitch. But that's a lot more work to implement in the atomic commit machinery, and given that we only need this for pre-g4x hw, of questionable utility just for the sake of polishing gpu reset even more on those old boxes. It might be useful for other features though. v2: Rebase onto 4.13 with a s/wait_queue_t/struct wait_queue_entry/. v3: Really emabarrassing fixup, I checked the wrong bit and broke the unbreak/wakeup logic. v4: Also handle deadlocks in pin_to_display. v5: Review from Michel: - Fixup the BUILD_BUG_ON - Don't forget about the overlay Cc: Michel Thierry <michel.thierry@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> (v2) Cc: Michel Thierry <michel.thierry@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20170808080828.23650-3-daniel.vetter@ffwll.ch Reviewed-by: Michel Thierry <michel.thierry@intel.com>
2017-08-08 15:08:28 +07:00
out_pin_section:
atomic_dec(&dev_priv->gpu_error.pending_fb_pin);
return ret;
}
int intel_overlay_switch_off(struct intel_overlay *overlay)
{
struct drm_i915_private *dev_priv = overlay->i915;
int ret;
drm_WARN_ON(&dev_priv->drm,
!drm_modeset_is_locked(&dev_priv->drm.mode_config.connection_mutex));
ret = intel_overlay_recover_from_interrupt(overlay);
if (ret != 0)
return ret;
if (!overlay->active)
return 0;
ret = intel_overlay_release_old_vid(overlay);
if (ret != 0)
return ret;
iowrite32(0, &overlay->regs->OCMD);
return intel_overlay_off(overlay);
}
static int check_overlay_possible_on_crtc(struct intel_overlay *overlay,
struct intel_crtc *crtc)
{
if (!crtc->active)
return -EINVAL;
/* can't use the overlay with double wide pipe */
drm/i915: Make intel_crtc->config a pointer To match the semantics of drm_crtc->state, which this will eventually become. The allocation of the memory for config will be fixed in a followup patch. By adding the extra _config field to intel_crtc it was possible to generate this entire patch with the cocci script below. @@ @@ struct intel_crtc { ... -struct intel_crtc_state config; +struct intel_crtc_state _config; +struct intel_crtc_state *config; ... } @@ struct intel_crtc *crtc; @@ -memset(&crtc->config, 0, sizeof(crtc->config)); +memset(crtc->config, 0, sizeof(*crtc->config)); @@ @@ __intel_set_mode(...) { <... -to_intel_crtc(crtc)->config = *pipe_config; +(*(to_intel_crtc(crtc)->config)) = *pipe_config; ...> } @@ @@ intel_crtc_init(...) { ... WARN_ON(drm_crtc_index(&intel_crtc->base) != intel_crtc->pipe); +intel_crtc->config = &intel_crtc->_config; return; ... } @@ struct intel_crtc *crtc; @@ -&crtc->config +crtc->config @@ struct intel_crtc *crtc; identifier member; @@ -crtc->config.member +crtc->config->member @@ expression E; @@ -&(to_intel_crtc(E)->config) +to_intel_crtc(E)->config @@ expression E; identifier member; @@ -to_intel_crtc(E)->config.member +to_intel_crtc(E)->config->member v2: Clarify manual changes by splitting them into another patch. (Matt) Improve cocci script to generate even more of the changes. (Ander) Signed-off-by: Ander Conselvan de Oliveira <ander.conselvan.de.oliveira@intel.com> Reviewed-by: Matt Roper <matthew.d.roper@intel.com> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-01-15 19:55:25 +07:00
if (crtc->config->double_wide)
return -EINVAL;
return 0;
}
static void update_pfit_vscale_ratio(struct intel_overlay *overlay)
{
struct drm_i915_private *dev_priv = overlay->i915;
u32 pfit_control = intel_de_read(dev_priv, PFIT_CONTROL);
u32 ratio;
/* XXX: This is not the same logic as in the xorg driver, but more in
* line with the intel documentation for the i965
*/
if (INTEL_GEN(dev_priv) >= 4) {
/* on i965 use the PGM reg to read out the autoscaler values */
ratio = intel_de_read(dev_priv, PFIT_PGM_RATIOS) >> PFIT_VERT_SCALE_SHIFT_965;
} else {
if (pfit_control & VERT_AUTO_SCALE)
ratio = intel_de_read(dev_priv, PFIT_AUTO_RATIOS);
else
ratio = intel_de_read(dev_priv, PFIT_PGM_RATIOS);
ratio >>= PFIT_VERT_SCALE_SHIFT;
}
overlay->pfit_vscale_ratio = ratio;
}
static int check_overlay_dst(struct intel_overlay *overlay,
struct drm_intel_overlay_put_image *rec)
{
const struct intel_crtc_state *pipe_config =
overlay->crtc->config;
if (rec->dst_x < pipe_config->pipe_src_w &&
rec->dst_x + rec->dst_width <= pipe_config->pipe_src_w &&
rec->dst_y < pipe_config->pipe_src_h &&
rec->dst_y + rec->dst_height <= pipe_config->pipe_src_h)
return 0;
else
return -EINVAL;
}
static int check_overlay_scaling(struct drm_intel_overlay_put_image *rec)
{
u32 tmp;
/* downscaling limit is 8.0 */
tmp = ((rec->src_scan_height << 16) / rec->dst_height) >> 16;
if (tmp > 7)
return -EINVAL;
tmp = ((rec->src_scan_width << 16) / rec->dst_width) >> 16;
if (tmp > 7)
return -EINVAL;
return 0;
}
static int check_overlay_src(struct drm_i915_private *dev_priv,
struct drm_intel_overlay_put_image *rec,
struct drm_i915_gem_object *new_bo)
{
int uv_hscale = uv_hsubsampling(rec->flags);
int uv_vscale = uv_vsubsampling(rec->flags);
u32 stride_mask;
int depth;
u32 tmp;
/* check src dimensions */
if (IS_I845G(dev_priv) || IS_I830(dev_priv)) {
if (rec->src_height > IMAGE_MAX_HEIGHT_LEGACY ||
rec->src_width > IMAGE_MAX_WIDTH_LEGACY)
return -EINVAL;
} else {
if (rec->src_height > IMAGE_MAX_HEIGHT ||
rec->src_width > IMAGE_MAX_WIDTH)
return -EINVAL;
}
/* better safe than sorry, use 4 as the maximal subsampling ratio */
if (rec->src_height < N_VERT_Y_TAPS*4 ||
rec->src_width < N_HORIZ_Y_TAPS*4)
return -EINVAL;
/* check alignment constraints */
switch (rec->flags & I915_OVERLAY_TYPE_MASK) {
case I915_OVERLAY_RGB:
/* not implemented */
return -EINVAL;
case I915_OVERLAY_YUV_PACKED:
if (uv_vscale != 1)
return -EINVAL;
depth = packed_depth_bytes(rec->flags);
if (depth < 0)
return depth;
/* ignore UV planes */
rec->stride_UV = 0;
rec->offset_U = 0;
rec->offset_V = 0;
/* check pixel alignment */
if (rec->offset_Y % depth)
return -EINVAL;
break;
case I915_OVERLAY_YUV_PLANAR:
if (uv_vscale < 0 || uv_hscale < 0)
return -EINVAL;
/* no offset restrictions for planar formats */
break;
default:
return -EINVAL;
}
if (rec->src_width % uv_hscale)
return -EINVAL;
/* stride checking */
if (IS_I830(dev_priv) || IS_I845G(dev_priv))
stride_mask = 255;
else
stride_mask = 63;
if (rec->stride_Y & stride_mask || rec->stride_UV & stride_mask)
return -EINVAL;
if (IS_GEN(dev_priv, 4) && rec->stride_Y < 512)
return -EINVAL;
tmp = (rec->flags & I915_OVERLAY_TYPE_MASK) == I915_OVERLAY_YUV_PLANAR ?
4096 : 8192;
if (rec->stride_Y > tmp || rec->stride_UV > 2*1024)
return -EINVAL;
/* check buffer dimensions */
switch (rec->flags & I915_OVERLAY_TYPE_MASK) {
case I915_OVERLAY_RGB:
case I915_OVERLAY_YUV_PACKED:
/* always 4 Y values per depth pixels */
if (packed_width_bytes(rec->flags, rec->src_width) > rec->stride_Y)
return -EINVAL;
tmp = rec->stride_Y*rec->src_height;
if (rec->offset_Y + tmp > new_bo->base.size)
return -EINVAL;
break;
case I915_OVERLAY_YUV_PLANAR:
if (rec->src_width > rec->stride_Y)
return -EINVAL;
if (rec->src_width/uv_hscale > rec->stride_UV)
return -EINVAL;
tmp = rec->stride_Y * rec->src_height;
if (rec->offset_Y + tmp > new_bo->base.size)
return -EINVAL;
tmp = rec->stride_UV * (rec->src_height / uv_vscale);
if (rec->offset_U + tmp > new_bo->base.size ||
rec->offset_V + tmp > new_bo->base.size)
return -EINVAL;
break;
}
return 0;
}
int intel_overlay_put_image_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv)
{
struct drm_intel_overlay_put_image *params = data;
struct drm_i915_private *dev_priv = to_i915(dev);
struct intel_overlay *overlay;
struct drm_crtc *drmmode_crtc;
struct intel_crtc *crtc;
struct drm_i915_gem_object *new_bo;
int ret;
overlay = dev_priv->overlay;
if (!overlay) {
drm_dbg(&dev_priv->drm, "userspace bug: no overlay\n");
return -ENODEV;
}
if (!(params->flags & I915_OVERLAY_ENABLE)) {
drm_modeset_lock_all(dev);
ret = intel_overlay_switch_off(overlay);
drm_modeset_unlock_all(dev);
return ret;
}
drmmode_crtc = drm_crtc_find(dev, file_priv, params->crtc_id);
if (!drmmode_crtc)
return -ENOENT;
crtc = to_intel_crtc(drmmode_crtc);
new_bo = i915_gem_object_lookup(file_priv, params->bo_handle);
if (!new_bo)
return -ENOENT;
drm_modeset_lock_all(dev);
if (i915_gem_object_is_tiled(new_bo)) {
drm_dbg_kms(&dev_priv->drm,
"buffer used for overlay image can not be tiled\n");
ret = -EINVAL;
goto out_unlock;
}
ret = intel_overlay_recover_from_interrupt(overlay);
if (ret != 0)
goto out_unlock;
if (overlay->crtc != crtc) {
ret = intel_overlay_switch_off(overlay);
if (ret != 0)
goto out_unlock;
ret = check_overlay_possible_on_crtc(overlay, crtc);
if (ret != 0)
goto out_unlock;
overlay->crtc = crtc;
crtc->overlay = overlay;
/* line too wide, i.e. one-line-mode */
if (crtc->config->pipe_src_w > 1024 &&
crtc->config->gmch_pfit.control & PFIT_ENABLE) {
overlay->pfit_active = true;
update_pfit_vscale_ratio(overlay);
} else
overlay->pfit_active = false;
}
ret = check_overlay_dst(overlay, params);
if (ret != 0)
goto out_unlock;
if (overlay->pfit_active) {
params->dst_y = (((u32)params->dst_y << 12) /
overlay->pfit_vscale_ratio);
/* shifting right rounds downwards, so add 1 */
params->dst_height = (((u32)params->dst_height << 12) /
overlay->pfit_vscale_ratio) + 1;
}
if (params->src_scan_height > params->src_height ||
params->src_scan_width > params->src_width) {
ret = -EINVAL;
goto out_unlock;
}
ret = check_overlay_src(dev_priv, params, new_bo);
if (ret != 0)
goto out_unlock;
/* Check scaling after src size to prevent a divide-by-zero. */
ret = check_overlay_scaling(params);
if (ret != 0)
goto out_unlock;
ret = intel_overlay_do_put_image(overlay, new_bo, params);
if (ret != 0)
goto out_unlock;
drm_modeset_unlock_all(dev);
i915_gem_object_put(new_bo);
return 0;
out_unlock:
drm_modeset_unlock_all(dev);
i915_gem_object_put(new_bo);
return ret;
}
static void update_reg_attrs(struct intel_overlay *overlay,
struct overlay_registers __iomem *regs)
{
iowrite32((overlay->contrast << 18) | (overlay->brightness & 0xff),
&regs->OCLRC0);
iowrite32(overlay->saturation, &regs->OCLRC1);
}
static bool check_gamma_bounds(u32 gamma1, u32 gamma2)
{
int i;
if (gamma1 & 0xff000000 || gamma2 & 0xff000000)
return false;
for (i = 0; i < 3; i++) {
if (((gamma1 >> i*8) & 0xff) >= ((gamma2 >> i*8) & 0xff))
return false;
}
return true;
}
static bool check_gamma5_errata(u32 gamma5)
{
int i;
for (i = 0; i < 3; i++) {
if (((gamma5 >> i*8) & 0xff) == 0x80)
return false;
}
return true;
}
static int check_gamma(struct drm_intel_overlay_attrs *attrs)
{
if (!check_gamma_bounds(0, attrs->gamma0) ||
!check_gamma_bounds(attrs->gamma0, attrs->gamma1) ||
!check_gamma_bounds(attrs->gamma1, attrs->gamma2) ||
!check_gamma_bounds(attrs->gamma2, attrs->gamma3) ||
!check_gamma_bounds(attrs->gamma3, attrs->gamma4) ||
!check_gamma_bounds(attrs->gamma4, attrs->gamma5) ||
!check_gamma_bounds(attrs->gamma5, 0x00ffffff))
return -EINVAL;
if (!check_gamma5_errata(attrs->gamma5))
return -EINVAL;
return 0;
}
int intel_overlay_attrs_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv)
{
struct drm_intel_overlay_attrs *attrs = data;
struct drm_i915_private *dev_priv = to_i915(dev);
struct intel_overlay *overlay;
int ret;
overlay = dev_priv->overlay;
if (!overlay) {
drm_dbg(&dev_priv->drm, "userspace bug: no overlay\n");
return -ENODEV;
}
drm_modeset_lock_all(dev);
ret = -EINVAL;
if (!(attrs->flags & I915_OVERLAY_UPDATE_ATTRS)) {
attrs->color_key = overlay->color_key;
attrs->brightness = overlay->brightness;
attrs->contrast = overlay->contrast;
attrs->saturation = overlay->saturation;
if (!IS_GEN(dev_priv, 2)) {
attrs->gamma0 = intel_de_read(dev_priv, OGAMC0);
attrs->gamma1 = intel_de_read(dev_priv, OGAMC1);
attrs->gamma2 = intel_de_read(dev_priv, OGAMC2);
attrs->gamma3 = intel_de_read(dev_priv, OGAMC3);
attrs->gamma4 = intel_de_read(dev_priv, OGAMC4);
attrs->gamma5 = intel_de_read(dev_priv, OGAMC5);
}
} else {
if (attrs->brightness < -128 || attrs->brightness > 127)
goto out_unlock;
if (attrs->contrast > 255)
goto out_unlock;
if (attrs->saturation > 1023)
goto out_unlock;
overlay->color_key = attrs->color_key;
overlay->brightness = attrs->brightness;
overlay->contrast = attrs->contrast;
overlay->saturation = attrs->saturation;
update_reg_attrs(overlay, overlay->regs);
if (attrs->flags & I915_OVERLAY_UPDATE_GAMMA) {
if (IS_GEN(dev_priv, 2))
goto out_unlock;
if (overlay->active) {
ret = -EBUSY;
goto out_unlock;
}
ret = check_gamma(attrs);
if (ret)
goto out_unlock;
intel_de_write(dev_priv, OGAMC0, attrs->gamma0);
intel_de_write(dev_priv, OGAMC1, attrs->gamma1);
intel_de_write(dev_priv, OGAMC2, attrs->gamma2);
intel_de_write(dev_priv, OGAMC3, attrs->gamma3);
intel_de_write(dev_priv, OGAMC4, attrs->gamma4);
intel_de_write(dev_priv, OGAMC5, attrs->gamma5);
}
}
overlay->color_key_enabled = (attrs->flags & I915_OVERLAY_DISABLE_DEST_COLORKEY) == 0;
ret = 0;
out_unlock:
drm_modeset_unlock_all(dev);
return ret;
}
static int get_registers(struct intel_overlay *overlay, bool use_phys)
{
struct drm_i915_private *i915 = overlay->i915;
struct drm_i915_gem_object *obj;
struct i915_vma *vma;
int err;
obj = i915_gem_object_create_stolen(i915, PAGE_SIZE);
if (IS_ERR(obj))
obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
drm/i915: Pull i915_vma_pin under the vm->mutex Replace the struct_mutex requirement for pinning the i915_vma with the local vm->mutex instead. Note that the vm->mutex is tainted by the shrinker (we require unbinding from inside fs-reclaim) and so we cannot allocate while holding that mutex. Instead we have to preallocate workers to do allocate and apply the PTE updates after we have we reserved their slot in the drm_mm (using fences to order the PTE writes with the GPU work and with later unbind). In adding the asynchronous vma binding, one subtle requirement is to avoid coupling the binding fence into the backing object->resv. That is the asynchronous binding only applies to the vma timeline itself and not to the pages as that is a more global timeline (the binding of one vma does not need to be ordered with another vma, nor does the implicit GEM fencing depend on a vma, only on writes to the backing store). Keeping the vma binding distinct from the backing store timelines is verified by a number of async gem_exec_fence and gem_exec_schedule tests. The way we do this is quite simple, we keep the fence for the vma binding separate and only wait on it as required, and never add it to the obj->resv itself. Another consequence in reducing the locking around the vma is the destruction of the vma is no longer globally serialised by struct_mutex. A natural solution would be to add a kref to i915_vma, but that requires decoupling the reference cycles, possibly by introducing a new i915_mm_pages object that is own by both obj->mm and vma->pages. However, we have not taken that route due to the overshadowing lmem/ttm discussions, and instead play a series of complicated games with trylocks to (hopefully) ensure that only one destruction path is called! v2: Add some commentary, and some helpers to reduce patch churn. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20191004134015.13204-4-chris@chris-wilson.co.uk
2019-10-04 20:39:58 +07:00
if (IS_ERR(obj))
return PTR_ERR(obj);
vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, PIN_MAPPABLE);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
goto err_put_bo;
}
if (use_phys)
overlay->flip_addr = sg_dma_address(obj->mm.pages->sgl);
else
overlay->flip_addr = i915_ggtt_offset(vma);
overlay->regs = i915_vma_pin_iomap(vma);
i915_vma_unpin(vma);
if (IS_ERR(overlay->regs)) {
err = PTR_ERR(overlay->regs);
goto err_put_bo;
}
overlay->reg_bo = obj;
return 0;
err_put_bo:
i915_gem_object_put(obj);
return err;
}
void intel_overlay_setup(struct drm_i915_private *dev_priv)
{
struct intel_overlay *overlay;
struct intel_engine_cs *engine;
int ret;
if (!HAS_OVERLAY(dev_priv))
return;
engine = dev_priv->gt.engine[RCS0];
if (!engine || !engine->kernel_context)
return;
overlay = kzalloc(sizeof(*overlay), GFP_KERNEL);
if (!overlay)
return;
overlay->i915 = dev_priv;
overlay->context = engine->kernel_context;
GEM_BUG_ON(!overlay->context);
overlay->color_key = 0x0101fe;
overlay->color_key_enabled = true;
overlay->brightness = -19;
overlay->contrast = 75;
overlay->saturation = 146;
i915_active_init(&overlay->last_flip,
NULL, intel_overlay_last_flip_retire);
ret = get_registers(overlay, OVERLAY_NEEDS_PHYSICAL(dev_priv));
if (ret)
goto out_free;
memset_io(overlay->regs, 0, sizeof(struct overlay_registers));
update_polyphase_filter(overlay->regs);
update_reg_attrs(overlay, overlay->regs);
dev_priv->overlay = overlay;
drm_info(&dev_priv->drm, "Initialized overlay support.\n");
return;
out_free:
kfree(overlay);
}
void intel_overlay_cleanup(struct drm_i915_private *dev_priv)
{
struct intel_overlay *overlay;
overlay = fetch_and_zero(&dev_priv->overlay);
if (!overlay)
return;
/*
* The bo's should be free'd by the generic code already.
* Furthermore modesetting teardown happens beforehand so the
* hardware should be off already.
*/
drm_WARN_ON(&dev_priv->drm, overlay->active);
i915_gem_object_put(overlay->reg_bo);
i915_active_fini(&overlay->last_flip);
kfree(overlay);
}
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
struct intel_overlay_error_state {
struct overlay_registers regs;
unsigned long base;
u32 dovsta;
u32 isr;
};
struct intel_overlay_error_state *
intel_overlay_capture_error_state(struct drm_i915_private *dev_priv)
{
struct intel_overlay *overlay = dev_priv->overlay;
struct intel_overlay_error_state *error;
if (!overlay || !overlay->active)
return NULL;
error = kmalloc(sizeof(*error), GFP_ATOMIC);
if (error == NULL)
return NULL;
error->dovsta = intel_de_read(dev_priv, DOVSTA);
error->isr = intel_de_read(dev_priv, GEN2_ISR);
error->base = overlay->flip_addr;
memcpy_fromio(&error->regs, overlay->regs, sizeof(error->regs));
return error;
}
void
intel_overlay_print_error_state(struct drm_i915_error_state_buf *m,
struct intel_overlay_error_state *error)
{
i915_error_printf(m, "Overlay, status: 0x%08x, interrupt: 0x%08x\n",
error->dovsta, error->isr);
i915_error_printf(m, " Register file at 0x%08lx:\n",
error->base);
#define P(x) i915_error_printf(m, " " #x ": 0x%08x\n", error->regs.x)
P(OBUF_0Y);
P(OBUF_1Y);
P(OBUF_0U);
P(OBUF_0V);
P(OBUF_1U);
P(OBUF_1V);
P(OSTRIDE);
P(YRGB_VPH);
P(UV_VPH);
P(HORZ_PH);
P(INIT_PHS);
P(DWINPOS);
P(DWINSZ);
P(SWIDTH);
P(SWIDTHSW);
P(SHEIGHT);
P(YRGBSCALE);
P(UVSCALE);
P(OCLRC0);
P(OCLRC1);
P(DCLRKV);
P(DCLRKM);
P(SCLRKVH);
P(SCLRKVL);
P(SCLRKEN);
P(OCONFIG);
P(OCMD);
P(OSTART_0Y);
P(OSTART_1Y);
P(OSTART_0U);
P(OSTART_0V);
P(OSTART_1U);
P(OSTART_1V);
P(OTILEOFF_0Y);
P(OTILEOFF_1Y);
P(OTILEOFF_0U);
P(OTILEOFF_0V);
P(OTILEOFF_1U);
P(OTILEOFF_1V);
P(FASTHSCALE);
P(UVSCALEV);
#undef P
}
#endif