mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-18 07:57:03 +07:00
drm/amdgpu: optionally do a writeback but don't invalidate TC for IB fences
There is a new IB flag that enables this new behavior.
Full invalidation is unnecessary for RELEASE_MEM and doesn't make sense
when draw calls from two adjacent gfx IBs run in parallel. This will be
the new default for Mesa.

v2: bump the version

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
3f188453fa
commit
d240cd9edd
@ -75,9 +75,10 @@
|
|||||||
* - 3.23.0 - Add query for VRAM lost counter
|
* - 3.23.0 - Add query for VRAM lost counter
|
||||||
* - 3.24.0 - Add high priority compute support for gfx9
|
* - 3.24.0 - Add high priority compute support for gfx9
|
||||||
* - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk).
|
* - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk).
|
||||||
|
* - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
|
||||||
*/
|
*/
|
||||||
#define KMS_DRIVER_MAJOR 3
|
#define KMS_DRIVER_MAJOR 3
|
||||||
#define KMS_DRIVER_MINOR 25
|
#define KMS_DRIVER_MINOR 26
|
||||||
#define KMS_DRIVER_PATCHLEVEL 0
|
#define KMS_DRIVER_PATCHLEVEL 0
|
||||||
|
|
||||||
int amdgpu_vram_limit = 0;
|
int amdgpu_vram_limit = 0;
|
||||||
|
@ -131,7 +131,8 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
|
|||||||
* Emits a fence command on the requested ring (all asics).
|
* Emits a fence command on the requested ring (all asics).
|
||||||
* Returns 0 on success, -ENOMEM on failure.
|
* Returns 0 on success, -ENOMEM on failure.
|
||||||
*/
|
*/
|
||||||
int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f)
|
int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
|
||||||
|
unsigned flags)
|
||||||
{
|
{
|
||||||
struct amdgpu_device *adev = ring->adev;
|
struct amdgpu_device *adev = ring->adev;
|
||||||
struct amdgpu_fence *fence;
|
struct amdgpu_fence *fence;
|
||||||
@ -149,7 +150,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f)
|
|||||||
adev->fence_context + ring->idx,
|
adev->fence_context + ring->idx,
|
||||||
seq);
|
seq);
|
||||||
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
|
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
|
||||||
seq, AMDGPU_FENCE_FLAG_INT);
|
seq, flags | AMDGPU_FENCE_FLAG_INT);
|
||||||
|
|
||||||
ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
|
ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
|
||||||
/* This function can't be called concurrently anyway, otherwise
|
/* This function can't be called concurrently anyway, otherwise
|
||||||
|
@ -127,6 +127,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
|
|||||||
struct amdgpu_vm *vm;
|
struct amdgpu_vm *vm;
|
||||||
uint64_t fence_ctx;
|
uint64_t fence_ctx;
|
||||||
uint32_t status = 0, alloc_size;
|
uint32_t status = 0, alloc_size;
|
||||||
|
unsigned fence_flags = 0;
|
||||||
|
|
||||||
unsigned i;
|
unsigned i;
|
||||||
int r = 0;
|
int r = 0;
|
||||||
@ -227,7 +228,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
|
|||||||
#endif
|
#endif
|
||||||
amdgpu_asic_invalidate_hdp(adev, ring);
|
amdgpu_asic_invalidate_hdp(adev, ring);
|
||||||
|
|
||||||
r = amdgpu_fence_emit(ring, f);
|
if (ib->flags & AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE)
|
||||||
|
fence_flags |= AMDGPU_FENCE_FLAG_TC_WB_ONLY;
|
||||||
|
|
||||||
|
r = amdgpu_fence_emit(ring, f, fence_flags);
|
||||||
if (r) {
|
if (r) {
|
||||||
dev_err(adev->dev, "failed to emit fence (%d)\n", r);
|
dev_err(adev->dev, "failed to emit fence (%d)\n", r);
|
||||||
if (job && job->vmid)
|
if (job && job->vmid)
|
||||||
@ -242,7 +246,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
|
|||||||
/* wrap the last IB with fence */
|
/* wrap the last IB with fence */
|
||||||
if (job && job->uf_addr) {
|
if (job && job->uf_addr) {
|
||||||
amdgpu_ring_emit_fence(ring, job->uf_addr, job->uf_sequence,
|
amdgpu_ring_emit_fence(ring, job->uf_addr, job->uf_sequence,
|
||||||
AMDGPU_FENCE_FLAG_64BIT);
|
fence_flags | AMDGPU_FENCE_FLAG_64BIT);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
|
if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
|
||||||
|
@ -42,6 +42,7 @@
|
|||||||
|
|
||||||
#define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
|
#define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
|
||||||
#define AMDGPU_FENCE_FLAG_INT (1 << 1)
|
#define AMDGPU_FENCE_FLAG_INT (1 << 1)
|
||||||
|
#define AMDGPU_FENCE_FLAG_TC_WB_ONLY (1 << 2)
|
||||||
|
|
||||||
enum amdgpu_ring_type {
|
enum amdgpu_ring_type {
|
||||||
AMDGPU_RING_TYPE_GFX,
|
AMDGPU_RING_TYPE_GFX,
|
||||||
@ -90,7 +91,8 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
|
|||||||
unsigned irq_type);
|
unsigned irq_type);
|
||||||
void amdgpu_fence_driver_suspend(struct amdgpu_device *adev);
|
void amdgpu_fence_driver_suspend(struct amdgpu_device *adev);
|
||||||
void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
|
void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
|
||||||
int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence);
|
int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
|
||||||
|
unsigned flags);
|
||||||
int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
|
int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
|
||||||
void amdgpu_fence_process(struct amdgpu_ring *ring);
|
void amdgpu_fence_process(struct amdgpu_ring *ring);
|
||||||
int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
|
int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
|
||||||
|
@ -633,7 +633,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
|
|||||||
amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
|
amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
|
||||||
|
|
||||||
if (vm_flush_needed || pasid_mapping_needed) {
|
if (vm_flush_needed || pasid_mapping_needed) {
|
||||||
r = amdgpu_fence_emit(ring, &fence);
|
r = amdgpu_fence_emit(ring, &fence, 0);
|
||||||
if (r)
|
if (r)
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
@ -3775,13 +3775,16 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
|
|||||||
{
|
{
|
||||||
bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
|
bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
|
||||||
bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
|
bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
|
||||||
|
bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
|
||||||
|
|
||||||
/* RELEASE_MEM - flush caches, send int */
|
/* RELEASE_MEM - flush caches, send int */
|
||||||
amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
|
amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
|
||||||
amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
|
amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
|
||||||
EOP_TC_ACTION_EN |
|
EOP_TC_NC_ACTION_EN) :
|
||||||
EOP_TC_WB_ACTION_EN |
|
(EOP_TCL1_ACTION_EN |
|
||||||
EOP_TC_MD_ACTION_EN |
|
EOP_TC_ACTION_EN |
|
||||||
|
EOP_TC_WB_ACTION_EN |
|
||||||
|
EOP_TC_MD_ACTION_EN)) |
|
||||||
EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
|
EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
|
||||||
EVENT_INDEX(5)));
|
EVENT_INDEX(5)));
|
||||||
amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
|
amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
|
||||||
|
@ -159,6 +159,7 @@
|
|||||||
#define EOP_TC_WB_ACTION_EN (1 << 15) /* L2 */
|
#define EOP_TC_WB_ACTION_EN (1 << 15) /* L2 */
|
||||||
#define EOP_TCL1_ACTION_EN (1 << 16)
|
#define EOP_TCL1_ACTION_EN (1 << 16)
|
||||||
#define EOP_TC_ACTION_EN (1 << 17) /* L2 */
|
#define EOP_TC_ACTION_EN (1 << 17) /* L2 */
|
||||||
|
#define EOP_TC_NC_ACTION_EN (1 << 19)
|
||||||
#define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */
|
#define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */
|
||||||
|
|
||||||
#define DATA_SEL(x) ((x) << 29)
|
#define DATA_SEL(x) ((x) << 29)
|
||||||
|
@ -526,6 +526,10 @@ union drm_amdgpu_cs {
|
|||||||
/* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
|
/* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
|
||||||
#define AMDGPU_IB_FLAG_PREEMPT (1<<2)
|
#define AMDGPU_IB_FLAG_PREEMPT (1<<2)
|
||||||
|
|
||||||
|
/* The IB fence should do the L2 writeback but not invalidate any shader
|
||||||
|
* caches (L2/vL1/sL1/I$). */
|
||||||
|
#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
|
||||||
|
|
||||||
struct drm_amdgpu_cs_chunk_ib {
|
struct drm_amdgpu_cs_chunk_ib {
|
||||||
__u32 _pad;
|
__u32 _pad;
|
||||||
/** AMDGPU_IB_FLAG_* */
|
/** AMDGPU_IB_FLAG_* */
|
||||||
|
Loading…
Reference in New Issue
Block a user