summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Marek Olšák <marek.olsak@amd.com> 2020-12-26 23:14:01 -0500
committer Marge Bot <eric+marge@anholt.net> 2021-01-18 01:17:19 +0000
commit 961aa67adf651ddb2b035a2ad5608db84fdbd258 (patch)
tree 88375803f775d81a1c41dbd8a4a4fd559f399ea8
parent 0eca4660a5588696047c18546a9525e456478af9 (diff)
radeonsi: add a specialized function for CP DMA L2 prefetch
This radically simplifies the code to decrease CPU overhead in si_draw_vbo.
The generic CP DMA copy function is too complicated.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8548>
-rw-r--r-- src/gallium/drivers/radeonsi/si_compute.c 2
-rw-r--r-- src/gallium/drivers/radeonsi/si_cp_dma.c 38
-rw-r--r-- src/gallium/drivers/radeonsi/si_pipe.h 2
-rw-r--r-- src/gallium/drivers/radeonsi/si_state.h 2
-rw-r--r-- src/gallium/drivers/radeonsi/si_state_draw.cpp 13
5 files changed, 43 insertions, 14 deletions
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 4289e83083b..f3f948d4696 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -900,7 +900,7 @@ static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info
/* Prefetch the compute shader to L2. */
if (sctx->chip_class >= GFX7 && prefetch)
- cik_prefetch_TC_L2_async(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
+ si_cp_dma_prefetch(sctx, &program->shader.bo->b.b, 0, program->shader.bo->b.b.width0);
if (program->ir_type != PIPE_SHADER_IR_NATIVE)
si_setup_nir_user_data(sctx, info);
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index f8e483d9fcc..7945143f916 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -399,6 +399,44 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
}
}
+void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
+ unsigned offset, unsigned size)
+{ /* Emits one PKT3_DMA_DATA packet on sctx->gfx_cs whose source and destination are both buf + offset. */
+ uint64_t address = si_resource(buf)->gpu_address + offset; /* GPU address of the first byte to prefetch */
+
+ assert(sctx->chip_class >= GFX7);
+
+ /* The prefetch address and size must be aligned, so that we don't have to apply
+ * the complicated hw bug workaround.
+ *
+ * The size should also be less than 2 MB, so that we don't have to use a loop.
+ * Callers shouldn't need to prefetch more than 2 MB. These limits are only checked by assert().
+ */
+ assert(size % SI_CPDMA_ALIGNMENT == 0);
+ assert(address % SI_CPDMA_ALIGNMENT == 0);
+ assert(size < S_414_BYTE_COUNT_GFX6(~0u)); /* presumably the largest value the BYTE_COUNT field can hold */
+
+ uint32_t header = S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2); /* source select; the destination select is OR'ed in below */
+ uint32_t command = S_414_BYTE_COUNT_GFX6(size); /* byte count; the write-confirm disable bit is OR'ed in below */
+
+ if (sctx->chip_class >= GFX9) { /* GFX9 and newer: _GFX9 bit encoding, DST_SEL = NOWHERE */
+ command |= S_414_DISABLE_WR_CONFIRM_GFX9(1);
+ header |= S_411_DST_SEL(V_411_NOWHERE);
+ } else { /* older than GFX9: _GFX6 bit encoding, DST_SEL = DST_ADDR_TC_L2 */
+ command |= S_414_DISABLE_WR_CONFIRM_GFX6(1);
+ header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2);
+ }
+
+ struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+ radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0)); /* packet header, followed by the 6 dwords emitted below */
+ radeon_emit(cs, header);
+ radeon_emit(cs, address); /* SRC_ADDR_LO [31:0] */
+ radeon_emit(cs, address >> 32); /* SRC_ADDR_HI [31:0] */
+ radeon_emit(cs, address); /* DST_ADDR_LO [31:0] (same address as the source) */
+ radeon_emit(cs, address >> 32); /* DST_ADDR_HI [31:0] (same address as the source) */
+ radeon_emit(cs, command); /* byte count and write-confirm flag built above */
+}
+
void si_test_gds(struct si_context *sctx)
{
struct pipe_context *ctx = &sctx->b;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 1fc7d0d25b5..0623ad31b9d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1401,6 +1401,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
unsigned size, unsigned user_flags, enum si_coherency coher,
enum si_cache_policy cache_policy);
+void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
+ unsigned offset, unsigned size);
void si_test_gds(struct si_context *sctx);
void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
unsigned size, unsigned dst_sel, unsigned engine, const void *data);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 63ede1d1e15..a293787487a 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -587,8 +587,6 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs);
bool si_update_ngg(struct si_context *sctx);
/* si_state_draw.c */
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
- unsigned size);
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
void si_trace_emit(struct si_context *sctx);
void si_init_draw_functions(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index c180e1d3153..9731780bcb7 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -59,20 +59,11 @@ static unsigned si_conv_pipe_prim(unsigned mode)
return prim_conv[mode];
}
-void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset,
- unsigned size)
-{
- assert(sctx->chip_class >= GFX7);
-
- si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size, SI_CPDMA_SKIP_ALL,
- SI_COHERENCY_SHADER, L2_LRU);
-}
-
static void si_prefetch_shader_async(struct si_context *sctx, struct si_pm4_state *state)
{
struct pipe_resource *bo = &state->shader->bo->b.b;
- cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
+ si_cp_dma_prefetch(sctx, bo, 0, bo->width0);
}
static void si_prefetch_VBO_descriptors(struct si_context *sctx)
@@ -80,7 +71,7 @@ static void si_prefetch_VBO_descriptors(struct si_context *sctx)
if (!sctx->vertex_elements || !sctx->vertex_elements->vb_desc_list_alloc_size)
return;
- cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
+ si_cp_dma_prefetch(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset,
sctx->vertex_elements->vb_desc_list_alloc_size);
}