summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarek Olšák <marek.olsak@amd.com>2020-12-26 20:59:38 -0500
committerMarge Bot <eric+marge@anholt.net>2021-01-18 01:17:19 +0000
commit4056e953fe43bd667e1812c1c7075285d24b42c2 (patch)
tree20e736d36d2436e2def60109d64a2c991584d1ed
parent1ceec51b128660d5f9037dd10ec5b0229a2378d1 (diff)
radeonsi: move emit_cache_flush functions into si_gfx_cs.c
This is a better place for them. They are not inlined anyway. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8548>
-rw-r--r--src/gallium/drivers/radeonsi/si_gfx_cs.c445
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.h4
-rw-r--r--src/gallium/drivers/radeonsi/si_state.h4
-rw-r--r--src/gallium/drivers/radeonsi/si_state_draw.cpp447
4 files changed, 449 insertions, 451 deletions
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 42d895de91e..a72062d15f0 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -552,3 +552,448 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
ctx->index_ring_offset = 0;
}
+
+void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
+{
+ bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs;
+
+ assert(sctx->chip_class <= GFX9);
+
+ if (sctx->chip_class == GFX9 || compute_ib) {
+ /* Flush caches and wait for the caches to assert idle. */
+ radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
+ radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ } else {
+ /* ACQUIRE_MEM is only required on a compute ring. */
+ radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+ radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ }
+
+ /* ACQUIRE_MEM has an implicit context roll if the current context
+ * is busy. */
+ if (!compute_ib)
+ sctx->context_roll = true;
+}
+
+void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
+{
+ uint32_t gcr_cntl = 0;
+ unsigned cb_db_event = 0;
+ unsigned flags = ctx->flags;
+
+ if (!ctx->has_graphics) {
+ /* Only process compute flags. */
+ flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+ SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
+ SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ /* We don't need these. */
+ assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META)));
+
+ if (flags & SI_CONTEXT_VGT_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+ }
+
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
+ ctx->num_cb_cache_flushes++;
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+ ctx->num_db_cache_flushes++;
+
+ if (flags & SI_CONTEXT_INV_ICACHE)
+ gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+ if (flags & SI_CONTEXT_INV_SCACHE) {
+ /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
+ * to FORWARD when both L1 and L2 are written out (WB or INV).
+ */
+ gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+ }
+ if (flags & SI_CONTEXT_INV_VCACHE)
+ gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+
+ /* The L2 cache ops are:
+ * - INV: - invalidate lines that reflect memory (were loaded from memory)
+ * - don't touch lines that were overwritten (were stored by gfx clients)
+ * - WB: - don't touch lines that reflect memory
+ * - write back lines that were overwritten
+ * - WB | INV: - invalidate lines that reflect memory
+ * - write back lines that were overwritten
+ *
+ * GLM doesn't support WB alone. If WB is set, INV must be set too.
+ */
+ if (flags & SI_CONTEXT_INV_L2) {
+ /* Writeback and invalidate everything in L2. */
+ gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
+ ctx->num_L2_invalidates++;
+ } else if (flags & SI_CONTEXT_WB_L2) {
+ gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
+ } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
+ gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
+ }
+
+ if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+ /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+ }
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
+ /* Flush HTILE. Will wait for idle later. */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+ }
+
+ /* First flush CB/DB, then L1/L2. */
+ gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
+
+ if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
+ (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
+ cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+ } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+ cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+ } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
+ cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+ } else {
+ assert(0);
+ }
+ } else {
+ /* Wait for graphics shaders to go idle if requested. */
+ if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ /* Only count explicit shader flushes, not implicit ones. */
+ ctx->num_vs_flushes++;
+ ctx->num_ps_flushes++;
+ } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ ctx->num_vs_flushes++;
+ }
+ }
+
+ if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
+ ctx->num_cs_flushes++;
+ ctx->compute_is_busy = false;
+ }
+
+ if (cb_db_event) {
+ struct si_resource* wait_mem_scratch = unlikely(ctx->ws->cs_is_secure(cs)) ?
+ ctx->wait_mem_scratch_tmz : ctx->wait_mem_scratch;
+ /* CB/DB flush and invalidate (or possibly just a wait for a
+ * meta flush) via RELEASE_MEM.
+ *
+ * Combine this with other cache flushes when possible; this
+ * requires affected shaders to be idle, so do it after the
+ * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always
+ * implied).
+ */
+ uint64_t va;
+
+ /* Do the flush (enqueue the event and wait for it). */
+ va = wait_mem_scratch->gpu_address;
+ ctx->wait_mem_number++;
+
+ /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
+ unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
+ unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
+ unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
+ unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
+ assert(G_586_GL2_US(gcr_cntl) == 0);
+ assert(G_586_GL2_RANGE(gcr_cntl) == 0);
+ assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
+ unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
+ unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
+ unsigned gcr_seq = G_586_SEQ(gcr_cntl);
+
+ gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
+ C_586_GL2_WB; /* keep SEQ */
+
+ si_cp_release_mem(ctx, cs, cb_db_event,
+ S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
+ S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
+ S_490_SEQ(gcr_seq),
+ EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+ EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
+ SI_NOT_QUERY);
+ si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
+ }
+
+ /* Ignore fields that only modify the behavior of other fields. */
+ if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
+ /* Flush caches and wait for the caches to assert idle.
+ * The cache flush is executed in the ME, but the PFP waits
+ * for completion.
+ */
+ radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+ radeon_emit(cs, 0); /* CP_COHER_CNTL */
+ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+ radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
+ radeon_emit(cs, 0); /* CP_COHER_BASE */
+ radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
+ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+ radeon_emit(cs, gcr_cntl); /* GCR_CNTL */
+ } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+ /* We need to ensure that PFP waits as well. */
+ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+ radeon_emit(cs, 0);
+ }
+
+ if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
+ } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
+ }
+
+ ctx->flags = 0;
+}
+
+void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
+{
+ uint32_t flags = sctx->flags;
+
+ if (!sctx->has_graphics) {
+ /* Only process compute flags. */
+ flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+ SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
+ SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ uint32_t cp_coher_cntl = 0;
+ const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
+ const bool is_barrier =
+ flush_cb_db ||
+ /* INV_ICACHE == beginning of gfx IB. Checking
+ * INV_ICACHE fixes corruption for DeusExMD with
+ * compute-based culling, but I don't know why.
+ */
+ flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
+ (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);
+
+ assert(sctx->chip_class <= GFX9);
+
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
+ sctx->num_cb_cache_flushes++;
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+ sctx->num_db_cache_flushes++;
+
+ /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
+ * bit is set. An alternative way is to write SQC_CACHES, but that
+ * doesn't seem to work reliably. Since the bug doesn't affect
+ * correctness (it only does more work than necessary) and
+ * the performance impact is likely negligible, there is no plan
+ * to add a workaround for it.
+ */
+
+ if (flags & SI_CONTEXT_INV_ICACHE)
+ cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
+ if (flags & SI_CONTEXT_INV_SCACHE)
+ cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
+
+ if (sctx->chip_class <= GFX8) {
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+ cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
+ S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
+ S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
+ S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
+ S_0085F0_CB7_DEST_BASE_ENA(1);
+
+ /* Necessary for DCC */
+ if (sctx->chip_class == GFX8)
+ si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
+ EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
+ }
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+ cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
+ }
+
+ if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+ /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
+ }
+ if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
+ /* Flush HTILE. SURFACE_SYNC will wait for idle. */
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
+ }
+
+ /* Wait for shader engines to go idle.
+ * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
+ * for everything including CB/DB cache flushes.
+ */
+ if (!flush_cb_db) {
+ if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ /* Only count explicit shader flushes, not implicit ones
+ * done by SURFACE_SYNC.
+ */
+ sctx->num_vs_flushes++;
+ sctx->num_ps_flushes++;
+ } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ sctx->num_vs_flushes++;
+ }
+ }
+
+ if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ sctx->num_cs_flushes++;
+ sctx->compute_is_busy = false;
+ }
+
+ /* VGT state synchronization. */
+ if (flags & SI_CONTEXT_VGT_FLUSH) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+ }
+ if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
+ }
+
+ /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
+ * wait for idle on GFX9. We have to use a TS event.
+ */
+ if (sctx->chip_class == GFX9 && flush_cb_db) {
+ uint64_t va;
+ unsigned tc_flags, cb_db_event;
+
+ /* Set the CB/DB flush event. */
+ switch (flush_cb_db) {
+ case SI_CONTEXT_FLUSH_AND_INV_CB:
+ cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+ break;
+ case SI_CONTEXT_FLUSH_AND_INV_DB:
+ cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+ break;
+ default:
+ /* both CB & DB */
+ cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+ }
+
+ /* These are the only allowed combinations. If you need to
+ * do multiple operations at once, do them separately.
+ * All operations that invalidate L2 also seem to invalidate
+ * metadata. Volatile (VOL) and WC flushes are not listed here.
+ *
+ * TC | TC_WB = writeback & invalidate L2 & L1
+ * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
+ * TC_WB | TC_NC = writeback L2 for MTYPE == NC
+ * TC | TC_NC = invalidate L2 for MTYPE == NC
+ * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.)
+ * TCL1 = invalidate L1
+ */
+ tc_flags = 0;
+
+ if (flags & SI_CONTEXT_INV_L2_METADATA) {
+ tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
+ }
+
+ /* Ideally flush TC together with CB/DB. */
+ if (flags & SI_CONTEXT_INV_L2) {
+ /* Writeback and invalidate everything in L2 & L1. */
+ tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;
+
+ /* Clear the flags. */
+ flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE);
+ sctx->num_L2_invalidates++;
+ }
+
+ /* Do the flush (enqueue the event and wait for it). */
+ struct si_resource* wait_mem_scratch = unlikely(sctx->ws->cs_is_secure(cs)) ?
+ sctx->wait_mem_scratch_tmz : sctx->wait_mem_scratch;
+ va = wait_mem_scratch->gpu_address;
+ sctx->wait_mem_number++;
+
+ si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
+ EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
+ wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);
+ si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
+ }
+
+ /* Make sure ME is idle (it executes most packets) before continuing.
+ * This prevents read-after-write hazards between PFP and ME.
+ */
+ if (sctx->has_graphics &&
+ (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
+ SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
+ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+ radeon_emit(cs, 0);
+ }
+
+ /* GFX6-GFX8 only:
+ * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
+ * waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
+ *
+ * cp_coher_cntl should contain all necessary flags except TC flags
+ * at this point.
+ *
+ * GFX6-GFX7 don't support L2 write-back.
+ */
+ if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
+ /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
+ * WB must be set on GFX8+ when TC_ACTION is set.
+ */
+ si_emit_surface_sync(sctx, cs,
+ cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
+ S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
+ cp_coher_cntl = 0;
+ sctx->num_L2_invalidates++;
+ } else {
+ /* L1 invalidation and L2 writeback must be done separately,
+ * because both operations can't be done together.
+ */
+ if (flags & SI_CONTEXT_WB_L2) {
+ /* WB = write-back
+ * NC = apply to non-coherent MTYPEs
+ * (i.e. MTYPE <= 1, which is what we use everywhere)
+ *
+ * WB doesn't work without NC.
+ */
+ si_emit_surface_sync(
+ sctx, cs,
+ cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ sctx->num_L2_writebacks++;
+ }
+ if (flags & SI_CONTEXT_INV_VCACHE) {
+ /* Invalidate per-CU VMEM L1. */
+ si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
+ cp_coher_cntl = 0;
+ }
+ }
+
+ /* If TC flushes haven't cleared this... */
+ if (cp_coher_cntl)
+ si_emit_surface_sync(sctx, cs, cp_coher_cntl);
+
+ if (is_barrier)
+ si_prim_discard_signal_next_compute_ib_start(sctx);
+
+ if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
+ } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
+ }
+
+ sctx->flags = 0;
+}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 339021a792d..f269b6ec1c0 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1450,6 +1450,10 @@ void si_allocate_gds(struct si_context *ctx);
void si_set_tracked_regs_to_clear_state(struct si_context *ctx);
void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs);
void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws);
+void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
+ unsigned cp_coher_cntl);
+void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
+void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
/* si_gpu_load.c */
void si_gpu_load_kill_thread(struct si_screen *sscreen);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 2de248f3705..a293787487a 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -587,11 +587,7 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs);
bool si_update_ngg(struct si_context *sctx);
/* si_state_draw.c */
-void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
- unsigned cp_coher_cntl);
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
-void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
void si_trace_emit(struct si_context *sctx);
void si_init_draw_functions(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index c4042b53533..d6993a281e6 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -1136,37 +1136,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
}
extern "C"
-void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl)
-{
- bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs;
-
- assert(sctx->chip_class <= GFX9);
-
- if (sctx->chip_class == GFX9 || compute_ib) {
- /* Flush caches and wait for the caches to assert idle. */
- radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
- radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- } else {
- /* ACQUIRE_MEM is only required on a compute ring. */
- radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
- radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- }
-
- /* ACQUIRE_MEM has an implicit context roll if the current context
- * is busy. */
- if (!compute_ib)
- sctx->context_roll = true;
-}
-
-extern "C"
void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
{
if (!si_compute_prim_discard_enabled(sctx))
@@ -1192,422 +1161,6 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
*sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
}
-void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
-{
- uint32_t gcr_cntl = 0;
- unsigned cb_db_event = 0;
- unsigned flags = ctx->flags;
-
- if (!ctx->has_graphics) {
- /* Only process compute flags. */
- flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
- SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
- }
-
- /* We don't need these. */
- assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META)));
-
- if (flags & SI_CONTEXT_VGT_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
- }
-
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
- ctx->num_cb_cache_flushes++;
- if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
- ctx->num_db_cache_flushes++;
-
- if (flags & SI_CONTEXT_INV_ICACHE)
- gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
- if (flags & SI_CONTEXT_INV_SCACHE) {
- /* TODO: When writing to the SMEM L1 cache, we need to set SEQ
- * to FORWARD when both L1 and L2 are written out (WB or INV).
- */
- gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
- }
- if (flags & SI_CONTEXT_INV_VCACHE)
- gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
-
- /* The L2 cache ops are:
- * - INV: - invalidate lines that reflect memory (were loaded from memory)
- * - don't touch lines that were overwritten (were stored by gfx clients)
- * - WB: - don't touch lines that reflect memory
- * - write back lines that were overwritten
- * - WB | INV: - invalidate lines that reflect memory
- * - write back lines that were overwritten
- *
- * GLM doesn't support WB alone. If WB is set, INV must be set too.
- */
- if (flags & SI_CONTEXT_INV_L2) {
- /* Writeback and invalidate everything in L2. */
- gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
- ctx->num_L2_invalidates++;
- } else if (flags & SI_CONTEXT_WB_L2) {
- gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
- } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
- gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
- }
-
- if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
- /* Flush CMASK/FMASK/DCC. Will wait for idle later. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
- }
- if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
- /* Flush HTILE. Will wait for idle later. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
- }
-
- /* First flush CB/DB, then L1/L2. */
- gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
-
- if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
- (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
- cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
- } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
- cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
- } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
- cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
- } else {
- assert(0);
- }
- } else {
- /* Wait for graphics shaders to go idle if requested. */
- if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- /* Only count explicit shader flushes, not implicit ones. */
- ctx->num_vs_flushes++;
- ctx->num_ps_flushes++;
- } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- ctx->num_vs_flushes++;
- }
- }
-
- if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
- ctx->num_cs_flushes++;
- ctx->compute_is_busy = false;
- }
-
- if (cb_db_event) {
- struct si_resource* wait_mem_scratch = unlikely(ctx->ws->cs_is_secure(cs)) ?
- ctx->wait_mem_scratch_tmz : ctx->wait_mem_scratch;
- /* CB/DB flush and invalidate (or possibly just a wait for a
- * meta flush) via RELEASE_MEM.
- *
- * Combine this with other cache flushes when possible; this
- * requires affected shaders to be idle, so do it after the
- * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always
- * implied).
- */
- uint64_t va;
-
- /* Do the flush (enqueue the event and wait for it). */
- va = wait_mem_scratch->gpu_address;
- ctx->wait_mem_number++;
-
- /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
- unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
- unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
- unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
- unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
- assert(G_586_GL2_US(gcr_cntl) == 0);
- assert(G_586_GL2_RANGE(gcr_cntl) == 0);
- assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
- unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
- unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
- unsigned gcr_seq = G_586_SEQ(gcr_cntl);
-
- gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV &
- C_586_GL2_WB; /* keep SEQ */
-
- si_cp_release_mem(ctx, cs, cb_db_event,
- S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) |
- S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) |
- S_490_SEQ(gcr_seq),
- EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
- EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number,
- SI_NOT_QUERY);
- si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
- }
-
- /* Ignore fields that only modify the behavior of other fields. */
- if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
- /* Flush caches and wait for the caches to assert idle.
- * The cache flush is executed in the ME, but the PFP waits
- * for completion.
- */
- radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
- radeon_emit(cs, 0); /* CP_COHER_CNTL */
- radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
- radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
- radeon_emit(cs, 0); /* CP_COHER_BASE */
- radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
- radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
- radeon_emit(cs, gcr_cntl); /* GCR_CNTL */
- } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH |
- SI_CONTEXT_CS_PARTIAL_FLUSH))) {
- /* We need to ensure that PFP waits as well. */
- radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
- radeon_emit(cs, 0);
- }
-
- if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
- } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
- }
-
- ctx->flags = 0;
-}
-
-extern "C"
-void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
-{
- uint32_t flags = sctx->flags;
-
- if (!sctx->has_graphics) {
- /* Only process compute flags. */
- flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
- SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
- }
-
- uint32_t cp_coher_cntl = 0;
- const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB);
- const bool is_barrier =
- flush_cb_db ||
- /* INV_ICACHE == beginning of gfx IB. Checking
- * INV_ICACHE fixes corruption for DeusExMD with
- * compute-based culling, but I don't know why.
- */
- flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) ||
- (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy);
-
- assert(sctx->chip_class <= GFX9);
-
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
- sctx->num_cb_cache_flushes++;
- if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
- sctx->num_db_cache_flushes++;
-
- /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
- * bit is set. An alternative way is to write SQC_CACHES, but that
- * doesn't seem to work reliably. Since the bug doesn't affect
- * correctness (it only does more work than necessary) and
- * the performance impact is likely negligible, there is no plan
- * to add a workaround for it.
- */
-
- if (flags & SI_CONTEXT_INV_ICACHE)
- cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
- if (flags & SI_CONTEXT_INV_SCACHE)
- cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
-
- if (sctx->chip_class <= GFX8) {
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
- cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) |
- S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) |
- S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) |
- S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) |
- S_0085F0_CB7_DEST_BASE_ENA(1);
-
- /* Necessary for DCC */
- if (sctx->chip_class == GFX8)
- si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM,
- EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY);
- }
- if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
- cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
- }
-
- if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
- /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
- }
- if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) {
- /* Flush HTILE. SURFACE_SYNC will wait for idle. */
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));
- }
-
- /* Wait for shader engines to go idle.
- * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait
- * for everything including CB/DB cache flushes.
- */
- if (!flush_cb_db) {
- if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- /* Only count explicit shader flushes, not implicit ones
- * done by SURFACE_SYNC.
- */
- sctx->num_vs_flushes++;
- sctx->num_ps_flushes++;
- } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- sctx->num_vs_flushes++;
- }
- }
-
- if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
- sctx->num_cs_flushes++;
- sctx->compute_is_busy = false;
- }
-
- /* VGT state synchronization. */
- if (flags & SI_CONTEXT_VGT_FLUSH) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
- }
- if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
- }
-
- /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
- * wait for idle on GFX9. We have to use a TS event.
- */
- if (sctx->chip_class == GFX9 && flush_cb_db) {
- uint64_t va;
- unsigned tc_flags, cb_db_event;
-
- /* Set the CB/DB flush event. */
- switch (flush_cb_db) {
- case SI_CONTEXT_FLUSH_AND_INV_CB:
- cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
- break;
- case SI_CONTEXT_FLUSH_AND_INV_DB:
- cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
- break;
- default:
- /* both CB & DB */
- cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
- }
-
- /* These are the only allowed combinations. If you need to
- * do multiple operations at once, do them separately.
- * All operations that invalidate L2 also seem to invalidate
- * metadata. Volatile (VOL) and WC flushes are not listed here.
- *
- * TC | TC_WB = writeback & invalidate L2 & L1
- * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC
- * TC_WB | TC_NC = writeback L2 for MTYPE == NC
- * TC | TC_NC = invalidate L2 for MTYPE == NC
- * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.)
- * TCL1 = invalidate L1
- */
- tc_flags = 0;
-
- if (flags & SI_CONTEXT_INV_L2_METADATA) {
- tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA;
- }
-
- /* Ideally flush TC together with CB/DB. */
- if (flags & SI_CONTEXT_INV_L2) {
- /* Writeback and invalidate everything in L2 & L1. */
- tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA;
-
- /* Clear the flags. */
- flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE);
- sctx->num_L2_invalidates++;
- }
-
- /* Do the flush (enqueue the event and wait for it). */
- struct si_resource* wait_mem_scratch = unlikely(sctx->ws->cs_is_secure(cs)) ?
- sctx->wait_mem_scratch_tmz : sctx->wait_mem_scratch;
- va = wait_mem_scratch->gpu_address;
- sctx->wait_mem_number++;
-
- si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM,
- EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT,
- wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY);
- si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
- }
-
- /* Make sure ME is idle (it executes most packets) before continuing.
- * This prevents read-after-write hazards between PFP and ME.
- */
- if (sctx->has_graphics &&
- (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
- SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
- radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
- radeon_emit(cs, 0);
- }
-
- /* GFX6-GFX8 only:
- * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
- * waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
- *
- * cp_coher_cntl should contain all necessary flags except TC flags
- * at this point.
- *
- * GFX6-GFX7 don't support L2 write-back.
- */
- if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) {
- /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
- * WB must be set on GFX8+ when TC_ACTION is set.
- */
- si_emit_surface_sync(sctx, cs,
- cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
- S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
- cp_coher_cntl = 0;
- sctx->num_L2_invalidates++;
- } else {
- /* L1 invalidation and L2 writeback must be done separately,
- * because both operations can't be done together.
- */
- if (flags & SI_CONTEXT_WB_L2) {
- /* WB = write-back
- * NC = apply to non-coherent MTYPEs
- * (i.e. MTYPE <= 1, which is what we use everywhere)
- *
- * WB doesn't work without NC.
- */
- si_emit_surface_sync(
- sctx, cs,
- cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1));
- cp_coher_cntl = 0;
- sctx->num_L2_writebacks++;
- }
- if (flags & SI_CONTEXT_INV_VCACHE) {
- /* Invalidate per-CU VMEM L1. */
- si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1));
- cp_coher_cntl = 0;
- }
- }
-
- /* If TC flushes haven't cleared this... */
- if (cp_coher_cntl)
- si_emit_surface_sync(sctx, cs, cp_coher_cntl);
-
- if (is_barrier)
- si_prim_discard_signal_next_compute_ib_start(sctx);
-
- if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
- } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
- radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
- }
-
- sctx->flags = 0;
-}
-
template <chip_class GFX_VERSION> ALWAYS_INLINE
static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{