author     Marek Olšák <marek.olsak@amd.com>    2020-12-26 20:59:38 -0500
committer  Marge Bot <eric+marge@anholt.net>    2021-01-18 01:17:19 +0000
commit     4056e953fe43bd667e1812c1c7075285d24b42c2 (patch)
tree       20e736d36d2436e2def60109d64a2c991584d1ed
parent     1ceec51b128660d5f9037dd10ec5b0229a2378d1 (diff)
radeonsi: move emit_cache_flush functions into si_gfx_cs.c
This is a better place for them. They are not inlined anyway.
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8548>
-rw-r--r--  src/gallium/drivers/radeonsi/si_gfx_cs.c        | 445
-rw-r--r--  src/gallium/drivers/radeonsi/si_pipe.h          |   4
-rw-r--r--  src/gallium/drivers/radeonsi/si_state.h         |   4
-rw-r--r--  src/gallium/drivers/radeonsi/si_state_draw.cpp  | 447
4 files changed, 449 insertions, 451 deletions
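
For context, the two moved entry points split by chip generation: si_emit_cache_flush asserts chip_class <= GFX9, while gfx10_emit_cache_flush is its GFX10+ counterpart. Both consume the flush request accumulated in sctx->flags and reset it to zero once the packets are emitted. A rough usage sketch follows; the dispatch shown here is illustrative only and not part of this patch, and cs is assumed to be the graphics command buffer currently being recorded:

   /* Illustrative only: accumulate SI_CONTEXT_* bits, then emit them in one
    * go with the chip-appropriate function, which clears sctx->flags. */
   sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |  /* flush CMASK/FMASK/DCC */
                  SI_CONTEXT_INV_VCACHE;         /* invalidate per-CU VMEM L1 */

   if (sctx->chip_class >= GFX10)
      gfx10_emit_cache_flush(sctx, cs);   /* GCR_CNTL / ACQUIRE_MEM path */
   else
      si_emit_cache_flush(sctx, cs);      /* CP_COHER_CNTL / SURFACE_SYNC path */
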
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 42d895de91e..a72062d15f0 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -552,3 +552,448 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) ctx->index_ring_offset = 0; } + +void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl) +{ + bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs; + + assert(sctx->chip_class <= GFX9); + + if (sctx->chip_class == GFX9 || compute_ib) { + /* Flush caches and wait for the caches to assert idle. */ + radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0)); + radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + } else { + /* ACQUIRE_MEM is only required on a compute ring. */ + radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); + radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + } + + /* ACQUIRE_MEM has an implicit context roll if the current context + * is busy. */ + if (!compute_ib) + sctx->context_roll = true; +} + +void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) +{ + uint32_t gcr_cntl = 0; + unsigned cb_db_event = 0; + unsigned flags = ctx->flags; + + if (!ctx->has_graphics) { + /* Only process compute flags. */ + flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* We don't need these. */ + assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META))); + + if (flags & SI_CONTEXT_VGT_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + } + + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) + ctx->num_cb_cache_flushes++; + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) + ctx->num_db_cache_flushes++; + + if (flags & SI_CONTEXT_INV_ICACHE) + gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL); + if (flags & SI_CONTEXT_INV_SCACHE) { + /* TODO: When writing to the SMEM L1 cache, we need to set SEQ + * to FORWARD when both L1 and L2 are written out (WB or INV). + */ + gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1); + } + if (flags & SI_CONTEXT_INV_VCACHE) + gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1); + + /* The L2 cache ops are: + * - INV: - invalidate lines that reflect memory (were loaded from memory) + * - don't touch lines that were overwritten (were stored by gfx clients) + * - WB: - don't touch lines that reflect memory + * - write back lines that were overwritten + * - WB | INV: - invalidate lines that reflect memory + * - write back lines that were overwritten + * + * GLM doesn't support WB alone. If WB is set, INV must be set too. + */ + if (flags & SI_CONTEXT_INV_L2) { + /* Writeback and invalidate everything in L2. 
*/ + gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1); + ctx->num_L2_invalidates++; + } else if (flags & SI_CONTEXT_WB_L2) { + gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1); + } else if (flags & SI_CONTEXT_INV_L2_METADATA) { + gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); + } + + if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + /* Flush CMASK/FMASK/DCC. Will wait for idle later. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + } + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { + /* Flush HTILE. Will wait for idle later. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); + } + + /* First flush CB/DB, then L1/L2. */ + gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD); + + if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) == + (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { + cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; + } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; + } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { + cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; + } else { + assert(0); + } + } else { + /* Wait for graphics shaders to go idle if requested. */ + if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + /* Only count explicit shader flushes, not implicit ones. */ + ctx->num_vs_flushes++; + ctx->num_ps_flushes++; + } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + ctx->num_vs_flushes++; + } + } + + if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); + ctx->num_cs_flushes++; + ctx->compute_is_busy = false; + } + + if (cb_db_event) { + struct si_resource* wait_mem_scratch = unlikely(ctx->ws->cs_is_secure(cs)) ? + ctx->wait_mem_scratch_tmz : ctx->wait_mem_scratch; + /* CB/DB flush and invalidate (or possibly just a wait for a + * meta flush) via RELEASE_MEM. + * + * Combine this with other cache flushes when possible; this + * requires affected shaders to be idle, so do it after the + * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always + * implied). + */ + uint64_t va; + + /* Do the flush (enqueue the event and wait for it). */ + va = wait_mem_scratch->gpu_address; + ctx->wait_mem_number++; + + /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. 
*/ + unsigned glm_wb = G_586_GLM_WB(gcr_cntl); + unsigned glm_inv = G_586_GLM_INV(gcr_cntl); + unsigned glv_inv = G_586_GLV_INV(gcr_cntl); + unsigned gl1_inv = G_586_GL1_INV(gcr_cntl); + assert(G_586_GL2_US(gcr_cntl) == 0); + assert(G_586_GL2_RANGE(gcr_cntl) == 0); + assert(G_586_GL2_DISCARD(gcr_cntl) == 0); + unsigned gl2_inv = G_586_GL2_INV(gcr_cntl); + unsigned gl2_wb = G_586_GL2_WB(gcr_cntl); + unsigned gcr_seq = G_586_SEQ(gcr_cntl); + + gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV & + C_586_GL2_WB; /* keep SEQ */ + + si_cp_release_mem(ctx, cs, cb_db_event, + S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) | + S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) | + S_490_SEQ(gcr_seq), + EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, + EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number, + SI_NOT_QUERY); + si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); + } + + /* Ignore fields that only modify the behavior of other fields. */ + if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) { + /* Flush caches and wait for the caches to assert idle. + * The cache flush is executed in the ME, but the PFP waits + * for completion. + */ + radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); + radeon_emit(cs, 0); /* CP_COHER_CNTL */ + radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ + radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ + radeon_emit(cs, 0); /* CP_COHER_BASE */ + radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ + radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ + radeon_emit(cs, gcr_cntl); /* GCR_CNTL */ + } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH))) { + /* We need to ensure that PFP waits as well. */ + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } + + if (flags & SI_CONTEXT_START_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); + } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); + } + + ctx->flags = 0; +} + +void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) +{ + uint32_t flags = sctx->flags; + + if (!sctx->has_graphics) { + /* Only process compute flags. */ + flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + uint32_t cp_coher_cntl = 0; + const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB); + const bool is_barrier = + flush_cb_db || + /* INV_ICACHE == beginning of gfx IB. Checking + * INV_ICACHE fixes corruption for DeusExMD with + * compute-based culling, but I don't know why. + */ + flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) || + (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy); + + assert(sctx->chip_class <= GFX9); + + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) + sctx->num_cb_cache_flushes++; + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) + sctx->num_db_cache_flushes++; + + /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either + * bit is set. An alternative way is to write SQC_CACHES, but that + * doesn't seem to work reliably. 
Since the bug doesn't affect + * correctness (it only does more work than necessary) and + * the performance impact is likely negligible, there is no plan + * to add a workaround for it. + */ + + if (flags & SI_CONTEXT_INV_ICACHE) + cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); + if (flags & SI_CONTEXT_INV_SCACHE) + cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); + + if (sctx->chip_class <= GFX8) { + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | + S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) | + S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) | + S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) | + S_0085F0_CB7_DEST_BASE_ENA(1); + + /* Necessary for DCC */ + if (sctx->chip_class == GFX8) + si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM, + EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY); + } + if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) + cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1); + } + + if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { + /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); + } + if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) { + /* Flush HTILE. SURFACE_SYNC will wait for idle. */ + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); + } + + /* Wait for shader engines to go idle. + * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait + * for everything including CB/DB cache flushes. + */ + if (!flush_cb_db) { + if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + /* Only count explicit shader flushes, not implicit ones + * done by SURFACE_SYNC. + */ + sctx->num_vs_flushes++; + sctx->num_ps_flushes++; + } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + sctx->num_vs_flushes++; + } + } + + if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + sctx->num_cs_flushes++; + sctx->compute_is_busy = false; + } + + /* VGT state synchronization. */ + if (flags & SI_CONTEXT_VGT_FLUSH) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); + } + if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); + } + + /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't + * wait for idle on GFX9. We have to use a TS event. + */ + if (sctx->chip_class == GFX9 && flush_cb_db) { + uint64_t va; + unsigned tc_flags, cb_db_event; + + /* Set the CB/DB flush event. 
*/ + switch (flush_cb_db) { + case SI_CONTEXT_FLUSH_AND_INV_CB: + cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; + break; + case SI_CONTEXT_FLUSH_AND_INV_DB: + cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; + break; + default: + /* both CB & DB */ + cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; + } + + /* These are the only allowed combinations. If you need to + * do multiple operations at once, do them separately. + * All operations that invalidate L2 also seem to invalidate + * metadata. Volatile (VOL) and WC flushes are not listed here. + * + * TC | TC_WB = writeback & invalidate L2 & L1 + * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC + * TC_WB | TC_NC = writeback L2 for MTYPE == NC + * TC | TC_NC = invalidate L2 for MTYPE == NC + * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) + * TCL1 = invalidate L1 + */ + tc_flags = 0; + + if (flags & SI_CONTEXT_INV_L2_METADATA) { + tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA; + } + + /* Ideally flush TC together with CB/DB. */ + if (flags & SI_CONTEXT_INV_L2) { + /* Writeback and invalidate everything in L2 & L1. */ + tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA; + + /* Clear the flags. */ + flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE); + sctx->num_L2_invalidates++; + } + + /* Do the flush (enqueue the event and wait for it). */ + struct si_resource* wait_mem_scratch = unlikely(sctx->ws->cs_is_secure(cs)) ? + sctx->wait_mem_scratch_tmz : sctx->wait_mem_scratch; + va = wait_mem_scratch->gpu_address; + sctx->wait_mem_number++; + + si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM, + EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT, + wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY); + si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); + } + + /* Make sure ME is idle (it executes most packets) before continuing. + * This prevents read-after-write hazards between PFP and ME. + */ + if (sctx->has_graphics && + (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) { + radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); + radeon_emit(cs, 0); + } + + /* GFX6-GFX8 only: + * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC + * waits for idle, so it should be last. SURFACE_SYNC is done in PFP. + * + * cp_coher_cntl should contain all necessary flags except TC flags + * at this point. + * + * GFX6-GFX7 don't support L2 write-back. + */ + if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) { + /* Invalidate L1 & L2. (L1 is always invalidated on GFX6) + * WB must be set on GFX8+ when TC_ACTION is set. + */ + si_emit_surface_sync(sctx, cs, + cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | + S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8)); + cp_coher_cntl = 0; + sctx->num_L2_invalidates++; + } else { + /* L1 invalidation and L2 writeback must be done separately, + * because both operations can't be done together. + */ + if (flags & SI_CONTEXT_WB_L2) { + /* WB = write-back + * NC = apply to non-coherent MTYPEs + * (i.e. MTYPE <= 1, which is what we use everywhere) + * + * WB doesn't work without NC. + */ + si_emit_surface_sync( + sctx, cs, + cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1)); + cp_coher_cntl = 0; + sctx->num_L2_writebacks++; + } + if (flags & SI_CONTEXT_INV_VCACHE) { + /* Invalidate per-CU VMEM L1. 
*/ + si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1)); + cp_coher_cntl = 0; + } + } + + /* If TC flushes haven't cleared this... */ + if (cp_coher_cntl) + si_emit_surface_sync(sctx, cs, cp_coher_cntl); + + if (is_barrier) + si_prim_discard_signal_next_compute_ib_start(sctx); + + if (flags & SI_CONTEXT_START_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); + } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); + } + + sctx->flags = 0; +} diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 339021a792d..f269b6ec1c0 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1450,6 +1450,10 @@ void si_allocate_gds(struct si_context *ctx); void si_set_tracked_regs_to_clear_state(struct si_context *ctx); void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs); void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws); +void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, + unsigned cp_coher_cntl); +void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); +void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); /* si_gpu_load.c */ void si_gpu_load_kill_thread(struct si_screen *sscreen); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 2de248f3705..a293787487a 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -587,11 +587,7 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs); bool si_update_ngg(struct si_context *sctx); /* si_state_draw.c */ -void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, - unsigned cp_coher_cntl); void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); -void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); -void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_trace_emit(struct si_context *sctx); void si_init_draw_functions(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index c4042b53533..d6993a281e6 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -1136,37 +1136,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw } extern "C" -void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl) -{ - bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs; - - assert(sctx->chip_class <= GFX9); - - if (sctx->chip_class == GFX9 || compute_ib) { - /* Flush caches and wait for the caches to assert idle. */ - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0)); - radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - } else { - /* ACQUIRE_MEM is only required on a compute ring. 
*/ - radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0)); - radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - } - - /* ACQUIRE_MEM has an implicit context roll if the current context - * is busy. */ - if (!compute_ib) - sctx->context_roll = true; -} - -extern "C" void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx) { if (!si_compute_prim_discard_enabled(sctx)) @@ -1192,422 +1161,6 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx) *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0); } -void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs) -{ - uint32_t gcr_cntl = 0; - unsigned cb_db_event = 0; - unsigned flags = ctx->flags; - - if (!ctx->has_graphics) { - /* Only process compute flags. */ - flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA | - SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - /* We don't need these. */ - assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META))); - - if (flags & SI_CONTEXT_VGT_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); - } - - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) - ctx->num_cb_cache_flushes++; - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) - ctx->num_db_cache_flushes++; - - if (flags & SI_CONTEXT_INV_ICACHE) - gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL); - if (flags & SI_CONTEXT_INV_SCACHE) { - /* TODO: When writing to the SMEM L1 cache, we need to set SEQ - * to FORWARD when both L1 and L2 are written out (WB or INV). - */ - gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1); - } - if (flags & SI_CONTEXT_INV_VCACHE) - gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1); - - /* The L2 cache ops are: - * - INV: - invalidate lines that reflect memory (were loaded from memory) - * - don't touch lines that were overwritten (were stored by gfx clients) - * - WB: - don't touch lines that reflect memory - * - write back lines that were overwritten - * - WB | INV: - invalidate lines that reflect memory - * - write back lines that were overwritten - * - * GLM doesn't support WB alone. If WB is set, INV must be set too. - */ - if (flags & SI_CONTEXT_INV_L2) { - /* Writeback and invalidate everything in L2. */ - gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1); - ctx->num_L2_invalidates++; - } else if (flags & SI_CONTEXT_WB_L2) { - gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1); - } else if (flags & SI_CONTEXT_INV_L2_METADATA) { - gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1); - } - - if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - /* Flush CMASK/FMASK/DCC. Will wait for idle later. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); - } - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { - /* Flush HTILE. Will wait for idle later. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); - } - - /* First flush CB/DB, then L1/L2. 
*/ - gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD); - - if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) == - (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) { - cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; - } else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; - } else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) { - cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; - } else { - assert(0); - } - } else { - /* Wait for graphics shaders to go idle if requested. */ - if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - /* Only count explicit shader flushes, not implicit ones. */ - ctx->num_vs_flushes++; - ctx->num_ps_flushes++; - } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - ctx->num_vs_flushes++; - } - } - - if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); - ctx->num_cs_flushes++; - ctx->compute_is_busy = false; - } - - if (cb_db_event) { - struct si_resource* wait_mem_scratch = unlikely(ctx->ws->cs_is_secure(cs)) ? - ctx->wait_mem_scratch_tmz : ctx->wait_mem_scratch; - /* CB/DB flush and invalidate (or possibly just a wait for a - * meta flush) via RELEASE_MEM. - * - * Combine this with other cache flushes when possible; this - * requires affected shaders to be idle, so do it after the - * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always - * implied). - */ - uint64_t va; - - /* Do the flush (enqueue the event and wait for it). */ - va = wait_mem_scratch->gpu_address; - ctx->wait_mem_number++; - - /* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */ - unsigned glm_wb = G_586_GLM_WB(gcr_cntl); - unsigned glm_inv = G_586_GLM_INV(gcr_cntl); - unsigned glv_inv = G_586_GLV_INV(gcr_cntl); - unsigned gl1_inv = G_586_GL1_INV(gcr_cntl); - assert(G_586_GL2_US(gcr_cntl) == 0); - assert(G_586_GL2_RANGE(gcr_cntl) == 0); - assert(G_586_GL2_DISCARD(gcr_cntl) == 0); - unsigned gl2_inv = G_586_GL2_INV(gcr_cntl); - unsigned gl2_wb = G_586_GL2_WB(gcr_cntl); - unsigned gcr_seq = G_586_SEQ(gcr_cntl); - - gcr_cntl &= C_586_GLM_WB & C_586_GLM_INV & C_586_GLV_INV & C_586_GL1_INV & C_586_GL2_INV & - C_586_GL2_WB; /* keep SEQ */ - - si_cp_release_mem(ctx, cs, cb_db_event, - S_490_GLM_WB(glm_wb) | S_490_GLM_INV(glm_inv) | S_490_GLV_INV(glv_inv) | - S_490_GL1_INV(gl1_inv) | S_490_GL2_INV(gl2_inv) | S_490_GL2_WB(gl2_wb) | - S_490_SEQ(gcr_seq), - EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, - EOP_DATA_SEL_VALUE_32BIT, wait_mem_scratch, va, ctx->wait_mem_number, - SI_NOT_QUERY); - si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); - } - - /* Ignore fields that only modify the behavior of other fields. */ - if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) { - /* Flush caches and wait for the caches to assert idle. - * The cache flush is executed in the ME, but the PFP waits - * for completion. 
- */ - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, 0); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, gcr_cntl); /* GCR_CNTL */ - } else if (cb_db_event || (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH))) { - /* We need to ensure that PFP waits as well. */ - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); - } - - if (flags & SI_CONTEXT_START_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); - } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); - } - - ctx->flags = 0; -} - -extern "C" -void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) -{ - uint32_t flags = sctx->flags; - - if (!sctx->has_graphics) { - /* Only process compute flags. */ - flags &= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA | - SI_CONTEXT_CS_PARTIAL_FLUSH; - } - - uint32_t cp_coher_cntl = 0; - const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB); - const bool is_barrier = - flush_cb_db || - /* INV_ICACHE == beginning of gfx IB. Checking - * INV_ICACHE fixes corruption for DeusExMD with - * compute-based culling, but I don't know why. - */ - flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) || - (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy); - - assert(sctx->chip_class <= GFX9); - - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) - sctx->num_cb_cache_flushes++; - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) - sctx->num_db_cache_flushes++; - - /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either - * bit is set. An alternative way is to write SQC_CACHES, but that - * doesn't seem to work reliably. Since the bug doesn't affect - * correctness (it only does more work than necessary) and - * the performance impact is likely negligible, there is no plan - * to add a workaround for it. - */ - - if (flags & SI_CONTEXT_INV_ICACHE) - cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); - if (flags & SI_CONTEXT_INV_SCACHE) - cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); - - if (sctx->chip_class <= GFX8) { - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | - S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) | - S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) | - S_0085F0_CB5_DEST_BASE_ENA(1) | S_0085F0_CB6_DEST_BASE_ENA(1) | - S_0085F0_CB7_DEST_BASE_ENA(1); - - /* Necessary for DCC */ - if (sctx->chip_class == GFX8) - si_cp_release_mem(sctx, cs, V_028A90_FLUSH_AND_INV_CB_DATA_TS, 0, EOP_DST_SEL_MEM, - EOP_INT_SEL_NONE, EOP_DATA_SEL_DISCARD, NULL, 0, 0, SI_NOT_QUERY); - } - if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) - cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1); - } - - if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { - /* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. 
*/ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0)); - } - if (flags & (SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_FLUSH_AND_INV_DB_META)) { - /* Flush HTILE. SURFACE_SYNC will wait for idle. */ - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0)); - } - - /* Wait for shader engines to go idle. - * VS and PS waits are unnecessary if SURFACE_SYNC is going to wait - * for everything including CB/DB cache flushes. - */ - if (!flush_cb_db) { - if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - /* Only count explicit shader flushes, not implicit ones - * done by SURFACE_SYNC. - */ - sctx->num_vs_flushes++; - sctx->num_ps_flushes++; - } else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - sctx->num_vs_flushes++; - } - } - - if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - sctx->num_cs_flushes++; - sctx->compute_is_busy = false; - } - - /* VGT state synchronization. */ - if (flags & SI_CONTEXT_VGT_FLUSH) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); - } - if (flags & SI_CONTEXT_VGT_STREAMOUT_SYNC) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0)); - } - - /* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't - * wait for idle on GFX9. We have to use a TS event. - */ - if (sctx->chip_class == GFX9 && flush_cb_db) { - uint64_t va; - unsigned tc_flags, cb_db_event; - - /* Set the CB/DB flush event. */ - switch (flush_cb_db) { - case SI_CONTEXT_FLUSH_AND_INV_CB: - cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS; - break; - case SI_CONTEXT_FLUSH_AND_INV_DB: - cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS; - break; - default: - /* both CB & DB */ - cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT; - } - - /* These are the only allowed combinations. If you need to - * do multiple operations at once, do them separately. - * All operations that invalidate L2 also seem to invalidate - * metadata. Volatile (VOL) and WC flushes are not listed here. - * - * TC | TC_WB = writeback & invalidate L2 & L1 - * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC - * TC_WB | TC_NC = writeback L2 for MTYPE == NC - * TC | TC_NC = invalidate L2 for MTYPE == NC - * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) - * TCL1 = invalidate L1 - */ - tc_flags = 0; - - if (flags & SI_CONTEXT_INV_L2_METADATA) { - tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA; - } - - /* Ideally flush TC together with CB/DB. */ - if (flags & SI_CONTEXT_INV_L2) { - /* Writeback and invalidate everything in L2 & L1. */ - tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA; - - /* Clear the flags. */ - flags &= ~(SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_VCACHE); - sctx->num_L2_invalidates++; - } - - /* Do the flush (enqueue the event and wait for it). */ - struct si_resource* wait_mem_scratch = unlikely(sctx->ws->cs_is_secure(cs)) ? 
- sctx->wait_mem_scratch_tmz : sctx->wait_mem_scratch; - va = wait_mem_scratch->gpu_address; - sctx->wait_mem_number++; - - si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM, - EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_VALUE_32BIT, - wait_mem_scratch, va, sctx->wait_mem_number, SI_NOT_QUERY); - si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); - } - - /* Make sure ME is idle (it executes most packets) before continuing. - * This prevents read-after-write hazards between PFP and ME. - */ - if (sctx->has_graphics && - (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE | - SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) { - radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); - radeon_emit(cs, 0); - } - - /* GFX6-GFX8 only: - * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC - * waits for idle, so it should be last. SURFACE_SYNC is done in PFP. - * - * cp_coher_cntl should contain all necessary flags except TC flags - * at this point. - * - * GFX6-GFX7 don't support L2 write-back. - */ - if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && (flags & SI_CONTEXT_WB_L2))) { - /* Invalidate L1 & L2. (L1 is always invalidated on GFX6) - * WB must be set on GFX8+ when TC_ACTION is set. - */ - si_emit_surface_sync(sctx, cs, - cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8)); - cp_coher_cntl = 0; - sctx->num_L2_invalidates++; - } else { - /* L1 invalidation and L2 writeback must be done separately, - * because both operations can't be done together. - */ - if (flags & SI_CONTEXT_WB_L2) { - /* WB = write-back - * NC = apply to non-coherent MTYPEs - * (i.e. MTYPE <= 1, which is what we use everywhere) - * - * WB doesn't work without NC. - */ - si_emit_surface_sync( - sctx, cs, - cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1)); - cp_coher_cntl = 0; - sctx->num_L2_writebacks++; - } - if (flags & SI_CONTEXT_INV_VCACHE) { - /* Invalidate per-CU VMEM L1. */ - si_emit_surface_sync(sctx, cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1)); - cp_coher_cntl = 0; - } - } - - /* If TC flushes haven't cleared this... */ - if (cp_coher_cntl) - si_emit_surface_sync(sctx, cs, cp_coher_cntl); - - if (is_barrier) - si_prim_discard_signal_next_compute_ib_start(sctx); - - if (flags & SI_CONTEXT_START_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0)); - } else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) { - radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); - radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0)); - } - - sctx->flags = 0; -} - template <chip_class GFX_VERSION> ALWAYS_INLINE static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) { |
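
A note on the GFX10 L2 handling moved above: the generic L2 flags map onto GCR_CNTL bits as in the condensed extract below, taken from gfx10_emit_cache_flush. Because the GLM metadata cache has no write-back-only mode, GLM_INV always accompanies GLM_WB:

   if (flags & SI_CONTEXT_INV_L2) {
      /* Write back and invalidate everything in GL2, including metadata. */
      gcr_cntl |= S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | S_586_GLM_WB(1);
   } else if (flags & SI_CONTEXT_WB_L2) {
      /* Write back GL2; GLM still gets INV because WB alone is unsupported. */
      gcr_cntl |= S_586_GL2_WB(1) | S_586_GLM_WB(1) | S_586_GLM_INV(1);
   } else if (flags & SI_CONTEXT_INV_L2_METADATA) {
      /* Metadata (GLM) cache only. */
      gcr_cntl |= S_586_GLM_INV(1) | S_586_GLM_WB(1);
   }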