author     Marek Olšák <marek.olsak@amd.com>   2021-01-09 15:14:22 -0500
committer  Marge Bot <eric+marge@anholt.net>   2021-01-22 16:45:29 +0000
commit     a0978fffb849264ccb20e6b4905b9cf05ed17593 (patch)
tree       3759333a16c4272a73f2230185a9ab7190c2bf1b
parent     3ef89b245e3e1ac4e67fea9c1b13ebeda75769d0 (diff)
radeonsi: add new possibly faster command submission helpers
This decreases the release libgallium_dri.so size without debug symbols
by 16384 bytes. The CPU time spent in si_emit_draw_packets decreased
from 4.5% to 4.1% in viewperf13/catia/plane01.

The previous code did:

    cs->current.buf[cs->current.cdw++] = ...;
    cs->current.buf[cs->current.cdw++] = ...;
    cs->current.buf[cs->current.cdw++] = ...;
    cs->current.buf[cs->current.cdw++] = ...;

The new code does:

    unsigned num = cs->current.cdw;
    uint32_t *buf = cs->current.buf;
    buf[num++] = ...;
    buf[num++] = ...;
    buf[num++] = ...;
    buf[num++] = ...;
    cs->current.cdw = num;

The calling code stays the same (radeon_emit is redefined as a macro),
except that all set and emit functions must now be surrounded by
radeon_begin(cs) and radeon_end().

radeon_packets_added() returns whether any new packets have been added
since radeon_begin.

radeon_end_update_context_roll(sctx) sets sctx->context_roll = true if
any new packets have been added since radeon_begin.

For now, the "cs" parameter is intentionally unused in radeon_emit and
radeon_emit_array.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8653>
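As a concrete illustration of the new pattern, a typical emit helper now
looks like the sketch below. This is a hypothetical example written
against the helpers added in si_build_pm4.h, not a function from this
patch; si_emit_example_state and its vgt_gs_mode parameter are invented
for illustration, while struct si_context, gfx_cs, and the
R_028A40_VGT_GS_MODE register define are the driver's existing names.

    /* Hypothetical helper showing the radeon_begin/radeon_end contract. */
    static void si_emit_example_state(struct si_context *sctx, unsigned vgt_gs_mode)
    {
       struct radeon_cmdbuf *cs = &sctx->gfx_cs;

       /* Caches cs->current.cdw and cs->current.buf in locals;
        * every radeon_emit between begin and end stores through them. */
       radeon_begin(cs);
       radeon_set_context_reg(cs, R_028A40_VGT_GS_MODE, vgt_gs_mode);
       /* Writes the cached dword count back to cs->current.cdw; this
        * variant also sets sctx->context_roll if radeon_packets_added(). */
       radeon_end_update_context_roll(sctx);
    }

Code that must call a function which does its own begin/end in the
middle of a sequence first calls radeon_end(), then resumes with
radeon_begin_again(cs), as si_setup_nir_user_data does in the diff
below.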
-rw-r--r--  src/gallium/drivers/radeonsi/si_build_pm4.h            | 446
-rw-r--r--  src/gallium/drivers/radeonsi/si_compute.c              |  21
-rw-r--r--  src/gallium/drivers/radeonsi/si_compute_prim_discard.c |  19
-rw-r--r--  src/gallium/drivers/radeonsi/si_cp_dma.c               |  10
-rw-r--r--  src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c     |  12
-rw-r--r--  src/gallium/drivers/radeonsi/si_descriptors.c          | 128
-rw-r--r--  src/gallium/drivers/radeonsi/si_fence.c                |   6
-rw-r--r--  src/gallium/drivers/radeonsi/si_gfx_cs.c               |  21
-rw-r--r--  src/gallium/drivers/radeonsi/si_perfcounter.c          |  17
-rw-r--r--  src/gallium/drivers/radeonsi/si_pm4.c                  |   2
-rw-r--r--  src/gallium/drivers/radeonsi/si_query.c                |  24
-rw-r--r--  src/gallium/drivers/radeonsi/si_sqtt.c                 |  46
-rw-r--r--  src/gallium/drivers/radeonsi/si_state.c                |  39
-rw-r--r--  src/gallium/drivers/radeonsi/si_state_binning.c        |  10
-rw-r--r--  src/gallium/drivers/radeonsi/si_state_draw.cpp         |  56
-rw-r--r--  src/gallium/drivers/radeonsi/si_state_msaa.c           |   4
-rw-r--r--  src/gallium/drivers/radeonsi/si_state_shaders.c        |  77
-rw-r--r--  src/gallium/drivers/radeonsi/si_state_streamout.c      |  15
-rw-r--r--  src/gallium/drivers/radeonsi/si_state_viewport.c       |  35
19 files changed, 593 insertions, 395 deletions
diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
index 3ccf3529d56..e08ffe2f305 100644
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -39,249 +39,251 @@
#define SI_CHECK_SHADOWED_REGS(reg_offset, count)
#endif
-static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
-{
- SI_CHECK_SHADOWED_REGS(reg, num);
- assert(reg < SI_CONTEXT_REG_OFFSET);
- assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
- radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
-}
-
-static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
-{
- radeon_set_config_reg_seq(cs, reg, 1);
- radeon_emit(cs, value);
-}
-
-static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
-{
- SI_CHECK_SHADOWED_REGS(reg, num);
- assert(reg >= SI_CONTEXT_REG_OFFSET);
- assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
- radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
-}
-
-static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
-{
- radeon_set_context_reg_seq(cs, reg, 1);
- radeon_emit(cs, value);
-}
-
-static inline void radeon_set_context_reg_seq_array(struct radeon_cmdbuf *cs, unsigned reg,
- unsigned num, const uint32_t *values)
-{
- radeon_set_context_reg_seq(cs, reg, num);
- radeon_emit_array(cs, values, num);
-}
-
-static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, unsigned reg, unsigned idx,
- unsigned value)
-{
- SI_CHECK_SHADOWED_REGS(reg, 1);
- assert(reg >= SI_CONTEXT_REG_OFFSET);
- assert(cs->current.cdw + 3 <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0));
- radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28));
- radeon_emit(cs, value);
-}
-
-static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
-{
- SI_CHECK_SHADOWED_REGS(reg, num);
- assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
- assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
- radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
-}
-
-static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
-{
- radeon_set_sh_reg_seq(cs, reg, 1);
- radeon_emit(cs, value);
-}
-
-static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num, bool perfctr)
-{
- SI_CHECK_SHADOWED_REGS(reg, num);
- assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
- assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, perfctr));
- radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
-}
-
-static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
-{
- radeon_set_uconfig_reg_seq(cs, reg, 1, false);
- radeon_emit(cs, value);
-}
-
-static inline void radeon_set_uconfig_reg_perfctr(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
-{
- radeon_set_uconfig_reg_seq(cs, reg, 1, true);
- radeon_emit(cs, value);
-}
-
-static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, struct si_screen *screen,
- unsigned reg, unsigned idx, unsigned value)
-{
- SI_CHECK_SHADOWED_REGS(reg, 1);
- assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
- assert(cs->current.cdw + 3 <= cs->current.max_dw);
- assert(idx != 0);
- unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
- if (screen->info.chip_class < GFX9 ||
- (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26))
- opcode = PKT3_SET_UCONFIG_REG;
- radeon_emit(cs, PKT3(opcode, 1, 0));
- radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
- radeon_emit(cs, value);
-}
-
-static inline void radeon_set_context_reg_rmw(struct radeon_cmdbuf *cs, unsigned reg,
- unsigned value, unsigned mask)
-{
- SI_CHECK_SHADOWED_REGS(reg, 1);
- assert(reg >= SI_CONTEXT_REG_OFFSET);
- assert(cs->current.cdw + 4 <= cs->current.max_dw);
- radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0));
- radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
- radeon_emit(cs, mask);
- radeon_emit(cs, value);
-}
+#define radeon_begin(cs) struct radeon_cmdbuf *__cs = (cs); \
+ unsigned __cs_num = __cs->current.cdw; \
+ UNUSED unsigned __cs_num_initial = __cs_num; \
+ uint32_t *__cs_buf = __cs->current.buf
+
+#define radeon_begin_again(cs) do { \
+ assert(__cs == NULL); \
+ __cs = (cs); \
+ __cs_num = __cs->current.cdw; \
+ __cs_num_initial = __cs_num; \
+ __cs_buf = __cs->current.buf; \
+} while (0)
+
+#define radeon_end() do { \
+ __cs->current.cdw = __cs_num; \
+ assert(__cs->current.cdw <= __cs->current.max_dw); \
+ __cs = NULL; \
+} while (0)
+
+#define radeon_emit(cs, value) __cs_buf[__cs_num++] = (value)
+#define radeon_packets_added() (__cs_num != __cs_num_initial)
+
+#define radeon_end_update_context_roll(sctx) do { \
+ radeon_end(); \
+ if (radeon_packets_added()) \
+ (sctx)->context_roll = true; \
+} while (0)
+
+#define radeon_emit_array(cs, values, num) do { \
+ unsigned __n = (num); \
+ memcpy(__cs_buf + __cs_num, (values), __n * 4); \
+ __cs_num += __n; \
+} while (0)
+
+#define radeon_set_config_reg_seq(cs, reg, num) do { \
+ SI_CHECK_SHADOWED_REGS(reg, num); \
+ assert((reg) < SI_CONTEXT_REG_OFFSET); \
+ radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0)); \
+ radeon_emit(cs, ((reg) - SI_CONFIG_REG_OFFSET) >> 2); \
+} while (0)
+
+#define radeon_set_config_reg(cs, reg, value) do { \
+ radeon_set_config_reg_seq(cs, reg, 1); \
+ radeon_emit(cs, value); \
+} while (0)
+
+#define radeon_set_context_reg_seq(cs, reg, num) do { \
+ SI_CHECK_SHADOWED_REGS(reg, num); \
+ assert((reg) >= SI_CONTEXT_REG_OFFSET); \
+ radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0)); \
+ radeon_emit(cs, ((reg) - SI_CONTEXT_REG_OFFSET) >> 2); \
+} while (0)
+
+#define radeon_set_context_reg(cs, reg, value) do { \
+ radeon_set_context_reg_seq(cs, reg, 1); \
+ radeon_emit(cs, value); \
+} while (0)
+
+#define radeon_set_context_reg_seq_array(cs, reg, num, values) do { \
+ radeon_set_context_reg_seq(cs, reg, num); \
+ radeon_emit_array(cs, values, num); \
+} while (0)
+
+#define radeon_set_context_reg_idx(cs, reg, idx, value) do { \
+ SI_CHECK_SHADOWED_REGS(reg, 1); \
+ assert((reg) >= SI_CONTEXT_REG_OFFSET); \
+ radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); \
+ radeon_emit(cs, ((reg) - SI_CONTEXT_REG_OFFSET) >> 2 | ((idx) << 28)); \
+ radeon_emit(cs, value); \
+} while (0)
+
+#define radeon_set_sh_reg_seq(cs, reg, num) do { \
+ SI_CHECK_SHADOWED_REGS(reg, num); \
+ assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \
+ radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0)); \
+ radeon_emit(cs, ((reg) - SI_SH_REG_OFFSET) >> 2); \
+} while (0)
+
+#define radeon_set_sh_reg(cs, reg, value) do { \
+ radeon_set_sh_reg_seq(cs, reg, 1); \
+ radeon_emit(cs, value); \
+} while (0)
+
+#define radeon_set_uconfig_reg_seq(cs, reg, num, perfctr) do { \
+ SI_CHECK_SHADOWED_REGS(reg, num); \
+ assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \
+ radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \
+ radeon_emit(cs, ((reg) - CIK_UCONFIG_REG_OFFSET) >> 2); \
+} while (0)
+
+#define radeon_set_uconfig_reg(cs, reg, value) do { \
+ radeon_set_uconfig_reg_seq(cs, reg, 1, false); \
+ radeon_emit(cs, value); \
+} while (0)
+
+#define radeon_set_uconfig_reg_perfctr(cs, reg, value) do { \
+ radeon_set_uconfig_reg_seq(cs, reg, 1, true); \
+ radeon_emit(cs, value); \
+} while (0)
+
+#define radeon_set_uconfig_reg_idx(cs, screen, chip_class, reg, idx, value) do { \
+ SI_CHECK_SHADOWED_REGS(reg, 1); \
+ assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \
+ assert((idx) != 0); \
+ unsigned __opcode = PKT3_SET_UCONFIG_REG_INDEX; \
+ if ((chip_class) < GFX9 || \
+ ((chip_class) == GFX9 && (screen)->info.me_fw_version < 26)) \
+ __opcode = PKT3_SET_UCONFIG_REG; \
+ radeon_emit(cs, PKT3(__opcode, 1, 0)); \
+ radeon_emit(cs, ((reg) - CIK_UCONFIG_REG_OFFSET) >> 2 | ((idx) << 28)); \
+ radeon_emit(cs, value); \
+} while (0)
+
+#define radeon_set_context_reg_rmw(cs, reg, value, mask) do { \
+ SI_CHECK_SHADOWED_REGS(reg, 1); \
+ assert((reg) >= SI_CONTEXT_REG_OFFSET); \
+ radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0)); \
+ radeon_emit(cs, ((reg) - SI_CONTEXT_REG_OFFSET) >> 2); \
+ radeon_emit(cs, mask); \
+ radeon_emit(cs, value); \
+} while (0)
/* Emit PKT3_CONTEXT_REG_RMW if the register value is different. */
-static inline void radeon_opt_set_context_reg_rmw(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value,
- unsigned mask)
-{
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
- assert((value & ~mask) == 0);
- value &= mask;
-
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
- sctx->tracked_regs.reg_value[reg] != value) {
- radeon_set_context_reg_rmw(cs, offset, value, mask);
-
- sctx->tracked_regs.reg_saved |= 0x1ull << reg;
- sctx->tracked_regs.reg_value[reg] = value;
- }
-}
+#define radeon_opt_set_context_reg_rmw(sctx, offset, reg, val, mask) do { \
+ unsigned __value = (val); \
+ assert((__value & ~mask) == 0); \
+ __value &= mask; \
+ if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \
+ sctx->tracked_regs.reg_value[reg] != __value) { \
+ radeon_set_context_reg_rmw(&sctx->gfx_cs, offset, __value, mask); \
+ sctx->tracked_regs.reg_saved |= 0x1ull << (reg); \
+ sctx->tracked_regs.reg_value[reg] = __value; \
+ } \
+} while (0)
/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */
-static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value)
-{
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
- sctx->tracked_regs.reg_value[reg] != value) {
- radeon_set_context_reg(cs, offset, value);
-
- sctx->tracked_regs.reg_saved |= 0x1ull << reg;
- sctx->tracked_regs.reg_value[reg] = value;
- }
-}
+#define radeon_opt_set_context_reg(sctx, offset, reg, val) do { \
+ unsigned __value = val; \
+ if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x1) != 0x1 || \
+ sctx->tracked_regs.reg_value[reg] != __value) { \
+ radeon_set_context_reg(&sctx->gfx_cs, offset, __value); \
+ sctx->tracked_regs.reg_saved |= 0x1ull << (reg); \
+ sctx->tracked_regs.reg_value[reg] = __value; \
+ } \
+} while (0)
/**
* Set 2 consecutive registers if any registers value is different.
* @param offset starting register offset
- * @param value1 is written to first register
- * @param value2 is written to second register
+ * @param val1 is written to first register
+ * @param val2 is written to second register
*/
-static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value1,
- unsigned value2)
-{
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 ||
- sctx->tracked_regs.reg_value[reg] != value1 ||
- sctx->tracked_regs.reg_value[reg + 1] != value2) {
- radeon_set_context_reg_seq(cs, offset, 2);
- radeon_emit(cs, value1);
- radeon_emit(cs, value2);
-
- sctx->tracked_regs.reg_value[reg] = value1;
- sctx->tracked_regs.reg_value[reg + 1] = value2;
- sctx->tracked_regs.reg_saved |= 0x3ull << reg;
- }
-}
+#define radeon_opt_set_context_reg2(sctx, offset, reg, val1, val2) do { \
+ unsigned __value1 = (val1), __value2 = (val2); \
+ if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x3) != 0x3 || \
+ sctx->tracked_regs.reg_value[reg] != __value1 || \
+ sctx->tracked_regs.reg_value[(reg) + 1] != __value2) { \
+ radeon_set_context_reg_seq(&sctx->gfx_cs, offset, 2); \
+ radeon_emit(cs, __value1); \
+ radeon_emit(cs, __value2); \
+ sctx->tracked_regs.reg_value[reg] = __value1; \
+ sctx->tracked_regs.reg_value[(reg) + 1] = __value2; \
+ sctx->tracked_regs.reg_saved |= 0x3ull << (reg); \
+ } \
+} while (0)
/**
* Set 3 consecutive registers if any registers value is different.
*/
-static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value1,
- unsigned value2, unsigned value3)
-{
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
- if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 ||
- sctx->tracked_regs.reg_value[reg] != value1 ||
- sctx->tracked_regs.reg_value[reg + 1] != value2 ||
- sctx->tracked_regs.reg_value[reg + 2] != value3) {
- radeon_set_context_reg_seq(cs, offset, 3);
- radeon_emit(cs, value1);
- radeon_emit(cs, value2);
- radeon_emit(cs, value3);
-
- sctx->tracked_regs.reg_value[reg] = value1;
- sctx->tracked_regs.reg_value[reg + 1] = value2;
- sctx->tracked_regs.reg_value[reg + 2] = value3;
- sctx->tracked_regs.reg_saved |= 0x7ull << reg;
- }
-}
+#define radeon_opt_set_context_reg3(sctx, offset, reg, val1, val2, val3) do { \
+ unsigned __value1 = (val1), __value2 = (val2), __value3 = (val3); \
+ if (((sctx->tracked_regs.reg_saved >> (reg)) & 0x7) != 0x7 || \
+ sctx->tracked_regs.reg_value[reg] != __value1 || \
+ sctx->tracked_regs.reg_value[(reg) + 1] != __value2 || \
+ sctx->tracked_regs.reg_value[(reg) + 2] != __value3) { \
+ radeon_set_context_reg_seq(&sctx->gfx_cs, offset, 3); \
+ radeon_emit(cs, __value1); \
+ radeon_emit(cs, __value2); \
+ radeon_emit(cs, __value3); \
+ sctx->tracked_regs.reg_value[reg] = __value1; \
+ sctx->tracked_regs.reg_value[(reg) + 1] = __value2; \
+ sctx->tracked_regs.reg_value[(reg) + 2] = __value3; \
+ sctx->tracked_regs.reg_saved |= 0x7ull << (reg); \
+ } \
+} while (0)
/**
* Set 4 consecutive registers if any registers value is different.
*/
-static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset,
- enum si_tracked_reg reg, unsigned value1,
- unsigned value2, unsigned value3, unsigned value4)
-{
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
- if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf ||
- sctx->tracked_regs.reg_value[reg] != value1 ||
- sctx->tracked_regs.reg_value[reg + 1] != value2 ||
- sctx->tracked_regs.reg_value[reg + 2] != value3 ||
- sctx->tracked_regs.reg_value[reg + 3] != value4) {
- radeon_set_context_reg_seq(cs, offset, 4);
- radeon_emit(cs, value1);
- radeon_emit(cs, value2);
- radeon_emit(cs, value3);
- radeon_emit(cs, value4);
-
- sctx->tracked_regs.reg_value[reg] = value1;
- sctx->tracked_regs.reg_value[reg + 1] = value2;
- sctx->tracked_regs.reg_value[reg + 2] = value3;
- sctx->tracked_regs.reg_value[reg + 3] = value4;
- sctx->tracked_regs.reg_saved |= 0xfull << reg;
- }
-}
+#define radeon_opt_set_context_reg4(sctx, offset, reg, val1, val2, val3, val4) do { \
+ unsigned __value1 = (val1), __value2 = (val2), __value3 = (val3), __value4 = (val4); \
+ if (((sctx->tracked_regs.reg_saved >> (reg)) & 0xf) != 0xf || \
+ sctx->tracked_regs.reg_value[reg] != __value1 || \
+ sctx->tracked_regs.reg_value[(reg) + 1] != __value2 || \
+ sctx->tracked_regs.reg_value[(reg) + 2] != __value3 || \
+ sctx->tracked_regs.reg_value[(reg) + 3] != __value4) { \
+ radeon_set_context_reg_seq(&sctx->gfx_cs, offset, 4); \
+ radeon_emit(cs, __value1); \
+ radeon_emit(cs, __value2); \
+ radeon_emit(cs, __value3); \
+ radeon_emit(cs, __value4); \
+ sctx->tracked_regs.reg_value[reg] = __value1; \
+ sctx->tracked_regs.reg_value[(reg) + 1] = __value2; \
+ sctx->tracked_regs.reg_value[(reg) + 2] = __value3; \
+ sctx->tracked_regs.reg_value[(reg) + 3] = __value4; \
+ sctx->tracked_regs.reg_saved |= 0xfull << (reg); \
+ } \
+} while (0)
/**
* Set consecutive registers if any registers value is different.
*/
-static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset,
- unsigned *value, unsigned *saved_val, unsigned num)
-{
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
- for (unsigned i = 0; i < num; i++) {
- if (saved_val[i] != value[i]) {
- radeon_set_context_reg_seq(cs, offset, num);
- for (unsigned j = 0; j < num; j++)
- radeon_emit(cs, value[j]);
-
- memcpy(saved_val, value, sizeof(uint32_t) * num);
- break;
- }
- }
-}
+#define radeon_opt_set_context_regn(sctx, offset, value, saved_val, num) do { \
+ for (unsigned i = 0; i < (num); i++) { \
+ if ((saved_val)[i] != (value)[i]) { \
+ radeon_set_context_reg_seq(&(sctx)->gfx_cs, offset, num); \
+ for (unsigned j = 0; j < (num); j++) \
+ radeon_emit(cs, value[j]); \
+ memcpy(saved_val, value, sizeof(uint32_t) * (num)); \
+ break; \
+ } \
+ } \
+} while (0)
+
+#define radeon_set_privileged_config_reg(cs, reg, value) do { \
+ assert((reg) < CIK_UCONFIG_REG_OFFSET); \
+ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); \
+ radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | \
+ COPY_DATA_DST_SEL(COPY_DATA_PERF)); \
+ radeon_emit(cs, value); \
+ radeon_emit(cs, 0); /* unused */ \
+ radeon_emit(cs, (reg) >> 2); \
+ radeon_emit(cs, 0); /* unused */ \
+} while (0)
+
+#define radeon_emit_32bit_pointer(sscreen, cs, va) do { \
+ radeon_emit(cs, va); \
+ assert((va) == 0 || ((va) >> 32) == sscreen->info.address32_hi); \
+} while (0)
+
+#define radeon_emit_one_32bit_pointer(sctx, desc, sh_base) do { \
+ unsigned sh_offset = (sh_base) + (desc)->shader_userdata_offset; \
+ radeon_set_sh_reg_seq(&sctx->gfx_cs, sh_offset, 1); \
+ radeon_emit_32bit_pointer(sctx->screen, cs, (desc)->gpu_address); \
+} while (0)
/* This should be evaluated at compile time if all parameters are constants. */
static ALWAYS_INLINE unsigned
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index c2b0c24887f..008972e27f3 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -349,6 +349,7 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf
{
uint64_t bc_va = sctx->border_color_buffer->gpu_address;
+ radeon_begin(cs);
radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1,
* renamed COMPUTE_DESTINATION_EN_SEn on gfx10. */
@@ -404,6 +405,7 @@ void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf
radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
radeon_set_sh_reg(cs, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
}
+ radeon_end();
}
static bool si_setup_compute_scratch_buffer(struct si_context *sctx, struct si_shader *shader,
@@ -505,6 +507,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, shader->bo, RADEON_USAGE_READ,
RADEON_PRIO_SHADER_BINARY);
+ radeon_begin(cs);
radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
radeon_emit(cs, shader_va >> 8);
radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
@@ -524,6 +527,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, struct si_compute
radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE,
S_00B860_WAVES(sctx->scratch_waves) |
S_00B860_WAVESIZE(sctx->max_seen_compute_scratch_bytes_per_wave >> 10));
+ radeon_end();
sctx->cs_shader_state.emitted_program = program;
sctx->cs_shader_state.offset = offset;
@@ -562,11 +566,13 @@ static void setup_scratch_rsrc_user_sgprs(struct si_context *sctx,
}
}
+ radeon_begin(cs);
radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + (user_sgpr * 4), 4);
radeon_emit(cs, scratch_dword0);
radeon_emit(cs, scratch_dword1);
radeon_emit(cs, scratch_dword2);
radeon_emit(cs, scratch_dword3);
+ radeon_end();
}
static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_code_t *code_object,
@@ -589,6 +595,8 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_
user_sgpr += 4;
}
+ radeon_begin(cs);
+
if (AMD_HSA_BITS_GET(code_object->code_properties, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR)) {
struct dispatch_packet dispatch;
unsigned dispatch_offset;
@@ -646,6 +654,7 @@ static void si_setup_user_sgprs_co_v2(struct si_context *sctx, const amd_kernel_
user_sgpr += 1;
}
}
+ radeon_end();
}
static bool si_upload_compute_input(struct si_context *sctx, const amd_kernel_code_t *code_object,
@@ -693,13 +702,18 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
12 * sel->info.uses_grid_size;
unsigned cs_user_data_reg = block_size_reg + 12 * program->sel.info.uses_variable_block_size;
+ radeon_begin(cs);
+
if (sel->info.uses_grid_size) {
if (info->indirect) {
+ radeon_end();
+
for (unsigned i = 0; i < 3; ++i) {
si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL, (grid_size_reg >> 2) + i,
COPY_DATA_SRC_MEM, si_resource(info->indirect),
info->indirect_offset + 4 * i);
}
+ radeon_begin_again(cs);
} else {
radeon_set_sh_reg_seq(cs, grid_size_reg, 3);
radeon_emit(cs, info->grid[0]);
@@ -719,6 +733,7 @@ static void si_setup_nir_user_data(struct si_context *sctx, const struct pipe_gr
radeon_set_sh_reg_seq(cs, cs_user_data_reg, sel->info.base.cs.user_data_components_amd);
radeon_emit_array(cs, sctx->cs_user_data, sel->info.base.cs.user_data_components_amd);
}
+ radeon_end();
}
static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_grid_info *info)
@@ -734,6 +749,7 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
if (sctx->chip_class >= GFX10 && waves_per_threadgroup == 1)
threadgroups_per_cu = 2;
+ radeon_begin(cs);
radeon_set_sh_reg(
cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
ac_get_compute_resource_limits(&sscreen->info, waves_per_threadgroup,
@@ -795,9 +811,10 @@ static void si_emit_dispatch_packets(struct si_context *sctx, const struct pipe_
}
if (unlikely(sctx->thread_trace_enabled && sctx->chip_class >= GFX9)) {
- radeon_emit(&sctx->gfx_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
- radeon_emit(&sctx->gfx_cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
+ radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+ radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
}
+ radeon_end();
}
static void si_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *info)
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index 4c94f2c53e3..bad93320496 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -1084,8 +1084,10 @@ si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe
*/
if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) &&
gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) {
+ radeon_begin(gfx_cs);
radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
radeon_emit(gfx_cs, 0);
+ radeon_end();
}
si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
@@ -1184,6 +1186,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
* TTM buffer moves in the kernel.
*/
if (sctx->chip_class >= GFX10) {
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
radeon_emit(cs, 0); /* CP_COHER_CNTL */
radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
@@ -1195,6 +1198,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) |
S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) |
S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD));
+ radeon_end();
} else {
si_emit_surface_sync(sctx, cs,
S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) |
@@ -1211,6 +1215,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
si_emit_initial_compute_regs(sctx, cs);
+ radeon_begin(cs);
radeon_set_sh_reg(
cs, R_00B860_COMPUTE_TMPRING_SIZE,
S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */
@@ -1231,6 +1236,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
radeon_emit(cs, 0);
radeon_emit(cs, S_03107C_ENABLE(0));
}
+ radeon_end();
if (sctx->last_ib_barrier_buf) {
assert(!sctx->last_ib_barrier_fence);
@@ -1349,6 +1355,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
* in parallel with compute shaders.
*/
if (first_dispatch) {
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size / 4, 0));
radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1));
radeon_emit(cs, gds_offset);
@@ -1356,6 +1363,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
radeon_emit(cs, 0); /* value to write */
if (gds_size == 8)
radeon_emit(cs, 0);
+ radeon_end();
}
}
@@ -1370,6 +1378,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
assert(shader->config.scratch_bytes_per_wave == 0);
assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4);
+ radeon_begin(cs);
radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2);
radeon_emit(cs, shader_va >> 8);
radeon_emit(cs, S_00B834_DATA(shader_va >> 40));
@@ -1390,6 +1399,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG,
MAX_WAVES_PER_SH, THREADGROUPS_PER_CU));
+ radeon_end();
sctx->compute_ib_last_shader = shader;
}
@@ -1417,8 +1427,10 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4;
if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) {
+ radeon_begin(gfx_cs);
radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0));
radeon_emit(gfx_cs, 0);
+ radeon_end();
si_cp_wait_mem(
sctx, gfx_cs,
@@ -1430,8 +1442,10 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
*/
sctx->ws->cs_check_space(gfx_cs, 0, true);
} else {
+ radeon_begin(gfx_cs);
radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0));
radeon_emit(gfx_cs, 0);
+ radeon_end();
}
}
@@ -1441,12 +1455,16 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
uint64_t index_va = out_indexbuf_va + start_prim * 12;
/* Emit the draw packet into the gfx IB. */
+ radeon_begin(gfx_cs);
radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0));
radeon_emit(gfx_cs, num_prims * vertices_per_prim);
radeon_emit(gfx_cs, index_va);
radeon_emit(gfx_cs, index_va >> 32);
radeon_emit(gfx_cs, 0);
radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA);
+ radeon_end();
+
+ radeon_begin_again(cs);
/* Continue with the compute IB. */
if (start_prim == 0) {
@@ -1503,6 +1521,7 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) |
S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) |
S_00B800_ORDER_MODE(0 /* launch in order */));
+ radeon_end();
/* This is only for unordered append. Ordered append writes this from
* the shader.
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 7945143f916..5cd30e50b60 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -24,6 +24,7 @@
#include "si_pipe.h"
#include "sid.h"
+#include "si_build_pm4.h"
/* Set this if you want the ME to wait until CP DMA is done.
* It should be set on the last CP DMA packet. */
@@ -102,6 +103,8 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | S_500_SRC_CACHE_POLICY(cache_policy == L2_STREAM);
}
+ radeon_begin(cs);
+
if (sctx->chip_class >= GFX7) {
radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
radeon_emit(cs, header);
@@ -130,6 +133,7 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
}
+ radeon_end();
}
void si_cp_dma_wait_for_idle(struct si_context *sctx, struct radeon_cmdbuf *cs)
@@ -428,6 +432,7 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
}
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
radeon_emit(cs, header);
radeon_emit(cs, address); /* SRC_ADDR_LO [31:0] */
@@ -435,6 +440,7 @@ void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf,
radeon_emit(cs, address); /* DST_ADDR_LO [31:0] */
radeon_emit(cs, address >> 32); /* DST_ADDR_HI [31:0] */
radeon_emit(cs, command);
+ radeon_end();
}
void si_test_gds(struct si_context *sctx)
@@ -495,11 +501,13 @@ void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned
radeon_add_to_buffer_list(sctx, cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
uint64_t va = buf->gpu_address + offset;
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + size / 4, 0));
radeon_emit(cs, S_370_DST_SEL(dst_sel) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
radeon_emit_array(cs, (const uint32_t *)data, size / 4);
+ radeon_end();
}
void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned dst_sel,
@@ -517,10 +525,12 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned
uint64_t dst_va = (dst ? dst->gpu_address : 0ull) + dst_offset;
uint64_t src_va = (src ? src->gpu_address : 0ull) + src_offset;
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs, COPY_DATA_SRC_SEL(src_sel) | COPY_DATA_DST_SEL(dst_sel) | COPY_DATA_WR_CONFIRM);
radeon_emit(cs, src_va);
radeon_emit(cs, src_va >> 32);
radeon_emit(cs, dst_va);
radeon_emit(cs, dst_va >> 32);
+ radeon_end();
}
diff --git a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c
index d48fb14278b..ad9341a83bb 100644
--- a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c
+++ b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c
@@ -144,6 +144,15 @@ si_create_shadowing_ib_preamble(struct si_context *sctx)
return pm4;
}
+static void si_set_context_reg_array(struct radeon_cmdbuf *cs, unsigned reg, unsigned num,
+ const uint32_t *values)
+{
+ radeon_begin(cs);
+ radeon_set_context_reg_seq(cs, reg, num);
+ radeon_emit_array(cs, values, num);
+ radeon_end();
+}
+
void si_init_cp_reg_shadowing(struct si_context *sctx)
{
if (sctx->screen->info.mid_command_buffer_preemption_enabled ||
@@ -174,8 +183,7 @@ void si_init_cp_reg_shadowing(struct si_context *sctx)
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->shadowed_regs,
RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
si_pm4_emit(sctx, shadowing_preamble);
- ac_emulate_clear_state(&sctx->screen->info, &sctx->gfx_cs,
- radeon_set_context_reg_seq_array);
+ ac_emulate_clear_state(&sctx->screen->info, &sctx->gfx_cs, si_set_context_reg_array);
si_pm4_emit(sctx, sctx->cs_preamble_state);
/* The register values are shadowed, so we won't need to set them again. */
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 75cfc1c8662..fdb0333c3a1 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1930,82 +1930,59 @@ void si_shader_change_notify(struct si_context *sctx)
PIPE_SHADER_TESS_EVAL));
}
-static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs, unsigned sh_offset,
- unsigned pointer_count)
-{
- SI_CHECK_SHADOWED_REGS(sh_offset, pointer_count);
- radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count, 0));
- radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2);
-}
-
-static void si_emit_shader_pointer_body(struct si_screen *sscreen, struct radeon_cmdbuf *cs,
- uint64_t va)
-{
- radeon_emit(cs, va);
-
- assert(va == 0 || (va >> 32) == sscreen->info.address32_hi);
-}
-
-static void si_emit_shader_pointer(struct si_context *sctx, struct si_descriptors *desc,
- unsigned sh_base)
-{
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
- unsigned sh_offset = sh_base + desc->shader_userdata_offset;
-
- si_emit_shader_pointer_head(cs, sh_offset, 1);
- si_emit_shader_pointer_body(sctx->screen, cs, desc->gpu_address);
-}
-
-static void si_emit_consecutive_shader_pointers(struct si_context *sctx, unsigned pointer_mask,
- unsigned sh_base)
-{
- if (!sh_base)
- return;
-
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
- unsigned mask = sctx->shader_pointers_dirty & pointer_mask;
-
- while (mask) {
- int start, count;
- u_bit_scan_consecutive_range(&mask, &start, &count);
-
- struct si_descriptors *descs = &sctx->descriptors[start];
- unsigned sh_offset = sh_base + descs->shader_userdata_offset;
-
- si_emit_shader_pointer_head(cs, sh_offset, count);
- for (int i = 0; i < count; i++)
- si_emit_shader_pointer_body(sctx->screen, cs, descs[i].gpu_address);
- }
-}
+#define si_emit_consecutive_shader_pointers(sctx, pointer_mask, sh_base) do { \
+ unsigned sh_reg_base = (sh_base); \
+ if (sh_reg_base) { \
+ unsigned mask = sctx->shader_pointers_dirty & (pointer_mask); \
+ \
+ while (mask) { \
+ int start, count; \
+ u_bit_scan_consecutive_range(&mask, &start, &count); \
+ \
+ struct si_descriptors *descs = &sctx->descriptors[start]; \
+ unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \
+ \
+ radeon_set_sh_reg_seq(&sctx->gfx_cs, sh_offset, count); \
+ for (int i = 0; i < count; i++) \
+ radeon_emit_32bit_pointer(sctx->screen, cs, descs[i].gpu_address); \
+ } \
+ } \
+} while (0)
static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_descriptors *descs)
{
+ radeon_begin(&sctx->gfx_cs);
+
if (sctx->chip_class >= GFX10) {
- si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
/* HW VS stage only used in non-NGG mode. */
- si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
- si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
- si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+ radeon_end();
return;
} else if (sctx->chip_class == GFX9 && sctx->shadowed_regs) {
/* We can't use the COMMON registers with register shadowing. */
- si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
- si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
- si_emit_shader_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0);
- si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_LS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_LS_0);
+ radeon_end();
return;
} else if (sctx->chip_class == GFX9) {
/* Broadcast it to all shader stages. */
- si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_COMMON_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_COMMON_0);
+ radeon_end();
return;
}
- si_emit_shader_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
- si_emit_shader_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
- si_emit_shader_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0);
- si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
- si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
- si_emit_shader_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_LS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
+ radeon_emit_one_32bit_pointer(sctx, descs, R_00B530_SPI_SHADER_USER_DATA_LS_0);
+ radeon_end();
}
void si_emit_graphics_shader_pointers(struct si_context *sctx)
@@ -2016,6 +1993,7 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
si_emit_global_shader_pointers(sctx, &sctx->descriptors[SI_DESCS_RW_BUFFERS]);
}
+ radeon_begin(&sctx->gfx_cs);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(VERTEX),
sh_base[PIPE_SHADER_VERTEX]);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(TESS_EVAL),
@@ -2030,8 +2008,6 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
sctx->shader_pointers_dirty &= ~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE);
if (sctx->vertex_buffer_pointer_dirty && sctx->num_vertex_elements) {
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
-
/* Find the location of the VB descriptor pointer. */
unsigned sh_dw_offset = SI_VS_NUM_USER_SGPR;
if (sctx->chip_class >= GFX9) {
@@ -2042,22 +2018,22 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
}
unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + sh_dw_offset * 4;
- si_emit_shader_pointer_head(cs, sh_offset, 1);
- si_emit_shader_pointer_body(
+ radeon_set_sh_reg_seq(cs, sh_offset, 1);
+ radeon_emit_32bit_pointer(
sctx->screen, cs, sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset);
sctx->vertex_buffer_pointer_dirty = false;
}
if (sctx->vertex_buffer_user_sgprs_dirty && sctx->num_vertex_elements &&
sctx->screen->num_vbos_in_user_sgprs) {
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
unsigned num_desc = MIN2(sctx->num_vertex_elements, sctx->screen->num_vbos_in_user_sgprs);
unsigned sh_offset = sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4;
- si_emit_shader_pointer_head(cs, sh_offset, num_desc * 4);
+ radeon_set_sh_reg_seq(cs, sh_offset, num_desc * 4);
radeon_emit_array(cs, sctx->vb_descriptor_user_sgprs, num_desc * 4);
sctx->vertex_buffer_user_sgprs_dirty = false;
}
+ radeon_end();
if (sctx->graphics_bindless_pointer_dirty) {
si_emit_global_shader_pointers(sctx, &sctx->bindless_descriptors);
@@ -2071,12 +2047,13 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
struct si_shader_selector *shader = &sctx->cs_shader_state.program->sel;
unsigned base = R_00B900_COMPUTE_USER_DATA_0;
+ radeon_begin(cs);
si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE),
R_00B900_COMPUTE_USER_DATA_0);
sctx->shader_pointers_dirty &= ~SI_DESCS_SHADER_MASK(COMPUTE);
if (sctx->compute_bindless_pointer_dirty) {
- si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base);
+ radeon_emit_one_32bit_pointer(sctx, &sctx->bindless_descriptors, base);
sctx->compute_bindless_pointer_dirty = false;
}
@@ -2085,9 +2062,9 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
if (num_shaderbufs && sctx->compute_shaderbuf_sgprs_dirty) {
struct si_descriptors *desc = si_const_and_shader_buffer_descriptors(sctx, PIPE_SHADER_COMPUTE);
- si_emit_shader_pointer_head(cs, R_00B900_COMPUTE_USER_DATA_0 +
- shader->cs_shaderbufs_sgpr_index * 4,
- num_shaderbufs * 4);
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ shader->cs_shaderbufs_sgpr_index * 4,
+ num_shaderbufs * 4);
for (unsigned i = 0; i < num_shaderbufs; i++)
radeon_emit_array(cs, &desc->list[si_get_shaderbuf_slot(i) * 4], 4);
@@ -2100,9 +2077,9 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
if (num_images && sctx->compute_image_sgprs_dirty) {
struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, PIPE_SHADER_COMPUTE);
- si_emit_shader_pointer_head(cs, R_00B900_COMPUTE_USER_DATA_0 +
- shader->cs_images_sgpr_index * 4,
- shader->cs_images_num_sgprs);
+ radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 +
+ shader->cs_images_sgpr_index * 4,
+ shader->cs_images_num_sgprs);
for (unsigned i = 0; i < num_images; i++) {
unsigned desc_offset = si_get_image_slot(i) * 8;
@@ -2119,6 +2096,7 @@ void si_emit_compute_shader_pointers(struct si_context *sctx)
sctx->compute_image_sgprs_dirty = false;
}
+ radeon_end();
}
/* BINDLESS */
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index ab48a71729e..4159364265d 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -75,6 +75,8 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne
unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel);
bool compute_ib = !ctx->has_graphics || cs == &ctx->prim_discard_compute_cs;
+ radeon_begin(cs);
+
if (ctx->chip_class >= GFX9 || (compute_ib && ctx->chip_class >= GFX7)) {
/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
* counters) must immediately precede every timestamp event to
@@ -136,6 +138,8 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne
radeon_emit(cs, 0); /* unused */
}
+ radeon_end();
+
if (buf) {
radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
}
@@ -154,6 +158,7 @@ unsigned si_cp_write_fence_dwords(struct si_screen *screen)
void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t va, uint32_t ref,
uint32_t mask, unsigned flags)
{
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
radeon_emit(cs, WAIT_REG_MEM_MEM_SPACE(1) | flags);
radeon_emit(cs, va);
@@ -161,6 +166,7 @@ void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, uint64_t v
radeon_emit(cs, ref); /* reference value */
radeon_emit(cs, mask); /* mask */
radeon_emit(cs, 4); /* poll interval */
+ radeon_end();
}
static void si_add_fence_dependency(struct si_context *sctx, struct pipe_fence_handle *fence)
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 16b6a10986c..6d3abb7557c 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -110,8 +110,10 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h
/* Make sure compute shaders are idle before leaving the IB, so that
* the next IB doesn't overwrite GDS that might be in use. */
+ radeon_begin(compute_cs);
radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+ radeon_end();
/* Save the GDS prim restart counter if needed. */
if (ctx->preserve_prim_restart_gds_at_flush) {
@@ -559,6 +561,8 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns
assert(sctx->chip_class <= GFX9);
+ radeon_begin(cs);
+
if (sctx->chip_class == GFX9 || compute_ib) {
/* Flush caches and wait for the caches to assert idle. */
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, 0));
@@ -576,6 +580,7 @@ void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, uns
radeon_emit(cs, 0); /* CP_COHER_BASE */
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
}
+ radeon_end();
/* ACQUIRE_MEM has an implicit context roll if the current context
* is busy. */
@@ -599,6 +604,8 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
/* We don't need these. */
assert(!(flags & (SI_CONTEXT_VGT_STREAMOUT_SYNC | SI_CONTEXT_FLUSH_AND_INV_DB_META)));
+ radeon_begin(cs);
+
if (flags & SI_CONTEXT_VGT_FLUSH) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
@@ -686,6 +693,7 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
ctx->num_cs_flushes++;
ctx->compute_is_busy = false;
}
+ radeon_end();
if (cb_db_event) {
struct si_resource* wait_mem_scratch = unlikely(ctx->ws->cs_is_secure(cs)) ?
@@ -729,6 +737,8 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
si_cp_wait_mem(ctx, cs, va, ctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL);
}
+ radeon_begin_again(cs);
+
/* Ignore fields that only modify the behavior of other fields. */
if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
/* Flush caches and wait for the caches to assert idle.
@@ -757,6 +767,7 @@ void gfx10_emit_cache_flush(struct si_context *ctx, struct radeon_cmdbuf *cs)
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
}
+ radeon_end();
ctx->flags = 0;
}
@@ -820,6 +831,8 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1);
}
+ radeon_begin(cs);
+
if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
/* Flush CMASK/FMASK/DCC. SURFACE_SYNC will wait for idle. */
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
@@ -868,6 +881,8 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_STREAMOUT_SYNC) | EVENT_INDEX(0));
}
+ radeon_end();
+
/* GFX9: Wait for idle if we're flushing CB or DB. ACQUIRE_MEM doesn't
* wait for idle on GFX9. We have to use a TS event.
*/
@@ -934,8 +949,10 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
if (sctx->has_graphics &&
(cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_INV_VCACHE |
SI_CONTEXT_INV_L2 | SI_CONTEXT_WB_L2)))) {
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
radeon_emit(cs, 0);
+ radeon_end();
}
/* GFX6-GFX8 only:
@@ -988,11 +1005,15 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs)
si_prim_discard_signal_next_compute_ib_start(sctx);
if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | EVENT_INDEX(0));
+ radeon_end();
} else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | EVENT_INDEX(0));
+ radeon_end();
}
sctx->flags = 0;
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index 6363368c5a3..6d2868509f6 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -723,16 +723,20 @@ static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
}
+ radeon_begin(cs);
radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
+ radeon_end();
}
static void si_pc_emit_shaders(struct si_context *sctx, unsigned shaders)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+ radeon_begin(cs);
radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2, false);
radeon_emit(cs, shaders & 0x7f);
radeon_emit(cs, 0xffffffff);
+ radeon_end();
}
static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block, unsigned count,
@@ -749,6 +753,8 @@ static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block
if (regs->layout & SI_PC_FAKE)
return;
+ radeon_begin(cs);
+
if (layout_multi == SI_PC_MULTI_BLOCK) {
assert(!(regs->layout & SI_PC_REG_REVERSE));
@@ -826,6 +832,7 @@ static void si_pc_emit_select(struct si_context *sctx, struct si_pc_block *block
radeon_emit(cs, 0);
}
}
+ radeon_end();
}
static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer, uint64_t va)
@@ -835,12 +842,14 @@ static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer
si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_DST_MEM, buffer, va - buffer->gpu_address,
COPY_DATA_IMM, NULL, 1);
+ radeon_begin(cs);
radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
+ radeon_end();
}
/* Note: The buffer was already added in si_pc_emit_start, so we don't have to
@@ -853,6 +862,7 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer,
EOP_DATA_SEL_VALUE_32BIT, buffer, va, 0, SI_NOT_QUERY);
si_cp_wait_mem(sctx, cs, va, 0, 0xffffffff, WAIT_REG_MEM_EQUAL);
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
@@ -860,6 +870,7 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer,
radeon_set_uconfig_reg(
cs, R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1));
+ radeon_end();
}
static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block, unsigned count,
@@ -871,6 +882,8 @@ static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block,
unsigned reg = regs->counter0_lo;
unsigned reg_delta = 8;
+ radeon_begin(cs);
+
if (!(regs->layout & SI_PC_FAKE)) {
if (regs->layout & SI_PC_REG_REVERSE)
reg_delta = -reg_delta;
@@ -901,6 +914,7 @@ static void si_pc_emit_read(struct si_context *sctx, struct si_pc_block *block,
va += sizeof(uint64_t);
}
}
+ radeon_end();
}
static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery)
@@ -921,6 +935,8 @@ static void si_pc_query_destroy(struct si_context *sctx, struct si_query *squery
void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, bool inhibit)
{
+ radeon_begin(&sctx->gfx_cs);
+
if (sctx->chip_class >= GFX10) {
radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL,
S_037390_PERFMON_CLOCK_STATE(inhibit));
@@ -928,6 +944,7 @@ void si_inhibit_clockgating(struct si_context *sctx, struct radeon_cmdbuf *cs, b
radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL,
S_0372FC_PERFMON_CLOCK_STATE(inhibit));
}
+ radeon_end();
}
static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index 2f63fc02105..6918ae5a11f 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -116,7 +116,9 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
}
+ radeon_begin(cs);
radeon_emit_array(cs, state->pm4, state->ndw);
+ radeon_end();
if (state->atom.emit)
state->atom.emit(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index a109501e179..3a3beaba473 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -25,6 +25,7 @@
*/
#include "si_query.h"
+#include "si_build_pm4.h"
#include "amd/common/sid.h"
#include "si_pipe.h"
@@ -771,10 +772,12 @@ static unsigned event_type_for_stream(unsigned stream)
static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va, unsigned stream)
{
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
+ radeon_end();
}
static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_hw *query,
@@ -785,12 +788,15 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h
switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
- case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
+ radeon_end();
break;
+ }
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
@@ -805,12 +811,15 @@ static void si_query_hw_do_emit_start(struct si_context *sctx, struct si_query_h
si_cp_release_mem(sctx, cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
EOP_DATA_SEL_TIMESTAMP, NULL, va, 0, query->b.type);
break;
- case PIPE_QUERY_PIPELINE_STATISTICS:
+ case PIPE_QUERY_PIPELINE_STATISTICS: {
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
+ radeon_end();
break;
+ }
default:
assert(0);
}
@@ -846,15 +855,18 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw
switch (query->b.type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
- case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+ case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
va += 8;
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
+ radeon_end();
fence_va = va + sctx->screen->info.max_render_backends * 16 - 8;
break;
+ }
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_PRIMITIVES_GENERATED:
case PIPE_QUERY_SO_STATISTICS:
@@ -879,10 +891,12 @@ static void si_query_hw_do_emit_stop(struct si_context *sctx, struct si_query_hw
unsigned sample_size = (query->result_size - 8) / 2;
va += sample_size;
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
radeon_emit(cs, va);
radeon_emit(cs, va >> 32);
+ radeon_end();
fence_va = va + sample_size;
break;
@@ -934,6 +948,8 @@ static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf,
{
struct radeon_cmdbuf *cs = &ctx->gfx_cs;
+ radeon_begin(cs);
+
if (ctx->chip_class >= GFX9) {
radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
radeon_emit(cs, op);
@@ -944,6 +960,8 @@ static void emit_set_predicate(struct si_context *ctx, struct si_resource *buf,
radeon_emit(cs, va);
radeon_emit(cs, op | ((va >> 32) & 0xFF));
}
+ radeon_end();
+
radeon_add_to_buffer_list(ctx, &ctx->gfx_cs, buf, RADEON_USAGE_READ, RADEON_PRIO_QUERY);
}
diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c
index 1366430cff8..f5263d5e9fa 100644
--- a/src/gallium/drivers/radeonsi/si_sqtt.c
+++ b/src/gallium/drivers/radeonsi/si_sqtt.c
@@ -35,22 +35,6 @@ static void
si_emit_spi_config_cntl(struct si_context* sctx,
struct radeon_cmdbuf *cs, bool enable);
-static inline void
-radeon_set_privileged_config_reg(struct radeon_cmdbuf *cs,
- unsigned reg,
- unsigned value)
-{
- assert(reg < CIK_UCONFIG_REG_OFFSET);
-
- radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
- radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) |
- COPY_DATA_DST_SEL(COPY_DATA_PERF));
- radeon_emit(cs, value);
- radeon_emit(cs, 0); /* unused */
- radeon_emit(cs, reg >> 2);
- radeon_emit(cs, 0); /* unused */
-}
-
static bool
si_thread_trace_init_bo(struct si_context *sctx)
{
@@ -89,6 +73,8 @@ si_emit_thread_trace_start(struct si_context* sctx,
uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
unsigned max_se = sscreen->info.max_se;
+ radeon_begin(cs);
+
for (unsigned se = 0; se < max_se; se++) {
uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
uint64_t data_va = ac_thread_trace_get_data_va(sctx->thread_trace, va, se);
@@ -220,6 +206,7 @@ si_emit_thread_trace_start(struct si_context* sctx,
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
}
+ radeon_end();
}
static const uint32_t gfx9_thread_trace_info_regs[] =
@@ -258,6 +245,8 @@ si_copy_thread_trace_info_regs(struct si_context* sctx,
uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);
+ radeon_begin(cs);
+
/* Copy back the info struct one DWORD at a time. */
for (unsigned i = 0; i < 3; i++) {
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
@@ -269,6 +258,7 @@ si_copy_thread_trace_info_regs(struct si_context* sctx,
radeon_emit(cs, (info_va + i * 4));
radeon_emit(cs, (info_va + i * 4) >> 32);
}
+ radeon_end();
}
@@ -280,6 +270,8 @@ si_emit_thread_trace_stop(struct si_context *sctx,
{
unsigned max_se = sctx->screen->info.max_se;
+ radeon_begin(cs);
+
/* Stop the thread trace with a different event based on the queue. */
if (queue_family_index == RING_COMPUTE) {
radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
@@ -291,8 +283,11 @@ si_emit_thread_trace_stop(struct si_context *sctx,
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
+ radeon_end();
for (unsigned se = 0; se < max_se; se++) {
+ radeon_begin(cs);
+
/* Target SEi and SH0. */
radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
S_030800_SE_INDEX(se) |
@@ -335,15 +330,18 @@ si_emit_thread_trace_stop(struct si_context *sctx,
radeon_emit(cs, S_030CE8_BUSY(1)); /* mask */
radeon_emit(cs, 4); /* poll interval */
}
+ radeon_end();
si_copy_thread_trace_info_regs(sctx, cs, se);
}
/* Restore global broadcasting. */
+ radeon_begin_again(cs);
radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
S_030800_SE_BROADCAST_WRITES(1) |
S_030800_SH_BROADCAST_WRITES(1) |
S_030800_INSTANCE_BROADCAST_WRITES(1));
+ radeon_end();
}
static void
@@ -351,6 +349,8 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf
{
struct radeon_winsys *ws = sctx->ws;
+ radeon_begin(cs);
+
switch (family) {
case RING_GFX:
radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
@@ -361,7 +361,8 @@ si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf
radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
radeon_emit(cs, 0);
break;
- }
+ }
+ radeon_end();
ws->cs_add_buffer(cs,
sctx->thread_trace->bo,
@@ -390,6 +391,9 @@ static void
si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
struct radeon_winsys *ws = sctx->ws;
+
+ radeon_begin(cs);
+
switch (family) {
case RING_GFX:
radeon_emit(sctx->thread_trace->stop_cs[family], PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
@@ -401,6 +405,8 @@ si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *
radeon_emit(sctx->thread_trace->stop_cs[family], 0);
break;
}
+ radeon_end();
+
ws->cs_add_buffer(cs,
sctx->thread_trace->bo,
RADEON_USAGE_READWRITE,
@@ -643,6 +649,8 @@ si_emit_thread_trace_userdata(struct si_context* sctx,
{
const uint32_t *dwords = (uint32_t *)data;
+ radeon_begin(cs);
+
while (num_dwords > 0) {
uint32_t count = MIN2(num_dwords, 2);
@@ -655,12 +663,15 @@ si_emit_thread_trace_userdata(struct si_context* sctx,
dwords += count;
num_dwords -= count;
}
+ radeon_end();
}
static void
si_emit_spi_config_cntl(struct si_context* sctx,
struct radeon_cmdbuf *cs, bool enable)
{
+ radeon_begin(cs);
+
if (sctx->chip_class >= GFX9) {
uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
S_031100_EXP_PRIORITY_ORDER(3) |
@@ -677,6 +688,7 @@ si_emit_spi_config_cntl(struct si_context* sctx,
S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
S_009100_ENABLE_SQG_BOP_EVENTS(enable));
}
+ radeon_end();
}
void
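si_emit_thread_trace_stop above is the one place that needs radeon_begin_again (the "Restore global broadcasting" hunk): the outer scope already ran radeon_begin/radeon_end once, so a second radeon_begin there would redeclare the hidden locals and fail to compile. A begin-again variant that only reassigns them fits the usage; again a sketch with the same assumed names:

   #define radeon_begin_again(cs) do { \
      __cs = (cs); \
      __cs_num = __cs->current.cdw; \
      __cs_num_initial = __cs_num; \
      __cs_buf = __cs->current.buf; \
   } while (0)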
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 278c9a733f0..2a8e852a5e6 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -91,11 +91,13 @@ static void si_emit_cb_render_state(struct si_context *sctx)
if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) {
sctx->last_cb_target_mask = cb_target_mask;
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
+ radeon_end();
}
- unsigned initial_cdw = cs->current.cdw;
+ radeon_begin(cs);
radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK,
cb_target_mask);
@@ -256,8 +258,7 @@ static void si_emit_cb_render_state(struct si_context *sctx)
radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT,
sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control);
}
- if (initial_cdw != cs->current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
}
/*
@@ -689,8 +690,10 @@ static void si_emit_blend_color(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
radeon_emit_array(cs, (uint32_t *)sctx->blend_color.state.color, 4);
+ radeon_end();
}
/*
@@ -721,8 +724,10 @@ static void si_emit_clip_state(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4);
radeon_emit_array(cs, (uint32_t *)sctx->clip_state.state.ucp, 6 * 4);
+ radeon_end();
}
static void si_emit_clip_regs(struct si_context *sctx)
@@ -747,7 +752,6 @@ static void si_emit_clip_regs(struct si_context *sctx)
clipdist_mask &= rs->clip_plane_enable;
culldist_mask |= clipdist_mask;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((vs_out_mask & 0x0F) != 0) |
S_02881C_VS_OUT_CCDIST1_VEC_ENA((vs_out_mask & 0xF0) != 0) |
S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3 &&
@@ -755,6 +759,8 @@ static void si_emit_clip_regs(struct si_context *sctx)
S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) |
clipdist_mask | (culldist_mask << 8);
+ radeon_begin(&sctx->gfx_cs);
+
if (sctx->chip_class >= GFX10) {
radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl,
@@ -765,9 +771,7 @@ static void si_emit_clip_regs(struct si_context *sctx)
}
radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL,
rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space));
-
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
}
/*
@@ -1048,6 +1052,7 @@ static void si_emit_stencil_ref(struct si_context *sctx)
struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) |
S_028430_STENCILMASK(dsa->valuemask[0]) |
@@ -1056,6 +1061,7 @@ static void si_emit_stencil_ref(struct si_context *sctx)
S_028434_STENCILMASK_BF(dsa->valuemask[1]) |
S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) |
S_028434_STENCILOPVAL_BF(1));
+ radeon_end();
}
static void si_set_stencil_ref(struct pipe_context *ctx, const struct pipe_stencil_ref state)
@@ -1334,7 +1340,6 @@ static void si_emit_db_render_state(struct si_context *sctx)
{
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
unsigned db_shader_control, db_render_control, db_count_control;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
/* DB_RENDER_CONTROL */
if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) {
@@ -1374,6 +1379,7 @@ static void si_emit_db_render_state(struct si_context *sctx)
}
}
+ radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL,
db_render_control, db_count_control);
@@ -1427,9 +1433,7 @@ static void si_emit_db_render_state(struct si_context *sctx)
S_028064_VRS_OVERRIDE_RATE_Y(0));
}
}
-
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
}
/*
@@ -2909,6 +2913,8 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
struct si_surface *cb = NULL;
unsigned cb_color_info = 0;
+ radeon_begin(cs);
+
/* Colorbuffers. */
for (i = 0; i < nr_cbufs; i++) {
uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
@@ -3260,6 +3266,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx)
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
}
+ radeon_end();
si_update_display_dcc_dirty(sctx);
@@ -3292,6 +3299,8 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx)
si_emit_sample_locations(cs, nr_samples);
}
+ radeon_begin(cs);
+
if (sctx->family >= CHIP_POLARIS10) {
unsigned small_prim_filter_cntl =
S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
@@ -3323,6 +3332,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx)
radeon_opt_set_context_reg(
sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
+ radeon_end();
}
static bool si_out_of_order_rasterization(struct si_context *sctx)
@@ -3501,7 +3511,7 @@ static void si_emit_msaa_config(struct si_context *sctx)
}
}
- unsigned initial_cdw = cs->current.cdw;
+ radeon_begin(cs);
/* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL,
@@ -3512,7 +3522,7 @@ static void si_emit_msaa_config(struct si_context *sctx)
radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1,
sc_mode_cntl_1);
- if (initial_cdw != cs->current.cdw) {
+ if (radeon_packets_added()) {
sctx->context_roll = true;
/* GFX9: Flush DFSM when the AA mode changes. */
@@ -3521,6 +3531,7 @@ static void si_emit_msaa_config(struct si_context *sctx)
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
}
}
+ radeon_end();
}
void si_update_ps_iter_samples(struct si_context *sctx)
@@ -4509,9 +4520,11 @@ static void si_emit_sample_mask(struct si_context *sctx)
assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 ||
(mask & 1 && sctx->blitter->running));
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
radeon_emit(cs, mask | (mask << 16));
radeon_emit(cs, mask | (mask << 16));
+ radeon_end();
}
static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
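The si_state.c hunks also show the two replacements for the old initial_cdw bookkeeping: radeon_packets_added() tests whether the region emitted anything, and radeon_end_update_context_roll(sctx) folds that test into the close. Under the same assumed locals as the sketch above:

   #define radeon_packets_added() (__cs_num != __cs_num_initial)

   #define radeon_end_update_context_roll(sctx) do { \
      radeon_end(); \
      if (radeon_packets_added()) \
         (sctx)->context_roll = true; \
   } while (0)

si_emit_msaa_config keeps the open-coded radeon_packets_added() check because it must also emit the GFX9 FLUSH_DFSM event inside the same region when the AA configuration changed.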
diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c
index d3425e68449..f9e4b273317 100644
--- a/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/src/gallium/drivers/radeonsi/si_state_binning.c
@@ -404,7 +404,7 @@ static void gfx10_get_bin_sizes(struct si_context *sctx, unsigned cb_target_enab
static void si_emit_dpbb_disable(struct si_context *sctx)
{
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
+ radeon_begin(&sctx->gfx_cs);
if (sctx->chip_class >= GFX10) {
struct uvec2 bin_size = {};
@@ -441,8 +441,7 @@ static void si_emit_dpbb_disable(struct si_context *sctx)
radeon_opt_set_context_reg(
sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL,
S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
sctx->last_binning_enabled = false;
}
@@ -526,7 +525,7 @@ void si_emit_dpbb_state(struct si_context *sctx)
if (bin_size.y >= 32)
bin_size_extend.y = util_logbase2(bin_size.y) - 5;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
+ radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_reg(
sctx, R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0,
S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.x == 16) |
@@ -546,8 +545,7 @@ void si_emit_dpbb_state(struct si_context *sctx)
radeon_opt_set_context_reg(
sctx, db_dfsm_control, SI_TRACKED_DB_DFSM_CONTROL,
S_028060_PUNCHOUT_MODE(punchout_mode) | S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
sctx->last_binning_enabled = true;
}
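Note that radeon_opt_set_context_reg() and friends only emit when the tracked SI_TRACKED_* value actually changes, which is why the roll flag has to stay conditional on packets being added rather than being set unconditionally at the end of the callback. The conversion itself is mechanical; a hypothetical skeleton (si_emit_example_state, R_EXAMPLE_REG, and SI_TRACKED_EXAMPLE_REG are placeholder names, not real radeonsi identifiers):

   static void si_emit_example_state(struct si_context *sctx)
   {
      unsigned value = 0; /* computed from queued state in a real callback */

      radeon_begin(&sctx->gfx_cs);
      radeon_opt_set_context_reg(sctx, R_EXAMPLE_REG, SI_TRACKED_EXAMPLE_REG, value);
      radeon_end_update_context_roll(sctx);
   }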
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index e2beac6f7fa..d17ab2e7a37 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -399,6 +399,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
assert(ls_current->config.lds_size == 0);
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+ radeon_begin(cs);
if (sctx->chip_class >= GFX9) {
unsigned hs_rsrc2 = ls_current->config.rsrc2;
@@ -443,6 +444,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
radeon_set_sh_reg_seq(cs, tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
radeon_emit(cs, offchip_layout);
radeon_emit(cs, ring_va);
+ radeon_end();
unsigned ls_hs_config =
S_028B58_NUM_PATCHES(*num_patches) |
@@ -450,13 +452,14 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
if (sctx->last_ls_hs_config != ls_hs_config) {
+ radeon_begin(cs);
if (sctx->chip_class >= GFX7) {
radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
} else {
radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
}
+ radeon_end_update_context_roll(sctx);
sctx->last_ls_hs_config = ls_hs_config;
- sctx->context_roll = true;
}
}
@@ -734,7 +737,8 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
enum pipe_prim_type rast_prim = sctx->current_rast_prim;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
- unsigned initial_cdw = cs->current.cdw;
+
+ radeon_begin(cs);
if (unlikely(si_is_line_stipple_enabled(sctx))) {
/* For lines, reset the stipple pattern at each primitive. Otherwise,
@@ -756,8 +760,10 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
sctx->last_gs_out_prim = gs_out_prim;
}
- if (GFX_VERSION == GFX9 && initial_cdw != cs->current.cdw)
- sctx->context_roll = true;
+ if (GFX_VERSION == GFX9)
+ radeon_end_update_context_roll(sctx);
+ else
+ radeon_end();
if (NGG) {
struct si_shader *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->current;
@@ -797,6 +803,7 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
/* For the API vertex shader (VS_STATE_INDEXED, LS_OUT_*). */
unsigned vs_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
PIPE_SHADER_VERTEX);
+ radeon_begin(cs);
radeon_set_sh_reg(cs, vs_base + SI_SGPR_VS_STATE_BITS * 4,
sctx->current_vs_state);
@@ -815,6 +822,7 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
radeon_set_sh_reg(cs, R_00B230_SPI_SHADER_USER_DATA_GS_0 + SI_SGPR_VS_STATE_BITS * 4,
sctx->current_vs_state);
}
+ radeon_end();
sctx->last_vs_state = sctx->current_vs_state;
}
@@ -845,14 +853,18 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx,
/* Draw state. */
if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
+ radeon_begin(cs);
+
if (GFX_VERSION == GFX9)
- radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030960_IA_MULTI_VGT_PARAM, 4,
- ia_multi_vgt_param);
+ radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION,
+ R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
else if (GFX_VERSION >= GFX7)
radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
else
radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
+ radeon_end();
+
sctx->last_multi_vgt_param = ia_multi_vgt_param;
}
}
@@ -897,7 +909,11 @@ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
ge_cntl |= S_03096C_PACKET_TO_ONE_PA(si_is_line_stipple_enabled(sctx));
if (ge_cntl != sctx->last_multi_vgt_param) {
- radeon_set_uconfig_reg(&sctx->gfx_cs, R_03096C_GE_CNTL, ge_cntl);
+ struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+
+ radeon_begin(cs);
+ radeon_set_uconfig_reg(cs, R_03096C_GE_CNTL, ge_cntl);
+ radeon_end();
sctx->last_multi_vgt_param = ge_cntl;
}
}
@@ -919,13 +935,15 @@ static void si_emit_draw_registers(struct si_context *sctx,
(sctx, indirect, prim, num_patches, instance_count, primitive_restart,
min_vertex_count, vertices_per_patch);
+ radeon_begin(cs);
+
if (prim != sctx->last_prim) {
unsigned vgt_prim = si_conv_pipe_prim(prim);
if (GFX_VERSION >= GFX10)
radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);
else if (GFX_VERSION >= GFX7)
- radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
+ radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
else
radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim);
@@ -947,14 +965,17 @@ static void si_emit_draw_registers(struct si_context *sctx,
if (GFX_VERSION == GFX9)
sctx->context_roll = true;
}
+ radeon_end();
}
#define EMIT_SQTT_END_DRAW do { \
if (GFX_VERSION >= GFX9 && unlikely(sctx->thread_trace_enabled)) { \
+ radeon_begin(&sctx->gfx_cs); \
radeon_emit(&sctx->gfx_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); \
radeon_emit(&sctx->gfx_cs, \
EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | \
EVENT_INDEX(0)); \
+ radeon_end(); \
} \
} while (0)
@@ -979,7 +1000,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
if (indirect && indirect->count_from_stream_output) {
struct si_streamout_target *t = (struct si_streamout_target *)indirect->count_from_stream_output;
+ radeon_begin(cs);
radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw);
+ radeon_end();
+
si_cp_copy_data(sctx, &sctx->gfx_cs, COPY_DATA_REG, NULL,
R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2, COPY_DATA_SRC_MEM,
t->buf_filled_size, t->buf_filled_size_offset);
@@ -990,6 +1014,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
uint32_t index_max_size = 0;
uint64_t index_va = 0;
+ radeon_begin(cs);
+
/* draw packet */
if (index_size) {
/* Register shadowing doesn't shadow INDEX_TYPE. */
@@ -1017,7 +1043,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
}
if (GFX_VERSION >= GFX9) {
- radeon_set_uconfig_reg_idx(cs, sctx->screen, R_03090C_VGT_INDEX_TYPE, 2, index_type);
+ radeon_set_uconfig_reg_idx(cs, sctx->screen, GFX_VERSION,
+ R_03090C_VGT_INDEX_TYPE, 2, index_type);
} else {
radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
radeon_emit(cs, index_type);
@@ -1032,8 +1059,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
/* Skip draw calls with 0-sized index buffers.
* They cause a hang on some chips, like Navi10-14.
*/
- if (!index_max_size)
+ if (!index_max_size) {
+ radeon_end();
return;
+ }
index_va = si_resource(indexbuf)->gpu_address + index_offset;
@@ -1173,6 +1202,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
if (index_size) {
if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) {
+ radeon_end();
+
for (unsigned i = 0; i < num_draws; i++) {
uint64_t va = index_va + draws[0].start * original_index_size;
@@ -1238,6 +1269,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
radeon_emit(cs, draws[i].count);
radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
}
+ radeon_end();
+
EMIT_SQTT_END_DRAW;
return;
}
@@ -1265,6 +1298,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
sctx->last_base_vertex = draws[num_draws - 1].start;
}
}
+ radeon_end();
EMIT_SQTT_END_DRAW;
}
@@ -2181,8 +2215,10 @@ void si_trace_emit(struct si_context *sctx)
si_cp_write_data(sctx, sctx->current_saved_cs->trace_buf, 0, 4, V_370_MEM, V_370_ME, &trace_id);
+ radeon_begin(cs);
radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
radeon_emit(cs, AC_ENCODE_TRACE_POINT(trace_id));
+ radeon_end();
if (sctx->log)
u_log_flush(sctx->log);
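si_emit_draw_packets shows the main hazard of the region model: every path that leaves the function, or calls into another helper that emits on its own, must close the region first. The 0-sized index buffer bail-out and the prim-discard dispatch both gained a radeon_end() for exactly that reason. Schematically:

   radeon_begin(cs);
   ...
   if (!index_max_size) {
      radeon_end();   /* close the region on every early exit */
      return;
   }
   ...
   radeon_end();      /* and before handing off to helpers that emit */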
diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c
index 9ebb1e5dcb4..5412a87f0a1 100644
--- a/src/gallium/drivers/radeonsi/si_state_msaa.c
+++ b/src/gallium/drivers/radeonsi/si_state_msaa.c
@@ -150,6 +150,7 @@ static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_cou
static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
uint32_t sample_locs)
{
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
radeon_emit(cs, centroid_priority);
radeon_emit(cs, centroid_priority >> 32);
@@ -157,11 +158,13 @@ static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroi
radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
+ radeon_end();
}
static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
const uint32_t *sample_locs, unsigned num_samples)
{
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
radeon_emit(cs, centroid_priority);
radeon_emit(cs, centroid_priority >> 32);
@@ -171,6 +174,7 @@ static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centro
radeon_emit_array(cs, sample_locs, 4);
radeon_emit_array(cs, sample_locs, 4);
radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4);
+ radeon_end();
}
void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples)
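The MSAA hunks lean on radeon_emit_array() for the sample-location payloads; it batches dwords through the same region cursor. A sketch under the same assumptions as above (the real definition is in si_build_pm4.h, and as with radeon_emit the cs argument is intentionally ignored):

   #include <string.h> /* memcpy */

   #define radeon_emit_array(cs, values, num) do { \
      unsigned __n = (num); \
      memcpy(__cs_buf + __cs_num, (values), __n * 4); \
      __cs_num += __n; \
   } while (0)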
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 3326ad934fe..b2c7cf0e49e 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -566,11 +566,10 @@ static void si_shader_hs(struct si_screen *sscreen, struct si_shader *shader)
static void si_emit_shader_es(struct si_context *sctx)
{
struct si_shader *shader = sctx->queued.named.es->shader;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
-
if (!shader)
return;
+ radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
shader->selector->esgs_itemsize / 4);
@@ -583,9 +582,7 @@ static void si_emit_shader_es(struct si_context *sctx)
radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
shader->vgt_vertex_reuse_block_cntl);
-
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
}
static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
@@ -729,11 +726,11 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *
static void si_emit_shader_gs(struct si_context *sctx)
{
struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
-
if (!shader)
return;
+ radeon_begin(&sctx->gfx_cs);
+
/* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
* R_028A68_VGT_GSVS_RING_OFFSET_3 */
radeon_opt_set_context_reg3(
@@ -782,9 +779,7 @@ static void si_emit_shader_gs(struct si_context *sctx)
SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
shader->vgt_vertex_reuse_block_cntl);
}
-
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
}
static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
@@ -931,6 +926,8 @@ static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value)
sctx->tracked_regs.reg_value[reg] != value) {
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+ radeon_begin(cs);
+
if (sctx->chip_class == GFX10) {
/* SQ_NON_EVENT must be emitted before GE_PC_ALLOC is written. */
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
@@ -938,6 +935,7 @@ static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value)
}
radeon_set_uconfig_reg(cs, R_030980_GE_PC_ALLOC, value);
+ radeon_end();
sctx->tracked_regs.reg_saved |= 0x1ull << reg;
sctx->tracked_regs.reg_value[reg] = value;
@@ -945,9 +943,9 @@ static void gfx10_emit_ge_pc_alloc(struct si_context *sctx, unsigned value)
}
/* Common tail code for NGG primitive shaders. */
-static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader,
- unsigned initial_cdw)
+static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader *shader)
{
+ radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_reg(sctx, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
shader->ctx_reg.ngg.ge_max_output_per_subgroup);
@@ -975,9 +973,7 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
-
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
/* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
gfx10_emit_ge_pc_alloc(sctx, shader->ctx_reg.ngg.ge_pc_alloc);
@@ -986,56 +982,55 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
static void gfx10_emit_shader_ngg_notess_nogs(struct si_context *sctx)
{
struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
-
if (!shader)
return;
- gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+ gfx10_emit_shader_ngg_tail(sctx, shader);
}
static void gfx10_emit_shader_ngg_tess_nogs(struct si_context *sctx)
{
struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
-
if (!shader)
return;
+ radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
shader->vgt_tf_param);
+ radeon_end_update_context_roll(sctx);
- gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+ gfx10_emit_shader_ngg_tail(sctx, shader);
}
static void gfx10_emit_shader_ngg_notess_gs(struct si_context *sctx)
{
struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
-
if (!shader)
return;
+ radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
shader->ctx_reg.ngg.vgt_gs_max_vert_out);
+ radeon_end_update_context_roll(sctx);
- gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+ gfx10_emit_shader_ngg_tail(sctx, shader);
}
static void gfx10_emit_shader_ngg_tess_gs(struct si_context *sctx)
{
struct si_shader *shader = sctx->queued.named.gs->shader;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
if (!shader)
return;
+ radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, SI_TRACKED_VGT_GS_MAX_VERT_OUT,
shader->ctx_reg.ngg.vgt_gs_max_vert_out);
radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM, SI_TRACKED_VGT_TF_PARAM,
shader->vgt_tf_param);
+ radeon_end_update_context_roll(sctx);
- gfx10_emit_shader_ngg_tail(sctx, shader, initial_cdw);
+ gfx10_emit_shader_ngg_tail(sctx, shader);
}
unsigned si_get_input_prim(const struct si_shader_selector *gs)
@@ -1308,11 +1303,10 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
static void si_emit_shader_vs(struct si_context *sctx)
{
struct si_shader *shader = sctx->queued.named.vs->shader;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
-
if (!shader)
return;
+ radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_reg(sctx, R_028A40_VGT_GS_MODE, SI_TRACKED_VGT_GS_MODE,
shader->ctx_reg.vs.vgt_gs_mode);
radeon_opt_set_context_reg(sctx, R_028A84_VGT_PRIMITIVEID_EN, SI_TRACKED_VGT_PRIMITIVEID_EN,
@@ -1356,9 +1350,7 @@ static void si_emit_shader_vs(struct si_context *sctx)
SI_TRACKED_PA_CL_VS_OUT_CNTL__VS, shader->pa_cl_vs_out_cntl,
SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
}
-
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
/* GE_PC_ALLOC is not a context register, so it doesn't cause a context roll. */
if (sctx->chip_class >= GFX10)
@@ -1536,11 +1528,10 @@ static unsigned si_get_spi_shader_col_format(struct si_shader *shader)
static void si_emit_shader_ps(struct si_context *sctx)
{
struct si_shader *shader = sctx->queued.named.ps->shader;
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
-
if (!shader)
return;
+ radeon_begin(&sctx->gfx_cs);
/* R_0286CC_SPI_PS_INPUT_ENA, R_0286D0_SPI_PS_INPUT_ADDR*/
radeon_opt_set_context_reg2(sctx, R_0286CC_SPI_PS_INPUT_ENA, SI_TRACKED_SPI_PS_INPUT_ENA,
shader->ctx_reg.ps.spi_ps_input_ena,
@@ -1558,9 +1549,7 @@ static void si_emit_shader_ps(struct si_context *sctx)
radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK, SI_TRACKED_CB_SHADER_MASK,
shader->ctx_reg.ps.cb_shader_mask);
-
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
}
static void si_shader_ps(struct si_screen *sscreen, struct si_shader *shader)
@@ -3371,12 +3360,10 @@ static void si_emit_spi_map(struct si_context *sctx)
/* R_028644_SPI_PS_INPUT_CNTL_0 */
/* Dota 2: Only ~16% of SPI map updates set different values. */
/* Talos: Only ~9% of SPI map updates set different values. */
- unsigned initial_cdw = sctx->gfx_cs.current.cdw;
+ radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
sctx->tracked_regs.spi_ps_input_cntl, num_interp);
-
- if (initial_cdw != sctx->gfx_cs.current.cdw)
- sctx->context_roll = true;
+ radeon_end_update_context_roll(sctx);
}
/**
@@ -3405,6 +3392,8 @@ static void si_cs_preamble_add_vgt_flush(struct si_context *sctx)
*/
static void si_emit_vgt_flush(struct radeon_cmdbuf *cs)
{
+ radeon_begin(cs);
+
/* This is required before VGT_FLUSH. */
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
@@ -3412,6 +3401,7 @@ static void si_emit_vgt_flush(struct radeon_cmdbuf *cs)
/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
+ radeon_end();
}
/* Initialize state related to ESGS / GSVS ring buffers */
@@ -3505,6 +3495,8 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
si_emit_vgt_flush(cs);
+ radeon_begin(cs);
+
/* Set the GS registers. */
if (sctx->esgs_ring) {
assert(sctx->chip_class <= GFX8);
@@ -3515,6 +3507,7 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
radeon_set_uconfig_reg(cs, R_030904_VGT_GSVS_RING_SIZE,
sctx->gsvs_ring->width0 / 256);
}
+ radeon_end();
return true;
}
@@ -3789,6 +3782,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
si_emit_vgt_flush(cs);
/* Set tessellation registers. */
+ radeon_begin(cs);
radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
S_030938_SIZE(sctx->screen->tess_factor_ring_size / 4));
radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE, factor_va >> 8);
@@ -3801,6 +3795,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
}
radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM,
sctx->screen->vgt_hs_offchip_param);
+ radeon_end();
return;
}
@@ -4153,7 +4148,9 @@ static void si_emit_scratch_state(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+ radeon_begin(cs);
radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, sctx->spi_tmpring_size);
+ radeon_end();
if (sctx->scratch_buffer) {
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->scratch_buffer, RADEON_USAGE_READWRITE,
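Dropping the initial_cdw parameter from gfx10_emit_shader_ngg_tail works because regions compose: each caller closes its own region before the tail opens another, and since sctx->context_roll is only ever set (never cleared) on these paths, two smaller checks are equivalent to the old single cdw comparison. From the tess-nogs variant above:

   radeon_begin(&sctx->gfx_cs);
   radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
                              SI_TRACKED_VGT_TF_PARAM, shader->vgt_tf_param);
   radeon_end_update_context_roll(sctx);

   gfx10_emit_shader_ngg_tail(sctx, shader); /* opens/closes its own region */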
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index 4c38746ed16..9ba4f73517d 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -221,6 +221,8 @@ static void gfx10_emit_streamout_begin(struct si_context *sctx)
last_target = i;
}
+ radeon_begin(cs);
+
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
if (!t[i])
continue;
@@ -246,6 +248,7 @@ static void gfx10_emit_streamout_begin(struct si_context *sctx)
radeon_emit(cs, 0);
radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
}
+ radeon_end();
sctx->streamout.begin_emitted = true;
}
@@ -275,6 +278,8 @@ static void si_flush_vgt_streamout(struct si_context *sctx)
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
unsigned reg_strmout_cntl;
+ radeon_begin(cs);
+
/* The register is at different places on different ASICs. */
if (sctx->chip_class >= GFX7) {
reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
@@ -295,6 +300,7 @@ static void si_flush_vgt_streamout(struct si_context *sctx)
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
radeon_emit(cs, 4); /* poll interval */
+ radeon_end();
}
static void si_emit_streamout_begin(struct si_context *sctx)
@@ -306,6 +312,8 @@ static void si_emit_streamout_begin(struct si_context *sctx)
si_flush_vgt_streamout(sctx);
+ radeon_begin(cs);
+
for (i = 0; i < sctx->streamout.num_targets; i++) {
if (!t[i])
continue;
@@ -344,6 +352,7 @@ static void si_emit_streamout_begin(struct si_context *sctx)
radeon_emit(cs, 0); /* unused */
}
}
+ radeon_end();
sctx->streamout.begin_emitted = true;
}
@@ -362,6 +371,8 @@ void si_emit_streamout_end(struct si_context *sctx)
si_flush_vgt_streamout(sctx);
+ radeon_begin(cs);
+
for (i = 0; i < sctx->streamout.num_targets; i++) {
if (!t[i])
continue;
@@ -383,10 +394,10 @@ void si_emit_streamout_end(struct si_context *sctx)
* buffer bound. This ensures that the primitives-emitted query
* won't increment. */
radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
- sctx->context_roll = true;
t[i]->buf_filled_size_valid = true;
}
+ radeon_end_update_context_roll(sctx);
sctx->streamout.begin_emitted = false;
}
@@ -402,6 +413,7 @@ static void si_emit_streamout_enable(struct si_context *sctx)
{
assert(!sctx->screen->use_ngg_streamout);
+ radeon_begin(&sctx->gfx_cs);
radeon_set_context_reg_seq(&sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
radeon_emit(&sctx->gfx_cs, S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
S_028B94_RAST_STREAM(0) |
@@ -410,6 +422,7 @@ static void si_emit_streamout_enable(struct si_context *sctx)
S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
radeon_emit(&sctx->gfx_cs,
sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
+ radeon_end();
}
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
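si_emit_streamout_end also hoists the roll flag out of the loop: instead of setting sctx->context_roll on every iteration, one radeon_end_update_context_roll() after the loop covers all targets, since the flag is sticky. The resulting shape:

   radeon_begin(cs);
   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;
      /* ... per-target STRMOUT packets ... */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
   }
   radeon_end_update_context_roll(sctx); /* one check for the whole loop */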
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index 41432755c64..0327d2f5d15 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -103,8 +103,10 @@ static void si_emit_cull_state(struct si_context *sctx)
/* This will end up in SGPR6 as (value << 8), shifted by the hw. */
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf,
RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+ radeon_begin(&sctx->gfx_cs);
radeon_set_sh_reg(&sctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
sctx->small_prim_cull_info_address >> 8);
+ radeon_end();
/* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling.
*
@@ -213,18 +215,22 @@ static void si_emit_one_scissor(struct si_context *ctx, struct radeon_cmdbuf *cs
if (scissor)
si_clip_scissor(&final, scissor);
+ radeon_begin(cs);
+
/* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_-
* SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
*/
if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) {
radeon_emit(cs, S_028250_TL_X(1) | S_028250_TL_Y(1) | S_028250_WINDOW_OFFSET_DISABLE(1));
radeon_emit(cs, S_028254_BR_X(1) | S_028254_BR_Y(1));
+ radeon_end();
return;
}
radeon_emit(cs, S_028250_TL_X(final.minx) | S_028250_TL_Y(final.miny) |
S_028250_WINDOW_OFFSET_DISABLE(1));
radeon_emit(cs, S_028254_BR_X(final.maxx) | S_028254_BR_Y(final.maxy));
+ radeon_end();
}
#define MAX_PA_SU_HARDWARE_SCREEN_OFFSET 8176
@@ -350,7 +356,7 @@ static void si_emit_guardband(struct si_context *ctx)
* R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
* R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
*/
- unsigned initial_cdw = ctx->gfx_cs.current.cdw;
+ radeon_begin(&ctx->gfx_cs);
radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, fui(guardband_y), fui(discard_y),
fui(guardband_x), fui(discard_x));
@@ -362,8 +368,7 @@ static void si_emit_guardband(struct si_context *ctx)
ctx, R_028BE4_PA_SU_VTX_CNTL, SI_TRACKED_PA_SU_VTX_CNTL,
S_028BE4_PIX_CENTER(rs->half_pixel_center) |
S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH + vp_as_scissor.quant_mode));
- if (initial_cdw != ctx->gfx_cs.current.cdw)
- ctx->context_roll = true;
+ radeon_end_update_context_roll(ctx);
}
static void si_emit_scissors(struct si_context *ctx)
@@ -376,7 +381,10 @@ static void si_emit_scissors(struct si_context *ctx)
if (!ctx->vs_writes_viewport_index) {
struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0];
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
+ radeon_end();
+
si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL);
return;
}
@@ -384,7 +392,10 @@ static void si_emit_scissors(struct si_context *ctx)
/* All registers in the array need to be updated if any of them is changed.
* This is a hardware requirement.
*/
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, SI_MAX_VIEWPORTS * 2);
+ radeon_end();
+
for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i],
scissor_enabled ? &states[i] : NULL);
@@ -477,12 +488,14 @@ static void si_emit_one_viewport(struct si_context *ctx, struct pipe_viewport_st
{
struct radeon_cmdbuf *cs = &ctx->gfx_cs;
+ radeon_begin(cs);
radeon_emit(cs, fui(state->scale[0]));
radeon_emit(cs, fui(state->translate[0]));
radeon_emit(cs, fui(state->scale[1]));
radeon_emit(cs, fui(state->translate[1]));
radeon_emit(cs, fui(state->scale[2]));
radeon_emit(cs, fui(state->translate[2]));
+ radeon_end();
}
static void si_emit_viewports(struct si_context *ctx)
@@ -492,7 +505,10 @@ static void si_emit_viewports(struct si_context *ctx)
/* The simple case: Only 1 viewport is active. */
if (!ctx->vs_writes_viewport_index) {
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
+ radeon_end();
+
si_emit_one_viewport(ctx, &states[0]);
return;
}
@@ -500,7 +516,10 @@ static void si_emit_viewports(struct si_context *ctx)
/* All registers in the array need to be updated if any of them is changed.
* This is a hardware requirement.
*/
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + 0, SI_MAX_VIEWPORTS * 6);
+ radeon_end();
+
for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++)
si_emit_one_viewport(ctx, &states[i]);
}
@@ -528,21 +547,25 @@ static void si_emit_depth_ranges(struct si_context *ctx)
if (!ctx->vs_writes_viewport_index) {
si_viewport_zmin_zmax(&states[0], clip_halfz, window_space, &zmin, &zmax);
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2);
radeon_emit(cs, fui(zmin));
radeon_emit(cs, fui(zmax));
+ radeon_end();
return;
}
/* All registers in the array need to be updated if any of them is changed.
* This is a hardware requirement.
*/
+ radeon_begin(cs);
radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, SI_MAX_VIEWPORTS * 2);
for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
si_viewport_zmin_zmax(&states[i], clip_halfz, window_space, &zmin, &zmax);
radeon_emit(cs, fui(zmin));
radeon_emit(cs, fui(zmax));
}
+ radeon_end();
}
static void si_emit_viewport_states(struct si_context *ctx)
@@ -631,16 +654,20 @@ static void si_emit_window_rectangles(struct si_context *sctx)
else
rule = outside[num_rectangles - 1];
+ radeon_begin(cs);
radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE, SI_TRACKED_PA_SC_CLIPRECT_RULE,
rule);
- if (num_rectangles == 0)
+ if (num_rectangles == 0) {
+ radeon_end();
return;
+ }
radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL, num_rectangles * 2);
for (unsigned i = 0; i < num_rectangles; i++) {
radeon_emit(cs, S_028210_TL_X(rects[i].minx) | S_028210_TL_Y(rects[i].miny));
radeon_emit(cs, S_028214_BR_X(rects[i].maxx) | S_028214_BR_Y(rects[i].maxy));
}
+ radeon_end();
}
static void si_set_window_rectangles(struct pipe_context *ctx, bool include,