summary refs log tree commit diff
diff options
context:
space:
mode:
author Marek Olšák <marek.olsak@amd.com> 2021-01-09 21:33:35 -0500
committer Marge Bot <eric+marge@anholt.net> 2021-01-20 21:53:13 +0000
commit 185a2472a4c7aa24d74901af54c44bccd76fca41 (patch)
tree ffbb7d59e999bffb9d841b9dbc5580b1f9f97c45
parent ae5df516f1ed0ce01ffdcbacb869aa1374d6b248 (diff)
radeonsi: move variables closer to their use in most draw state functions
for lower register pressure, though I haven't measured this.

si_draw_vbo will be handled in a future commit.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8600>
-rw-r--r-- src/gallium/drivers/radeonsi/si_state_draw.cpp 74
1 file changed, 36 insertions, 38 deletions
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 181aa50ddc5..9b1ae0ea91a 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -195,10 +195,9 @@ static void si_emit_prefetch_L2(struct si_context *sctx)
* written to userdata SGPRs.
*/
static void si_emit_derived_tess_state(struct si_context *sctx,
- ubyte vertices_per_patch,
+ unsigned num_tcs_input_cp,
unsigned *num_patches)
{
- struct radeon_cmdbuf *cs = &sctx->gfx_cs;
struct si_shader *ls_current;
struct si_shader_selector *ls;
/* The TES pointer will only be used for sctx->last_tcs.
@@ -208,14 +207,6 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
bool has_primid_instancing_bug = sctx->chip_class == GFX6 && sctx->screen->info.max_se == 1;
unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
- unsigned num_tcs_input_cp = vertices_per_patch;
- unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
- unsigned num_tcs_patch_outputs;
- unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
- unsigned input_patch_size, output_patch_size, output_patch0_offset;
- unsigned perpatch_output_offset, lds_per_patch, lds_size;
- unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
- unsigned offchip_layout, target_lds_size, ls_hs_config;
/* Since GFX9 has merged LS-HS in the TCS state, set LS = TCS. */
if (sctx->chip_class >= GFX9) {
@@ -245,7 +236,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
/* This calculates how shader inputs and outputs among VS, TCS, and TES
* are laid out in LDS. */
- num_tcs_inputs = util_last_bit64(ls->outputs_written);
+ unsigned num_tcs_inputs = util_last_bit64(ls->outputs_written);
+ unsigned num_tcs_output_cp, num_tcs_outputs, num_tcs_patch_outputs;
if (sctx->tcs_shader.cso) {
num_tcs_outputs = util_last_bit64(tcs->outputs_written);
@@ -258,8 +250,9 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
}
- input_vertex_size = ls->lshs_vertex_stride;
- output_vertex_size = num_tcs_outputs * 16;
+ unsigned input_vertex_size = ls->lshs_vertex_stride;
+ unsigned output_vertex_size = num_tcs_outputs * 16;
+ unsigned input_patch_size;
/* Allocate LDS for TCS inputs only if it's used. */
if (!ls_current->key.opt.same_patch_vertices ||
@@ -268,8 +261,9 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
else
input_patch_size = 0;
- pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
- output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+ unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
+ unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+ unsigned lds_per_patch;
/* Compute the LDS size per patch.
*
@@ -302,7 +296,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
* Use 16K so that we can fit 2 workgroups on the same CU.
*/
ASSERTED unsigned max_lds_size = 32 * 1024; /* hw limit */
- target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */
+ unsigned target_lds_size = 16 * 1024; /* target at least 2 workgroups per CU, 16K each */
*num_patches = MIN2(*num_patches, target_lds_size / lds_per_patch);
*num_patches = MAX2(*num_patches, 1);
assert(*num_patches * lds_per_patch <= max_lds_size);
@@ -357,8 +351,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
sctx->last_num_patches = *num_patches;
- output_patch0_offset = input_patch_size * *num_patches;
- perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
+ unsigned output_patch0_offset = input_patch_size * *num_patches;
+ unsigned perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
/* Compute userdata SGPRs. */
assert(((input_vertex_size / 4) & ~0xff) == 0);
@@ -376,16 +370,16 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
si_resource(sctx->tess_rings_tmz) : si_resource(sctx->tess_rings))->gpu_address;
assert((ring_va & u_bit_consecutive(0, 19)) == 0);
- tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |
- S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);
- tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va;
- tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16);
- offchip_layout =
+ unsigned tcs_in_layout = S_VS_STATE_LS_OUT_PATCH_SIZE(input_patch_size / 4) |
+ S_VS_STATE_LS_OUT_VERTEX_SIZE(input_vertex_size / 4);
+ unsigned tcs_out_layout = (output_patch_size / 4) | (num_tcs_input_cp << 13) | ring_va;
+ unsigned tcs_out_offsets = (output_patch0_offset / 16) | ((perpatch_output_offset / 16) << 16);
+ unsigned offchip_layout =
(*num_patches - 1) | ((num_tcs_output_cp - 1) << 6) |
((pervertex_output_patch_size * *num_patches) << 11);
/* Compute the LDS size. */
- lds_size = lds_per_patch * *num_patches;
+ unsigned lds_size = lds_per_patch * *num_patches;
if (sctx->chip_class >= GFX7) {
assert(lds_size <= 65536);
@@ -404,6 +398,8 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
* been tested. */
assert(ls_current->config.lds_size == 0);
+ struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+
if (sctx->chip_class >= GFX9) {
unsigned hs_rsrc2 = ls_current->config.rsrc2;
@@ -448,8 +444,10 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
radeon_emit(cs, offchip_layout);
radeon_emit(cs, ring_va);
- ls_hs_config = S_028B58_NUM_PATCHES(*num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
- S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
+ unsigned ls_hs_config =
+ S_028B58_NUM_PATCHES(*num_patches) |
+ S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
+ S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
if (sctx->last_ls_hs_config != ls_hs_config) {
if (sctx->chip_class >= GFX7) {
@@ -970,17 +968,14 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
bool dispatch_prim_discard_cs, unsigned original_index_size)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
- unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
- bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
- uint32_t index_max_size = 0;
- uint32_t use_opaque = 0;
- uint64_t index_va = 0;
if (unlikely(sctx->thread_trace_enabled)) {
si_sqtt_write_event_marker(sctx, &sctx->gfx_cs, EventCmdDraw,
UINT_MAX, UINT_MAX, UINT_MAX);
}
+ uint32_t use_opaque = 0;
+
if (indirect && indirect->count_from_stream_output) {
struct si_streamout_target *t = (struct si_streamout_target *)indirect->count_from_stream_output;
@@ -992,6 +987,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
indirect = NULL;
}
+ uint32_t index_max_size = 0;
+ uint64_t index_va = 0;
+
/* draw packet */
if (index_size) {
/* Register shadowing doesn't shadow INDEX_TYPE. */
@@ -1050,6 +1048,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
sctx->last_index_size = -1;
}
+ unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
+ bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
+
if (indirect) {
assert(num_draws == 1);
uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address;
@@ -1114,8 +1115,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
radeon_emit(cs, di_src_sel);
}
} else {
- int base_vertex;
-
/* Register shadowing requires that we always emit PKT3_NUM_INSTANCES. */
if (sctx->shadowed_regs ||
sctx->last_instance_count == SI_INSTANCE_COUNT_UNKNOWN ||
@@ -1126,7 +1125,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
}
/* Base vertex and start instance. */
- base_vertex = original_index_size ? info->index_bias : draws[0].start;
+ int base_vertex = original_index_size ? info->index_bias : draws[0].start;
bool set_draw_id = sctx->vs_uses_draw_id;
bool set_base_instance = sctx->vs_uses_base_instance;
@@ -1299,11 +1298,9 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
template <chip_class GFX_VERSION> ALWAYS_INLINE
static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{
- unsigned i, count = sctx->num_vertex_elements;
- uint32_t *ptr;
-
struct si_vertex_elements *velems = sctx->vertex_elements;
unsigned alloc_size = velems->vb_desc_list_alloc_size;
+ uint32_t *ptr;
if (alloc_size) {
/* Vertex buffer descriptors are the only ones which are uploaded
@@ -1330,12 +1327,13 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS;
}
+ unsigned count = sctx->num_vertex_elements;
assert(count <= SI_MAX_ATTRIBS);
unsigned first_vb_use_mask = velems->first_vb_use_mask;
unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs;
- for (i = 0; i < count; i++) {
+ for (unsigned i = 0; i < count; i++) {
struct pipe_vertex_buffer *vb;
struct si_resource *buf;
unsigned vbo_index = velems->vertex_buffer_index[i];