summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDanylo Piliaiev <dpiliaiev@igalia.com>2021-12-07 15:15:23 +0200
committerMarge Bot <emma+marge@anholt.net>2022-01-07 15:29:23 +0000
commit57c3e07f96d3a8f6483f680b67aa54755b38f00f (patch)
treebc32ccb0848b90e9419c1e6d0c766a199ca474b7
parenta40f004ecf2cc03aa839783e2dd7628638083986 (diff)
ir3: Be able to reduce register limit for RA when CS has barriers
If barriers are used, it must be possible for all waves in the workgroup to execute concurrently. Thus we may have to reduce the registers limit. Fixes a hang in "Digital Combat Simulator". Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14110>
-rw-r--r--src/freedreno/ir3/ir3_compiler.c2
-rw-r--r--src/freedreno/ir3/ir3_compiler.h3
-rw-r--r--src/freedreno/ir3/ir3_compiler_nir.c3
-rw-r--r--src/freedreno/ir3/ir3_ra.c53
-rw-r--r--src/freedreno/ir3/ir3_shader.h3
-rw-r--r--src/gallium/drivers/freedreno/freedreno_screen.c4
6 files changed, 66 insertions, 2 deletions
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 26b120a87c0..dfc6aec037f 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -95,6 +95,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->wave_granularity = 2;
compiler->max_waves = 16;
+ compiler->max_variable_workgroup_size = 1024;
+
if (compiler->gen >= 6) {
compiler->samgq_workaround = true;
/* a6xx split the pipeline state into geometry and fragment state, in
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 9b2edd3b944..68e5d944ea5 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -166,6 +166,9 @@ struct ir3_compiler {
*/
bool has_getfiberid;
+ /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
+ uint32_t max_variable_workgroup_size;
+
/* Type to use for 1b nir bools: */
type_t bool_type;
};
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index b0572e67c9b..cb11d5120cd 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1400,6 +1400,8 @@ emit_control_barrier(struct ir3_context *ctx)
barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
barrier->barrier_class = IR3_BARRIER_EVERYTHING;
array_insert(b, b->keeps, barrier);
+
+ ctx->so->has_barrier = true;
}
static void
@@ -3968,6 +3970,7 @@ emit_instructions(struct ir3_context *ctx)
barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
barrier->barrier_class = IR3_BARRIER_EVERYTHING;
array_insert(ctx->block, ctx->block->keeps, barrier);
+ ctx->so->has_barrier = true;
}
/* And emit the body: */
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index 8bb91a5c938..0fd31eacae0 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -2212,6 +2212,54 @@ calc_min_limit_pressure(struct ir3_shader_variant *v,
ralloc_free(ctx);
}
+/*
+ * If barriers are used, it must be possible for all waves in the workgroup
+ * to execute concurrently. Thus we may have to reduce the registers limit.
+ */
+static void
+calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
+ struct ir3_pressure *limit_pressure)
+{
+ const struct ir3_compiler *compiler = v->shader->compiler;
+
+ unsigned threads_per_wg;
+ if (v->local_size_variable) {
+ /* We have to expect the worst case. */
+ threads_per_wg = compiler->max_variable_workgroup_size;
+ } else {
+ threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+ }
+
+ /* The register file is grouped into reg_size_vec4 number of parts.
+ * Each part has enough registers to add a single vec4 register to
+ * each thread of a single-sized wave-pair. With double threadsize
+ * each wave-pair would consume two parts of the register file to get
+ * a single vec4 for a thread. The more active wave-pairs the less
+ * parts each could get.
+ */
+
+ bool double_threadsize = ir3_should_double_threadsize(v, 0);
+ unsigned waves_per_wg = DIV_ROUND_UP(
+ threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
+ compiler->wave_granularity);
+
+ uint32_t vec4_regs_per_thread =
+ compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
+ assert(vec4_regs_per_thread > 0);
+
+ uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
+
+ if (limit_pressure->full > half_regs_per_thread) {
+ if (v->mergedregs) {
+ limit_pressure->full = half_regs_per_thread;
+ } else {
+ /* TODO: Handle !mergedregs case, probably we would have to do this
+ * after the first register pressure pass.
+ */
+ }
+ }
+}
+
int
ir3_ra(struct ir3_shader_variant *v)
{
@@ -2238,12 +2286,15 @@ ir3_ra(struct ir3_shader_variant *v)
d("\thalf: %u", max_pressure.half);
d("\tshared: %u", max_pressure.shared);
- /* TODO: calculate half/full limit correctly for CS with barrier */
struct ir3_pressure limit_pressure;
limit_pressure.full = RA_FULL_SIZE;
limit_pressure.half = RA_HALF_SIZE;
limit_pressure.shared = RA_SHARED_SIZE;
+ if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
+ calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
+ }
+
/* If requested, lower the limit so that spilling happens more often. */
if (ir3_shader_debug & IR3_DBG_SPILLALL)
calc_min_limit_pressure(v, live, &limit_pressure);
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index f4588512a15..97f2632632e 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -696,6 +696,9 @@ struct ir3_shader_variant {
uint16_t local_size[3];
bool local_size_variable;
+ /* Important for compute shader to determine max reg footprint */
+ bool has_barrier;
+
struct ir3_disasm_info disasm_info;
};
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index aae18be3d2b..969562887b9 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -724,6 +724,8 @@ fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type,
if (!has_compute(screen))
return 0;
+ struct ir3_compiler *compiler = screen->compiler;
+
#define RET(x) \
do { \
if (ret) \
@@ -780,7 +782,7 @@ fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type,
RET((uint32_t[]){32}); // TODO
case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
- RET((uint64_t[]){1024}); // TODO
+ RET((uint64_t[]){ compiler->max_variable_workgroup_size });
}
return 0;