author	Emma Anholt <emma@anholt.net>	2022-01-10 14:49:09 -0800
committer	Marge Bot <emma+marge@anholt.net>	2022-01-16 19:11:29 +0000
commit	f6ffefba3e466a71a1a3099e1385bee09920e088 (patch)
tree	44b11a812b8e8ee5f007e380dcb4f0ae4f98735b
parent	b024102d7c2959451bfef323432beaa4dca4dd88 (diff)
nir: Apply nir_opt_offsets to nir_intrinsic_load_uniform as well.
Doing this for ir3 required adding a struct for limits on how much base
to fold in (which NTT wants as well for its case of shared vars), since
otherwise the later work of lowering to the 1<<9 word limit would emit
more instructions.

The shader-db results are that sometimes the reduction in NIR
instruction count results in fewer sampler prefetches, due to the
shader being estimated to be shorter (dota2, nexuiz):

total instructions in shared programs: 8996651 -> 8996776 (<.01%)
total cat5 in shared programs: 86561 -> 86577 (0.02%)

Reviewed-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14023>
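For reference, the pass now takes a per-driver limits struct. A minimal
sketch of the new calling convention, using the field values from the
radv hunk below (a max of 0 keeps the pass from folding anything into
that intrinsic's base, while ~0 means "no limit"):

```c
static const nir_opt_offsets_options offset_options = {
   .uniform_max = 0,  /* leave nir_intrinsic_load_uniform bases alone */
   .shared_max = ~0,  /* fold freely into shared load/store bases */
   .buffer_max = ~0,  /* fold freely into load/store_buffer_amd bases */
};
NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
```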
-rw-r--r--	src/amd/vulkan/radv_shader.c	10
-rw-r--r--	src/compiler/nir/nir.h	13
-rw-r--r--	src/compiler/nir/nir_opt_offsets.c	41
-rw-r--r--	src/freedreno/ir3/ir3_nir.c	12
4 files changed, 53 insertions, 23 deletions
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index b43dea2cb23..a23559f789d 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -211,8 +211,14 @@ radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets)
NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
}
- if (opt_offsets)
- NIR_PASS_V(nir, nir_opt_offsets);
+ if (opt_offsets) {
+ static const nir_opt_offsets_options offset_options = {
+ .uniform_max = 0,
+ .buffer_max = ~0,
+ .shared_max = ~0,
+ };
+ NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
+ }
/* Do late algebraic optimization to turn add(a,
* neg(b)) back into subs, then the mandatory cleanup
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 93f1ca593e1..910a0e9c81e 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5254,7 +5254,18 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options);
bool nir_opt_move(nir_shader *shader, nir_move_options options);
-bool nir_opt_offsets(nir_shader *shader);
+typedef struct {
+ /** nir_load_uniform max base offset */
+ uint32_t uniform_max;
+
+ /** nir_var_mem_shared max base offset */
+ uint32_t shared_max;
+
+ /** nir_load/store_buffer_amd max base offset */
+ uint32_t buffer_max;
+} nir_opt_offsets_options;
+
+bool nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options);
bool nir_opt_peephole_select(nir_shader *shader, unsigned limit,
bool indirect_load_ok, bool expensive_alu_ok);
diff --git a/src/compiler/nir/nir_opt_offsets.c b/src/compiler/nir/nir_opt_offsets.c
index 471fab1747d..58cfa98a1ac 100644
--- a/src/compiler/nir/nir_opt_offsets.c
+++ b/src/compiler/nir/nir_opt_offsets.c
@@ -31,10 +31,11 @@
typedef struct
{
struct hash_table *range_ht;
+ const nir_opt_offsets_options *options;
} opt_offsets_state;
static nir_ssa_def *
-try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *state, unsigned *out_const)
+try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *state, unsigned *out_const, uint32_t max)
{
if (instr->type != nir_instr_type_alu)
return NULL;
@@ -66,15 +67,18 @@ try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *
for (unsigned i = 0; i < 2; ++i) {
if (nir_src_is_const(alu->src[i].src)) {
- *out_const += nir_src_as_uint(alu->src[i].src);
- nir_ssa_def *replace_src =
- try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const);
- return replace_src ? replace_src : alu->src[1 - i].src.ssa;
+ uint32_t offset = nir_src_as_uint(alu->src[i].src);
+ if (offset + *out_const <= max) {
+ *out_const += offset;
+ nir_ssa_def *replace_src =
+ try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const, max);
+ return replace_src ? replace_src : alu->src[1 - i].src.ssa;
+ }
}
}
- nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const);
- nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const);
+ nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const, max);
+ nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const, max);
if (!replace_src0 && !replace_src1)
return NULL;
@@ -88,7 +92,8 @@ static bool
try_fold_load_store(nir_builder *b,
nir_intrinsic_instr *intrin,
opt_offsets_state *state,
- unsigned offset_src_idx)
+ unsigned offset_src_idx,
+ uint32_t max)
{
/* Assume that BASE is the constant offset of a load/store.
* Try to constant-fold additions to the offset source
@@ -103,7 +108,7 @@ try_fold_load_store(nir_builder *b,
return false;
if (!nir_src_is_const(*off_src)) {
- replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const);
+ replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const, max);
} else if (nir_src_as_uint(*off_src)) {
off_const += nir_src_as_uint(*off_src);
b->cursor = nir_before_instr(&intrin->instr);
@@ -128,21 +133,18 @@ process_instr(nir_builder *b, nir_instr *instr, void *s)
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
- /* Note that while it's tempting to include nir_intrinsic_load_uniform
- * here, freedreno doesn't want that because it can have to move the base
- * back to a register plus a small constant offset, and it's not clever
- * enough to minimize the code that that emits.
- */
+ case nir_intrinsic_load_uniform:
+ return try_fold_load_store(b, intrin, state, 0, state->options->uniform_max);
case nir_intrinsic_load_shared:
case nir_intrinsic_load_shared_ir3:
- return try_fold_load_store(b, intrin, state, 0);
+ return try_fold_load_store(b, intrin, state, 0, state->options->shared_max);
case nir_intrinsic_store_shared:
case nir_intrinsic_store_shared_ir3:
- return try_fold_load_store(b, intrin, state, 1);
+ return try_fold_load_store(b, intrin, state, 1, state->options->shared_max);
case nir_intrinsic_load_buffer_amd:
- return try_fold_load_store(b, intrin, state, 1);
+ return try_fold_load_store(b, intrin, state, 1, state->options->buffer_max);
case nir_intrinsic_store_buffer_amd:
- return try_fold_load_store(b, intrin, state, 2);
+ return try_fold_load_store(b, intrin, state, 2, state->options->buffer_max);
default:
return false;
}
@@ -151,10 +153,11 @@ process_instr(nir_builder *b, nir_instr *instr, void *s)
}
bool
-nir_opt_offsets(nir_shader *shader)
+nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options)
{
opt_offsets_state state;
state.range_ht = NULL;
+ state.options = options;
bool p = nir_shader_instructions_pass(shader, process_instr,
nir_metadata_block_index |
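To illustrate what try_fold_load_store now does with a limit: a constant
half of an add feeding the offset source is absorbed into the
intrinsic's base only while the running total stays within `max`. A
hypothetical before/after, expressed as C comments (illustrative NIR,
not taken from the source):

```c
/* With uniform_max >= 16, the constant operand of the iadd is folded
 * into the intrinsic's base:
 *
 *   before:  ssa_2 = iadd ssa_1, 16
 *            ssa_3 = load_uniform (ssa_2) (base=0)
 *
 *   after:   ssa_3 = load_uniform (ssa_1) (base=16)
 *
 * With uniform_max == 0 (as radv sets it above), the add is left in
 * place and the base stays 0.
 */
```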
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index ccf3bdab63d..d6c30a2df87 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -117,7 +117,17 @@ ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s)
progress |= OPT(s, nir_lower_alu);
progress |= OPT(s, nir_lower_pack);
progress |= OPT(s, nir_opt_constant_folding);
- progress |= OPT(s, nir_opt_offsets);
+
+ static const nir_opt_offsets_options offset_options = {
+ /* How large an offset we can encode in the instr's immediate field.
+ */
+ .uniform_max = (1 << 9) - 1,
+
+ .shared_max = ~0,
+
+ .buffer_max = ~0,
+ };
+ progress |= OPT(s, nir_opt_offsets, &offset_options);
nir_load_store_vectorize_options vectorize_opts = {
.modes = nir_var_mem_ubo,
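In the ir3 hunk above, (1 << 9) - 1 evaluates to 511, the largest base
that fits the instruction's immediate offset field mentioned in the
comment. The commit message notes that NTT wants the same limiting for
its shared vars; a hypothetical configuration for such a backend might
look like the following (the 16-bit width is purely illustrative, not
NTT's actual limit):

```c
/* Hypothetical options for an NTT-like backend that can only encode a
 * bounded base on shared-memory accesses. */
static const nir_opt_offsets_options ntt_like_options = {
   .uniform_max = ~0,
   .shared_max = (1 << 16) - 1, /* illustrative immediate-field width */
   .buffer_max = ~0,
};
```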