diff options
author | Emma Anholt <emma@anholt.net> | 2022-01-10 14:49:09 -0800 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2022-01-16 19:11:29 +0000 |
commit | f6ffefba3e466a71a1a3099e1385bee09920e088 (patch) | |
tree | 44b11a812b8e8ee5f007e380dcb4f0ae4f98735b | |
parent | b024102d7c2959451bfef323432beaa4dca4dd88 (diff) |
nir: Apply nir_opt_offsets to nir_intrinsic_load_uniform as well.
Doing this for ir3 required adding a struct for limits of how much base to
fold in (which NTT wants as well for its case of shared vars), otherwise
the later work to lower to the 1<<9 word limit would emit more
instructions.
The shader-db results show that sometimes the reduction in NIR instruction
count results in fewer sampler prefetches, due to the shader being
estimated to be shorter (dota2, nexuiz):
total instructions in shared programs: 8996651 -> 8996776 (<.01%)
total cat5 in shared programs: 86561 -> 86577 (0.02%)
Reviewed-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14023>
-rw-r--r-- | src/amd/vulkan/radv_shader.c | 10 | ||||
-rw-r--r-- | src/compiler/nir/nir.h | 13 | ||||
-rw-r--r-- | src/compiler/nir/nir_opt_offsets.c | 41 | ||||
-rw-r--r-- | src/freedreno/ir3/ir3_nir.c | 12 |
4 files changed, 53 insertions, 23 deletions
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index b43dea2cb23..a23559f789d 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -211,8 +211,14 @@ radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets) NIR_PASS(more_algebraic, nir, nir_opt_algebraic); } - if (opt_offsets) - NIR_PASS_V(nir, nir_opt_offsets); + if (opt_offsets) { + static const nir_opt_offsets_options offset_options = { + .uniform_max = 0, + .buffer_max = ~0, + .shared_max = ~0, + }; + NIR_PASS_V(nir, nir_opt_offsets, &offset_options); + } /* Do late algebraic optimization to turn add(a, * neg(b)) back into subs, then the mandatory cleanup diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 93f1ca593e1..910a0e9c81e 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -5254,7 +5254,18 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options); bool nir_opt_move(nir_shader *shader, nir_move_options options); -bool nir_opt_offsets(nir_shader *shader); +typedef struct { + /** nir_load_uniform max base offset */ + uint32_t uniform_max; + + /** nir_var_mem_shared max base offset */ + uint32_t shared_max; + + /** nir_load/store_buffer_amd max base offset */ + uint32_t buffer_max; +} nir_opt_offsets_options; + +bool nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options); bool nir_opt_peephole_select(nir_shader *shader, unsigned limit, bool indirect_load_ok, bool expensive_alu_ok); diff --git a/src/compiler/nir/nir_opt_offsets.c b/src/compiler/nir/nir_opt_offsets.c index 471fab1747d..58cfa98a1ac 100644 --- a/src/compiler/nir/nir_opt_offsets.c +++ b/src/compiler/nir/nir_opt_offsets.c @@ -31,10 +31,11 @@ typedef struct { struct hash_table *range_ht; + const nir_opt_offsets_options *options; } opt_offsets_state; static nir_ssa_def * -try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *state, unsigned *out_const) +try_extract_const_addition(nir_builder *b, nir_instr 
*instr, opt_offsets_state *state, unsigned *out_const, uint32_t max) { if (instr->type != nir_instr_type_alu) return NULL; @@ -66,15 +67,18 @@ try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state * for (unsigned i = 0; i < 2; ++i) { if (nir_src_is_const(alu->src[i].src)) { - *out_const += nir_src_as_uint(alu->src[i].src); - nir_ssa_def *replace_src = - try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const); - return replace_src ? replace_src : alu->src[1 - i].src.ssa; + uint32_t offset = nir_src_as_uint(alu->src[i].src); + if (offset + *out_const <= max) { + *out_const += offset; + nir_ssa_def *replace_src = + try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const, max); + return replace_src ? replace_src : alu->src[1 - i].src.ssa; + } } } - nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const); - nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const); + nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const, max); + nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const, max); if (!replace_src0 && !replace_src1) return NULL; @@ -88,7 +92,8 @@ static bool try_fold_load_store(nir_builder *b, nir_intrinsic_instr *intrin, opt_offsets_state *state, - unsigned offset_src_idx) + unsigned offset_src_idx, + uint32_t max) { /* Assume that BASE is the constant offset of a load/store. 
* Try to constant-fold additions to the offset source @@ -103,7 +108,7 @@ try_fold_load_store(nir_builder *b, return false; if (!nir_src_is_const(*off_src)) { - replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const); + replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const, max); } else if (nir_src_as_uint(*off_src)) { off_const += nir_src_as_uint(*off_src); b->cursor = nir_before_instr(&intrin->instr); @@ -128,21 +133,18 @@ process_instr(nir_builder *b, nir_instr *instr, void *s) nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { - /* Note that while it's tempting to include nir_intrinsic_load_uniform - * here, freedreno doesn't want that because it can have to move the base - * back to a register plus a small constant offset, and it's not clever - * enough to minimize the code that that emits. - */ + case nir_intrinsic_load_uniform: + return try_fold_load_store(b, intrin, state, 0, state->options->uniform_max); case nir_intrinsic_load_shared: case nir_intrinsic_load_shared_ir3: - return try_fold_load_store(b, intrin, state, 0); + return try_fold_load_store(b, intrin, state, 0, state->options->shared_max); case nir_intrinsic_store_shared: case nir_intrinsic_store_shared_ir3: - return try_fold_load_store(b, intrin, state, 1); + return try_fold_load_store(b, intrin, state, 1, state->options->shared_max); case nir_intrinsic_load_buffer_amd: - return try_fold_load_store(b, intrin, state, 1); + return try_fold_load_store(b, intrin, state, 1, state->options->buffer_max); case nir_intrinsic_store_buffer_amd: - return try_fold_load_store(b, intrin, state, 2); + return try_fold_load_store(b, intrin, state, 2, state->options->buffer_max); default: return false; } @@ -151,10 +153,11 @@ process_instr(nir_builder *b, nir_instr *instr, void *s) } bool -nir_opt_offsets(nir_shader *shader) +nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options) { 
opt_offsets_state state; state.range_ht = NULL; + state.options = options; bool p = nir_shader_instructions_pass(shader, process_instr, nir_metadata_block_index | diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index ccf3bdab63d..d6c30a2df87 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -117,7 +117,17 @@ ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s) progress |= OPT(s, nir_lower_alu); progress |= OPT(s, nir_lower_pack); progress |= OPT(s, nir_opt_constant_folding); - progress |= OPT(s, nir_opt_offsets); + + static const nir_opt_offsets_options offset_options = { + /* How large an offset we can encode in the instr's immediate field. + */ + .uniform_max = (1 << 9) - 1, + + .shared_max = ~0, + + .buffer_max = ~0, + }; + progress |= OPT(s, nir_opt_offsets, &offset_options); nir_load_store_vectorize_options vectorize_opts = { .modes = nir_var_mem_ubo, |