diff options
author | Emma Anholt <emma@anholt.net> | 2022-01-10 14:49:09 -0800 |
---|---|---|
committer | Marge Bot <emma+marge@anholt.net> | 2022-01-16 19:11:29 +0000 |
commit | f6ffefba3e466a71a1a3099e1385bee09920e088 (patch) | |
tree | 44b11a812b8e8ee5f007e380dcb4f0ae4f98735b | |
parent | b024102d7c2959451bfef323432beaa4dca4dd88 (diff) |
nir: Apply nir_opt_offsets to nir_intrinsic_load_uniform as well.
Doing this for ir3 required adding a struct for limits of how much base to
fold in (which NTT wants as well for its case of shared vars), otherwise
the later work to lower to the 1<<9 word limit would emit more
instructions.
The shader-db results show that sometimes the reduction in NIR instruction
count results in fewer sampler prefetches, due to the shader being
estimated to be shorter (dota2, nexuiz):
total instructions in shared programs: 8996651 -> 8996776 (<.01%)
total cat5 in shared programs: 86561 -> 86577 (0.02%)
Reviewed-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14023>
-rw-r--r-- | src/amd/vulkan/radv_shader.c | 10 | ||||
-rw-r--r-- | src/compiler/nir/nir.h | 13 | ||||
-rw-r--r-- | src/compiler/nir/nir_opt_offsets.c | 41 | ||||
-rw-r--r-- | src/freedreno/ir3/ir3_nir.c | 12 |
4 files changed, 53 insertions, 23 deletions
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index b43dea2cb23..a23559f789d 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -211,8 +211,14 @@ radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets) NIR_PASS(more_algebraic, nir, nir_opt_algebraic); } - if (opt_offsets) - NIR_PASS_V(nir, nir_opt_offsets); + if (opt_offsets) { + static const nir_opt_offsets_options offset_options = { + .uniform_max = 0, + .buffer_max = ~0, + .shared_max = ~0, + }; + NIR_PASS_V(nir, nir_opt_offsets, &offset_options); + } /* Do late algebraic optimization to turn add(a, * neg(b)) back into subs, then the mandatory cleanup diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 93f1ca593e1..910a0e9c81e 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -5254,7 +5254,18 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options); bool nir_opt_move(nir_shader *shader, nir_move_options options); -bool nir_opt_offsets(nir_shader *shader); +typedef struct { + /** nir_load_uniform max base offset */ + uint32_t uniform_max; + + /** nir_var_mem_shared max base offset */ + uint32_t shared_max; + + /** nir_load/store_buffer_amd max base offset */ + uint32_t buffer_max; +} nir_opt_offsets_options; + +bool nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options); bool nir_opt_peephole_select(nir_shader *shader, unsigned limit, bool indirect_load_ok, bool expensive_alu_ok); diff --git a/src/compiler/nir/nir_opt_offsets.c b/src/compiler/nir/nir_opt_offsets.c index 471fab1747d..58cfa98a1ac 100644 --- a/src/compiler/nir/nir_opt_offsets.c +++ b/src/compiler/nir/nir_opt_offsets.c @@ -31,10 +31,11 @@ typedef struct { struct hash_table *range_ht; + const nir_opt_offsets_options *options; } opt_offsets_state; static nir_ssa_def * -try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *state, unsigned *out_const) +try_extract_const_addition(nir_builder *b, nir_instr 
*instr, opt_offsets_state *state, unsigned *out_const, uint32_t max) { if (instr->type != nir_instr_type_alu) return NULL; @@ -66,15 +67,18 @@ try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state * for (unsigned i = 0; i < 2; ++i) { if (nir_src_is_const(alu->src[i].src)) { - *out_const += nir_src_as_uint(alu->src[i].src); - nir_ssa_def *replace_src = - try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const); - return replace_src ? replace_src : alu->src[1 - i].src.ssa; + uint32_t offset = nir_src_as_uint(alu->src[i].src); + if (offset + *out_const <= max) { + *out_const += offset; + nir_ssa_def *replace_src = + try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const, max); + return replace_src ? replace_src : alu->src[1 - i].src.ssa; + } } } - nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const); - nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const); + nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const, max); + nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const, max); if (!replace_src0 && !replace_src1) return NULL; @@ -88,7 +92,8 @@ static bool try_fold_load_store(nir_builder *b, nir_intrinsic_instr *intrin, opt_offsets_state *state, - unsigned offset_src_idx) + unsigned offset_src_idx, + uint32_t max) { /* Assume that BASE is the constant offset of a load/store. 
* Try to constant-fold additions to the offset source @@ -103,7 +108,7 @@ try_fold_load_store(nir_builder *b, return false; if (!nir_src_is_const(*off_src)) { - replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const); + replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const, max); } else if (nir_src_as_uint(*off_src)) { off_const += nir_src_as_uint(*off_src); b->cursor = nir_before_instr(&intrin->instr); @@ -128,21 +133,18 @@ process_instr(nir_builder *b, nir_instr *instr, void *s) nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { - /* Note that while it's tempting to include nir_intrinsic_load_uniform - * here, freedreno doesn't want that because it can have to move the base - * back to a register plus a small constant offset, and it's not clever - * enough to minimize the code that that emits. - */ + case nir_intrinsic_load_uniform: + return try_fold_load_store(b, intrin, state, 0, state->options->uniform_max); case nir_intrinsic_load_shared: case nir_intrinsic_load_shared_ir3: - return try_fold_load_store(b, intrin, state, 0); + return try_fold_load_store(b, intrin, state, 0, state->options->shared_max); case nir_intrinsic_store_shared: case nir_intrinsic_store_shared_ir3: - return try_fold_load_store(b, intrin, state, 1); + return try_fold_load_store(b, intrin, state, 1, state->options->shared_max); case nir_intrinsic_load_buffer_amd: - return try_fold_load_store(b, intrin, state, 1); + return try_fold_load_store(b, intrin, state, 1, state->options->buffer_max); case nir_intrinsic_store_buffer_amd: - return try_fold_load_store(b, intrin, state, 2); + return try_fold_load_store(b, intrin, state, 2, state->options->buffer_max); default: return false; } @@ -151,10 +153,11 @@ process_instr(nir_builder *b, nir_instr *instr, void *s) } bool -nir_opt_offsets(nir_shader *shader) +nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options) { 
opt_offsets_state state; state.range_ht = NULL; + state.options = options; bool p = nir_shader_instructions_pass(shader, process_instr, nir_metadata_block_index | diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index ccf3bdab63d..d6c30a2df87 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -117,7 +117,17 @@ ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s) progress |= OPT(s, nir_lower_alu); progress |= OPT(s, nir_lower_pack); progress |= OPT(s, nir_opt_constant_folding); - progress |= OPT(s, nir_opt_offsets); + + static const nir_opt_offsets_options offset_options = { + /* How large an offset we can encode in the instr's immediate field. + */ + .uniform_max = (1 << 9) - 1, + + .shared_max = ~0, + + .buffer_max = ~0, + }; + progress |= OPT(s, nir_opt_offsets, &offset_options); nir_load_store_vectorize_options vectorize_opts = { .modes = nir_var_mem_ubo, |