author     Eric Anholt <eric@anholt.net>        2021-04-09 16:10:30 -0700
committer  Eric Engestrom <eric@engestrom.ch>   2021-04-18 22:13:27 +0200
commit     8636a1220c22e19978ee26b47fb20dc1346dd128
tree       a3c88478bb278948680637b362aa35602a90c087
parent     743b02f9394869da4f1f9ce2f43f190e3846044b
nir: Generate load_ubo_vec4 directly for !PIPE_CAP_NATIVE_INTEGERS
The prog_to_nir -> NIR-to-TGSI change ended up causing regressions on r300,
and svga against r300-class hardware, because nir_lower_uniforms_to_ubo()
introduced shifts that nir_lower_ubo_vec4() tried to reverse, but that NIR
couldn't prove are no-ops (since shifting up and back down may drop bits),
and the hardware can't do the integer ops.

Instead, make it so that nir_lower_uniforms_to_ubo can generate
nir_intrinsic_load_ubo_vec4 directly for !INTEGER hardware.

Fixes: cf3fc79cd0ab ("st/mesa: Replace mesa_to_tgsi() with prog_to_nir() and nir_to_tgsi().")
Closes: #4602
Reviewed-By: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10194>
(cherry picked from commit 5de3cbbb2e66fcdb7d6219b78b89ad932ecadbd5)
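For context, a minimal sketch of the two lowering paths the message refers to,
written with the NIR builder helpers used in the patch below; the wrapper
function and its parameters are illustrative and not part of this commit:

#include "nir.h"
#include "nir_builder.h"

/* Illustrative only: lower one load_uniform to a UBO-0 load, contrasting the
 * new vec4-unit path with the old multiplier-16 byte-offset path.
 * "uniform_offset" is the intrinsic's indirect offset source and "base" its
 * nir_intrinsic_base(). */
static nir_ssa_def *
emit_lowered_uniform_load(nir_builder *b, nir_ssa_def *uniform_offset,
                          unsigned base, unsigned num_components,
                          unsigned bit_size, bool load_vec4)
{
   nir_ssa_def *ubo_idx = nir_imm_int(b, 0); /* default uniforms live in UBO 0 */

   if (load_vec4) {
      /* New !NATIVE_INTEGERS path: the offset stays in vec4 units, so
       * nir_lower_ubo_vec4 has no shift chain to prove away. */
      return nir_load_ubo_vec4(b, num_components, bit_size, ubo_idx,
                               nir_iadd_imm(b, uniform_offset, base));
   }

   /* Old-style path for unpacked uniforms: scale everything to bytes, which a
    * later nir_lower_ubo_vec4 must shift back down, a round trip NIR could
    * not prove lossless on r300-class hardware. */
   return nir_load_ubo(b, num_components, bit_size, ubo_idx,
                       nir_iadd_imm(b, nir_imul_imm(b, uniform_offset, 16),
                                    base * 16));
}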
-rw-r--r--  .pick_status.json                                       2
-rw-r--r--  docs/gallium/screen.rst                                 3
-rw-r--r--  src/compiler/nir/nir.h                                  4
-rw-r--r--  src/compiler/nir/nir_lower_uniforms_to_ubo.c          103
-rw-r--r--  src/gallium/auxiliary/draw/draw_vs_llvm.c               2
-rw-r--r--  src/gallium/auxiliary/nir/nir_to_tgsi.c                 4
-rw-r--r--  src/gallium/auxiliary/nir/tgsi_to_nir.c                 2
-rw-r--r--  src/gallium/drivers/d3d12/d3d12_compiler.cpp            2
-rw-r--r--  src/gallium/drivers/etnaviv/tests/lower_ubo_tests.cpp   2
-rw-r--r--  src/gallium/drivers/zink/zink_compiler.c                2
-rw-r--r--  src/mesa/state_tracker/st_glsl_to_nir.cpp               6
-rw-r--r--  src/panfrost/midgard/midgard_compile.c                  2
12 files changed, 69 insertions, 65 deletions
diff --git a/.pick_status.json b/.pick_status.json
index 885d41112b7..8791d9dd64b 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -175,7 +175,7 @@
"description": "nir: Generate load_ubo_vec4 directly for !PIPE_CAP_NATIVE_INTEGERS",
"nominated": true,
"nomination_type": 1,
- "resolution": 0,
+ "resolution": 1,
"master_sha": null,
"because_sha": "cf3fc79cd0ab55776bbbabe76237c272ab07133e"
},
diff --git a/docs/gallium/screen.rst b/docs/gallium/screen.rst
index cdc1467c78a..b1b0dc5559d 100644
--- a/docs/gallium/screen.rst
+++ b/docs/gallium/screen.rst
@@ -481,7 +481,8 @@ The integer capabilities:
those bits set, pipe_context::set_constant_buffer(.., 0, ..) is ignored
by the driver, and the driver can throw assertion failures.
* ``PIPE_CAP_PACKED_UNIFORMS``: True if the driver supports packed uniforms
- as opposed to padding to vec4s.
+ as opposed to padding to vec4s. Requires ``PIPE_SHADER_CAP_INTEGERS`` if
+ ``lower_uniforms_to_ubo`` is set.
* ``PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES``: Whether the
``PIPE_CONSERVATIVE_RASTER_POST_SNAP`` mode is supported for triangles.
The post-snap mode means the conservative rasterization occurs after
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 28931886ede..a0a646610db 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3435,8 +3435,6 @@ typedef struct nir_shader_compiler_options {
unsigned max_unroll_iterations;
unsigned max_unroll_iterations_aggressive;
- /* For the non-zero value of the enum corresponds multiplier when
- * calling lower_uniforms_to_ubo */
bool lower_uniforms_to_ubo;
nir_lower_int64_options lower_int64_options;
@@ -4633,7 +4631,7 @@ bool nir_vectorize_tess_levels(nir_shader *shader);
bool nir_lower_fragcolor(nir_shader *shader);
bool nir_lower_fragcoord_wtrans(nir_shader *shader);
void nir_lower_viewport_transform(nir_shader *shader);
-bool nir_lower_uniforms_to_ubo(nir_shader *shader, int multiplier);
+bool nir_lower_uniforms_to_ubo(nir_shader *shader, bool dword_packed, bool load_vec4);
typedef struct nir_lower_subgroups_options {
uint8_t subgroup_size;
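A usage sketch of the updated prototype follows; the screen and shader
variables and the cap queries shown are assumptions for illustration, and the
real call-site updates appear in the hunks below, each deriving the two flags
from its own context:

/* "screen" is the driver's pipe_screen and "nir" the shader being compiled.
 * dword_packed follows PIPE_CAP_PACKED_UNIFORMS; load_vec4 is the new escape
 * hatch for hardware that lacks native integer ops. */
bool dword_packed = screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS);
bool native_integers = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
                                                PIPE_SHADER_CAP_INTEGERS);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, dword_packed, !native_integers);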
diff --git a/src/compiler/nir/nir_lower_uniforms_to_ubo.c b/src/compiler/nir/nir_lower_uniforms_to_ubo.c
index 65107c01046..0b37b705ef8 100644
--- a/src/compiler/nir/nir_lower_uniforms_to_ubo.c
+++ b/src/compiler/nir/nir_lower_uniforms_to_ubo.c
@@ -22,27 +22,24 @@
*/
/*
- * Remap load_uniform intrinsics to UBO accesses of UBO binding point 0.
- * Simultaneously, remap existing UBO accesses by increasing their binding
- * point by 1.
+ * Remap load_uniform intrinsics to nir_load_ubo or nir_load_ubo_vec4 accesses
+ * of UBO binding point 0. Simultaneously, remap existing UBO accesses by
+ * increasing their binding point by 1.
*
- * Note that nir_intrinsic_load_uniform base/ranges can be set in different
- * units, and the multiplier argument caters to supporting these different
- * units.
+ * For PIPE_CAP_PACKED_UNIFORMS, dword_packed should be set to indicate that
+ * nir_intrinsic_load_uniform is in increments of dwords instead of vec4s.
*
- * For example:
- * - st_glsl_to_nir for PIPE_CAP_PACKED_UNIFORMS uses dwords (4 bytes) so the
- * multiplier should be 4
- * - st_glsl_to_nir for !PIPE_CAP_PACKED_UNIFORMS uses vec4s so the
- * multiplier should be 16
- * - tgsi_to_nir uses vec4s, so the multiplier should be 16
+ * If load_vec4 is set, then nir_intrinsic_load_ubo_vec4 will be generated
+ * instead of nir_intrinsic_load_ubo, saving addressing math for hardware
+ * needing aligned vec4 loads in increments of vec4s (such as TGSI CONST file
+ * loads).
*/
#include "nir.h"
#include "nir_builder.h"
static bool
-lower_instr(nir_intrinsic_instr *instr, nir_builder *b, int multiplier)
+lower_instr(nir_intrinsic_instr *instr, nir_builder *b, bool dword_packed, bool load_vec4)
{
b->cursor = nir_before_instr(&instr->instr);
@@ -58,43 +55,51 @@ lower_instr(nir_intrinsic_instr *instr, nir_builder *b, int multiplier)
if (instr->intrinsic == nir_intrinsic_load_uniform) {
nir_ssa_def *ubo_idx = nir_imm_int(b, 0);
- nir_ssa_def *ubo_offset =
- nir_iadd(b, nir_imm_int(b, multiplier * nir_intrinsic_base(instr)),
- nir_imul(b, nir_imm_int(b, multiplier),
- nir_ssa_for_src(b, instr->src[0], 1)));
-
- nir_intrinsic_instr *load =
- nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
- load->num_components = instr->num_components;
- load->src[0] = nir_src_for_ssa(ubo_idx);
- load->src[1] = nir_src_for_ssa(ubo_offset);
- assert(instr->dest.ssa.bit_size >= 8);
+ nir_ssa_def *uniform_offset = nir_ssa_for_src(b, instr->src[0], 1);
- /* If it's const, set the alignment to our known constant offset. If
- * not, set it to a pessimistic value based on the multiplier (or the
- * scalar size, for qword loads).
- *
- * We could potentially set up stricter alignments for indirects by
- * knowing what features are enabled in the APIs (see comment in
- * nir_lower_ubo_vec4.c)
- */
- if (nir_src_is_const(instr->src[0])) {
- nir_intrinsic_set_align(load, NIR_ALIGN_MUL_MAX,
- (nir_src_as_uint(instr->src[0]) +
- nir_intrinsic_base(instr) * multiplier) %
- NIR_ALIGN_MUL_MAX);
+ assert(instr->dest.ssa.bit_size >= 8);
+ nir_ssa_def *load_result;
+ if (load_vec4) {
+ /* Don't ask us to generate load_vec4 when you've packed your uniforms
+ * as dwords instead of vec4s.
+ */
+ assert(!dword_packed);
+ load_result = nir_load_ubo_vec4(b, instr->num_components, instr->dest.ssa.bit_size,
+ ubo_idx,
+ nir_iadd_imm(b, uniform_offset, nir_intrinsic_base(instr)));
} else {
- nir_intrinsic_set_align(load, MAX2(multiplier,
- instr->dest.ssa.bit_size / 8), 0);
- }
- nir_ssa_dest_init(&load->instr, &load->dest,
- load->num_components, instr->dest.ssa.bit_size,
- instr->dest.ssa.name);
- nir_builder_instr_insert(b, &load->instr);
- nir_ssa_def_rewrite_uses(&instr->dest.ssa, &load->dest.ssa);
+ /* For PIPE_CAP_PACKED_UNIFORMS, the uniforms are packed with the
+ * base/offset in dword units instead of vec4 units.
+ */
+ int multiplier = dword_packed ? 4 : 16;
+ load_result = nir_load_ubo(b, instr->num_components, instr->dest.ssa.bit_size,
+ ubo_idx,
+ nir_iadd_imm(b, nir_imul_imm(b, uniform_offset, multiplier),
+ nir_intrinsic_base(instr) * multiplier));
+ nir_intrinsic_instr *load = nir_instr_as_intrinsic(load_result->parent_instr);
+
+ /* If it's const, set the alignment to our known constant offset. If
+ * not, set it to a pessimistic value based on the multiplier (or the
+ * scalar size, for qword loads).
+ *
+ * We could potentially set up stricter alignments for indirects by
+ * knowing what features are enabled in the APIs (see comment in
+ * nir_lower_ubo_vec4.c)
+ */
+ if (nir_src_is_const(instr->src[0])) {
+ nir_intrinsic_set_align(load, NIR_ALIGN_MUL_MAX,
+ (nir_src_as_uint(instr->src[0]) +
+ nir_intrinsic_base(instr) * multiplier) %
+ NIR_ALIGN_MUL_MAX);
+ } else {
+ nir_intrinsic_set_align(load, MAX2(multiplier,
+ instr->dest.ssa.bit_size / 8), 0);
+ }
- nir_intrinsic_set_range_base(load, nir_intrinsic_base(instr) * multiplier);
- nir_intrinsic_set_range(load, nir_intrinsic_range(instr) * multiplier);
+ nir_intrinsic_set_range_base(load, nir_intrinsic_base(instr) * multiplier);
+ nir_intrinsic_set_range(load, nir_intrinsic_range(instr) * multiplier);
+ }
+ nir_ssa_def_rewrite_uses(&instr->dest.ssa, load_result);
nir_instr_remove(&instr->instr);
return true;
@@ -104,7 +109,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_builder *b, int multiplier)
}
bool
-nir_lower_uniforms_to_ubo(nir_shader *shader, int multiplier)
+nir_lower_uniforms_to_ubo(nir_shader *shader, bool dword_packed, bool load_vec4)
{
bool progress = false;
@@ -117,7 +122,7 @@ nir_lower_uniforms_to_ubo(nir_shader *shader, int multiplier)
if (instr->type == nir_instr_type_intrinsic)
progress |= lower_instr(nir_instr_as_intrinsic(instr),
&builder,
- multiplier);
+ dword_packed, load_vec4);
}
}
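The rewritten pass supports three addressing modes; the sketch below, modeled
on the etnaviv unit test further down, shows the intended result of each flag
combination (builder setup is elided, and the commented intrinsic shapes are
expectations derived from the code above, not captured output):

nir_ssa_def *off = nir_imm_int(&b, 4);  /* constant offset; base defaults to 0 */
nir_load_uniform(&b, 1, 32, off);

/* dword_packed=false, load_vec4=true  -> load_ubo_vec4(0, off)   in vec4 units */
/* dword_packed=false, load_vec4=false -> load_ubo(0, off * 16)   in bytes      */
/* dword_packed=true,  load_vec4=false -> load_ubo(0, off * 4)    in bytes      */
nir_lower_uniforms_to_ubo(b.shader, false /* dword_packed */, true /* load_vec4 */);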
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 67ae6a16356..00c63ed5f45 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -100,7 +100,7 @@ draw_create_vs_llvm(struct draw_context *draw,
vs->base.state.ir.nir = state->ir.nir;
nir_shader *nir = (nir_shader *)state->ir.nir;
if (!nir->options->lower_uniforms_to_ubo)
- NIR_PASS_V(state->ir.nir, nir_lower_uniforms_to_ubo, 16);
+ NIR_PASS_V(state->ir.nir, nir_lower_uniforms_to_ubo, false, false);
nir_tgsi_scan_shader(state->ir.nir, &vs->base.info, true);
} else {
/* we make a private copy of the tokens */
diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c
index d0307a690c0..1f6270233bf 100644
--- a/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -2718,8 +2718,8 @@ nir_to_tgsi(struct nir_shader *s,
if (!original_options->lower_uniforms_to_ubo) {
NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
- screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS) ?
- 4 : 16);
+ screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
+ !native_integers);
}
/* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index a39309a6ac7..242a0e40fa6 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -2496,7 +2496,7 @@ ttn_finalize_nir(struct ttn_compile *c, struct pipe_screen *screen)
}
if (nir->options->lower_uniforms_to_ubo)
- NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
+ NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
if (!c->cap_samplers_as_deref)
NIR_PASS_V(nir, nir_lower_samplers);
diff --git a/src/gallium/drivers/d3d12/d3d12_compiler.cpp b/src/gallium/drivers/d3d12/d3d12_compiler.cpp
index d127ae36df5..cbaab4f79f0 100644
--- a/src/gallium/drivers/d3d12/d3d12_compiler.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_compiler.cpp
@@ -147,7 +147,7 @@ compile_nir(struct d3d12_context *ctx, struct d3d12_shader_selector *sel,
uint32_t num_ubos_before_lower_to_ubo = nir->info.num_ubos;
uint32_t num_uniforms_before_lower_to_ubo = nir->num_uniforms;
- NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
+ NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
shader->has_default_ubo0 = num_uniforms_before_lower_to_ubo > 0 &&
nir->info.num_ubos > num_ubos_before_lower_to_ubo;
diff --git a/src/gallium/drivers/etnaviv/tests/lower_ubo_tests.cpp b/src/gallium/drivers/etnaviv/tests/lower_ubo_tests.cpp
index 2b9fa41062e..24342b288a2 100644
--- a/src/gallium/drivers/etnaviv/tests/lower_ubo_tests.cpp
+++ b/src/gallium/drivers/etnaviv/tests/lower_ubo_tests.cpp
@@ -123,7 +123,7 @@ TEST_F(nir_lower_ubo_test, basic)
nir_ssa_def *offset = nir_imm_int(&b, 4);
nir_load_uniform(&b, 1, 32, offset);
- nir_lower_uniforms_to_ubo(b.shader, 16);
+ nir_lower_uniforms_to_ubo(b.shader, false, false);
nir_opt_constant_folding(b.shader);
ASSERT_TRUE(etna_nir_lower_ubo_to_uniform(b.shader));
diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c
index f030483817d..3cfb6b4dc32 100644
--- a/src/gallium/drivers/zink/zink_compiler.c
+++ b/src/gallium/drivers/zink/zink_compiler.c
@@ -914,7 +914,7 @@ zink_shader_finalize(struct pipe_screen *pscreen, void *nirptr, bool optimize)
tex_opts.lower_tg4_offsets = true;
NIR_PASS_V(nir, nir_lower_tex, &tex_opts);
}
- NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
+ NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
if (nir->info.stage == MESA_SHADER_GEOMETRY)
NIR_PASS_V(nir, nir_lower_gs_intrinsics, nir_lower_gs_intrinsics_per_stream);
optimize_nir(nir);
diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp
index 92d021e52fc..0a19949ad82 100644
--- a/src/mesa/state_tracker/st_glsl_to_nir.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp
@@ -1005,12 +1005,10 @@ st_unpacked_uniforms_type_size(const struct glsl_type *type, bool bindless)
void
st_nir_lower_uniforms(struct st_context *st, nir_shader *nir)
{
- unsigned multiplier = 16;
if (st->ctx->Const.PackedDriverUniformStorage) {
NIR_PASS_V(nir, nir_lower_io, nir_var_uniform,
st_packed_uniforms_type_size,
(nir_lower_io_options)0);
- multiplier = 4;
} else {
NIR_PASS_V(nir, nir_lower_io, nir_var_uniform,
st_unpacked_uniforms_type_size,
@@ -1018,7 +1016,9 @@ st_nir_lower_uniforms(struct st_context *st, nir_shader *nir)
}
if (nir->options->lower_uniforms_to_ubo)
- NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, multiplier);
+ NIR_PASS_V(nir, nir_lower_uniforms_to_ubo,
+ st->ctx->Const.PackedDriverUniformStorage,
+ !st->ctx->Const.NativeIntegers);
}
/* Last third of preparing nir from glsl, which happens after shader
diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c
index 638432d39cf..e00967d503e 100644
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@@ -338,7 +338,7 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)
* to UBO ordinarily, but it isn't as aggressive as we need. */
NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
- NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
+ NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
do {
progress = false;