gallivm: do per-element lod for lod bias and explicit derivs too

Except for explicit derivs with cube maps which are very bogus anyway. Just like explicit lod this is only used if no_quad_lod is set in GALLIVM_DEBUG env var. Minification is terrible on cpus which don't support true vector shifts (but should work correctly). Cannot do the min/mag filter decision (if they are different) per pixel though, only selecting different mip levels works. Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
author: Roland Scheidegger <sroland@vmware.com> 2013-08-22 19:05:00 +0200
committer: Roland Scheidegger <sroland@vmware.com> 2013-08-22 19:05:52 +0200
commit: bd0b6c518000efdba4898664835612315c2c3cd1 (patch)
tree: 409bc0514544ec6c67b96fbe06d0a3449fb238f3 /src/gallium/auxiliary/gallivm
parent: 33694a1800cdf6909f197d711e84f636602981da (diff)
2 files changed, 74 insertions, 31 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 6e5c4a1e48f..95541041e31 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -200,7 +200,7 @@ lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
  * Generate code to compute coordinate gradient (rho).
  * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
  *
- * The resulting rho is scalar per quad.
+ * The resulting rho has bld->levelf format (per quad or per element).
  */
 static LLVMValueRef
 lp_build_rho(struct lp_build_sample_context *bld,
@@ -230,13 +230,17 @@ lp_build_rho(struct lp_build_sample_context *bld,
    LLVMValueRef first_level, first_level_vec;
    unsigned length = coord_bld->type.length;
    unsigned num_quads = length / 4;
+   boolean rho_per_quad = levelf_bld->type.length != length;
    unsigned i;
    LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
    LLVMValueRef rho_xvec, rho_yvec;
 
    /* Note that all simplified calculations will only work for isotropic filtering */
 
-   assert(bld->num_lods != length);
+   /*
+    * rho calcs are always per quad except for explicit derivs (excluding
+    * the messy cube maps for now) when requested.
+    */
 
    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                  bld->gallivm, texture_unit);
@@ -247,11 +251,18 @@ lp_build_rho(struct lp_build_sample_context *bld,
    if (cube_rho) {
       LLVMValueRef cubesize;
       LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+
       /*
        * Cube map code did already everything except size mul and per-quad extraction.
+       * Luckily cube maps are always quadratic!
        */
-      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                      levelf_bld->type, cube_rho, 0);
+      if (rho_per_quad) {
+         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                         levelf_bld->type, cube_rho, 0);
+      }
+      else {
+         rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
+      }
       if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
          rho = lp_build_sqrt(levelf_bld, rho);
       }
@@ -290,29 +301,35 @@ lp_build_rho(struct lp_build_sample_context *bld,
             rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
             rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
          }
-         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
-         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                         levelf_bld->type, rho_vec, 0);
-         /*
-          * note that as long as we don't care about per-pixel lod could reduce math
-          * more (at some shuffle cost), but for now only do sqrt after packing.
-          */
+         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
+
+         if (rho_per_quad) {
+            /*
+             * note for this case without per-pixel lod could reduce math more
+             * (at some shuffle cost), but for now only do sqrt after packing,
+             * otherwise would also need different code to per-pixel lod case.
+             */
+            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                            levelf_bld->type, rho, 0);
+         }
          rho = lp_build_sqrt(levelf_bld, rho);
+
       }
       else {
-         rho_vec = ddmax[0];
+         rho = ddmax[0];
          if (dims > 1) {
-            rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[1]);
+            rho = lp_build_max(coord_bld, rho, ddmax[1]);
             if (dims > 2) {
-               rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[2]);
+               rho = lp_build_max(coord_bld, rho, ddmax[2]);
             }
          }
-         /*
-          * rho_vec now still contains per-pixel rho, convert to scalar per quad
-          * since we can't handle per-pixel rho/lod from now on (TODO).
-          */
-         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                         levelf_bld->type, rho_vec, 0);
+         if (rho_per_quad) {
+            /*
+             * rho_vec contains per-pixel rho, convert to scalar per quad.
+             */
+            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                            levelf_bld->type, rho, 0);
+         }
       }
    }
    else {
@@ -379,12 +396,25 @@ lp_build_rho(struct lp_build_sample_context *bld,
             ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
             rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
          }
+
          rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
          rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
-         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
+         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
 
-         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                         levelf_bld->type, rho_vec, 0);
+         if (rho_per_quad) {
+            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                            levelf_bld->type, rho, 0);
+         }
+         else {
+            /*
+             * on some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
+             * doing pack/sqrt/unpack/swizzle might be better for 8-wide case,
+             * same is true for cpus having faster scalars than 4-wide vecs
+             * for 4-wide case (where pack/unpack would be no-ops anyway).
+             * (Same is true really for cube_rho case above.)
+             */
+            rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
+         }
          rho = lp_build_sqrt(levelf_bld, rho);
       }
       else {
@@ -464,8 +494,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
                   }
                }
             }
-            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                            levelf_bld->type, rho, 0);
+            if (rho_per_quad) {
+               rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                               levelf_bld->type, rho, 0);
+            }
+            else {
+               rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
+            }
          }
          else {
             if (dims <= 1) {
@@ -491,6 +526,9 @@ lp_build_rho(struct lp_build_sample_context *bld,
                   }
                }
             }
+            if (!rho_per_quad) {
+               rho = lp_build_broadcast_scalar(levelf_bld, rho);
+            }
          }
       }
    }
@@ -729,8 +767,9 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
 
          /* add shader lod bias */
          if (lod_bias) {
-            lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
-                  levelf_bld->type, lod_bias, 0);
+            if (bld->num_lods != bld->coord_type.length)
+               lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+                                                    levelf_bld->type, lod_bias, 0);
             lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
          }
       }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index deb6ef429d5..743dd0a4501 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1942,7 +1942,9 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
     * There are other situations where at least the multiple int lods could be
     * avoided like min and max lod being equal.
     */
-   if (explicit_lod && lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
+   if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
+       (explicit_lod || lod_bias ||
+        (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE)) &&
        ((is_fetch && target != PIPE_BUFFER) ||
         (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
       bld.num_lods = type.length;
@@ -2140,9 +2142,11 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
          bld4.levelf_type.length = 1;
          bld4.leveli_type = lp_int_type(bld4.levelf_type);
 
-         if (explicit_lod && lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
-             ((is_fetch && target != PIPE_BUFFER) ||
-              (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
+         if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
+               (explicit_lod || lod_bias ||
+                (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE)) &&
+               ((is_fetch && target != PIPE_BUFFER) ||
+                (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
             bld4.num_lods = type4.length;
          else
             bld4.num_lods = 1;
author	Roland Scheidegger <sroland@vmware.com>	2013-08-22 19:05:00 +0200
committer	Roland Scheidegger <sroland@vmware.com>	2013-08-22 19:05:52 +0200
commit	bd0b6c518000efdba4898664835612315c2c3cd1 (patch)
tree	409bc0514544ec6c67b96fbe06d0a3449fb238f3 /src/gallium/auxiliary/gallivm
parent	33694a1800cdf6909f197d711e84f636602981da (diff)