author    Roland Scheidegger <sroland@vmware.com>    2013-08-29 03:58:18 +0200
committer Roland Scheidegger <sroland@vmware.com>    2013-08-30 02:16:45 +0200
commit    81cfcdbd87940914fc3c59acd0e43c4f6efb3bb7 (patch)
tree      d2dfec96dd8e5cd436a09fd5751fd4af8bbe6d82 /src/gallium/auxiliary/gallivm
parent    10e40ad11d5ed7f2d286837f92f8b63547f3db0b (diff)
gallivm: don't calculate square root of rho if we use accurate rho method
While a sqrt here and there shouldn't hurt much (depending on the cpu), it is possible to omit it completely, since rho is only used for calculating lod, and there log2(x) == 0.5*log2(x^2). Depending on the exact path taken for calculating lod, this means we get a simple mul instead of a sqrt (with a nearest mip filter we in fact don't need to replace the sqrt with anything at all); the only path where this doesn't work is not very useful anyway (combined brilinear calculation of int level and fractional lod: accurate rho calculation combined with brilinear filtering seems an odd choice).

Apart from being faster, as an added bonus this should improve our crappy fractional accuracy of lod, since fast_log2 is only good for ~3 bits and this gains one bit (though not if the dimension is just one, as we never had the squared rho there in the first place and would need an extra mul).

v2: use separate ilog2_sqrt function if we have squared rho.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
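For illustration, here is a minimal standalone C sketch of the identity this commit relies on (editorial example, not part of the patch; the helper names are made up). The sqrt inside rho can be folded into the log2 that computes the lod, becoming a mul by 0.5:

#include <math.h>
#include <stdio.h>

/* naive: take the square root first, then the log2 */
static float lod_with_sqrt(float rho_sq)
{
   return log2f(sqrtf(rho_sq));
}

/* optimized: log2(x) == 0.5*log2(x^2), so the sqrt becomes a mul */
static float lod_without_sqrt(float rho_sq)
{
   return 0.5f * log2f(rho_sq);
}

int main(void)
{
   float rho_sq = 20.25f;  /* rho == 4.5 */
   /* both print ~2.169925 */
   printf("%f %f\n", lod_with_sqrt(rho_sq), lod_without_sqrt(rho_sq));
   return 0;
}

Both functions return the same value for any positive rho_sq, which is why lp_build_rho can simply return rho squared on the accurate (GALLIVM_DEBUG_NO_RHO_APPROX) path and let the lod computation absorb the sqrt.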
Diffstat (limited to 'src/gallium/auxiliary/gallivm')
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_sample.c  113
1 file changed, 74 insertions, 39 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index e1cfd78e885..9b0a92c9cb9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -232,6 +232,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
unsigned length = coord_bld->type.length;
unsigned num_quads = length / 4;
boolean rho_per_quad = rho_bld->type.length != length;
+ boolean no_rho_opt = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1);
unsigned i;
LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
LLVMValueRef rho_xvec, rho_yvec;
@@ -264,12 +265,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
else {
rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
}
- if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
- rho = lp_build_sqrt(rho_bld, rho);
- }
/* Could optimize this for the single quad case: just skip the broadcast */
cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
rho_bld->type, float_size, index0);
+ if (no_rho_opt) {
+ /* skipping the sqrt, so rho stays squared; square the size factor
+ * too, since cubesize^2 * rho^2 == (cubesize * rho)^2 */
+ cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
+ }
rho = lp_build_mul(rho_bld, cubesize, rho);
}
else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
@@ -281,7 +283,11 @@ lp_build_rho(struct lp_build_sample_context *bld,
floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
coord_bld->type, float_size, indexi);
- if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+ /*
+ * note that for the rho_per_quad case we could reduce the math (at some
+ * shuffle cost), but for now use the same code as the per-pixel lod case.
+ */
+ if (no_rho_opt) {
ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
@@ -295,7 +301,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
}
}
- if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+ if (no_rho_opt) {
rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
if (dims > 2) {
@@ -303,19 +309,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
}
rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
-
- if (rho_per_quad) {
- /*
- * note for this case without per-pixel lod could reduce math more
- * (at some shuffle cost), but for now only do sqrt after packing,
- * otherwise would also need different code to per-pixel lod case.
- */
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- rho_bld->type, rho, 0);
- }
- rho = lp_build_sqrt(rho_bld, rho);
-
- }
+ /* skipping the sqrt, hence returning rho squared */
+ }
else {
rho = ddmax[0];
if (dims > 1) {
@@ -324,13 +319,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
rho = lp_build_max(coord_bld, rho, ddmax[2]);
}
}
- if (rho_per_quad) {
- /*
- * rho_vec contains per-pixel rho, convert to scalar per quad.
- */
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- rho_bld->type, rho, 0);
- }
+ }
+ if (rho_per_quad) {
+ /*
+ * rho_vec contains per-pixel rho, convert to scalar per quad.
+ */
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ rho_bld->type, rho, 0);
}
}
else {
@@ -362,7 +357,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
}
}
- if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
+ if (no_rho_opt) {
static const unsigned char swizzle01[] = { /* no-op swizzle */
0, 1,
LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
@@ -407,16 +402,9 @@ lp_build_rho(struct lp_build_sample_context *bld,
rho_bld->type, rho, 0);
}
else {
- /*
- * on some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
- * doing pack/sqrt/unpack/swizzle might be better for 8-wide case,
- * same is true for cpus having faster scalars than 4-wide vecs
- * for 4-wide case (where pack/unpack would be no-ops anyway).
- * (Same is true really for cube_rho case above.)
- */
rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
}
- rho = lp_build_sqrt(rho_bld, rho);
+ /* skipping the sqrt, hence returning rho squared */
}
else {
ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
@@ -636,7 +624,7 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
/*
* The pre factor will make the intersections with the exact powers of two
- * happen precisely where we want then to be, which means that the integer
+ * happen precisely where we want them to be, which means that the integer
* part will not need any post adjustments.
*/
rho = lp_build_mul(bld, rho,
@@ -666,6 +654,34 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
/**
+ * Fast implementation of iround(log2(sqrt(x))), based on
+ * log2(x^n) == n*log2(x).
+ *
+ * Gives correctly rounded results for all positive finite inputs.
+ * (Could be trivially extended to handle other power-of-two roots.)
+ */
+static LLVMValueRef
+lp_build_ilog2_sqrt(struct lp_build_context *bld,
+ LLVMValueRef x)
+{
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMValueRef ipart;
+ struct lp_type i_type = lp_int_type(bld->type);
+ LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);
+
+ assert(bld->type.floating);
+
+ assert(lp_check_value(bld->type, x));
+
+ /* ipart = round(log2(sqrt(x))) = floor(0.5*(log2(x) + 1.0)) */
+ ipart = lp_build_extract_exponent(bld, x, 1);
+ ipart = LLVMBuildAShr(builder, ipart, one, "");
+
+ return ipart;
+}
+
+
+/**
* Generate code to compute texture level of detail (lambda).
* \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
* \param lod_bias optional float vector with the shader lod bias
@@ -740,6 +756,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
}
else {
LLVMValueRef rho;
+ boolean rho_squared = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
+ (bld->dims > 1);
rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
@@ -760,16 +778,28 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
/*
- * Don't actually need both all the time, ipart is needed
- * for nearest mipfilter, pos_or_zero if min != mag.
+ * Don't actually need both values all the time, lod_ipart is
+ * needed for nearest mipfilter, lod_positive if min != mag.
*/
- *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
+ if (rho_squared) {
+ *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho);
+ }
+ else {
+ *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
+ }
*out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
rho, lodf_bld->one);
return;
}
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
- !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
+ !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR) &&
+ !rho_squared) {
+ /*
+ * This can't work if rho is squared. Not sure if it could be
+ * fixed while keeping it worthwhile; we could also do a sqrt
+ * here, but brilinear combined with no_rho_opt seems like a
+ * combination that makes little sense anyway, so just use the
+ * ordinary path below.
+ */
lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
out_lod_ipart, out_lod_fpart);
*out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
@@ -784,6 +814,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
else {
lod = lp_build_fast_log2(lodf_bld, rho);
}
+ if (rho_squared) {
+ /* log2(x) == 0.5*log2(x^2) */
+ lod = lp_build_mul(lodf_bld, lod,
+ lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F));
+ }
/* add shader lod bias */
if (lod_bias) {
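
As a closing illustration, here is a simplified scalar C analogue (an editorial sketch, not the vectorized gallivm code) of what lp_build_ilog2_sqrt above computes. lp_build_extract_exponent(bld, x, 1) yields floor(log2(x)) + 1 straight from the float's exponent field, and the arithmetic shift right halves it:

#include <stdint.h>
#include <string.h>

/* iround(log2(sqrt(x))) for normalized, finite x > 0, using only the
 * exponent bits. ((bits >> 23) & 0xff) - 127 is floor(log2(x)); the +1
 * mirrors the bias argument of lp_build_extract_exponent(bld, x, 1).
 * floor((floor(log2(x)) + 1) / 2) equals floor(0.5*log2(x) + 0.5) for
 * any mantissa, which is why the result is exactly rounded. */
static int ilog2_sqrt_scalar(float x)
{
   uint32_t bits;
   int exp_plus_1;

   memcpy(&bits, &x, sizeof bits);  /* bit-cast, avoids aliasing issues */
   exp_plus_1 = (int)((bits >> 23) & 0xff) - 127 + 1;
   /* >> on a negative int is implementation-defined in C but arithmetic
    * on common compilers; the generated LLVM IR uses AShr, which is
    * always arithmetic. */
   return exp_plus_1 >> 1;
}

For example, ilog2_sqrt_scalar(20.25f) returns 2: floor(log2(20.25)) is 4, and (4 + 1) >> 1 is 2, matching iround(log2(sqrt(20.25))) = iround(log2(4.5)) = iround(2.17).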