6 files changed, 332 insertions, 7 deletions
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index c989584cf41..acbcef7e2ed 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -173,6 +173,7 @@ GALLIVM_SOURCES := \
         gallivm/lp_bld_format_aos.c \
         gallivm/lp_bld_format_aos_array.c \
 	gallivm/lp_bld_format_float.c \
+        gallivm/lp_bld_format_srgb.c \
         gallivm/lp_bld_format_soa.c \
         gallivm/lp_bld_format_yuv.c \
         gallivm/lp_bld_gather.c \
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 7d6fe04f50d..e7955aa6bf4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -2896,7 +2896,7 @@ lp_build_log(struct lp_build_context *bld,
  * Generate polynomial.
  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
  */
-static LLVMValueRef
+LLVMValueRef
 lp_build_polynomial(struct lp_build_context *bld,
                     LLVMValueRef x,
                     const double *coeffs,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 920e339cda5..04e180c94fd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -239,6 +239,12 @@ lp_build_fast_rsqrt(struct lp_build_context *bld,
                     LLVMValueRef a);
 
 LLVMValueRef
+lp_build_polynomial(struct lp_build_context *bld,
+                    LLVMValueRef x,
+                    const double *coeffs,
+                    unsigned num_coeffs);
+
+LLVMValueRef
 lp_build_cos(struct lp_build_context *bld,
              LLVMValueRef a);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 12a03180180..744d0028941 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -158,4 +158,15 @@ lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
                          LLVMValueRef src,
                          LLVMValueRef *dst);
 
+LLVMValueRef
+lp_build_linear_to_srgb(struct gallivm_state *gallivm,
+                        struct lp_type src_type,
+                        LLVMValueRef src);
+
+LLVMValueRef
+lp_build_srgb_to_linear(struct gallivm_state *gallivm,
+                        struct lp_type src_type,
+                        LLVMValueRef src);
+
+
 #endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 4c6bd81047a..114ce03bbdc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -163,11 +163,23 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
           */
 
          if (type.floating) {
-            if(format_desc->channel[chan].normalized)
-               input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
-            else
-               input = LLVMBuildSIToFP(builder, input,
-                                       lp_build_vec_type(gallivm, type), "");
+            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+               assert(width == 8);
+               if (format_desc->swizzle[3] == chan) {
+                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
+               }
+               else {
+                  struct lp_type conv_type = lp_uint_type(type);
+                  input = lp_build_srgb_to_linear(gallivm, conv_type, input);
+               }
+            }
+            else {
+               if(format_desc->channel[chan].normalized)
+                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
+               else
+                  input = LLVMBuildSIToFP(builder, input,
+                                          lp_build_vec_type(gallivm, type), "");
+            }
          }
          else if (format_desc->channel[chan].pure_integer) {
             /* Nothing to do */
@@ -344,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
 
    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
        format_desc->block.width == 1 &&
        format_desc->block.height == 1 &&
@@ -394,7 +407,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
       packed = lp_build_gather(gallivm, type.length,
                                format_desc->block.bits,
                                type.width, base_ptr, offset,
-			       FALSE);
+                               FALSE);
       if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
          lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
       }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
new file mode 100644
index 00000000000..217aaa99838
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c
@@ -0,0 +1,294 @@
+/**************************************************************************
+ *
+ * Copyright 2013 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Format conversion code for srgb formats.
+ *
+ * Functions for converting from srgb to linear and vice versa.
+ * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
+ *
+ * srgb->linear:
+ * cl = cs / 12.92,                 cs <= 0.04045
+ * cl = ((cs + 0.055)/1.055)^2.4,   cs >  0.04045
+ *
+ * linear->srgb:
+ * if (isnan(cl)) {
+ *    Map IEEE-754 Not-a-number to zero.
+ *    cs = 0.0;
+ * } else if (cl > 1.0) {
+ *    cs = 1.0;
+ * } else if (cl < 0.0) {
+ *    cs = 0.0;
+ * } else if (cl < 0.0031308) {
+ *    cs = 12.92 * cl;
+ * } else {
+ *    cs = 1.055 * pow(cl, 0.41666) - 0.055;
+ * }
+ *
+ * This does not need to be accurate, however at least for d3d10
+ * (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
+ * 1) For srgb->linear, it is required that the error on the srgb side is
+ *    not larger than 0.5f, which I interpret that if you map the value back
+ *    to srgb from linear using the ideal conversion, it would not be off by
+ *    more than 0.5f (that is, it would map to the same 8-bit integer value
+ *    as it was before conversion to linear).
+ * 2) linear->srgb is permitted an error of 0.6f, which luckily is quite a
+ *    large tolerance.
+ * 3) Additionally, all srgb values converted to linear and back must result
+ *    in the same value as they were originally.
+ *
+ * @author Roland Scheidegger <sroland@vmware.com>
+ */
+
+
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_bitarit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_format.h"
+
+
+
+/**
+ * Convert srgb int values to linear float values.
+ * Several possibilities how to do this, e.g.
+ * - table
+ * - doing the pow() with int-to-float and float-to-int tricks
+ *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
+ * - just using standard polynomial approximation, which is what is done here
+ *   (3rd order polynomial is required for crappy but just sufficient accuracy)
+ * @param src_type  type of src; its width must be 32 (asserted).
+ * @param src   integer (vector) value(s), 8 bit values unpacked to 32 bit already.
+ * @return float (vector) linear value(s), one per element of src.
+ */
+LLVMValueRef
+lp_build_srgb_to_linear(struct gallivm_state *gallivm,
+                        struct lp_type src_type,
+                        LLVMValueRef src)
+{
+   struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
+   struct lp_build_context f32_bld;
+   LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh;
+   double coeffs[4] = {0.0023f,
+                       0.0030f / 255.0f,
+                       0.6935f / (255.0f * 255.0f),
+                       0.3012f / (255.0f * 255.0f * 255.0f)
+   };
+
+   assert(src_type.width == 32);
+
+   lp_build_context_init(&f32_bld, gallivm, f32_type);
+
+   /*
+    * using polynomial: (x * (x * (x * 0.3012 + 0.6935) + 0.0030) + 0.0023)
+    * ( poly =  0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023), x = src / 255
+    * (found with octave polyfit and some magic as I couldn't get the error
+    * function right). Using the above mentioned error function, the values stay
+    * within +-0.35, except for the lowest values - hence tweaking linear segment
+    * to cover the first 16 instead of the first 11 values (the error stays
+    * just about acceptable there too).
+    * Hence: lin = src > 15 ? poly : x / 12.6
+    * This function really only makes sense for vectors, should use LUT otherwise.
+    * All in all (including float conversion) 11 instructions (with sse4.1),
+    * 6 constants (polynomial could be done with 1 instruction less at the cost
+    * of slightly worse dependency chain, fma should also help).
+    */
+   /* the 1/255 normalization is folded into lin_const and coeffs */
+   srcf = lp_build_int_to_float(&f32_bld, src);
+   lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f));
+   part_lin = lp_build_mul(&f32_bld, srcf, lin_const);
+
+   part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4);
+
+   lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
+   is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh);
+   return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
+}
+
+
+/**
+ * Convert linear float values to srgb int values.
+ * Several possibilities how to do this, e.g.
+ * - use table (based on exponent/highest order mantissa bits) and do
+ *   linear interpolation (https://gist.github.com/rygorous/2203834)
+ * - Chebyshev polynomial, or approximation using reciprocals
+ * - using int-to-float and float-to-int tricks for pow()
+ *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
+ * @param src_type  type of src (used for all constants and the build context).
+ * @param src   float (vector) value(s) to convert; clamped to [0, 1] first.
+ * @return srgb value(s) scaled by 255 and rounded to integer (lp_build_iround).
+ */
+LLVMValueRef
+lp_build_linear_to_srgb(struct gallivm_state *gallivm,
+                        struct lp_type src_type,
+                        LLVMValueRef src)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context f32_bld;
+   LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;
+
+   lp_build_context_init(&f32_bld, gallivm, src_type);
+
+   src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);
+
+   if (0) {
+      /*
+       * Disabled: int-to-float and float-to-int trick for pow().
+       * This is much more accurate than necessary thanks to the correction,
+       * but it most certainly makes no sense without rsqrt available.
+       * Bonus points if you understand how this works...
+       * All in all (including min/max clamp, conversion) 19 instructions.
+       */
+
+      float exp_f = 2.0f / 3.0f;
+      float coeff_f = 0.62996f;
+      LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
+      struct lp_type int_type = lp_int_type(src_type);
+
+      /*
+       * First calculate approx x^(8/12)
+       */
+      exponent = lp_build_const_vec(gallivm, src_type, exp_f);
+      coeff = lp_build_const_vec(gallivm, src_type,
+                                 exp2f(127.0f / exp_f - 127.0f) *
+                                 powf(coeff_f, 1.0f / exp_f));
+
+      /* premultiply src */
+      tmp = lp_build_mul(&f32_bld, coeff, src);
+      /* "log2" */
+      tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), "");
+      tmp = lp_build_int_to_float(&f32_bld, tmp);
+      /* multiply for pow */
+      tmp = lp_build_mul(&f32_bld, tmp, exponent);
+      /* "exp2" */
+      pow_approx = lp_build_itrunc(&f32_bld, tmp);
+      pow_approx = LLVMBuildBitCast(builder, pow_approx,
+                                    lp_build_vec_type(gallivm, src_type), "");
+
+      /*
+       * Since that pow was inaccurate (like 3 bits, though each sqrt step would
+       * give another bit), compensate the error (which is why we chose another
+       * exponent in the first place).
+       */
+      /* x * x^(8/12) = x^(20/12) */
+      pow_1 = lp_build_mul(&f32_bld, pow_approx, src);
+
+      /* x * x * x^(-4/12) = x^(20/12) */
+      /* Should avoid using rsqrt if it's not available, but
+       * using x * x^(4/12) * x^(4/12) instead will change error weight */
+      tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
+      x2 = lp_build_mul(&f32_bld, src, src);
+      pow_2 = lp_build_mul(&f32_bld, x2, tmp);
+
+      /* average the values so the errors cancel out, compensate bias,
+       * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
+       * for conversion to int in here */
+      tmp = lp_build_add(&f32_bld, pow_1, pow_2);
+      coeff = lp_build_const_vec(gallivm, src_type,
+                                 1.0f / (3.0f * coeff_f) * 0.999852f *
+                                 powf(1.055f * 255.0f, 4.0f));
+      pow_final = lp_build_mul(&f32_bld, tmp, coeff);
+
+      /* x^(5/12) = rsqrt(rsqrt(x^(20/12))) */
+      if (lp_build_fast_rsqrt_available(src_type)) {
+         pow_final = lp_build_fast_rsqrt(&f32_bld,
+                        lp_build_fast_rsqrt(&f32_bld, pow_final));
+      }
+      else {
+         pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
+      }
+      pow_final = lp_build_add(&f32_bld, pow_final,
+                               lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));
+   }
+
+   else {
+      /*
+       * using "rational polynomial" approximation here.
+       * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
+       * factoring in the 255.0 mul and the scaling mul.
+       * (a is closer to actual value so has higher weight than b.)
+       * Note: the constants are magic values. They were found empirically,
+       * possibly could be improved but good enough (be VERY careful with
+       * error metric if you'd want to tweak them, they also MUST fit with
+       * the crappy polynomial above for srgb->linear since it is required
+       * that each srgb value maps back to the same value).
+       * This function has an error of max +-0.17 (and we'd only require +-0.6),
+       * for the approximated srgb->linear values the error is naturally larger
+       * (+-0.42) but still accurate enough (required +-0.5 essentially).
+       * All in all (including min/max clamp, conversion) 15 instructions.
+       * FMA would help (minus 2 instructions).
+       */
+
+      LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;
+
+      if (lp_build_fast_rsqrt_available(src_type)) {
+         tmp = lp_build_fast_rsqrt(&f32_bld, src);
+         x05 = lp_build_mul(&f32_bld, src, tmp);
+      }
+      else {
+         /*
+          * I don't really expect this to be practical without rsqrt
+          * but there's no reason for triple punishment so at least
+          * save the otherwise resulting division and unnecessary mul...
+          */
+         x05 = lp_build_sqrt(&f32_bld, src);
+      }
+
+      tmp = lp_build_mul(&f32_bld, x05, src);
+      if (lp_build_fast_rsqrt_available(src_type)) {
+         x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
+      }
+      else {
+         x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
+      }
+
+      a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);
+      b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);
+      c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);
+
+      tmp = lp_build_mul(&f32_bld, a_const, x0375);
+      tmp2 = lp_build_mul(&f32_bld, b_const, x05);
+      tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
+      pow_final = lp_build_add(&f32_bld, tmp, tmp2);
+   }
+
+   /* linear part is easy */
+   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
+   lin = lp_build_mul(&f32_bld, src, lin_const);
+
+   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
+   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);
+   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
+
+   f32_bld.type.sign = 0;
+   return lp_build_iround(&f32_bld, tmp);
+}