glsl: dfloor_to_arith WIP — fp64_floor

Status: works for positive values, but has problems with negative ones.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
author: Tapani Pälli <tapani.palli@intel.com> 2014-12-31 11:14:02 +0200
committer: Tapani Pälli <tapani.palli@intel.com> 2015-01-13 15:01:45 +0200
commit: 2e3909057e269b3466ceef8d2573abf82078e5c6 (patch)
tree: a7e5c88e39c86210487a5bedf197d1bce6344b1d
parent: 0602a7efefebe4da07bc3911aad3b9229b946d70 (diff)
3 files changed, 130 insertions, 1 deletions
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index 180ae6f0aaf..8f0f024a5bc 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -44,6 +44,7 @@
 #define DOPS_TO_DFRAC      0x1000
 #define DFREXP_DLDEXP_TO_ARITH    0x2000
 #define DSQRT_TO_FSQRT            0x4000
+#define DFLOOR_TO_ARITH           0x8000
 
 /**
  * \see class lower_packing_builtins_visitor
diff --git a/src/glsl/lower_instructions.cpp b/src/glsl/lower_instructions.cpp
index 7868be51cf6..febc87b424c 100644
--- a/src/glsl/lower_instructions.cpp
+++ b/src/glsl/lower_instructions.cpp
@@ -45,6 +45,7 @@
  * - SAT_TO_CLAMP
  * - DOPS_TO_DFRAC
  * - DSQRT_TO_FSQRT
+ * - DFLOOR_TO_ARITH
  *
  * SUB_TO_ADD_NEG:
  * ---------------
@@ -125,6 +126,10 @@
  * --------------
  * Splits double square root into exponent division and single precision
  * square root.
+ *
+ * DFLOOR_TO_ARITH
+ * ---------------
+ * Lowers double floor to integer math on the unpacked exponent/mantissa.
  */
 
 #include "main/core.h" /* for M_LOG2E */
@@ -170,6 +175,7 @@ private:
    void double_lrp(ir_expression *);
    void dceil_to_dfrac(ir_expression *);
    void dfloor_to_dfrac(ir_expression *);
+   void dfloor_to_arith(ir_expression *);
    void dround_even_to_dfrac(ir_expression *);
    void dtrunc_to_dfrac(ir_expression *);
    void dsign_to_csel(ir_expression *);
@@ -1095,6 +1101,120 @@ lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
    ir->operands[1] = new(ir) ir_dereference_variable(t2);
 }
 
+
+/**
+ * Lower double floor() to integer arithmetic, one component at a time.
+ *
+ * floor(|x|) is rebuilt from the exponent and the top 20 mantissa bits in
+ * the high dword of the unpacked double; the sign is applied afterwards.
+ * Known limitation: only |x| < 2^21 is handled (low mantissa dword ignored,
+ * magnitude goes through a 32-bit integer); Inf and NaN are not handled.
+ */
+void
+lower_instructions_visitor::dfloor_to_arith(ir_expression *ir)
+{
+   ir_instruction &i = *base_ir;
+   exec_list instructions;
+   ir_factory factory;
+   factory.instructions = &instructions;
+   factory.mem_ctx = ir;
+
+   const unsigned vec_elem = ir->type->vector_elements;
+   ir_rvalue *results[4] = {NULL};
+
+   for (unsigned elem = 0; elem < vec_elem; elem++) {
+      /* Scalar copies of this component and of its absolute value.  Going
+       * through temporaries lets each use below reference the variable
+       * rather than re-using the original operand's IR node.
+       */
+      ir_variable *x = factory.make_temp(glsl_type::double_type, "x");
+      ir_variable *abs_x = factory.make_temp(glsl_type::double_type, "abs_x");
+      factory.emit(assign(x, swizzle(ir->operands[0]->clone(ir, NULL),
+                                     elem, 1)));
+      factory.emit(assign(abs_x, abs(x)));
+
+      /* mag accumulates floor(|x|); 0.0 covers |x| < 1.0, including zero. */
+      ir_variable *mag = factory.make_temp(glsl_type::double_type, "mag");
+      factory.emit(assign(mag, new(ir) ir_constant(0.0, 1u)));
+
+      /* Split x into two dwords; .y holds exponent and upper mantissa. */
+      ir_variable *unpacked =
+         factory.make_temp(glsl_type::uvec2_type, "unpacked");
+      factory.emit(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
+      ir_rvalue *hi = swizzle_y(unpacked);
+
+      /* Unbiased exponent, kept as uint for use as a shift count and as int
+       * for signed comparisons, plus the top 20 mantissa bits.
+       */
+      ir_variable *exponent =
+         factory.make_temp(glsl_type::uint_type, "exponent");
+      ir_variable *iexp = factory.make_temp(glsl_type::int_type, "iexp");
+      ir_variable *mantissa =
+         factory.make_temp(glsl_type::uint_type, "mantissa");
+
+      /* exponent = (bits >> 20) & 0x7ff */
+      factory.emit(assign(exponent,
+                          bit_and(rshift(hi, factory.constant(20u)),
+                                  factory.constant(0x7ffu))));
+      /* mantissa = bits & 0xfffff; (20 last bits) */
+      factory.emit(assign(mantissa,
+                          bit_and(hi->clone(ir, NULL),
+                                  factory.constant(0xfffffu))));
+      /* remove the double bias */
+      factory.emit(assign(iexp, sub(expr(ir_unop_u2i, exponent), factory.constant(1023))));
+      factory.emit(assign(exponent, sub(exponent, factory.constant(1023u))));
+
+      /* exponent == 0 means 1.0 <= |x| < 2.0, so floor(|x|) = 1 */
+      factory.emit(if_tree(equal(iexp, factory.constant(0)),
+                           assign(mag, new(ir) ir_constant(1.0, 1u))));
+
+      /* calculate MANTISSA_BITS - exp */
+      ir_variable *nmb = factory.make_temp(glsl_type::uint_type, "nmb");
+      factory.emit(assign(nmb, sub(factory.constant(20u), exponent)));
+
+      /* exponent > 0: floor(|x|) = (1 << exp) + (mantissa >> nmb).  Both
+       * terms are evaluated unconditionally but only used under the guard.
+       */
+      ir_variable *a = factory.make_temp(glsl_type::uint_type, "a");
+      ir_variable *b = factory.make_temp(glsl_type::uint_type, "b");
+      factory.emit(assign(a, lshift(factory.constant(1u), exponent)));
+      factory.emit(assign(b, rshift(mantissa, nmb)));
+      factory.emit(if_tree(greater(iexp, factory.constant(0)),
+                           assign(mag, expr(ir_unop_i2d,
+                                            expr(ir_unop_u2i, add(a, b))))));
+
+      /* Apply the sign: floor(x) is -floor(|x|) for negative integers and
+       * -floor(|x|) - 1.0 for every other negative value.
+       */
+      ir_variable *result =
+         factory.make_temp(glsl_type::double_type, "result");
+      factory.emit(assign(result, mag));
+      factory.emit(if_tree(less(x, new(ir) ir_constant(0.0, 1u)),
+                           assign(result, neg(mag))));
+      factory.emit(if_tree(less(x, new(ir) ir_constant(0.0, 1u)),
+                           if_tree(greater(abs_x, mag),
+                                   assign(result,
+                                          sub(neg(mag),
+                                              new(ir) ir_constant(1.0, 1u))))));
+
+      results[elem] = new(ir) ir_dereference_variable(result);
+   }
+
+   i.insert_before(&instructions);
+
+   /* Put the dvec back together; operands past vec_elem stay NULL.
+    * NOTE(review): confirm a scalar double works as ir_quadop_vector.
+    */
+   ir->operation = ir_quadop_vector;
+   ir->operands[0] = results[0];
+   ir->operands[1] = results[1];
+   ir->operands[2] = results[2];
+   ir->operands[3] = results[3];
+
+   this->progress = true;
+}
+
+
 void
 lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
 {
@@ -1264,6 +1384,11 @@ lower_instructions_visitor::visit_leave(ir_expression *ir)
 	 div_to_mul_rcp(ir);
       break;
 
+   case ir_unop_floor:
+      if (lowering(DFLOOR_TO_ARITH) && ir->operands[0]->type->is_double())
+         dfloor_to_arith(ir);
+      break;
+
    case ir_unop_sqrt:
       if (lowering(DSQRT_TO_FSQRT) && ir->operands[0]->type->is_double())
          dsqrt_to_fsqrt(ir);
@@ -1336,10 +1461,12 @@ lower_instructions_visitor::visit_leave(ir_expression *ir)
          dceil_to_dfrac(ir);
       break;
 
+#if 0
    case ir_unop_floor:
       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
          dfloor_to_dfrac(ir);
       break;
+#endif
 
    case ir_unop_round_even:
       if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 58d6e77c379..cce29e06132 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -146,7 +146,8 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
                          bitfield_insert |
                          LDEXP_TO_ARITH |
                          DFREXP_DLDEXP_TO_ARITH |
-                         DSQRT_TO_FSQRT);
+                         DSQRT_TO_FSQRT |
+                         DFLOOR_TO_ARITH);
 
       /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
        * if-statements need to be flattened.
author	Tapani Pälli <tapani.palli@intel.com>	2014-12-31 11:14:02 +0200
committer	Tapani Pälli <tapani.palli@intel.com>	2015-01-13 15:01:45 +0200
commit	2e3909057e269b3466ceef8d2573abf82078e5c6 (patch)
tree	a7e5c88e39c86210487a5bedf197d1bce6344b1d
parent	0602a7efefebe4da07bc3911aad3b9229b946d70 (diff)