Diffstat (limited to 'src/gallium/auxiliary/gallivm/lp_bld_arit.c')
-rw-r--r--  src/gallium/auxiliary/gallivm/lp_bld_arit.c | 545
1 file changed, 446 insertions(+), 99 deletions(-)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 9fc57629822..d226dab5b81 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.min.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.min.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.min.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.min.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pminu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pminu.b";
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmins.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmins.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
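When no matching intrinsic is found, the tail of lp_build_min_simple falls back to a compare plus per-lane select. A minimal scalar sketch of that fallback (illustrative C, not gallivm code; the helper name is made up):

   /* Lane-wise equivalent of the cmp/select fallback:
    * lp_build_cmp(PIPE_FUNC_LESS) yields a per-lane mask and
    * lp_build_select picks a or b for each lane. */
   static float min_fallback_sketch(float a, float b)
   {
      return a < b ? a : b;
   }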
@@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.max.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.max.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.max.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.max.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxu.b";
+ intr_size = 128;
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxs.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxs.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
@@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld,
}
-/** Return the scalar sum of the elements of a */
+/** Return the scalar sum of the elements of a.
+ * Callers should avoid this operation whenever possible.
+ */
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef index, res;
- unsigned i;
+ unsigned i, length;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef vecres, elem2;
assert(lp_check_value(type, a));
@@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld,
assert(!bld->type.norm);
- index = lp_build_const_int32(bld->gallivm, 0);
- res = LLVMBuildExtractElement(builder, a, index, "");
+ /*
+ * For byte vectors we could do much better with psadbw;
+ * we use repeated shuffles/adds here instead. Note that with
+ * multiple vectors this can be done more efficiently, as outlined
+ * in the Intel optimization manual.
+ * Note: could cause data rearrangement if used with smaller element
+ * sizes.
+ */
- for (i = 1; i < type.length; i++) {
- index = lp_build_const_int32(bld->gallivm, i);
- if (type.floating)
- res = LLVMBuildFAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
- else
- res = LLVMBuildAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
+ vecres = a;
+ length = type.length / 2;
+ while (length > 1) {
+ LLVMValueRef vec1, vec2;
+ for (i = 0; i < length; i++) {
+ shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
+ shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles2, length), "");
+ if (type.floating) {
+ vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
+ }
+ else {
+ vecres = LLVMBuildAdd(builder, vec1, vec2, "");
+ }
+ length = length >> 1;
}
+ /* always have vector of size 2 here */
+ assert(length == 1);
+
+ index = lp_build_const_int32(bld->gallivm, 0);
+ res = LLVMBuildExtractElement(builder, vecres, index, "");
+ index = lp_build_const_int32(bld->gallivm, 1);
+ elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
+
+ if (type.floating)
+ res = LLVMBuildFAdd(builder, res, elem2, "");
+ else
+ res = LLVMBuildAdd(builder, res, elem2, "");
+
return res;
}
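The loop above is a log2(N) tree reduction: each pass splits the running vector in half with two shuffles and adds the halves. A minimal scalar sketch of the same dataflow (plain C, assuming a power-of-two length; the helper name is made up):

   /* Each step adds the upper half onto the lower half, halving the
    * active length; the final add of the two remaining elements
    * corresponds to the extract/add tail of the function above. */
   static float horizontal_add_sketch(float v[], unsigned length)
   {
      unsigned half, i;
      for (half = length / 2; half >= 1; half /= 2) {
         for (i = 0; i < half; i++)
            v[i] += v[i + half];
      }
      return v[0];
   }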
+/**
+ * Return the horizontal sums of 4 float vectors as a float4 vector.
+ * This uses the technique outlined in the Intel Optimization Manual.
+ */
+static LLVMValueRef
+lp_build_horizontal_add4x4f(struct lp_build_context *bld,
+ LLVMValueRef src[4])
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef tmp[4];
+ LLVMValueRef sumtmp[2], shuftmp[2];
+
+ /* lower half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 5);
+ tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ /* upper half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 6);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
+ sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 2);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 6);
+ shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 5);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
+}
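To make the shuffle pattern concrete, here is the element flow for inputs a, b, c, d (src[0..3], lanes a0..a3 and so on), traced from the code above:

   /* tmp[0] = { a0, a1, b0, b1 }      tmp[1] = { a2, a3, b2, b3 }
    * tmp[2] = { c0, c1, d0, d1 }      tmp[3] = { c2, c3, d2, d3 }
    * sumtmp[0] = { a0+a2, a1+a3, b0+b2, b1+b3 }
    * sumtmp[1] = { c0+c2, c1+c3, d0+d2, d1+d3 }
    * shuftmp[0] = { a0+a2, b0+b2, c0+c2, d0+d2 }
    * shuftmp[1] = { a1+a3, b1+b3, c1+c3, d1+d3 }
    * result    = { sum(a), sum(b), sum(c), sum(d) }
    */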
+
+
+/*
+ * Partially horizontally add 2-4 float vectors with length nx4,
+ * i.e. only four adjacent values in each vector will be added,
+ * assuming values are really grouped in 4 which also determines
+ * output order.
+ *
+ * Return a vector of the same length as the initial vectors,
+ * with the excess elements (if any) being undefined.
+ * The element order is independent of number of input vectors.
+ * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
+ * the output order thus will be
+ * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
+ */
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef ret_vec;
+ LLVMValueRef tmp[4];
+ const char *intrinsic = NULL;
+
+ assert(num_vecs >= 2 && num_vecs <= 4);
+ assert(bld->type.floating);
+
+ /* only use this with at least 2 vectors, as it is sort of expensive
+ * (depending on cpu) and we always need two horizontal adds anyway,
+ * so a shuffle/add approach might be better.
+ */
+
+ tmp[0] = vectors[0];
+ tmp[1] = vectors[1];
+
+ tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
+ tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
+
+ if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
+ bld->type.length == 4) {
+ intrinsic = "llvm.x86.sse3.hadd.ps";
+ }
+ else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
+ bld->type.length == 8) {
+ intrinsic = "llvm.x86.avx.hadd.ps.256";
+ }
+ if (intrinsic) {
+ tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ if (num_vecs > 2) {
+ tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[2], tmp[3]);
+ }
+ else {
+ tmp[1] = tmp[0];
+ }
+ return lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ }
+
+ if (bld->type.length == 4) {
+ ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
+ }
+ else {
+ LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
+ unsigned j;
+ unsigned num_iter = bld->type.length / 4;
+ struct lp_type parttype = bld->type;
+ parttype.length = 4;
+ for (j = 0; j < num_iter; j++) {
+ LLVMValueRef partsrc[4];
+ unsigned i;
+ for (i = 0; i < 4; i++) {
+ partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
+ }
+ partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
+ }
+ ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
+ }
+ return ret_vec;
+}
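A hedged caller-side sketch (x, y, z are assumed to be float4 LLVMValueRefs and bld a matching lp_build_context; none of these names come from the patch):

   /* Sum three float4 vectors in one call; per the comment above,
    * the result lanes are { sum(x), sum(y), sum(z), undef }. */
   LLVMValueRef vecs[3] = { x, y, z };
   LLVMValueRef sums = lp_build_hadd_partial4(&bld, vecs, 3);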
/**
* Generate a - b
@@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
if(bld->type.floating) {
#if 0
/*
- * Power of two multiplication by directly manipulating the mantissa.
+ * Power of two multiplication by directly manipulating the exponent.
*
* XXX: This might not be always faster, it will introduce a small error
* for multiplication by zero, and it will produce wrong results
@@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld,
return LLVMConstUDiv(a, b);
}
- if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
+ if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
type.floating)
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
@@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld,
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
+ else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF) &&
+ (type.width == 8 || type.width == 16 || type.width == 32)) {
+ debug_printf("%s: inefficient code, should split vectors manually\n",
+ __FUNCTION__);
+ }
return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
@@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld,
else
{
/* signed int/norm/fixed point */
+ /* could use psign with ssse3 and appropriate vectors here */
LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
res = lp_build_select(bld, cond, bld->one, minus_one);
@@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld,
return LLVMBuildSIToFP(builder, a, vec_type, "");
}
+static boolean
+sse41_rounding_available(const struct lp_type type)
+{
+ if ((util_cpu_caps.has_sse4_1 &&
+ (type.length == 1 || type.width*type.length == 128)) ||
+ (util_cpu_caps.has_avx && type.width*type.length == 256))
+ return TRUE;
+ return FALSE;
+}
enum lp_build_round_sse41_mode
{
@@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld,
res = LLVMBuildExtractElement(builder, res, index0, "");
}
else {
- assert(type.width*type.length == 128);
-
- switch(type.width) {
- case 32:
- intrinsic = "llvm.x86.sse41.round.ps";
- break;
- case 64:
- intrinsic = "llvm.x86.sse41.round.pd";
- break;
- default:
- assert(0);
- return bld->undef;
+ if (type.width * type.length == 128) {
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.sse41.round.ps";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.sse41.round.pd";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
+ }
+ else {
+ assert(type.width * type.length == 256);
+ assert(util_cpu_caps.has_avx);
+
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.avx.round.ps.256";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.avx.round.pd.256";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
}
res = lp_build_intrinsic_binary(builder, intrinsic,
@@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
ret_type, arg);
}
else {
- assert(type.width*type.length == 128);
-
- intrinsic = "llvm.x86.sse2.cvtps2dq";
+ if (type.width * type.length == 128) {
+ intrinsic = "llvm.x86.sse2.cvtps2dq";
+ }
+ else {
+ assert(type.width*type.length == 256);
+ assert(util_cpu_caps.has_avx);
+ intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
+ }
res = lp_build_intrinsic_unary(builder, intrinsic,
ret_type, a);
}
@@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
}
else {
@@ -1183,8 +1471,7 @@ lp_build_round(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
}
else {
@@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1269,6 +1554,34 @@ lp_build_fract(struct lp_build_context *bld,
/**
+ * Prevent returning a fractional part of 1.0 for very small negative values of
+ * 'a' by clamping against 0.99999(9).
+ */
+static inline LLVMValueRef
+clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
+{
+ LLVMValueRef max;
+
+ /* this is the largest number smaller than 1.0 representable as float */
+ max = lp_build_const_vec(bld->gallivm, bld->type,
+ 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
+ return lp_build_min(bld, fract, max);
+}
+
+
+/**
+ * Same as lp_build_fract, but guarantees that the result is always smaller
+ * than one.
+ */
+LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a)
+{
+ return clamp_fract(bld, lp_build_fract(bld, a));
+}
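As a sanity check of the clamp constant (my arithmetic, not from the patch): for 32-bit floats lp_mantissa returns 23, so the constant is 1.0 - 1.0/2^24, which is exactly the largest float below 1.0. A minimal standalone check:

   #include <assert.h>
   #include <math.h>

   /* 1.0f - 2^-24 is exactly representable and equals
    * nextafterf(1.0f, 0.0f), the largest float below 1.0. */
   static void check_clamp_fract_const(void)
   {
      float max = 1.0f - 1.0f / (1 << 24);
      assert(max < 1.0f);
      assert(max == nextafterf(1.0f, 0.0f));
   }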
+
+
+/**
* Return the integer part of a float (vector) value (== round toward zero).
* The returned value is an integer (vector).
* Ex: itrunc(-1.5) = -1
@@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld,
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse2 &&
- ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+ if ((util_cpu_caps.has_sse2 &&
+ ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
return lp_build_iround_nearest_sse2(bld, a);
}
- else if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
- res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
- }
- else {
- res = a;
-
- if (type.sign) {
+ res = a;
+ if (type.sign) {
+ if (sse41_rounding_available(type)) {
+ res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+ }
+ else {
/* Take the sign bit and add it to 1 constant */
LLVMTypeRef vec_type = bld->vec_type;
unsigned mantissa = lp_mantissa(type);
@@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld,
* Combined ifloor() & fract().
*
* Preferred to calling the functions separately, as it will ensure that the
- * stratergy (floor() vs ifloor()) that results in less redundant work is used.
+ * strategy (floor() vs ifloor()) that results in less redundant work is used.
*/
void
lp_build_ifloor_fract(struct lp_build_context *bld,
@@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
/*
* floor() is easier.
*/
@@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
}
+/**
+ * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
+ * always smaller than one.
+ */
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart)
+{
+ lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
+ *out_fpart = clamp_fract(bld, *out_fpart);
+}
+
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a)
@@ -1519,10 +1843,14 @@ lp_build_sqrt(struct lp_build_context *bld,
assert(lp_check_value(type, a));
/* TODO: optimize the constant case */
- /* TODO: optimize the constant case */
assert(type.floating);
- util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ if (type.length == 1) {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
+ }
+ else {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ }
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
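For reference, the snprintf calls above produce LLVM's overloaded intrinsic names; for example, with type.width == 32:

   /* type.length == 1  ->  "llvm.sqrt.f32"
    * type.length == 4  ->  "llvm.sqrt.v4f32"
    * type.length == 8  ->  "llvm.sqrt.v8f32"
    */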
@@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld,
* - it doesn't even get the reciprocate of 1.0 exactly
* - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
* - for recent processors the benefit over DIVPS is marginal, a case
- * depedent
+ * dependent
*
* We could still use it on certain processors if benchmarks show that the
* RCPPS plus necessary workarounds are still preferable to DIVPS; or for
* particular uses that require less workarounds.
*/
- if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
const unsigned num_iterations = 0;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rcp.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rcp.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rcp_refine(bld, a, res);
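For context, lp_build_rcp_refine (defined elsewhere in this file) performs a Newton-Raphson step for the reciprocal; a scalar sketch of one iteration, assuming the standard formula:

   /* Given an estimate x ~= 1/a, one Newton-Raphson step
    * x' = x * (2 - a * x) roughly doubles the correct bits. */
   static float rcp_refine_sketch(float a, float x)
   {
      return x * (2.0f - a * x);
   }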
@@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld,
assert(type.floating);
- if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
const unsigned num_iterations = 1;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
+
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rsqrt.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rsqrt_refine(bld, a, res);
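Similarly, lp_build_rsqrt_refine applies Newton-Raphson for the reciprocal square root; a scalar sketch of one step, again assuming the standard formula:

   /* Given x ~= 1/sqrt(a), one step is
    * x' = x * (1.5 - 0.5 * a * x * x). */
   static float rsqrt_refine_sketch(float a, float x)
   {
      return x * (1.5f - 0.5f * a * x * x);
   }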