Diffstat (limited to 'src/gallium/auxiliary/gallivm')
36 files changed, 3837 insertions, 1210 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 9fc57629822..d226dab5b81 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { - LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; const char *intrinsic = NULL; + unsigned intr_size; LLVMValueRef cond; assert(lp_check_value(type, a)); @@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld, /* TODO: optimize the constant case */ - if(type.width * type.length == 128) { - if(type.floating) { - if(type.width == 32 && util_cpu_caps.has_sse) + if (type.floating && util_cpu_caps.has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.min.ss"; + intr_size = 128; + } + else if (type.length <= 4 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse.min.ps"; - if(type.width == 64 && util_cpu_caps.has_sse2) + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.min.ps.256"; + intr_size = 256; + } + } + if (type.width == 64 && util_cpu_caps.has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.min.sd"; + intr_size = 128; + } + else if (type.length == 2 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse2.min.pd"; + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.min.pd.256"; + intr_size = 256; + } } - else { - if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pminu.b"; - if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) + } + else if (util_cpu_caps.has_sse2 && type.length >= 2) { + intr_size = 128; + if ((type.width == 8 || type.width == 16) && + (type.width * type.length <= 64) && + (gallivm_debug & GALLIVM_DEBUG_PERF)) { + debug_printf("%s: inefficient code, bogus shuffle due to packing\n", + __FUNCTION__); + } + if (type.width == 8 && !type.sign) { + intrinsic = "llvm.x86.sse2.pminu.b"; + } + else if (type.width == 16 && type.sign) { + intrinsic = "llvm.x86.sse2.pmins.w"; + } + if (util_cpu_caps.has_sse4_1) { + if (type.width == 8 && type.sign) { intrinsic = "llvm.x86.sse41.pminsb"; - if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 16 && !type.sign) { intrinsic = "llvm.x86.sse41.pminuw"; - if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pmins.w"; - if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && !type.sign) { intrinsic = "llvm.x86.sse41.pminud"; - if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && type.sign) { intrinsic = "llvm.x86.sse41.pminsd"; + } } } - if(intrinsic) - return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); + if(intrinsic) { + return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, + type, + intr_size, a, b); + } cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); return lp_build_select(bld, cond, a, b); @@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { - LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; const char *intrinsic = NULL; + unsigned intr_size; LLVMValueRef cond; assert(lp_check_value(type, a)); @@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld, /* TODO: optimize the constant case */ - if(type.width * type.length == 
128) { - if(type.floating) { - if(type.width == 32 && util_cpu_caps.has_sse) + if (type.floating && util_cpu_caps.has_sse) { + if (type.width == 32) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse.max.ss"; + intr_size = 128; + } + else if (type.length <= 4 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse.max.ps"; - if(type.width == 64 && util_cpu_caps.has_sse2) + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.max.ps.256"; + intr_size = 256; + } + } + if (type.width == 64 && util_cpu_caps.has_sse2) { + if (type.length == 1) { + intrinsic = "llvm.x86.sse2.max.sd"; + intr_size = 128; + } + else if (type.length == 2 || !util_cpu_caps.has_avx) { intrinsic = "llvm.x86.sse2.max.pd"; + intr_size = 128; + } + else { + intrinsic = "llvm.x86.avx.max.pd.256"; + intr_size = 256; + } } - else { - if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pmaxu.b"; - if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1) + } + else if (util_cpu_caps.has_sse2 && type.length >= 2) { + intr_size = 128; + if ((type.width == 8 || type.width == 16) && + (type.width * type.length <= 64) && + (gallivm_debug & GALLIVM_DEBUG_PERF)) { + debug_printf("%s: inefficient code, bogus shuffle due to packing\n", + __FUNCTION__); + } + if (type.width == 8 && !type.sign) { + intrinsic = "llvm.x86.sse2.pmaxu.b"; + intr_size = 128; + } + else if (type.width == 16 && type.sign) { + intrinsic = "llvm.x86.sse2.pmaxs.w"; + } + if (util_cpu_caps.has_sse4_1) { + if (type.width == 8 && type.sign) { intrinsic = "llvm.x86.sse41.pmaxsb"; - if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 16 && !type.sign) { intrinsic = "llvm.x86.sse41.pmaxuw"; - if(type.width == 16 && type.sign && util_cpu_caps.has_sse2) - intrinsic = "llvm.x86.sse2.pmaxs.w"; - if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && !type.sign) { intrinsic = "llvm.x86.sse41.pmaxud"; - if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1) + } + if (type.width == 32 && type.sign) { intrinsic = "llvm.x86.sse41.pmaxsd"; + } } } - if(intrinsic) - return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); + if(intrinsic) { + return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, + type, + intr_size, a, b); + } cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); return lp_build_select(bld, cond, a, b); @@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld, } -/** Return the scalar sum of the elements of a */ +/** Return the scalar sum of the elements of a. + * Should avoid this operation whenever possible. + */ LLVMValueRef -lp_build_sum_vector(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_horizontal_add(struct lp_build_context *bld, + LLVMValueRef a) { LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; LLVMValueRef index, res; - unsigned i; + unsigned i, length; + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2]; + LLVMValueRef vecres, elem2; assert(lp_check_value(type, a)); @@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld, assert(!bld->type.norm); - index = lp_build_const_int32(bld->gallivm, 0); - res = LLVMBuildExtractElement(builder, a, index, ""); + /* + * for byte vectors can do much better with psadbw. + * Using repeated shuffle/adds here. 
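For illustration only (not part of this patch): the repeated shuffle/add reduction that lp_build_horizontal_add builds out of LLVM shufflevector/add pairs, written as scalar C. It assumes a power-of-two vector length, as the real code does; the helper name is hypothetical.

```c
/* Hypothetical scalar sketch of lp_build_horizontal_add's reduction:
 * fold the upper half of the vector onto the lower half, halving the
 * width each step, until a single element remains. */
#include <stdio.h>

static float
horizontal_add_f32(float v[], unsigned length)   /* length: power of two */
{
   unsigned i, half;

   while (length > 1) {
      half = length / 2;
      for (i = 0; i < half; i++)
         v[i] += v[i + half];   /* vec1 + vec2 in the LLVM version */
      length = half;
   }
   return v[0];
}

int
main(void)
{
   float v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
   printf("%g\n", horizontal_add_f32(v, 8));   /* prints 36 */
   return 0;
}
```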
Note with multiple vectors + * this can be done more efficiently as outlined in the intel + * optimization manual. + * Note: could cause data rearrangement if used with smaller element + * sizes. + */ - for (i = 1; i < type.length; i++) { - index = lp_build_const_int32(bld->gallivm, i); - if (type.floating) - res = LLVMBuildFAdd(builder, res, - LLVMBuildExtractElement(builder, - a, index, ""), - ""); - else - res = LLVMBuildAdd(builder, res, - LLVMBuildExtractElement(builder, - a, index, ""), - ""); + vecres = a; + length = type.length / 2; + while (length > 1) { + LLVMValueRef vec1, vec2; + for (i = 0; i < length; i++) { + shuffles1[i] = lp_build_const_int32(bld->gallivm, i); + shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length); + } + vec1 = LLVMBuildShuffleVector(builder, vecres, vecres, + LLVMConstVector(shuffles1, length), ""); + vec2 = LLVMBuildShuffleVector(builder, vecres, vecres, + LLVMConstVector(shuffles2, length), ""); + if (type.floating) { + vecres = LLVMBuildFAdd(builder, vec1, vec2, ""); + } + else { + vecres = LLVMBuildAdd(builder, vec1, vec2, ""); + } + length = length >> 1; } + /* always have vector of size 2 here */ + assert(length == 1); + + index = lp_build_const_int32(bld->gallivm, 0); + res = LLVMBuildExtractElement(builder, vecres, index, ""); + index = lp_build_const_int32(bld->gallivm, 1); + elem2 = LLVMBuildExtractElement(builder, vecres, index, ""); + + if (type.floating) + res = LLVMBuildFAdd(builder, res, elem2, ""); + else + res = LLVMBuildAdd(builder, res, elem2, ""); + return res; } +/** + * Return the horizontal sums of 4 float vectors as a float4 vector. + * This uses the technique as outlined in Intel Optimization Manual. + */ +static LLVMValueRef +lp_build_horizontal_add4x4f(struct lp_build_context *bld, + LLVMValueRef src[4]) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef shuffles[4]; + LLVMValueRef tmp[4]; + LLVMValueRef sumtmp[2], shuftmp[2]; + + /* lower half of regs */ + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = lp_build_const_int32(gallivm, 1); + shuffles[2] = lp_build_const_int32(gallivm, 4); + shuffles[3] = lp_build_const_int32(gallivm, 5); + tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1], + LLVMConstVector(shuffles, 4), ""); + tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3], + LLVMConstVector(shuffles, 4), ""); + + /* upper half of regs */ + shuffles[0] = lp_build_const_int32(gallivm, 2); + shuffles[1] = lp_build_const_int32(gallivm, 3); + shuffles[2] = lp_build_const_int32(gallivm, 6); + shuffles[3] = lp_build_const_int32(gallivm, 7); + tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1], + LLVMConstVector(shuffles, 4), ""); + tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3], + LLVMConstVector(shuffles, 4), ""); + + sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], ""); + sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], ""); + + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = lp_build_const_int32(gallivm, 2); + shuffles[2] = lp_build_const_int32(gallivm, 4); + shuffles[3] = lp_build_const_int32(gallivm, 6); + shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], + LLVMConstVector(shuffles, 4), ""); + + shuffles[0] = lp_build_const_int32(gallivm, 1); + shuffles[1] = lp_build_const_int32(gallivm, 3); + shuffles[2] = lp_build_const_int32(gallivm, 5); + shuffles[3] = lp_build_const_int32(gallivm, 7); + shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], + 
LLVMConstVector(shuffles, 4), ""); + + return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], ""); +} + + +/* + * partially horizontally add 2-4 float vectors with length nx4, + * i.e. only four adjacent values in each vector will be added, + * assuming values are really grouped in 4 which also determines + * output order. + * + * Return a vector of the same length as the initial vectors, + * with the excess elements (if any) being undefined. + * The element order is independent of number of input vectors. + * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7 + * the output order thus will be + * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef + */ +LLVMValueRef +lp_build_hadd_partial4(struct lp_build_context *bld, + LLVMValueRef vectors[], + unsigned num_vecs) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef ret_vec; + LLVMValueRef tmp[4]; + const char *intrinsic = NULL; + + assert(num_vecs >= 2 && num_vecs <= 4); + assert(bld->type.floating); + + /* only use this with at least 2 vectors, as it is sort of expensive + * (depending on cpu) and we always need two horizontal adds anyway, + * so a shuffle/add approach might be better. + */ + + tmp[0] = vectors[0]; + tmp[1] = vectors[1]; + + tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; + tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0]; + + if (util_cpu_caps.has_sse3 && bld->type.width == 32 && + bld->type.length == 4) { + intrinsic = "llvm.x86.sse3.hadd.ps"; + } + else if (util_cpu_caps.has_avx && bld->type.width == 32 && + bld->type.length == 8) { + intrinsic = "llvm.x86.avx.hadd.ps.256"; + } + if (intrinsic) { + tmp[0] = lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, bld->type), + tmp[0], tmp[1]); + if (num_vecs > 2) { + tmp[1] = lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, bld->type), + tmp[2], tmp[3]); + } + else { + tmp[1] = tmp[0]; + } + return lp_build_intrinsic_binary(builder, intrinsic, + lp_build_vec_type(gallivm, bld->type), + tmp[0], tmp[1]); + } + + if (bld->type.length == 4) { + ret_vec = lp_build_horizontal_add4x4f(bld, tmp); + } + else { + LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4]; + unsigned j; + unsigned num_iter = bld->type.length / 4; + struct lp_type parttype = bld->type; + parttype.length = 4; + for (j = 0; j < num_iter; j++) { + LLVMValueRef partsrc[4]; + unsigned i; + for (i = 0; i < 4; i++) { + partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4); + } + partres[j] = lp_build_horizontal_add4x4f(bld, partsrc); + } + ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter); + } + return ret_vec; +} /** * Generate a - b @@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld, if(bld->type.floating) { #if 0 /* - * Power of two multiplication by directly manipulating the mantissa. + * Power of two multiplication by directly manipulating the exponent. 
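The `#if 0` block this hunk touches multiplies by a power of two by adjusting the exponent field directly. A scalar sketch of that trick, for illustration only (hypothetical helper; it breaks for zero, denormals, Inf and NaN, exactly as the XXX comment that follows warns):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Multiply a float by 2^n by adding n to the biased exponent field
 * (bits 23..30 of an IEEE-754 binary32).  No safety checks. */
static float
mul_pow2(float x, int n)
{
   uint32_t bits;

   memcpy(&bits, &x, sizeof bits);   /* type-pun without aliasing UB */
   bits += (uint32_t)n << 23;
   memcpy(&x, &bits, sizeof x);
   return x;
}

int
main(void)
{
   printf("%g\n", mul_pow2(3.0f, 4));   /* prints 48 */
   return 0;
}
```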
* * XXX: This might not be always faster, it will introduce a small error * for multiplication by zero, and it will produce wrong results @@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld, return LLVMConstUDiv(a, b); } - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 && + if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && type.floating) return lp_build_mul(bld, a, lp_build_rcp(bld, b)); @@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld, return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); } } + else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 && + (gallivm_debug & GALLIVM_DEBUG_PERF) && + (type.width == 8 || type.width == 16 || type.width == 32)) { + debug_printf("%s: inefficient code, should split vectors manually\n", + __FUNCTION__); + } return lp_build_max(bld, a, LLVMBuildNeg(builder, a, "")); } @@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld, else { /* signed int/norm/fixed point */ + /* could use psign with sse3 and appropriate vectors here */ LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); res = lp_build_select(bld, cond, bld->one, minus_one); @@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld, return LLVMBuildSIToFP(builder, a, vec_type, ""); } +static boolean +sse41_rounding_available(const struct lp_type type) +{ + if ((util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) || + (util_cpu_caps.has_avx && type.width*type.length == 256)) + return TRUE; + return FALSE; +} enum lp_build_round_sse41_mode { @@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld, res = LLVMBuildExtractElement(builder, res, index0, ""); } else { - assert(type.width*type.length == 128); - - switch(type.width) { - case 32: - intrinsic = "llvm.x86.sse41.round.ps"; - break; - case 64: - intrinsic = "llvm.x86.sse41.round.pd"; - break; - default: - assert(0); - return bld->undef; + if (type.width * type.length == 128) { + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ps"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.pd"; + break; + default: + assert(0); + return bld->undef; + } + } + else { + assert(type.width * type.length == 256); + assert(util_cpu_caps.has_avx); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.avx.round.ps.256"; + break; + case 64: + intrinsic = "llvm.x86.avx.round.pd.256"; + break; + default: + assert(0); + return bld->undef; + } } res = lp_build_intrinsic_binary(builder, intrinsic, @@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld, ret_type, arg); } else { - assert(type.width*type.length == 128); - - intrinsic = "llvm.x86.sse2.cvtps2dq"; + if (type.width* type.length == 128) { + intrinsic = "llvm.x86.sse2.cvtps2dq"; + } + else { + assert(type.width*type.length == 256); + assert(util_cpu_caps.has_avx); + intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; + } res = lp_build_intrinsic_unary(builder, intrinsic, ret_type, a); } @@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); } else { @@ -1183,8 +1471,7 @@ 
lp_build_round(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { @@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); } else { @@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { @@ -1269,6 +1554,34 @@ lp_build_fract(struct lp_build_context *bld, /** + * Prevent returning a fractional part of 1.0 for very small negative values of + * 'a' by clamping against 0.99999(9). + */ +static inline LLVMValueRef +clamp_fract(struct lp_build_context *bld, LLVMValueRef fract) +{ + LLVMValueRef max; + + /* this is the largest number smaller than 1.0 representable as float */ + max = lp_build_const_vec(bld->gallivm, bld->type, + 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1))); + return lp_build_min(bld, fract, max); +} + + +/** + * Same as lp_build_fract, but guarantees that the result is always smaller + * than one. + */ +LLVMValueRef +lp_build_fract_safe(struct lp_build_context *bld, + LLVMValueRef a) +{ + return clamp_fract(bld, lp_build_fract(bld, a)); +} + + +/** * Return the integer part of a float (vector) value (== round toward zero). * The returned value is an integer (vector). 
* Ex: itrunc(-1.5) = -1 @@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld, assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse2 && - ((type.width == 32) && (type.length == 1 || type.length == 4))) { + if ((util_cpu_caps.has_sse2 && + ((type.width == 32) && (type.length == 1 || type.length == 4))) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { return lp_build_iround_nearest_sse2(bld, a); } - else if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { @@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { - res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); - } - else { - res = a; - - if (type.sign) { + res = a; + if (type.sign) { + if (sse41_rounding_available(type)) { + res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); + } + else { /* Take the sign bit and add it to 1 constant */ LLVMTypeRef vec_type = bld->vec_type; unsigned mantissa = lp_mantissa(type); @@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { @@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld, * Combined ifloor() & fract(). * * Preferred to calling the functions separately, as it will ensure that the - * stratergy (floor() vs ifloor()) that results in less redundant work is used. + * strategy (floor() vs ifloor()) that results in less redundant work is used. */ void lp_build_ifloor_fract(struct lp_build_context *bld, @@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && - (type.length == 1 || type.width*type.length == 128)) { + if (sse41_rounding_available(type)) { /* * floor() is easier. */ @@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld, } +/** + * Same as lp_build_ifloor_fract, but guarantees that the fractional part is + * always smaller than one. 
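clamp_fract above clamps against 1.0 - 1.0/2^(mantissa+1). For 32-bit floats (23-bit mantissa) that constant is exactly the largest representable value below 1.0, which a quick check confirms (illustrative only, not part of the patch):

```c
#include <assert.h>
#include <math.h>

int
main(void)
{
   /* 1.0 - 1.0/2^(23 + 1), the clamp constant clamp_fract builds */
   float max_fract = (float)(1.0 - 1.0 / (1LL << (23 + 1)));

   /* it is the float immediately below 1.0 */
   assert(max_fract == nextafterf(1.0f, 0.0f));
   assert(max_fract < 1.0f);
   return 0;
}
```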
+ */ +void +lp_build_ifloor_fract_safe(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart) +{ + lp_build_ifloor_fract(bld, a, out_ipart, out_fpart); + *out_fpart = clamp_fract(bld, *out_fpart); +} + + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a) @@ -1519,10 +1843,14 @@ lp_build_sqrt(struct lp_build_context *bld, assert(lp_check_value(type, a)); /* TODO: optimize the constant case */ - /* TODO: optimize the constant case */ assert(type.floating); - util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); + if (type.length == 1) { + util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width); + } + else { + util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width); + } return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); } @@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld, * - it doesn't even get the reciprocate of 1.0 exactly * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf * - for recent processors the benefit over DIVPS is marginal, a case - * depedent + * dependent * * We could still use it on certain processors if benchmarks show that the * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for * particular uses that require less workarounds. */ - if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){ const unsigned num_iterations = 0; LLVMValueRef res; unsigned i; + const char *intrinsic = NULL; - res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); + if (type.length == 4) { + intrinsic = "llvm.x86.sse.rcp.ps"; + } + else { + intrinsic = "llvm.x86.avx.rcp.ps.256"; + } + + res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); for (i = 0; i < num_iterations; ++i) { res = lp_build_rcp_refine(bld, a, res); @@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld, assert(type.floating); - if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || + (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { const unsigned num_iterations = 1; LLVMValueRef res; unsigned i; + const char *intrinsic = NULL; + + if (type.length == 4) { + intrinsic = "llvm.x86.sse.rsqrt.ps"; + } + else { + intrinsic = "llvm.x86.avx.rsqrt.ps.256"; + } + + res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); - res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); for (i = 0; i < num_iterations; ++i) { res = lp_build_rsqrt_refine(bld, a, res); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index aeb987ff352..60b9907e60f 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -57,8 +57,13 @@ lp_build_add(struct lp_build_context *bld, LLVMValueRef b); LLVMValueRef -lp_build_sum_vector(struct lp_build_context *bld, - LLVMValueRef a); +lp_build_horizontal_add(struct lp_build_context *bld, + LLVMValueRef a); + +LLVMValueRef +lp_build_hadd_partial4(struct lp_build_context *bld, + LLVMValueRef vectors[], + unsigned num_vecs); LLVMValueRef lp_build_sub(struct lp_build_context *bld, @@ -157,6 +162,10 @@ lp_build_fract(struct 
lp_build_context *bld, LLVMValueRef a); LLVMValueRef +lp_build_fract_safe(struct lp_build_context *bld, + LLVMValueRef a); + +LLVMValueRef lp_build_ifloor(struct lp_build_context *bld, LLVMValueRef a); LLVMValueRef @@ -177,6 +186,12 @@ lp_build_ifloor_fract(struct lp_build_context *bld, LLVMValueRef *out_ipart, LLVMValueRef *out_fpart); +void +lp_build_ifloor_fract_safe(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart); + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c index 59e8fb2ed6e..35799a1ef8e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_const.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c @@ -37,6 +37,7 @@ #include "util/u_debug.h" #include "util/u_math.h" +#include "util/u_half.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -50,10 +51,12 @@ lp_mantissa(struct lp_type type) if(type.floating) { switch(type.width) { + case 16: + return 10; case 32: return 23; case 64: - return 53; + return 52; default: assert(0); return 0; @@ -136,6 +139,8 @@ lp_const_min(struct lp_type type) if (type.floating) { switch(type.width) { + case 16: + return -65504; case 32: return -FLT_MAX; case 64: @@ -169,6 +174,8 @@ lp_const_max(struct lp_type type) if (type.floating) { switch(type.width) { + case 16: + return 65504; case 32: return FLT_MAX; case 64: @@ -196,6 +203,8 @@ lp_const_eps(struct lp_type type) { if (type.floating) { switch(type.width) { + case 16: + return 2E-10; case 32: return FLT_EPSILON; case 64: @@ -247,7 +256,9 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type) elem_type = lp_build_elem_type(gallivm, type); - if(type.floating) + if(type.floating && type.width == 16) + elems[0] = LLVMConstInt(elem_type, util_float_to_half(1.0f), 0); + else if(type.floating) elems[0] = LLVMConstReal(elem_type, 1.0); else if(type.fixed) elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0); @@ -292,7 +303,9 @@ lp_build_const_elem(struct gallivm_state *gallivm, LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type); LLVMValueRef elem; - if(type.floating) { + if(type.floating && type.width == 16) { + elem = LLVMConstInt(elem_type, util_float_to_half((float)val), 0); + } else if(type.floating) { elem = LLVMConstReal(elem_type, val); } else { @@ -364,20 +377,10 @@ lp_build_const_aos(struct gallivm_state *gallivm, if(swizzle == NULL) swizzle = default_swizzle; - if(type.floating) { - elems[swizzle[0]] = LLVMConstReal(elem_type, r); - elems[swizzle[1]] = LLVMConstReal(elem_type, g); - elems[swizzle[2]] = LLVMConstReal(elem_type, b); - elems[swizzle[3]] = LLVMConstReal(elem_type, a); - } - else { - double dscale = lp_const_scale(type); - - elems[swizzle[0]] = LLVMConstInt(elem_type, round(r*dscale), 0); - elems[swizzle[1]] = LLVMConstInt(elem_type, round(g*dscale), 0); - elems[swizzle[2]] = LLVMConstInt(elem_type, round(b*dscale), 0); - elems[swizzle[3]] = LLVMConstInt(elem_type, round(a*dscale), 0); - } + elems[swizzle[0]] = lp_build_const_elem(gallivm, type, r); + elems[swizzle[1]] = lp_build_const_elem(gallivm, type, g); + elems[swizzle[2]] = lp_build_const_elem(gallivm, type, b); + elems[swizzle[3]] = lp_build_const_elem(gallivm, type, a); for(i = 4; i < type.length; ++i) elems[i] = elems[i % 4]; @@ -452,7 +455,7 @@ lp_build_const_string(struct gallivm_state *gallivm, /** * Build a callable function pointer. 
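The lp_bld_const.c hunks in this range add IEEE binary16 cases: a 10-bit mantissa and a finite range of ±65504. As a sanity check (illustrative, not from the patch), the limit follows directly from the format: the largest finite half is (2 - 2^-10) * 2^15.

```c
#include <assert.h>

int
main(void)
{
   /* binary16: 10 mantissa bits, 5 exponent bits, bias 15,
    * largest finite exponent 15 -> (2 - 2^-10) * 2^15 = 65504 */
   double max_half = (2.0 - 1.0 / 1024.0) * 32768.0;

   assert(max_half == 65504.0);
   return 0;
}
```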
* - * We this casts instead of LLVMAddGlobalMapping() + * We use function pointer constants instead of LLVMAddGlobalMapping() * to work around a bug in LLVM 2.6, and for efficiency/simplicity. */ LLVMValueRef diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 0973e1f16f3..0399709faad 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -70,6 +70,66 @@ #include "lp_bld_arit.h" #include "lp_bld_pack.h" #include "lp_bld_conv.h" +#include "lp_bld_logic.h" + + +/** + * Converts int16 half-float to float32 + * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?) + * [llvm.x86.vcvtph2ps / _mm_cvtph_ps] + * + * @param src_type <vector> type of int16 + * @param src value to convert + * + * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/ + */ +LLVMValueRef +lp_build_half_to_float(struct gallivm_state *gallivm, + struct lp_type src_type, + LLVMValueRef src) +{ + struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length); + struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length); + + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type); + LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type); + + /* Constants */ + LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13); + LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16); + LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff); + LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff); + LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23); + LLVMValueRef f32_magic = LLVMBuildBitCast(builder, + lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23), + float_vec_type, ""); + + /* Convert int16 vector to int32 vector by zero ext */ + LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, ""); + + /* Exponent / mantissa bits */ + LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, ""); + LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, ""); + + /* Exponent adjust */ + LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, ""); + + /* Make sure Inf/NaN survive */ + LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan); + LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, ""); + + /* Sign bit */ + LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, ""); + LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, ""); + + /* Combine result */ + LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, ""); + LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, ""); + + /* Cast from int32 vector to float32 vector */ + return LLVMBuildBitCast(builder, final, float_vec_type, ""); +} /** @@ -334,6 +394,8 @@ lp_build_conv(struct gallivm_state *gallivm, dst_type.width == 8 && dst_type.length == 16 && + 4 * num_dsts == num_srcs && + util_cpu_caps.has_sse2) { struct lp_build_context bld; @@ -371,6 +433,76 @@ lp_build_conv(struct gallivm_state *gallivm, return; } + /* Special case 2x8f --> 1x16ub + */ + else if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 8 && + + 
dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.sign == 0 && + dst_type.norm == 1 && + dst_type.width == 8 && + dst_type.length == 16 && + + 2 * num_dsts == num_srcs && + + util_cpu_caps.has_avx) { + + struct lp_build_context bld; + struct lp_type int16_type = dst_type; + struct lp_type int32_type = dst_type; + LLVMValueRef const_255f; + unsigned i; + + lp_build_context_init(&bld, gallivm, src_type); + + int16_type.width *= 2; + int16_type.length /= 2; + int16_type.sign = 1; + + int32_type.width *= 4; + int32_type.length /= 4; + int32_type.sign = 1; + + const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); + + for (i = 0; i < num_dsts; ++i, src += 2) { + LLVMValueRef lo, hi, a, b; + + a = LLVMBuildFMul(builder, src[0], const_255f, ""); + b = LLVMBuildFMul(builder, src[1], const_255f, ""); + + a = lp_build_iround(&bld, a); + b = lp_build_iround(&bld, b); + + tmp[0] = lp_build_extract_range(gallivm, a, 0, 4); + tmp[1] = lp_build_extract_range(gallivm, a, 4, 4); + tmp[2] = lp_build_extract_range(gallivm, b, 0, 4); + tmp[3] = lp_build_extract_range(gallivm, b, 4, 4); + + /* relying on clamping behavior of sse2 intrinsics here */ + lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); + hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); + dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi); + } + return; + } + + /* Pre convert half-floats to floats + */ + else if (src_type.floating && src_type.width == 16) + { + for(i = 0; i < num_tmps; ++i) + tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]); + + tmp_type.width = 32; + } + /* * Clamp if necessary */ @@ -580,7 +712,7 @@ lp_build_conv(struct gallivm_state *gallivm, * This will convert the integer masks that match the given types. * * The mask values should 0 or -1, i.e., all bits either set to zero or one. - * Any other value will likely cause in unpredictable results. + * Any other value will likely cause unpredictable results. * * This is basically a very trimmed down version of lp_build_conv. */ @@ -591,8 +723,6 @@ lp_build_conv_mask(struct gallivm_state *gallivm, const LLVMValueRef *src, unsigned num_srcs, LLVMValueRef *dst, unsigned num_dsts) { - /* Register width must remain constant */ - assert(src_type.width * src_type.length == dst_type.width * dst_type.length); /* We must not loose or gain channels. 
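lp_build_half_to_float above vectorizes a well-known bit trick (Giesen's "half to float done quick", linked in its comment). A scalar C rendering of the same steps, for reference only; the function name is hypothetical:

```c
#include <stdint.h>
#include <string.h>

static float
half_to_float(uint16_t h)
{
   uint32_t expmant = h & 0x7fffu;            /* exponent + mantissa bits */
   uint32_t justsign = h ^ expmant;           /* isolated sign bit */
   uint32_t shifted = expmant << 13;          /* move into binary32 position */
   uint32_t magic_bits = (254u - 15u) << 23;  /* rebias 15 -> 127 via multiply */
   uint32_t bits;
   float f, magic;

   memcpy(&magic, &magic_bits, sizeof magic);
   memcpy(&f, &shifted, sizeof f);
   f *= magic;                                /* exponent adjust, fixes denormals */
   memcpy(&bits, &f, sizeof bits);

   if (expmant > 0x7bffu)                     /* was Inf/NaN: force max exponent */
      bits |= 0xffu << 23;
   bits |= justsign << 16;                    /* restore the sign bit */

   memcpy(&f, &bits, sizeof f);
   return f;
}
```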
Only precision */ assert(src_type.length * num_srcs == dst_type.length * num_dsts); @@ -617,16 +747,5 @@ lp_build_conv_mask(struct gallivm_state *gallivm, * Truncate or expand bit width */ - if(src_type.width > dst_type.width) { - assert(num_dsts == 1); - dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs); - } - else if(src_type.width < dst_type.width) { - assert(num_srcs == 1); - lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts); - } - else { - assert(num_srcs == num_dsts); - memcpy(dst, src, num_dsts * sizeof *dst); - } + lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h index cec655980fa..c830fbef5f2 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h @@ -42,6 +42,10 @@ struct lp_type; +LLVMValueRef +lp_build_half_to_float(struct gallivm_state *gallivm, + struct lp_type src_type, + LLVMValueRef src); LLVMValueRef lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp index 444b70a678c..93505f3da45 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp @@ -35,10 +35,8 @@ #if HAVE_LLVM >= 0x0300 #include <llvm/Support/TargetRegistry.h> -#include <llvm/Support/TargetSelect.h> #else /* HAVE_LLVM < 0x0300 */ #include <llvm/Target/TargetRegistry.h> -#include <llvm/Target/TargetSelect.h> #endif /* HAVE_LLVM < 0x0300 */ #if HAVE_LLVM >= 0x0209 @@ -183,7 +181,7 @@ lp_disassemble(const void* func) /* * Limit disassembly to this extent */ - const uint64_t extent = 0x10000; + const uint64_t extent = 96 * 1024; uint64_t max_pc = 0; @@ -200,24 +198,6 @@ lp_disassemble(const void* func) std::string Error; const Target *T = TargetRegistry::lookupTarget(Triple, Error); -#if HAVE_LLVM >= 0x0208 - InitializeNativeTargetAsmPrinter(); -#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) - LLVMInitializeX86AsmPrinter(); -#elif defined(PIPE_ARCH_ARM) - LLVMInitializeARMAsmPrinter(); -#elif defined(PIPE_ARCH_PPC) - LLVMInitializePowerPCAsmPrinter(); -#endif - -#if HAVE_LLVM >= 0x0301 - InitializeNativeTargetDisassembler(); -#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) - LLVMInitializeX86Disassembler(); -#elif defined(PIPE_ARCH_ARM) - LLVMInitializeARMDisassembler(); -#endif - #if HAVE_LLVM >= 0x0300 OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(Triple)); #else diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c index d2b3713ed2d..30da44e5b9c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c @@ -131,6 +131,15 @@ lp_build_mask_check(struct lp_build_mask_context *mask) value = lp_build_mask_value(mask); + /* + * XXX this doesn't quite generate the most efficient code possible, if + * the masks are vectors which have all bits set to the same value + * in each element. + * movmskps/pmovmskb would be more efficient to get the required value + * into ordinary reg (certainly with 8 floats). + * Not sure if llvm could figure that out on its own. 
+ */ + /* cond = (mask == 0) */ cond = LLVMBuildICmp(builder, LLVMIntEQ, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h index 04142d905b1..3608a68202f 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h @@ -67,6 +67,13 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef i, LLVMValueRef j); +LLVMValueRef +lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + LLVMValueRef base_ptr, + LLVMValueRef offset); + /* * SoA diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index e4b8da6bcfd..9591bcfb2c7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -470,6 +470,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, return lp_build_format_swizzle_aos(format_desc, &bld, res); } + /* If all channels are of same type and we are not using half-floats */ + if (util_format_is_array(format_desc)) { + return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset); + } + /* * YUV / subsampled formats */ @@ -601,7 +606,6 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, return res; } - /* * Fallback to util_format_description::fetch_rgba_float(). */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c new file mode 100644 index 00000000000..b8ec379d76f --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c @@ -0,0 +1,102 @@ +/************************************************************************** + * + * Copyright 2012 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "lp_bld_const.h" +#include "lp_bld_struct.h" +#include "lp_bld_format.h" +#include "lp_bld_debug.h" +#include "lp_bld_type.h" +#include "lp_bld_conv.h" +#include "lp_bld_pack.h" + +#include "util/u_memory.h" +#include "util/u_format.h" +#include "pipe/p_state.h" + +/** + * @brief lp_build_fetch_rgba_aos_array + * + * \param format_desc describes format of the image we're fetching from + * \param dst_type output type + * \param base_ptr address of the pixel block (or the texel if uncompressed) + * \param offset ptr offset + */ +LLVMValueRef +lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type dst_type, + LLVMValueRef base_ptr, + LLVMValueRef offset) +{ + struct lp_build_context bld; + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef src_elem_type, src_vec_type; + LLVMValueRef ptr, res = NULL; + struct lp_type src_type; + + memset(&src_type, 0, sizeof src_type); + src_type.floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT; + src_type.fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED; + src_type.sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED; + src_type.norm = format_desc->channel[0].normalized; + src_type.width = format_desc->channel[0].size; + src_type.length = format_desc->nr_channels; + + assert(src_type.length <= dst_type.length); + + src_elem_type = lp_build_elem_type(gallivm, src_type); + src_vec_type = lp_build_vec_type(gallivm, src_type); + + /* Read whole vector from memory, unaligned */ + if (!res) { + ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, ""); + ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), ""); + res = LLVMBuildLoad(builder, ptr, ""); + lp_set_load_alignment(res, src_type.width / 8); + } + + /* Truncate doubles to float */ + if (src_type.floating && src_type.width == 64) { + src_type.width = 32; + src_vec_type = lp_build_vec_type(gallivm, src_type); + + res = LLVMBuildFPTrunc(builder, res, src_vec_type, ""); + } + + /* Expand to correct length */ + if (src_type.length < dst_type.length) { + res = lp_build_pad_vector(gallivm, res, src_type, dst_type.length); + src_type.length = dst_type.length; + } + + /* Convert to correct format */ + lp_build_conv(gallivm, src_type, dst_type, &res, 1, &res, 1); + + /* Swizzle it */ + lp_build_context_init(&bld, gallivm, dst_type); + return lp_build_format_swizzle_aos(format_desc, &bld, res); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 0a57b3ce794..afeb34079bf 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -359,7 +359,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, */ if (util_format_fits_8unorm(format_desc) && - type.floating && type.width == 32 && type.length == 4) { + type.floating && type.width == 32 && + (type.length == 1 || (type.length % 4 == 0))) { struct lp_type tmp_type; LLVMValueRef tmp; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c index ccc83207004..f77eb1212b1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -84,7 +84,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm, * per element. 
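The AoS fetch and conversion paths in this part of the diff ultimately scale between floats and 8-bit unorms (note the const_255f multiply and the reliance on the pack instructions' clamping earlier in the diff). A scalar sketch of both directions, for illustration only:

```c
#include <stdint.h>

static uint8_t
float_to_unorm8(float f)
{
   float v = f * 255.0f + 0.5f;     /* scale and round to nearest */
   if (v < 0.0f)   v = 0.0f;        /* the clamping the pack path relies on */
   if (v > 255.0f) v = 255.0f;
   return (uint8_t)v;
}

static float
unorm8_to_float(uint8_t u)
{
   return (float)u * (1.0f / 255.0f);
}
```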
Didn't measure performance but cuts shader size * by quite a bit (less difference if cpu has no sse4.1 support). */ - if (util_cpu_caps.has_sse2 && n == 4) { + if (util_cpu_caps.has_sse2 && n > 1) { LLVMValueRef sel, tmp, tmp2; struct lp_build_context bld32; @@ -152,7 +152,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm, * per element. Didn't measure performance but cuts shader size * by quite a bit (less difference if cpu has no sse4.1 support). */ - if (util_cpu_caps.has_sse2 && n == 4) { + if (util_cpu_caps.has_sse2 && n > 1) { LLVMValueRef sel, tmp; struct lp_build_context bld32; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 768d935dae5..5bf4bcfab3b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -26,15 +26,44 @@ **************************************************************************/ +#include "pipe/p_config.h" #include "pipe/p_compiler.h" #include "util/u_cpu_detect.h" #include "util/u_debug.h" #include "util/u_memory.h" #include "util/u_simple_list.h" +#include "lp_bld.h" #include "lp_bld_debug.h" +#include "lp_bld_misc.h" #include "lp_bld_init.h" +#include <llvm-c/Analysis.h> #include <llvm-c/Transforms/Scalar.h> +#include <llvm-c/BitWriter.h> + + +/** + * AVX is supported in: + * - standard JIT from LLVM 3.2 onwards + * - MC-JIT from LLVM 3.1 + * - MC-JIT supports limited OSes (MacOSX and Linux) + * - standard JIT in LLVM 3.1, with backports + */ +#if HAVE_LLVM >= 0x0301 && (defined(PIPE_OS_LINUX) || defined(PIPE_OS_APPLE)) +# define USE_MCJIT 1 +# define HAVE_AVX 1 +#elif HAVE_LLVM >= 0x0302 || (HAVE_LLVM == 0x0301 && defined(HAVE_JIT_AVX_SUPPORT)) +# define USE_MCJIT 0 +# define HAVE_AVX 1 +#else +# define USE_MCJIT 0 +# define HAVE_AVX 0 +#endif + + +#if USE_MCJIT +void LLVMLinkInMCJIT(); +#endif #ifdef DEBUG @@ -57,6 +86,8 @@ DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags, static boolean gallivm_initialized = FALSE; +unsigned lp_native_vector_width; + /* * Optimization values are: @@ -81,25 +112,13 @@ enum LLVM_CodeGenOpt_Level { }; +#if HAVE_LLVM <= 0x0206 /** - * LLVM 2.6 permits only one ExecutionEngine to be created. This is it. - */ -static LLVMExecutionEngineRef GlobalEngine = NULL; - -/** - * Same gallivm state shared by all contexts. + * LLVM 2.6 permits only one ExecutionEngine to be created. So use the + * same gallivm state everywhere. 
*/ static struct gallivm_state *GlobalGallivm = NULL; - - - - -extern void -lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE); - -extern void -lp_set_target_options(void); - +#endif /** @@ -111,6 +130,7 @@ static boolean create_pass_manager(struct gallivm_state *gallivm) { assert(!gallivm->passmgr); + assert(gallivm->target); gallivm->passmgr = LLVMCreateFunctionPassManager(gallivm->provider); if (!gallivm->passmgr) @@ -174,33 +194,37 @@ free_gallivm_state(struct gallivm_state *gallivm) &mod, &error); #endif + if (gallivm->passmgr) { + LLVMDisposePassManager(gallivm->passmgr); + } + #if 0 /* XXX this seems to crash with all versions of LLVM */ if (gallivm->provider) LLVMDisposeModuleProvider(gallivm->provider); #endif - if (gallivm->passmgr) - LLVMDisposePassManager(gallivm->passmgr); - -#if HAVE_LLVM >= 0x207 - if (gallivm->module) - LLVMDisposeModule(gallivm->module); -#endif - -#if 0 - /* Don't free the exec engine, it's a global/singleton */ - if (gallivm->engine) + if (HAVE_LLVM >= 0x207 && gallivm->engine) { + /* This will already destroy any associated module */ LLVMDisposeExecutionEngine(gallivm->engine); -#endif + } else { + LLVMDisposeModule(gallivm->module); + } -#if 0 +#if !USE_MCJIT /* Don't free the TargetData, it's owned by the exec engine */ - LLVMDisposeTargetData(gallivm->target); +#else + if (gallivm->target) { + LLVMDisposeTargetData(gallivm->target); + } #endif + /* Never free the LLVM context. + */ +#if 0 if (gallivm->context) LLVMContextDispose(gallivm->context); +#endif if (gallivm->builder) LLVMDisposeBuilder(gallivm->builder); @@ -215,37 +239,14 @@ free_gallivm_state(struct gallivm_state *gallivm) } -/** - * Allocate gallivm LLVM objects. - * \return TRUE for success, FALSE for failure - */ static boolean -init_gallivm_state(struct gallivm_state *gallivm) +init_gallivm_engine(struct gallivm_state *gallivm) { - assert(!gallivm->context); - assert(!gallivm->module); - assert(!gallivm->provider); - - lp_build_init(); - - gallivm->context = LLVMContextCreate(); - if (!gallivm->context) - goto fail; - - gallivm->module = LLVMModuleCreateWithNameInContext("gallivm", - gallivm->context); - if (!gallivm->module) - goto fail; - - gallivm->provider = - LLVMCreateModuleProviderForExistingModule(gallivm->module); - if (!gallivm->provider) - goto fail; - - if (!GlobalEngine) { + if (1) { /* We can only create one LLVMExecutionEngine (w/ LLVM 2.6 anyway) */ enum LLVM_CodeGenOpt_Level optlevel; char *error = NULL; + int ret; if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) { optlevel = None; @@ -254,135 +255,162 @@ init_gallivm_state(struct gallivm_state *gallivm) optlevel = Default; } - if (LLVMCreateJITCompiler(&GlobalEngine, gallivm->provider, - (unsigned) optlevel, &error)) { +#if USE_MCJIT + ret = lp_build_create_mcjit_compiler_for_module(&gallivm->engine, + gallivm->module, + (unsigned) optlevel, + &error); +#else + ret = LLVMCreateJITCompiler(&gallivm->engine, gallivm->provider, + (unsigned) optlevel, &error); +#endif + if (ret) { _debug_printf("%s\n", error); LLVMDisposeMessage(error); goto fail; } #if defined(DEBUG) || defined(PROFILE) - lp_register_oprofile_jit_event_listener(GlobalEngine); + lp_register_oprofile_jit_event_listener(gallivm->engine); #endif } - gallivm->engine = GlobalEngine; - LLVMAddModuleProvider(gallivm->engine, gallivm->provider);//new +#if !USE_MCJIT gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine); if (!gallivm->target) goto fail; +#else + if (0) { + /* + * Dump the data layout strings. 
+ */ - if (!create_pass_manager(gallivm)) - goto fail; + LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine); + char *data_layout; + char *engine_data_layout; - gallivm->builder = LLVMCreateBuilderInContext(gallivm->context); - if (!gallivm->builder) - goto fail; + data_layout = LLVMCopyStringRepOfTargetData(gallivm->target); + engine_data_layout = LLVMCopyStringRepOfTargetData(target); + + if (1) { + debug_printf("module target data = %s\n", data_layout); + debug_printf("engine target data = %s\n", engine_data_layout); + } + + free(data_layout); + free(engine_data_layout); + } +#endif return TRUE; fail: - free_gallivm_state(gallivm); return FALSE; } -struct callback -{ - garbage_collect_callback_func func; - void *cb_data; - struct callback *prev, *next; -}; - - -/** list of all garbage collector callbacks */ -static struct callback callback_list = {NULL, NULL, NULL, NULL}; +/** + * Singleton + * + * We must never free LLVM contexts, because LLVM has several global caches + * which pointing/derived from objects owned by the context, causing false + * memory leaks and false cache hits when these objects are destroyed. + * + * TODO: For thread safety on multi-threaded OpenGL we should use one LLVM + * context per thread, and put them in a pool when threads are destroyed. + */ +static LLVMContextRef gallivm_context = NULL; /** - * Register a function with gallivm which will be called when we - * do garbage collection. + * Allocate gallivm LLVM objects. + * \return TRUE for success, FALSE for failure */ -void -gallivm_register_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data) +static boolean +init_gallivm_state(struct gallivm_state *gallivm) { - struct callback *cb; - - if (!callback_list.prev) { - make_empty_list(&callback_list); - } + assert(!gallivm->context); + assert(!gallivm->module); + assert(!gallivm->provider); - /* see if already in list */ - foreach(cb, &callback_list) { - if (cb->func == func && cb->cb_data == cb_data) - return; - } + lp_build_init(); - /* add to list */ - cb = CALLOC_STRUCT(callback); - if (cb) { - cb->func = func; - cb->cb_data = cb_data; - insert_at_head(&callback_list, cb); + if (!gallivm_context) { + gallivm_context = LLVMContextCreate(); } -} + gallivm->context = gallivm_context; + if (!gallivm->context) + goto fail; + gallivm->module = LLVMModuleCreateWithNameInContext("gallivm", + gallivm->context); + if (!gallivm->module) + goto fail; -/** - * Remove a callback. - */ -void -gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data) -{ - struct callback *cb; - - /* search list */ - foreach(cb, &callback_list) { - if (cb->func == func && cb->cb_data == cb_data) { - /* found, remove it */ - remove_from_list(cb); - FREE(cb); - return; - } - } -} + gallivm->provider = + LLVMCreateModuleProviderForExistingModule(gallivm->module); + if (!gallivm->provider) + goto fail; + gallivm->builder = LLVMCreateBuilderInContext(gallivm->context); + if (!gallivm->builder) + goto fail; -/** - * Call the callback functions (which are typically in the - * draw module and llvmpipe driver. - */ -static void -call_garbage_collector_callbacks(void) -{ - struct callback *cb; - foreach(cb, &callback_list) { - cb->func(cb->cb_data); + /* FIXME: MC-JIT only allows compiling one module at a time, and it must be + * complete when MC-JIT is created. So defer the MC-JIT engine creation for + * now. 
+ */ +#if !USE_MCJIT + if (!init_gallivm_engine(gallivm)) { + goto fail; } -} +#else + /* + * MC-JIT engine compiles the module immediately on creation, so we can't + * obtain the target data from it. Instead we create a target data layout + * from a string. + * + * The produced layout strings are not precisely the same, but should make + * no difference for the kind of optimization passes we run. + * + * For reference this is the layout string on x64: + * + * e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64 + * + * See also: + * - http://llvm.org/docs/LangRef.html#datalayout + */ + + { + const unsigned pointer_size = 8 * sizeof(void *); + char layout[512]; + util_snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u", +#ifdef PIPE_ARCH_LITTLE_ENDIAN + 'e', // little endian +#else + 'E', // big endian +#endif + pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment + pointer_size, // aggregate preferred alignment + pointer_size, pointer_size); // stack objects abi alignment, preferred alignment + gallivm->target = LLVMCreateTargetData(layout); + if (!gallivm->target) { + return FALSE; + } + } +#endif + if (!create_pass_manager(gallivm)) + goto fail; -/** - * Other gallium components using gallivm should call this periodically - * to let us do garbage collection (or at least try to free memory - * accumulated by the LLVM libraries). - */ -void -gallivm_garbage_collect(struct gallivm_state *gallivm) -{ - if (gallivm->context) { - if (gallivm_debug & GALLIVM_DEBUG_GC) - debug_printf("***** Doing LLVM garbage collection\n"); + return TRUE; - call_garbage_collector_callbacks(); - free_gallivm_state(gallivm); - init_gallivm_state(gallivm); - } +fail: + free_gallivm_state(gallivm); + return FALSE; } @@ -398,12 +426,27 @@ lp_build_init(void) lp_set_target_options(); - LLVMInitializeNativeTarget(); - +#if USE_MCJIT + LLVMLinkInMCJIT(); +#else LLVMLinkInJIT(); +#endif util_cpu_detect(); + + if (HAVE_AVX && + util_cpu_caps.has_avx) { + lp_native_vector_width = 256; + } else { + /* Leave it at 128, even when no SIMD extensions are available. + * Really needs to be a multiple of 128 so can fit 4 floats. + */ + lp_native_vector_width = 128; + } + lp_native_vector_width = debug_get_num_option("LP_NATIVE_VECTOR_WIDTH", + lp_native_vector_width); + gallivm_initialized = TRUE; #if 0 @@ -423,16 +466,27 @@ lp_build_init(void) struct gallivm_state * gallivm_create(void) { - if (!GlobalGallivm) { - GlobalGallivm = CALLOC_STRUCT(gallivm_state); - if (GlobalGallivm) { - if (!init_gallivm_state(GlobalGallivm)) { - FREE(GlobalGallivm); - GlobalGallivm = NULL; - } + struct gallivm_state *gallivm; + +#if HAVE_LLVM <= 0x206 + if (GlobalGallivm) { + return GlobalGallivm; + } +#endif + + gallivm = CALLOC_STRUCT(gallivm_state); + if (gallivm) { + if (!init_gallivm_state(gallivm)) { + FREE(gallivm); + gallivm = NULL; } } - return GlobalGallivm; + +#if HAVE_LLVM <= 0x206 + GlobalGallivm = gallivm; +#endif + + return gallivm; } @@ -442,6 +496,132 @@ gallivm_create(void) void gallivm_destroy(struct gallivm_state *gallivm) { +#if HAVE_LLVM <= 0x0206 /* No-op: don't destroy the singleton */ (void) gallivm; +#else + free_gallivm_state(gallivm); + FREE(gallivm); +#endif +} + + +/** + * Validate and optimze a function. 
+ */ +static void +gallivm_optimize_function(struct gallivm_state *gallivm, + LLVMValueRef func) +{ + if (0) { + debug_printf("optimizing %s...\n", LLVMGetValueName(func)); + } + + assert(gallivm->passmgr); + + /* Apply optimizations to LLVM IR */ + LLVMRunFunctionPassManager(gallivm->passmgr, func); + + if (0) { + if (gallivm_debug & GALLIVM_DEBUG_IR) { + /* Print the LLVM IR to stderr */ + lp_debug_dump_value(func); + debug_printf("\n"); + } + } +} + + +/** + * Validate a function. + */ +void +gallivm_verify_function(struct gallivm_state *gallivm, + LLVMValueRef func) +{ + /* Verify the LLVM IR. If invalid, dump and abort */ +#ifdef DEBUG + if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) { + lp_debug_dump_value(func); + assert(0); + return; + } +#endif + + gallivm_optimize_function(gallivm, func); + + if (gallivm_debug & GALLIVM_DEBUG_IR) { + /* Print the LLVM IR to stderr */ + lp_debug_dump_value(func); + debug_printf("\n"); + } +} + + +void +gallivm_compile_module(struct gallivm_state *gallivm) +{ +#if HAVE_LLVM > 0x206 + assert(!gallivm->compiled); +#endif + + /* Dump byte code to a file */ + if (0) { + LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc"); + debug_printf("llvmpipe.bc written\n"); + debug_printf("Invoke as \"llc -o - llvmpipe.bc\"\n"); + } + +#if USE_MCJIT + assert(!gallivm->engine); + if (!init_gallivm_engine(gallivm)) { + assert(0); + } +#endif + assert(gallivm->engine); + + ++gallivm->compiled; +} + + +func_pointer +gallivm_jit_function(struct gallivm_state *gallivm, + LLVMValueRef func) +{ + void *code; + func_pointer jit_func; + + assert(gallivm->compiled); + assert(gallivm->engine); + + code = LLVMGetPointerToGlobal(gallivm->engine, func); + assert(code); + jit_func = pointer_to_func(code); + + if (gallivm_debug & GALLIVM_DEBUG_ASM) { + lp_disassemble(code); + } + + /* Free the function body to save memory */ + lp_func_delete_body(func); + + return jit_func; +} + + +/** + * Free the function (and its machine code). 
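 *
 * A minimal usage sketch pairing this with gallivm_jit_function() (the
 * caller code here is illustrative only):
 *
 *    func_pointer f = gallivm_jit_function(gallivm, func);
 *    // ... call through f ...
 *    gallivm_free_function(gallivm, func, (const void *)f);
 *
 * Note the body below is compiled out for MC-JIT, where the engine owns
 * the machine code until it is destroyed.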
+ */ +void +gallivm_free_function(struct gallivm_state *gallivm, + LLVMValueRef func, + const void *code) +{ +#if !USE_MCJIT + if (code) { + LLVMFreeMachineCodeForFunction(gallivm->engine, func); + } + + LLVMDeleteFunction(func); +#endif } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h index 5fc0f996c64..7edea616c4e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h @@ -31,6 +31,7 @@ #include "pipe/p_compiler.h" +#include "util/u_pointer.h" // for func_pointer #include "lp_bld.h" #include <llvm-c/ExecutionEngine.h> @@ -44,6 +45,7 @@ struct gallivm_state LLVMPassManagerRef passmgr; LLVMContextRef context; LLVMBuilderRef builder; + unsigned compiled; }; @@ -51,35 +53,28 @@ void lp_build_init(void); -extern void -lp_func_delete_body(LLVMValueRef func); - +struct gallivm_state * +gallivm_create(void); void -gallivm_garbage_collect(struct gallivm_state *gallivm); - +gallivm_destroy(struct gallivm_state *gallivm); -typedef void (*garbage_collect_callback_func)(void *cb_data); void -gallivm_register_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data); +gallivm_verify_function(struct gallivm_state *gallivm, + LLVMValueRef func); void -gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func, - void *cb_data); +gallivm_compile_module(struct gallivm_state *gallivm); - -struct gallivm_state * -gallivm_create(void); +func_pointer +gallivm_jit_function(struct gallivm_state *gallivm, + LLVMValueRef func); void -gallivm_destroy(struct gallivm_state *gallivm); - - -extern LLVMValueRef -lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, - const char *Name); +gallivm_free_function(struct gallivm_state *gallivm, + LLVMValueRef func, + const void * code); void lp_set_load_alignment(LLVMValueRef Inst, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c index 2323f124ae4..2bf1211bcd7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c @@ -48,6 +48,8 @@ #include "lp_bld_const.h" #include "lp_bld_intr.h" +#include "lp_bld_type.h" +#include "lp_bld_pack.h" LLVMValueRef @@ -129,6 +131,95 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder, } +/** + * Call intrinsic with arguments adapted to intrinsic vector length. + * + * Split vectors which are too large for the hw, or expand them if they + * are too small, so a caller calling a function which might use intrinsics + * doesn't need to do splitting/expansion on its own. + * This only supports intrinsics where src and dst types match. 
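 *
 * For example (values illustrative, SSE-style 128 bit intrinsics assumed):
 * with src_type = 8 x float32 and intr_size = 128 the operands are split
 * into two 4-wide halves and the intrinsic is invoked twice; with
 * src_type = 1 x float32 the operands are widened to 4 elements (upper
 * lanes undefined) and the scalar result is extracted again afterwards.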
+ */ +LLVMValueRef +lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm, + const char *name, + struct lp_type src_type, + unsigned intr_size, + LLVMValueRef a, + LLVMValueRef b) +{ + unsigned i; + struct lp_type intrin_type = src_type; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + LLVMValueRef anative, bnative; + unsigned intrin_length = intr_size / src_type.width; + + intrin_type.length = intrin_length; + + if (intrin_length > src_type.length) { + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef constvec, tmp; + + for (i = 0; i < src_type.length; i++) { + elems[i] = lp_build_const_int32(gallivm, i); + } + for (; i < intrin_length; i++) { + elems[i] = i32undef; + } + if (src_type.length == 1) { + LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type); + a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), ""); + b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), ""); + } + constvec = LLVMConstVector(elems, intrin_length); + anative = LLVMBuildShuffleVector(builder, a, a, constvec, ""); + bnative = LLVMBuildShuffleVector(builder, b, b, constvec, ""); + tmp = lp_build_intrinsic_binary(builder, name, + lp_build_vec_type(gallivm, intrin_type), + anative, bnative); + if (src_type.length > 1) { + constvec = LLVMConstVector(elems, src_type.length); + return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, ""); + } + else { + return LLVMBuildExtractElement(builder, tmp, elems[0], ""); + } + } + else if (intrin_length < src_type.length) { + unsigned num_vec = src_type.length / intrin_length; + LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; + + /* don't support arbitrary size here as this is so yuck */ + if (src_type.length % intrin_length) { + /* FIXME: This is something which should be supported + * but there doesn't seem to be any need for it currently + * so crash and burn.
+ */ + debug_printf("%s: should handle arbitrary vector size\n", + __FUNCTION__); + assert(0); + return NULL; + } + + for (i = 0; i < num_vec; i++) { + anative = lp_build_extract_range(gallivm, a, i*intrin_length, + intrin_length); + bnative = lp_build_extract_range(gallivm, b, i*intrin_length, + intrin_length); + tmp[i] = lp_build_intrinsic_binary(builder, name, + lp_build_vec_type(gallivm, intrin_type), + anative, bnative); + } + return lp_build_concat(gallivm, tmp, intrin_type, num_vec); + } + else { + return lp_build_intrinsic_binary(builder, name, + lp_build_vec_type(gallivm, src_type), + a, b); + } +} + + LLVMValueRef lp_build_intrinsic_map(struct gallivm_state *gallivm, const char *name, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h index b73dd700362..38c5c29c980 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h @@ -78,6 +78,15 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder, LLVMValueRef +lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm, + const char *name, + struct lp_type src_type, + unsigned intr_size, + LLVMValueRef a, + LLVMValueRef b); + + +LLVMValueRef lp_build_intrinsic_map(struct gallivm_state *gallivm, const char *name, LLVMTypeRef ret_type, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index 69796149aaa..7a4a5bb11d3 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -52,8 +52,8 @@ * * select <4 x i1> %C, %A, %B * - * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not - * supported on any backend. + * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only + * supported on some backends (x86) starting with llvm 3.1. * * Expanding the boolean vector to full SIMD register width, as in * @@ -485,8 +485,10 @@ lp_build_select(struct lp_build_context *bld, } res = LLVMBuildSelect(builder, mask, a, b, ""); } - else if (util_cpu_caps.has_sse4_1 && - type.width * type.length == 128 && + else if (((util_cpu_caps.has_sse4_1 && + type.width * type.length == 128) || + (util_cpu_caps.has_avx && + type.width * type.length == 256 && type.width >= 32)) && !LLVMIsConstant(a) && !LLVMIsConstant(b) && !LLVMIsConstant(mask)) { @@ -494,8 +496,22 @@ lp_build_select(struct lp_build_context *bld, LLVMTypeRef arg_type; LLVMValueRef args[3]; - if (type.floating && - type.width == 64) { + /* + * There's only float blend in AVX but can just cast i32/i64 + * to float. + */ + if (type.width * type.length == 256) { + if (type.width == 64) { + intrinsic = "llvm.x86.avx.blendv.pd.256"; + arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4); + } + else { + intrinsic = "llvm.x86.avx.blendv.ps.256"; + arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8); + } + } + else if (type.floating && + type.width == 64) { intrinsic = "llvm.x86.sse41.blendvpd"; arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2); } else if (type.floating && @@ -591,3 +607,35 @@ lp_build_select_aos(struct lp_build_context *bld, return lp_build_select(bld, mask_vec, a, b); } } + + +/** + * Return (scalar-cast)val ? 
true : false; + */ +LLVMValueRef +lp_build_any_true_range(struct lp_build_context *bld, + unsigned real_length, + LLVMValueRef val) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMTypeRef scalar_type; + LLVMTypeRef true_type; + + assert(real_length <= bld->type.length); + + true_type = LLVMIntTypeInContext(bld->gallivm->context, + bld->type.width * real_length); + scalar_type = LLVMIntTypeInContext(bld->gallivm->context, + bld->type.width * bld->type.length); + val = LLVMBuildBitCast(builder, val, scalar_type, ""); + /* + * We're always using native types so we can use intrinsics. + * However, if we don't do per-element calculations, we must ensure + * the excess elements aren't used since they may contain garbage. + */ + if (real_length < bld->type.length) { + val = LLVMBuildTrunc(builder, val, true_type, ""); + } + return LLVMBuildICmp(builder, LLVMIntNE, + val, LLVMConstNull(true_type), ""); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h index ef33a653682..64c0a1f5946 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h @@ -82,4 +82,9 @@ lp_build_select_aos(struct lp_build_context *bld, LLVMValueRef b); +LLVMValueRef +lp_build_any_true_range(struct lp_build_context *bld, + unsigned real_length, + LLVMValueRef val); + #endif /* !LP_BLD_LOGIC_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 6c4586c4212..dd2c6120afb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -26,6 +26,12 @@ **************************************************************************/ +/** + * The purpose of this module is to expose LLVM functionality not available + * through the C bindings. + */ + + #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS #endif @@ -41,11 +47,24 @@ #include <llvm/Target/TargetOptions.h> #include <llvm/ExecutionEngine/ExecutionEngine.h> #include <llvm/ExecutionEngine/JITEventListener.h> +#if HAVE_LLVM >= 0x0301 +#include <llvm/ADT/Triple.h> +#include <llvm/ExecutionEngine/JITMemoryManager.h> +#endif #include <llvm/Support/CommandLine.h> #include <llvm/Support/PrettyStackTrace.h> +#if HAVE_LLVM >= 0x0300 +#include <llvm/Support/TargetSelect.h> +#else /* HAVE_LLVM < 0x0300 */ +#include <llvm/Target/TargetSelect.h> +#endif /* HAVE_LLVM < 0x0300 */ + #include "pipe/p_config.h" #include "util/u_debug.h" +#include "util/u_cpu_detect.h" + +#include "lp_bld_misc.h" /** @@ -99,6 +118,9 @@ lp_set_target_options(void) #if defined(DEBUG) || defined(PROFILE) llvm::NoFramePointerElim = true; +#if HAVE_LLVM >= 0x0208 + llvm::NoFramePointerElimNonLeaf = true; +#endif #endif llvm::NoExcessFPPrecision = false; @@ -146,6 +168,30 @@ lp_set_target_options(void) * shared object where the gallium driver resides. */ llvm::DisablePrettyStackTrace = true; + + // If we have a native target, initialize it to ensure it is linked in and + // usable by the JIT.
+ llvm::InitializeNativeTarget(); + +#if HAVE_LLVM >= 0x0208 + llvm::InitializeNativeTargetAsmPrinter(); +#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + LLVMInitializeX86AsmPrinter(); +#elif defined(PIPE_ARCH_ARM) + LLVMInitializeARMAsmPrinter(); +#elif defined(PIPE_ARCH_PPC) + LLVMInitializePowerPCAsmPrinter(); +#endif + +#if HAVE_LLVM >= 0x0207 +# if HAVE_LLVM >= 0x0301 + llvm::InitializeNativeTargetDisassembler(); +# elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + LLVMInitializeX86Disassembler(); +# elif defined(PIPE_ARCH_ARM) + LLVMInitializeARMDisassembler(); +# endif +#endif } @@ -165,6 +211,7 @@ lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name)); } + extern "C" void lp_set_load_alignment(LLVMValueRef Inst, @@ -180,3 +227,67 @@ lp_set_store_alignment(LLVMValueRef Inst, { llvm::unwrap<llvm::StoreInst>(Inst)->setAlignment(Align); } + + +#if HAVE_LLVM >= 0x301 + +/** + * Same as LLVMCreateJITCompilerForModule, but using MCJIT and enabling AVX + * feature where available. + * + * See also: + * - llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp + * - llvm/tools/lli/lli.cpp + * - http://markmail.org/message/ttkuhvgj4cxxy2on#query:+page:1+mid:aju2dggerju3ivd3+state:results + */ +extern "C" +LLVMBool +lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + LLVMModuleRef M, + unsigned OptLevel, + char **OutError) +{ + using namespace llvm; + + std::string Error; + EngineBuilder builder(unwrap(M)); + builder.setEngineKind(EngineKind::JIT) + .setErrorStr(&Error) + .setOptLevel((CodeGenOpt::Level)OptLevel); + + builder.setUseMCJIT(true); + + llvm::SmallVector<std::string, 1> MAttrs; + if (util_cpu_caps.has_avx) { + /* + * AVX feature is not automatically detected from CPUID by the X86 target + * yet, because the old (yet default) JIT engine is not capable of + * emitting the opcodes. But as we're using MCJIT here, it is safe to + * set this attribute. + */ + MAttrs.push_back("+avx"); + builder.setMAttrs(MAttrs); + } + builder.setJITMemoryManager(JITMemoryManager::CreateDefaultMemManager()); + + ExecutionEngine *JIT; +#if 0 + JIT = builder.create(); +#else + /* + * Workaround http://llvm.org/bugs/show_bug.cgi?id=12833 + */ + StringRef MArch = ""; + StringRef MCPU = ""; + Triple TT(unwrap(M)->getTargetTriple()); + JIT = builder.create(builder.selectTarget(TT, MArch, MCPU, MAttrs)); +#endif + if (JIT) { + *OutJIT = wrap(JIT); + return 0; + } + *OutError = strdup(Error.c_str()); + return 1; +} + +#endif /* HAVE_LLVM >= 0x301 */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/src/gallium/auxiliary/gallivm/lp_bld_misc.h new file mode 100644 index 00000000000..4f80b38280c --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.h @@ -0,0 +1,70 @@ +/************************************************************************** + * + * Copyright 2012 VMware, Inc. + * All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef LP_BLD_MISC_H +#define LP_BLD_MISC_H + + +#include "lp_bld.h" +#include <llvm-c/ExecutionEngine.h> + + +#ifdef __cplusplus +extern "C" { +#endif + + + +extern void +lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE); + +extern void +lp_set_target_options(void); + + +extern void +lp_func_delete_body(LLVMValueRef func); + + +extern LLVMValueRef +lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal, + const char *Name); + +extern int +lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT, + LLVMModuleRef M, + unsigned OptLevel, + char **OutError); + + +#ifdef __cplusplus +} +#endif + + +#endif /* !LP_BLD_MISC_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index fde6bb594f1..b18f7841ccb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -69,6 +69,7 @@ #include "util/u_debug.h" #include "util/u_math.h" #include "util/u_cpu_detect.h" +#include "util/u_memory.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -76,6 +77,7 @@ #include "lp_bld_intr.h" #include "lp_bld_arit.h" #include "lp_bld_pack.h" +#include "lp_bld_swizzle.h" /** @@ -101,6 +103,30 @@ lp_build_const_unpack_shuffle(struct gallivm_state *gallivm, return LLVMConstVector(elems, n); } +/** + * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack. + * See comment above lp_build_interleave2_half for more details. + */ +static LLVMValueRef +lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm, + unsigned n, unsigned lo_hi) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i, j; + + assert(n <= LP_MAX_VECTOR_LENGTH); + assert(lo_hi < 2); + + for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) { + if (i == (n / 2)) + j += n / 4; + + elems[i + 0] = lp_build_const_int32(gallivm, 0 + j); + elems[i + 1] = lp_build_const_int32(gallivm, n + j); + } + + return LLVMConstVector(elems, n); +} /** * Build shuffle vectors that match PACKxx instructions. 
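 *
 * (The body is elided by this diff; as an illustration of the intent only:
 * a shuffle of elems[i] = 2*i over the two bitcast inputs keeps the low,
 * least significant, half of every widened lane, producing the
 * "res = l0 l1 .. ln h0 h1 .. hn" layout pictured at lp_build_pack2()
 * below.)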
@@ -119,6 +145,71 @@ lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n) return LLVMConstVector(elems, n); } +/** + * Return a vector with elements src[start:start+size] + * Most useful for getting half the values out of a 256bit sized vector, + * otherwise may cause data rearrangement to happen. + */ +LLVMValueRef +lp_build_extract_range(struct gallivm_state *gallivm, + LLVMValueRef src, + unsigned start, + unsigned size) +{ + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + assert(size <= Elements(elems)); + + for (i = 0; i < size; ++i) + elems[i] = lp_build_const_int32(gallivm, i + start); + + if (size == 1) { + return LLVMBuildExtractElement(gallivm->builder, src, elems[0], ""); + } + else { + return LLVMBuildShuffleVector(gallivm->builder, src, src, + LLVMConstVector(elems, size), ""); + } +} + +/** + * Concatenates several (must be a power of 2) vectors (of same type) + * into a larger one. + * Most useful for building up a 256bit sized vector out of two 128bit ones. + */ +LLVMValueRef +lp_build_concat(struct gallivm_state *gallivm, + LLVMValueRef src[], + struct lp_type src_type, + unsigned num_vectors) +{ + unsigned new_length, i; + LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2]; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + + assert(src_type.length * num_vectors <= Elements(shuffles)); + assert(util_is_power_of_two(num_vectors)); + + new_length = src_type.length; + + for (i = 0; i < num_vectors; i++) + tmp[i] = src[i]; + + while (num_vectors > 1) { + num_vectors >>= 1; + new_length <<= 1; + for (i = 0; i < new_length; i++) { + shuffles[i] = lp_build_const_int32(gallivm, i); + } + for (i = 0; i < num_vectors; i++) { + tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1], + LLVMConstVector(shuffles, new_length), ""); + } + } + + return tmp[0]; +} /** * Interleave vector elements. @@ -139,6 +230,40 @@ lp_build_interleave2(struct gallivm_state *gallivm, return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, ""); } +/** + * Interleave vector elements, but with 256 bit vectors treat it as an + * interleave of the 2 concatenated 128 bit halves. + * + * This differs from lp_build_interleave2, as that function would do the following (for lo): + * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction. + * + * + * An example interleave of 8x float with 8x float on AVX 256bit unpack: + * a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7 + * + * Equivalent to interleaving 2x 128 bit vectors + * a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7 + * + * So interleave-lo would result in: + * a0 b0 a1 b1 a4 b4 a5 b5 + * + * And interleave-hi would result in: + * a2 b2 a3 b3 a6 b6 a7 b7 + */ +LLVMValueRef +lp_build_interleave2_half(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef a, + LLVMValueRef b, + unsigned lo_hi) +{ + if (type.length * type.width == 256) { + LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi); + return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, ""); + } else { + return lp_build_interleave2(gallivm, type, a, b, lo_hi); + } } /** * Double the bit width. @@ -237,9 +362,9 @@ lp_build_unpack(struct gallivm_state *gallivm, * Non-interleaved pack. * * This will move values as - * - * lo = __ l0 __ l1 __ l2 __.. __ ln - * hi = __ h0 __ h1 __ h2 __.. __ hn + * (LSB) (MSB) + * lo = l0 __ l1 __ l2 __.. __ ln __ + * hi = h0 __ h1 __ h2 __.. __ hn __ + * res = l0 l1 l2 .. ln h0 h1 h2 ..
hn * * This will only change the number of bits the values are represented, not the @@ -257,12 +382,14 @@ lp_build_pack2(struct gallivm_state *gallivm, LLVMValueRef hi) { LLVMBuilderRef builder = gallivm->builder; -#if HAVE_LLVM < 0x0207 - LLVMTypeRef src_vec_type = lp_build_vec_type(gallivm, src_type); -#endif LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type); LLVMValueRef shuffle; LLVMValueRef res = NULL; + struct lp_type intr_type = dst_type; + +#if HAVE_LLVM < 0x0207 + intr_type = src_type; +#endif assert(!src_type.floating); assert(!dst_type.floating); @@ -270,50 +397,81 @@ lp_build_pack2(struct gallivm_state *gallivm, assert(src_type.length * 2 == dst_type.length); /* Check for special cases first */ - if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) { + if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) { + const char *intrinsic = NULL; + switch(src_type.width) { case 32: if(dst_type.sign) { -#if HAVE_LLVM >= 0x0207 - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi); -#else - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi); -#endif + intrinsic = "llvm.x86.sse2.packssdw.128"; } else { if (util_cpu_caps.has_sse4_1) { - return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi); - } - else { - /* use generic shuffle below */ - res = NULL; + intrinsic = "llvm.x86.sse41.packusdw"; +#if HAVE_LLVM < 0x0207 + /* llvm < 2.7 has inconsistent signatures except for packusdw */ + intr_type = dst_type; +#endif } } break; - case 16: - if(dst_type.sign) -#if HAVE_LLVM >= 0x0207 - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi); -#else - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi); -#endif - else -#if HAVE_LLVM >= 0x0207 - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi); -#else - res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi); -#endif - break; - - default: - assert(0); - return LLVMGetUndef(dst_vec_type); + if (dst_type.sign) { + intrinsic = "llvm.x86.sse2.packsswb.128"; + } + else { + intrinsic = "llvm.x86.sse2.packuswb.128"; + } break; + /* default uses generic shuffle below */ } - - if (res) { - res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); + if (intrinsic) { + if (src_type.width * src_type.length == 128) { + LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type); + res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi); + if (dst_vec_type != intr_vec_type) { + res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); + } + } + else { + int num_split = src_type.width * src_type.length / 128; + int i; + int nlen = 128 / src_type.width; + struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128); + struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128); + LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128]; + LLVMValueRef tmplo, tmphi; + LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type); + LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type); + + assert(num_split <= LP_MAX_VECTOR_WIDTH / 128); + + for (i = 0; i < num_split / 2; i++) { + tmplo = lp_build_extract_range(gallivm, + lo, i*nlen*2, nlen); + tmphi = lp_build_extract_range(gallivm, + lo, i*nlen*2 + nlen, nlen); + tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic, + nintr_vec_type, tmplo, tmphi); + if 
(ndst_vec_type != nintr_vec_type) { + tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, ""); + } + } + for (i = 0; i < num_split / 2; i++) { + tmplo = lp_build_extract_range(gallivm, + hi, i*nlen*2, nlen); + tmphi = lp_build_extract_range(gallivm, + hi, i*nlen*2 + nlen, nlen); + tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic, + nintr_vec_type, + tmplo, tmphi); + if (ndst_vec_type != nintr_vec_type) { + tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2], + ndst_vec_type, ""); + } + } + res = lp_build_concat(gallivm, tmpres, ndst_type, num_split); + } return res; } } @@ -357,8 +515,9 @@ lp_build_packs2(struct gallivm_state *gallivm, /* All X86 SSE non-interleaved pack instructions take signed inputs and * saturate them, so no need to clamp for those cases. */ if(util_cpu_caps.has_sse2 && - src_type.width * src_type.length == 128 && - src_type.sign) + src_type.width * src_type.length >= 128 && + src_type.sign && + (src_type.width == 32 || src_type.width == 16)) clamp = FALSE; if(clamp) { @@ -395,7 +554,6 @@ lp_build_pack(struct gallivm_state *gallivm, LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; unsigned i; - /* Register width must remain constant */ assert(src_type.width * src_type.length == dst_type.width * dst_type.length); @@ -487,21 +645,44 @@ lp_build_resize(struct gallivm_state *gallivm, /* * Register width remains constant -- use vector packing intrinsics */ - tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs); } else { - /* - * Do it element-wise. - */ - - assert(src_type.length == dst_type.length); - tmp[0] = lp_build_undef(gallivm, dst_type); - for (i = 0; i < dst_type.length; ++i) { - LLVMValueRef index = lp_build_const_int32(gallivm, i); - LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); - val = LLVMBuildTrunc(builder, val, lp_build_elem_type(gallivm, dst_type), ""); - tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + if (src_type.width / dst_type.width > num_srcs) { + /* + * First change src vectors size (with shuffle) so they have the + * same size as the destination vector, then pack normally. + * Note: cannot use cast/extract because llvm generates atrocious code. + */ + unsigned size_ratio = (src_type.width * src_type.length) / + (dst_type.length * dst_type.width); + unsigned new_length = src_type.length / size_ratio; + + for (i = 0; i < size_ratio * num_srcs; i++) { + unsigned start_index = (i % size_ratio) * new_length; + tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio], + start_index, new_length); + } + num_srcs *= size_ratio; + src_type.length = new_length; + tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs); + } + else { + /* + * Truncate bit width but expand vector size - first pack + * then expand simply because this should be more AVX-friendly + * for the cases we probably hit. + */ + unsigned size_ratio = (dst_type.width * dst_type.length) / + (src_type.length * src_type.width); + unsigned num_pack_srcs = num_srcs / size_ratio; + dst_type.length = dst_type.length / size_ratio; + + for (i = 0; i < size_ratio; i++) { + tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE, + &src[i*num_pack_srcs], num_pack_srcs); + } + tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio); } } } @@ -522,19 +703,24 @@ lp_build_resize(struct gallivm_state *gallivm, /* * Do it element-wise. 
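 *
 * E.g. (illustrative sizes, not from this change): resizing one 8 x i16
 * source into two 4 x i32 destinations copies source element i into
 * tmp[i / 4] at position i % 4, sign- or zero-extending each value on
 * the way.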
 */ + assert(src_type.length * num_srcs == dst_type.length * num_dsts); + + for (i = 0; i < num_dsts; i++) { + tmp[i] = lp_build_undef(gallivm, dst_type); + } - assert(src_type.length == dst_type.length); - tmp[0] = lp_build_undef(gallivm, dst_type); - for (i = 0; i < dst_type.length; ++i) { - LLVMValueRef index = lp_build_const_int32(gallivm, i); - LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); + for (i = 0; i < src_type.length; ++i) { + unsigned j = i / dst_type.length; + LLVMValueRef srcindex = lp_build_const_int32(gallivm, i); + LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length); + LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, ""); if (src_type.sign && dst_type.sign) { val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), ""); } else { val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), ""); } - tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, ""); } } } @@ -554,3 +740,38 @@ lp_build_resize(struct gallivm_state *gallivm, } +/** + * Expands src vector from src.length to dst_length + */ +LLVMValueRef +lp_build_pad_vector(struct gallivm_state *gallivm, + LLVMValueRef src, + struct lp_type src_type, + unsigned dst_length) +{ + LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type)); + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + assert(dst_length <= Elements(elems)); + assert(dst_length >= src_type.length); + + if (src_type.length == dst_length) + return src; + + /* If it's a single scalar type, no need to reinvent the wheel */ + if (src_type.length == 1) { + return lp_build_broadcast(gallivm, LLVMVectorType(lp_build_elem_type(gallivm, src_type), dst_length), src); + } + + /* All elements from src vector */ + for (i = 0; i < src_type.length; ++i) + elems[i] = lp_build_const_int32(gallivm, i); + + /* Undef fill remaining space */ + for (i = src_type.length; i < dst_length; ++i) + elems[i] = lp_build_const_int32(gallivm, src_type.length); + + /* Combine the two vectors */ + return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), ""); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h index d58da4f01b3..73f299cca11 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h @@ -44,6 +44,12 @@ struct lp_type; +LLVMValueRef +lp_build_interleave2_half(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef a, + LLVMValueRef b, + unsigned lo_hi); LLVMValueRef lp_build_interleave2(struct gallivm_state *gallivm, @@ -69,6 +75,17 @@ lp_build_unpack(struct gallivm_state *gallivm, LLVMValueRef src, LLVMValueRef *dst, unsigned num_dsts); +LLVMValueRef +lp_build_extract_range(struct gallivm_state *gallivm, + LLVMValueRef src, + unsigned start, + unsigned size); + +LLVMValueRef +lp_build_concat(struct gallivm_state *gallivm, + LLVMValueRef src[], + struct lp_type src_type, + unsigned num_vectors); LLVMValueRef lp_build_packs2(struct gallivm_state *gallivm, @@ -102,4 +119,10 @@ lp_build_resize(struct gallivm_state *gallivm, LLVMValueRef *dst, unsigned num_dsts); +LLVMValueRef +lp_build_pad_vector(struct gallivm_state *gallivm, + LLVMValueRef src, + struct lp_type src_type, + unsigned dst_length); + #endif /* !LP_BLD_PACK_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c index
b0a5bc0267f..b1ba7c72655 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c @@ -26,6 +26,7 @@ **************************************************************************/ +#include "u_cpu_detect.h" #include "lp_bld_type.h" #include "lp_bld_arit.h" #include "lp_bld_const.h" @@ -77,34 +78,82 @@ lp_build_ddy(struct lp_build_context *bld, return lp_build_sub(bld, a_bottom, a_top); } - +/* + * To be able to handle multiple quads at once in texture sampling and + * do lod calculations per quad, it is necessary to get the per-quad + * derivatives into the lp_build_rho function. + * For 8-wide vectors the packed derivative values for 3 coords would + * look like this; this scales to an arbitrary (multiple of 4) vector size: + * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy + * dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____ + * The second vector will be unused for 1d and 2d textures. + */ LLVMValueRef -lp_build_scalar_ddx(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld, + LLVMValueRef a) { - LLVMBuilderRef builder = bld->gallivm->builder; - LLVMValueRef idx_left = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT); - LLVMValueRef idx_right = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_RIGHT); - LLVMValueRef a_left = LLVMBuildExtractElement(builder, a, idx_left, "left"); - LLVMValueRef a_right = LLVMBuildExtractElement(builder, a, idx_right, "right"); + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef vec1, vec2; + + /* same packing as _twocoord, but can use aos swizzle helper */ + + /* + * XXX could make swizzle1 a noop swizzle by using right top/bottom + * pair for ddy + */ + static const unsigned char swizzle1[] = { + LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_BOTTOM_LEFT, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + + vec1 = lp_build_swizzle_aos(bld, a, swizzle1); + vec2 = lp_build_swizzle_aos(bld, a, swizzle2); + if (bld->type.floating) - return LLVMBuildFSub(builder, a_right, a_left, "ddx"); + return LLVMBuildFSub(builder, vec2, vec1, "ddxddy"); else - return LLVMBuildSub(builder, a_right, a_left, "ddx"); + return LLVMBuildSub(builder, vec2, vec1, "ddxddy"); } LLVMValueRef -lp_build_scalar_ddy(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld, + LLVMValueRef a, LLVMValueRef b) { - LLVMBuilderRef builder = bld->gallivm->builder; - LLVMValueRef idx_top = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT); - LLVMValueRef idx_bottom = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_BOTTOM_LEFT); - LLVMValueRef a_top = LLVMBuildExtractElement(builder, a, idx_top, "top"); - LLVMValueRef a_bottom = LLVMBuildExtractElement(builder, a, idx_bottom, "bottom"); + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef vec1, vec2; + unsigned length, num_quads, i; + + /* XXX: do hsub version */ + length = bld->type.length; + num_quads = length / 4; + for (i = 0; i < num_quads; i++) { + unsigned s1 = 4 * i; + unsigned s2 = 4 * i + length; + shuffles1[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1); + shuffles1[4*i + 1] =
lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1); + shuffles1[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2); + shuffles1[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2); + shuffles2[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s1); + shuffles2[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s1); + shuffles2[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s2); + shuffles2[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s2); + } + vec1 = LLVMBuildShuffleVector(builder, a, b, + LLVMConstVector(shuffles1, length), ""); + vec2 = LLVMBuildShuffleVector(builder, a, b, + LLVMConstVector(shuffles2, length), ""); if (bld->type.floating) - return LLVMBuildFSub(builder, a_bottom, a_top, "ddy"); + return LLVMBuildFSub(builder, vec2, vec1, "ddxddyddxddy"); else - return LLVMBuildSub(builder, a_bottom, a_top, "ddy"); + return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy"); } + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h index b7992912927..be6a1efc396 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h @@ -78,19 +78,15 @@ lp_build_ddy(struct lp_build_context *bld, /* - * Scalar derivatives. - * - * Same as getting the first value of above. + * Packed derivatives (one derivative for each direction per quad) */ - LLVMValueRef -lp_build_scalar_ddx(struct lp_build_context *bld, - LLVMValueRef a); - +lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld, + LLVMValueRef a, LLVMValueRef b); LLVMValueRef -lp_build_scalar_ddy(struct lp_build_context *bld, - LLVMValueRef a); +lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld, + LLVMValueRef a); #endif /* LP_BLD_QUAD_H_ */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index d966788d74e..85211161f3c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -44,6 +44,8 @@ #include "lp_bld_sample.h" #include "lp_bld_swizzle.h" #include "lp_bld_type.h" +#include "lp_bld_logic.h" +#include "lp_bld_pack.h" /* @@ -175,67 +177,89 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, /** * Generate code to compute coordinate gradient (rho). - * \param ddx partial derivatives of (s, t, r, q) with respect to X - * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y * - * XXX: The resulting rho is scalar, so we ignore all but the first element of - * derivatives that are passed by the shader. + * The resulting rho is scalar per quad. 
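 *
 * For instance, for a 2d texture and 8-wide vectors, ddx_ddy[0] holds
 * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy (see the packing
 * comment in lp_bld_quad.c), so the swizzles below yield
 * rho_xvec = |dsdx| |dtdx| .. and rho_yvec = |dsdy| |dtdy| .. per quad.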
*/ static LLVMValueRef lp_build_rho(struct lp_build_sample_context *bld, unsigned unit, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4]) + const struct lp_derivatives *derivs) { + struct gallivm_state *gallivm = bld->gallivm; struct lp_build_context *int_size_bld = &bld->int_size_bld; struct lp_build_context *float_size_bld = &bld->float_size_bld; struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *coord_bld = &bld->coord_bld; + struct lp_build_context *perquadf_bld = &bld->perquadf_bld; + const LLVMValueRef *ddx_ddy = derivs->ddx_ddy; const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0); LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0); - LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy; - LLVMValueRef rho_x, rho_y; LLVMValueRef rho_vec; LLVMValueRef int_size, float_size; LLVMValueRef rho; LLVMValueRef first_level, first_level_vec; + LLVMValueRef abs_ddx_ddy[2]; + unsigned length = coord_bld->type.length; + unsigned num_quads = length / 4; + unsigned i; + LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + LLVMValueRef rho_xvec, rho_yvec; + + abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]); + if (dims > 2) { + abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]); + } - dsdx = ddx[0]; - dsdy = ddy[0]; - - if (dims <= 1) { - rho_x = dsdx; - rho_y = dsdy; + if (dims == 1) { + static const unsigned char swizzle1[] = { + 0, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + 1, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1); + rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2); + } + else if (dims == 2) { + static const unsigned char swizzle1[] = { + 0, 2, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + 1, 3, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1); + rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2); } else { - rho_x = float_size_bld->undef; - rho_y = float_size_bld->undef; - - rho_x = LLVMBuildInsertElement(builder, rho_x, dsdx, index0, ""); - rho_y = LLVMBuildInsertElement(builder, rho_y, dsdy, index0, ""); - - dtdx = ddx[1]; - dtdy = ddy[1]; - - rho_x = LLVMBuildInsertElement(builder, rho_x, dtdx, index1, ""); - rho_y = LLVMBuildInsertElement(builder, rho_y, dtdy, index1, ""); - - if (dims >= 3) { - drdx = ddx[2]; - drdy = ddy[2]; - - rho_x = LLVMBuildInsertElement(builder, rho_x, drdx, index2, ""); - rho_y = LLVMBuildInsertElement(builder, rho_y, drdy, index2, ""); + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH]; + assert(dims == 3); + for (i = 0; i < num_quads; i++) { + shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i); + shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2); + shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i); + shuffles1[4*i + 3] = i32undef; + shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1); + shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3); + shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 1); + shuffles2[4*i + 3] = i32undef; } + rho_xvec = 
LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1], + LLVMConstVector(shuffles1, length), ""); + rho_yvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1], + LLVMConstVector(shuffles2, length), ""); } - rho_x = lp_build_abs(float_size_bld, rho_x); - rho_y = lp_build_abs(float_size_bld, rho_y); - - rho_vec = lp_build_max(float_size_bld, rho_x, rho_y); + rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec); first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); @@ -243,22 +267,77 @@ lp_build_rho(struct lp_build_sample_context *bld, int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec); float_size = lp_build_int_to_float(float_size_bld, int_size); - rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); + if (bld->coord_type.length > 4) { + /* expand size to each quad */ + if (dims > 1) { + /* could use some broadcast_vector helper for this? */ + int num_quads = bld->coord_type.length / 4; + LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4]; + for (i = 0; i < num_quads; i++) { + src[i] = float_size; + } + float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads); + } + else { + float_size = lp_build_broadcast_scalar(coord_bld, float_size); + } + rho_vec = lp_build_mul(coord_bld, rho_vec, float_size); - if (dims <= 1) { - rho = rho_vec; + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + static const unsigned char swizzle1[] = { + 0, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + static const unsigned char swizzle2[] = { + 1, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1); + rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2); + + rho = lp_build_max(coord_bld, rho_s, rho_t); + + if (dims >= 3) { + static const unsigned char swizzle3[] = { + 2, LP_BLD_SWIZZLE_DONTCARE, + LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE + }; + rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3); + rho = lp_build_max(coord_bld, rho, rho_r); + } + } + } + rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type, + perquadf_bld->type, rho); } else { - if (dims >= 2) { - LLVMValueRef rho_s, rho_t, rho_r; + if (dims <= 1) { + rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, ""); + } + rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); - rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, ""); - rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, ""); + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, ""); + rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, ""); - rho = lp_build_max(float_bld, rho_s, rho_t); - if (dims >= 3) { - rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, ""); - rho = lp_build_max(float_bld, rho, rho_r); + rho = lp_build_max(float_bld, rho_s, rho_t); + + if (dims >= 3) { + rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, ""); + rho = lp_build_max(float_bld, rho, rho_r); + } } } } @@ -396,22 +475,20 @@ lp_build_brilinear_rho(struct lp_build_context *bld, /** * Generate code to compute texture level of detail (lambda). 
- * \param ddx partial derivatives of (s, t, r, q) with respect to X - * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y * \param lod_bias optional float vector with the shader lod bias * \param explicit_lod optional float vector with the explicit lod * \param width scalar int texture width * \param height scalar int texture height * \param depth scalar int texture depth * - * XXX: The resulting lod is scalar, so ignore all but the first element of - * derivatives, lod_bias, etc that are passed by the shader. + * The resulting lod is scalar per quad, so only the first value per quad + * passed in from lod_bias, explicit_lod is used. */ void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ unsigned mip_filter, @@ -420,11 +497,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, { LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *perquadf_bld = &bld->perquadf_bld; LLVMValueRef lod; - *out_lod_ipart = bld->int_bld.zero; - *out_lod_fpart = bld->float_bld.zero; + *out_lod_ipart = bld->perquadi_bld.zero; + *out_lod_fpart = perquadf_bld->zero; if (bld->static_state->min_max_lod_equal) { /* User is forcing sampling from a particular mipmap level. @@ -433,21 +510,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, LLVMValueRef min_lod = bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit); - lod = min_lod; + lod = lp_build_broadcast_scalar(perquadf_bld, min_lod); } else { - LLVMValueRef sampler_lod_bias = - bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit); - LLVMValueRef index0 = lp_build_const_int32(bld->gallivm, 0); - if (explicit_lod) { - lod = LLVMBuildExtractElement(builder, explicit_lod, - index0, ""); + lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type, + perquadf_bld->type, explicit_lod); } else { LLVMValueRef rho; - rho = lp_build_rho(bld, unit, ddx, ddy); + rho = lp_build_rho(bld, unit, derivs); /* * Compute lod = log2(rho) @@ -465,66 +538,72 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, if (mip_filter == PIPE_TEX_MIPFILTER_NONE || mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { - *out_lod_ipart = lp_build_ilog2(float_bld, rho); - *out_lod_fpart = bld->float_bld.zero; + *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho); + *out_lod_fpart = perquadf_bld->zero; return; } if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR && !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { - lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR, + lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR, out_lod_ipart, out_lod_fpart); return; } } if (0) { - lod = lp_build_log2(float_bld, rho); + lod = lp_build_log2(perquadf_bld, rho); } else { - lod = lp_build_fast_log2(float_bld, rho); + lod = lp_build_fast_log2(perquadf_bld, rho); } /* add shader lod bias */ if (lod_bias) { - lod_bias = LLVMBuildExtractElement(builder, lod_bias, - index0, ""); + lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type, + perquadf_bld->type, lod_bias); lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias"); } } /* add sampler lod bias */ - if (bld->static_state->lod_bias_non_zero) + if (bld->static_state->lod_bias_non_zero) { + LLVMValueRef 
sampler_lod_bias = + bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit); + sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld, + sampler_lod_bias); lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias"); - + } /* clamp lod */ if (bld->static_state->apply_max_lod) { LLVMValueRef max_lod = bld->dynamic_state->max_lod(bld->dynamic_state, bld->gallivm, unit); + max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod); - lod = lp_build_min(float_bld, lod, max_lod); + lod = lp_build_min(perquadf_bld, lod, max_lod); } if (bld->static_state->apply_min_lod) { LLVMValueRef min_lod = bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit); + min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod); - lod = lp_build_max(float_bld, lod, min_lod); + lod = lp_build_max(perquadf_bld, lod, min_lod); } } if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) { - lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR, + lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR, out_lod_ipart, out_lod_fpart); } else { - lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart); + lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart); } lp_build_name(*out_lod_fpart, "lod_fpart"); } else { - *out_lod_ipart = lp_build_iround(float_bld, lod); + *out_lod_ipart = lp_build_iround(perquadf_bld, lod); } lp_build_name(*out_lod_ipart, "lod_ipart"); @@ -536,8 +615,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, /** * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer * mipmap level index. - * Note: this is all scalar code. - * \param lod scalar float texture level of detail + * Note: this is all scalar per quad code. + * \param lod_ipart int texture level of detail * \param level_out returns integer */ void @@ -546,26 +625,27 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, LLVMValueRef lod_ipart, LLVMValueRef *level_out) { - struct lp_build_context *int_bld = &bld->int_bld; + struct lp_build_context *perquadi_bld = &bld->perquadi_bld; LLVMValueRef first_level, last_level, level; first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); last_level = bld->dynamic_state->last_level(bld->dynamic_state, bld->gallivm, unit); + first_level = lp_build_broadcast_scalar(perquadi_bld, first_level); + last_level = lp_build_broadcast_scalar(perquadi_bld, last_level); - /* convert float lod to integer */ - level = lp_build_add(int_bld, lod_ipart, first_level); + level = lp_build_add(perquadi_bld, lod_ipart, first_level); /* clamp level to legal range of levels */ - *level_out = lp_build_clamp(int_bld, level, first_level, last_level); + *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level); } /** - * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to - * two (adjacent) mipmap level indexes. Later, we'll sample from those - * two mipmap levels and interpolate between them. + * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad) + * (adjacent) mipmap level indexes, and fix up float lod part accordingly. + * Later, we'll sample from those two mipmap levels and interpolate between them. 
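 *
 * E.g. (illustrative values): with first_level 0, last_level 3 and a
 * per-quad lod_ipart of 3, level0 becomes 3 and level1 would be 4, so
 * level1 is clamped back to 3 and the corresponding lod_fpart is forced
 * to zero, ensuring no out-of-range mipmap is ever sampled.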
*/ void lp_build_linear_mip_levels(struct lp_build_sample_context *bld, @@ -576,20 +656,21 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, LLVMValueRef *level1_out) { LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context *int_bld = &bld->int_bld; - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *perquadi_bld = &bld->perquadi_bld; + struct lp_build_context *perquadf_bld = &bld->perquadf_bld; LLVMValueRef first_level, last_level; LLVMValueRef clamp_min; LLVMValueRef clamp_max; first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); - - *level0_out = lp_build_add(int_bld, lod_ipart, first_level); - *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one); - last_level = bld->dynamic_state->last_level(bld->dynamic_state, bld->gallivm, unit); + first_level = lp_build_broadcast_scalar(perquadi_bld, first_level); + last_level = lp_build_broadcast_scalar(perquadi_bld, last_level); + + *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level); + *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one); /* * Clamp both *level0_out and *level1_out to [first_level, last_level], with @@ -597,6 +678,15 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, * ends in the process. */ + /* + * This code (vector select in particular) only works with llvm 3.1 + * (if there's more than one quad, with x86 backend). Might consider + * converting to our lp_bld_logic helpers. + */ +#if HAVE_LLVM < 0x0301 + assert(perquadi_bld->type.length == 1); +#endif + /* *level0_out < first_level */ clamp_min = LLVMBuildICmp(builder, LLVMIntSLT, *level0_out, first_level, @@ -609,7 +699,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, first_level, *level1_out, ""); *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min, - float_bld->zero, *lod_fpart_inout, ""); + perquadf_bld->zero, *lod_fpart_inout, ""); /* *level0_out >= last_level */ clamp_max = LLVMBuildICmp(builder, LLVMIntSGE, @@ -623,7 +713,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, last_level, *level1_out, ""); *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max, - float_bld->zero, *lod_fpart_inout, ""); + perquadf_bld->zero, *lod_fpart_inout, ""); lp_build_name(*level0_out, "sampler%u_miplevel0", unit); lp_build_name(*level1_out, "sampler%u_miplevel1", unit); @@ -651,15 +741,6 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld, } -LLVMValueRef -lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - int level) -{ - LLVMValueRef lvl = lp_build_const_int32(bld->gallivm, level); - return lp_build_get_mipmap_level(bld, lvl); -} - - /** * Codegen equivalent for u_minify(). * Return max(1, base_size >> level); @@ -748,8 +829,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, * bld->int_size_type or bld->float_size_type) * @param coord_type type of the texture size vector (either * bld->int_coord_type or bld->coord_type) - * @param int_size vector with the integer texture size (width, height, - * depth) + * @param size vector with the texture size (width, height, depth) */ void lp_build_extract_image_sizes(struct lp_build_sample_context *bld, @@ -788,7 +868,7 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld, /** * Unnormalize coords. 
* - * @param int_size vector with the integer texture size (width, height, depth) + * @param flt_size vector with the texture size as floats (width, height, depth) */ void lp_build_unnormalized_coords(struct lp_build_sample_context *bld, @@ -823,7 +903,18 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld, /** Helper used by lp_build_cube_lookup() */ static LLVMValueRef -lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) +lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord) +{ + /* ima = +0.5 / abs(coord); */ + LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5); + LLVMValueRef absCoord = lp_build_abs(coord_bld, coord); + LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord); + return ima; +} + +/** Helper used by lp_build_cube_lookup() */ +static LLVMValueRef +lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord) { /* ima = -0.5 / abs(coord); */ LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5); @@ -832,9 +923,12 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) return ima; } - /** * Helper used by lp_build_cube_lookup() + * FIXME: the sign here can also be 0. + * Arithmetically this could definitely make a difference. Either + * fix the comment or use another (simpler) sign function, not sure + * which one it should be. * \param sign scalar +1 or -1 * \param coord float vector * \param ima float vector @@ -898,58 +992,186 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, LLVMValueRef *face_s, LLVMValueRef *face_t) { - struct lp_build_context *float_bld = &bld->float_bld; struct lp_build_context *coord_bld = &bld->coord_bld; LLVMBuilderRef builder = bld->gallivm->builder; + struct gallivm_state *gallivm = bld->gallivm; LLVMValueRef rx, ry, rz; - LLVMValueRef arx, ary, arz; - LLVMValueRef c25 = lp_build_const_float(bld->gallivm, 0.25); - LLVMValueRef arx_ge_ary, arx_ge_arz; - LLVMValueRef ary_ge_arx, ary_ge_arz; - LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz; - - assert(bld->coord_bld.type.length == 4); + LLVMValueRef tmp[4], rxyz, arxyz; /* * Use the average of the four pixels' texcoords to choose the face. + * Slight simplification: just calculate the sum, skip scaling.
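 *
 * As a sketch of the selection below: the face is chosen by the summed
 * coordinate with the largest magnitude, e.g. if |rx| >= |ry| and
 * max(|rx|, |ry|) >= |rz| the face is +/-X, with s = sign(rx) * -rz and
 * t = -ry, both scaled by ima = 0.5 / |major| and biased by +0.5 into
 * the [0, 1] range.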
*/ - rx = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, s)); - ry = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, t)); - rz = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, r)); + tmp[0] = s; + tmp[1] = t; + tmp[2] = r; + rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3); + arxyz = lp_build_abs(&bld->coord_bld, rxyz); + + if (coord_bld->type.length > 4) { + struct lp_build_context *cint_bld = &bld->int_coord_bld; + struct lp_type intctype = cint_bld->type; + LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign; + LLVMValueRef arxs, arys, arzs; + LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary; + LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz; + LLVMValueRef ryneg, rzneg; + LLVMValueRef ma, ima; + LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5); + LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype, + 1 << (intctype.width - 1)); + LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype, + intctype.width - 1); + LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X); + LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y); + LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z); + + assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1); + assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1); + assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1); + + rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), ""); + ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), ""); + rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), ""); + ryneg = LLVMBuildXor(builder, ry, signmask, ""); + rzneg = LLVMBuildXor(builder, rz, signmask, ""); + + /* the sign bit comes from the averaged vector (per quad), + * as does the decision of which face to use */ + signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), ""); + signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, ""); + + arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0); + arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1); + arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2); - arx = lp_build_abs(float_bld, rx); - ary = lp_build_abs(float_bld, ry); - arz = lp_build_abs(float_bld, rz); + /* + * select x if x >= y else select y + * select previous result if max(x,y) >= z else select z + */ + arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys); + maxarxsarys = lp_build_max(coord_bld, arxs, arys); + arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs); - /* - * Compare sign/magnitude of rx,ry,rz to determine face - */ - arx_ge_ary = LLVMBuildFCmp(builder, LLVMRealUGE, arx, ary, ""); - arx_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, arx, arz, ""); - ary_ge_arx = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arx, ""); - ary_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arz, ""); + /* + * compute all possible new s/t coords + * snewx = signrx * -rz; + * tnewx = -ry; + * snewy = rx; + * tnewy = signry * rz; + * snewz = signrz * rx; + * tnewz = -ry; + */ + signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0); + snewx = LLVMBuildXor(builder, signrxs, rzneg, ""); + tnewx = ryneg; + + signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1); + snewy = rx; + tnewy = LLVMBuildXor(builder, signrys, rz, ""); + + signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2); + snewz =
LLVMBuildXor(builder, signrzs, rx, ""); + tnewz = ryneg; + + /* XXX on x86 it is unclear if we should cast the values back to float + * or not - on some cpus (nehalem) pblendvb has twice the throughput + * of blendvps, though on others there might just be domain + * transition penalties when using it (this depends on what llvm + * will choose for the bit ops above, so there appears to be no "right way", + * but given the boatload of selects let's just use the int type). + * + * Unfortunately we also need the sign bit of the summed coords. + */ + *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy); + *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy); + ma = lp_build_select(coord_bld, arx_ge_ary, s, t); + *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey); + sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys); + + *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz); + *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz); + ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r); + *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez); + sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs); + + *face_s = LLVMBuildBitCast(builder, *face_s, + lp_build_vec_type(gallivm, coord_bld->type), ""); + *face_t = LLVMBuildBitCast(builder, *face_t, + lp_build_vec_type(gallivm, coord_bld->type), ""); + + /* add +1 for neg face */ + /* XXX with AVX we probably want to use another select here - + * as long as we ensure vblendvps gets used, we can actually + * skip the comparison and just use sign as a "mask" directly. + */ + sign = LLVMBuildLShr(builder, sign, signshift, ""); + *face = LLVMBuildOr(builder, *face, sign, "face"); - arx_ge_ary_arz = LLVMBuildAnd(builder, arx_ge_ary, arx_ge_arz, ""); - ary_ge_arx_arz = LLVMBuildAnd(builder, ary_ge_arx, ary_ge_arz, ""); + ima = lp_build_cube_imapos(coord_bld, ma); + + *face_s = lp_build_mul(coord_bld, *face_s, ima); + *face_s = lp_build_add(coord_bld, *face_s, posHalf); + *face_t = lp_build_mul(coord_bld, *face_t, ima); + *face_t = lp_build_add(coord_bld, *face_t, posHalf); + } - { + else { struct lp_build_if_state if_ctx; LLVMValueRef face_s_var; LLVMValueRef face_t_var; LLVMValueRef face_var; - - face_s_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_s_var"); - face_t_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_t_var"); - face_var = lp_build_alloca(bld->gallivm, bld->int_bld.vec_type, "face_var"); - - lp_build_if(&if_ctx, bld->gallivm, arx_ge_ary_arz); + LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz; + LLVMValueRef shuffles[4]; + LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz; + LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz; + struct lp_build_context *float_bld = &bld->float_bld; + + assert(bld->coord_bld.type.length == 4); + + shuffles[0] = lp_build_const_int32(gallivm, 0); + shuffles[1] = lp_build_const_int32(gallivm, 1); + shuffles[2] = lp_build_const_int32(gallivm, 0); + shuffles[3] = lp_build_const_int32(gallivm, 1); + arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), ""); + shuffles[0] = lp_build_const_int32(gallivm, 1); + shuffles[1] = lp_build_const_int32(gallivm, 0); + shuffles[2] = lp_build_const_int32(gallivm, 2); + shuffles[3] = lp_build_const_int32(gallivm, 2); + aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), ""); + arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz); + + shuffles[0] = lp_build_const_int32(gallivm, 0); + 
shuffles[1] = lp_build_const_int32(gallivm, 1); + arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz, + LLVMConstVector(shuffles, 2), ""); + shuffles[0] = lp_build_const_int32(gallivm, 2); + shuffles[1] = lp_build_const_int32(gallivm, 3); + arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz, + LLVMConstVector(shuffles, 2), ""); + arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, ""); + + arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz, + lp_build_const_int32(gallivm, 0), ""); + arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz, + lp_build_const_int32(gallivm, 0), ""); + ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz, + lp_build_const_int32(gallivm, 1), ""); + ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz, + lp_build_const_int32(gallivm, 0), ""); + face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var"); + face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var"); + face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var"); + + lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz); { /* +/- X face */ - LLVMValueRef sign = lp_build_sgn(float_bld, rx); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, s); + LLVMValueRef sign, ima; + rx = LLVMBuildExtractElement(builder, rxyz, + lp_build_const_int32(gallivm, 0), ""); + /* +/- X face */ + sign = lp_build_sgn(float_bld, rx); + ima = lp_build_cube_imaneg(coord_bld, s); *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima); *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); *face = lp_build_cube_face(bld, rx, @@ -963,11 +1185,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, { struct lp_build_if_state if_ctx2; - lp_build_if(&if_ctx2, bld->gallivm, ary_ge_arx_arz); + lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz); { + LLVMValueRef sign, ima; /* +/- Y face */ - LLVMValueRef sign = lp_build_sgn(float_bld, ry); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, t); + ry = LLVMBuildExtractElement(builder, rxyz, + lp_build_const_int32(gallivm, 1), ""); + sign = lp_build_sgn(float_bld, ry); + ima = lp_build_cube_imaneg(coord_bld, t); *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima); *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima); *face = lp_build_cube_face(bld, ry, @@ -980,8 +1205,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, lp_build_else(&if_ctx2); { /* +/- Z face */ - LLVMValueRef sign = lp_build_sgn(float_bld, rz); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, r); + LLVMValueRef sign, ima; + rz = LLVMBuildExtractElement(builder, rxyz, + lp_build_const_int32(gallivm, 2), ""); + sign = lp_build_sgn(float_bld, rz); + ima = lp_build_cube_imaneg(coord_bld, r); *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima); *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); *face = lp_build_cube_face(bld, rz, @@ -999,6 +1227,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, *face_s = LLVMBuildLoad(builder, face_s_var, "face_s"); *face_t = LLVMBuildLoad(builder, face_t_var, "face_t"); *face = LLVMBuildLoad(builder, face_var, "face"); + *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face); } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index dad138abee0..0f3d8ae6cb5 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -52,6 +52,15 @@ 
struct lp_build_context; /** + * Helper struct holding all derivatives needed for sampling + */ +struct lp_derivatives +{ + LLVMValueRef ddx_ddy[2]; +}; + + +/** * Sampler static state. * * These are the bits of state from pipe_resource and pipe_sampler_state that @@ -192,6 +201,9 @@ struct lp_build_sample_context /* See texture_dims() */ unsigned dims; + /** SIMD vector width */ + unsigned vector_width; + /** regular scalar float type */ struct lp_type float_type; struct lp_build_context float_bld; @@ -199,7 +211,7 @@ struct lp_build_sample_context /** float vector type */ struct lp_build_context float_vec_bld; - /** regular scalar float type */ + /** regular scalar int type */ struct lp_type int_type; struct lp_build_context int_bld; @@ -223,10 +235,15 @@ struct lp_build_sample_context struct lp_type texel_type; struct lp_build_context texel_bld; + /** Float per-quad type */ + struct lp_type perquadf_type; + struct lp_build_context perquadf_bld; + + /** Int per-quad type */ + struct lp_type perquadi_type; + struct lp_build_context perquadi_bld; + /* Common dynamic state values */ - LLVMValueRef width; - LLVMValueRef height; - LLVMValueRef depth; LLVMValueRef row_stride_array; LLVMValueRef img_stride_array; LLVMValueRef data_array; @@ -305,8 +322,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ unsigned mip_filter, @@ -331,10 +347,6 @@ LLVMValueRef lp_build_get_mipmap_level(struct lp_build_sample_context *bld, LLVMValueRef level); -LLVMValueRef -lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - int level); - void lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, @@ -402,22 +414,35 @@ lp_build_sample_soa(struct gallivm_state *gallivm, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, LLVMValueRef explicit_lod, LLVMValueRef texel_out[4]); + +void +lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld, + LLVMValueRef coord_f, + LLVMValueRef length_i, + LLVMValueRef length_f, + LLVMValueRef *coord0_i, + LLVMValueRef *weight_f); + + void lp_build_size_query_soa(struct gallivm_state *gallivm, const struct lp_sampler_static_state *static_state, struct lp_sampler_dynamic_state *dynamic_state, + struct lp_type int_type, unsigned unit, LLVMValueRef explicit_lod, LLVMValueRef *sizes_out); void -lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type, +lp_build_sample_nop(struct gallivm_state *gallivm, + struct lp_type type, + unsigned num_coords, + const LLVMValueRef *coords, LLVMValueRef texel_out[4]); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index 74858bc9718..ad1b29cf096 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -27,7 +27,7 @@ /** * @file - * Texture sampling -- SoA. + * Texture sampling -- AoS. 
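+ * (AoS: the r,g,b,a channels of each texel stay packed together in one + * vector, unlike the SoA path which keeps a separate vector per channel.)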
* * @author Jose Fonseca <jfonseca@vmware.com> * @author Brian Paul <brianp@vmware.com> @@ -40,6 +40,7 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_format.h" +#include "util/u_cpu_detect.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -75,6 +76,7 @@ static void lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, unsigned block_length, LLVMValueRef coord, + LLVMValueRef coord_f, LLVMValueRef length, LLVMValueRef stride, boolean is_pot, @@ -93,10 +95,11 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, if(is_pot) coord = LLVMBuildAnd(builder, coord, length_minus_one, ""); else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); - coord = LLVMBuildAdd(builder, coord, bias, ""); - coord = LLVMBuildURem(builder, coord, length, ""); + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length); + coord = lp_build_fract_safe(coord_bld, coord_f); + coord = lp_build_mul(coord_bld, coord, length_f); + coord = lp_build_itrunc(coord_bld, coord); } break; @@ -121,6 +124,56 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, /** + * Build LLVM code for texture coord wrapping, for nearest filtering, + * for float texcoords. + * \param coord the incoming texcoord (s,t,r or q) + * \param length the texture size along one dimension + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param icoord the texcoord after wrapping, as int + */ +static void +lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld, + LLVMValueRef coord, + LLVMValueRef length, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *icoord) +{ + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMValueRef length_minus_one; + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + /* take fraction, unnormalize */ + coord = lp_build_fract_safe(coord_bld, coord); + coord = lp_build_mul(coord_bld, coord, length); + *icoord = lp_build_itrunc(coord_bld, coord); + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one); + if (bld->static_state->normalized_coords) { + /* scale coord to length */ + coord = lp_build_mul(coord_bld, coord, length); + } + coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, + length_minus_one); + *icoord = lp_build_itrunc(coord_bld, coord); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + } +} + + +/** * Build LLVM code for texture coord wrapping, for linear filtering, * for scaled integer texcoords. * \param block_length is the length of the pixel block along the @@ -139,6 +192,8 @@ static void lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, unsigned block_length, LLVMValueRef coord0, + LLVMValueRef *weight_i, + LLVMValueRef coord_f, LLVMValueRef length, LLVMValueRef stride, boolean is_pot, @@ -153,58 +208,85 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, LLVMValueRef length_minus_one; LLVMValueRef lmask, umask, mask; - if (block_length != 1) { - /* - * If the pixel block covers more than one pixel then there is no easy - * way to calculate offset1 relative to offset0. 
Instead, compute them - * independently. - */ - - LLVMValueRef coord1; - - lp_build_sample_wrap_nearest_int(bld, - block_length, - coord0, - length, - stride, - is_pot, - wrap_mode, - offset0, i0); - - coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + /* + * If the pixel block covers more than one pixel then there is no easy + * way to calculate offset1 relative to offset0. Instead, compute them + * independently. Otherwise, try to compute offset0 and offset1 with + * a single stride multiplication. + */ - lp_build_sample_wrap_nearest_int(bld, - block_length, - coord1, - length, - stride, - is_pot, - wrap_mode, - offset1, i1); + length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); + if (block_length != 1) { + LLVMValueRef coord1; + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if (is_pot) { + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, ""); + coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, ""); + } + else { + LLVMValueRef mask; + LLVMValueRef weight; + LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length); + lp_build_coord_repeat_npot_linear(bld, coord_f, + length, length_f, + &coord0, &weight); + mask = lp_build_compare(bld->gallivm, int_coord_bld->type, + PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); + coord1 = LLVMBuildAnd(builder, + lp_build_add(int_coord_bld, coord0, + int_coord_bld->one), + mask, ""); + weight = lp_build_mul_imm(&bld->coord_bld, weight, 256); + *weight_i = lp_build_itrunc(&bld->coord_bld, weight); + } + break; + + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero, + length_minus_one); + coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero, + length_minus_one); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + coord0 = int_coord_bld->zero; + coord1 = int_coord_bld->zero; + break; + } + lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride, + offset0, i0); + lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride, + offset1, i1); return; } - /* - * Scalar pixels -- try to compute offset0 and offset1 with a single stride - * multiplication. 
- */ - *i0 = int_coord_bld->zero; *i1 = int_coord_bld->zero; - length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); - switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: if (is_pot) { coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, ""); } else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); - coord0 = LLVMBuildAdd(builder, coord0, bias, ""); - coord0 = LLVMBuildURem(builder, coord0, length, ""); + LLVMValueRef weight; + LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length); + lp_build_coord_repeat_npot_linear(bld, coord_f, + length, length_f, + &coord0, &weight); + weight = lp_build_mul_imm(&bld->coord_bld, weight, 256); + *weight_i = lp_build_itrunc(&bld->coord_bld, weight); } mask = lp_build_compare(bld->gallivm, int_coord_bld->type, @@ -217,6 +299,11 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, break; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + /* XXX this might be slower than the separate path + * on some newer cpus. With sse41 this is 8 instructions vs. 7 + * - at least on SNB this is almost certainly slower since + * min/max are cheaper than selects, and the muls aren't bad. + */ lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero); umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, @@ -249,6 +336,176 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, /** + * Build LLVM code for texture coord wrapping, for linear filtering, + * for float texcoords. + * \param block_length is the length of the pixel block along the + * coordinate axis + * \param coord the incoming texcoord (s,t,r or q) + * \param length the texture size along one dimension + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param coord0 the first texcoord after wrapping, as int + * \param coord1 the second texcoord after wrapping, as int + * \param weight the filter weight as int (0-255) + * \param force_nearest if this coord actually uses nearest filtering + */ +static void +lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef length, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *coord0, + LLVMValueRef *coord1, + LLVMValueRef *weight, + unsigned force_nearest) +{ + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5); + LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one); + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if (is_pot) { + /* mul by size and subtract 0.5 */ + coord = lp_build_mul(coord_bld, coord, length); + if (!force_nearest) + coord = lp_build_sub(coord_bld, coord, half); + *coord1 = lp_build_add(coord_bld, coord, coord_bld->one); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, coord0, weight); + *coord1 = lp_build_ifloor(coord_bld, *coord1); + /* repeat wrap */ + length_minus_one = lp_build_itrunc(coord_bld, length_minus_one); + *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, ""); + *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, ""); + } + else { + LLVMValueRef mask; + /* wrap with normalized floats is just fract */ + coord = 
lp_build_fract(coord_bld, coord); + /* unnormalize */ + coord = lp_build_mul(coord_bld, coord, length); + /* + * we avoided the 0.5/length division, have to fix up wrong + * edge cases with selects + */ + *coord1 = lp_build_add(coord_bld, coord, half); + coord = lp_build_sub(coord_bld, coord, half); + *weight = lp_build_fract(coord_bld, coord); + mask = lp_build_compare(coord_bld->gallivm, coord_bld->type, + PIPE_FUNC_LESS, coord, coord_bld->zero); + *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord); + *coord0 = lp_build_itrunc(coord_bld, *coord0); + mask = lp_build_compare(coord_bld->gallivm, coord_bld->type, + PIPE_FUNC_LESS, *coord1, length); + *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero); + *coord1 = lp_build_itrunc(coord_bld, *coord1); + } + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + if (bld->static_state->normalized_coords) { + /* mul by tex size */ + coord = lp_build_mul(coord_bld, coord, length); + } + /* subtract 0.5 */ + if (!force_nearest) { + coord = lp_build_sub(coord_bld, coord, half); + } + /* clamp to [0, length - 1] */ + coord = lp_build_min(coord_bld, coord, length_minus_one); + coord = lp_build_max(coord_bld, coord, coord_bld->zero); + *coord1 = lp_build_add(coord_bld, coord, coord_bld->one); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, coord0, weight); + /* coord1 = min(coord1, length-1) */ + *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one); + *coord1 = lp_build_itrunc(coord_bld, *coord1); + break; + default: + assert(0); + *coord0 = int_coord_bld->zero; + *coord1 = int_coord_bld->zero; + *weight = coord_bld->zero; + break; + } + *weight = lp_build_mul_imm(coord_bld, *weight, 256); + *weight = lp_build_itrunc(coord_bld, *weight); + return; +} + + +/** + * Fetch texels for image with nearest sampling. + * Return filtered color as two vectors of 16-bit fixed point values. + */ +static void +lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, + LLVMValueRef data_ptr, + LLVMValueRef offset, + LLVMValueRef x_subcoord, + LLVMValueRef y_subcoord, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + /* + * Fetch the pixels as 4 x 32bit (rgba order might differ): + * + * rgba0 rgba1 rgba2 rgba3 + * + * bit cast them into 16 x u8 + * + * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + * + * unpack them into two 8 x i16: + * + * r0 g0 b0 a0 r1 g1 b1 a1 + * r2 g2 b2 a2 r3 g3 b3 a3 + * + * The higher 8 bits of the resulting elements will be zero. + */ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef rgba8; + struct lp_build_context h16, u8n; + LLVMTypeRef u8n_vec_type; + + lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); + lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width)); + u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); + + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. 
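+ * (The gather fetches one 32-bit texel per element; the result is then + * bitcast to a u8 vector so it can be unpacked to 16 bits per channel.)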
+ */ + rgba8 = lp_build_gather(bld->gallivm, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset); + + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, + bld->format_desc, + u8n.type, + data_ptr, offset, + x_subcoord, + y_subcoord); + } + + /* Expand one 4*rgba8 to two 2*rgba16 */ + lp_build_unpack2(bld->gallivm, u8n.type, h16.type, + rgba8, + colors_lo, colors_hi); +} + + +/** * Sample a single texture image with nearest sampling. * If sampling a cube texture, r = cube face in [0,5]. * Return filtered color as two vectors of 16-bit fixed point values. @@ -267,21 +524,19 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, { const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context i32, h16, u8n; - LLVMTypeRef i32_vec_type, u8n_vec_type; + struct lp_build_context i32; + LLVMTypeRef i32_vec_type; LLVMValueRef i32_c8; LLVMValueRef width_vec, height_vec, depth_vec; LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL; + LLVMValueRef s_float, t_float = NULL, r_float = NULL; LLVMValueRef x_stride; LLVMValueRef x_offset, offset; LLVMValueRef x_subcoord, y_subcoord, z_subcoord; - lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32)); - lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16)); - lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8)); + lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width)); i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type); - u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); lp_build_extract_image_sizes(bld, bld->int_size_type, @@ -291,6 +546,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, &height_vec, &depth_vec); + s_float = s; t_float = t; r_float = r; + if (bld->static_state->normalized_coords) { LLVMValueRef scaled_size; LLVMValueRef flt_size; @@ -334,7 +591,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, /* Do texcoord wrapping, compute texel offset */ lp_build_sample_wrap_nearest_int(bld, bld->format_desc->block.width, - s_ipart, width_vec, x_stride, + s_ipart, s_float, + width_vec, x_stride, bld->static_state->pot_width, bld->static_state->wrap_s, &x_offset, &x_subcoord); @@ -343,7 +601,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef y_offset; lp_build_sample_wrap_nearest_int(bld, bld->format_desc->block.height, - t_ipart, height_vec, row_stride_vec, + t_ipart, t_float, + height_vec, row_stride_vec, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset, &y_subcoord); @@ -352,7 +611,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef z_offset; lp_build_sample_wrap_nearest_int(bld, 1, /* block length (depth) */ - r_ipart, depth_vec, img_stride_vec, + r_ipart, r_float, + depth_vec, img_stride_vec, bld->static_state->pot_depth, bld->static_state->wrap_r, &z_offset, &z_subcoord); @@ -366,6 +626,196 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, } } + lp_build_sample_fetch_image_nearest(bld, data_ptr, offset, + x_subcoord, y_subcoord, + colors_lo, colors_hi); +} + + +/** + * Sample a single texture image with nearest sampling. + * If sampling a cube texture, r = cube face in [0,5]. + * Return filtered color as two vectors of 16-bit fixed point values. + * Does address calcs (except offsets) with floats. 
+ * Useful for AVX which has support for 8x32 floats but not 8x32 ints. + */ +static void +lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld, + LLVMValueRef int_size, + LLVMValueRef row_stride_vec, + LLVMValueRef img_stride_vec, + LLVMValueRef data_ptr, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) + { + const unsigned dims = bld->dims; + LLVMValueRef width_vec, height_vec, depth_vec; + LLVMValueRef offset; + LLVMValueRef x_subcoord, y_subcoord; + LLVMValueRef x_icoord, y_icoord, z_icoord; + LLVMValueRef flt_size; + + flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size); + + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &width_vec, + &height_vec, + &depth_vec); + + /* Do texcoord wrapping */ + lp_build_sample_wrap_nearest_float(bld, + s, width_vec, + bld->static_state->pot_width, + bld->static_state->wrap_s, + &x_icoord); + + if (dims >= 2) { + lp_build_sample_wrap_nearest_float(bld, + t, height_vec, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_icoord); + + if (dims >= 3) { + lp_build_sample_wrap_nearest_float(bld, + r, depth_vec, + bld->static_state->pot_depth, + bld->static_state->wrap_r, + &z_icoord); + } + else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + z_icoord = r; + } + } + + /* + * From here on we deal with ints, and we should split up the 256bit + * vectors manually for better generated code. + */ + + /* + * compute texel offsets - + * cannot do offset calc with floats, difficult for block-based formats, + * and not enough precision anyway. + */ + lp_build_sample_offset(&bld->int_coord_bld, + bld->format_desc, + x_icoord, y_icoord, + z_icoord, + row_stride_vec, img_stride_vec, + &offset, + &x_subcoord, &y_subcoord); + + lp_build_sample_fetch_image_nearest(bld, data_ptr, offset, + x_subcoord, y_subcoord, + colors_lo, colors_hi); +} + + +/** + * Fetch texels for image with linear sampling. + * Return filtered color as two vectors of 16-bit fixed point values. 
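+ * The filter weights are expected in 8.8 fixed point (only the low 8 bits + * are used), so each 1-D lerp step below is, per channel, roughly + * v0 + ((weight * (v1 - v0)) >> 8) - a sketch of the intent; the exact + * instruction sequence is whatever lp_build_lerp emits for the h16 type.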
+ */ +static void +lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld, + LLVMValueRef data_ptr, + LLVMValueRef offset[2][2][2], + LLVMValueRef x_subcoord[2], + LLVMValueRef y_subcoord[2], + LLVMValueRef s_fpart, + LLVMValueRef t_fpart, + LLVMValueRef r_fpart, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + const unsigned dims = bld->dims; + LLVMBuilderRef builder = bld->gallivm->builder; + struct lp_build_context h16, u8n; + LLVMTypeRef h16_vec_type, u8n_vec_type; + LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); + LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffle_lo, shuffle_hi; + LLVMValueRef s_fpart_lo, s_fpart_hi; + LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL; + LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL; + LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */ + LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */ + LLVMValueRef packed_lo, packed_hi; + unsigned i, j, k; + unsigned numj, numk; + + lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); + lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width)); + h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type); + u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); + + /* + * Transform 4 x i32 in + * + * s_fpart = {s0, s1, s2, s3} + * + * into 8 x i16 + * + * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3} + * + * into two 8 x i16 + * + * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1} + * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3} + * + * and likewise for t_fpart. There is no risk of losing precision here + * since the fractional parts only use the lower 8 bits. + */ + s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, ""); + if (dims >= 2) + t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, ""); + if (dims >= 3) + r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, ""); + + for (j = 0; j < h16.type.length; j += 4) { +#ifdef PIPE_ARCH_LITTLE_ENDIAN + unsigned subindex = 0; +#else + unsigned subindex = 1; +#endif + LLVMValueRef index; + + index = LLVMConstInt(elem_type, j/2 + subindex, 0); + for (i = 0; i < 4; ++i) + shuffles_lo[j + i] = index; + + index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0); + for (i = 0; i < 4; ++i) + shuffles_hi[j + i] = index; + } + + shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length); + shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length); + + s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, + shuffle_lo, ""); + s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, + shuffle_hi, ""); + if (dims >= 2) { + t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, + shuffle_lo, ""); + t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, + shuffle_hi, ""); + } + if (dims >= 3) { + r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, + shuffle_lo, ""); + r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, + shuffle_hi, ""); + } + /* * Fetch the pixels as 4 x 32bit (rgba order might differ): * * rgba0 rgba1 rgba2 rgba3 * * bit cast them into 16 x u8 * * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 * * unpack them into two 8 x i16: * * r0 g0 b0 a0 r1 g1 b1 a1 * r2 g2 b2 a2 r3 g3 b3 a3 * * The higher 8 bits of the resulting elements will be zero. */ - { - LLVMValueRef rgba8; + numj = 1 + (dims >= 2); + numk = 1 + (dims >= 3); - if (util_format_is_rgba8_variant(bld->format_desc)) { - /* - * Given the format is a rgba8, just read the pixels as is, - * without any swizzling. Swizzling will be done later. 
- */ - rgba8 = lp_build_gather(bld->gallivm, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset); + for (k = 0; k < numk; k++) { + for (j = 0; j < numj; j++) { + for (i = 0; i < 2; i++) { + LLVMValueRef rgba8; + + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. + */ + rgba8 = lp_build_gather(bld->gallivm, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset[k][j][i]); + + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, + bld->format_desc, + u8n.type, + data_ptr, offset[k][j][i], + x_subcoord[i], + y_subcoord[j]); + } - rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + /* Expand one 4*rgba8 to two 2*rgba16 */ + lp_build_unpack2(bld->gallivm, u8n.type, h16.type, + rgba8, + &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]); + } } - else { - rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, - bld->format_desc, - u8n.type, - data_ptr, offset, - x_subcoord, - y_subcoord); + } + + /* + * Linear interpolation with 8.8 fixed point. + */ + if (bld->static_state->force_nearest_s) { + /* special case 1-D lerp */ + packed_lo = lp_build_lerp(&h16, + t_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + t_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); + } + else if (bld->static_state->force_nearest_t) { + /* special case 1-D lerp */ + packed_lo = lp_build_lerp(&h16, + s_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + s_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); + } + else { + /* general 1/2/3-D lerping */ + if (dims == 1) { + packed_lo = lp_build_lerp(&h16, + s_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + s_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); } + else { + /* 2-D lerp */ + packed_lo = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1], + neighbors_lo[0][1][0], + neighbors_lo[0][1][1]); + + packed_hi = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1], + neighbors_hi[0][1][0], + neighbors_hi[0][1][1]); + + if (dims >= 3) { + LLVMValueRef packed_lo2, packed_hi2; + + /* lerp in the second z slice */ + packed_lo2 = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[1][0][0], + neighbors_lo[1][0][1], + neighbors_lo[1][1][0], + neighbors_lo[1][1][1]); - /* Expand one 4*rgba8 to two 2*rgba16 */ - lp_build_unpack2(bld->gallivm, u8n.type, h16.type, - rgba8, - colors_lo, colors_hi); + packed_hi2 = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[1][0][0], + neighbors_hi[1][0][1], + neighbors_hi[1][1][0], + neighbors_hi[1][1][1]); + /* interp between two z slices */ + packed_lo = lp_build_lerp(&h16, r_fpart_lo, + packed_lo, packed_lo2); + packed_hi = lp_build_lerp(&h16, r_fpart_hi, + packed_hi, packed_hi2); + } + } } -} + *colors_lo = packed_lo; + *colors_hi = packed_hi; +} /** * Sample a single texture image with (bi-)(tri-)linear sampling. 
@@ -433,33 +974,24 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, { const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; - struct lp_build_context i32, h16, u8n; - LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; + struct lp_build_context i32; + LLVMTypeRef i32_vec_type; LLVMValueRef i32_c8, i32_c128, i32_c255; LLVMValueRef width_vec, height_vec, depth_vec; - LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; - LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL; - LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL; + LLVMValueRef s_ipart, s_fpart, s_float; + LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL; + LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL; LLVMValueRef x_stride, y_stride, z_stride; LLVMValueRef x_offset0, x_offset1; LLVMValueRef y_offset0, y_offset1; LLVMValueRef z_offset0, z_offset1; LLVMValueRef offset[2][2][2]; /* [z][y][x] */ LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2]; - LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */ - LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */ - LLVMValueRef packed_lo, packed_hi; unsigned x, y, z; - unsigned i, j, k; - unsigned numj, numk; - lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32)); - lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16)); - lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8)); + lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width)); i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type); - h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type); - u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); lp_build_extract_image_sizes(bld, bld->int_size_type, @@ -469,6 +1001,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, &height_vec, &depth_vec); + s_float = s; t_float = t; r_float = r; + if (bld->static_state->normalized_coords) { LLVMValueRef scaled_size; LLVMValueRef flt_size; @@ -533,7 +1067,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, /* do texcoord wrapping and compute texel offsets */ lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.width, - s_ipart, width_vec, x_stride, + s_ipart, &s_fpart, s_float, + width_vec, x_stride, bld->static_state->pot_width, bld->static_state->wrap_s, &x_offset0, &x_offset1, @@ -548,7 +1083,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, if (dims >= 2) { lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.height, - t_ipart, height_vec, y_stride, + t_ipart, &t_fpart, t_float, + height_vec, y_stride, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset0, &y_offset1, @@ -567,7 +1103,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, if (dims >= 3) { lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.height, - r_ipart, depth_vec, z_stride, + r_ipart, &r_fpart, r_float, + depth_vec, z_stride, bld->static_state->pot_depth, bld->static_state->wrap_r, &z_offset0, &z_offset1, @@ -593,212 +1130,175 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, } } - /* - * Transform 4 x i32 in - * - * s_fpart = {s0, s1, s2, s3} - * - * into 8 x i16 - * - * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3} - * - * into two 8 x i16 - * - * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1} - * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3} - * - * and likewise for t_fpart. 
There is no risk of loosing precision here - * since the fractional parts only use the lower 8bits. - */ - s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, ""); - if (dims >= 2) - t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, ""); - if (dims >= 3) - r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, ""); + lp_build_sample_fetch_image_linear(bld, data_ptr, offset, + x_subcoord, y_subcoord, + s_fpart, t_fpart, r_fpart, + colors_lo, colors_hi); +} - { - LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); - LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH]; - LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; - LLVMValueRef shuffle_lo; - LLVMValueRef shuffle_hi; - for (j = 0; j < h16.type.length; j += 4) { -#ifdef PIPE_ARCH_LITTLE_ENDIAN - unsigned subindex = 0; -#else - unsigned subindex = 1; -#endif - LLVMValueRef index; +/** + * Sample a single texture image with (bi-)(tri-)linear sampling. + * Return filtered color as two vectors of 16-bit fixed point values. + * Does address calcs (except offsets) with floats. + * Useful for AVX which has support for 8x32 floats but not 8x32 ints. + */ +static void +lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld, + LLVMValueRef int_size, + LLVMValueRef row_stride_vec, + LLVMValueRef img_stride_vec, + LLVMValueRef data_ptr, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + const unsigned dims = bld->dims; + LLVMValueRef width_vec, height_vec, depth_vec; + LLVMValueRef s_fpart; + LLVMValueRef t_fpart = NULL; + LLVMValueRef r_fpart = NULL; + LLVMValueRef x_stride, y_stride, z_stride; + LLVMValueRef x_offset0, x_offset1; + LLVMValueRef y_offset0, y_offset1; + LLVMValueRef z_offset0, z_offset1; + LLVMValueRef offset[2][2][2]; /* [z][y][x] */ + LLVMValueRef x_subcoord[2], y_subcoord[2]; + LLVMValueRef flt_size; + LLVMValueRef x_icoord0, x_icoord1; + LLVMValueRef y_icoord0, y_icoord1; + LLVMValueRef z_icoord0, z_icoord1; + unsigned x, y, z; - index = LLVMConstInt(elem_type, j/2 + subindex, 0); - for (i = 0; i < 4; ++i) - shuffles_lo[j + i] = index; + flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size); - index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0); - for (i = 0; i < 4; ++i) - shuffles_hi[j + i] = index; - } + lp_build_extract_image_sizes(bld, + bld->float_size_type, + bld->coord_type, + flt_size, + &width_vec, + &height_vec, + &depth_vec); - shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length); - shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length); + /* do texcoord wrapping and compute texel offsets */ + lp_build_sample_wrap_linear_float(bld, + bld->format_desc->block.width, + s, width_vec, + bld->static_state->pot_width, + bld->static_state->wrap_s, + &x_icoord0, &x_icoord1, + &s_fpart, + bld->static_state->force_nearest_s); + + if (dims >= 2) { + lp_build_sample_wrap_linear_float(bld, + bld->format_desc->block.height, + t, height_vec, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_icoord0, &y_icoord1, + &t_fpart, + bld->static_state->force_nearest_t); - s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, - shuffle_lo, ""); - s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, - shuffle_hi, ""); - if (dims >= 2) { - t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, - shuffle_lo, ""); - t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, - shuffle_hi, ""); - } if (dims >= 3) { - r_fpart_lo = 
LLVMBuildShuffleVector(builder, r_fpart, h16.undef, - shuffle_lo, ""); - r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, - shuffle_hi, ""); + lp_build_sample_wrap_linear_float(bld, + bld->format_desc->block.height, + r, depth_vec, + bld->static_state->pot_depth, + bld->static_state->wrap_r, + &z_icoord0, &z_icoord1, + &r_fpart, 0); } } /* - * Fetch the pixels as 4 x 32bit (rgba order might differ): - * - * rgba0 rgba1 rgba2 rgba3 - * - * bit cast them into 16 x u8 - * - * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 - * - * unpack them into two 8 x i16: - * - * r0 g0 b0 a0 r1 g1 b1 a1 - * r2 g2 b2 a2 r3 g3 b3 a3 - * - * The higher 8 bits of the resulting elements will be zero. + * From here on we deal with ints, and we should split up the 256bit + * vectors manually for better generated code. */ - numj = 1 + (dims >= 2); - numk = 1 + (dims >= 3); - for (k = 0; k < numk; k++) { - for (j = 0; j < numj; j++) { - for (i = 0; i < 2; i++) { - LLVMValueRef rgba8; - - if (util_format_is_rgba8_variant(bld->format_desc)) { - /* - * Given the format is a rgba8, just read the pixels as is, - * without any swizzling. Swizzling will be done later. - */ - rgba8 = lp_build_gather(bld->gallivm, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset[k][j][i]); - - rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); - } - else { - rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, - bld->format_desc, - u8n.type, - data_ptr, offset[k][j][i], - x_subcoord[i], - y_subcoord[j]); - } - - /* Expand one 4*rgba8 to two 2*rgba16 */ - lp_build_unpack2(bld->gallivm, u8n.type, h16.type, - rgba8, - &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]); - } - } - } + /* get pixel, row and image strides */ + x_stride = lp_build_const_vec(bld->gallivm, + bld->int_coord_bld.type, + bld->format_desc->block.bits/8); + y_stride = row_stride_vec; + z_stride = img_stride_vec; /* - * Linear interpolation with 8.8 fixed point. + * compute texel offset - + * cannot do offset calc with floats, difficult for block-based formats, + * and not enough precision anyway. 
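+ * (a single precision float carries only 24 significand bits, while byte + * offsets into large mipmapped textures may need the full 32 bits)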
*/ - if (bld->static_state->force_nearest_s) { - /* special case 1-D lerp */ - packed_lo = lp_build_lerp(&h16, - t_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - - packed_hi = lp_build_lerp(&h16, - t_fpart_hi, - neighbors_hi[0][1][0], - neighbors_hi[0][1][0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.width, + x_icoord0, x_stride, + &x_offset0, &x_subcoord[0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.width, + x_icoord1, x_stride, + &x_offset1, &x_subcoord[1]); + for (z = 0; z < 2; z++) { + for (y = 0; y < 2; y++) { + offset[z][y][0] = x_offset0; + offset[z][y][1] = x_offset1; + } } - else if (bld->static_state->force_nearest_t) { - /* special case 1-D lerp */ - packed_lo = lp_build_lerp(&h16, - s_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - packed_hi = lp_build_lerp(&h16, - s_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1]); + if (dims >= 2) { + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.height, + y_icoord0, y_stride, + &y_offset0, &y_subcoord[0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + bld->format_desc->block.height, + y_icoord1, y_stride, + &y_offset1, &y_subcoord[1]); + for (z = 0; z < 2; z++) { + for (x = 0; x < 2; x++) { + offset[z][0][x] = lp_build_add(&bld->int_coord_bld, + offset[z][0][x], y_offset0); + offset[z][1][x] = lp_build_add(&bld->int_coord_bld, + offset[z][1][x], y_offset1); + } + } } - else { - /* general 1/2/3-D lerping */ - if (dims == 1) { - packed_lo = lp_build_lerp(&h16, - s_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1]); - packed_hi = lp_build_lerp(&h16, - s_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1]); + if (dims >= 3) { + LLVMValueRef z_subcoord[2]; + lp_build_sample_partial_offset(&bld->int_coord_bld, + 1, + z_icoord0, z_stride, + &z_offset0, &z_subcoord[0]); + lp_build_sample_partial_offset(&bld->int_coord_bld, + 1, + z_icoord1, z_stride, + &z_offset1, &z_subcoord[1]); + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + offset[0][y][x] = lp_build_add(&bld->int_coord_bld, + offset[0][y][x], z_offset0); + offset[1][y][x] = lp_build_add(&bld->int_coord_bld, + offset[1][y][x], z_offset1); + } } - else { - /* 2-D lerp */ - packed_lo = lp_build_lerp_2d(&h16, - s_fpart_lo, t_fpart_lo, - neighbors_lo[0][0][0], - neighbors_lo[0][0][1], - neighbors_lo[0][1][0], - neighbors_lo[0][1][1]); - - packed_hi = lp_build_lerp_2d(&h16, - s_fpart_hi, t_fpart_hi, - neighbors_hi[0][0][0], - neighbors_hi[0][0][1], - neighbors_hi[0][1][0], - neighbors_hi[0][1][1]); - - if (dims >= 3) { - LLVMValueRef packed_lo2, packed_hi2; - - /* lerp in the second z slice */ - packed_lo2 = lp_build_lerp_2d(&h16, - s_fpart_lo, t_fpart_lo, - neighbors_lo[1][0][0], - neighbors_lo[1][0][1], - neighbors_lo[1][1][0], - neighbors_lo[1][1][1]); - - packed_hi2 = lp_build_lerp_2d(&h16, - s_fpart_hi, t_fpart_hi, - neighbors_hi[1][0][0], - neighbors_hi[1][0][1], - neighbors_hi[1][1][0], - neighbors_hi[1][1][1]); - /* interp between two z slices */ - packed_lo = lp_build_lerp(&h16, r_fpart_lo, - packed_lo, packed_lo2); - packed_hi = lp_build_lerp(&h16, r_fpart_hi, - packed_hi, packed_hi2); + } + else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + LLVMValueRef z_offset; + z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec); + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + /* The r coord is the cube face in [0,5] */ + offset[0][y][x] = lp_build_add(&bld->int_coord_bld, + 
offset[0][y][x], z_offset); } } } - *colors_lo = packed_lo; - *colors_hi = packed_hi; + lp_build_sample_fetch_image_linear(bld, data_ptr, offset, + x_subcoord, y_subcoord, + s_fpart, t_fpart, r_fpart, + colors_lo, colors_hi); } @@ -824,10 +1324,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMBuilderRef builder = bld->gallivm->builder; LLVMValueRef size0; LLVMValueRef size1; - LLVMValueRef row_stride0_vec; - LLVMValueRef row_stride1_vec; - LLVMValueRef img_stride0_vec; - LLVMValueRef img_stride1_vec; + LLVMValueRef row_stride0_vec = NULL; + LLVMValueRef row_stride1_vec = NULL; + LLVMValueRef img_stride0_vec = NULL; + LLVMValueRef img_stride1_vec = NULL; LLVMValueRef data_ptr0; LLVMValueRef data_ptr1; LLVMValueRef colors0_lo, colors0_hi; @@ -838,20 +1338,39 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, &size0, &row_stride0_vec, &img_stride0_vec); data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0); - if (img_filter == PIPE_TEX_FILTER_NEAREST) { - lp_build_sample_image_nearest(bld, - size0, - row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, - &colors0_lo, &colors0_hi); + if (util_cpu_caps.has_avx && bld->coord_type.length > 4) { + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest_afloat(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } + else { + assert(img_filter == PIPE_TEX_FILTER_LINEAR); + lp_build_sample_image_linear_afloat(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } } else { - assert(img_filter == PIPE_TEX_FILTER_LINEAR); - lp_build_sample_image_linear(bld, - size0, - row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, - &colors0_lo, &colors0_hi); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } + else { + assert(img_filter == PIPE_TEX_FILTER_LINEAR); + lp_build_sample_image_linear(bld, + size0, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } } /* Store the first level's colors in the output variables */ @@ -859,74 +1378,138 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMBuildStore(builder, colors0_hi, colors_hi_var); if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - LLVMValueRef h16_scale = lp_build_const_float(bld->gallivm, 256.0); - LLVMTypeRef i32_type = LLVMIntTypeInContext(bld->gallivm->context, 32); + LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm, + bld->perquadf_bld.type, 256.0); + LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type); struct lp_build_if_state if_ctx; LLVMValueRef need_lerp; + unsigned num_quads = bld->coord_bld.type.length / 4; + unsigned i; - lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, ""); - lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16"); + lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, ""); + lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16"); /* need_lerp = lod_fpart > 0 */ - need_lerp = LLVMBuildICmp(builder, LLVMIntSGT, - lod_fpart, LLVMConstNull(i32_type), - "need_lerp"); + if (num_quads == 1) { + need_lerp = LLVMBuildICmp(builder, LLVMIntSGT, + lod_fpart, bld->perquadi_bld.zero, + "need_lerp"); + } + else { + /* + * We'll do mip filtering if any of the quads need it. 
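+ * A single branch covers all the quads, so quads whose lod_fpart ended + * up as zero may get fetched and filtered needlessly.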
+ * It might be better to split the vectors here and only fetch/filter + * quads which need it. */ + /* + * We need to clamp lod_fpart here since we can get negative + * values which would screw up filtering if not all + * lod_fpart values have the same sign. + * We can however then skip the greater-than comparison. + */ + lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart, + bld->perquadi_bld.zero); + need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart); + } lp_build_if(&if_ctx, bld->gallivm, need_lerp); { struct lp_build_context h16_bld; - lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16)); + lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); /* sample the second mipmap level */ lp_build_mipmap_level_sizes(bld, ilevel1, &size1, &row_stride1_vec, &img_stride1_vec); data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1); - if (img_filter == PIPE_TEX_FILTER_NEAREST) { - lp_build_sample_image_nearest(bld, - size1, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); + + if (util_cpu_caps.has_avx && bld->coord_type.length > 4) { + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest_afloat(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + else { + lp_build_sample_image_linear_afloat(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } } else { - lp_build_sample_image_linear(bld, - size1, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, - &colors1_lo, &colors1_hi); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + else { + lp_build_sample_image_linear(bld, + size1, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } } /* interpolate samples from the two mipmap levels */ - lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, ""); - lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart); + if (num_quads == 1) { + lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, ""); + lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart); #if HAVE_LLVM == 0x208 - /* This is a work-around for a bug in LLVM 2.8. - * Evidently, something goes wrong in the construction of the - * lod_fpart short[8] vector. Adding this no-effect shuffle seems - * to force the vector to be properly constructed. - * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f). - */ - { - LLVMValueRef shuffles[8], shuffle; - int i; - assert(h16_bld.type.length <= Elements(shuffles)); - for (i = 0; i < h16_bld.type.length; i++) - shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1)); - shuffle = LLVMConstVector(shuffles, h16_bld.type.length); - lod_fpart = LLVMBuildShuffleVector(builder, - lod_fpart, lod_fpart, - shuffle, ""); - } + /* This is a work-around for a bug in LLVM 2.8. + * Evidently, something goes wrong in the construction of the + * lod_fpart short[8] vector. Adding this no-effect shuffle seems + * to force the vector to be properly constructed. + * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f). 
+ */ + { + LLVMValueRef shuffles[8], shuffle; + assert(h16_bld.type.length <= Elements(shuffles)); + for (i = 0; i < h16_bld.type.length; i++) + shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1)); + shuffle = LLVMConstVector(shuffles, h16_bld.type.length); + lod_fpart = LLVMBuildShuffleVector(builder, + lod_fpart, lod_fpart, + shuffle, ""); + } #endif - colors0_lo = lp_build_lerp(&h16_bld, lod_fpart, - colors0_lo, colors1_lo); - colors0_hi = lp_build_lerp(&h16_bld, lod_fpart, - colors0_hi, colors1_hi); + colors0_lo = lp_build_lerp(&h16_bld, lod_fpart, + colors0_lo, colors1_lo); + colors0_hi = lp_build_lerp(&h16_bld, lod_fpart, + colors0_hi, colors1_hi); + } + else { + LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16]; + struct lp_type perquadi16_type = bld->perquadi_bld.type; + perquadi16_type.width /= 2; + perquadi16_type.length *= 2; + lod_fpart = LLVMBuildBitCast(builder, lod_fpart, + lp_build_vec_type(bld->gallivm, + perquadi16_type), ""); + /* XXX this only works for exactly 2 quads. More quads need shuffle */ + assert(num_quads == 2); + for (i = 0; i < num_quads; i++) { + LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2); + lod_parts[i] = lp_build_extract_broadcast(bld->gallivm, + perquadi16_type, + h16_bld.type, + lod_fpart, + indexi2); + } + colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0], + colors0_lo, colors1_lo); + colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1], + colors0_hi, colors1_hi); + } LLVMBuildStore(builder, colors0_lo, colors_lo_var); LLVMBuildStore(builder, colors0_hi, colors_hi_var); @@ -948,10 +1531,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, - LLVMValueRef lod_bias, /* optional */ - LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef lod_ipart, + LLVMValueRef lod_fpart, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef texel_out[4]) { struct lp_build_context *int_bld = &bld->int_bld; @@ -960,14 +1543,9 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; const unsigned dims = bld->dims; - LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; - LLVMValueRef ilevel0, ilevel1 = NULL; LLVMValueRef packed, packed_lo, packed_hi; LLVMValueRef unswizzled[4]; - LLVMValueRef face_ddx[4], face_ddy[4]; struct lp_build_context h16_bld; - LLVMValueRef first_level; - LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0); /* we only support the common/simple wrap modes at this time */ assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s)); @@ -978,81 +1556,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, /* make 16-bit fixed-pt builder context */ - lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16)); - - /* cube face selection, compute pre-face coords, etc. 
*/ - if (bld->static_state->target == PIPE_TEXTURE_CUBE) { - LLVMValueRef face, face_s, face_t; - lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); - s = face_s; /* vec */ - t = face_t; /* vec */ - /* use 'r' to indicate cube face */ - r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ - - /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); - face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t); - face_ddx[2] = NULL; - face_ddx[3] = NULL; - face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); - face_ddy[2] = NULL; - face_ddy[3] = NULL; - ddx = face_ddx; - ddy = face_ddy; - } - - /* - * Compute the level of detail (float). - */ - if (min_filter != mag_filter || - mip_filter != PIPE_TEX_MIPFILTER_NONE) { - /* Need to compute lod either to choose mipmap levels or to - * distinguish between minification/magnification with one mipmap level. - */ - lp_build_lod_selector(bld, unit, ddx, ddy, - lod_bias, explicit_lod, - mip_filter, - &lod_ipart, &lod_fpart); - } else { - lod_ipart = i32t_zero; - } - - /* - * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1 - */ - switch (mip_filter) { - default: - assert(0 && "bad mip_filter value in lp_build_sample_aos()"); - /* fall-through */ - case PIPE_TEX_MIPFILTER_NONE: - /* always use mip level 0 */ - if (bld->static_state->target == PIPE_TEXTURE_CUBE) { - /* XXX this is a work-around for an apparent bug in LLVM 2.7. - * We should be able to set ilevel0 = const(0) but that causes - * bad x86 code to be emitted. - */ - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); - } - else { - first_level = bld->dynamic_state->first_level(bld->dynamic_state, - bld->gallivm, unit); - ilevel0 = first_level; - } - break; - case PIPE_TEX_MIPFILTER_NEAREST: - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); - break; - case PIPE_TEX_MIPFILTER_LINEAR: - assert(lod_ipart); - assert(lod_fpart); - lp_build_linear_mip_levels(bld, unit, - lod_ipart, &lod_fpart, - &ilevel0, &ilevel1); - break; - } + lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width)); /* * Get/interpolate texture colors. 
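With cube face selection, lod computation and mip level choice hoisted out of the aos path, a caller now looks roughly like this. A sketch only: lp_build_sample_common() is static in lp_bld_sample_soa.c, so this wrapper is purely illustrative.

#include "lp_bld_sample.h"
#include "lp_bld_sample_aos.h"

static void
sample_texture_aos(struct lp_build_sample_context *bld, unsigned unit,
                   LLVMValueRef s, LLVMValueRef t, LLVMValueRef r,
                   const struct lp_derivatives *derivs,
                   LLVMValueRef lod_bias, LLVMValueRef explicit_lod,
                   LLVMValueRef texel_out[4])
{
   LLVMValueRef lod_ipart, lod_fpart, ilevel0, ilevel1;

   /* shared setup: cube face selection, lod, integer mip levels */
   lp_build_sample_common(bld, unit, &s, &t, &r, derivs,
                          lod_bias, explicit_lod,
                          &lod_ipart, &lod_fpart, &ilevel0, &ilevel1);
   /* fixed-point filtering with precomputed levels */
   lp_build_sample_aos(bld, unit, s, t, r, lod_ipart, lod_fpart,
                       ilevel0, ilevel1, texel_out);
}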
@@ -1062,7 +1566,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi"); if (min_filter == mag_filter) { - /* no need to distinquish between minification and magnification */ + /* no need to distinguish between minification and magnification */ lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, @@ -1106,7 +1610,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, * into 'packed' */ packed = lp_build_pack2(bld->gallivm, - h16_bld.type, lp_type_unorm(8), + h16_bld.type, lp_type_unorm(8, bld->vector_width), LLVMBuildLoad(builder, packed_lo, ""), LLVMBuildLoad(builder, packed_hi, "")); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h index 5d9ecac4d50..55b3bc1c09a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h @@ -46,10 +46,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, - LLVMValueRef lod_bias, /* optional */ - LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef lod_ipart, + LLVMValueRef lod_fpart, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef texel_out[4]); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 73dc3e77083..aaef7970635 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -41,6 +41,7 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_format.h" +#include "util/u_cpu_detect.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -57,6 +58,7 @@ #include "lp_bld_sample_aos.h" #include "lp_bld_struct.h" #include "lp_bld_quad.h" +#include "lp_bld_pack.h" /** @@ -221,6 +223,41 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld, /** + * Helper to compute the first coord and the weight for + * linear wrap repeat npot textures + */ +void +lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld, + LLVMValueRef coord_f, + LLVMValueRef length_i, + LLVMValueRef length_f, + LLVMValueRef *coord0_i, + LLVMValueRef *weight_f) +{ + struct lp_build_context *coord_bld = &bld->coord_bld; + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5); + LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i, + int_coord_bld->one); + LLVMValueRef mask; + /* wrap with normalized floats is just fract */ + coord_f = lp_build_fract(coord_bld, coord_f); + /* mul by size and subtract 0.5 */ + coord_f = lp_build_mul(coord_bld, coord_f, length_f); + coord_f = lp_build_sub(coord_bld, coord_f, half); + /* + * we avoided the 0.5/length division before the repeat wrap, + * now need to fix up edge cases with selects + */ + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f); + mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, + PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero); + *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i); +} + + +/** * Build LLVM code for texture wrap mode for linear filtering. 
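A scalar model of the new lp_build_coord_repeat_npot_linear() helper above, showing the fract/scale/floor-split and the negative-texel fixup (sketch, not patch code):

#include <math.h>

static void
coord_repeat_npot_linear(float coord, int length,
                         int *coord0, float *weight)
{
   float f = coord - floorf(coord);   /* repeat wrap == fract */
   f = f * (float)length - 0.5f;      /* unnormalize, center on texels */
   *coord0 = (int)floorf(f);          /* ifloor */
   *weight = f - floorf(f);           /* fract -> lerp weight */
   if (*coord0 < 0)
      *coord0 = length - 1;           /* fix up the left edge case */
}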
* \param x0_out returns first integer texcoord * \param x1_out returns second integer texcoord @@ -246,28 +283,27 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: - /* mul by size and subtract 0.5 */ - coord = lp_build_mul(coord_bld, coord, length_f); - coord = lp_build_sub(coord_bld, coord, half); - /* convert to int, compute lerp weight */ - lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); - /* repeat wrap */ if (is_pot) { + /* mul by size and subtract 0.5 */ + coord = lp_build_mul(coord_bld, coord, length_f); + coord = lp_build_sub(coord_bld, coord, half); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + /* repeat wrap */ coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, ""); coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, ""); } else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); LLVMValueRef mask; - coord0 = LLVMBuildAdd(builder, coord0, bias, ""); - coord0 = LLVMBuildURem(builder, coord0, length, ""); - mask = lp_build_compare(bld->gallivm, int_coord_bld->type, + lp_build_coord_repeat_npot_linear(bld, coord, + length, length_f, + &coord0, &weight); + mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); coord1 = LLVMBuildAnd(builder, - lp_build_add(int_coord_bld, coord0, int_coord_bld->one), - mask, ""); + lp_build_add(int_coord_bld, coord0, int_coord_bld->one), + mask, ""); } break; @@ -444,15 +480,16 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, switch(wrap_mode) { case PIPE_TEX_WRAP_REPEAT: - coord = lp_build_mul(coord_bld, coord, length_f); - icoord = lp_build_ifloor(coord_bld, coord); - if (is_pot) + if (is_pot) { + coord = lp_build_mul(coord_bld, coord, length_f); + icoord = lp_build_ifloor(coord_bld, coord); icoord = LLVMBuildAnd(builder, icoord, length_minus_one, ""); + } else { - /* Add a bias to the texcoord to handle negative coords */ - LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024); - icoord = LLVMBuildAdd(builder, icoord, bias, ""); - icoord = LLVMBuildURem(builder, icoord, length, ""); + /* take fraction, unnormalize */ + coord = lp_build_fract_safe(coord_bld, coord); + coord = lp_build_mul(coord_bld, coord, length_f); + icoord = lp_build_itrunc(coord_bld, coord); } break; @@ -473,7 +510,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, break; case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */ + /* Note: this is the same as CLAMP_TO_EDGE, except min = -1 */ { LLVMValueRef min, max; @@ -873,12 +910,32 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { struct lp_build_if_state if_ctx; LLVMValueRef need_lerp; + unsigned num_quads = bld->coord_bld.type.length / 4; /* need_lerp = lod_fpart > 0 */ - need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT, - lod_fpart, - bld->float_bld.zero, - "need_lerp"); + if (num_quads == 1) { + need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT, + lod_fpart, bld->perquadf_bld.zero, + "need_lerp"); + } + else { + /* + * We'll do mip filtering if any of the quads need it. + * It might be better to split the vectors here and only fetch/filter + * quads which need it. 
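The reworked npot nearest repeat path above boils down to a fract, an unnormalize and a truncate; a scalar sketch, where the final guard stands in for what lp_build_fract_safe() guarantees (the product never reaching length):

#include <math.h>

static int
wrap_nearest_repeat_npot(float coord, int length)
{
   float f = coord - floorf(coord);         /* fract */
   int icoord = (int)(f * (float)length);   /* itrunc */
   if (icoord >= length)                    /* "safe" clamp */
      icoord = length - 1;
   return icoord;
}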
+ */ + /* + * We unfortunately need to clamp lod_fpart here since we can get + * negative values which would screw up filtering if not all + * lod_fpart values have same sign. + */ + lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart, + bld->perquadf_bld.zero); + need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type, + PIPE_FUNC_GREATER, + lod_fpart, bld->perquadf_bld.zero); + need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp); + } lp_build_if(&if_ctx, bld->gallivm, need_lerp); { @@ -904,7 +961,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, /* interpolate samples from the two mipmap levels */ - lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart); + lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm, + bld->perquadf_bld.type, + bld->texel_bld.type, + lod_fpart); for (chan = 0; chan < 4; chan++) { colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart, @@ -916,37 +976,28 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, } } - - /** - * General texture sampling codegen. - * This function handles texture sampling for all texture targets (1D, - * 2D, 3D, cube) and all filtering modes. + * Calculate cube face, lod, mip levels. */ static void -lp_build_sample_general(struct lp_build_sample_context *bld, - unsigned unit, - LLVMValueRef s, - LLVMValueRef t, - LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, - LLVMValueRef lod_bias, /* optional */ - LLVMValueRef explicit_lod, /* optional */ - LLVMValueRef *colors_out) +lp_build_sample_common(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef *s, + LLVMValueRef *t, + LLVMValueRef *r, + const struct lp_derivatives *derivs, + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef *lod_ipart, + LLVMValueRef *lod_fpart, + LLVMValueRef *ilevel0, + LLVMValueRef *ilevel1) { - struct lp_build_context *int_bld = &bld->int_bld; - LLVMBuilderRef builder = bld->gallivm->builder; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; - LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; - LLVMValueRef ilevel0, ilevel1 = NULL; - LLVMValueRef face_ddx[4], face_ddy[4]; - LLVMValueRef texels[4]; LLVMValueRef first_level; - LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0); - unsigned chan; + struct lp_derivatives face_derivs; /* printf("%s mip %d min %d mag %d\n", __FUNCTION__, @@ -958,23 +1009,16 @@ lp_build_sample_general(struct lp_build_sample_context *bld, */ if (bld->static_state->target == PIPE_TEXTURE_CUBE) { LLVMValueRef face, face_s, face_t; - lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); - s = face_s; /* vec */ - t = face_t; /* vec */ + lp_build_cube_lookup(bld, *s, *t, *r, &face, &face_s, &face_t); + *s = face_s; /* vec */ + *t = face_t; /* vec */ /* use 'r' to indicate cube face */ - r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ + *r = face; /* vec */ /* recompute ddx, ddy using the new (s,t) face texcoords */ - face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s); - face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t); - face_ddx[2] = NULL; - face_ddx[3] = NULL; - face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s); - face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t); - face_ddy[2] = NULL; - face_ddy[3] = NULL; - ddx = face_ddx; - ddy = face_ddy; + face_derivs.ddx_ddy[0] = 
lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, *s, *t); + face_derivs.ddx_ddy[1] = NULL; + derivs = &face_derivs; } /* @@ -985,12 +1029,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lp_build_lod_selector(bld, unit, ddx, ddy, + lp_build_lod_selector(bld, unit, derivs, lod_bias, explicit_lod, mip_filter, - &lod_ipart, &lod_fpart); + lod_ipart, lod_fpart); } else { - lod_ipart = i32t_zero; + *lod_ipart = bld->perquadi_bld.zero; } /* @@ -1006,28 +1050,56 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* XXX this is a work-around for an apparent bug in LLVM 2.7. * We should be able to set ilevel0 = const(0) but that causes * bad x86 code to be emitted. + * XXX should probably disable that on other llvm versions. */ - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + assert(*lod_ipart); + lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0); } else { first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm, unit); - ilevel0 = first_level; + first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level); + *ilevel0 = first_level; } break; case PIPE_TEX_MIPFILTER_NEAREST: - assert(lod_ipart); - lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + assert(*lod_ipart); + lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0); break; case PIPE_TEX_MIPFILTER_LINEAR: - assert(lod_ipart); - assert(lod_fpart); + assert(*lod_ipart); + assert(*lod_fpart); lp_build_linear_mip_levels(bld, unit, - lod_ipart, &lod_fpart, - &ilevel0, &ilevel1); + *lod_ipart, lod_fpart, + ilevel0, ilevel1); break; } +} + +/** + * General texture sampling codegen. + * This function handles texture sampling for all texture targets (1D, + * 2D, 3D, cube) and all filtering modes. + */ +static void +lp_build_sample_general(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef lod_ipart, + LLVMValueRef lod_fpart, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, + LLVMValueRef *colors_out) +{ + struct lp_build_context *int_bld = &bld->int_bld; + LLVMBuilderRef builder = bld->gallivm->builder; + const unsigned mip_filter = bld->static_state->min_mip_filter; + const unsigned min_filter = bld->static_state->min_img_filter; + const unsigned mag_filter = bld->static_state->mag_img_filter; + LLVMValueRef texels[4]; + unsigned chan; /* * Get/interpolate texture colors. @@ -1039,7 +1111,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld, } if (min_filter == mag_filter) { - /* no need to distinquish between minification and magnification */ + /* no need to distinguish between minification and magnification */ lp_build_sample_mipmap(bld, unit, min_filter, mip_filter, s, t, r, @@ -1135,7 +1207,10 @@ lp_build_sample_compare(struct lp_build_sample_context *bld, * For debugging. */ void -lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type, +lp_build_sample_nop(struct gallivm_state *gallivm, + struct lp_type type, + unsigned num_coords, + const LLVMValueRef *coords, LLVMValueRef texel_out[4]) { LLVMValueRef one = lp_build_one(gallivm, type); @@ -1152,8 +1227,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type, * 'texel' will return a vector of four LLVMValueRefs corresponding to * R, G, B, A. * \param type vector float type to use for coords, etc. 
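A rough scalar model of the mip level choice made in the switch above; rounding and clamping against first/last level are done by lp_build_nearest_mip_level()/lp_build_linear_mip_levels() and are omitted from this sketch:

#include "pipe/p_defines.h"

static void
choose_mip_levels(unsigned mip_filter, int first_level,
                  int lod_ipart, int *level0, int *level1)
{
   switch (mip_filter) {
   case PIPE_TEX_MIPFILTER_NONE:
      *level0 = first_level;              /* always base level */
      break;
   case PIPE_TEX_MIPFILTER_NEAREST:
      *level0 = first_level + lod_ipart;  /* single level */
      break;
   case PIPE_TEX_MIPFILTER_LINEAR:
      *level0 = first_level + lod_ipart;  /* lerp between level0 ... */
      *level1 = *level0 + 1;              /* ... and level1 by lod_fpart */
      break;
   }
}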
- * \param ddx partial derivatives of (s,t,r,q) with respect to x - * \param ddy partial derivatives of (s,t,r,q) with respect to y + * \param derivs partial derivatives of (s,t,r,q) with respect to x and y */ void lp_build_sample_soa(struct gallivm_state *gallivm, @@ -1163,8 +1237,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], + const struct lp_derivatives *derivs, LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef texel_out[4]) @@ -1173,10 +1246,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm, struct lp_build_sample_context bld; LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef tex_width, tex_height, tex_depth; LLVMValueRef s; LLVMValueRef t; LLVMValueRef r; - struct lp_type float_vec_type; if (0) { enum pipe_format fmt = static_state->format; @@ -1193,6 +1266,8 @@ lp_build_sample_soa(struct gallivm_state *gallivm, bld.format_desc = util_format_description(static_state->format); bld.dims = dims; + bld.vector_width = lp_type_width(type); + bld.float_type = lp_type_float(32); bld.int_type = lp_type_int(32); bld.coord_type = type; @@ -1201,22 +1276,26 @@ lp_build_sample_soa(struct gallivm_state *gallivm, bld.float_size_type.length = dims > 1 ? 4 : 1; bld.int_size_type = lp_int_type(bld.float_size_type); bld.texel_type = type; - - float_vec_type = lp_type_float_vec(32); + bld.perquadf_type = type; + /* we want native vector size to be able to use our intrinsics */ + bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1; + bld.perquadi_type = lp_int_type(bld.perquadf_type); lp_build_context_init(&bld.float_bld, gallivm, bld.float_type); - lp_build_context_init(&bld.float_vec_bld, gallivm, float_vec_type); + lp_build_context_init(&bld.float_vec_bld, gallivm, type); lp_build_context_init(&bld.int_bld, gallivm, bld.int_type); lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type); lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type); lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type); lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type); lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type); + lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type); + lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type); /* Get the dynamic state */ - bld.width = dynamic_state->width(dynamic_state, gallivm, unit); - bld.height = dynamic_state->height(dynamic_state, gallivm, unit); - bld.depth = dynamic_state->depth(dynamic_state, gallivm, unit); + tex_width = dynamic_state->width(dynamic_state, gallivm, unit); + tex_height = dynamic_state->height(dynamic_state, gallivm, unit); + tex_depth = dynamic_state->depth(dynamic_state, gallivm, unit); bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm, unit); bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm, unit); bld.data_array = dynamic_state->data_ptr(dynamic_state, gallivm, unit); @@ -1228,37 +1307,40 @@ lp_build_sample_soa(struct gallivm_state *gallivm, /* width, height, depth as single int vector */ if (dims <= 1) { - bld.int_size = bld.width; + bld.int_size = tex_width; } else { bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef, - bld.width, LLVMConstInt(i32t, 0, 0), ""); + tex_width, LLVMConstInt(i32t, 0, 0), ""); if (dims >= 2) { 
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, - bld.height, LLVMConstInt(i32t, 1, 0), ""); + tex_height, LLVMConstInt(i32t, 1, 0), ""); if (dims >= 3) { bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, - bld.depth, LLVMConstInt(i32t, 2, 0), ""); + tex_depth, LLVMConstInt(i32t, 2, 0), ""); } } } if (0) { /* For debug: no-op texture sampling */ - lp_build_sample_nop(gallivm, bld.texel_type, texel_out); - } - else if (util_format_fits_8unorm(bld.format_desc) && - lp_is_simple_wrap_mode(static_state->wrap_s) && - lp_is_simple_wrap_mode(static_state->wrap_t)) { - /* do sampling/filtering with fixed pt arithmetic */ - lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy, - lod_bias, explicit_lod, + lp_build_sample_nop(gallivm, + bld.texel_type, + num_coords, + coords, texel_out); } - else { + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; + LLVMValueRef ilevel0 = NULL, ilevel1 = NULL; + unsigned num_quads = type.length / 4; + const unsigned mip_filter = bld.static_state->min_mip_filter; + boolean use_aos = util_format_fits_8unorm(bld.format_desc) && + lp_is_simple_wrap_mode(static_state->wrap_s) && + lp_is_simple_wrap_mode(static_state->wrap_t); + if ((gallivm_debug & GALLIVM_DEBUG_PERF) && - util_format_fits_8unorm(bld.format_desc)) { + !use_aos && util_format_fits_8unorm(bld.format_desc)) { debug_printf("%s: using floating point linear filtering for %s\n", __FUNCTION__, bld.format_desc->short_name); debug_printf(" min_img %d mag_img %d mip %d wraps %d wrapt %d\n", @@ -1269,9 +1351,203 @@ lp_build_sample_soa(struct gallivm_state *gallivm, static_state->wrap_t); } - lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy, - lod_bias, explicit_lod, - texel_out); + lp_build_sample_common(&bld, unit, + &s, &t, &r, + derivs, lod_bias, explicit_lod, + &lod_ipart, &lod_fpart, + &ilevel0, &ilevel1); + + /* + * we only try 8-wide sampling with soa as it appears to + * be a loss with aos with AVX. 
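When the 8-wide path can't be used, the sampler below falls back to splitting the coordinates into 4-wide quads, sampling each, and concatenating the results; the split itself has this shape (a scalar sketch of what lp_build_extract_range() does):

static void
split_into_quads(const float *coords, unsigned num_quads,
                 float quads[][4])
{
   unsigned q, i;
   for (q = 0; q < num_quads; q++)
      for (i = 0; i < 4; i++)
         quads[q][i] = coords[q * 4 + i];  /* extract_range(v, 4*q, 4) */
}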
+ */ + if (num_quads == 1 || (mip_filter == PIPE_TEX_MIPFILTER_NONE && + !use_aos)) { + + if (num_quads > 1) { + LLVMValueRef index0 = lp_build_const_int32(gallivm, 0); + /* These parameters are the same for all quads */ + lod_ipart = LLVMBuildExtractElement(builder, lod_ipart, index0, ""); + ilevel0 = LLVMBuildExtractElement(builder, ilevel0, index0, ""); + } + if (use_aos) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld, unit, + s, t, r, + lod_ipart, lod_fpart, + ilevel0, ilevel1, + texel_out); + } + + else { + lp_build_sample_general(&bld, unit, + s, t, r, + lod_ipart, lod_fpart, + ilevel0, ilevel1, + texel_out); + } + } + else { + struct lp_build_if_state if_ctx; + LLVMValueRef notsame_levels, notsame; + LLVMValueRef index0 = lp_build_const_int32(gallivm, 0); + LLVMValueRef texels[4]; + LLVMValueRef texelout[4]; + unsigned j; + + texels[0] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texr"); + texels[1] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texg"); + texels[2] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texb"); + texels[3] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texa"); + + /* only build the if if we MAY split, otherwise always split */ + if (!use_aos) { + notsame = lp_build_extract_broadcast(gallivm, + bld.perquadi_bld.type, + bld.perquadi_bld.type, + ilevel0, index0); + notsame = lp_build_sub(&bld.perquadi_bld, ilevel0, notsame); + notsame_levels = lp_build_any_true_range(&bld.perquadi_bld, num_quads, + notsame); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + notsame = lp_build_extract_broadcast(gallivm, + bld.perquadi_bld.type, + bld.perquadi_bld.type, + ilevel1, index0); + notsame = lp_build_sub(&bld.perquadi_bld, ilevel1, notsame); + notsame = lp_build_any_true_range(&bld.perquadi_bld, num_quads, notsame); + notsame_levels = LLVMBuildOr(builder, notsame_levels, notsame, ""); + } + lp_build_if(&if_ctx, gallivm, notsame_levels); + } + + { + struct lp_build_sample_context bld4; + struct lp_type type4 = type; + unsigned i; + LLVMValueRef texelout4[4]; + LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16]; + + type4.length = 4; + + /* Setup our build context */ + memset(&bld4, 0, sizeof bld4); + bld4.gallivm = bld.gallivm; + bld4.static_state = bld.static_state; + bld4.dynamic_state = bld.dynamic_state; + bld4.format_desc = bld.format_desc; + bld4.dims = bld.dims; + bld4.row_stride_array = bld.row_stride_array; + bld4.img_stride_array = bld.img_stride_array; + bld4.data_array = bld.data_array; + bld4.int_size = bld.int_size; + + bld4.vector_width = lp_type_width(type4); + + bld4.float_type = lp_type_float(32); + bld4.int_type = lp_type_int(32); + bld4.coord_type = type4; + bld4.int_coord_type = lp_int_type(type4); + bld4.float_size_type = lp_type_float(32); + bld4.float_size_type.length = dims > 1 ? 
4 : 1; + bld4.int_size_type = lp_int_type(bld4.float_size_type); + bld4.texel_type = type4; + bld4.perquadf_type = type4; + /* we want native vector size to be able to use our intrinsics */ + bld4.perquadf_type.length = 1; + bld4.perquadi_type = lp_int_type(bld4.perquadf_type); + + lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type); + lp_build_context_init(&bld4.float_vec_bld, gallivm, type4); + lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type); + lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type); + lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type); + lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type); + lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type); + lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type); + lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type); + lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type); + + for (i = 0; i < num_quads; i++) { + LLVMValueRef s4, t4, r4; + LLVMValueRef lod_iparts, lod_fparts = NULL; + LLVMValueRef ilevel0s, ilevel1s = NULL; + LLVMValueRef indexi = lp_build_const_int32(gallivm, i); + + s4 = lp_build_extract_range(gallivm, s, 4*i, 4); + t4 = lp_build_extract_range(gallivm, t, 4*i, 4); + r4 = lp_build_extract_range(gallivm, r, 4*i, 4); + lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, ""); + ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, ""); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, ""); + lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, ""); + } + + if (use_aos) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld4, unit, + s4, t4, r4, + lod_iparts, lod_fparts, + ilevel0s, ilevel1s, + texelout4); + } + + else { + lp_build_sample_general(&bld4, unit, + s4, t4, r4, + lod_iparts, lod_fparts, + ilevel0s, ilevel1s, + texelout4); + } + for (j = 0; j < 4; j++) { + texelouttmp[j][i] = texelout4[j]; + } + } + for (j = 0; j < 4; j++) { + texelout[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads); + LLVMBuildStore(builder, texelout[j], texels[j]); + } + } + if (!use_aos) { + LLVMValueRef ilevel0s, lod_iparts, ilevel1s = NULL; + + lp_build_else(&if_ctx); + + /* These parameters are the same for all quads */ + lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, index0, ""); + ilevel0s = LLVMBuildExtractElement(builder, ilevel0, index0, ""); + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + ilevel1s = LLVMBuildExtractElement(builder, ilevel1, index0, ""); + } + + if (use_aos) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld, unit, + s, t, r, + lod_iparts, lod_fpart, + ilevel0s, ilevel1s, + texelout); + } + + else { + lp_build_sample_general(&bld, unit, + s, t, r, + lod_iparts, lod_fpart, + ilevel0s, ilevel1s, + texelout); + } + for (j = 0; j < 4; j++) { + LLVMBuildStore(builder, texelout[j], texels[j]); + } + + lp_build_endif(&if_ctx); + } + + for (j = 0; j < 4; j++) { + texel_out[j] = LLVMBuildLoad(builder, texels[j], ""); + } + } } lp_build_sample_compare(&bld, r, texel_out); @@ -1283,6 +1559,7 @@ void lp_build_size_query_soa(struct gallivm_state *gallivm, const struct lp_sampler_static_state *static_state, struct lp_sampler_dynamic_state *dynamic_state, + struct lp_type int_type, unsigned unit, LLVMValueRef explicit_lod, LLVMValueRef *sizes_out) @@ -1311,7 +1588,9 @@ lp_build_size_query_soa(struct 
gallivm_state *gallivm, return; } - lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32)); + assert(!int_type.floating); + + lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128)); if (explicit_lod) { LLVMValueRef first_level; @@ -1345,7 +1624,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, size = lp_build_minify(&bld_int_vec, size, lod); for (i=0; i < dims; i++) { - sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, bld_int_vec.type, + sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, int_type, size, lp_build_const_int32(gallivm, i)); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c index 5d4406812c7..641c960431d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c @@ -40,6 +40,7 @@ #include "lp_bld_init.h" #include "lp_bld_logic.h" #include "lp_bld_swizzle.h" +#include "lp_bld_pack.h" LLVMValueRef @@ -95,7 +96,7 @@ lp_build_broadcast_scalar(struct lp_build_context *bld, /** - * Combined extract and broadcast (or a mere shuffle when the two types match) + * Combined extract and broadcast (mere shuffle in most cases) */ LLVMValueRef lp_build_extract_broadcast(struct gallivm_state *gallivm, @@ -132,9 +133,9 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm, } } else { - if (dst_type.length == src_type.length) { + if (dst_type.length > 1) { /* - * Special shuffle of the same size. + * shuffle - result can be of different length. */ LLVMValueRef shuffle; @@ -142,28 +143,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm, LLVMVectorType(i32t, dst_type.length), index); res = LLVMBuildShuffleVector(gallivm->builder, vector, - LLVMGetUndef(lp_build_vec_type(gallivm, dst_type)), + LLVMGetUndef(lp_build_vec_type(gallivm, src_type)), shuffle, ""); } else { - LLVMValueRef scalar; - scalar = LLVMBuildExtractElement(gallivm->builder, vector, index, ""); - if (dst_type.length == 1) { - /* - * Trivial extract scalar from vector. - */ - - res = scalar; - } - else { - /* - * General case of different sized vectors. - */ - - res = lp_build_broadcast(gallivm, - lp_build_vec_type(gallivm, dst_type), - vector); - } + /* + * Trivial extract scalar from vector. 
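A scalar model of what lp_build_extract_broadcast() now covers in all the cases above: pick one source element and splat it across a destination of possibly different length (sketch, not patch code):

static void
extract_broadcast(const int *src, unsigned index,
                  int *dst, unsigned dst_length)
{
   unsigned i;
   for (i = 0; i < dst_length; i++)
      dst[i] = src[index];
}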
+ */ + res = LLVMBuildExtractElement(gallivm->builder, vector, index, ""); } } @@ -290,6 +277,8 @@ lp_build_swizzle_aos(struct lp_build_context *bld, return bld->zero; case PIPE_SWIZZLE_ONE: return bld->one; + case LP_BLD_SWIZZLE_DONTCARE: + return bld->undef; default: assert(0); return bld->undef; @@ -319,21 +308,26 @@ lp_build_swizzle_aos(struct lp_build_context *bld, case PIPE_SWIZZLE_BLUE: case PIPE_SWIZZLE_ALPHA: shuffle = j + swizzles[i]; + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); break; case PIPE_SWIZZLE_ZERO: shuffle = type.length + 0; + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); if (!aux[0]) { aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0); } break; case PIPE_SWIZZLE_ONE: shuffle = type.length + 1; + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); if (!aux[1]) { aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0); } break; + case LP_BLD_SWIZZLE_DONTCARE: + shuffles[j + i] = LLVMGetUndef(i32t); + break; } - shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); } } @@ -508,3 +502,127 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld, lp_build_swizzle_soa(bld, unswizzled, swizzles, values); } + + +/** + * Transpose from AOS <-> SOA + * + * @param single_type_lp type of pixels + * @param src the 4 * n pixel input + * @param dst the 4 * n pixel output + */ +void +lp_build_transpose_aos(struct gallivm_state *gallivm, + struct lp_type single_type_lp, + const LLVMValueRef src[4], + LLVMValueRef dst[4]) +{ + struct lp_type double_type_lp = single_type_lp; + LLVMTypeRef single_type; + LLVMTypeRef double_type; + LLVMValueRef t0, t1, t2, t3; + + double_type_lp.length >>= 1; + double_type_lp.width <<= 1; + + double_type = lp_build_vec_type(gallivm, double_type_lp); + single_type = lp_build_vec_type(gallivm, single_type_lp); + + /* Interleave x, y, z, w -> xy and zw */ + t0 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 0); + t1 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 0); + t2 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 1); + t3 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 1); + + /* Cast to double width type for second interleave */ + t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0"); + t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1"); + t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2"); + t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3"); + + /* Interleave xy, zw -> xyzw */ + dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0); + dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1); + dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0); + dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1); + + /* Cast back to original single width type */ + dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0"); + dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1"); + dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2"); + dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3"); +} + + +/** + * Pack first element of aos values, + * pad out to destination size. + * i.e. 
x1 _ _ _ x2 _ _ _ will become x1 x2 _ _ + */ +LLVMValueRef +lp_build_pack_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src) +{ + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef undef = LLVMGetUndef(i32t); + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + unsigned num_src = src_type.length / 4; + unsigned num_dst = dst_type.length; + unsigned i; + + assert(num_src <= num_dst); + + for (i = 0; i < num_src; i++) { + shuffles[i] = LLVMConstInt(i32t, i * 4, 0); + } + for (i = num_src; i < num_dst; i++) { + shuffles[i] = undef; + } + + if (num_dst == 1) { + return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], ""); + } + else { + return LLVMBuildShuffleVector(gallivm->builder, src, src, + LLVMConstVector(shuffles, num_dst), ""); + } +} + + +/** + * Unpack and broadcast packed aos values consisting of only the + * first value, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2 + */ +LLVMValueRef +lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src) +{ + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + unsigned num_dst = dst_type.length; + unsigned num_src = dst_type.length / 4; + unsigned i; + + assert(num_dst / 4 <= src_type.length); + + for (i = 0; i < num_src; i++) { + shuffles[i*4] = LLVMConstInt(i32t, i, 0); + shuffles[i*4+1] = LLVMConstInt(i32t, i, 0); + shuffles[i*4+2] = LLVMConstInt(i32t, i, 0); + shuffles[i*4+3] = LLVMConstInt(i32t, i, 0); + } + + if (num_src == 1) { + return lp_build_extract_broadcast(gallivm, src_type, dst_type, + src, shuffles[0]); + } + else { + return LLVMBuildShuffleVector(gallivm->builder, src, src, + LLVMConstVector(shuffles, num_dst), ""); + } +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h index c366a65103e..0bf4ce988a2 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h @@ -44,6 +44,9 @@ struct lp_type; struct lp_build_context; +#define LP_BLD_SWIZZLE_DONTCARE 0xFF + + LLVMValueRef lp_build_broadcast(struct gallivm_state *gallivm, LLVMTypeRef vec_type, @@ -103,4 +106,25 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld, const unsigned char swizzles[4]); +void +lp_build_transpose_aos(struct gallivm_state *gallivm, + struct lp_type type, + const LLVMValueRef src[4], + LLVMValueRef dst[4]); + + +LLVMValueRef +lp_build_pack_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src); + + +LLVMValueRef +lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef src); + + #endif /* !LP_BLD_SWIZZLE_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index 4423bc5dedd..e292420a61a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -60,6 +60,7 @@ struct tgsi_token; struct tgsi_shader_info; struct lp_build_mask_context; struct gallivm_state; +struct lp_derivatives; enum lp_build_tex_modifier { @@ -174,8 +175,7 @@ struct lp_build_sampler_soa unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const struct lp_derivatives *derivs, LLVMValueRef 
lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *texel); @@ -183,6 +183,7 @@ struct lp_build_sampler_soa void (*emit_size_query)( const struct lp_build_sampler_soa *sampler, struct gallivm_state *gallivm, + struct lp_type type, unsigned unit, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *sizes_out); @@ -197,8 +198,7 @@ struct lp_build_sampler_aos unsigned target, /* TGSI_TEXTURE_* */ unsigned unit, LLVMValueRef coords, - LLVMValueRef ddx, - LLVMValueRef ddy, + const struct lp_derivatives derivs, enum lp_build_tex_modifier modifier); }; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c index 24bc13a9be8..0666bba7fbd 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c @@ -56,6 +56,7 @@ #include "lp_bld_quad.h" #include "lp_bld_tgsi.h" #include "lp_bld_debug.h" +#include "lp_bld_sample.h" /** @@ -363,6 +364,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, LLVMValueRef coords; LLVMValueRef ddx; LLVMValueRef ddy; + struct lp_derivatives derivs; if (!bld->sampler) { _debug_printf("warning: found texture instruction but no sampler generator supplied\n"); @@ -373,7 +375,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, coords = lp_build_emit_fetch( &bld->bld_base, inst, 0 , LP_CHAN_ALL); - if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { + if (0 && modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { ddx = lp_build_emit_fetch( &bld->bld_base, inst, 1 , LP_CHAN_ALL); ddy = lp_build_emit_fetch( &bld->bld_base, inst, 2 , LP_CHAN_ALL); unit = inst->Src[3].Register.Index; @@ -383,8 +385,8 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, ddy = lp_build_ddy( &bld->bld_base.base, coords ); #else /* TODO */ - ddx = bld->bld_base.base.one; - ddy = bld->bld_base.base.one; + derivs.ddx_ddy[0] = bld->bld_base.base.one; + derivs.ddx_ddy[1] = bld->bld_base.base.one; #endif unit = inst->Src[1].Register.Index; } @@ -392,7 +394,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld, return bld->sampler->emit_fetch_texel(bld->sampler, &bld->bld_base.base, target, unit, - coords, ddx, ddy, + coords, derivs, modifier); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index d9faaf20273..85a4401b534 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -62,6 +62,7 @@ #include "lp_bld_limits.h" #include "lp_bld_debug.h" #include "lp_bld_printf.h" +#include "lp_bld_sample.h" static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld) @@ -763,7 +764,7 @@ emit_fetch_temporary( else { LLVMValueRef temp_ptr; if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) { - LLVMTypeRef itype = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0); + LLVMTypeRef itype = LLVMPointerType(bld->bld_base.int_bld.vec_type, 0); LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle); temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, ""); @@ -1068,7 +1069,7 @@ emit_store_chan( switch (dtype) { case TGSI_TYPE_UNSIGNED: case TGSI_TYPE_SIGNED: { - LLVMTypeRef itype = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); + LLVMTypeRef itype = bld_base->int_bld.vec_type; LLVMTypeRef ivtype = LLVMPointerType(itype, 0); LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index); @@ -1141,13 +1142,14 @@ emit_tex( struct 
lp_build_tgsi_soa_context *bld, LLVMValueRef *texel) { LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; + struct gallivm_state *gallivm = bld->bld_base.base.gallivm; unsigned unit; LLVMValueRef lod_bias, explicit_lod; LLVMValueRef oow = NULL; LLVMValueRef coords[3]; - LLVMValueRef ddx[3]; - LLVMValueRef ddy[3]; + struct lp_derivatives derivs; unsigned num_coords; + unsigned dims; unsigned i; if (!bld->sampler) { @@ -1158,26 +1160,42 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, return; } + derivs.ddx_ddy[0] = bld->bld_base.base.undef; + derivs.ddx_ddy[1] = bld->bld_base.base.undef; + switch (inst->Texture.Texture) { case TGSI_TEXTURE_1D: num_coords = 1; + dims = 1; break; case TGSI_TEXTURE_1D_ARRAY: + num_coords = 2; + dims = 1; + break; case TGSI_TEXTURE_2D: case TGSI_TEXTURE_RECT: num_coords = 2; + dims = 2; break; case TGSI_TEXTURE_SHADOW1D: case TGSI_TEXTURE_SHADOW1D_ARRAY: + num_coords = 3; + dims = 1; + break; case TGSI_TEXTURE_SHADOW2D: case TGSI_TEXTURE_SHADOWRECT: case TGSI_TEXTURE_2D_ARRAY: - case TGSI_TEXTURE_3D: case TGSI_TEXTURE_CUBE: num_coords = 3; + dims = 2; + break; + case TGSI_TEXTURE_3D: + num_coords = 3; + dims = 3; break; case TGSI_TEXTURE_SHADOW2D_ARRAY: num_coords = 4; + dims = 2; break; default: assert(0); @@ -1212,31 +1230,66 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, } if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { - LLVMValueRef index0 = lp_build_const_int32(bld->bld_base.base.gallivm, 0); - for (i = 0; i < num_coords; i++) { - LLVMValueRef src1 = lp_build_emit_fetch( &bld->bld_base, inst, 1, i ); - LLVMValueRef src2 = lp_build_emit_fetch( &bld->bld_base, inst, 2, i ); - ddx[i] = LLVMBuildExtractElement(builder, src1, index0, ""); - ddy[i] = LLVMBuildExtractElement(builder, src2, index0, ""); + LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef ddxdyonec[3]; + unsigned length = bld->bld_base.base.type.length; + unsigned num_quads = length / 4; + unsigned dim; + unsigned quad; + + for (dim = 0; dim < dims; ++dim) { + LLVMValueRef srcx = lp_build_emit_fetch( &bld->bld_base, inst, 1, dim ); + LLVMValueRef srcy = lp_build_emit_fetch( &bld->bld_base, inst, 2, dim ); + for (quad = 0; quad < num_quads; ++quad) { + unsigned s1 = 4*quad; + unsigned s2 = 4*quad + length; + shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1); + shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s2); + shuffles[4*quad + 2] = i32undef; + shuffles[4*quad + 3] = i32undef; + } + ddxdyonec[dim] = LLVMBuildShuffleVector(builder, srcx, srcy, + LLVMConstVector(shuffles, length), ""); + } + if (dims == 1) { + derivs.ddx_ddy[0] = ddxdyonec[0]; + } + else if (dims >= 2) { + for (quad = 0; quad < num_quads; ++quad) { + unsigned s1 = 4*quad; + unsigned s2 = 4*quad + length; + shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1); + shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s1 + 1); + shuffles[4*quad + 2] = lp_build_const_int32(gallivm, s2); + shuffles[4*quad + 3] = lp_build_const_int32(gallivm, s2 + 1); + } + derivs.ddx_ddy[0] = LLVMBuildShuffleVector(builder, ddxdyonec[0], ddxdyonec[1], + LLVMConstVector(shuffles, length), ""); + if (dims == 3) { + derivs.ddx_ddy[1] = ddxdyonec[2]; + } } unit = inst->Src[3].Register.Index; } else { - for (i = 0; i < num_coords; i++) { - ddx[i] = lp_build_scalar_ddx( &bld->bld_base.base, coords[i] ); - ddy[i] = lp_build_scalar_ddy( &bld->bld_base.base, coords[i] ); + if (dims == 1) { + derivs.ddx_ddy[0] = 
lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[0]); + } + else if (dims >= 2) { + derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->bld_base.base, + coords[0], coords[1]); + if (dims == 3) { + derivs.ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[2]); + } } unit = inst->Src[1].Register.Index; } - for (i = num_coords; i < 3; i++) { - ddx[i] = LLVMGetUndef(bld->bld_base.base.elem_type); - ddy[i] = LLVMGetUndef(bld->bld_base.base.elem_type); - } bld->sampler->emit_fetch_texel(bld->sampler, bld->bld_base.base.gallivm, bld->bld_base.base.type, unit, num_coords, coords, - ddx, ddy, + &derivs, lod_bias, explicit_lod, texel); } @@ -1310,6 +1363,7 @@ emit_txq( struct lp_build_tgsi_soa_context *bld, bld->sampler->emit_size_query(bld->sampler, bld->bld_base.base.gallivm, + bld->bld_base.int_bld.type, inst->Src[1].Register.Index, explicit_lod, sizes_out); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c index 413e69bedac..6c3aa38bfb1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c @@ -38,6 +38,9 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type) { if (type.floating) { switch(type.width) { + case 16: + return LLVMIntTypeInContext(gallivm->context, 16); + break; case 32: return LLVMFloatTypeInContext(gallivm->context); break; @@ -85,6 +88,10 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type) if (type.floating) { switch(type.width) { + case 16: + if(elem_kind != LLVMIntegerTypeKind) + return FALSE; + break; case 32: if(elem_kind != LLVMFloatTypeKind) return FALSE; @@ -168,27 +175,6 @@ lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type) /** - * Build int32[4] vector type - */ -LLVMTypeRef -lp_build_int32_vec4_type(struct gallivm_state *gallivm) -{ - struct lp_type t; - LLVMTypeRef type; - - memset(&t, 0, sizeof(t)); - t.floating = FALSE; /* floating point values */ - t.sign = TRUE; /* values are signed */ - t.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ - t.width = 32; /* 32-bit int */ - t.length = 4; /* 4 elements per vector */ - - type = lp_build_int_elem_type(gallivm, t); - return LLVMVectorType(type, t.length); -} - - -/** * Create element of vector type */ struct lp_type diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h index f11a190e7cc..75310e05f3e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h @@ -40,21 +40,35 @@ #include "pipe/p_compiler.h" #include "gallivm/lp_bld.h" +/** + * Native SIMD architecture width available at runtime. + * + * Using this width should give the best performance, + * and it determines the necessary alignment of vector variables. + */ +extern unsigned lp_native_vector_width; +/** + * Maximum supported vector width (not necessarily supported at run-time). + * + * Should only be used when lp_native_vector_width isn't available, + * i.e. sizing/alignment of non-malloced variables. + */ +#define LP_MAX_VECTOR_WIDTH 256 /** - * Native SIMD register width. + * Minimum vector alignment for static variable alignment * - * 128 for all architectures we care about. + * It should always be a constant equal to LP_MAX_VECTOR_WIDTH/8. An + * expression is non-portable. */ -#define LP_NATIVE_VECTOR_WIDTH 128 +#define LP_MIN_VECTOR_ALIGN 32 /** * Several functions can only cope with vectors of length up to this value. 
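lp_native_vector_width is the new runtime knob here; one plausible initialization based on the CPU caps already consulted throughout this series (a hedged sketch, the real initialization lives elsewhere in gallivm and may honor debug overrides):

#include <assert.h>
#include "util/u_cpu_detect.h"
#include "lp_bld_type.h"

static void
init_native_vector_width(void)
{
   /* assumption: 256-bit vectors only pay off with AVX */
   lp_native_vector_width = util_cpu_caps.has_avx ? 256 : 128;
   assert(lp_native_vector_width <= LP_MAX_VECTOR_WIDTH);
}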
* You may need to increase that value if you want to represent bigger vectors. */ -#define LP_MAX_VECTOR_LENGTH 16 - +#define LP_MAX_VECTOR_LENGTH (LP_MAX_VECTOR_WIDTH/8) /** * The LLVM type system can't conveniently express all the things we care about @@ -151,6 +165,13 @@ struct lp_build_context }; +static INLINE unsigned +lp_type_width(struct lp_type type) +{ + return type.width * type.length; +} + + /** Create scalar float type */ static INLINE struct lp_type lp_type_float(unsigned width) @@ -169,7 +190,7 @@ lp_type_float(unsigned width) /** Create vector of float type */ static INLINE struct lp_type -lp_type_float_vec(unsigned width) +lp_type_float_vec(unsigned width, unsigned total_width) { struct lp_type res_type; @@ -177,7 +198,7 @@ lp_type_float_vec(unsigned width) res_type.floating = TRUE; res_type.sign = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } @@ -200,14 +221,14 @@ lp_type_int(unsigned width) /** Create vector int type */ static INLINE struct lp_type -lp_type_int_vec(unsigned width) +lp_type_int_vec(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.sign = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } @@ -229,34 +250,34 @@ lp_type_uint(unsigned width) /** Create vector uint type */ static INLINE struct lp_type -lp_type_uint_vec(unsigned width) +lp_type_uint_vec(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } static INLINE struct lp_type -lp_type_unorm(unsigned width) +lp_type_unorm(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.norm = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } static INLINE struct lp_type -lp_type_fixed(unsigned width) +lp_type_fixed(unsigned width, unsigned total_width) { struct lp_type res_type; @@ -264,21 +285,21 @@ lp_type_fixed(unsigned width) res_type.sign = TRUE; res_type.fixed = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } static INLINE struct lp_type -lp_type_ufixed(unsigned width) +lp_type_ufixed(unsigned width, unsigned total_width) { struct lp_type res_type; memset(&res_type, 0, sizeof res_type); res_type.fixed = TRUE; res_type.width = width; - res_type.length = LP_NATIVE_VECTOR_WIDTH / width; + res_type.length = total_width / width; return res_type; } @@ -312,10 +333,6 @@ LLVMTypeRef lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type); -LLVMTypeRef -lp_build_int32_vec4_type(struct gallivm_state *gallivm); - - static INLINE struct lp_type lp_float32_vec4_type(void) {
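Usage sketch for the width-parameterized type constructors above: sizing build contexts to the runtime SIMD width (hypothetical helper, assuming the gallivm headers):

#include "lp_bld_init.h"
#include "lp_bld_type.h"

static void
init_sample_contexts(struct gallivm_state *gallivm,
                     struct lp_build_context *h16_bld,
                     struct lp_build_context *u8n_bld)
{
   unsigned w = lp_native_vector_width;  /* 128 on SSE, 256 with AVX */

   /* 16-bit unsigned fixed point, w/16 lanes */
   lp_build_context_init(h16_bld, gallivm, lp_type_ufixed(16, w));
   /* 8-bit normalized, w/8 lanes */
   lp_build_context_init(u8n_bld, gallivm, lp_type_unorm(8, w));
}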