27 files changed, 1058 insertions, 1019 deletions
diff --git a/src/gallium/drivers/llvmpipe/.gitignore b/src/gallium/drivers/llvmpipe/.gitignore
index f6973b54d2c..21cd3cf2ed2 100644
--- a/src/gallium/drivers/llvmpipe/.gitignore
+++ b/src/gallium/drivers/llvmpipe/.gitignore
@@ -4,4 +4,3 @@ lp_test_blend
 lp_test_conv
 lp_test_format
 lp_test_printf
-lp_test_round
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 26fbde9a169..ef16fc7d882 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -55,8 +55,7 @@ PROGS := lp_test_format	\
 	 lp_test_arit	\
 	 lp_test_blend	\
 	 lp_test_conv	\
-	 lp_test_printf \
-	 lp_test_round
+	 lp_test_printf
 
 # Need this for the lp_test_*.o files
 CLEAN_EXTRA = *.o
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 85560a1c716..cea44a78679 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -94,7 +94,6 @@ if not env['embedded']:
 
     if not env['msvc']:
         tests.append('arit')
-        tests.append('round')
 
     for test in tests:
         testname = 'lp_test_' + test
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 87a6a2751d4..8efa75c01d3 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -59,6 +59,7 @@
 
 #include "pipe/p_state.h"
 #include "util/u_format.h"
+#include "util/u_cpu_detect.h"
 
 #include "gallivm/lp_bld_type.h"
 #include "gallivm/lp_bld_arit.h"
@@ -102,7 +103,16 @@ lp_build_stencil_test_single(struct lp_build_context *bld,
    struct lp_type type = bld->type;
    LLVMValueRef res;
 
-   assert(type.sign);
+   /*
+    * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values
+    * are between 0..255 so ensure we generate the fastest comparisons for
+    * wider elements.
+    */
+   if (type.width <= 8) {
+      assert(!type.sign);
+   } else {
+      assert(type.sign);
+   }
 
    assert(stencil->enabled);
 
@@ -424,29 +434,86 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
    LLVMBuilderRef builder = gallivm->builder;
    LLVMContextRef context = gallivm->context;
    LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);
-   LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
-   LLVMTypeRef i8v16 = LLVMVectorType(LLVMInt8TypeInContext(context), 16);
-   LLVMValueRef counti = LLVMBuildBitCast(builder, countv, i8v16, "counti");
-   LLVMValueRef maskarray[4] = {
-      lp_build_const_int32(gallivm, 0),
-      lp_build_const_int32(gallivm, 4),
-      lp_build_const_int32(gallivm, 8),
-      lp_build_const_int32(gallivm, 12)
-   };
-   LLVMValueRef shufflemask = LLVMConstVector(maskarray, 4);
-   LLVMValueRef shufflev =  LLVMBuildShuffleVector(builder, counti, LLVMGetUndef(i8v16), shufflemask, "shufflev");
-   LLVMValueRef shuffle = LLVMBuildBitCast(builder, shufflev, LLVMInt32TypeInContext(context), "shuffle");
-   LLVMValueRef count = lp_build_intrinsic_unary(builder, "llvm.ctpop.i32", LLVMInt32TypeInContext(context), shuffle);
-   LLVMValueRef orig = LLVMBuildLoad(builder, counter, "orig");
-   LLVMValueRef incr = LLVMBuildAdd(builder, orig, count, "incr");
-   LLVMBuildStore(builder, incr, counter);
+   LLVMValueRef count, newcount;
+
+   assert(type.length <= 16);
+   assert(type.floating);
+
+   if(util_cpu_caps.has_sse && type.length == 4) {
+      const char *movmskintr = "llvm.x86.sse.movmsk.ps";
+      const char *popcntintr = "llvm.ctpop.i32";
+      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+                                           lp_build_vec_type(gallivm, type), "");
+      bits = lp_build_intrinsic_unary(builder, movmskintr,
+                                      LLVMInt32TypeInContext(context), bits);
+      count = lp_build_intrinsic_unary(builder, popcntintr,
+                                       LLVMInt32TypeInContext(context), bits);
+   }
+   else if(util_cpu_caps.has_avx && type.length == 8) {
+      const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
+      const char *popcntintr = "llvm.ctpop.i32";
+      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
+                                           lp_build_vec_type(gallivm, type), "");
+      bits = lp_build_intrinsic_unary(builder, movmskintr,
+                                      LLVMInt32TypeInContext(context), bits);
+      count = lp_build_intrinsic_unary(builder, popcntintr,
+                                       LLVMInt32TypeInContext(context), bits);
+   }
+   else {
+      unsigned i;
+      LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
+      LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
+      LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
+      LLVMValueRef shufflev, countd;
+      LLVMValueRef shuffles[16];
+      const char *popcntintr = NULL;
+
+      countv = LLVMBuildBitCast(builder, countv, i8vntype, "");
+
+       for (i = 0; i < type.length; i++) {
+          shuffles[i] = lp_build_const_int32(gallivm, 4*i);
+       }
+
+       shufflev = LLVMConstVector(shuffles, type.length);
+       countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
+       countd = LLVMBuildBitCast(builder, countd, counttype, "countd");
+
+       /*
+        * XXX FIXME
+        * this is bad on cpus without popcount (on x86 supported by intel
+        * nehalem, amd barcelona, and up - not tied to sse42).
+        * Would be much faster to just sum the 4 elements of the vector with
+        * some horizontal add (shuffle/add/shuffle/add after the initial and).
+        */
+       switch (type.length) {
+       case 4:
+          popcntintr = "llvm.ctpop.i32";
+          break;
+       case 8:
+          popcntintr = "llvm.ctpop.i64";
+          break;
+       case 16:
+          popcntintr = "llvm.ctpop.i128";
+          break;
+       default:
+          assert(0);
+       }
+       count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
+
+       if (type.length > 4) {
+          count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 32), "");
+       }
+   }
+   newcount = LLVMBuildLoad(builder, counter, "origcount");
+   newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
+   LLVMBuildStore(builder, newcount, counter);
 }
 
 
 
 /**
  * Generate code for performing depth and/or stencil tests.
- * We operate on a vector of values (typically a 2x2 quad).
+ * We operate on a vector of values (typically n 2x2 quads).
  *
  * \param depth  the depth test state
  * \param stencil  the front/back stencil state
@@ -454,9 +521,9 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
  * \param format_desc  description of the depth/stencil surface
  * \param mask  the alive/dead pixel mask for the quad (vector)
  * \param stencil_refs  the front/back stencil ref values (scalar)
- * \param z_src  the incoming depth/stencil values (a 2x2 quad, float32)
+ * \param z_src  the incoming depth/stencil values (n 2x2 quad values, float32)
  * \param zs_dst_ptr  pointer to depth/stencil values in framebuffer
- * \param facing  contains boolean value indicating front/back facing polygon
+ * \param face  contains boolean value indicating front/back facing polygon
  */
 void
 lp_build_depth_stencil_test(struct gallivm_state *gallivm,
@@ -507,6 +574,12 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
    assert(z_type.width == z_src_type.width);
    assert(z_type.length == z_src_type.length);
 
+   /* FIXME: for non-float depth/stencil might generate better code
+    * if we'd always split it up to use 128bit operations.
+    * For stencil we'd almost certainly want to pack to 8xi16 values,
+    * for z just run twice.
+    */
+
    /* Sanity checking */
    {
       const unsigned z_swizzle = format_desc->swizzle[0];
@@ -548,7 +621,7 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
    lp_build_context_init(&z_bld, gallivm, z_type);
 
    /* Setup build context for stencil vals */
-   s_type = lp_type_int_vec(z_type.width);
+   s_type = lp_int_type(z_type);
    lp_build_context_init(&s_bld, gallivm, s_type);
 
    /* Load current z/stencil value from z/stencil buffer */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 0d51ccb0349..d108f35f719 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -61,6 +61,9 @@
  * #   |   #   |   #
  * #################
  *
+ * If we iterate over multiple quads at once, quads 01 and 23 are processed
+ * together.
+ *
  * Within each quad, we have four pixels which are represented in SOA
  * order:
  *
@@ -72,6 +75,10 @@
  *
  * So the green channel (for example) of the four pixels is stored in
  * a single vector register: {g0, g1, g2, g3}.
+ * The order stays the same even with multiple quads:
+ * 0 1 4 5
+ * 2 3 6 7
+ * is stored as g0..g7
  */
 
 
@@ -102,8 +109,8 @@
 #define PERSPECTIVE_DIVIDE_PER_QUAD 0
 
 
-static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
-static const unsigned char quad_offset_y[4] = {0, 0, 1, 1};
+static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3};
+static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3};
 
 
 static void
@@ -115,132 +122,353 @@ attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix
       lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
 }
 
-
-/**
- * Initialize the bld->a0, dadx, dady fields.  This involves fetching
- * those values from the arrays which are passed into the JIT function.
+/* Much easier, and significantly fewer instructions in the per-stamp
+ * part (less than half) but overall more instructions so a loss if
+ * most quads are active. Might be a win though with larger vectors.
+ * No ability to do per-quad divide (doable but not implemented)
+ * Could be made to work with passed in pixel offsets (i.e. active quad merging).
  */
 static void
-coeffs_init(struct lp_build_interp_soa_context *bld,
-            LLVMValueRef a0_ptr,
-            LLVMValueRef dadx_ptr,
-            LLVMValueRef dady_ptr)
+coeffs_init_simple(struct lp_build_interp_soa_context *bld,
+                   LLVMValueRef a0_ptr,
+                   LLVMValueRef dadx_ptr,
+                   LLVMValueRef dady_ptr)
 {
    struct lp_build_context *coeff_bld = &bld->coeff_bld;
+   struct lp_build_context *setup_bld = &bld->setup_bld;
    struct gallivm_state *gallivm = coeff_bld->gallivm;
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef zero = LLVMConstNull(coeff_bld->elem_type);
-   LLVMValueRef one = LLVMConstReal(coeff_bld->elem_type, 1.0);
-   LLVMValueRef i0 = lp_build_const_int32(gallivm, 0);
-   LLVMValueRef i1 = lp_build_const_int32(gallivm, 1);
-   LLVMValueRef i2 = lp_build_const_int32(gallivm, 2);
-   LLVMValueRef i3 = lp_build_const_int32(gallivm, 3);
    unsigned attrib;
-   unsigned chan;
-
-   /* TODO: Use more vector operations */
 
    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      /*
+       * Always fetch all 4 values for performance/simplicity.
+       * Note: we do that here because it seems to generate better
+       * code. It generates a lot of moves initially but fewer
+       * moves later. As far as I can tell this looks like an
+       * llvm issue: instead of simply reloading the values from
+       * the passed-in pointers when it runs out of registers,
+       * it spills/reloads them. Maybe some optimization passes
+       * would help.
+       * Might want to investigate this again later.
+       */
+      const unsigned interp = bld->interp[attrib];
+      LLVMValueRef index = lp_build_const_int32(gallivm,
+                                attrib * TGSI_NUM_CHANNELS);
+      LLVMValueRef ptr;
+      LLVMValueRef dadxaos = setup_bld->zero;
+      LLVMValueRef dadyaos = setup_bld->zero;
+      LLVMValueRef a0aos = setup_bld->zero;
+
+      switch (interp) {
+      case LP_INTERP_PERSPECTIVE:
+         /* fall-through */
+
+      case LP_INTERP_LINEAR:
+         ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         dadxaos = LLVMBuildLoad(builder, ptr, "");
+
+         ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         dadyaos = LLVMBuildLoad(builder, ptr, "");
+
+         attrib_name(dadxaos, attrib, 0, ".dadxaos");
+         attrib_name(dadyaos, attrib, 0, ".dadyaos");
+         /* fall-through */
+
+      case LP_INTERP_CONSTANT:
+      case LP_INTERP_FACING:
+         ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         a0aos = LLVMBuildLoad(builder, ptr, "");
+         attrib_name(a0aos, attrib, 0, ".a0aos");
+         break;
+
+      case LP_INTERP_POSITION:
+         /* Nothing to do as the position coeffs are already setup in slot 0 */
+         continue;
+
+      default:
+         assert(0);
+         break;
+      }
+      bld->a0aos[attrib] = a0aos;
+      bld->dadxaos[attrib] = dadxaos;
+      bld->dadyaos[attrib] = dadyaos;
+   }
+}
+
+/**
+ * Interpolate the shader input attribute values.
+ * This is called for each (group of) quad(s).
+ */
+static void
+attribs_update_simple(struct lp_build_interp_soa_context *bld,
+                      struct gallivm_state *gallivm,
+                      int quad_start_index,
+                      int start,
+                      int end)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_build_context *coeff_bld = &bld->coeff_bld;
+   struct lp_build_context *setup_bld = &bld->setup_bld;
+   LLVMValueRef oow = NULL;
+   unsigned attrib, i;
+   LLVMValueRef pixoffx;
+   LLVMValueRef pixoffy;
+   unsigned num_pix = coeff_bld->type.length;
+
+   /* could do this with code-generated passed in pixel offsets */
+   pixoffx = coeff_bld->undef;
+   pixoffy = coeff_bld->undef;
+   for (i = 0; i < coeff_bld->type.length; i++) {
+      LLVMValueRef nr = lp_build_const_int32(gallivm, i);
+      LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
+                                                (quad_start_index & 1) * 2);
+      LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
+                                                (quad_start_index & 2));
+      pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
+      pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
+   }
+
+   pixoffx = LLVMBuildFAdd(builder, pixoffx,
+                           lp_build_broadcast_scalar(coeff_bld, bld->x), "");
+   pixoffy = LLVMBuildFAdd(builder, pixoffy,
+                           lp_build_broadcast_scalar(coeff_bld, bld->y), "");
+
+   for (attrib = start; attrib < end; attrib++) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
-      for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+      unsigned chan;
+
+      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
          if (mask & (1 << chan)) {
-            LLVMValueRef index = lp_build_const_int32(gallivm,
-                                      attrib * TGSI_NUM_CHANNELS + chan);
-            LLVMValueRef a0 = zero;
-            LLVMValueRef dadx = zero;
-            LLVMValueRef dady = zero;
-            LLVMValueRef dadxy = zero;
-            LLVMValueRef dadq;
-            LLVMValueRef dadq2;
-            LLVMValueRef a;
+            LLVMValueRef index;
+            LLVMValueRef dadx = coeff_bld->zero;
+            LLVMValueRef dady = coeff_bld->zero;
+            LLVMValueRef a = coeff_bld->zero;
 
+            index = lp_build_const_int32(gallivm, chan);
             switch (interp) {
             case LP_INTERP_PERSPECTIVE:
                /* fall-through */
 
             case LP_INTERP_LINEAR:
                if (attrib == 0 && chan == 0) {
-                  dadxy = dadx = one;
+                  dadx = coeff_bld->one;
                }
                else if (attrib == 0 && chan == 1) {
-                  dadxy = dady = one;
+                  dady = coeff_bld->one;
                }
                else {
-                  dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
-                  dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
-                  dadxy = LLVMBuildFAdd(builder, dadx, dady, "");
-                  attrib_name(dadx, attrib, chan, ".dadx");
-                  attrib_name(dady, attrib, chan, ".dady");
-                  attrib_name(dadxy, attrib, chan, ".dadxy");
+                  dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                                    coeff_bld->type, bld->dadxaos[attrib],
+                                                    index);
+                  dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                                    coeff_bld->type, bld->dadyaos[attrib],
+                                                    index);
+                  a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                                 coeff_bld->type, bld->a0aos[attrib],
+                                                 index);
                }
-               /* fall-through */
+               /*
+                * a = a0 + (x * dadx + y * dady)
+                */
+               dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
+               dady = LLVMBuildFMul(builder, dady, pixoffy, "");
+               a = LLVMBuildFAdd(builder, a, dadx, "");
+               a = LLVMBuildFAdd(builder, a, dady, "");
+
+               if (interp == LP_INTERP_PERSPECTIVE) {
+                  if (oow == NULL) {
+                     LLVMValueRef w = bld->attribs[0][3];
+                     assert(attrib != 0);
+                     assert(bld->mask[0] & TGSI_WRITEMASK_W);
+                     oow = lp_build_rcp(coeff_bld, w);
+                  }
+                  a = lp_build_mul(coeff_bld, a, oow);
+               }
+               break;
 
             case LP_INTERP_CONSTANT:
             case LP_INTERP_FACING:
-               a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
-               attrib_name(a0, attrib, chan, ".a0");
+               a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                              coeff_bld->type, bld->a0aos[attrib],
+                                              index);
                break;
 
             case LP_INTERP_POSITION:
-               /* Nothing to do as the position coeffs are already setup in slot 0 */
-               continue;
+               assert(attrib > 0);
+               a = bld->attribs[0][chan];
+               break;
 
             default:
                assert(0);
                break;
             }
 
-            /*
-             * dadq = {0, dadx, dady, dadx + dady}
-             */
+            if ((attrib == 0) && (chan == 2)){
+               /* FIXME: Depth values can exceed 1.0, due to the fact that
+                * setup interpolation coefficients refer to (0,0) which causes
+                * precision loss. So we must clamp to 1.0 here to avoid artifacts
+                */
+               a = lp_build_min(coeff_bld, a, coeff_bld->one);
+            }
+            bld->attribs[attrib][chan] = a;
+         }
+      }
+   }
+}
 
-            dadq = coeff_bld->undef;
-            dadq = LLVMBuildInsertElement(builder, dadq, zero,  i0, "");
-            dadq = LLVMBuildInsertElement(builder, dadq, dadx,  i1, "");
-            dadq = LLVMBuildInsertElement(builder, dadq, dady,  i2, "");
-            dadq = LLVMBuildInsertElement(builder, dadq, dadxy, i3, "");
+/**
+ * Initialize the bld->a, dadq fields.  This involves fetching
+ * those values from the arrays which are passed into the JIT function.
+ */
+static void
+coeffs_init(struct lp_build_interp_soa_context *bld,
+            LLVMValueRef a0_ptr,
+            LLVMValueRef dadx_ptr,
+            LLVMValueRef dady_ptr)
+{
+   struct lp_build_context *coeff_bld = &bld->coeff_bld;
+   struct lp_build_context *setup_bld = &bld->setup_bld;
+   struct gallivm_state *gallivm = coeff_bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef pixoffx, pixoffy;
+   unsigned attrib;
+   unsigned chan;
+   unsigned i;
+
+   pixoffx = coeff_bld->undef;
+   pixoffy = coeff_bld->undef;
+   for (i = 0; i < coeff_bld->type.length; i++) {
+      LLVMValueRef nr = lp_build_const_int32(gallivm, i);
+      LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
+      LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
+      pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
+      pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
+   }
 
-            /*
-             * dadq2 = 2 * dq
-             */
 
-            dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
+   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      const unsigned mask = bld->mask[attrib];
+      const unsigned interp = bld->interp[attrib];
+      LLVMValueRef index = lp_build_const_int32(gallivm,
+                                attrib * TGSI_NUM_CHANNELS);
+      LLVMValueRef ptr;
+      LLVMValueRef dadxaos = setup_bld->zero;
+      LLVMValueRef dadyaos = setup_bld->zero;
+      LLVMValueRef a0aos = setup_bld->zero;
+
+      /* always fetch all 4 values for performance/simplicity */
+      switch (interp) {
+      case LP_INTERP_PERSPECTIVE:
+         /* fall-through */
+
+      case LP_INTERP_LINEAR:
+         ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         dadxaos = LLVMBuildLoad(builder, ptr, "");
+
+         ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         dadyaos = LLVMBuildLoad(builder, ptr, "");
+
+         attrib_name(dadxaos, attrib, 0, ".dadxaos");
+         attrib_name(dadyaos, attrib, 0, ".dadyaos");
+         /* fall-through */
+
+      case LP_INTERP_CONSTANT:
+      case LP_INTERP_FACING:
+         ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
+         ptr = LLVMBuildBitCast(builder, ptr,
+               LLVMPointerType(setup_bld->vec_type, 0), "");
+         a0aos = LLVMBuildLoad(builder, ptr, "");
+         attrib_name(a0aos, attrib, 0, ".a0aos");
+         break;
+
+      case LP_INTERP_POSITION:
+         /* Nothing to do as the position coeffs are already setup in slot 0 */
+         continue;
+
+      default:
+         assert(0);
+         break;
+      }
 
-            /*
-             * a = a0 + (x * dadx + y * dady)
-             */
+      /*
+       * a = a0 + (x * dadx + y * dady)
+       * a0aos is the attrib value at top left corner of stamp
+       */
+      if (interp != LP_INTERP_CONSTANT &&
+          interp != LP_INTERP_FACING) {
+         LLVMValueRef axaos, ayaos;
+         axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x),
+                               dadxaos, "");
+         ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y),
+                               dadyaos, "");
+         a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, "");
+         a0aos = LLVMBuildFAdd(builder, a0aos, axaos, "");
+      }
+
+      /*
+       * dadq = {0, dadx, dady, dadx + dady}
+       * for two quads (side by side) this is:
+       * {0, dadx, dady, dadx+dady, 2*dadx, 3*dadx, 2*dadx+dady, 3*dadx+dady}
+       */
+      for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+         /* this generates a CRAPLOAD of shuffles... */
+         if (mask & (1 << chan)) {
+            LLVMValueRef dadx, dady;
+            LLVMValueRef dadq, dadq2;
+            LLVMValueRef a;
+            LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
 
             if (attrib == 0 && chan == 0) {
-               a = bld->x;
+               a = lp_build_broadcast_scalar(coeff_bld, bld->x);
+               dadx = coeff_bld->one;
+               dady = coeff_bld->zero;
             }
             else if (attrib == 0 && chan == 1) {
-               a = bld->y;
+               a = lp_build_broadcast_scalar(coeff_bld, bld->y);
+               dady = coeff_bld->one;
+               dadx = coeff_bld->zero;
             }
             else {
-               a = a0;
-               if (interp != LP_INTERP_CONSTANT &&
-                   interp != LP_INTERP_FACING) {
-                  LLVMValueRef ax, ay, axy;
-                  ax = LLVMBuildFMul(builder, bld->x, dadx, "");
-                  ay = LLVMBuildFMul(builder, bld->y, dady, "");
-                  axy = LLVMBuildFAdd(builder, ax, ay, "");
-                  a = LLVMBuildFAdd(builder, a, axy, "");
-               }
-            }
+               dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                              coeff_bld->type, dadxaos, chan_index);
+               dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                              coeff_bld->type, dadyaos, chan_index);
 
-            /*
-             * a = {a, a, a, a}
-             */
+               /*
+                * a = {a, a, a, a}
+                */
+               a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+                                              coeff_bld->type, a0aos, chan_index);
+            }
 
-            a = lp_build_broadcast(gallivm, coeff_bld->vec_type, a);
+            dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
+            dady = LLVMBuildFMul(builder, dady, pixoffy, "");
+            dadq = LLVMBuildFAdd(builder, dadx, dady, "");
 
             /*
-             * Compute the attrib values on the upper-left corner of each quad.
+             * Compute the attrib values on the upper-left corner of each
+             * group of quads.
+             * Note that if we process 2 quads at once this doesn't
+             * really do exactly what we want.
+             * We need to access elem 0 and 2 respectively later if we process
+             * 2 quads at once.
              */
 
             if (interp != LP_INTERP_CONSTANT &&
                 interp != LP_INTERP_FACING) {
+               dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
                a = LLVMBuildFAdd(builder, a, dadq2, "");
 	    }
 
@@ -249,6 +477,12 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
              * a *= 1 / w
              */
 
+            /*
+             * XXX since we're only going to access elements 0,2 out of 8
+             * if we have 8-wide vectors we should do the division only 4-wide.
+             * a is really 2 elements in a 4-wide vector disguised as 8-wide
+             * in this case.
+             */
             if (interp == LP_INTERP_PERSPECTIVE) {
                LLVMValueRef w = bld->a[0][3];
                assert(attrib != 0);
@@ -279,18 +513,18 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
 static void
 attribs_update(struct lp_build_interp_soa_context *bld,
                struct gallivm_state *gallivm,
-               int quad_index,
+               int quad_start_index,
                int start,
                int end)
 {
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context *coeff_bld = &bld->coeff_bld;
-   LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_index);
+   LLVMValueRef shuffle = lp_build_const_int_vec(gallivm, coeff_bld->type, quad_start_index);
    LLVMValueRef oow = NULL;
    unsigned attrib;
    unsigned chan;
 
-   assert(quad_index < 4);
+   assert(quad_start_index < 4);
 
    for(attrib = start; attrib < end; ++attrib) {
       const unsigned mask = bld->mask[attrib];
@@ -412,6 +646,7 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          LLVMValueRef y0)
 {
    struct lp_type coeff_type;
+   struct lp_type setup_type;
    unsigned attrib;
    unsigned chan;
 
@@ -421,19 +656,26 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
    coeff_type.floating = TRUE;
    coeff_type.sign = TRUE;
    coeff_type.width = 32;
-   coeff_type.length = TGSI_QUAD_SIZE;
+   coeff_type.length = type.length;
+
+   memset(&setup_type, 0, sizeof setup_type);
+   setup_type.floating = TRUE;
+   setup_type.sign = TRUE;
+   setup_type.width = 32;
+   setup_type.length = TGSI_NUM_CHANNELS;
+
 
    /* XXX: we don't support interpolating into any other types */
    assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
 
    lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
+   lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
 
    /* For convenience */
    bld->pos = bld->attribs[0];
    bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
 
    /* Position */
-   bld->num_attribs = 1;
    bld->mask[0] = TGSI_WRITEMASK_XYZW;
    bld->interp[0] = LP_INTERP_LINEAR;
 
@@ -453,7 +695,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
 
    pos_init(bld, x0, y0);
 
-   coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+   if (coeff_type.length > 4) {
+      coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
+   }
+   else {
+      coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+   }
 }
 
 
@@ -463,20 +710,30 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
 void
 lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
                                   struct gallivm_state *gallivm,
-                                  int quad_index)
+                                  int quad_start_index)
 {
-   assert(quad_index < 4);
+   assert(quad_start_index < 4);
 
-   attribs_update(bld, gallivm, quad_index, 1, bld->num_attribs);
+   if (bld->coeff_bld.type.length > 4) {
+      attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs);
+   }
+   else {
+      attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs);
+   }
 }
 
 void
 lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
                                   struct gallivm_state *gallivm,
-                                  int quad_index)
+                                  int quad_start_index)
 {
-   assert(quad_index < 4);
+   assert(quad_start_index < 4);
 
-   attribs_update(bld, gallivm, quad_index, 0, 1);
+   if (bld->coeff_bld.type.length > 4) {
+      attribs_update_simple(bld, gallivm, quad_start_index, 0, 1);
+   }
+   else {
+      attribs_update(bld, gallivm, quad_start_index, 0, 1);
+   }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 6970a9b8c2c..f293b582318 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -79,6 +79,7 @@ struct lp_build_interp_soa_context
 {
    /* TGSI_QUAD_SIZE x float */
    struct lp_build_context coeff_bld;
+   struct lp_build_context setup_bld;
 
    unsigned num_attribs;
    unsigned mask[1 + PIPE_MAX_SHADER_INPUTS]; /**< TGSI_WRITE_MASK_x */
@@ -87,8 +88,11 @@ struct lp_build_interp_soa_context
    LLVMValueRef x;
    LLVMValueRef y;
 
-   LLVMValueRef a   [1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
+   LLVMValueRef a[1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
    LLVMValueRef dadq[1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
+   LLVMValueRef a0aos[1 + PIPE_MAX_SHADER_INPUTS];
+   LLVMValueRef dadxaos[1 + PIPE_MAX_SHADER_INPUTS];
+   LLVMValueRef dadyaos[1 + PIPE_MAX_SHADER_INPUTS];
 
    LLVMValueRef oow;
 
@@ -118,12 +122,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
 void
 lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
                                   struct gallivm_state *gallivm,
-                                  int quad_index);
+                                  int quad_start_index);
 
 void
 lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
                                struct gallivm_state *gallivm,
-                               int quad_index);
+                               int quad_start_index);
 
 
 #endif /* LP_BLD_INTERP_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 9e4c7d6734e..07cea9158c3 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -51,42 +51,6 @@
 unsigned llvmpipe_variant_count;
 
 
-/**
- * This function is called by the gallivm "garbage collector" when
- * the LLVM global data structures are freed.  We must free all LLVM-related
- * data.  Specifically, all JIT'd shader variants.
- */
-static void
-garbage_collect_callback(void *cb_data)
-{
-   struct llvmpipe_context *lp = (struct llvmpipe_context *) cb_data;
-   struct lp_fs_variant_list_item *li;
-
-   /* Free all the context's shader variants */
-   li = first_elem(&lp->fs_variants_list);
-   while (!at_end(&lp->fs_variants_list, li)) {
-      struct lp_fs_variant_list_item *next = next_elem(li);
-      llvmpipe_remove_shader_variant(lp, li->base);
-      li = next;
-   }
-
-   /* Free all the context's primitive setup variants */
-   lp_delete_setup_variants(lp);
-
-   /* release references to setup variants, shaders */
-   lp_setup_set_setup_variant(lp->setup, NULL);
-   lp_setup_set_fs_variant(lp->setup, NULL);
-   lp_setup_reset(lp->setup);
-
-   /* This type will be recreated upon demand */
-   lp->jit_context_ptr_type = NULL;
-
-   /* mark all state as dirty to ensure new shaders are jit'd, etc. */
-   lp->dirty = ~0;
-}
-
-
-
 static void llvmpipe_destroy( struct pipe_context *pipe )
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
@@ -94,9 +58,6 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
 
    lp_print_counters();
 
-   gallivm_remove_garbage_collector_callback(garbage_collect_callback,
-                                             llvmpipe);
-
    /* This will also destroy llvmpipe->setup:
     */
    if (llvmpipe->draw)
@@ -128,8 +89,6 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
 
    lp_delete_setup_variants(llvmpipe);
 
-   gallivm_destroy(llvmpipe->gallivm);
-
    align_free( llvmpipe );
 }
 
@@ -195,12 +154,10 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
    llvmpipe_init_context_resource_funcs( &llvmpipe->pipe );
    llvmpipe_init_surface_functions(llvmpipe);
 
-   llvmpipe->gallivm = gallivm_create();
-
    /*
     * Create drawing context and plug our rendering stage into it.
     */
-   llvmpipe->draw = draw_create_gallivm(&llvmpipe->pipe, llvmpipe->gallivm);
+   llvmpipe->draw = draw_create(&llvmpipe->pipe);
    if (!llvmpipe->draw)
       goto fail;
 
@@ -226,9 +183,6 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
 
    lp_reset_counters();
 
-   gallivm_register_garbage_collector_callback(garbage_collect_callback,
-                                               llvmpipe);
-
    return &llvmpipe->pipe;
 
  fail:
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index d4750705b43..d0220e188cf 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -131,10 +131,6 @@ struct llvmpipe_context {
    unsigned nr_fs_variants;
    unsigned nr_fs_instrs;
 
-   /** JIT code generation */
-   struct gallivm_state *gallivm;
-   LLVMTypeRef jit_context_ptr_type;
-
    struct lp_setup_variant_list_item setup_variants_list;
    unsigned nr_setup_variants;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
index 42430550ea6..964b792b739 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.c
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -54,13 +54,6 @@ llvmpipe_flush( struct pipe_context *pipe,
    /* ask the setup module to flush */
    lp_setup_flush(llvmpipe->setup, fence, reason);
 
-
-   if (llvmpipe_variant_count > 1000) {
-      /* time to do a garbage collection */
-      gallivm_garbage_collect(llvmpipe->gallivm);
-      llvmpipe_variant_count = 0;
-   }
-
    /* Enable to dump BMPs of the color/depth buffers each frame */
    if (0) {
       static unsigned frame_no = 1;
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index eb1db84e4b8..7a85eab41a0 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -41,7 +41,7 @@
 
 
 static void
-lp_jit_create_types(struct llvmpipe_context *lp)
+lp_jit_create_types(struct lp_fragment_shader_variant *lp)
 {
    struct gallivm_state *gallivm = lp->gallivm;
    LLVMContextRef lc = gallivm->context;
@@ -183,11 +183,9 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
 }
 
 
-LLVMTypeRef
-lp_jit_get_context_type(struct llvmpipe_context *lp)
+void
+lp_jit_init_types(struct lp_fragment_shader_variant *lp)
 {
    if (!lp->jit_context_ptr_type)
       lp_jit_create_types(lp);
-
-   return lp->jit_context_ptr_type;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 04e8dd5267b..584d2c8fd81 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -42,6 +42,7 @@
 #include "lp_texture.h"
 
 
+struct lp_fragment_shader_variant;
 struct llvmpipe_screen;
 
 
@@ -164,8 +165,8 @@ void
 lp_jit_screen_init(struct llvmpipe_screen *screen);
 
 
-LLVMTypeRef
-lp_jit_get_context_type(struct llvmpipe_context *lp);
+void
+lp_jit_init_types(struct lp_fragment_shader_variant *lp);
 
 
 #endif /* LP_JIT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_memory.c b/src/gallium/drivers/llvmpipe/lp_memory.c
index 0f55d4a80ae..85f73e54ac4 100644
--- a/src/gallium/drivers/llvmpipe/lp_memory.c
+++ b/src/gallium/drivers/llvmpipe/lp_memory.c
@@ -36,10 +36,12 @@
  * number of threads or using a smaller tilesize when multiple
  * colorbuffers are bound.
  */
-PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4];
+PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN)
+uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4];
 
 
 /* A single dummy tile used in a couple of out-of-memory situations. 
  */
-PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4];
+PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN)
+uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4];
 
diff --git a/src/gallium/drivers/llvmpipe/lp_memory.h b/src/gallium/drivers/llvmpipe/lp_memory.h
index f7418f5e087..5552c2908e1 100644
--- a/src/gallium/drivers/llvmpipe/lp_memory.h
+++ b/src/gallium/drivers/llvmpipe/lp_memory.h
@@ -32,9 +32,12 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 #include "lp_limits.h"
+#include "gallivm/lp_bld_type.h"
 
-extern PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4];
+extern PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN)
+uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4];
 
-extern PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4];
+extern PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN)
+uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4];
 
 #endif /* LP_MEMORY_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index 09af0274d7a..d743d7689ae 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -42,6 +42,7 @@
 #include "lp_tile_soa.h"
 #include "gallivm/lp_bld_debug.h"
 #include "lp_scene.h"
+#include "lp_tex_sample.h"
 
 
 #ifdef DEBUG
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 03d15f6e2b0..54f45357fdc 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -97,56 +97,56 @@
 #include "lp_state_fs.h"
 
 
-#include <llvm-c/Analysis.h>
-#include <llvm-c/BitWriter.h>
-
-
 /** Fragment shader number (for debugging) */
 static unsigned fs_no = 0;
 
 
 /**
- * Expand the relevent bits of mask_input to a 4-dword mask for the 
- * four pixels in a 2x2 quad.  This will set the four elements of the
+ * Expand the relevant bits of mask_input to a n*4-dword mask for the
+ * n*four pixels in n 2x2 quads.  This will set the n*four elements of the
  * quad mask vector to 0 or ~0.
+ * Grouping is 01, 23 for 2 quad mode hence only 0 and 2 are valid
+ * quad arguments with fs length 8.
  *
- * \param quad  which quad of the quad group to test, in [0,3]
+ * \param first_quad  which quad(s) of the quad group to test, in [0,3]
  * \param mask_input  bitwise mask for the whole 4x4 stamp
  */
 static LLVMValueRef
 generate_quad_mask(struct gallivm_state *gallivm,
                    struct lp_type fs_type,
-                   unsigned quad,
+                   unsigned first_quad,
                    LLVMValueRef mask_input) /* int32 */
 {
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_type mask_type;
    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
-   LLVMValueRef bits[4];
+   LLVMValueRef bits[16];
    LLVMValueRef mask;
-   int shift;
+   int shift, i;
 
    /*
     * XXX: We'll need a different path for 16 x u8
     */
    assert(fs_type.width == 32);
-   assert(fs_type.length == 4);
+   assert(fs_type.length <= Elements(bits));
    mask_type = lp_int_type(fs_type);
 
    /*
     * mask_input >>= (quad * 4)
     */
-   switch (quad) {
+   switch (first_quad) {
    case 0:
       shift = 0;
       break;
    case 1:
+      assert(fs_type.length == 4);
       shift = 2;
       break;
    case 2:
       shift = 8;
       break;
    case 3:
+      assert(fs_type.length == 4);
       shift = 10;
       break;
    default:
@@ -166,12 +166,14 @@ generate_quad_mask(struct gallivm_state *gallivm,
                              lp_build_vec_type(gallivm, mask_type),
                              mask_input);
 
-   bits[0] = LLVMConstInt(i32t, 1 << 0, 0);
-   bits[1] = LLVMConstInt(i32t, 1 << 1, 0);
-   bits[2] = LLVMConstInt(i32t, 1 << 4, 0);
-   bits[3] = LLVMConstInt(i32t, 1 << 5, 0);
-   
-   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), "");
+   for (i = 0; i < fs_type.length / 4; i++) {
+      unsigned j = 2 * (i % 2) + (i / 2) * 8;
+      bits[4*i + 0] = LLVMConstInt(i32t, 1 << (j + 0), 0);
+      bits[4*i + 1] = LLVMConstInt(i32t, 1 << (j + 1), 0);
+      bits[4*i + 2] = LLVMConstInt(i32t, 1 << (j + 4), 0);
+      bits[4*i + 3] = LLVMConstInt(i32t, 1 << (j + 5), 0);
+   }
+   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), "");
 
    /*
     * mask = mask != 0 ? ~0 : 0
@@ -300,7 +302,7 @@ generate_fs(struct gallivm_state *gallivm,
    /* do triangle edge testing */
    if (partial_mask) {
       *pmask = generate_quad_mask(gallivm, type,
-                                  i, mask_input);
+                                  i*type.length/4, mask_input);
    }
    else {
       *pmask = lp_build_const_int_vec(gallivm, type, ~0);
@@ -312,7 +314,7 @@ generate_fs(struct gallivm_state *gallivm,
    if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
       lp_build_mask_check(&mask);
 
-   lp_build_interp_soa_update_pos(interp, gallivm, i);
+   lp_build_interp_soa_update_pos(interp, gallivm, i*type.length/4);
    z = interp->pos[2];
 
    if (depth_mode & EARLY_DEPTH_TEST) {
@@ -333,7 +335,7 @@ generate_fs(struct gallivm_state *gallivm,
       }
    }
 
-   lp_build_interp_soa_update_inputs(interp, gallivm, i);
+   lp_build_interp_soa_update_inputs(interp, gallivm, i*type.length/4);
    
    /* Build the actual shader */
    lp_build_tgsi_soa(gallivm, tokens, type, &mask,
@@ -515,7 +517,7 @@ generate_fragment(struct llvmpipe_context *lp,
                   struct lp_fragment_shader_variant *variant,
                   unsigned partial_mask)
 {
-   struct gallivm_state *gallivm = lp->gallivm;
+   struct gallivm_state *gallivm = variant->gallivm;
    const struct lp_fragment_shader_variant_key *key = &variant->key;
    struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
    char func_name[256];
@@ -541,8 +543,8 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMBuilderRef builder;
    struct lp_build_sampler_soa *sampler;
    struct lp_build_interp_soa_context interp;
-   LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];
-   LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
+   LLVMValueRef fs_mask[16 / 4];
+   LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
    LLVMValueRef blend_mask;
    LLVMValueRef function;
    LLVMValueRef facing;
@@ -553,6 +555,8 @@ generate_fragment(struct llvmpipe_context *lp,
    unsigned cbuf;
    boolean cbuf0_write_all;
 
+   assert(lp_native_vector_width / 32 >= 4);
+
    /* Adjust color input interpolation according to flatshade state:
     */
    memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]);
@@ -579,12 +583,12 @@ generate_fragment(struct llvmpipe_context *lp,
     * characteristics. */
 
    memset(&fs_type, 0, sizeof fs_type);
-   fs_type.floating = TRUE; /* floating point values */
-   fs_type.sign = TRUE;     /* values are signed */
-   fs_type.norm = FALSE;    /* values are not limited to [0,1] or [-1,1] */
-   fs_type.width = 32;      /* 32-bit float */
-   fs_type.length = 4;      /* 4 elements per vector */
-   num_fs = 4;              /* number of quads per block */
+   fs_type.floating = TRUE;      /* floating point values */
+   fs_type.sign = TRUE;          /* values are signed */
+   fs_type.norm = FALSE;         /* values are not limited to [0,1] or [-1,1] */
+   fs_type.width = 32;           /* 32-bit float */
+   fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
+   num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
 
    memset(&blend_type, 0, sizeof blend_type);
    blend_type.floating = FALSE; /* values are integers */
@@ -605,7 +609,7 @@ generate_fragment(struct llvmpipe_context *lp,
    util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", 
 		 shader->no, variant->no, partial_mask ? "partial" : "whole");
 
-   arg_types[0] = lp_jit_get_context_type(lp);         /* context */
+   arg_types[0] = variant->jit_context_ptr_type;       /* context */
    arg_types[1] = int32_type;                          /* x */
    arg_types[2] = int32_type;                          /* y */
    arg_types[3] = int32_type;                          /* facing */
@@ -738,20 +742,20 @@ generate_fragment(struct llvmpipe_context *lp,
                LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
          }
 
-	 lp_build_conv(gallivm, fs_type, blend_type,
+         lp_build_conv(gallivm, fs_type, blend_type,
                        fs_color_vals,
                        num_fs,
-		       &blend_in_color[chan], 1);
+                       &blend_in_color[chan], 1);
 
-	 lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
+         lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
       }
 
       if (partial_mask || !variant->opaque) {
-         lp_build_conv_mask(lp->gallivm, fs_type, blend_type,
+         lp_build_conv_mask(variant->gallivm, fs_type, blend_type,
                             fs_mask, num_fs,
                             &blend_mask, 1);
       } else {
-         blend_mask = lp_build_const_int_vec(lp->gallivm, blend_type, ~0);
+         blend_mask = lp_build_const_int_vec(variant->gallivm, blend_type, ~0);
       }
 
       color_ptr = LLVMBuildLoad(builder, 
@@ -772,7 +776,7 @@ generate_fragment(struct llvmpipe_context *lp,
                               !key->alpha.enabled &&
                               !shader->info.base.uses_kill);
 
-         generate_blend(lp->gallivm,
+         generate_blend(variant->gallivm,
                         &key->blend,
                         rt,
                         builder,
@@ -787,43 +791,9 @@ generate_fragment(struct llvmpipe_context *lp,
 
    LLVMBuildRetVoid(builder);
 
-   /* Verify the LLVM IR.  If invalid, dump and abort */
-#ifdef DEBUG
-   if(LLVMVerifyFunction(function, LLVMPrintMessageAction)) {
-      if (1)
-         lp_debug_dump_value(function);
-      abort();
-   }
-#endif
-
-   /* Apply optimizations to LLVM IR */
-   LLVMRunFunctionPassManager(gallivm->passmgr, function);
-
-   if ((gallivm_debug & GALLIVM_DEBUG_IR) || (LP_DEBUG & DEBUG_FS)) {
-      /* Print the LLVM IR to stderr */
-      lp_debug_dump_value(function);
-      debug_printf("\n");
-   }
-
-   /* Dump byte code to a file */
-   if (0) {
-      LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc");
-   }
+   gallivm_verify_function(gallivm, function);
 
    variant->nr_instrs += lp_build_count_instructions(function);
-   /*
-    * Translate the LLVM IR into machine code.
-    */
-   {
-      void *f = LLVMGetPointerToGlobal(gallivm->engine, function);
-
-      variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f);
-
-      if ((gallivm_debug & GALLIVM_DEBUG_ASM) || (LP_DEBUG & DEBUG_FS)) {
-         lp_disassemble(f);
-      }
-      lp_func_delete_body(function);
-   }
 }
 
 
@@ -937,6 +907,12 @@ generate_variant(struct llvmpipe_context *lp,
    if(!variant)
       return NULL;
 
+   variant->gallivm = gallivm_create();
+   if (!variant->gallivm) {
+      FREE(variant);
+      return NULL;
+   }
+
    variant->shader = shader;
    variant->list_item_global.base = variant;
    variant->list_item_local.base = variant;
@@ -968,12 +944,35 @@ generate_variant(struct llvmpipe_context *lp,
       lp_debug_fs_variant(variant);
    }
 
-   generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
+   lp_jit_init_types(variant);
+   
+   if (variant->jit_function[RAST_EDGE_TEST] == NULL)
+      generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
+
+   if (variant->jit_function[RAST_WHOLE] == NULL) {
+      if (variant->opaque) {
+         /* Specialized shader, which doesn't need to read the color buffer. */
+         generate_fragment(lp, shader, variant, RAST_WHOLE);
+      }
+   }
+
+   /*
+    * Compile everything
+    */
+
+   gallivm_compile_module(variant->gallivm);
+
+   if (variant->function[RAST_EDGE_TEST]) {
+      variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
+            gallivm_jit_function(variant->gallivm,
+                                 variant->function[RAST_EDGE_TEST]);
+   }
 
-   if (variant->opaque) {
-      /* Specialized shader, which doesn't need to read the color buffer. */
-      generate_fragment(lp, shader, variant, RAST_WHOLE);
-   } else {
+   if (variant->function[RAST_WHOLE]) {
+      variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
+            gallivm_jit_function(variant->gallivm,
+                                 variant->function[RAST_WHOLE]);
+   } else if (!variant->jit_function[RAST_WHOLE]) {
       variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
    }
 
@@ -1116,13 +1115,14 @@ llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
    /* free all the variant's JIT'd functions */
    for (i = 0; i < Elements(variant->function); i++) {
       if (variant->function[i]) {
-         if (variant->jit_function[i])
-            LLVMFreeMachineCodeForFunction(lp->gallivm->engine,
-                                           variant->function[i]);
-         LLVMDeleteFunction(variant->function[i]);
+         gallivm_free_function(variant->gallivm,
+                               variant->function[i],
+                               variant->jit_function[i]);
       }
    }
 
+   gallivm_destroy(variant->gallivm);
+
    /* remove from shader's list */
    remove_from_list(&variant->list_item_local);
    variant->shader->variants_cached--;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 273d241d8fc..306f5f9669a 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -84,6 +84,12 @@ struct lp_fragment_shader_variant
 
    boolean opaque;
 
+   struct gallivm_state *gallivm;
+
+   LLVMTypeRef jit_context_ptr_type;
+   LLVMTypeRef jit_thread_data_ptr_type;
+   LLVMTypeRef jit_linear_context_ptr_type;
+
    LLVMValueRef function[2];
 
    lp_jit_frag_func jit_function[2];
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index 299c1ef85dc..1d5e50be9b7 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -38,7 +38,6 @@
 #include "gallivm/lp_bld_intr.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_type.h"
-#include <llvm-c/Analysis.h>	/* for LLVMVerifyFunction */
 
 #include "lp_perf.h"
 #include "lp_debug.h"
@@ -77,12 +76,6 @@ struct lp_setup_args
    LLVMValueRef dy01_ooa;
    LLVMValueRef dx20_ooa;
    LLVMValueRef dx01_ooa;
-
-   /* Temporary, per-attribute:
-    */
-   LLVMValueRef v0a;
-   LLVMValueRef v1a;
-   LLVMValueRef v2a;
 };
 
 
@@ -146,7 +139,7 @@ store_coef(struct gallivm_state *gallivm,
 {
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef idx = lp_build_const_int32(gallivm, slot);
-   
+
    LLVMBuildStore(builder,
 		  a0, 
 		  LLVMBuildGEP(builder, args->a0, &idx, 1, ""));
@@ -210,27 +203,13 @@ vert_attrib(struct gallivm_state *gallivm,
    return LLVMBuildLoad(b, LLVMBuildGEP(b, vert, idx, 2, ""), name);
 }
 
-static LLVMValueRef
-vert_clamp(LLVMBuilderRef b,
-           LLVMValueRef x,
-           LLVMValueRef min,
-           LLVMValueRef max)
-{
-   LLVMValueRef min_result = LLVMBuildFCmp(b, LLVMRealUGT, min, x, "");
-   LLVMValueRef max_result = LLVMBuildFCmp(b, LLVMRealUGT, x, max, "");
-   LLVMValueRef clamp_value;
-
-   clamp_value = LLVMBuildSelect(b, min_result, min, x, "");
-   clamp_value = LLVMBuildSelect(b, max_result, max, x, "");
-
-   return clamp_value;
-}
 
 static void
 lp_twoside(struct gallivm_state *gallivm,
            struct lp_setup_args *args,
            const struct lp_setup_variant_key *key,
-           int bcolor_slot)
+           int bcolor_slot,
+           LLVMValueRef attribv[3])
 {
    LLVMBuilderRef b = gallivm->builder;
    LLVMValueRef a0_back, a1_back, a2_back;
@@ -248,67 +227,66 @@ lp_twoside(struct gallivm_state *gallivm,
     * Prefer select to if so we don't have to worry about phis or
     * allocas.
     */
-   args->v0a = LLVMBuildSelect(b, front_facing, a0_back, args->v0a, "");
-   args->v1a = LLVMBuildSelect(b, front_facing, a1_back, args->v1a, "");
-   args->v2a = LLVMBuildSelect(b, front_facing, a2_back, args->v2a, "");
+   attribv[0] = LLVMBuildSelect(b, front_facing, a0_back, attribv[0], "");
+   attribv[1] = LLVMBuildSelect(b, front_facing, a1_back, attribv[1], "");
+   attribv[2] = LLVMBuildSelect(b, front_facing, a2_back, attribv[2], "");
 
 }
 
 static void
 lp_do_offset_tri(struct gallivm_state *gallivm,
                  struct lp_setup_args *args,
-                 const struct lp_setup_variant_key *key)
+                 const struct lp_setup_variant_key *key,
+                 LLVMValueRef inv_det,
+                 LLVMValueRef dxyz01,
+                 LLVMValueRef dxyz20,
+                 LLVMValueRef attribv[3])
 {
    LLVMBuilderRef b = gallivm->builder;
    struct lp_build_context bld;
    LLVMValueRef zoffset, mult;
    LLVMValueRef z0_new, z1_new, z2_new;
-   LLVMValueRef dzdx0, dzdx, dzdy0, dzdy;
-   LLVMValueRef max, max_value;
-   
-   LLVMValueRef one  = lp_build_const_float(gallivm, 1.0);
-   LLVMValueRef zero = lp_build_const_float(gallivm, 0.0);
-   LLVMValueRef two  = lp_build_const_int32(gallivm, 2);
-
-   /* edge vectors: e = v0 - v2, f = v1 - v2 */
-   LLVMValueRef v0_x = vert_attrib(gallivm, args->v0, 0, 0, "v0_x");
-   LLVMValueRef v1_x = vert_attrib(gallivm, args->v1, 0, 0, "v1_x");
-   LLVMValueRef v2_x = vert_attrib(gallivm, args->v2, 0, 0, "v2_x");
-   LLVMValueRef v0_y = vert_attrib(gallivm, args->v0, 0, 1, "v0_y");
-   LLVMValueRef v1_y = vert_attrib(gallivm, args->v1, 0, 1, "v1_y");
-   LLVMValueRef v2_y = vert_attrib(gallivm, args->v2, 0, 1, "v2_y");
-   LLVMValueRef v0_z = vert_attrib(gallivm, args->v0, 0, 2, "v0_z");
-   LLVMValueRef v1_z = vert_attrib(gallivm, args->v1, 0, 2, "v1_z");
-   LLVMValueRef v2_z = vert_attrib(gallivm, args->v2, 0, 2, "v2_z");
- 
-   /* edge vectors: e = v0 - v2, f = v1 - v2 */
-   LLVMValueRef dx02 = LLVMBuildFSub(b, v0_x, v2_x, "dx02");
-   LLVMValueRef dy02 = LLVMBuildFSub(b, v0_y, v2_y, "dy02");
-   LLVMValueRef dz02 = LLVMBuildFSub(b, v0_z, v2_z, "dz02");
-   LLVMValueRef dx12 = LLVMBuildFSub(b, v1_x, v2_x, "dx12"); 
-   LLVMValueRef dy12 = LLVMBuildFSub(b, v1_y, v2_y, "dy12");
-   LLVMValueRef dz12 = LLVMBuildFSub(b, v1_z, v2_z, "dz12");
- 
-   /* det = cross(e,f).z */
-   LLVMValueRef dx02_dy12  = LLVMBuildFMul(b, dx02, dy12, "dx02_dy12");
-   LLVMValueRef dy02_dx12  = LLVMBuildFMul(b, dy02, dx12, "dy02_dx12");
-   LLVMValueRef det  = LLVMBuildFSub(b, dx02_dy12, dy02_dx12, "det");
-   LLVMValueRef inv_det = LLVMBuildFDiv(b, one, det, "inv_det"); 
-   
-   /* (res1,res2) = cross(e,f).xy */
-   LLVMValueRef dy02_dz12    = LLVMBuildFMul(b, dy02, dz12, "dy02_dz12");
-   LLVMValueRef dz02_dy12    = LLVMBuildFMul(b, dz02, dy12, "dz02_dy12");
-   LLVMValueRef dz02_dx12    = LLVMBuildFMul(b, dz02, dx12, "dz02_dx12");
-   LLVMValueRef dx02_dz12    = LLVMBuildFMul(b, dx02, dz12, "dx02_dz12");
-   LLVMValueRef res1  = LLVMBuildFSub(b, dy02_dz12, dz02_dy12, "res1");
-   LLVMValueRef res2  = LLVMBuildFSub(b, dz02_dx12, dx02_dz12, "res2");
+   LLVMValueRef dzdxdzdy, dzdx, dzdy, dzxyz20, dyzzx01, dyzzx01_dzxyz20, dzx01_dyz20;
+   LLVMValueRef z0z1, z0z1z2;
+   LLVMValueRef max, max_value, res12;
+   LLVMValueRef shuffles[4];
+   LLVMTypeRef shuf_type = LLVMInt32TypeInContext(gallivm->context);
+   LLVMValueRef onei = lp_build_const_int32(gallivm, 1);
+   LLVMValueRef zeroi = lp_build_const_int32(gallivm, 0);
+   LLVMValueRef twoi = lp_build_const_int32(gallivm, 2);
+   LLVMValueRef threei  = lp_build_const_int32(gallivm, 3);
+
+   /* (res12) = cross(e,f).xy */
+   shuffles[0] = twoi;
+   shuffles[1] = zeroi;
+   shuffles[2] = onei;
+   shuffles[3] = twoi;
+   dzxyz20 = LLVMBuildShuffleVector(b, dxyz20, dxyz20, LLVMConstVector(shuffles, 4), "");
+
+   shuffles[0] = onei;
+   shuffles[1] = twoi;
+   shuffles[2] = twoi;
+   shuffles[3] = zeroi;
+   dyzzx01 = LLVMBuildShuffleVector(b, dxyz01, dxyz01, LLVMConstVector(shuffles, 4), "");
+
+   dyzzx01_dzxyz20 = LLVMBuildFMul(b, dzxyz20, dyzzx01, "dyzzx01_dzxyz20");
+
+   shuffles[0] = twoi;
+   shuffles[1] = threei;
+   shuffles[2] = LLVMGetUndef(shuf_type);
+   shuffles[3] = LLVMGetUndef(shuf_type);
+   dzx01_dyz20 = LLVMBuildShuffleVector(b, dyzzx01_dzxyz20, dyzzx01_dzxyz20,
+                                        LLVMConstVector(shuffles, 4), "");
+
+   res12 = LLVMBuildFSub(b, dyzzx01_dzxyz20, dzx01_dyz20, "res12");
 
    /* dzdx = fabsf(res1 * inv_det), dydx = fabsf(res2 * inv_det)*/
-   lp_build_context_init(&bld, gallivm, lp_type_float(32));
-   dzdx0 = LLVMBuildFMul(b, res1, inv_det, "dzdx");
-   dzdx  = lp_build_abs(&bld, dzdx0);
-   dzdy0 = LLVMBuildFMul(b, res2, inv_det, "dzdy");
-   dzdy  = lp_build_abs(&bld, dzdy0);
+   lp_build_context_init(&bld, gallivm, lp_type_float_vec(32, 128));
+   dzdxdzdy = LLVMBuildFMul(b, res12, inv_det, "dzdxdzdy");
+   dzdxdzdy = lp_build_abs(&bld, dzdxdzdy);
+
+   dzdx = LLVMBuildExtractElement(b, dzdxdzdy, zeroi, "");
+   dzdy = LLVMBuildExtractElement(b, dzdxdzdy, onei, "");
 
    /* zoffset = offset->units + MAX2(dzdx, dzdy) * offset->scale */
    max = LLVMBuildFCmp(b, LLVMRealUGT, dzdx, dzdy, "");
@@ -317,45 +295,56 @@ lp_do_offset_tri(struct gallivm_state *gallivm,
    mult = LLVMBuildFMul(b, max_value, lp_build_const_float(gallivm, key->scale), "");
    zoffset = LLVMBuildFAdd(b, lp_build_const_float(gallivm, key->units), mult, "zoffset");
 
+   /* pack the z component (element 2) of each vertex into z0z1z2 */
+   shuffles[0] = twoi;
+   shuffles[1] = lp_build_const_int32(gallivm, 6);
+   shuffles[2] = LLVMGetUndef(shuf_type);
+   shuffles[3] = LLVMGetUndef(shuf_type);
+   z0z1 = LLVMBuildShuffleVector(b, attribv[0], attribv[1], LLVMConstVector(shuffles, 4), "");
+   shuffles[0] = zeroi;
+   shuffles[1] = onei;
+   shuffles[2] = lp_build_const_int32(gallivm, 6);
+   shuffles[3] = LLVMGetUndef(shuf_type);
+   z0z1z2 = LLVMBuildShuffleVector(b, z0z1, attribv[2], LLVMConstVector(shuffles, 4), "");
+   zoffset = vec4f_from_scalar(gallivm, zoffset, "");
+
    /* clamp and do offset */
-   z0_new = vert_clamp(b, LLVMBuildFAdd(b, v0_z, zoffset, ""), zero, one);
-   z1_new = vert_clamp(b, LLVMBuildFAdd(b, v1_z, zoffset, ""), zero, one);
-   z2_new = vert_clamp(b, LLVMBuildFAdd(b, v2_z, zoffset, ""), zero, one);
+   z0z1z2 = lp_build_clamp(&bld, LLVMBuildFAdd(b, z0z1z2, zoffset, ""), bld.zero, bld.one);
 
    /* insert into args->a0.z, a1.z, a2.z:
-    */   
-   args->v0a = LLVMBuildInsertElement(b, args->v0a, z0_new, two, "");
-   args->v1a = LLVMBuildInsertElement(b, args->v1a, z1_new, two, "");
-   args->v2a = LLVMBuildInsertElement(b, args->v2a, z2_new, two, "");
+    */
+   z0_new = LLVMBuildExtractElement(b, z0z1z2, zeroi, "");
+   z1_new = LLVMBuildExtractElement(b, z0z1z2, onei, "");
+   z2_new = LLVMBuildExtractElement(b, z0z1z2, twoi, "");
+   attribv[0] = LLVMBuildInsertElement(b, attribv[0], z0_new, twoi, "");
+   attribv[1] = LLVMBuildInsertElement(b, attribv[1], z1_new, twoi, "");
+   attribv[2] = LLVMBuildInsertElement(b, attribv[2], z2_new, twoi, "");
 }
 
 static void
 load_attribute(struct gallivm_state *gallivm,
                struct lp_setup_args *args,
                const struct lp_setup_variant_key *key,
-               unsigned vert_attr)
+               unsigned vert_attr,
+               LLVMValueRef attribv[3])
 {
    LLVMBuilderRef b = gallivm->builder;
    LLVMValueRef idx = lp_build_const_int32(gallivm, vert_attr);
 
    /* Load the vertex data
     */
-   args->v0a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a");
-   args->v1a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a");
-   args->v2a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a");
+   attribv[0] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a");
+   attribv[1] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a");
+   attribv[2] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a");
 
 
-   /* Potentially modify it according to twoside, offset, etc:
+   /* Potentially modify it according to twoside, etc:
     */
-   if (vert_attr == 0 && (key->scale != 0.0f || key->units != 0.0f)) {
-      lp_do_offset_tri(gallivm, args, key);
-   }
-
    if (key->twoside) {
       if (vert_attr == key->color_slot && key->bcolor_slot >= 0)
-         lp_twoside(gallivm, args, key, key->bcolor_slot);
+         lp_twoside(gallivm, args, key, key->bcolor_slot, attribv);
       else if (vert_attr == key->spec_slot && key->bspec_slot >= 0)
-         lp_twoside(gallivm, args, key, key->bspec_slot);
+         lp_twoside(gallivm, args, key, key->bspec_slot, attribv);
    }
 }
 
@@ -375,8 +364,6 @@ emit_coef4( struct gallivm_state *gallivm,
    LLVMValueRef x0_center = args->x0_center;
    LLVMValueRef y0_center = args->y0_center;
 
-   /* XXX: using fsub, fmul on vector types -- does this work??
-    */
    LLVMValueRef da01 = LLVMBuildFSub(b, a0, a1, "da01");
    LLVMValueRef da20 = LLVMBuildFSub(b, a2, a0, "da20");
 
@@ -406,14 +393,15 @@ emit_coef4( struct gallivm_state *gallivm,
 static void 
 emit_linear_coef( struct gallivm_state *gallivm,
 		  struct lp_setup_args *args,
-		  unsigned slot)
+		  unsigned slot,
+		  LLVMValueRef attribv[3])
 {
    /* nothing to do anymore */
    emit_coef4(gallivm,
               args, slot, 
-              args->v0a,
-              args->v1a,
-              args->v2a);
+              attribv[0],
+              attribv[1],
+              attribv[2]);
 }
 
 
@@ -426,9 +414,10 @@ emit_linear_coef( struct gallivm_state *gallivm,
  * divide the interpolated value by the interpolated W at that fragment.
  */
 static void 
-emit_perspective_coef( struct gallivm_state *gallivm,
-		       struct lp_setup_args *args,
-		       unsigned slot)
+apply_perspective_corr( struct gallivm_state *gallivm,
+                        struct lp_setup_args *args,
+                        unsigned slot,
+                        LLVMValueRef attribv[3])
 {
    LLVMBuilderRef b = gallivm->builder;
 
@@ -438,20 +427,19 @@ emit_perspective_coef( struct gallivm_state *gallivm,
    LLVMValueRef v1_oow = vec4f_from_scalar(gallivm, vert_attrib(gallivm, args->v1, 0, 3, ""), "v1_oow");
    LLVMValueRef v2_oow = vec4f_from_scalar(gallivm, vert_attrib(gallivm, args->v2, 0, 3, ""), "v2_oow");
 
-   LLVMValueRef v0_oow_v0a = LLVMBuildFMul(b, args->v0a, v0_oow, "v0_oow_v0a");
-   LLVMValueRef v1_oow_v1a = LLVMBuildFMul(b, args->v1a, v1_oow, "v1_oow_v1a");
-   LLVMValueRef v2_oow_v2a = LLVMBuildFMul(b, args->v2a, v2_oow, "v2_oow_v2a");
-
-   emit_coef4(gallivm, args, slot, v0_oow_v0a, v1_oow_v1a, v2_oow_v2a);
+   attribv[0] = LLVMBuildFMul(b, attribv[0], v0_oow, "v0_oow_v0a");
+   attribv[1] = LLVMBuildFMul(b, attribv[1], v1_oow, "v1_oow_v1a");
+   attribv[2] = LLVMBuildFMul(b, attribv[2], v2_oow, "v2_oow_v2a");
 }
 
 
 static void
 emit_position_coef( struct gallivm_state *gallivm,
 		    struct lp_setup_args *args,
-		    int slot )
+		    int slot,
+		    LLVMValueRef attribv[3])
 {
-   emit_linear_coef(gallivm, args, slot);
+   emit_linear_coef(gallivm, args, slot, attribv);
 }
 
 
@@ -464,7 +452,9 @@ emit_position_coef( struct gallivm_state *gallivm,
 static void
 emit_apply_cyl_wrap(struct gallivm_state *gallivm,
                     struct lp_setup_args *args,
-                    uint cyl_wrap)
+                    uint cyl_wrap,
+		    LLVMValueRef attribv[3])
+
 {
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_type type = lp_float32_vec4_type();
@@ -489,43 +479,43 @@ emit_apply_cyl_wrap(struct gallivm_state *gallivm,
    one = LLVMBuildAnd(builder, one, cyl_mask, "");
 
    /* Edge v0 -> v1 */
-   delta = LLVMBuildFSub(builder, args->v1a, args->v0a, "");
+   delta = LLVMBuildFSub(builder, attribv[1], attribv[0], "");
 
-   offset    = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
-   offset    = LLVMBuildAnd(builder, offset, one, "");
-   offset    = LLVMBuildBitCast(builder, offset, float_vec_type, "");
-   args->v0a = LLVMBuildFAdd(builder, args->v0a, offset, "");
+   offset     = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
+   offset     = LLVMBuildAnd(builder, offset, one, "");
+   offset     = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+   attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, "");
 
-   offset    = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
-   offset    = LLVMBuildAnd(builder, offset, one, "");
-   offset    = LLVMBuildBitCast(builder, offset, float_vec_type, "");
-   args->v1a = LLVMBuildFAdd(builder, args->v1a, offset, "");
+   offset     = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
+   offset     = LLVMBuildAnd(builder, offset, one, "");
+   offset     = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+   attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, "");
 
    /* Edge v1 -> v2 */
-   delta = LLVMBuildFSub(builder, args->v2a, args->v1a, "");
+   delta = LLVMBuildFSub(builder, attribv[2], attribv[1], "");
 
-   offset    = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
-   offset    = LLVMBuildAnd(builder, offset, one, "");
-   offset    = LLVMBuildBitCast(builder, offset, float_vec_type, "");
-   args->v1a = LLVMBuildFAdd(builder, args->v1a, offset, "");
+   offset     = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
+   offset     = LLVMBuildAnd(builder, offset, one, "");
+   offset     = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+   attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, "");
 
-   offset    = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
-   offset    = LLVMBuildAnd(builder, offset, one, "");
-   offset    = LLVMBuildBitCast(builder, offset, float_vec_type, "");
-   args->v2a = LLVMBuildFAdd(builder, args->v2a, offset, "");
+   offset     = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
+   offset     = LLVMBuildAnd(builder, offset, one, "");
+   offset     = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+   attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, "");
 
    /* Edge v2 -> v0 */
-   delta = LLVMBuildFSub(builder, args->v0a, args->v2a, "");
+   delta = LLVMBuildFSub(builder, attribv[0], attribv[2], "");
 
-   offset    = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
-   offset    = LLVMBuildAnd(builder, offset, one, "");
-   offset    = LLVMBuildBitCast(builder, offset, float_vec_type, "");
-   args->v2a = LLVMBuildFAdd(builder, args->v2a, offset, "");
+   offset     = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half);
+   offset     = LLVMBuildAnd(builder, offset, one, "");
+   offset     = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+   attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, "");
 
-   offset    = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
-   offset    = LLVMBuildAnd(builder, offset, one, "");
-   offset    = LLVMBuildBitCast(builder, offset, float_vec_type, "");
-   args->v0a = LLVMBuildFAdd(builder, args->v0a, offset, "");
+   offset     = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half);
+   offset     = LLVMBuildAnd(builder, offset, one, "");
+   offset     = LLVMBuildBitCast(builder, offset, float_vec_type, "");
+   attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, "");
 }
 
 
@@ -534,43 +524,38 @@ emit_apply_cyl_wrap(struct gallivm_state *gallivm,
  */
 static void 
 emit_tri_coef( struct gallivm_state *gallivm,
-	       const struct lp_setup_variant_key *key,
-	       struct lp_setup_args *args )
+               const struct lp_setup_variant_key *key,
+               struct lp_setup_args *args)
 {
    unsigned slot;
 
-   /* The internal position input is in slot zero:
-    */
-   load_attribute(gallivm, args, key, 0);
-   emit_position_coef(gallivm, args, 0);
+   LLVMValueRef attribs[3];
 
-   /* setup interpolation for all the remaining attributes:
+   /* setup interpolation for all the remaining attributes:
     */
    for (slot = 0; slot < key->num_inputs; slot++) {
-
-      if (key->inputs[slot].interp == LP_INTERP_CONSTANT ||
-          key->inputs[slot].interp == LP_INTERP_LINEAR ||
-          key->inputs[slot].interp == LP_INTERP_PERSPECTIVE)
-         load_attribute(gallivm, args, key, key->inputs[slot].src_index);
-
       switch (key->inputs[slot].interp) {
       case LP_INTERP_CONSTANT:
-	 if (key->flatshade_first) {
-	    emit_constant_coef4(gallivm, args, slot+1, args->v0a);
-	 }
-	 else {
-	    emit_constant_coef4(gallivm, args, slot+1, args->v2a);
-	 }
-	 break;
+         load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs);
+         if (key->flatshade_first) {
+            emit_constant_coef4(gallivm, args, slot+1, attribs[0]);
+         }
+         else {
+            emit_constant_coef4(gallivm, args, slot+1, attribs[2]);
+         }
+         break;
 
       case LP_INTERP_LINEAR:
-	 emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap);
-	 emit_linear_coef(gallivm, args, slot+1);
+         load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs);
+	 emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs);
+         emit_linear_coef(gallivm, args, slot+1, attribs);
          break;
 
       case LP_INTERP_PERSPECTIVE:
-	 emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap);
-	 emit_perspective_coef(gallivm, args, slot+1);
+         load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs);
+	 emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs);
+         apply_perspective_corr(gallivm, args, slot+1, attribs);
+         emit_linear_coef(gallivm, args, slot+1, attribs);
          break;
 
       case LP_INTERP_POSITION:
@@ -591,62 +576,6 @@ emit_tri_coef( struct gallivm_state *gallivm,
 }
 
 
-/* XXX: This is generic code, share with fs/vs codegen:
- */
-static lp_jit_setup_triangle
-finalize_function(struct gallivm_state *gallivm,
-		  LLVMBuilderRef builder,
-		  LLVMValueRef function)
-{
-   void *f;
-
-   /* Verify the LLVM IR.  If invalid, dump and abort */
-#ifdef DEBUG
-   if (LLVMVerifyFunction(function, LLVMPrintMessageAction)) {
-      if (1)
-         lp_debug_dump_value(function);
-      abort();
-   }
-#endif
-
-   /* Apply optimizations to LLVM IR */
-   LLVMRunFunctionPassManager(gallivm->passmgr, function);
-
-   if (gallivm_debug & GALLIVM_DEBUG_IR)
-   {
-      /* Print the LLVM IR to stderr */
-      lp_debug_dump_value(function);
-      debug_printf("\n");
-   }
-
-   /*
-    * Translate the LLVM IR into machine code.
-    */
-   f = LLVMGetPointerToGlobal(gallivm->engine, function);
-
-   if (gallivm_debug & GALLIVM_DEBUG_ASM)
-   {
-      lp_disassemble(f);
-   }
-
-   lp_func_delete_body(function);
-
-   return (lp_jit_setup_triangle) pointer_to_func(f);
-}
-
-/* XXX: Generic code:
- */
-static void
-lp_emit_emms(struct gallivm_state *gallivm)
-{
-#ifdef PIPE_ARCH_X86
-   /* Avoid corrupting the FPU stack on 32bit OSes. */
-   lp_build_intrinsic(gallivm->builder, "llvm.x86.mmx.emms",
-         LLVMVoidTypeInContext(gallivm->context), NULL, 0);
-#endif
-}
-
-
 /* XXX: generic code:
  */
 static void
@@ -664,49 +593,70 @@ set_noalias(LLVMBuilderRef builder,
 
 static void
 init_args(struct gallivm_state *gallivm,
-	  struct lp_setup_args *args,
-	  const struct lp_setup_variant *variant)
+          const struct lp_setup_variant_key *key,
+	  struct lp_setup_args *args)
 {
    LLVMBuilderRef b = gallivm->builder;
+   LLVMTypeRef shuf_type = LLVMInt32TypeInContext(gallivm->context);
+   LLVMValueRef onef = lp_build_const_float(gallivm, 1.0);
+   LLVMValueRef onei = lp_build_const_int32(gallivm, 1);
+   LLVMValueRef zeroi = lp_build_const_int32(gallivm, 0);
+   LLVMValueRef pixel_center, xy0_center, dxy01, dxy20, dyx20;
+   LLVMValueRef e, f, ef, ooa;
+   LLVMValueRef shuffles[4];
+   LLVMValueRef attr_pos[3];
+   struct lp_type typef4 = lp_type_float_vec(32, 128);
 
-   LLVMValueRef v0_x = vert_attrib(gallivm, args->v0, 0, 0, "v0_x");
-   LLVMValueRef v0_y = vert_attrib(gallivm, args->v0, 0, 1, "v0_y");
+   /* The internal position input is in slot zero:
+    */
+   load_attribute(gallivm, args, key, 0, attr_pos);
 
-   LLVMValueRef v1_x = vert_attrib(gallivm, args->v1, 0, 0, "v1_x");
-   LLVMValueRef v1_y = vert_attrib(gallivm, args->v1, 0, 1, "v1_y");
+   pixel_center = lp_build_const_vec(gallivm, typef4,
+                                  key->pixel_center_half ? 0.5 : 0.0);
 
-   LLVMValueRef v2_x = vert_attrib(gallivm, args->v2, 0, 0, "v2_x");
-   LLVMValueRef v2_y = vert_attrib(gallivm, args->v2, 0, 1, "v2_y");
+   /*
+    * x/y are the first two elements of the position vectors (attr_pos[0..2]),
+    * but just use vec4 arithmetic; lp_do_offset_tri actually uses x, y and z
+    */
+   xy0_center = LLVMBuildFSub(b, attr_pos[0], pixel_center, "xy0_center" );
 
-   LLVMValueRef pixel_center = lp_build_const_float(gallivm,
-                                   variant->key.pixel_center_half ? 0.5 : 0);
+   dxy01 = LLVMBuildFSub(b, attr_pos[0], attr_pos[1], "dxy01");
+   dxy20 = LLVMBuildFSub(b, attr_pos[2], attr_pos[0], "dxy20");
 
-   LLVMValueRef x0_center = LLVMBuildFSub(b, v0_x, pixel_center, "x0_center" );
-   LLVMValueRef y0_center = LLVMBuildFSub(b, v0_y, pixel_center, "y0_center" );
-   
-   LLVMValueRef dx01 = LLVMBuildFSub(b, v0_x, v1_x, "dx01");
-   LLVMValueRef dy01 = LLVMBuildFSub(b, v0_y, v1_y, "dy01");
-   LLVMValueRef dx20 = LLVMBuildFSub(b, v2_x, v0_x, "dx20");
-   LLVMValueRef dy20 = LLVMBuildFSub(b, v2_y, v0_y, "dy20");
+   shuffles[0] = onei;
+   shuffles[1] = zeroi;
+   shuffles[2] = LLVMGetUndef(shuf_type);
+   shuffles[3] = LLVMGetUndef(shuf_type);
+
+   dyx20 = LLVMBuildShuffleVector(b, dxy20, dxy20, LLVMConstVector(shuffles, 4), "");
+
+   ef = LLVMBuildFMul(b, dxy01, dyx20, "ef");
+   e = LLVMBuildExtractElement(b, ef, zeroi, "");
+   f = LLVMBuildExtractElement(b, ef, onei, "");
 
-   LLVMValueRef one  = lp_build_const_float(gallivm, 1.0);
-   LLVMValueRef e    = LLVMBuildFMul(b, dx01, dy20, "e");
-   LLVMValueRef f    = LLVMBuildFMul(b, dx20, dy01, "f");
-   LLVMValueRef ooa  = LLVMBuildFDiv(b, one, LLVMBuildFSub(b, e, f, ""), "ooa");
+   ooa  = LLVMBuildFDiv(b, onef, LLVMBuildFSub(b, e, f, ""), "ooa");
 
-   LLVMValueRef dy20_ooa = LLVMBuildFMul(b, dy20, ooa, "dy20_ooa");
-   LLVMValueRef dy01_ooa = LLVMBuildFMul(b, dy01, ooa, "dy01_ooa");
-   LLVMValueRef dx20_ooa = LLVMBuildFMul(b, dx20, ooa, "dx20_ooa");
-   LLVMValueRef dx01_ooa = LLVMBuildFMul(b, dx01, ooa, "dx01_ooa");
+   ooa = vec4f_from_scalar(gallivm, ooa, "");
+
+   /* tri offset calc shares a lot of arithmetic, do it here */
+   if (key->scale != 0.0f || key->units != 0.0f) {
+      lp_do_offset_tri(gallivm, args, key, ooa, dxy01, dxy20, attr_pos);
+   }
 
-   args->dy20_ooa  = vec4f_from_scalar(gallivm, dy20_ooa, "dy20_ooa_4f");
-   args->dy01_ooa  = vec4f_from_scalar(gallivm, dy01_ooa, "dy01_ooa_4f");
+   dxy20 = LLVMBuildFMul(b, dxy20, ooa, "");
+   dxy01 = LLVMBuildFMul(b, dxy01, ooa, "");
 
-   args->dx20_ooa  = vec4f_from_scalar(gallivm, dx20_ooa, "dx20_ooa_4f");
-   args->dx01_ooa  = vec4f_from_scalar(gallivm, dx01_ooa, "dx01_ooa_4f");
+   args->dy20_ooa  = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy20, onei);
+   args->dy01_ooa  = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy01, onei);
 
-   args->x0_center = vec4f_from_scalar(gallivm, x0_center, "x0_center_4f");
-   args->y0_center = vec4f_from_scalar(gallivm, y0_center, "y0_center_4f");
+   args->dx20_ooa  = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy20, zeroi);
+   args->dx01_ooa  = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy01, zeroi);
+
+   args->x0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, zeroi);
+   args->y0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, onei);
+
+   /* might want to merge that with other coef emit in the future */
+   emit_position_coef(gallivm, args, 0, attr_pos);
 }
 
 /**
@@ -714,18 +664,18 @@ init_args(struct gallivm_state *gallivm,
  *
  */
 static struct lp_setup_variant *
-generate_setup_variant(struct gallivm_state *gallivm,
-		       struct lp_setup_variant_key *key,
+generate_setup_variant(struct lp_setup_variant_key *key,
                        struct llvmpipe_context *lp)
 {
    struct lp_setup_variant *variant = NULL;
+   struct gallivm_state *gallivm;
    struct lp_setup_args args;
    char func_name[256];
    LLVMTypeRef vec4f_type;
    LLVMTypeRef func_type;
    LLVMTypeRef arg_types[7];
    LLVMBasicBlockRef block;
-   LLVMBuilderRef builder = gallivm->builder;
+   LLVMBuilderRef builder;
    int64_t t0 = 0, t1;
 
    if (0)
@@ -735,6 +685,13 @@ generate_setup_variant(struct gallivm_state *gallivm,
    if (variant == NULL)
       goto fail;
 
+   variant->gallivm = gallivm = gallivm_create();
+   if (!variant->gallivm) {
+      goto fail;
+   }
+
+   builder = gallivm->builder;
+
    if (LP_DEBUG & DEBUG_COUNTERS) {
       t0 = os_time_get();
    }
@@ -793,14 +750,17 @@ generate_setup_variant(struct gallivm_state *gallivm,
    LLVMPositionBuilderAtEnd(builder, block);
 
    set_noalias(builder, variant->function, arg_types, Elements(arg_types));
-   init_args(gallivm, &args, variant);
+   init_args(gallivm, &variant->key, &args);
    emit_tri_coef(gallivm, &variant->key, &args);
 
-   lp_emit_emms(gallivm);
    LLVMBuildRetVoid(builder);
 
-   variant->jit_function = finalize_function(gallivm, builder,
-					     variant->function);
+   gallivm_verify_function(gallivm, variant->function);
+
+   gallivm_compile_module(gallivm);
+
+   variant->jit_function = (lp_jit_setup_triangle)
+      gallivm_jit_function(gallivm, variant->function);
    if (!variant->jit_function)
       goto fail;
 
@@ -818,10 +778,12 @@ generate_setup_variant(struct gallivm_state *gallivm,
 fail:
    if (variant) {
       if (variant->function) {
-	 if (variant->jit_function)
-	    LLVMFreeMachineCodeForFunction(gallivm->engine,
-					   variant->function);
-	 LLVMDeleteFunction(variant->function);
+         gallivm_free_function(gallivm,
+                               variant->function,
+                               variant->jit_function);
+      }
+      if (variant->gallivm) {
+         gallivm_destroy(variant->gallivm);
       }
       FREE(variant);
    }
@@ -882,10 +844,13 @@ remove_setup_variant(struct llvmpipe_context *lp,
    }
 
    if (variant->function) {
-      if (variant->jit_function)
-	 LLVMFreeMachineCodeForFunction(lp->gallivm->engine,
-					variant->function);
-      LLVMDeleteFunction(variant->function);
+      gallivm_free_function(variant->gallivm,
+                            variant->function,
+                            variant->jit_function);
+   }
+
+   if (variant->gallivm) {
+      gallivm_destroy(variant->gallivm);
    }
 
    remove_from_list(&variant->list_item_global);
@@ -954,7 +919,7 @@ llvmpipe_update_setup(struct llvmpipe_context *lp)
 	 cull_setup_variants(lp);
       }
 
-      variant = generate_setup_variant(lp->gallivm, key, lp);
+      variant = generate_setup_variant(key, lp);
       if (variant) {
          insert_at_head(&lp->setup_variants_list, &variant->list_item_global);
          lp->nr_setup_variants++;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h
index 609c4f62511..e0abe467a6d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -55,6 +55,8 @@ struct lp_setup_variant {
    
    struct lp_setup_variant_list_item list_item_global;
 
+   struct gallivm_state *gallivm;
+
    /* XXX: this is a pointer to the LLVM IR.  Once jit_function is
     * generated, we never need to use the IR again - need to find a
     * way to release this data without destroying the generated
@@ -69,15 +71,6 @@ struct lp_setup_variant {
    unsigned no;
 };
 
-void lp_setup_tri_fallback( const float (*v0)[4],
-			    const float (*v1)[4],
-			    const float (*v2)[4],
-			    boolean front_facing,
-			    float (*a0)[4],
-			    float (*dadx)[4],
-			    float (*dady)[4],
-			    const struct lp_setup_variant_key *key );
-
 void lp_delete_setup_variants(struct llvmpipe_context *lp);
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
index c64f3e149fd..4b6c8a7a6a5 100644
--- a/src/gallium/drivers/llvmpipe/lp_test.h
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -42,11 +42,6 @@
 #include <float.h>
 
 #include "gallivm/lp_bld.h"
-#include <llvm-c/Analysis.h>
-#include <llvm-c/ExecutionEngine.h>
-#include <llvm-c/Target.h>
-#include <llvm-c/BitWriter.h>
-#include <llvm-c/Transforms/Scalar.h>
 
 #include "pipe/p_state.h"
 #include "util/u_format.h"
@@ -64,14 +59,14 @@ write_tsv_header(FILE *fp);
 
 
 boolean
-test_some(struct gallivm_state *gallivm,unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
           unsigned long n);
 
 boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp);
+test_single(unsigned verbose, FILE *fp);
 
 boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp);
+test_all(unsigned verbose, FILE *fp);
 
 
 #if defined(PIPE_CC_MSVC)
diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c
index 45ca32f5866..6e09f7e67b0 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c
@@ -53,7 +53,7 @@ write_tsv_header(FILE *fp)
 }
 
 
-typedef float (*unary_func_t)(float);
+typedef void (*unary_func_t)(float *out, const float *in);
 
 
 /**
@@ -180,6 +180,45 @@ const float sincos_values[] = {
     5*M_PI/4,
 };
 
+const float round_values[] = {
+      -10.0, -1, 0.0, 12.0,
+      -1.49, -0.25, 1.25, 2.51,
+      -0.99, -0.01, 0.01, 0.99,
+};
+
+static float fractf(float x)
+{
+   x -= floorf(x);
+   if (x >= 1.0f) {
+      // clamp to the largest number smaller than one
+      x = 1.0f - 0.5f*FLT_EPSILON;
+   }
+   return x;
+}
+
+
+const float fract_values[] = {
+   // http://en.wikipedia.org/wiki/IEEE_754-1985#Examples
+   0.0f,
+   -0.0f,
+   1.0f,
+   -1.0f,
+   0.5f,
+   -0.5f,
+   1.401298464324817e-45f, // smallest denormal
+   -1.401298464324817e-45f,
+   5.88e-39f, // middle denormal
+   1.18e-38f, // ~FLT_MIN: just above the largest denormal (rounds to a tiny normal)
+   -1.18e-38f,
+   -1.62981451e-08f,
+   FLT_EPSILON,
+   -FLT_EPSILON,
+   1.0f - 0.5f*FLT_EPSILON,
+   -1.0f + FLT_EPSILON,
+   FLT_MAX,
+   -FLT_MAX
+};
+
 
 /*
  * Unary test cases.
@@ -196,6 +235,11 @@ unary_tests[] = {
    {"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 },
    {"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 },
    {"sgn", &lp_build_sgn, &sgnf, exp2_values, Elements(exp2_values), 20.0 },
+   {"round", &lp_build_round, &roundf, round_values, Elements(round_values), 24.0 },
+   {"trunc", &lp_build_trunc, &truncf, round_values, Elements(round_values), 24.0 },
+   {"floor", &lp_build_floor, &floorf, round_values, Elements(round_values), 24.0 },
+   {"ceil", &lp_build_ceil, &ceilf, round_values, Elements(round_values), 24.0 },
+   {"fract", &lp_build_fract_safe, &fractf, fract_values, Elements(fract_values), 24.0 },
 };
 
 
@@ -204,39 +248,40 @@ unary_tests[] = {
  */
 static LLVMValueRef
 build_unary_test_func(struct gallivm_state *gallivm,
-                      LLVMModuleRef module,
-                      LLVMContextRef context,
                       const struct unary_test_t *test)
 {
-   struct lp_type type = lp_type_float_vec(32);
-   LLVMTypeRef i32t = LLVMInt32TypeInContext(context);
-   LLVMTypeRef f32t = LLVMFloatTypeInContext(context);
+   struct lp_type type = lp_type_float_vec(32, lp_native_vector_width);
+   LLVMContextRef context = gallivm->context;
+   LLVMModuleRef module = gallivm->module;
    LLVMTypeRef vf32t = lp_build_vec_type(gallivm, type);
-   LLVMTypeRef args[1] = { f32t };
-   LLVMValueRef func = LLVMAddFunction(module, test->name, LLVMFunctionType(f32t, args, Elements(args), 0));
-   LLVMValueRef arg1 = LLVMGetParam(func, 0);
+   LLVMTypeRef args[2] = { LLVMPointerType(vf32t, 0), LLVMPointerType(vf32t, 0) };
+   LLVMValueRef func = LLVMAddFunction(module, test->name,
+                                       LLVMFunctionType(LLVMVoidTypeInContext(context),
+                                                        args, Elements(args), 0));
+   LLVMValueRef arg0 = LLVMGetParam(func, 0);
+   LLVMValueRef arg1 = LLVMGetParam(func, 1);
    LLVMBuilderRef builder = gallivm->builder;
    LLVMBasicBlockRef block = LLVMAppendBasicBlockInContext(context, func, "entry");
-   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
    LLVMValueRef ret;
 
    struct lp_build_context bld;
 
-   lp_build_context_init(&bld, gallivm, lp_type_float_vec(32));
+   lp_build_context_init(&bld, gallivm, type);
 
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
 
    LLVMPositionBuilderAtEnd(builder, block);
    
-   /* scalar to vector */
-   arg1 = LLVMBuildInsertElement(builder, LLVMGetUndef(vf32t), arg1, index0, "");
+   arg1 = LLVMBuildLoad(builder, arg1, "");
 
    ret = test->builder(&bld, arg1);
    
-   /* vector to scalar */
-   ret = LLVMBuildExtractElement(builder, ret, index0, "");
+   LLVMBuildStore(builder, ret, arg0);
+
+   LLVMBuildRetVoid(builder);
+
+   gallivm_verify_function(gallivm, func);
 
-   LLVMBuildRet(builder, ret);
    return func;
 }
 
@@ -245,67 +290,86 @@ build_unary_test_func(struct gallivm_state *gallivm,
  * Test one LLVM unary arithmetic builder function.
  */
 static boolean
-test_unary(struct gallivm_state *gallivm, unsigned verbose, FILE *fp, const struct unary_test_t *test)
+test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test)
 {
-   LLVMModuleRef module = gallivm->module;
+   struct gallivm_state *gallivm;
    LLVMValueRef test_func;
-   LLVMExecutionEngineRef engine = gallivm->engine;
-   LLVMContextRef context = gallivm->context;
-   char *error = NULL;
    unary_func_t test_func_jit;
    boolean success = TRUE;
-   int i;
+   int i, j;
+   int length = lp_native_vector_width / 32;
+   float *in, *out;
 
-   test_func = build_unary_test_func(gallivm, module, context, test);
+   in = align_malloc(length * 4, length * 4);
+   out = align_malloc(length * 4, length * 4);
 
-   if (LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
-      printf("LLVMVerifyModule: %s\n", error);
-      LLVMDumpModule(module);
-      abort();
+   /* random NaNs or 0s could wreak havoc */
+   for (i = 0; i < length; i++) {
+      in[i] = 1.0;
    }
-   LLVMDisposeMessage(error);
 
-   test_func_jit = (unary_func_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_func));
+   gallivm = gallivm_create();
 
-   for (i = 0; i < test->num_values; ++i) {
-      float value = test->values[i];
-      float ref = test->ref(value);
-      float src = test_func_jit(value);
+   test_func = build_unary_test_func(gallivm, test);
 
-      double error = fabs(src - ref);
-      double precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG;
+   gallivm_compile_module(gallivm);
 
-      bool pass = precision >= test->precision;
+   test_func_jit = (unary_func_t) gallivm_jit_function(gallivm, test_func);
 
-      if (isnan(ref)) {
-         continue;
-      }
+   for (j = 0; j < (test->num_values + length - 1) / length; j++) {
+      int num_vals = ((j + 1) * length <= test->num_values) ? length :
+                                                              test->num_values % length;
 
-      if (!pass || verbose) {
-         printf("%s(%.9g): ref = %.9g, src = %.9g, precision = %f bits, %s\n",
-               test->name, value, ref, src, precision,
-               pass ? "PASS" : "FAIL");
+      for (i = 0; i < num_vals; ++i) {
+         in[i] = test->values[i+j*length];
       }
 
-      if (!pass) {
-         success = FALSE;
+      test_func_jit(out, in);
+      for (i = 0; i < num_vals; ++i) {
+         float ref = test->ref(in[i]);
+         double error, precision;
+         bool pass;
+
+         error = fabs(out[i] - ref);
+         precision = error ? -log2(error/fabs(ref)) : FLT_MANT_DIG;
+
+         pass = precision >= test->precision;
+
+         if (isnan(ref)) {
+            continue;
+         }
+
+         if (!pass || verbose) {
+            printf("%s(%.9g): ref = %.9g, out = %.9g, precision = %f bits, %s\n",
+                  test->name, in[i], ref, out[i], precision,
+                  pass ? "PASS" : "FAIL");
+         }
+
+         if (!pass) {
+            success = FALSE;
+         }
       }
    }
 
-   LLVMFreeMachineCodeForFunction(engine, test_func);
+   gallivm_free_function(gallivm, test_func, test_func_jit);
+
+   gallivm_destroy(gallivm);
+
+   align_free(in);
+   align_free(out);
 
    return success;
 }
 
 
 boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
 {
    boolean success = TRUE;
    int i;
 
    for (i = 0; i < Elements(unary_tests); ++i) {
-      if (!test_unary(gallivm, verbose, fp, &unary_tests[i])) {
+      if (!test_unary(verbose, fp, &unary_tests[i])) {
          success = FALSE;
       }
    }
@@ -315,19 +379,19 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
 
 
 boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
           unsigned long n)
 {
    /*
     * Not randomly generated test cases, so test all.
     */
 
-   return test_all(gallivm, verbose, fp);
+   return test_all(verbose, fp);
 }
 
 
 boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
 {
    return TRUE;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 51324cbb6a3..37b37fda40e 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -36,6 +36,7 @@
  * @author Brian Paul <brian@vmware.com>
  */
 
+#include "util/u_memory.h"
 
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_type.h"
@@ -53,19 +54,6 @@ enum vector_mode
 
 typedef void (*blend_test_ptr_t)(const void *src, const void *dst, const void *con, void *res);
 
-/** cast wrapper */
-static blend_test_ptr_t
-voidptr_to_blend_test_ptr_t(void *p)
-{
-   union {
-      void *v;
-      blend_test_ptr_t f;
-   } u;
-   u.v = p;
-   return u.f;
-}
-
-
 
 void
 write_tsv_header(FILE *fp)
@@ -468,50 +456,43 @@ compute_blend_ref(const struct pipe_blend_state *blend,
 
 PIPE_ALIGN_STACK
 static boolean
-test_one(struct gallivm_state *gallivm,
-         unsigned verbose,
+test_one(unsigned verbose,
          FILE *fp,
          const struct pipe_blend_state *blend,
          enum vector_mode mode,
          struct lp_type type)
 {
-   LLVMModuleRef module = gallivm->module;
+   struct gallivm_state *gallivm;
    LLVMValueRef func = NULL;
-   LLVMExecutionEngineRef engine = gallivm->engine;
-   char *error = NULL;
    blend_test_ptr_t blend_test_ptr;
    boolean success;
    const unsigned n = LP_TEST_NUM_SAMPLES;
    int64_t cycles[LP_TEST_NUM_SAMPLES];
    double cycles_avg = 0.0;
    unsigned i, j;
-   void *code;
+   const unsigned stride = lp_type_width(type)/8;
 
    if(verbose >= 1)
       dump_blend_type(stdout, blend, mode, type);
 
-   func = add_blend_test(gallivm, blend, mode, type);
+   gallivm = gallivm_create();
 
-   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
-      LLVMDumpModule(module);
-      abort();
-   }
-   LLVMDisposeMessage(error);
+   func = add_blend_test(gallivm, blend, mode, type);
 
-   code = LLVMGetPointerToGlobal(engine, func);
-   blend_test_ptr = voidptr_to_blend_test_ptr_t(code);
+   gallivm_compile_module(gallivm);
 
-   if(verbose >= 2)
-      lp_disassemble(code);
+   blend_test_ptr = (blend_test_ptr_t)gallivm_jit_function(gallivm, func);
 
    success = TRUE;
-   for(i = 0; i < n && success; ++i) {
-      if(mode == AoS) {
-         PIPE_ALIGN_VAR(16) uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
-         PIPE_ALIGN_VAR(16) uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
-         PIPE_ALIGN_VAR(16) uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
-         PIPE_ALIGN_VAR(16) uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
-         PIPE_ALIGN_VAR(16) uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
+   if(mode == AoS) {
+      uint8_t *src, *dst, *con, *res, *ref;
+      src = align_malloc(stride, stride);
+      dst = align_malloc(stride, stride);
+      con = align_malloc(stride, stride);
+      res = align_malloc(stride, stride);
+      ref = align_malloc(stride, stride);
+
+      for(i = 0; i < n && success; ++i) {
          int64_t start_counter = 0;
          int64_t end_counter = 0;
 
@@ -569,14 +550,21 @@ test_one(struct gallivm_state *gallivm,
             fprintf(stderr, "\n");
          }
       }
-
-      if(mode == SoA) {
-         const unsigned stride = type.length*type.width/8;
-         PIPE_ALIGN_VAR(16) uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
-         PIPE_ALIGN_VAR(16) uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
-         PIPE_ALIGN_VAR(16) uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
-         PIPE_ALIGN_VAR(16) uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
-         PIPE_ALIGN_VAR(16) uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
+      align_free(src);
+      align_free(dst);
+      align_free(con);
+      align_free(res);
+      align_free(ref);
+   }
+   else if(mode == SoA) {
+      uint8_t *src, *dst, *con, *res, *ref;
+      src = align_malloc(4*stride, stride);
+      dst = align_malloc(4*stride, stride);
+      con = align_malloc(4*stride, stride);
+      res = align_malloc(4*stride, stride);
+      ref = align_malloc(4*stride, stride);
+
+      for(i = 0; i < n && success; ++i) {
          int64_t start_counter = 0;
          int64_t end_counter = 0;
          boolean mismatch;
@@ -651,6 +639,11 @@ test_one(struct gallivm_state *gallivm,
             }
          }
       }
+      align_free(src);
+      align_free(dst);
+      align_free(con);
+      align_free(res);
+      align_free(ref);
    }
 
    /*
@@ -687,16 +680,9 @@ test_one(struct gallivm_state *gallivm,
    if(fp)
       write_tsv_row(fp, blend, mode, type, cycles_avg, success);
 
-   if (!success) {
-      if(verbose < 2)
-         LLVMDumpModule(module);
-      LLVMWriteBitcodeToFile(module, "blend.bc");
-      fprintf(stderr, "blend.bc written\n");
-      fprintf(stderr, "Invoke as \"llc -o - blend.bc\"\n");
-      abort();
-   }
+   gallivm_free_function(gallivm, func, blend_test_ptr);
 
-   LLVMFreeMachineCodeForFunction(engine, func);
+   gallivm_destroy(gallivm);
 
    return success;
 }
@@ -753,7 +739,7 @@ const unsigned num_types = sizeof(blend_types)/sizeof(blend_types[0]);
 
 
 boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
 {
    const unsigned *rgb_func;
    const unsigned *rgb_src_factor;
@@ -789,7 +775,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
                            blend.rt[0].alpha_dst_factor  = *alpha_dst_factor;
                            blend.rt[0].colormask         = PIPE_MASK_RGBA;
 
-                           if(!test_one(gallivm, verbose, fp, &blend, mode, *type))
+                           if(!test_one(verbose, fp, &blend, mode, *type))
                              success = FALSE;
 
                         }
@@ -806,7 +792,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
 
 
 boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
           unsigned long n)
 {
    const unsigned *rgb_func;
@@ -849,7 +835,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
       blend.rt[0].alpha_dst_factor  = *alpha_dst_factor;
       blend.rt[0].colormask         = PIPE_MASK_RGBA;
 
-      if(!test_one(gallivm, verbose, fp, &blend, mode, *type))
+      if(!test_one(verbose, fp, &blend, mode, *type))
         success = FALSE;
    }
 
@@ -858,7 +844,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
 
 
 boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
 {
    printf("no test_single()");
    return TRUE;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index 0dcb5422887..71d45bd5ce7 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -142,21 +142,21 @@ add_conv_test(struct gallivm_state *gallivm,
 
    LLVMBuildRetVoid(builder);;
 
+   gallivm_verify_function(gallivm, func);
+
    return func;
 }
 
 
 PIPE_ALIGN_STACK
 static boolean
-test_one(struct gallivm_state *gallivm, unsigned verbose,
+test_one(unsigned verbose,
          FILE *fp,
          struct lp_type src_type,
          struct lp_type dst_type)
 {
-   LLVMModuleRef module = gallivm->module;
-   LLVMExecutionEngineRef engine = gallivm->engine;
+   struct gallivm_state *gallivm;
    LLVMValueRef func = NULL;
-   char *error = NULL;
    conv_test_ptr_t conv_test_ptr;
    boolean success;
    const unsigned n = LP_TEST_NUM_SAMPLES;
@@ -166,10 +166,18 @@ test_one(struct gallivm_state *gallivm, unsigned verbose,
    unsigned num_dsts;
    double eps;
    unsigned i, j;
-   void *code;
 
-   if (src_type.width * src_type.length != dst_type.width * dst_type.length &&
-       src_type.length != dst_type.length) {
+   if ((src_type.width >= dst_type.width && src_type.length > dst_type.length) ||
+       (src_type.width <= dst_type.width && src_type.length < dst_type.length)) {
+      return TRUE;
+   }
+
+   /* Known failures (same element width), skipped and reported as passing:
+    * - fixed point 32 -> float 32
+    * - float 32 -> signed normalised integer 32
+    */
+   if ((src_type.floating && !dst_type.floating && dst_type.sign && dst_type.norm && src_type.width == dst_type.width) ||
+       (!src_type.floating && dst_type.floating && src_type.fixed && src_type.width == dst_type.width)) {
       return TRUE;
    }
 
@@ -183,7 +191,7 @@ test_one(struct gallivm_state *gallivm, unsigned verbose,
    }
 
    if(verbose >= 1)
-      dump_conv_types(stdout, src_type, dst_type);
+      dump_conv_types(stderr, src_type, dst_type);
 
    if (src_type.length > dst_type.length) {
       num_srcs = 1;
@@ -203,29 +211,20 @@ test_one(struct gallivm_state *gallivm, unsigned verbose,
 
    eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type));
 
-   func = add_conv_test(gallivm, src_type, num_srcs, dst_type, num_dsts);
+   gallivm = gallivm_create();
 
-   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
-      LLVMDumpModule(module);
-      abort();
-   }
-   LLVMDisposeMessage(error);
-
-   if(verbose >= 2)
-      LLVMDumpModule(module);
+   func = add_conv_test(gallivm, src_type, num_srcs, dst_type, num_dsts);
 
-   code = LLVMGetPointerToGlobal(engine, func);
-   conv_test_ptr = (conv_test_ptr_t)pointer_to_func(code);
+   gallivm_compile_module(gallivm);
 
-   if(verbose >= 2)
-      lp_disassemble(code);
+   conv_test_ptr = (conv_test_ptr_t)gallivm_jit_function(gallivm, func);
 
    success = TRUE;
    for(i = 0; i < n && success; ++i) {
       unsigned src_stride = src_type.length*src_type.width/8;
       unsigned dst_stride = dst_type.length*dst_type.width/8;
-      PIPE_ALIGN_VAR(16) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
-      PIPE_ALIGN_VAR(16) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       int64_t start_counter = 0;
@@ -320,20 +319,9 @@ test_one(struct gallivm_state *gallivm, unsigned verbose,
    if(fp)
       write_tsv_row(fp, src_type, dst_type, cycles_avg, success);
 
-   if (!success) {
-      static boolean firsttime = TRUE;
-      if(firsttime) {
-         if(verbose < 2)
-            LLVMDumpModule(module);
-         LLVMWriteBitcodeToFile(module, "conv.bc");
-         fprintf(stderr, "conv.bc written\n");
-         fprintf(stderr, "Invoke as \"llc -o - conv.bc\"\n");
-         firsttime = FALSE;
-         /* abort(); */
-      }
-   }
+   gallivm_free_function(gallivm, func, conv_test_ptr);
 
-   LLVMFreeMachineCodeForFunction(engine, func);
+   gallivm_destroy(gallivm);
 
    return success;
 }
@@ -348,18 +336,33 @@ const struct lp_type conv_types[] = {
    {   TRUE, FALSE, FALSE,  TRUE,    32,   4 },
    {   TRUE, FALSE, FALSE, FALSE,    32,   4 },
 
+   {   TRUE, FALSE,  TRUE,  TRUE,    32,   8 },
+   {   TRUE, FALSE,  TRUE, FALSE,    32,   8 },
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   8 },
+   {   TRUE, FALSE, FALSE, FALSE,    32,   8 },
+
    /* Fixed */
    {  FALSE,  TRUE,  TRUE,  TRUE,    32,   4 },
    {  FALSE,  TRUE,  TRUE, FALSE,    32,   4 },
    {  FALSE,  TRUE, FALSE,  TRUE,    32,   4 },
    {  FALSE,  TRUE, FALSE, FALSE,    32,   4 },
 
+   {  FALSE,  TRUE,  TRUE,  TRUE,    32,   8 },
+   {  FALSE,  TRUE,  TRUE, FALSE,    32,   8 },
+   {  FALSE,  TRUE, FALSE,  TRUE,    32,   8 },
+   {  FALSE,  TRUE, FALSE, FALSE,    32,   8 },
+
    /* Integer */
    {  FALSE, FALSE,  TRUE,  TRUE,    32,   4 },
    {  FALSE, FALSE,  TRUE, FALSE,    32,   4 },
    {  FALSE, FALSE, FALSE,  TRUE,    32,   4 },
    {  FALSE, FALSE, FALSE, FALSE,    32,   4 },
 
+   {  FALSE, FALSE,  TRUE,  TRUE,    32,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    32,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    32,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    32,   8 },
+
    {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
    {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
    {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
@@ -381,7 +384,7 @@ const unsigned num_types = sizeof(conv_types)/sizeof(conv_types[0]);
 
 
 boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
 {
    const struct lp_type *src_type;
    const struct lp_type *dst_type;
@@ -394,7 +397,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
          if(src_type == dst_type)
             continue;
 
-         if(!test_one(gallivm, verbose, fp, *src_type, *dst_type)){
+         if(!test_one(verbose, fp, *src_type, *dst_type)){
             success = FALSE;
             ++error_count;
          }
@@ -408,7 +411,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
 
 
 boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
           unsigned long n)
 {
    const struct lp_type *src_type;
@@ -423,7 +426,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
          dst_type = &conv_types[rand() % num_types];
       } while (src_type == dst_type || src_type->norm != dst_type->norm);
 
-      if(!test_one(gallivm, verbose, fp, *src_type, *dst_type))
+      if(!test_one(verbose, fp, *src_type, *dst_type))
         success = FALSE;
    }
 
@@ -432,7 +435,7 @@ test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
 
 
 boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
 {
    /*    float, fixed,  sign,  norm, width, len */
    struct lp_type f32x4_type =
@@ -442,7 +445,7 @@ test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
 
    boolean success;
 
-   success = test_one(gallivm, verbose, fp, f32x4_type, ub8x4_type);
+   success = test_one(verbose, fp, f32x4_type, ub8x4_type);
 
    return success;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index daf6ded29c7..34cbdbdd630 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -83,7 +83,6 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    LLVMContextRef context = gallivm->context;
    LLVMModuleRef module = gallivm->module;
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMPassManagerRef passmgr = gallivm->passmgr;
    LLVMTypeRef args[4];
    LLVMValueRef func;
    LLVMValueRef packed_ptr;
@@ -120,16 +119,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
 
    LLVMBuildRetVoid(builder);
 
-   if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) {
-      LLVMDumpValue(func);
-      abort();
-   }
-
-   LLVMRunFunctionPassManager(passmgr, func);
-
-   if (verbose >= 1) {
-      LLVMDumpValue(func);
-   }
+   gallivm_verify_function(gallivm, func);
 
    return func;
 }
@@ -137,26 +127,24 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
 
 PIPE_ALIGN_STACK
 static boolean
-test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_format_float(unsigned verbose, FILE *fp,
                   const struct util_format_description *desc)
 {
+   struct gallivm_state *gallivm;
    LLVMValueRef fetch = NULL;
-   LLVMExecutionEngineRef engine = gallivm->engine;
    fetch_ptr_t fetch_ptr;
    PIPE_ALIGN_VAR(16) float unpacked[4];
    boolean first = TRUE;
    boolean success = TRUE;
    unsigned i, j, k, l;
-   void *f;
+
+   gallivm = gallivm_create();
 
    fetch = add_fetch_rgba_test(gallivm, verbose, desc, lp_float32_vec4_type());
 
-   f = LLVMGetPointerToGlobal(engine, fetch);
-   fetch_ptr = (fetch_ptr_t) pointer_to_func(f);
+   gallivm_compile_module(gallivm);
 
-   if (verbose >= 2) {
-      lp_disassemble(f);
-   }
+   fetch_ptr = (fetch_ptr_t) gallivm_jit_function(gallivm, fetch);
 
    for (l = 0; l < util_format_nr_test_cases; ++l) {
       const struct util_format_test_case *test = &util_format_test_cases[l];
@@ -171,25 +159,35 @@ test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
 
          for (i = 0; i < desc->block.height; ++i) {
             for (j = 0; j < desc->block.width; ++j) {
-               boolean match;
+               boolean match = TRUE;
 
                memset(unpacked, 0, sizeof unpacked);
 
                fetch_ptr(unpacked, test->packed, j, i);
 
-               match = TRUE;
-               for(k = 0; k < 4; ++k)
-                  if (fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON)
+               for(k = 0; k < 4; ++k) {
+                  if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) {
                      match = FALSE;
+                  }
+
+                  if (util_is_double_nan(test->unpacked[i][j][k]) != util_is_nan(unpacked[k])) {
+                     match = FALSE;
+                  }
+
+                  if (!util_is_double_inf_or_nan(test->unpacked[i][j][k]) &&
+                      fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON) {
+                     match = FALSE;
+                  }
+               }
 
                if (!match) {
                   printf("FAILED\n");
                   printf("  Packed: %02x %02x %02x %02x\n",
                          test->packed[0], test->packed[1], test->packed[2], test->packed[3]);
-                  printf("  Unpacked (%u,%u): %f %f %f %f obtained\n",
+                  printf("  Unpacked (%u,%u): %.9g %.9g %.9g %.9g obtained\n",
                          j, i,
                          unpacked[0], unpacked[1], unpacked[2], unpacked[3]);
-                  printf("                  %f %f %f %f expected\n",
+                  printf("                  %.9g %.9g %.9g %.9g expected\n",
                          test->unpacked[i][j][0],
                          test->unpacked[i][j][1],
                          test->unpacked[i][j][2],
@@ -201,14 +199,9 @@ test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
       }
    }
 
-   if (!success) {
-      if (verbose < 1) {
-         LLVMDumpValue(fetch);
-      }
-   }
+   gallivm_free_function(gallivm, fetch, fetch_ptr);
 
-   LLVMFreeMachineCodeForFunction(engine, fetch);
-   LLVMDeleteFunction(fetch);
+   gallivm_destroy(gallivm);
 
    if(fp)
       write_tsv_row(fp, desc, success);
@@ -219,26 +212,24 @@ test_format_float(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
 
 PIPE_ALIGN_STACK
 static boolean
-test_format_unorm8(struct gallivm_state *gallivm,
-                   unsigned verbose, FILE *fp,
+test_format_unorm8(unsigned verbose, FILE *fp,
                    const struct util_format_description *desc)
 {
+   struct gallivm_state *gallivm;
    LLVMValueRef fetch = NULL;
    fetch_ptr_t fetch_ptr;
    uint8_t unpacked[4];
    boolean first = TRUE;
    boolean success = TRUE;
    unsigned i, j, k, l;
-   void *f;
+
+   gallivm = gallivm_create();
 
    fetch = add_fetch_rgba_test(gallivm, verbose, desc, lp_unorm8_vec4_type());
 
-   f = LLVMGetPointerToGlobal(gallivm->engine, fetch);
-   fetch_ptr = (fetch_ptr_t) pointer_to_func(f);
+   gallivm_compile_module(gallivm);
 
-   if (verbose >= 2) {
-      lp_disassemble(f);
-   }
+   fetch_ptr = (fetch_ptr_t) gallivm_jit_function(gallivm, fetch);
 
    for (l = 0; l < util_format_nr_test_cases; ++l) {
       const struct util_format_test_case *test = &util_format_test_cases[l];
@@ -285,6 +276,7 @@ test_format_unorm8(struct gallivm_state *gallivm,
                          float_to_ubyte(test->unpacked[i][j][1]),
                          float_to_ubyte(test->unpacked[i][j][2]),
                          float_to_ubyte(test->unpacked[i][j][3]));
+
                   success = FALSE;
                }
             }
@@ -292,11 +284,9 @@ test_format_unorm8(struct gallivm_state *gallivm,
       }
    }
 
-   if (!success)
-      LLVMDumpValue(fetch);
+   gallivm_free_function(gallivm, fetch, fetch_ptr);
 
-   LLVMFreeMachineCodeForFunction(gallivm->engine, fetch);
-   LLVMDeleteFunction(fetch);
+   gallivm_destroy(gallivm);
 
    if(fp)
       write_tsv_row(fp, desc, success);
@@ -308,17 +298,16 @@ test_format_unorm8(struct gallivm_state *gallivm,
 
 
 static boolean
-test_one(struct gallivm_state *gallivm,
-         unsigned verbose, FILE *fp,
+test_one(unsigned verbose, FILE *fp,
          const struct util_format_description *format_desc)
 {
    boolean success = TRUE;
 
-   if (!test_format_float(gallivm, verbose, fp, format_desc)) {
+   if (!test_format_float(verbose, fp, format_desc)) {
      success = FALSE;
    }
 
-   if (!test_format_unorm8(gallivm, verbose, fp, format_desc)) {
+   if (!test_format_unorm8(verbose, fp, format_desc)) {
      success = FALSE;
    }
 
@@ -327,7 +316,7 @@ test_one(struct gallivm_state *gallivm,
 
 
 boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
 {
    enum pipe_format format;
    boolean success = TRUE;
@@ -359,7 +348,7 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
          continue;
       }
 
-      if (!test_one(gallivm, verbose, fp, format_desc)) {
+      if (!test_one(verbose, fp, format_desc)) {
            success = FALSE;
       }
    }
@@ -369,15 +358,15 @@ test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
 
 
 boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
           unsigned long n)
 {
-   return test_all(gallivm, verbose, fp);
+   return test_all(verbose, fp);
 }
 
 
 boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
 {
    printf("no test_single()");
    return TRUE;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index d229c620310..4c610923146 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -39,6 +39,7 @@
 
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_debug.h"
 #include "lp_test.h"
 
 
@@ -369,7 +370,6 @@ int main(int argc, char **argv)
    unsigned i;
    boolean success;
    boolean single = FALSE;
-   struct gallivm_state *gallivm;
 
    for(i = 1; i < argc; ++i) {
       if(strcmp(argv[i], "-v") == 0)
@@ -384,23 +384,28 @@ int main(int argc, char **argv)
 
    lp_build_init();
 
-   gallivm = gallivm_create();
+#ifdef DEBUG
+   if (verbose >= 2) {
+      gallivm_debug |= GALLIVM_DEBUG_IR;
+      gallivm_debug |= GALLIVM_DEBUG_ASM;
+   }
+#endif
 
    util_cpu_detect();
 
    if(fp) {
       /* Warm up the caches */
-      test_some(gallivm, 0, NULL, 100);
+      test_some(0, NULL, 100);
 
       write_tsv_header(fp);
    }
       
    if (single)
-      success = test_single(gallivm, verbose, fp);
+      success = test_single(verbose, fp);
    else if (n)
-      success = test_some(gallivm, verbose, fp, n);
+      success = test_some(verbose, fp, n);
    else
-      success = test_all(gallivm, verbose, fp);
+      success = test_all(verbose, fp);
 
    if(fp)
       fclose(fp);
diff --git a/src/gallium/drivers/llvmpipe/lp_test_printf.c b/src/gallium/drivers/llvmpipe/lp_test_printf.c
index 620cdb57c13..c483de94d40 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_printf.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_printf.c
@@ -78,66 +78,61 @@ add_printf_test(struct gallivm_state *gallivm)
 
    LLVMBuildRetVoid(builder);
 
+   gallivm_verify_function(gallivm, func);
+
    return func;
 }
 
 
 PIPE_ALIGN_STACK
 static boolean
-test_printf(struct gallivm_state *gallivm,
-            unsigned verbose, FILE *fp,
+test_printf(unsigned verbose, FILE *fp,
             const struct printf_test_case *testcase)
 {
-   LLVMExecutionEngineRef engine = gallivm->engine;
-   LLVMModuleRef module = gallivm->module;
+   struct gallivm_state *gallivm;
    LLVMValueRef test;
-   char *error = NULL;
    test_printf_t test_printf_func;
    boolean success = TRUE;
-   void *code;
 
-   test = add_printf_test(gallivm);
+   gallivm = gallivm_create();
 
-   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
-      LLVMDumpModule(module);
-      abort();
-   }
-   LLVMDisposeMessage(error);
+   test = add_printf_test(gallivm);
 
-   code = LLVMGetPointerToGlobal(engine, test);
-   test_printf_func = (test_printf_t) pointer_to_func(code);
+   gallivm_compile_module(gallivm);
 
-   // LLVMDumpModule(module);
+   test_printf_func = (test_printf_t) gallivm_jit_function(gallivm, test);
 
    test_printf_func(0);
 
-   LLVMFreeMachineCodeForFunction(engine, test);
+   gallivm_free_function(gallivm, test, test_printf_func);
+
+   gallivm_destroy(gallivm);
 
    return success;
 }
 
 
 boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_all(unsigned verbose, FILE *fp)
 {
    boolean success = TRUE;
 
-   test_printf(gallivm, verbose, fp, NULL);
+   test_printf(verbose, fp, NULL);
 
    return success;
 }
 
 
 boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
+test_some(unsigned verbose, FILE *fp,
           unsigned long n)
 {
-   return test_all(gallivm, verbose, fp);
+   return test_all(verbose, fp);
 }
 
 
 boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
+test_single(unsigned verbose, FILE *fp)
 {
    printf("no test_single()");
    return TRUE;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c
deleted file mode 100644
index fc3edf372d5..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_test_round.c
+++ /dev/null
@@ -1,242 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "util/u_pointer.h"
-#include "gallivm/lp_bld.h"
-#include "gallivm/lp_bld_init.h"
-#include "gallivm/lp_bld_arit.h"
-
-#include "lp_test.h"
-
-
-void
-write_tsv_header(FILE *fp)
-{
-   fprintf(fp,
-           "result\t"
-           "format\n");
-
-   fflush(fp);
-}
-
-
-#ifdef PIPE_ARCH_SSE
-
-# include <emmintrin.h>
-
-typedef __m128 (*test_round_t)(__m128);
-
-typedef LLVMValueRef (*lp_func_t)(struct lp_build_context *, LLVMValueRef);
-
-
-static LLVMValueRef
-add_test(struct gallivm_state *gallivm, const char *name, lp_func_t lp_func)
-{
-   LLVMModuleRef module = gallivm->module;
-   LLVMContextRef context = gallivm->context;
-   LLVMBuilderRef builder = gallivm->builder;
-
-   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatTypeInContext(context), 4);
-   LLVMTypeRef args[1] = { v4sf };
-   LLVMValueRef func = LLVMAddFunction(module, name, LLVMFunctionType(v4sf, args, 1, 0));
-   LLVMValueRef arg1 = LLVMGetParam(func, 0);
-   LLVMBasicBlockRef block = LLVMAppendBasicBlockInContext(context, func, "entry");
-   LLVMValueRef ret;
-   struct lp_build_context bld;
-
-   lp_build_context_init(&bld, gallivm, lp_float32_vec4_type());
-
-   LLVMSetFunctionCallConv(func, LLVMCCallConv);
-
-   LLVMPositionBuilderAtEnd(builder, block);
-
-   ret = lp_func(&bld, arg1);
-
-   LLVMBuildRet(builder, ret);
-
-   return func;
-}
-
-static void
-printv(char* string, __m128 value)
-{
-   __m128 v = value;
-   float *f = (float *)&v;
-   printf("%s: %10f %10f %10f %10f\n", string,
-           f[0], f[1], f[2], f[3]);
-}
-
-static boolean
-compare(__m128 x, __m128 y)
-{
-   boolean success = TRUE;
-   float *xp = (float *) &x;
-   float *yp = (float *) &y;
-   if (xp[0] != yp[0] ||
-       xp[1] != yp[1] ||
-       xp[2] != yp[2] ||
-       xp[3] != yp[3]) {
-      printf(" Incorrect result! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n");
-      success = FALSE;
-   }
-   return success;
-}
-
-
-
-PIPE_ALIGN_STACK
-static boolean
-test_round(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
-{
-   LLVMModuleRef module = gallivm->module;
-   LLVMValueRef test_round = NULL, test_trunc, test_floor, test_ceil;
-   LLVMExecutionEngineRef engine = gallivm->engine;
-   char *error = NULL;
-   test_round_t round_func, trunc_func, floor_func, ceil_func;
-   float unpacked[4];
-   boolean success = TRUE;
-   int i;
-
-   test_round = add_test(gallivm, "round", lp_build_round);
-   test_trunc = add_test(gallivm, "trunc", lp_build_trunc);
-   test_floor = add_test(gallivm, "floor", lp_build_floor);
-   test_ceil = add_test(gallivm, "ceil", lp_build_ceil);
-
-   if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
-      printf("LLVMVerifyModule: %s\n", error);
-      LLVMDumpModule(module);
-      abort();
-   }
-   LLVMDisposeMessage(error);
-
-   round_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_round));
-   trunc_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_trunc));
-   floor_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_floor));
-   ceil_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_ceil));
-
-   memset(unpacked, 0, sizeof unpacked);
-
-   if (0)
-      LLVMDumpModule(module);
-
-   for (i = 0; i < 3; i++) {
-      /* NOTE: There are several acceptable rules for x.5 rounding: ceiling,
-       * nearest even, etc. So we avoid testing such corner cases here.
-       */
-      __m128 xvals[3] = {
-         {-10.0, -1, 0, 12.0},
-         {-1.49, -0.25, 1.25, 2.51},
-         {-0.99, -0.01, 0.01, 0.99}
-      };
-      __m128 x = xvals[i];
-      __m128 y, ref;
-      float *xp = (float *) &x;
-      float *refp = (float *) &ref;
-
-      printf("\n");
-      printv("x            ", x);
-
-      refp[0] = round(xp[0]);
-      refp[1] = round(xp[1]);
-      refp[2] = round(xp[2]);
-      refp[3] = round(xp[3]);
-      y = round_func(x);
-      printv("C round(x)   ", ref);
-      printv("LLVM round(x)", y);
-      success = success && compare(ref, y);
-
-      refp[0] = trunc(xp[0]);
-      refp[1] = trunc(xp[1]);
-      refp[2] = trunc(xp[2]);
-      refp[3] = trunc(xp[3]);
-      y = trunc_func(x);
-      printv("C trunc(x)   ", ref);
-      printv("LLVM trunc(x)", y);
-      success = success && compare(ref, y);
-
-      refp[0] = floor(xp[0]);
-      refp[1] = floor(xp[1]);
-      refp[2] = floor(xp[2]);
-      refp[3] = floor(xp[3]);
-      y = floor_func(x);
-      printv("C floor(x)   ", ref);
-      printv("LLVM floor(x)", y);
-      success = success && compare(ref, y);
-
-      refp[0] = ceil(xp[0]);
-      refp[1] = ceil(xp[1]);
-      refp[2] = ceil(xp[2]);
-      refp[3] = ceil(xp[3]);
-      y = ceil_func(x);
-      printv("C ceil(x)    ", ref);
-      printv("LLVM ceil(x) ", y);
-      success = success && compare(ref, y);
-   }
-
-   LLVMFreeMachineCodeForFunction(engine, test_round);
-   LLVMFreeMachineCodeForFunction(engine, test_trunc);
-   LLVMFreeMachineCodeForFunction(engine, test_floor);
-   LLVMFreeMachineCodeForFunction(engine, test_ceil);
-
-   return success;
-}
-
-#else /* !PIPE_ARCH_SSE */
-
-static boolean
-test_round(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
-{
-   return TRUE;
-}
-
-#endif /* !PIPE_ARCH_SSE */
-
-
-boolean
-test_all(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
-{
-   return test_round(gallivm, verbose, fp);
-}
-
-
-boolean
-test_some(struct gallivm_state *gallivm, unsigned verbose, FILE *fp,
-          unsigned long n)
-{
-   return test_all(gallivm, verbose, fp);
-}
-
-boolean
-test_single(struct gallivm_state *gallivm, unsigned verbose, FILE *fp)
-{
-   printf("no test_single()");
-   return TRUE;
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index daa96f20c7e..9151e427ba7 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -178,8 +178,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                                      unsigned unit,
                                      unsigned num_coords,
                                      const LLVMValueRef *coords,
-                                     const LLVMValueRef *ddx,
-                                     const LLVMValueRef *ddy,
+                                     const struct lp_derivatives *derivs,
                                      LLVMValueRef lod_bias, /* optional */
                                      LLVMValueRef explicit_lod, /* optional */
                                      LLVMValueRef *texel)
@@ -189,7 +188,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
    assert(unit < PIPE_MAX_SAMPLERS);
    
    if (LP_PERF & PERF_NO_TEX) {
-      lp_build_sample_nop(gallivm, type, texel);
+      lp_build_sample_nop(gallivm, type, num_coords, coords, texel);
       return;
    }
 
@@ -199,7 +198,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                        type,
                        unit,
                        num_coords, coords,
-                       ddx, ddy,
+                       derivs,
                        lod_bias, explicit_lod,
                        texel);
 }
@@ -210,6 +209,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
 static void
 lp_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
                                     struct gallivm_state *gallivm,
+                                    struct lp_type type,
                                     unsigned unit,
                                     LLVMValueRef explicit_lod, /* optional */
                                     LLVMValueRef *sizes_out)
@@ -221,6 +221,7 @@ lp_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
    lp_build_size_query_soa(gallivm,
 			   &sampler->dynamic_state.static_state[unit],
 			   &sampler->dynamic_state.base,
+                           type,
 			   unit,
 			   explicit_lod,
 			   sizes_out);