summaryrefslogtreecommitdiff
path: root/src/gallium/auxiliary
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/auxiliary')
-rw-r--r--src/gallium/auxiliary/Makefile.sources2
-rw-r--r--src/gallium/auxiliary/draw/draw_context.c31
-rw-r--r--src/gallium/auxiliary/draw/draw_context.h4
-rw-r--r--src/gallium/auxiliary/draw/draw_llvm.c759
-rw-r--r--src/gallium/auxiliary/draw/draw_llvm.h28
-rw-r--r--src/gallium/auxiliary/draw/draw_llvm_sample.c7
-rw-r--r--src/gallium/auxiliary/draw/draw_llvm_translate.c506
-rw-r--r--src/gallium/auxiliary/draw/draw_private.h3
-rw-r--r--src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c4
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_arit.c545
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_arit.h19
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_const.c39
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_conv.c149
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_conv.h4
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_debug.cpp22
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_flow.c9
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format.h7
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format_aos.c6
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c102
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format_soa.c3
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c4
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_init.c488
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_init.h33
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_intr.c91
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_intr.h9
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_logic.c60
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_logic.h5
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_misc.cpp111
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_misc.h70
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_pack.c339
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_pack.h23
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_quad.c87
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_quad.h14
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample.c527
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample.h51
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c1344
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h8
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c493
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_swizzle.c164
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_swizzle.h24
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_tgsi.h8
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c10
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c92
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_type.c28
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_type.h59
-rw-r--r--src/gallium/auxiliary/util/u_cpu_detect.h12
46 files changed, 4156 insertions, 2247 deletions
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 28a176d68fa..2807c780d2d 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -165,6 +165,7 @@ GALLIVM_SOURCES := \
gallivm/lp_bld_conv.c \
gallivm/lp_bld_flow.c \
gallivm/lp_bld_format_aos.c \
+ gallivm/lp_bld_format_aos_array.c \
gallivm/lp_bld_format_soa.c \
gallivm/lp_bld_format_yuv.c \
gallivm/lp_bld_gather.c \
@@ -187,7 +188,6 @@ GALLIVM_SOURCES := \
gallivm/lp_bld_type.c \
draw/draw_llvm.c \
draw/draw_llvm_sample.c \
- draw/draw_llvm_translate.c \
draw/draw_vs_llvm.c \
draw/draw_pt_fetch_shade_pipeline_llvm.c
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 20260c1abbf..be30b7db245 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -70,8 +70,7 @@ draw_get_option_use_llvm(void)
* Create new draw module context with gallivm state for LLVM JIT.
*/
static struct draw_context *
-draw_create_context(struct pipe_context *pipe, boolean try_llvm,
- struct gallivm_state *gallivm)
+draw_create_context(struct pipe_context *pipe, boolean try_llvm)
{
struct draw_context *draw = CALLOC_STRUCT( draw_context );
if (draw == NULL)
@@ -79,16 +78,7 @@ draw_create_context(struct pipe_context *pipe, boolean try_llvm,
#if HAVE_LLVM
if (try_llvm && draw_get_option_use_llvm()) {
- if (!gallivm) {
- gallivm = gallivm_create();
- draw->own_gallivm = gallivm;
- }
-
- if (!gallivm)
- goto err_destroy;
-
- draw->llvm = draw_llvm_create(draw, gallivm);
-
+ draw->llvm = draw_llvm_create(draw);
if (!draw->llvm)
goto err_destroy;
}
@@ -114,7 +104,7 @@ err_out:
struct draw_context *
draw_create(struct pipe_context *pipe)
{
- return draw_create_context(pipe, TRUE, NULL);
+ return draw_create_context(pipe, TRUE);
}
@@ -124,17 +114,7 @@ draw_create(struct pipe_context *pipe)
struct draw_context *
draw_create_no_llvm(struct pipe_context *pipe)
{
- return draw_create_context(pipe, FALSE, NULL);
-}
-
-
-/**
- * Create new draw module context with gallivm state for LLVM JIT.
- */
-struct draw_context *
-draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm)
-{
- return draw_create_context(pipe, TRUE, gallivm);
+ return draw_create_context(pipe, FALSE);
}
@@ -213,9 +193,6 @@ void draw_destroy( struct draw_context *draw )
#ifdef HAVE_LLVM
if (draw->llvm)
draw_llvm_destroy( draw->llvm );
-
- if (draw->own_gallivm)
- gallivm_destroy(draw->own_gallivm);
#endif
FREE( draw );
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 852cbc3da13..cc95600c530 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -48,7 +48,6 @@ struct draw_vertex_shader;
struct draw_geometry_shader;
struct draw_fragment_shader;
struct tgsi_sampler;
-struct gallivm_state;
/*
* structure to contain driver internal information
@@ -67,9 +66,6 @@ struct draw_context *draw_create( struct pipe_context *pipe );
struct draw_context *draw_create_no_llvm(struct pipe_context *pipe);
-struct draw_context *
-draw_create_gallivm(struct pipe_context *pipe, struct gallivm_state *gallivm);
-
void draw_destroy( struct draw_context *draw );
void draw_flush(struct draw_context *draw);
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index e08221eb392..8d9b5309aff 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -43,6 +43,8 @@
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_pack.h"
+#include "gallivm/lp_bld_format.h"
#include "tgsi/tgsi_exec.h"
#include "tgsi/tgsi_dump.h"
@@ -56,40 +58,6 @@
#define DEBUG_STORE 0
-/**
- * This function is called by the gallivm "garbage collector" when
- * the LLVM global data structures are freed. We must free all LLVM-related
- * data. Specifically, all JIT'd shader variants.
- */
-static void
-draw_llvm_garbage_collect_callback(void *cb_data)
-{
- struct draw_llvm *llvm = (struct draw_llvm *) cb_data;
- struct draw_context *draw = llvm->draw;
- struct draw_llvm_variant_list_item *li;
-
- /* Ensure prepare will be run and shaders recompiled */
- assert(!draw->suspend_flushing);
- draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
-
- /* free all shader variants */
- li = first_elem(&llvm->vs_variants_list);
- while (!at_end(&llvm->vs_variants_list, li)) {
- struct draw_llvm_variant_list_item *next = next_elem(li);
- draw_llvm_destroy_variant(li->base);
- li = next;
- }
-
- /* Null-out these pointers so they get remade next time they're needed.
- * See the accessor functions below.
- */
- llvm->context_ptr_type = NULL;
- llvm->buffer_ptr_type = NULL;
- llvm->vb_ptr_type = NULL;
- llvm->vertex_header_ptr_type = NULL;
-}
-
-
static void
draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var,
boolean elts);
@@ -316,56 +284,56 @@ create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems)
* Create LLVM types for various structures.
*/
static void
-create_jit_types(struct draw_llvm *llvm)
+create_jit_types(struct draw_llvm_variant *variant)
{
- struct gallivm_state *gallivm = llvm->gallivm;
+ struct gallivm_state *gallivm = variant->gallivm;
LLVMTypeRef texture_type, context_type, buffer_type, vb_type;
texture_type = create_jit_texture_type(gallivm, "texture");
context_type = create_jit_context_type(gallivm, texture_type, "draw_jit_context");
- llvm->context_ptr_type = LLVMPointerType(context_type, 0);
+ variant->context_ptr_type = LLVMPointerType(context_type, 0);
buffer_type = LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 8), 0);
- llvm->buffer_ptr_type = LLVMPointerType(buffer_type, 0);
+ variant->buffer_ptr_type = LLVMPointerType(buffer_type, 0);
vb_type = create_jit_vertex_buffer_type(gallivm, "pipe_vertex_buffer");
- llvm->vb_ptr_type = LLVMPointerType(vb_type, 0);
+ variant->vb_ptr_type = LLVMPointerType(vb_type, 0);
}
static LLVMTypeRef
-get_context_ptr_type(struct draw_llvm *llvm)
+get_context_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->context_ptr_type)
- create_jit_types(llvm);
- return llvm->context_ptr_type;
+ if (!variant->context_ptr_type)
+ create_jit_types(variant);
+ return variant->context_ptr_type;
}
static LLVMTypeRef
-get_buffer_ptr_type(struct draw_llvm *llvm)
+get_buffer_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->buffer_ptr_type)
- create_jit_types(llvm);
- return llvm->buffer_ptr_type;
+ if (!variant->buffer_ptr_type)
+ create_jit_types(variant);
+ return variant->buffer_ptr_type;
}
static LLVMTypeRef
-get_vb_ptr_type(struct draw_llvm *llvm)
+get_vb_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->vb_ptr_type)
- create_jit_types(llvm);
- return llvm->vb_ptr_type;
+ if (!variant->vb_ptr_type)
+ create_jit_types(variant);
+ return variant->vb_ptr_type;
}
static LLVMTypeRef
-get_vertex_header_ptr_type(struct draw_llvm *llvm)
+get_vertex_header_ptr_type(struct draw_llvm_variant *variant)
{
- if (!llvm->vertex_header_ptr_type)
- create_jit_types(llvm);
- return llvm->vertex_header_ptr_type;
+ if (!variant->vertex_header_ptr_type)
+ create_jit_types(variant);
+ return variant->vertex_header_ptr_type;
}
@@ -373,7 +341,7 @@ get_vertex_header_ptr_type(struct draw_llvm *llvm)
* Create per-context LLVM info.
*/
struct draw_llvm *
-draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm)
+draw_llvm_create(struct draw_context *draw)
{
struct draw_llvm *llvm;
@@ -384,18 +352,10 @@ draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm)
lp_build_init();
llvm->draw = draw;
- llvm->gallivm = gallivm;
-
- if (gallivm_debug & GALLIVM_DEBUG_IR) {
- LLVMDumpModule(llvm->gallivm->module);
- }
llvm->nr_variants = 0;
make_empty_list(&llvm->vs_variants_list);
- gallivm_register_garbage_collector_callback(
- draw_llvm_garbage_collect_callback, llvm);
-
return llvm;
}
@@ -406,9 +366,6 @@ draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm)
void
draw_llvm_destroy(struct draw_llvm *llvm)
{
- gallivm_remove_garbage_collector_callback(
- draw_llvm_garbage_collect_callback, llvm);
-
/* XXX free other draw_llvm data? */
FREE(llvm);
}
@@ -435,15 +392,27 @@ draw_llvm_create_variant(struct draw_llvm *llvm,
variant->llvm = llvm;
+ variant->gallivm = gallivm_create();
+
+ create_jit_types(variant);
+
memcpy(&variant->key, key, shader->variant_key_size);
- vertex_header = create_jit_vertex_header(llvm->gallivm, num_inputs);
+ vertex_header = create_jit_vertex_header(variant->gallivm, num_inputs);
- llvm->vertex_header_ptr_type = LLVMPointerType(vertex_header, 0);
+ variant->vertex_header_ptr_type = LLVMPointerType(vertex_header, 0);
draw_llvm_generate(llvm, variant, FALSE); /* linear */
draw_llvm_generate(llvm, variant, TRUE); /* elts */
+ gallivm_compile_module(variant->gallivm);
+
+ variant->jit_func = (draw_jit_vert_func)
+ gallivm_jit_function(variant->gallivm, variant->function);
+
+ variant->jit_func_elts = (draw_jit_vert_func_elts)
+ gallivm_jit_function(variant->gallivm, variant->function_elts);
+
variant->shader = shader;
variant->list_item_global.base = variant;
variant->list_item_local.base = variant;
@@ -455,8 +424,9 @@ draw_llvm_create_variant(struct draw_llvm *llvm,
static void
-generate_vs(struct draw_llvm *llvm,
+generate_vs(struct draw_llvm_variant *variant,
LLVMBuilderRef builder,
+ struct lp_type vs_type,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
const struct lp_bld_tgsi_system_values *system_values,
@@ -464,21 +434,11 @@ generate_vs(struct draw_llvm *llvm,
struct lp_build_sampler_soa *draw_sampler,
boolean clamp_vertex_color)
{
+ struct draw_llvm *llvm = variant->llvm;
const struct tgsi_token *tokens = llvm->draw->vs.vertex_shader->state.tokens;
- struct lp_type vs_type;
- LLVMValueRef consts_ptr = draw_jit_context_vs_constants(llvm->gallivm, context_ptr);
+ LLVMValueRef consts_ptr = draw_jit_context_vs_constants(variant->gallivm, context_ptr);
struct lp_build_sampler_soa *sampler = 0;
- memset(&vs_type, 0, sizeof vs_type);
- vs_type.floating = TRUE; /* floating point values */
- vs_type.sign = TRUE; /* values are signed */
- vs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
- vs_type.width = 32; /* 32-bit float */
- vs_type.length = 4; /* 4 elements per vector */
-#if 0
- num_vs = 4; /* number of vertices per block */
-#endif
-
if (gallivm_debug & GALLIVM_DEBUG_IR) {
tgsi_dump(tokens, 0);
}
@@ -486,7 +446,7 @@ generate_vs(struct draw_llvm *llvm,
if (llvm->draw->num_sampler_views && llvm->draw->num_samplers)
sampler = draw_sampler;
- lp_build_tgsi_soa(llvm->gallivm,
+ lp_build_tgsi_soa(variant->gallivm,
tokens,
vs_type,
NULL /*struct lp_build_mask_context *mask*/,
@@ -503,7 +463,7 @@ generate_vs(struct draw_llvm *llvm,
unsigned chan, attrib;
struct lp_build_context bld;
struct tgsi_shader_info* info = &llvm->draw->vs.vertex_shader->info;
- lp_build_context_init(&bld, llvm->gallivm, vs_type);
+ lp_build_context_init(&bld, variant->gallivm, vs_type);
for (attrib = 0; attrib < info->num_outputs; ++attrib) {
for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
@@ -531,25 +491,6 @@ generate_vs(struct draw_llvm *llvm,
}
-#if DEBUG_STORE
-static void print_vectorf(LLVMBuilderRef builder,
- LLVMValueRef vec)
-{
- LLVMValueRef val[4];
- val[0] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 0), "");
- val[1] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 1), "");
- val[2] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 2), "");
- val[3] = LLVMBuildExtractElement(builder, vec,
- lp_build_const_int32(gallivm, 3), "");
- lp_build_printf(builder, "vector = [%f, %f, %f, %f]\n",
- val[0], val[1], val[2], val[3]);
-}
-#endif
-
-
static void
generate_fetch(struct gallivm_state *gallivm,
LLVMValueRef vbuffers_ptr,
@@ -559,6 +500,8 @@ generate_fetch(struct gallivm_state *gallivm,
LLVMValueRef index,
LLVMValueRef instance_id)
{
+ const struct util_format_description *format_desc = util_format_description(velem->src_format);
+ LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef indices =
LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
@@ -587,118 +530,47 @@ generate_fetch(struct gallivm_state *gallivm,
lp_build_const_int32(gallivm, velem->src_offset),
"");
- /*lp_build_printf(builder, "vbuf index = %d, stride is %d\n", indices, stride);*/
+/* lp_build_printf(gallivm, "vbuf index = %d, stride is %d\n", indices, stride);*/
vbuffer_ptr = LLVMBuildGEP(builder, vbuffer_ptr, &stride, 1, "");
- *res = draw_llvm_translate_from(gallivm, vbuffer_ptr, velem->src_format);
-}
-
-
-static LLVMValueRef
-aos_to_soa(struct gallivm_state *gallivm,
- LLVMValueRef val0,
- LLVMValueRef val1,
- LLVMValueRef val2,
- LLVMValueRef val3,
- LLVMValueRef channel)
-{
- LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef ex, res;
-
- ex = LLVMBuildExtractElement(builder, val0,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- LLVMConstNull(LLVMTypeOf(val0)),
- ex,
- lp_build_const_int32(gallivm, 0),
- "");
-
- ex = LLVMBuildExtractElement(builder, val1,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- res, ex,
- lp_build_const_int32(gallivm, 1),
- "");
-
- ex = LLVMBuildExtractElement(builder, val2,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- res, ex,
- lp_build_const_int32(gallivm, 2),
- "");
-
- ex = LLVMBuildExtractElement(builder, val3,
- channel, "");
- res = LLVMBuildInsertElement(builder,
- res, ex,
- lp_build_const_int32(gallivm, 3),
- "");
-
- return res;
+ *res = lp_build_fetch_rgba_aos(gallivm,
+ format_desc,
+ lp_float32_vec4_type(),
+ vbuffer_ptr,
+ zero, zero, zero);
}
-
static void
-soa_to_aos(struct gallivm_state *gallivm,
- LLVMValueRef soa[TGSI_NUM_CHANNELS],
- LLVMValueRef aos[TGSI_NUM_CHANNELS])
+convert_to_soa(struct gallivm_state *gallivm,
+ LLVMValueRef (*src_aos)[LP_MAX_VECTOR_WIDTH / 32],
+ LLVMValueRef (*dst_soa)[TGSI_NUM_CHANNELS],
+ unsigned num_attribs, const struct lp_type soa_type)
{
- LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef comp;
- int i = 0;
+ unsigned i, j, k;
+ struct lp_type aos_channel_type = soa_type;
debug_assert(TGSI_NUM_CHANNELS == 4);
+ debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
- aos[0] = LLVMConstNull(LLVMTypeOf(soa[0]));
- aos[1] = aos[2] = aos[3] = aos[0];
-
- for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
- LLVMValueRef channel = lp_build_const_int32(gallivm, i);
-
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 0), "");
- aos[0] = LLVMBuildInsertElement(builder, aos[0], comp, channel, "");
-
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 1), "");
- aos[1] = LLVMBuildInsertElement(builder, aos[1], comp, channel, "");
+ aos_channel_type.length >>= 1;
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 2), "");
- aos[2] = LLVMBuildInsertElement(builder, aos[2], comp, channel, "");
-
- comp = LLVMBuildExtractElement(builder, soa[i],
- lp_build_const_int32(gallivm, 3), "");
- aos[3] = LLVMBuildInsertElement(builder, aos[3], comp, channel, "");
+ for (i = 0; i < num_attribs; ++i) {
+ LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
+ unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
- }
-}
+ for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
+ LLVMValueRef channel[LP_MAX_VECTOR_LENGTH];
+ assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
-static void
-convert_to_soa(struct gallivm_state *gallivm,
- LLVMValueRef (*aos)[TGSI_NUM_CHANNELS],
- LLVMValueRef (*soa)[TGSI_NUM_CHANNELS],
- int num_attribs)
-{
- int i;
+ for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[i][j + TGSI_NUM_CHANNELS * k];
+ }
- debug_assert(TGSI_NUM_CHANNELS == 4);
+ aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
+ }
- for (i = 0; i < num_attribs; ++i) {
- LLVMValueRef val0 = aos[i][0];
- LLVMValueRef val1 = aos[i][1];
- LLVMValueRef val2 = aos[i][2];
- LLVMValueRef val3 = aos[i][3];
-
- soa[i][0] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 0));
- soa[i][1] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 1));
- soa[i][2] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 2));
- soa[i][3] = aos_to_soa(gallivm, val0, val1, val2, val3,
- lp_build_const_int32(gallivm, 3));
+ lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa[i]);
}
}
@@ -707,89 +579,34 @@ static void
store_aos(struct gallivm_state *gallivm,
LLVMValueRef io_ptr,
LLVMValueRef index,
- LLVMValueRef value,
- LLVMValueRef clipmask, boolean have_clipdist)
+ LLVMValueRef value)
{
+ LLVMTypeRef data_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, lp_float32_vec4_type()), 0);
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef id_ptr = draw_jit_header_id(gallivm, io_ptr);
LLVMValueRef data_ptr = draw_jit_header_data(gallivm, io_ptr);
LLVMValueRef indices[3];
- LLVMValueRef val;
- int vertex_id_pad_edgeflag;
indices[0] = lp_build_const_int32(gallivm, 0);
indices[1] = index;
indices[2] = lp_build_const_int32(gallivm, 0);
- /* If this assertion fails, it means we need to update the bit twidding
- * code here. See struct vertex_header in draw_private.h.
- */
- assert(DRAW_TOTAL_CLIP_PLANES==14);
- /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */
- vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
- if (have_clipdist)
- vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1);
- val = lp_build_const_int32(gallivm, vertex_id_pad_edgeflag);
- /* OR with the clipmask */
- val = LLVMBuildOr(builder, val, clipmask, "");
-
- /* store vertex header */
- LLVMBuildStore(builder, val, id_ptr);
-
-
#if DEBUG_STORE
- lp_build_printf(builder, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr);
-#endif
-#if 0
- /*lp_build_printf(builder, " ---- %p storing at %d (%p) ", io_ptr, index, data_ptr);
- print_vectorf(builder, value);*/
- data_ptr = LLVMBuildBitCast(builder, data_ptr,
- LLVMPointerType(LLVMArrayType(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), 0), 0),
- "datavec");
- data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 2, "");
-
- LLVMBuildStore(builder, value, data_ptr);
-#else
- {
- LLVMValueRef x, y, z, w;
- LLVMValueRef idx0, idx1, idx2, idx3;
- LLVMValueRef gep0, gep1, gep2, gep3;
- data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 3, "");
-
- idx0 = lp_build_const_int32(gallivm, 0);
- idx1 = lp_build_const_int32(gallivm, 1);
- idx2 = lp_build_const_int32(gallivm, 2);
- idx3 = lp_build_const_int32(gallivm, 3);
-
- x = LLVMBuildExtractElement(builder, value,
- idx0, "");
- y = LLVMBuildExtractElement(builder, value,
- idx1, "");
- z = LLVMBuildExtractElement(builder, value,
- idx2, "");
- w = LLVMBuildExtractElement(builder, value,
- idx3, "");
-
- gep0 = LLVMBuildGEP(builder, data_ptr, &idx0, 1, "");
- gep1 = LLVMBuildGEP(builder, data_ptr, &idx1, 1, "");
- gep2 = LLVMBuildGEP(builder, data_ptr, &idx2, 1, "");
- gep3 = LLVMBuildGEP(builder, data_ptr, &idx3, 1, "");
-
- /*lp_build_printf(builder, "##### x = %f (%p), y = %f (%p), z = %f (%p), w = %f (%p)\n",
- x, gep0, y, gep1, z, gep2, w, gep3);*/
- LLVMBuildStore(builder, x, gep0);
- LLVMBuildStore(builder, y, gep1);
- LLVMBuildStore(builder, z, gep2);
- LLVMBuildStore(builder, w, gep3);
- }
+ lp_build_printf(gallivm, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr);
#endif
+
+ data_ptr = LLVMBuildGEP(builder, data_ptr, indices, 3, "");
+ data_ptr = LLVMBuildPointerCast(builder, data_ptr, data_ptr_type, "");
+
+ /* Unaligned store due to the vertex header */
+ lp_set_store_alignment(LLVMBuildStore(builder, value, data_ptr), sizeof(float));
}
static void
store_aos_array(struct gallivm_state *gallivm,
+ struct lp_type soa_type,
LLVMValueRef io_ptr,
- LLVMValueRef aos[TGSI_NUM_CHANNELS],
+ LLVMValueRef* aos,
int attrib,
int num_outputs,
LLVMValueRef clipmask,
@@ -797,42 +614,49 @@ store_aos_array(struct gallivm_state *gallivm,
{
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef attr_index = lp_build_const_int32(gallivm, attrib);
- LLVMValueRef ind0 = lp_build_const_int32(gallivm, 0);
- LLVMValueRef ind1 = lp_build_const_int32(gallivm, 1);
- LLVMValueRef ind2 = lp_build_const_int32(gallivm, 2);
- LLVMValueRef ind3 = lp_build_const_int32(gallivm, 3);
- LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
- LLVMValueRef clipmask0, clipmask1, clipmask2, clipmask3;
+ LLVMValueRef inds[LP_MAX_VECTOR_WIDTH / 32];
+ LLVMValueRef io_ptrs[LP_MAX_VECTOR_WIDTH / 32];
+ int vector_length = soa_type.length;
+ int i;
debug_assert(TGSI_NUM_CHANNELS == 4);
- io0_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind0, 1, "");
- io1_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind1, 1, "");
- io2_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind2, 1, "");
- io3_ptr = LLVMBuildGEP(builder, io_ptr,
- &ind3, 1, "");
-
- clipmask0 = LLVMBuildExtractElement(builder, clipmask,
- ind0, "");
- clipmask1 = LLVMBuildExtractElement(builder, clipmask,
- ind1, "");
- clipmask2 = LLVMBuildExtractElement(builder, clipmask,
- ind2, "");
- clipmask3 = LLVMBuildExtractElement(builder, clipmask,
- ind3, "");
+ for (i = 0; i < vector_length; i++) {
+ inds[i] = lp_build_const_int32(gallivm, i);
+ io_ptrs[i] = LLVMBuildGEP(builder, io_ptr, &inds[i], 1, "");
+ }
+ if (attrib == 0) {
+ /* store vertex header for each of the n vertices */
+ LLVMValueRef val, cliptmp;
+ int vertex_id_pad_edgeflag;
+
+ /* If this assertion fails, it means we need to update the bit twidding
+ * code here. See struct vertex_header in draw_private.h.
+ */
+ assert(DRAW_TOTAL_CLIP_PLANES==14);
+ /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */
+ vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
+ if (have_clipdist)
+ vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1);
+ val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type), vertex_id_pad_edgeflag);
+ /* OR with the clipmask */
+ cliptmp = LLVMBuildOr(builder, val, clipmask, "");
+ for (i = 0; i < vector_length; i++) {
+ LLVMValueRef id_ptr = draw_jit_header_id(gallivm, io_ptrs[i]);
+ val = LLVMBuildExtractElement(builder, cliptmp, inds[i], "");
+ LLVMBuildStore(builder, val, id_ptr);
#if DEBUG_STORE
- lp_build_printf(builder, "io = %p, indexes[%d, %d, %d, %d]\n, clipmask0 = %x, clipmask1 = %x, clipmask2 = %x, clipmask3 = %x\n",
- io_ptr, ind0, ind1, ind2, ind3, clipmask0, clipmask1, clipmask2, clipmask3);
+ lp_build_printf(gallivm, "io = %p, index %d\n, clipmask = %x\n",
+ io_ptrs[i], inds[i], val);
#endif
- /* store for each of the 4 vertices */
- store_aos(gallivm, io0_ptr, attr_index, aos[0], clipmask0, have_clipdist);
- store_aos(gallivm, io1_ptr, attr_index, aos[1], clipmask1, have_clipdist);
- store_aos(gallivm, io2_ptr, attr_index, aos[2], clipmask2, have_clipdist);
- store_aos(gallivm, io3_ptr, attr_index, aos[3], clipmask3, have_clipdist);
+ }
+ }
+
+ /* store for each of the n vertices */
+ for (i = 0; i < vector_length; i++) {
+ store_aos(gallivm, io_ptrs[i], attr_index, aos[i]);
+ }
}
@@ -842,33 +666,53 @@ convert_to_aos(struct gallivm_state *gallivm,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
LLVMValueRef clipmask,
int num_outputs,
- int max_vertices, boolean have_clipdist)
+ struct lp_type soa_type,
+ boolean have_clipdist)
{
LLVMBuilderRef builder = gallivm->builder;
- unsigned chan, attrib;
+ unsigned chan, attrib, i;
#if DEBUG_STORE
- lp_build_printf(builder, " # storing begin\n");
+ lp_build_printf(gallivm, " # storing begin\n");
#endif
for (attrib = 0; attrib < num_outputs; ++attrib) {
- LLVMValueRef soa[4];
- LLVMValueRef aos[4];
+ LLVMValueRef soa[TGSI_NUM_CHANNELS];
+ LLVMValueRef aos[LP_MAX_VECTOR_WIDTH / 32];
for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
if (outputs[attrib][chan]) {
LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
lp_build_name(out, "output%u.%c", attrib, "xyzw"[chan]);
- /*lp_build_printf(builder, "output %d : %d ",
- LLVMConstInt(LLVMInt32Type(), attrib, 0),
- LLVMConstInt(LLVMInt32Type(), chan, 0));
- print_vectorf(builder, out);*/
+#if DEBUG_STORE
+ lp_build_printf(gallivm, "output %d : %d ",
+ LLVMConstInt(LLVMInt32TypeInContext(gallivm->context),
+ attrib, 0),
+ LLVMConstInt(LLVMInt32TypeInContext(gallivm->context),
+ chan, 0));
+ lp_build_print_value(gallivm, "val = ", out);
+#endif
soa[chan] = out;
}
else {
soa[chan] = 0;
}
}
- soa_to_aos(gallivm, soa, aos);
+
+
+ if (soa_type.length == TGSI_NUM_CHANNELS) {
+ lp_build_transpose_aos(gallivm, soa_type, soa, aos);
+ } else {
+ lp_build_transpose_aos(gallivm, soa_type, soa, soa);
+
+ for (i = 0; i < soa_type.length; ++i) {
+ aos[i] = lp_build_extract_range(gallivm,
+ soa[i % TGSI_NUM_CHANNELS],
+ (i / TGSI_NUM_CHANNELS) * TGSI_NUM_CHANNELS,
+ TGSI_NUM_CHANNELS);
+ }
+ }
+
store_aos_array(gallivm,
+ soa_type,
io,
aos,
attrib,
@@ -876,104 +720,71 @@ convert_to_aos(struct gallivm_state *gallivm,
clipmask, have_clipdist);
}
#if DEBUG_STORE
- lp_build_printf(builder, " # storing end\n");
+ lp_build_printf(gallivm, " # storing end\n");
#endif
}
/**
* Stores original vertex positions in clip coordinates
- * There is probably a more efficient way to do this, 4 floats at once
- * rather than extracting each element one by one.
- * idx is the output to store things too, if pre_clip_pos is set
- * we store the pos to the idx, if not we store the clipvertex to it.
*/
static void
store_clip(struct gallivm_state *gallivm,
+ const struct lp_type vs_type,
LLVMValueRef io_ptr,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
boolean pre_clip_pos, int idx)
{
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef out[4];
+ LLVMValueRef soa[4];
+ LLVMValueRef aos[LP_MAX_VECTOR_LENGTH];
LLVMValueRef indices[2];
- LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
- LLVMValueRef clip_ptr0, clip_ptr1, clip_ptr2, clip_ptr3;
- LLVMValueRef clip0_ptr, clip1_ptr, clip2_ptr, clip3_ptr;
- LLVMValueRef out0elem, out1elem, out2elem, out3elem;
- int i;
+ LLVMValueRef io_ptrs[LP_MAX_VECTOR_WIDTH / 32];
+ LLVMValueRef inds[LP_MAX_VECTOR_WIDTH / 32];
+ LLVMValueRef clip_ptrs[LP_MAX_VECTOR_WIDTH / 32];
+ int i, j;
- LLVMValueRef ind0 = lp_build_const_int32(gallivm, 0);
- LLVMValueRef ind1 = lp_build_const_int32(gallivm, 1);
- LLVMValueRef ind2 = lp_build_const_int32(gallivm, 2);
- LLVMValueRef ind3 = lp_build_const_int32(gallivm, 3);
-
indices[0] =
indices[1] = lp_build_const_int32(gallivm, 0);
- out[0] = LLVMBuildLoad(builder, outputs[idx][0], ""); /*x0 x1 x2 x3*/
- out[1] = LLVMBuildLoad(builder, outputs[idx][1], ""); /*y0 y1 y2 y3*/
- out[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 z2 z3*/
- out[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 w2 w3*/
+ for (i = 0; i < vs_type.length; i++) {
+ inds[i] = lp_build_const_int32(gallivm, i);
+ io_ptrs[i] = LLVMBuildGEP(builder, io_ptr, &inds[i], 1, "");
+ }
- io0_ptr = LLVMBuildGEP(builder, io_ptr, &ind0, 1, "");
- io1_ptr = LLVMBuildGEP(builder, io_ptr, &ind1, 1, "");
- io2_ptr = LLVMBuildGEP(builder, io_ptr, &ind2, 1, "");
- io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, "");
+ soa[0] = LLVMBuildLoad(builder, outputs[idx][0], ""); /*x0 x1 .. xn*/
+ soa[1] = LLVMBuildLoad(builder, outputs[idx][1], ""); /*y0 y1 .. yn*/
+ soa[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 .. zn*/
+ soa[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 .. wn*/
if (!pre_clip_pos) {
- clip_ptr0 = draw_jit_header_clip(gallivm, io0_ptr);
- clip_ptr1 = draw_jit_header_clip(gallivm, io1_ptr);
- clip_ptr2 = draw_jit_header_clip(gallivm, io2_ptr);
- clip_ptr3 = draw_jit_header_clip(gallivm, io3_ptr);
+ for (i = 0; i < vs_type.length; i++) {
+ clip_ptrs[i] = draw_jit_header_clip(gallivm, io_ptrs[i]);
+ }
} else {
- clip_ptr0 = draw_jit_header_pre_clip_pos(gallivm, io0_ptr);
- clip_ptr1 = draw_jit_header_pre_clip_pos(gallivm, io1_ptr);
- clip_ptr2 = draw_jit_header_pre_clip_pos(gallivm, io2_ptr);
- clip_ptr3 = draw_jit_header_pre_clip_pos(gallivm, io3_ptr);
+ for (i = 0; i < vs_type.length; i++) {
+ clip_ptrs[i] = draw_jit_header_pre_clip_pos(gallivm, io_ptrs[i]);
+ }
}
- for (i = 0; i<4; i++) {
- clip0_ptr = LLVMBuildGEP(builder, clip_ptr0, indices, 2, ""); /* x0 */
- clip1_ptr = LLVMBuildGEP(builder, clip_ptr1, indices, 2, ""); /* x1 */
- clip2_ptr = LLVMBuildGEP(builder, clip_ptr2, indices, 2, ""); /* x2 */
- clip3_ptr = LLVMBuildGEP(builder, clip_ptr3, indices, 2, ""); /* x3 */
-
- out0elem = LLVMBuildExtractElement(builder, out[i], ind0, ""); /* x0 */
- out1elem = LLVMBuildExtractElement(builder, out[i], ind1, ""); /* x1 */
- out2elem = LLVMBuildExtractElement(builder, out[i], ind2, ""); /* x2 */
- out3elem = LLVMBuildExtractElement(builder, out[i], ind3, ""); /* x3 */
-
- LLVMBuildStore(builder, out0elem, clip0_ptr);
- LLVMBuildStore(builder, out1elem, clip1_ptr);
- LLVMBuildStore(builder, out2elem, clip2_ptr);
- LLVMBuildStore(builder, out3elem, clip3_ptr);
-
- indices[1]= LLVMBuildAdd(builder, indices[1], ind1, "");
+ lp_build_transpose_aos(gallivm, vs_type, soa, soa);
+ for (i = 0; i < vs_type.length; ++i) {
+ aos[i] = lp_build_extract_range(gallivm,
+ soa[i % TGSI_NUM_CHANNELS],
+ (i / TGSI_NUM_CHANNELS) * TGSI_NUM_CHANNELS,
+ TGSI_NUM_CHANNELS);
}
-}
-
+ for (j = 0; j < vs_type.length; j++) {
+ LLVMTypeRef clip_ptr_type = LLVMPointerType(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), 0);
+ LLVMValueRef clip_ptr;
-/**
- * Equivalent of _mm_set1_ps(a)
- */
-static LLVMValueRef
-vec4f_from_scalar(struct gallivm_state *gallivm,
- LLVMValueRef a,
- const char *name)
-{
- LLVMTypeRef float_type = LLVMFloatTypeInContext(gallivm->context);
- LLVMValueRef res = LLVMGetUndef(LLVMVectorType(float_type, 4));
- int i;
+ clip_ptr = LLVMBuildGEP(builder, clip_ptrs[j], indices, 2, "clipo");
+ clip_ptr = LLVMBuildPointerCast(builder, clip_ptr, clip_ptr_type, "");
- for (i = 0; i < 4; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- res = LLVMBuildInsertElement(gallivm->builder, res, a,
- index, i == 3 ? name : "");
+ /* Unaligned store */
+ lp_set_store_alignment(LLVMBuildStore(builder, aos[j], clip_ptr), sizeof(float));
}
-
- return res;
}
@@ -981,15 +792,17 @@ vec4f_from_scalar(struct gallivm_state *gallivm,
* Transforms the outputs for viewport mapping
*/
static void
-generate_viewport(struct draw_llvm *llvm,
+generate_viewport(struct draw_llvm_variant *variant,
LLVMBuilderRef builder,
+ struct lp_type vs_type,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
LLVMValueRef context_ptr)
{
int i;
- struct gallivm_state *gallivm = llvm->gallivm;
- struct lp_type f32_type = lp_type_float_vec(32);
- LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/
+ struct gallivm_state *gallivm = variant->gallivm;
+ struct lp_type f32_type = vs_type;
+ LLVMTypeRef vs_type_llvm = lp_build_vec_type(gallivm, vs_type);
+ LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 .. wn*/
LLVMValueRef const1 = lp_build_const_vec(gallivm, f32_type, 1.0); /*1.0 1.0 1.0 1.0*/
LLVMValueRef vp_ptr = draw_jit_context_viewport(gallivm, context_ptr);
@@ -999,7 +812,7 @@ generate_viewport(struct draw_llvm *llvm,
/* Viewport Mapping */
for (i=0; i<3; i++) {
- LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 x2 x3*/
+ LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 .. xn*/
LLVMValueRef scale;
LLVMValueRef trans;
LLVMValueRef scale_i;
@@ -1012,8 +825,10 @@ generate_viewport(struct draw_llvm *llvm,
index = lp_build_const_int32(gallivm, i+4);
trans_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, "");
- scale = vec4f_from_scalar(gallivm, LLVMBuildLoad(builder, scale_i, ""), "scale");
- trans = vec4f_from_scalar(gallivm, LLVMBuildLoad(builder, trans_i, ""), "trans");
+ scale = lp_build_broadcast(gallivm, vs_type_llvm,
+ LLVMBuildLoad(builder, scale_i, "scale"));
+ trans = lp_build_broadcast(gallivm, vs_type_llvm,
+ LLVMBuildLoad(builder, trans_i, "trans"));
/* divide by w */
out = LLVMBuildFMul(builder, out, out3, "");
@@ -1030,10 +845,12 @@ generate_viewport(struct draw_llvm *llvm,
/**
- * Returns clipmask as 4xi32 bitmask for the 4 vertices
+ * Returns clipmask as nxi32 bitmask for the n vertices
*/
static LLVMValueRef
generate_clipmask(struct draw_llvm *llvm,
+ struct gallivm_state *gallivm,
+ struct lp_type vs_type,
LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
boolean clip_xy,
boolean clip_z,
@@ -1043,15 +860,15 @@ generate_clipmask(struct draw_llvm *llvm,
LLVMValueRef context_ptr,
boolean *have_clipdist)
{
- struct gallivm_state *gallivm = llvm->gallivm;
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef mask; /* stores the <4xi32> clipmasks */
+ LLVMValueRef mask; /* stores the <nxi32> clipmasks */
LLVMValueRef test, temp;
LLVMValueRef zero, shift;
LLVMValueRef pos_x, pos_y, pos_z, pos_w;
LLVMValueRef cv_x, cv_y, cv_z, cv_w;
LLVMValueRef plane1, planes, plane_ptr, sum;
- struct lp_type f32_type = lp_type_float_vec(32);
+ struct lp_type f32_type = vs_type;
+ struct lp_type i32_type = lp_int_type(vs_type);
const unsigned pos = draw_current_shader_position_output(llvm->draw);
const unsigned cv = draw_current_shader_clipvertex_output(llvm->draw);
int num_written_clipdistance = llvm->draw->vs.vertex_shader->info.num_written_clipdistance;
@@ -1064,25 +881,25 @@ generate_clipmask(struct draw_llvm *llvm,
if (cd[0] != pos || cd[1] != pos)
have_cd = true;
- mask = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0);
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0);
- zero = lp_build_const_vec(gallivm, f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */
- shift = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1); /* 1 1 1 1 */
+ mask = lp_build_const_int_vec(gallivm, i32_type, 0);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 0);
+ zero = lp_build_const_vec(gallivm, f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */
+ shift = lp_build_const_int_vec(gallivm, i32_type, 1); /* 1 1 1 1 */
/*
* load clipvertex and position from correct locations.
* if they are the same just load them once.
*/
- pos_x = LLVMBuildLoad(builder, outputs[pos][0], ""); /*x0 x1 x2 x3*/
- pos_y = LLVMBuildLoad(builder, outputs[pos][1], ""); /*y0 y1 y2 y3*/
- pos_z = LLVMBuildLoad(builder, outputs[pos][2], ""); /*z0 z1 z2 z3*/
- pos_w = LLVMBuildLoad(builder, outputs[pos][3], ""); /*w0 w1 w2 w3*/
+ pos_x = LLVMBuildLoad(builder, outputs[pos][0], ""); /*x0 x1 .. xn */
+ pos_y = LLVMBuildLoad(builder, outputs[pos][1], ""); /*y0 y1 .. yn */
+ pos_z = LLVMBuildLoad(builder, outputs[pos][2], ""); /*z0 z1 .. zn */
+ pos_w = LLVMBuildLoad(builder, outputs[pos][3], ""); /*w0 w1 .. wn */
if (clip_user && cv != pos) {
- cv_x = LLVMBuildLoad(builder, outputs[cv][0], ""); /*x0 x1 x2 x3*/
- cv_y = LLVMBuildLoad(builder, outputs[cv][1], ""); /*y0 y1 y2 y3*/
- cv_z = LLVMBuildLoad(builder, outputs[cv][2], ""); /*z0 z1 z2 z3*/
- cv_w = LLVMBuildLoad(builder, outputs[cv][3], ""); /*w0 w1 w2 w3*/
+ cv_x = LLVMBuildLoad(builder, outputs[cv][0], ""); /*x0 x1 .. xn */
+ cv_y = LLVMBuildLoad(builder, outputs[cv][1], ""); /*y0 y1 .. yn */
+ cv_z = LLVMBuildLoad(builder, outputs[cv][2], ""); /*z0 z1 .. zn */
+ cv_w = LLVMBuildLoad(builder, outputs[cv][3], ""); /*w0 w1 .. wn */
} else {
cv_x = pos_x;
cv_y = pos_y;
@@ -1120,7 +937,7 @@ generate_clipmask(struct draw_llvm *llvm,
}
if (clip_z) {
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 16);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 16);
if (clip_halfz) {
/* plane 5 */
test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, pos_z);
@@ -1163,42 +980,43 @@ generate_clipmask(struct draw_llvm *llvm,
clipdist = LLVMBuildLoad(builder, outputs[cd[1]][i-4], "");
}
test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, clipdist);
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1 << plane_idx);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 1 << plane_idx);
test = LLVMBuildAnd(builder, test, temp, "");
mask = LLVMBuildOr(builder, mask, test, "");
} else {
+ LLVMTypeRef vs_type_llvm = lp_build_vec_type(gallivm, vs_type);
indices[0] = lp_build_const_int32(gallivm, 0);
indices[1] = lp_build_const_int32(gallivm, plane_idx);
indices[2] = lp_build_const_int32(gallivm, 0);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_x");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_x");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
sum = LLVMBuildFMul(builder, planes, cv_x, "");
indices[2] = lp_build_const_int32(gallivm, 1);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_y");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_y");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
test = LLVMBuildFMul(builder, planes, cv_y, "");
sum = LLVMBuildFAdd(builder, sum, test, "");
indices[2] = lp_build_const_int32(gallivm, 2);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_z");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_z");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
test = LLVMBuildFMul(builder, planes, cv_z, "");
sum = LLVMBuildFAdd(builder, sum, test, "");
indices[2] = lp_build_const_int32(gallivm, 3);
plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_w");
- planes = vec4f_from_scalar(gallivm, plane1, "plane4_w");
+ planes = lp_build_broadcast(gallivm, vs_type_llvm, plane1);
test = LLVMBuildFMul(builder, planes, cv_w, "");
sum = LLVMBuildFAdd(builder, sum, test, "");
test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, sum);
- temp = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 1 << plane_idx);
+ temp = lp_build_const_int_vec(gallivm, i32_type, 1 << plane_idx);
test = LLVMBuildAnd(builder, test, temp, "");
mask = LLVMBuildOr(builder, mask, test, "");
}
@@ -1212,23 +1030,28 @@ generate_clipmask(struct draw_llvm *llvm,
* Returns boolean if any clipping has occurred
* Used zero/non-zero i32 value to represent boolean
*/
-static void
-clipmask_bool(struct gallivm_state *gallivm,
- LLVMValueRef clipmask,
- LLVMValueRef ret_ptr)
+static LLVMValueRef
+clipmask_booli32(struct gallivm_state *gallivm,
+ const struct lp_type vs_type,
+ LLVMValueRef clipmask_bool_ptr)
{
LLVMBuilderRef builder = gallivm->builder;
- LLVMValueRef ret = LLVMBuildLoad(builder, ret_ptr, "");
+ LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef clipmask_bool = LLVMBuildLoad(builder, clipmask_bool_ptr, "");
+ LLVMValueRef ret = LLVMConstNull(int32_type);
LLVMValueRef temp;
int i;
- for (i=0; i<4; i++) {
- temp = LLVMBuildExtractElement(builder, clipmask,
+ /*
+ * Can do this with log2(vector length) pack instructions and one extract
+ * (as we don't actually need a or) with sse2 which would be way better.
+ */
+ for (i=0; i < vs_type.length; i++) {
+ temp = LLVMBuildExtractElement(builder, clipmask_bool,
lp_build_const_int32(gallivm, i) , "");
ret = LLVMBuildOr(builder, ret, temp, "");
}
-
- LLVMBuildStore(builder, ret, ret_ptr);
+ return ret;
}
@@ -1236,7 +1059,7 @@ static void
draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
boolean elts)
{
- struct gallivm_state *gallivm = llvm->gallivm;
+ struct gallivm_state *gallivm = variant->gallivm;
LLVMContextRef context = gallivm->context;
LLVMTypeRef int32_type = LLVMInt32TypeInContext(context);
LLVMTypeRef arg_types[8];
@@ -1244,6 +1067,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
LLVMValueRef context_ptr;
LLVMBasicBlockRef block;
LLVMBuilderRef builder;
+ struct lp_type vs_type;
LLVMValueRef end, start;
LLVMValueRef count, fetch_elts, fetch_count;
LLVMValueRef stride, step, io_itr;
@@ -1255,12 +1079,11 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
unsigned i, j;
struct lp_build_context bld;
struct lp_build_loop_state lp_loop;
- const int max_vertices = 4;
+ const int vector_length = lp_native_vector_width / 32;
LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
LLVMValueRef fetch_max;
- void *code;
struct lp_build_sampler_soa *sampler = 0;
- LLVMValueRef ret, ret_ptr;
+ LLVMValueRef ret, clipmask_bool_ptr;
const boolean bypass_viewport = variant->key.bypass_viewport;
const boolean enable_cliptest = variant->key.clip_xy ||
variant->key.clip_z ||
@@ -1273,16 +1096,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
memset(&system_values, 0, sizeof(system_values));
- arg_types[0] = get_context_ptr_type(llvm); /* context */
- arg_types[1] = get_vertex_header_ptr_type(llvm); /* vertex_header */
- arg_types[2] = get_buffer_ptr_type(llvm); /* vbuffers */
+ arg_types[0] = get_context_ptr_type(variant); /* context */
+ arg_types[1] = get_vertex_header_ptr_type(variant); /* vertex_header */
+ arg_types[2] = get_buffer_ptr_type(variant); /* vbuffers */
if (elts)
arg_types[3] = LLVMPointerType(int32_type, 0);/* fetch_elts * */
else
arg_types[3] = int32_type; /* start */
arg_types[4] = int32_type; /* fetch_count / count */
arg_types[5] = int32_type; /* stride */
- arg_types[6] = get_vb_ptr_type(llvm); /* pipe_vertex_buffer's */
+ arg_types[6] = get_vb_ptr_type(variant); /* pipe_vertex_buffer's */
arg_types[7] = int32_type; /* instance_id */
func_type = LLVMFunctionType(int32_type, arg_types, Elements(arg_types), 0);
@@ -1341,9 +1164,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
lp_build_context_init(&bld, gallivm, lp_type_int(32));
- /* function will return non-zero i32 value if any clipped vertices */
- ret_ptr = lp_build_alloca(gallivm, int32_type, "");
- LLVMBuildStore(builder, zero, ret_ptr);
+ memset(&vs_type, 0, sizeof vs_type);
+ vs_type.floating = TRUE; /* floating point values */
+ vs_type.sign = TRUE; /* values are signed */
+ vs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
+ vs_type.width = 32; /* 32-bit float */
+ vs_type.length = vector_length;
+
+ /* hold temporary "bool" clipmask */
+ clipmask_bool_ptr = lp_build_alloca(gallivm, lp_build_int_vec_type(gallivm, vs_type), "");
+ LLVMBuildStore(builder, lp_build_zero(gallivm, lp_int_type(vs_type)), clipmask_bool_ptr);
/* code generated texture sampling */
sampler = draw_llvm_sampler_soa_create(
@@ -1358,14 +1188,14 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
end = lp_build_add(&bld, start, count);
}
- step = lp_build_const_int32(gallivm, max_vertices);
+ step = lp_build_const_int32(gallivm, vector_length);
fetch_max = LLVMBuildSub(builder, end, one, "fetch_max");
lp_build_loop_begin(&lp_loop, gallivm, start);
{
LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS];
- LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS] = { { 0 } };
+ LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][LP_MAX_VECTOR_WIDTH / 32] = { { 0 } };
LLVMValueRef io;
LLVMValueRef clipmask; /* holds the clipmask value */
const LLVMValueRef (*ptr_aos)[TGSI_NUM_CHANNELS];
@@ -1377,11 +1207,11 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
io = LLVMBuildGEP(builder, io_ptr, &io_itr, 1, "");
#if DEBUG_STORE
- lp_build_printf(builder, " --- io %d = %p, loop counter %d\n",
+ lp_build_printf(gallivm, " --- io %d = %p, loop counter %d\n",
io_itr, io, lp_loop.counter);
#endif
- system_values.vertex_id = lp_build_zero(gallivm, lp_type_uint_vec(32));
- for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
+ system_values.vertex_id = lp_build_zero(gallivm, lp_type_uint_vec(32, 32*vector_length));
+ for (i = 0; i < vector_length; ++i) {
LLVMValueRef true_index =
LLVMBuildAdd(builder,
lp_loop.counter,
@@ -1413,11 +1243,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
}
}
convert_to_soa(gallivm, aos_attribs, inputs,
- draw->pt.nr_vertex_elements);
+ draw->pt.nr_vertex_elements, vs_type);
ptr_aos = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) inputs;
- generate_vs(llvm,
+ generate_vs(variant,
builder,
+ vs_type,
outputs,
ptr_aos,
&system_values,
@@ -1426,29 +1257,34 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
variant->key.clamp_vertex_color);
/* store original positions in clip before further manipulation */
- store_clip(gallivm, io, outputs, 0, cv);
- store_clip(gallivm, io, outputs, 1, pos);
+ store_clip(gallivm, vs_type, io, outputs, 0, cv);
+ store_clip(gallivm, vs_type, io, outputs, 1, pos);
/* do cliptest */
if (enable_cliptest) {
+ LLVMValueRef temp = LLVMBuildLoad(builder, clipmask_bool_ptr, "");
/* allocate clipmask, assign it integer type */
- clipmask = generate_clipmask(llvm, outputs,
+ clipmask = generate_clipmask(llvm,
+ gallivm,
+ vs_type,
+ outputs,
variant->key.clip_xy,
variant->key.clip_z,
variant->key.clip_user,
variant->key.clip_halfz,
variant->key.ucp_enable,
context_ptr, &have_clipdist);
- /* return clipping boolean value for function */
- clipmask_bool(gallivm, clipmask, ret_ptr);
+ temp = LLVMBuildOr(builder, clipmask, temp, "");
+ /* store temporary clipping boolean value */
+ LLVMBuildStore(builder, temp, clipmask_bool_ptr);
}
else {
- clipmask = lp_build_const_int_vec(gallivm, lp_type_int_vec(32), 0);
+ clipmask = lp_build_const_int_vec(gallivm, lp_int_type(vs_type), 0);
}
/* do viewport mapping */
if (!bypass_viewport) {
- generate_viewport(llvm, builder, outputs, context_ptr);
+ generate_viewport(variant, builder, vs_type, outputs, context_ptr);
}
/* store clipmask in vertex header,
@@ -1456,43 +1292,20 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
* and transformed positions in data
*/
convert_to_aos(gallivm, io, outputs, clipmask,
- vs_info->num_outputs, max_vertices, have_clipdist);
+ vs_info->num_outputs, vs_type,
+ have_clipdist);
}
lp_build_loop_end_cond(&lp_loop, end, step, LLVMIntUGE);
sampler->destroy(sampler);
- ret = LLVMBuildLoad(builder, ret_ptr, "");
- LLVMBuildRet(builder, ret);
-
- /*
- * Translate the LLVM IR into machine code.
- */
-#ifdef DEBUG
- if (LLVMVerifyFunction(variant_func, LLVMPrintMessageAction)) {
- lp_debug_dump_value(variant_func);
- assert(0);
- }
-#endif
-
- LLVMRunFunctionPassManager(gallivm->passmgr, variant_func);
+ /* return clipping boolean value for function */
+ ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr);
- if (gallivm_debug & GALLIVM_DEBUG_IR) {
- lp_debug_dump_value(variant_func);
- debug_printf("\n");
- }
-
- code = LLVMGetPointerToGlobal(gallivm->engine, variant_func);
- if (elts)
- variant->jit_func_elts = (draw_jit_vert_func_elts) pointer_to_func(code);
- else
- variant->jit_func = (draw_jit_vert_func) pointer_to_func(code);
+ LLVMBuildRet(builder, ret);
- if (gallivm_debug & GALLIVM_DEBUG_ASM) {
- lp_disassemble(code);
- }
- lp_func_delete_body(variant_func);
+ gallivm_verify_function(gallivm, variant_func);
}
@@ -1600,17 +1413,17 @@ draw_llvm_destroy_variant(struct draw_llvm_variant *variant)
struct draw_llvm *llvm = variant->llvm;
if (variant->function_elts) {
- LLVMFreeMachineCodeForFunction(llvm->gallivm->engine,
- variant->function_elts);
- LLVMDeleteFunction(variant->function_elts);
+ gallivm_free_function(variant->gallivm,
+ variant->function_elts, variant->jit_func_elts);
}
if (variant->function) {
- LLVMFreeMachineCodeForFunction(llvm->gallivm->engine,
- variant->function);
- LLVMDeleteFunction(variant->function);
+ gallivm_free_function(variant->gallivm,
+ variant->function, variant->jit_func);
}
+ gallivm_destroy(variant->gallivm);
+
remove_from_list(&variant->list_item_local);
variant->shader->variants_cached--;
remove_from_list(&variant->list_item_global);
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index 31fc2db05bd..39d83cfe99f 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -36,11 +36,6 @@
#include "pipe/p_context.h"
#include "util/u_simple_list.h"
-#include <llvm-c/Core.h>
-#include <llvm-c/Analysis.h>
-#include <llvm-c/Target.h>
-#include <llvm-c/ExecutionEngine.h>
-
struct draw_llvm;
struct llvm_vertex_shader;
@@ -220,6 +215,14 @@ struct draw_llvm_variant_list_item
struct draw_llvm_variant
{
+ struct gallivm_state *gallivm;
+
+ /* LLVM JIT builder types */
+ LLVMTypeRef context_ptr_type;
+ LLVMTypeRef buffer_ptr_type;
+ LLVMTypeRef vb_ptr_type;
+ LLVMTypeRef vertex_header_ptr_type;
+
LLVMValueRef function;
LLVMValueRef function_elts;
draw_jit_vert_func jit_func;
@@ -249,16 +252,8 @@ struct draw_llvm {
struct draw_jit_context jit_context;
- struct gallivm_state *gallivm;
-
struct draw_llvm_variant_list_item vs_variants_list;
int nr_variants;
-
- /* LLVM JIT builder types */
- LLVMTypeRef context_ptr_type;
- LLVMTypeRef buffer_ptr_type;
- LLVMTypeRef vb_ptr_type;
- LLVMTypeRef vertex_header_ptr_type;
};
@@ -270,7 +265,7 @@ llvm_vertex_shader(struct draw_vertex_shader *vs)
struct draw_llvm *
-draw_llvm_create(struct draw_context *draw, struct gallivm_state *gallivm);
+draw_llvm_create(struct draw_context *draw);
void
draw_llvm_destroy(struct draw_llvm *llvm);
@@ -286,11 +281,6 @@ draw_llvm_destroy_variant(struct draw_llvm_variant *variant);
struct draw_llvm_variant_key *
draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store);
-LLVMValueRef
-draw_llvm_translate_from(struct gallivm_state *gallivm,
- LLVMValueRef vbuffer,
- enum pipe_format from_format);
-
struct lp_build_sampler_soa *
draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
LLVMValueRef context_ptr);
diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c
index 0a8b3bc535f..1dbe5f5bd19 100644
--- a/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -173,8 +173,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *texel)
@@ -189,7 +188,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
type,
unit,
num_coords, coords,
- ddx, ddy,
+ derivs,
lod_bias, explicit_lod,
texel);
}
@@ -201,6 +200,7 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
static void
draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
struct gallivm_state *gallivm,
+ struct lp_type type,
unsigned unit,
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *sizes_out)
@@ -212,6 +212,7 @@ draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
lp_build_size_query_soa(gallivm,
&sampler->dynamic_state.static_state[unit],
&sampler->dynamic_state.base,
+ type,
unit,
explicit_lod,
sizes_out);
diff --git a/src/gallium/auxiliary/draw/draw_llvm_translate.c b/src/gallium/auxiliary/draw/draw_llvm_translate.c
deleted file mode 100644
index 77d0af74733..00000000000
--- a/src/gallium/auxiliary/draw/draw_llvm_translate.c
+++ /dev/null
@@ -1,506 +0,0 @@
-#include "draw_private.h"
-#include "draw_context.h"
-
-#include "draw_llvm.h"
-
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_struct.h"
-#include "gallivm/lp_bld_format.h"
-#include "gallivm/lp_bld_debug.h"
-#include "gallivm/lp_bld_type.h"
-
-#include "util/u_memory.h"
-#include "util/u_format.h"
-#include "pipe/p_state.h"
-
-
-#define DRAW_DBG 0
-
-static LLVMValueRef
-from_64_float(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMDoubleTypeInContext(gallivm->context), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildFPTrunc(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static LLVMValueRef
-from_32_float(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0) , "");
- return LLVMBuildLoad(gallivm->builder, bc, "");
-}
-
-static INLINE LLVMValueRef
-from_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-static INLINE LLVMValueRef
-from_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- return LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-}
-
-
-static INLINE LLVMValueRef
-from_8_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 255.), "");
-}
-
-static INLINE LLVMValueRef
-from_16_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65535.), "");
-}
-
-static INLINE LLVMValueRef
-from_32_unorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildUIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 4294967295.), "");
-}
-
-static INLINE LLVMValueRef
-from_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 127.0), "");
-}
-
-static INLINE LLVMValueRef
-from_16_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 16), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 32767.0f), "");
-}
-
-static INLINE LLVMValueRef
-from_32_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 2147483647.0), "");
-}
-
-static INLINE LLVMValueRef
-from_32_fixed(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef bc = LLVMBuildBitCast(gallivm->builder, val,
- LLVMPointerType(LLVMIntTypeInContext(gallivm->context, 32), 0) , "");
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, bc, "");
- LLVMValueRef uscaled = LLVMBuildSIToFP(gallivm->builder, l, LLVMFloatTypeInContext(gallivm->context), "");
-
- return LLVMBuildFDiv(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65536.0), "");
-}
-
-static LLVMValueRef
-to_64_float(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPExt(gallivm->builder, l, LLVMDoubleTypeInContext(gallivm->context), "");
-}
-
-static LLVMValueRef
-to_32_float(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- return LLVMBuildLoad(gallivm->builder, fp, "");
-}
-
-static INLINE LLVMValueRef
-to_8_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), "");
-}
-
-static INLINE LLVMValueRef
-to_16_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), "");
-}
-
-static INLINE LLVMValueRef
-to_32_uscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToUI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), "");
-}
-
-static INLINE LLVMValueRef
-to_8_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 8), "");
-}
-
-static INLINE LLVMValueRef
-to_16_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 16), "");
-}
-
-static INLINE LLVMValueRef
-to_32_sscaled(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- return LLVMBuildFPToSI(gallivm->builder, l, LLVMIntTypeInContext(gallivm->context, 32), "");
-}
-
-static INLINE LLVMValueRef
-to_8_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 8), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 255.), "");
-}
-
-static INLINE LLVMValueRef
-to_16_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65535.), "");
-}
-
-static INLINE LLVMValueRef
-to_32_unorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToUI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
-
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 4294967295.), "");
-}
-
-static INLINE LLVMValueRef
-to_8_snorm(struct gallivm_state *gallivm, LLVMValueRef val)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, val, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 8), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 127.0), "");
-}
-
-static INLINE LLVMValueRef
-to_16_snorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 16), "");
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 32767.0f), "");
-}
-
-static INLINE LLVMValueRef
-to_32_snorm(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
-
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 2147483647.0), "");
-}
-
-static INLINE LLVMValueRef
-to_32_fixed(struct gallivm_state *gallivm, LLVMValueRef fp)
-{
- LLVMValueRef l = LLVMBuildLoad(gallivm->builder, fp, "");
- LLVMValueRef uscaled = LLVMBuildFPToSI(gallivm->builder, l,
- LLVMIntTypeInContext(gallivm->context, 32), "");
-
- return LLVMBuildFMul(gallivm->builder, uscaled,
- lp_build_const_float(gallivm, 65536.0), "");
-}
-
-typedef LLVMValueRef (*from_func)(struct gallivm_state *, LLVMValueRef);
-typedef LLVMValueRef (*to_func)(struct gallivm_state *, LLVMValueRef);
-
-/* so that underneath can avoid function calls which are prohibited
- * for static initialization we need this conversion */
-enum ll_type {
- LL_Double,
- LL_Float,
- LL_Int32,
- LL_Int16,
- LL_Int8
-};
-
-static INLINE LLVMTypeRef
-ll_type_to_llvm(struct gallivm_state *gallivm, enum ll_type type)
-{
- switch (type) {
- case LL_Double:
- return LLVMDoubleTypeInContext(gallivm->context);
- case LL_Float:
- return LLVMFloatTypeInContext(gallivm->context);
- case LL_Int32:
- return LLVMInt32TypeInContext(gallivm->context);
- case LL_Int16:
- return LLVMIntTypeInContext(gallivm->context, 16);
- case LL_Int8:
- return LLVMIntTypeInContext(gallivm->context, 8);
- }
- return LLVMIntTypeInContext(gallivm->context, 8);
-}
-
-static INLINE int
-ll_type_size(enum ll_type type)
-{
- switch (type) {
- case LL_Double:
- return 8;
- case LL_Float:
- return 4;
- case LL_Int32:
- return 4;
- case LL_Int16:
- return 2;
- case LL_Int8:
- return 1;
- }
- return 1;
-}
-
-struct draw_llvm_translate {
- int format;
- from_func from;
- to_func to;
- enum ll_type type;
- int num_components;
-} translates[] =
-{
- {PIPE_FORMAT_R64_FLOAT, from_64_float, to_64_float, LL_Double, 1},
- {PIPE_FORMAT_R64G64_FLOAT, from_64_float, to_64_float, LL_Double, 2},
- {PIPE_FORMAT_R64G64B64_FLOAT, from_64_float, to_64_float, LL_Double, 3},
- {PIPE_FORMAT_R64G64B64A64_FLOAT, from_64_float, to_64_float, LL_Double, 4},
- {PIPE_FORMAT_R32_FLOAT, from_32_float, to_32_float, LL_Float, 1},
- {PIPE_FORMAT_R32G32_FLOAT, from_32_float, to_32_float, LL_Float, 2},
- {PIPE_FORMAT_R32G32B32_FLOAT, from_32_float, to_32_float, LL_Float, 3},
- {PIPE_FORMAT_R32G32B32A32_FLOAT, from_32_float, to_32_float, LL_Float, 4},
-
- {PIPE_FORMAT_R32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_UNORM, from_32_unorm, to_32_unorm, LL_Int32, 4},
-
- {PIPE_FORMAT_R32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_USCALED, from_32_uscaled, to_32_uscaled, LL_Int32, 4},
-
- {PIPE_FORMAT_R32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_SNORM, from_32_snorm, to_32_snorm, LL_Int32, 4},
-
- {PIPE_FORMAT_R32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_SSCALED, from_32_sscaled, to_32_sscaled, LL_Int32, 4},
-
- {PIPE_FORMAT_R16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_UNORM, from_16_unorm, to_16_unorm, LL_Int16, 4},
-
- {PIPE_FORMAT_R16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_USCALED, from_16_uscaled, to_16_uscaled, LL_Int16, 4},
-
- {PIPE_FORMAT_R16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_SNORM, from_16_snorm, to_16_snorm, LL_Int16, 4},
-
- {PIPE_FORMAT_R16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 1},
- {PIPE_FORMAT_R16G16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 2},
- {PIPE_FORMAT_R16G16B16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 3},
- {PIPE_FORMAT_R16G16B16A16_SSCALED, from_16_sscaled, to_16_sscaled, LL_Int16, 4},
-
- {PIPE_FORMAT_R8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_UNORM, from_8_unorm, to_8_unorm, LL_Int8, 4},
-
- {PIPE_FORMAT_R8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_USCALED, from_8_uscaled, to_8_uscaled, LL_Int8, 4},
-
- {PIPE_FORMAT_R8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_SNORM, from_8_snorm, to_8_snorm, LL_Int8, 4},
-
- {PIPE_FORMAT_R8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 1},
- {PIPE_FORMAT_R8G8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 2},
- {PIPE_FORMAT_R8G8B8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 3},
- {PIPE_FORMAT_R8G8B8A8_SSCALED, from_8_sscaled, to_8_sscaled, LL_Int8, 4},
-
- {PIPE_FORMAT_R32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 1},
- {PIPE_FORMAT_R32G32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 2},
- {PIPE_FORMAT_R32G32B32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 3},
- {PIPE_FORMAT_R32G32B32A32_FIXED, from_32_fixed, to_32_fixed, LL_Int32, 4},
-};
-
-
-static LLVMValueRef
-fetch(struct gallivm_state *gallivm,
- LLVMValueRef ptr, int val_size, int nr_components,
- from_func func)
-{
- int i;
- int offset = 0;
- LLVMValueRef res =
- LLVMConstNull(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4));
- LLVMValueRef defaults[4];
-
- defaults[0] =
- defaults[1] =
- defaults[2] = lp_build_const_float(gallivm, 0.0);
- defaults[3] = lp_build_const_float(gallivm, 1.0);
-
- for (i = 0; i < nr_components; ++i) {
- LLVMValueRef src_index = lp_build_const_int32(gallivm, offset);
- LLVMValueRef dst_index = lp_build_const_int32(gallivm, i);
- LLVMValueRef src_tmp;
- LLVMValueRef component;
-
- src_tmp = LLVMBuildGEP(gallivm->builder, ptr, &src_index, 1, "src_tmp");
-
- /* convert src_tmp to float */
- component = func(gallivm, src_tmp);
-
- /* vec.comp = component */
- res = LLVMBuildInsertElement(gallivm->builder,
- res,
- component,
- dst_index, "");
- offset += val_size;
- }
- for (; i < 4; ++i) {
- LLVMValueRef dst_index = lp_build_const_int32(gallivm, i);
- res = LLVMBuildInsertElement(gallivm->builder,
- res,
- defaults[i],
- dst_index, "");
- }
- return res;
-}
-
-
-LLVMValueRef
-draw_llvm_translate_from(struct gallivm_state *gallivm,
- LLVMValueRef vbuffer,
- enum pipe_format from_format)
-{
- const struct util_format_description *format_desc;
- LLVMValueRef zero;
- int i;
- struct lp_type type = lp_float32_vec4_type();
-
- /*
- * The above can only cope with straight arrays: no bitfields,
- * swizzles, or half floats.
- */
-
- for (i = 0; i < Elements(translates); ++i) {
- if (translates[i].format == from_format) {
- /*LLVMTypeRef type = ll_type_to_llvm(translates[i].type);*/
- return fetch(gallivm,
- vbuffer,
- ll_type_size(translates[i].type),
- translates[i].num_components,
- translates[i].from);
- }
- }
-
-
- /*
- * This doesn't handle anything bigger than 32bits, or half floats
- * yet.
- *
- * TODO: unify all this code into lp_build_fetch_rgba_aos().
- */
-
- format_desc = util_format_description(from_format);
- zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
- return lp_build_fetch_rgba_aos(gallivm, format_desc, type, vbuffer, zero, zero, zero);
-}
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index d85deeea7f5..9cede2108db 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -47,8 +47,8 @@
#include "tgsi/tgsi_scan.h"
#ifdef HAVE_LLVM
-#include <llvm-c/ExecutionEngine.h>
struct draw_llvm;
+struct gallivm_state;
#endif
@@ -301,7 +301,6 @@ struct draw_context
#ifdef HAVE_LLVM
struct draw_llvm *llvm;
- struct gallivm_state *own_gallivm;
#endif
struct pipe_sampler_view *sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 1e17f808408..04b286f0f5b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -230,7 +230,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
llvm_vert_info.stride = fpme->vertex_size;
llvm_vert_info.verts =
(struct vertex_header *)MALLOC(fpme->vertex_size *
- align(fetch_info->count, 4));
+ align(fetch_info->count, lp_native_vector_width / 32));
if (!llvm_vert_info.verts) {
assert(0);
return;
@@ -423,7 +423,7 @@ draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw)
{
struct llvm_middle_end *fpme = 0;
- if (!draw->llvm || !draw->llvm->gallivm->engine)
+ if (!draw->llvm)
return NULL;
fpme = CALLOC_STRUCT( llvm_middle_end );
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 9fc57629822..d226dab5b81 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -75,9 +75,9 @@ lp_build_min_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -85,31 +85,71 @@ lp_build_min_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.min.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.min.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.min.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.min.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.min.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pminu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pminu.b";
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmins.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmins.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pminud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pminsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
return lp_build_select(bld, cond, a, b);
@@ -125,9 +165,9 @@ lp_build_max_simple(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const char *intrinsic = NULL;
+ unsigned intr_size;
LLVMValueRef cond;
assert(lp_check_value(type, a));
@@ -135,31 +175,72 @@ lp_build_max_simple(struct lp_build_context *bld,
/* TODO: optimize the constant case */
- if(type.width * type.length == 128) {
- if(type.floating) {
- if(type.width == 32 && util_cpu_caps.has_sse)
+ if (type.floating && util_cpu_caps.has_sse) {
+ if (type.width == 32) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse.max.ss";
+ intr_size = 128;
+ }
+ else if (type.length <= 4 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse.max.ps";
- if(type.width == 64 && util_cpu_caps.has_sse2)
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.ps.256";
+ intr_size = 256;
+ }
+ }
+ if (type.width == 64 && util_cpu_caps.has_sse2) {
+ if (type.length == 1) {
+ intrinsic = "llvm.x86.sse2.max.sd";
+ intr_size = 128;
+ }
+ else if (type.length == 2 || !util_cpu_caps.has_avx) {
intrinsic = "llvm.x86.sse2.max.pd";
+ intr_size = 128;
+ }
+ else {
+ intrinsic = "llvm.x86.avx.max.pd.256";
+ intr_size = 256;
+ }
}
- else {
- if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxu.b";
- if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ else if (util_cpu_caps.has_sse2 && type.length >= 2) {
+ intr_size = 128;
+ if ((type.width == 8 || type.width == 16) &&
+ (type.width * type.length <= 64) &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+ debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
+ __FUNCTION__);
+ }
+ if (type.width == 8 && !type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxu.b";
+ intr_size = 128;
+ }
+ else if (type.width == 16 && type.sign) {
+ intrinsic = "llvm.x86.sse2.pmaxs.w";
+ }
+ if (util_cpu_caps.has_sse4_1) {
+ if (type.width == 8 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsb";
- if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 16 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxuw";
- if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
- intrinsic = "llvm.x86.sse2.pmaxs.w";
- if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && !type.sign) {
intrinsic = "llvm.x86.sse41.pmaxud";
- if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+ }
+ if (type.width == 32 && type.sign) {
intrinsic = "llvm.x86.sse41.pmaxsd";
+ }
}
}
- if(intrinsic)
- return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
+ if(intrinsic) {
+ return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
+ type,
+ intr_size, a, b);
+ }
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
return lp_build_select(bld, cond, a, b);
@@ -265,15 +346,20 @@ lp_build_add(struct lp_build_context *bld,
}
-/** Return the scalar sum of the elements of a */
+/** Return the scalar sum of the elements of a.
+ * Should avoid this operation whenever possible.
+ */
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMValueRef index, res;
- unsigned i;
+ unsigned i, length;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
+ LLVMValueRef vecres, elem2;
assert(lp_check_value(type, a));
@@ -283,26 +369,191 @@ lp_build_sum_vector(struct lp_build_context *bld,
assert(!bld->type.norm);
- index = lp_build_const_int32(bld->gallivm, 0);
- res = LLVMBuildExtractElement(builder, a, index, "");
+ /*
+ * for byte vectors can do much better with psadbw.
+ * Using repeated shuffle/adds here. Note with multiple vectors
+ * this can be done more efficiently as outlined in the intel
+ * optimization manual.
+ * Note: could cause data rearrangement if used with smaller element
+ * sizes.
+ */
- for (i = 1; i < type.length; i++) {
- index = lp_build_const_int32(bld->gallivm, i);
- if (type.floating)
- res = LLVMBuildFAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
- else
- res = LLVMBuildAdd(builder, res,
- LLVMBuildExtractElement(builder,
- a, index, ""),
- "");
+ vecres = a;
+ length = type.length / 2;
+ while (length > 1) {
+ LLVMValueRef vec1, vec2;
+ for (i = 0; i < length; i++) {
+ shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
+ shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
+ LLVMConstVector(shuffles2, length), "");
+ if (type.floating) {
+ vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
+ }
+ else {
+ vecres = LLVMBuildAdd(builder, vec1, vec2, "");
+ }
+ length = length >> 1;
}
+ /* always have vector of size 2 here */
+ assert(length == 1);
+
+ index = lp_build_const_int32(bld->gallivm, 0);
+ res = LLVMBuildExtractElement(builder, vecres, index, "");
+ index = lp_build_const_int32(bld->gallivm, 1);
+ elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
+
+ if (type.floating)
+ res = LLVMBuildFAdd(builder, res, elem2, "");
+ else
+ res = LLVMBuildAdd(builder, res, elem2, "");
+
return res;
}
+/**
+ * Return the horizontal sums of 4 float vectors as a float4 vector.
+ * This uses the technique as outlined in Intel Optimization Manual.
+ */
+static LLVMValueRef
+lp_build_horizontal_add4x4f(struct lp_build_context *bld,
+ LLVMValueRef src[4])
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef tmp[4];
+ LLVMValueRef sumtmp[2], shuftmp[2];
+
+ /* lower half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 5);
+ tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ /* upper half of regs */
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 6);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
+ LLVMConstVector(shuffles, 4), "");
+ tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
+ LLVMConstVector(shuffles, 4), "");
+
+ sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
+ sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 2);
+ shuffles[2] = lp_build_const_int32(gallivm, 4);
+ shuffles[3] = lp_build_const_int32(gallivm, 6);
+ shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ shuffles[2] = lp_build_const_int32(gallivm, 5);
+ shuffles[3] = lp_build_const_int32(gallivm, 7);
+ shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
+ LLVMConstVector(shuffles, 4), "");
+
+ return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
+}
+
+
+/*
+ * partially horizontally add 2-4 float vectors with length nx4,
+ * i.e. only four adjacent values in each vector will be added,
+ * assuming values are really grouped in 4 which also determines
+ * output order.
+ *
+ * Return a vector of the same length as the initial vectors,
+ * with the excess elements (if any) being undefined.
+ * The element order is independent of number of input vectors.
+ * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
+ * the output order thus will be
+ * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
+ */
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef ret_vec;
+ LLVMValueRef tmp[4];
+ const char *intrinsic = NULL;
+
+ assert(num_vecs >= 2 && num_vecs <= 4);
+ assert(bld->type.floating);
+
+ /* only use this with at least 2 vectors, as it is sort of expensive
+ * (depending on cpu) and we always need two horizontal adds anyway,
+ * so a shuffle/add approach might be better.
+ */
+
+ tmp[0] = vectors[0];
+ tmp[1] = vectors[1];
+
+ tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
+ tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
+
+ if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
+ bld->type.length == 4) {
+ intrinsic = "llvm.x86.sse3.hadd.ps";
+ }
+ else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
+ bld->type.length == 8) {
+ intrinsic = "llvm.x86.avx.hadd.ps.256";
+ }
+ if (intrinsic) {
+ tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ if (num_vecs > 2) {
+ tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[2], tmp[3]);
+ }
+ else {
+ tmp[1] = tmp[0];
+ }
+ return lp_build_intrinsic_binary(builder, intrinsic,
+ lp_build_vec_type(gallivm, bld->type),
+ tmp[0], tmp[1]);
+ }
+
+ if (bld->type.length == 4) {
+ ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
+ }
+ else {
+ LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
+ unsigned j;
+ unsigned num_iter = bld->type.length / 4;
+ struct lp_type parttype = bld->type;
+ parttype.length = 4;
+ for (j = 0; j < num_iter; j++) {
+ LLVMValueRef partsrc[4];
+ unsigned i;
+ for (i = 0; i < 4; i++) {
+ partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
+ }
+ partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
+ }
+ ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
+ }
+ return ret_vec;
+}
/**
* Generate a - b
@@ -553,7 +804,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
if(bld->type.floating) {
#if 0
/*
- * Power of two multiplication by directly manipulating the mantissa.
+ * Power of two multiplication by directly manipulating the exponent.
*
* XXX: This might not be always faster, it will introduce a small error
* for multiplication by zero, and it will produce wrong results
@@ -612,7 +863,8 @@ lp_build_div(struct lp_build_context *bld,
return LLVMConstUDiv(a, b);
}
- if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
+ if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
type.floating)
return lp_build_mul(bld, a, lp_build_rcp(bld, b));
@@ -871,6 +1123,12 @@ lp_build_abs(struct lp_build_context *bld,
return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
}
}
+ else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
+ (gallivm_debug & GALLIVM_DEBUG_PERF) &&
+ (type.width == 8 || type.width == 16 || type.width == 32)) {
+ debug_printf("%s: inefficient code, should split vectors manually\n",
+ __FUNCTION__);
+ }
return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}
@@ -934,6 +1192,7 @@ lp_build_sgn(struct lp_build_context *bld,
else
{
/* signed int/norm/fixed point */
+ /* could use psign with sse3 and appropriate vectors here */
LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
res = lp_build_select(bld, cond, bld->one, minus_one);
@@ -1000,7 +1259,16 @@ lp_build_int_to_float(struct lp_build_context *bld,
return LLVMBuildSIToFP(builder, a, vec_type, "");
}
+static boolean
+sse41_rounding_available(const struct lp_type type)
+{
+ if ((util_cpu_caps.has_sse4_1 &&
+ (type.length == 1 || type.width*type.length == 128)) ||
+ (util_cpu_caps.has_avx && type.width*type.length == 256))
+ return TRUE;
+ return FALSE;
+}
enum lp_build_round_sse41_mode
{
@@ -1065,18 +1333,34 @@ lp_build_round_sse41(struct lp_build_context *bld,
res = LLVMBuildExtractElement(builder, res, index0, "");
}
else {
- assert(type.width*type.length == 128);
-
- switch(type.width) {
- case 32:
- intrinsic = "llvm.x86.sse41.round.ps";
- break;
- case 64:
- intrinsic = "llvm.x86.sse41.round.pd";
- break;
- default:
- assert(0);
- return bld->undef;
+ if (type.width * type.length == 128) {
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.sse41.round.ps";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.sse41.round.pd";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
+ }
+ else {
+ assert(type.width * type.length == 256);
+ assert(util_cpu_caps.has_avx);
+
+ switch(type.width) {
+ case 32:
+ intrinsic = "llvm.x86.avx.round.ps.256";
+ break;
+ case 64:
+ intrinsic = "llvm.x86.avx.round.pd.256";
+ break;
+ default:
+ assert(0);
+ return bld->undef;
+ }
}
res = lp_build_intrinsic_binary(builder, intrinsic,
@@ -1125,10 +1409,15 @@ lp_build_iround_nearest_sse2(struct lp_build_context *bld,
ret_type, arg);
}
else {
- assert(type.width*type.length == 128);
-
- intrinsic = "llvm.x86.sse2.cvtps2dq";
+ if (type.width* type.length == 128) {
+ intrinsic = "llvm.x86.sse2.cvtps2dq";
+ }
+ else {
+ assert(type.width*type.length == 256);
+ assert(util_cpu_caps.has_avx);
+ intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
+ }
res = lp_build_intrinsic_unary(builder, intrinsic,
ret_type, a);
}
@@ -1152,8 +1441,7 @@ lp_build_trunc(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
}
else {
@@ -1183,8 +1471,7 @@ lp_build_round(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1212,8 +1499,7 @@ lp_build_floor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
}
else {
@@ -1241,8 +1527,7 @@ lp_build_ceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1269,6 +1554,34 @@ lp_build_fract(struct lp_build_context *bld,
/**
+ * Prevent returning a fractional part of 1.0 for very small negative values of
+ * 'a' by clamping against 0.99999(9).
+ */
+static inline LLVMValueRef
+clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
+{
+ LLVMValueRef max;
+
+ /* this is the largest number smaller than 1.0 representable as float */
+ max = lp_build_const_vec(bld->gallivm, bld->type,
+ 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
+ return lp_build_min(bld, fract, max);
+}
+
+
+/**
+ * Same as lp_build_fract, but guarantees that the result is always smaller
+ * than one.
+ */
+LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a)
+{
+ return clamp_fract(bld, lp_build_fract(bld, a));
+}
+
+
+/**
* Return the integer part of a float (vector) value (== round toward zero).
* The returned value is an integer (vector).
* Ex: itrunc(-1.5) = -1
@@ -1307,12 +1620,12 @@ lp_build_iround(struct lp_build_context *bld,
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse2 &&
- ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+ if ((util_cpu_caps.has_sse2 &&
+ ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
return lp_build_iround_nearest_sse2(bld, a);
}
- else if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
}
else {
@@ -1362,14 +1675,12 @@ lp_build_ifloor(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
- res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
- }
- else {
- res = a;
-
- if (type.sign) {
+ res = a;
+ if (type.sign) {
+ if (sse41_rounding_available(type)) {
+ res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+ }
+ else {
/* Take the sign bit and add it to 1 constant */
LLVMTypeRef vec_type = bld->vec_type;
unsigned mantissa = lp_mantissa(type);
@@ -1423,8 +1734,7 @@ lp_build_iceil(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
}
else {
@@ -1470,7 +1780,7 @@ lp_build_iceil(struct lp_build_context *bld,
* Combined ifloor() & fract().
*
* Preferred to calling the functions separately, as it will ensure that the
- * stratergy (floor() vs ifloor()) that results in less redundant work is used.
+ * strategy (floor() vs ifloor()) that results in less redundant work is used.
*/
void
lp_build_ifloor_fract(struct lp_build_context *bld,
@@ -1485,8 +1795,7 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
assert(type.floating);
assert(lp_check_value(type, a));
- if (util_cpu_caps.has_sse4_1 &&
- (type.length == 1 || type.width*type.length == 128)) {
+ if (sse41_rounding_available(type)) {
/*
* floor() is easier.
*/
@@ -1507,6 +1816,21 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
}
+/**
+ * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
+ * always smaller than one.
+ */
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart)
+{
+ lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
+ *out_fpart = clamp_fract(bld, *out_fpart);
+}
+
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a)
@@ -1519,10 +1843,14 @@ lp_build_sqrt(struct lp_build_context *bld,
assert(lp_check_value(type, a));
/* TODO: optimize the constant case */
- /* TODO: optimize the constant case */
assert(type.floating);
- util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ if (type.length == 1) {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
+ }
+ else {
+ util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
+ }
return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}
@@ -1586,19 +1914,28 @@ lp_build_rcp(struct lp_build_context *bld,
* - it doesn't even get the reciprocate of 1.0 exactly
* - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
* - for recent processors the benefit over DIVPS is marginal, a case
- * depedent
+ * dependent
*
* We could still use it on certain processors if benchmarks show that the
* RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
* particular uses that require less workarounds.
*/
- if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
const unsigned num_iterations = 0;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rcp.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rcp.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rcp_refine(bld, a, res);
@@ -1653,12 +1990,22 @@ lp_build_rsqrt(struct lp_build_context *bld,
assert(type.floating);
- if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
+ (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
const unsigned num_iterations = 1;
LLVMValueRef res;
unsigned i;
+ const char *intrinsic = NULL;
+
+ if (type.length == 4) {
+ intrinsic = "llvm.x86.sse.rsqrt.ps";
+ }
+ else {
+ intrinsic = "llvm.x86.avx.rsqrt.ps.256";
+ }
+
+ res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
- res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
for (i = 0; i < num_iterations; ++i) {
res = lp_build_rsqrt_refine(bld, a, res);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index aeb987ff352..60b9907e60f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -57,8 +57,13 @@ lp_build_add(struct lp_build_context *bld,
LLVMValueRef b);
LLVMValueRef
-lp_build_sum_vector(struct lp_build_context *bld,
- LLVMValueRef a);
+lp_build_horizontal_add(struct lp_build_context *bld,
+ LLVMValueRef a);
+
+LLVMValueRef
+lp_build_hadd_partial4(struct lp_build_context *bld,
+ LLVMValueRef vectors[],
+ unsigned num_vecs);
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
@@ -157,6 +162,10 @@ lp_build_fract(struct lp_build_context *bld,
LLVMValueRef a);
LLVMValueRef
+lp_build_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a);
+
+LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
LLVMValueRef a);
LLVMValueRef
@@ -177,6 +186,12 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
LLVMValueRef *out_ipart,
LLVMValueRef *out_fpart);
+void
+lp_build_ifloor_fract_safe(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef *out_ipart,
+ LLVMValueRef *out_fpart);
+
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
LLVMValueRef a);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index 59e8fb2ed6e..35799a1ef8e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -37,6 +37,7 @@
#include "util/u_debug.h"
#include "util/u_math.h"
+#include "util/u_half.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -50,10 +51,12 @@ lp_mantissa(struct lp_type type)
if(type.floating) {
switch(type.width) {
+ case 16:
+ return 10;
case 32:
return 23;
case 64:
- return 53;
+ return 52;
default:
assert(0);
return 0;
@@ -136,6 +139,8 @@ lp_const_min(struct lp_type type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ return -65504;
case 32:
return -FLT_MAX;
case 64:
@@ -169,6 +174,8 @@ lp_const_max(struct lp_type type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ return 65504;
case 32:
return FLT_MAX;
case 64:
@@ -196,6 +203,8 @@ lp_const_eps(struct lp_type type)
{
if (type.floating) {
switch(type.width) {
+ case 16:
+ return 2E-10;
case 32:
return FLT_EPSILON;
case 64:
@@ -247,7 +256,9 @@ lp_build_one(struct gallivm_state *gallivm, struct lp_type type)
elem_type = lp_build_elem_type(gallivm, type);
- if(type.floating)
+ if(type.floating && type.width == 16)
+ elems[0] = LLVMConstInt(elem_type, util_float_to_half(1.0f), 0);
+ else if(type.floating)
elems[0] = LLVMConstReal(elem_type, 1.0);
else if(type.fixed)
elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0);
@@ -292,7 +303,9 @@ lp_build_const_elem(struct gallivm_state *gallivm,
LLVMTypeRef elem_type = lp_build_elem_type(gallivm, type);
LLVMValueRef elem;
- if(type.floating) {
+ if(type.floating && type.width == 16) {
+ elem = LLVMConstInt(elem_type, util_float_to_half((float)val), 0);
+ } else if(type.floating) {
elem = LLVMConstReal(elem_type, val);
}
else {
@@ -364,20 +377,10 @@ lp_build_const_aos(struct gallivm_state *gallivm,
if(swizzle == NULL)
swizzle = default_swizzle;
- if(type.floating) {
- elems[swizzle[0]] = LLVMConstReal(elem_type, r);
- elems[swizzle[1]] = LLVMConstReal(elem_type, g);
- elems[swizzle[2]] = LLVMConstReal(elem_type, b);
- elems[swizzle[3]] = LLVMConstReal(elem_type, a);
- }
- else {
- double dscale = lp_const_scale(type);
-
- elems[swizzle[0]] = LLVMConstInt(elem_type, round(r*dscale), 0);
- elems[swizzle[1]] = LLVMConstInt(elem_type, round(g*dscale), 0);
- elems[swizzle[2]] = LLVMConstInt(elem_type, round(b*dscale), 0);
- elems[swizzle[3]] = LLVMConstInt(elem_type, round(a*dscale), 0);
- }
+ elems[swizzle[0]] = lp_build_const_elem(gallivm, type, r);
+ elems[swizzle[1]] = lp_build_const_elem(gallivm, type, g);
+ elems[swizzle[2]] = lp_build_const_elem(gallivm, type, b);
+ elems[swizzle[3]] = lp_build_const_elem(gallivm, type, a);
for(i = 4; i < type.length; ++i)
elems[i] = elems[i % 4];
@@ -452,7 +455,7 @@ lp_build_const_string(struct gallivm_state *gallivm,
/**
* Build a callable function pointer.
*
- * We this casts instead of LLVMAddGlobalMapping()
+ * We use function pointer constants instead of LLVMAddGlobalMapping()
* to work around a bug in LLVM 2.6, and for efficiency/simplicity.
*/
LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 0973e1f16f3..0399709faad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -70,6 +70,66 @@
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
+#include "lp_bld_logic.h"
+
+
+/**
+ * Converts int16 half-float to float32
+ * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?)
+ * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
+ *
+ * @param src_type <vector> type of int16
+ * @param src value to convert
+ *
+ * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+ */
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef src)
+{
+ struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
+ struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);
+
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
+ LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
+
+ /* Constants */
+ LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
+ LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
+ LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
+ LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
+ LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
+ LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
+ lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
+ float_vec_type, "");
+
+ /* Convert int16 vector to int32 vector by zero ext */
+ LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");
+
+ /* Exponent / mantissa bits */
+ LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
+ LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");
+
+ /* Exponent adjust */
+ LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");
+
+ /* Make sure Inf/NaN survive */
+ LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
+ LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");
+
+ /* Sign bit */
+ LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
+ LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");
+
+ /* Combine result */
+ LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
+ LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");
+
+ /* Cast from int32 vector to float32 vector */
+ return LLVMBuildBitCast(builder, final, float_vec_type, "");
+}
/**
@@ -334,6 +394,8 @@ lp_build_conv(struct gallivm_state *gallivm,
dst_type.width == 8 &&
dst_type.length == 16 &&
+ 4 * num_dsts == num_srcs &&
+
util_cpu_caps.has_sse2)
{
struct lp_build_context bld;
@@ -371,6 +433,76 @@ lp_build_conv(struct gallivm_state *gallivm,
return;
}
+ /* Special case 2x8f --> 1x16ub
+ */
+ else if (src_type.floating == 1 &&
+ src_type.fixed == 0 &&
+ src_type.sign == 1 &&
+ src_type.norm == 0 &&
+ src_type.width == 32 &&
+ src_type.length == 8 &&
+
+ dst_type.floating == 0 &&
+ dst_type.fixed == 0 &&
+ dst_type.sign == 0 &&
+ dst_type.norm == 1 &&
+ dst_type.width == 8 &&
+ dst_type.length == 16 &&
+
+ 2 * num_dsts == num_srcs &&
+
+ util_cpu_caps.has_avx) {
+
+ struct lp_build_context bld;
+ struct lp_type int16_type = dst_type;
+ struct lp_type int32_type = dst_type;
+ LLVMValueRef const_255f;
+ unsigned i;
+
+ lp_build_context_init(&bld, gallivm, src_type);
+
+ int16_type.width *= 2;
+ int16_type.length /= 2;
+ int16_type.sign = 1;
+
+ int32_type.width *= 4;
+ int32_type.length /= 4;
+ int32_type.sign = 1;
+
+ const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
+
+ for (i = 0; i < num_dsts; ++i, src += 2) {
+ LLVMValueRef lo, hi, a, b;
+
+ a = LLVMBuildFMul(builder, src[0], const_255f, "");
+ b = LLVMBuildFMul(builder, src[1], const_255f, "");
+
+ a = lp_build_iround(&bld, a);
+ b = lp_build_iround(&bld, b);
+
+ tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
+ tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
+ tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
+ tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
+
+ /* relying on clamping behavior of sse2 intrinsics here */
+ lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
+ hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+ dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+ }
+ return;
+ }
+
+ /* Pre convert half-floats to floats
+ */
+ else if (src_type.floating && src_type.width == 16)
+ {
+ for(i = 0; i < num_tmps; ++i)
+ tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);
+
+ tmp_type.width = 32;
+ }
+
/*
* Clamp if necessary
*/
@@ -580,7 +712,7 @@ lp_build_conv(struct gallivm_state *gallivm,
* This will convert the integer masks that match the given types.
*
* The mask values should 0 or -1, i.e., all bits either set to zero or one.
- * Any other value will likely cause in unpredictable results.
+ * Any other value will likely cause unpredictable results.
*
* This is basically a very trimmed down version of lp_build_conv.
*/
@@ -591,8 +723,6 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
const LLVMValueRef *src, unsigned num_srcs,
LLVMValueRef *dst, unsigned num_dsts)
{
- /* Register width must remain constant */
- assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
/* We must not loose or gain channels. Only precision */
assert(src_type.length * num_srcs == dst_type.length * num_dsts);
@@ -617,16 +747,5 @@ lp_build_conv_mask(struct gallivm_state *gallivm,
* Truncate or expand bit width
*/
- if(src_type.width > dst_type.width) {
- assert(num_dsts == 1);
- dst[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
- }
- else if(src_type.width < dst_type.width) {
- assert(num_srcs == 1);
- lp_build_unpack(gallivm, src_type, dst_type, src[0], dst, num_dsts);
- }
- else {
- assert(num_srcs == num_dsts);
- memcpy(dst, src, num_dsts * sizeof *dst);
- }
+ lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
index cec655980fa..c830fbef5f2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -42,6 +42,10 @@
struct lp_type;
+LLVMValueRef
+lp_build_half_to_float(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef src);
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index 444b70a678c..93505f3da45 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -35,10 +35,8 @@
#if HAVE_LLVM >= 0x0300
#include <llvm/Support/TargetRegistry.h>
-#include <llvm/Support/TargetSelect.h>
#else /* HAVE_LLVM < 0x0300 */
#include <llvm/Target/TargetRegistry.h>
-#include <llvm/Target/TargetSelect.h>
#endif /* HAVE_LLVM < 0x0300 */
#if HAVE_LLVM >= 0x0209
@@ -183,7 +181,7 @@ lp_disassemble(const void* func)
/*
* Limit disassembly to this extent
*/
- const uint64_t extent = 0x10000;
+ const uint64_t extent = 96 * 1024;
uint64_t max_pc = 0;
@@ -200,24 +198,6 @@ lp_disassemble(const void* func)
std::string Error;
const Target *T = TargetRegistry::lookupTarget(Triple, Error);
-#if HAVE_LLVM >= 0x0208
- InitializeNativeTargetAsmPrinter();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
- LLVMInitializeX86AsmPrinter();
-#elif defined(PIPE_ARCH_ARM)
- LLVMInitializeARMAsmPrinter();
-#elif defined(PIPE_ARCH_PPC)
- LLVMInitializePowerPCAsmPrinter();
-#endif
-
-#if HAVE_LLVM >= 0x0301
- InitializeNativeTargetDisassembler();
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
- LLVMInitializeX86Disassembler();
-#elif defined(PIPE_ARCH_ARM)
- LLVMInitializeARMDisassembler();
-#endif
-
#if HAVE_LLVM >= 0x0300
OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(Triple));
#else
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index d2b3713ed2d..30da44e5b9c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -131,6 +131,15 @@ lp_build_mask_check(struct lp_build_mask_context *mask)
value = lp_build_mask_value(mask);
+ /*
+ * XXX this doesn't quite generate the most efficient code possible, if
+ * the masks are vectors which have all bits set to the same value
+ * in each element.
+ * movmskps/pmovmskb would be more efficient to get the required value
+ * into ordinary reg (certainly with 8 floats).
+ * Not sure if llvm could figure that out on its own.
+ */
+
/* cond = (mask == 0) */
cond = LLVMBuildICmp(builder,
LLVMIntEQ,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 04142d905b1..3608a68202f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -67,6 +67,13 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
LLVMValueRef i,
LLVMValueRef j);
+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset);
+
/*
* SoA
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index e4b8da6bcfd..9591bcfb2c7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -470,6 +470,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return lp_build_format_swizzle_aos(format_desc, &bld, res);
}
+ /* If all channels are of same type and we are not using half-floats */
+ if (util_format_is_array(format_desc)) {
+ return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
+ }
+
/*
* YUV / subsampled formats
*/
@@ -601,7 +606,6 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
return res;
}
-
/*
* Fallback to util_format_description::fetch_rgba_float().
*/
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
new file mode 100644
index 00000000000..b8ec379d76f
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -0,0 +1,102 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_bld_const.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_format.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_pack.h"
+
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "pipe/p_state.h"
+
+/**
+ * @brief lp_build_fetch_rgba_aos_array
+ *
+ * \param format_desc describes format of the image we're fetching from
+ * \param dst_type output type
+ * \param base_ptr address of the pixel block (or the texel if uncompressed)
+ * \param offset ptr offset
+ */
+LLVMValueRef
+lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type dst_type,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset)
+{
+ struct lp_build_context bld;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef src_elem_type, src_vec_type;
+ LLVMValueRef ptr, res = NULL;
+ struct lp_type src_type;
+
+ memset(&src_type, 0, sizeof src_type);
+ src_type.floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+ src_type.fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+ src_type.sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+ src_type.norm = format_desc->channel[0].normalized;
+ src_type.width = format_desc->channel[0].size;
+ src_type.length = format_desc->nr_channels;
+
+ assert(src_type.length <= dst_type.length);
+
+ src_elem_type = lp_build_elem_type(gallivm, src_type);
+ src_vec_type = lp_build_vec_type(gallivm, src_type);
+
+ /* Read whole vector from memory, unaligned */
+ if (!res) {
+ ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
+ ptr = LLVMBuildPointerCast(builder, ptr, LLVMPointerType(src_vec_type, 0), "");
+ res = LLVMBuildLoad(builder, ptr, "");
+ lp_set_load_alignment(res, src_type.width / 8);
+ }
+
+ /* Truncate doubles to float */
+ if (src_type.floating && src_type.width == 64) {
+ src_type.width = 32;
+ src_vec_type = lp_build_vec_type(gallivm, src_type);
+
+ res = LLVMBuildFPTrunc(builder, res, src_vec_type, "");
+ }
+
+ /* Expand to correct length */
+ if (src_type.length < dst_type.length) {
+ res = lp_build_pad_vector(gallivm, res, src_type, dst_type.length);
+ src_type.length = dst_type.length;
+ }
+
+ /* Convert to correct format */
+ lp_build_conv(gallivm, src_type, dst_type, &res, 1, &res, 1);
+
+ /* Swizzle it */
+ lp_build_context_init(&bld, gallivm, dst_type);
+ return lp_build_format_swizzle_aos(format_desc, &bld, res);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 0a57b3ce794..afeb34079bf 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -359,7 +359,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
*/
if (util_format_fits_8unorm(format_desc) &&
- type.floating && type.width == 32 && type.length == 4) {
+ type.floating && type.width == 32 &&
+ (type.length == 1 || (type.length % 4 == 0))) {
struct lp_type tmp_type;
LLVMValueRef tmp;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
index ccc83207004..f77eb1212b1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -84,7 +84,7 @@ uyvy_to_yuv_soa(struct gallivm_state *gallivm,
* per element. Didn't measure performance but cuts shader size
* by quite a bit (less difference if cpu has no sse4.1 support).
*/
- if (util_cpu_caps.has_sse2 && n == 4) {
+ if (util_cpu_caps.has_sse2 && n > 1) {
LLVMValueRef sel, tmp, tmp2;
struct lp_build_context bld32;
@@ -152,7 +152,7 @@ yuyv_to_yuv_soa(struct gallivm_state *gallivm,
* per element. Didn't measure performance but cuts shader size
* by quite a bit (less difference if cpu has no sse4.1 support).
*/
- if (util_cpu_caps.has_sse2 && n == 4) {
+ if (util_cpu_caps.has_sse2 && n > 1) {
LLVMValueRef sel, tmp;
struct lp_build_context bld32;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 768d935dae5..5bf4bcfab3b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -26,15 +26,44 @@
**************************************************************************/
+#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_cpu_detect.h"
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "util/u_simple_list.h"
+#include "lp_bld.h"
#include "lp_bld_debug.h"
+#include "lp_bld_misc.h"
#include "lp_bld_init.h"
+#include <llvm-c/Analysis.h>
#include <llvm-c/Transforms/Scalar.h>
+#include <llvm-c/BitWriter.h>
+
+
+/**
+ * AVX is supported in:
+ * - standard JIT from LLVM 3.2 onwards
+ * - MC-JIT from LLVM 3.1
+ * - MC-JIT supports limited OSes (MacOSX and Linux)
+ * - standard JIT in LLVM 3.1, with backports
+ */
+#if HAVE_LLVM >= 0x0301 && (defined(PIPE_OS_LINUX) || defined(PIPE_OS_APPLE))
+# define USE_MCJIT 1
+# define HAVE_AVX 1
+#elif HAVE_LLVM >= 0x0302 || (HAVE_LLVM == 0x0301 && defined(HAVE_JIT_AVX_SUPPORT))
+# define USE_MCJIT 0
+# define HAVE_AVX 1
+#else
+# define USE_MCJIT 0
+# define HAVE_AVX 0
+#endif
+
+
+#if USE_MCJIT
+void LLVMLinkInMCJIT();
+#endif
#ifdef DEBUG
@@ -57,6 +86,8 @@ DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags,
static boolean gallivm_initialized = FALSE;
+unsigned lp_native_vector_width;
+
/*
* Optimization values are:
@@ -81,25 +112,13 @@ enum LLVM_CodeGenOpt_Level {
};
+#if HAVE_LLVM <= 0x0206
/**
- * LLVM 2.6 permits only one ExecutionEngine to be created. This is it.
- */
-static LLVMExecutionEngineRef GlobalEngine = NULL;
-
-/**
- * Same gallivm state shared by all contexts.
+ * LLVM 2.6 permits only one ExecutionEngine to be created. So use the
+ * same gallivm state everywhere.
*/
static struct gallivm_state *GlobalGallivm = NULL;
-
-
-
-
-extern void
-lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
-
-extern void
-lp_set_target_options(void);
-
+#endif
/**
@@ -111,6 +130,7 @@ static boolean
create_pass_manager(struct gallivm_state *gallivm)
{
assert(!gallivm->passmgr);
+ assert(gallivm->target);
gallivm->passmgr = LLVMCreateFunctionPassManager(gallivm->provider);
if (!gallivm->passmgr)
@@ -174,33 +194,37 @@ free_gallivm_state(struct gallivm_state *gallivm)
&mod, &error);
#endif
+ if (gallivm->passmgr) {
+ LLVMDisposePassManager(gallivm->passmgr);
+ }
+
#if 0
/* XXX this seems to crash with all versions of LLVM */
if (gallivm->provider)
LLVMDisposeModuleProvider(gallivm->provider);
#endif
- if (gallivm->passmgr)
- LLVMDisposePassManager(gallivm->passmgr);
-
-#if HAVE_LLVM >= 0x207
- if (gallivm->module)
- LLVMDisposeModule(gallivm->module);
-#endif
-
-#if 0
- /* Don't free the exec engine, it's a global/singleton */
- if (gallivm->engine)
+ if (HAVE_LLVM >= 0x207 && gallivm->engine) {
+ /* This will already destroy any associated module */
LLVMDisposeExecutionEngine(gallivm->engine);
-#endif
+ } else {
+ LLVMDisposeModule(gallivm->module);
+ }
-#if 0
+#if !USE_MCJIT
/* Don't free the TargetData, it's owned by the exec engine */
- LLVMDisposeTargetData(gallivm->target);
+#else
+ if (gallivm->target) {
+ LLVMDisposeTargetData(gallivm->target);
+ }
#endif
+ /* Never free the LLVM context.
+ */
+#if 0
if (gallivm->context)
LLVMContextDispose(gallivm->context);
+#endif
if (gallivm->builder)
LLVMDisposeBuilder(gallivm->builder);
@@ -215,37 +239,14 @@ free_gallivm_state(struct gallivm_state *gallivm)
}
-/**
- * Allocate gallivm LLVM objects.
- * \return TRUE for success, FALSE for failure
- */
static boolean
-init_gallivm_state(struct gallivm_state *gallivm)
+init_gallivm_engine(struct gallivm_state *gallivm)
{
- assert(!gallivm->context);
- assert(!gallivm->module);
- assert(!gallivm->provider);
-
- lp_build_init();
-
- gallivm->context = LLVMContextCreate();
- if (!gallivm->context)
- goto fail;
-
- gallivm->module = LLVMModuleCreateWithNameInContext("gallivm",
- gallivm->context);
- if (!gallivm->module)
- goto fail;
-
- gallivm->provider =
- LLVMCreateModuleProviderForExistingModule(gallivm->module);
- if (!gallivm->provider)
- goto fail;
-
- if (!GlobalEngine) {
+ if (1) {
/* We can only create one LLVMExecutionEngine (w/ LLVM 2.6 anyway) */
enum LLVM_CodeGenOpt_Level optlevel;
char *error = NULL;
+ int ret;
if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) {
optlevel = None;
@@ -254,135 +255,162 @@ init_gallivm_state(struct gallivm_state *gallivm)
optlevel = Default;
}
- if (LLVMCreateJITCompiler(&GlobalEngine, gallivm->provider,
- (unsigned) optlevel, &error)) {
+#if USE_MCJIT
+ ret = lp_build_create_mcjit_compiler_for_module(&gallivm->engine,
+ gallivm->module,
+ (unsigned) optlevel,
+ &error);
+#else
+ ret = LLVMCreateJITCompiler(&gallivm->engine, gallivm->provider,
+ (unsigned) optlevel, &error);
+#endif
+ if (ret) {
_debug_printf("%s\n", error);
LLVMDisposeMessage(error);
goto fail;
}
#if defined(DEBUG) || defined(PROFILE)
- lp_register_oprofile_jit_event_listener(GlobalEngine);
+ lp_register_oprofile_jit_event_listener(gallivm->engine);
#endif
}
- gallivm->engine = GlobalEngine;
-
LLVMAddModuleProvider(gallivm->engine, gallivm->provider);//new
+#if !USE_MCJIT
gallivm->target = LLVMGetExecutionEngineTargetData(gallivm->engine);
if (!gallivm->target)
goto fail;
+#else
+ if (0) {
+ /*
+ * Dump the data layout strings.
+ */
- if (!create_pass_manager(gallivm))
- goto fail;
+ LLVMTargetDataRef target = LLVMGetExecutionEngineTargetData(gallivm->engine);
+ char *data_layout;
+ char *engine_data_layout;
- gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
- if (!gallivm->builder)
- goto fail;
+ data_layout = LLVMCopyStringRepOfTargetData(gallivm->target);
+ engine_data_layout = LLVMCopyStringRepOfTargetData(target);
+
+ if (1) {
+ debug_printf("module target data = %s\n", data_layout);
+ debug_printf("engine target data = %s\n", engine_data_layout);
+ }
+
+ free(data_layout);
+ free(engine_data_layout);
+ }
+#endif
return TRUE;
fail:
- free_gallivm_state(gallivm);
return FALSE;
}
-struct callback
-{
- garbage_collect_callback_func func;
- void *cb_data;
- struct callback *prev, *next;
-};
-
-
-/** list of all garbage collector callbacks */
-static struct callback callback_list = {NULL, NULL, NULL, NULL};
+/**
+ * Singleton
+ *
+ * We must never free LLVM contexts, because LLVM has several global caches
+ * which point to or are derived from objects owned by the context, causing false
+ * memory leaks and false cache hits when these objects are destroyed.
+ *
+ * TODO: For thread safety on multi-threaded OpenGL we should use one LLVM
+ * context per thread, and put them in a pool when threads are destroyed.
+ */
+static LLVMContextRef gallivm_context = NULL;
/**
- * Register a function with gallivm which will be called when we
- * do garbage collection.
+ * Allocate gallivm LLVM objects.
+ * \return TRUE for success, FALSE for failure
*/
-void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data)
+static boolean
+init_gallivm_state(struct gallivm_state *gallivm)
{
- struct callback *cb;
-
- if (!callback_list.prev) {
- make_empty_list(&callback_list);
- }
+ assert(!gallivm->context);
+ assert(!gallivm->module);
+ assert(!gallivm->provider);
- /* see if already in list */
- foreach(cb, &callback_list) {
- if (cb->func == func && cb->cb_data == cb_data)
- return;
- }
+ lp_build_init();
- /* add to list */
- cb = CALLOC_STRUCT(callback);
- if (cb) {
- cb->func = func;
- cb->cb_data = cb_data;
- insert_at_head(&callback_list, cb);
+ if (!gallivm_context) {
+ gallivm_context = LLVMContextCreate();
}
-}
+ gallivm->context = gallivm_context;
+ if (!gallivm->context)
+ goto fail;
+ gallivm->module = LLVMModuleCreateWithNameInContext("gallivm",
+ gallivm->context);
+ if (!gallivm->module)
+ goto fail;
-/**
- * Remove a callback.
- */
-void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data)
-{
- struct callback *cb;
-
- /* search list */
- foreach(cb, &callback_list) {
- if (cb->func == func && cb->cb_data == cb_data) {
- /* found, remove it */
- remove_from_list(cb);
- FREE(cb);
- return;
- }
- }
-}
+ gallivm->provider =
+ LLVMCreateModuleProviderForExistingModule(gallivm->module);
+ if (!gallivm->provider)
+ goto fail;
+ gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
+ if (!gallivm->builder)
+ goto fail;
-/**
- * Call the callback functions (which are typically in the
- * draw module and llvmpipe driver.
- */
-static void
-call_garbage_collector_callbacks(void)
-{
- struct callback *cb;
- foreach(cb, &callback_list) {
- cb->func(cb->cb_data);
+ /* FIXME: MC-JIT only allows compiling one module at a time, and it must be
+ * complete when MC-JIT is created. So defer the MC-JIT engine creation for
+ * now.
+ */
+#if !USE_MCJIT
+ if (!init_gallivm_engine(gallivm)) {
+ goto fail;
}
-}
+#else
+ /*
+ * MC-JIT engine compiles the module immediately on creation, so we can't
+ * obtain the target data from it. Instead we create a target data layout
+ * from a string.
+ *
+ * The produced layout strings are not precisely the same, but should make
+ * no difference for the kind of optimization passes we run.
+ *
+ * For reference this is the layout string on x64:
+ *
+ * e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64
+ *
+ * See also:
+ * - http://llvm.org/docs/LangRef.html#datalayout
+ */
+
+ {
+ const unsigned pointer_size = 8 * sizeof(void *);
+ char layout[512];
+ util_snprintf(layout, sizeof layout, "%c-p:%u:%u:%u-i64:64:64-a0:0:%u-s0:%u:%u",
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ 'e', // little endian
+#else
+ 'E', // big endian
+#endif
+ pointer_size, pointer_size, pointer_size, // pointer size, abi alignment, preferred alignment
+ pointer_size, // aggregate preferred alignment
+ pointer_size, pointer_size); // stack objects abi alignment, preferred alignment
+ gallivm->target = LLVMCreateTargetData(layout);
+ if (!gallivm->target) {
+ return FALSE;
+ }
+ }
+#endif
+ if (!create_pass_manager(gallivm))
+ goto fail;
-/**
- * Other gallium components using gallivm should call this periodically
- * to let us do garbage collection (or at least try to free memory
- * accumulated by the LLVM libraries).
- */
-void
-gallivm_garbage_collect(struct gallivm_state *gallivm)
-{
- if (gallivm->context) {
- if (gallivm_debug & GALLIVM_DEBUG_GC)
- debug_printf("***** Doing LLVM garbage collection\n");
+ return TRUE;
- call_garbage_collector_callbacks();
- free_gallivm_state(gallivm);
- init_gallivm_state(gallivm);
- }
+fail:
+ free_gallivm_state(gallivm);
+ return FALSE;
}
@@ -398,12 +426,27 @@ lp_build_init(void)
lp_set_target_options();
- LLVMInitializeNativeTarget();
-
+#if USE_MCJIT
+ LLVMLinkInMCJIT();
+#else
LLVMLinkInJIT();
+#endif
util_cpu_detect();
+
+ if (HAVE_AVX &&
+ util_cpu_caps.has_avx) {
+ lp_native_vector_width = 256;
+ } else {
+ /* Leave it at 128, even when no SIMD extensions are available.
+ * Really needs to be a multiple of 128 so can fit 4 floats.
+ */
+ lp_native_vector_width = 128;
+ }
+ lp_native_vector_width = debug_get_num_option("LP_NATIVE_VECTOR_WIDTH",
+ lp_native_vector_width);
+
gallivm_initialized = TRUE;
#if 0
@@ -423,16 +466,27 @@ lp_build_init(void)
struct gallivm_state *
gallivm_create(void)
{
- if (!GlobalGallivm) {
- GlobalGallivm = CALLOC_STRUCT(gallivm_state);
- if (GlobalGallivm) {
- if (!init_gallivm_state(GlobalGallivm)) {
- FREE(GlobalGallivm);
- GlobalGallivm = NULL;
- }
+ struct gallivm_state *gallivm;
+
+#if HAVE_LLVM <= 0x206
+ if (GlobalGallivm) {
+ return GlobalGallivm;
+ }
+#endif
+
+ gallivm = CALLOC_STRUCT(gallivm_state);
+ if (gallivm) {
+ if (!init_gallivm_state(gallivm)) {
+ FREE(gallivm);
+ gallivm = NULL;
}
}
- return GlobalGallivm;
+
+#if HAVE_LLVM <= 0x206
+ GlobalGallivm = gallivm;
+#endif
+
+ return gallivm;
}
@@ -442,6 +496,132 @@ gallivm_create(void)
void
gallivm_destroy(struct gallivm_state *gallivm)
{
+#if HAVE_LLVM <= 0x0206
/* No-op: don't destroy the singleton */
(void) gallivm;
+#else
+ free_gallivm_state(gallivm);
+ FREE(gallivm);
+#endif
+}
+
+
+/**
+ * Validate and optimize a function.
+ */
+static void
+gallivm_optimize_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ if (0) {
+ debug_printf("optimizing %s...\n", LLVMGetValueName(func));
+ }
+
+ assert(gallivm->passmgr);
+
+ /* Apply optimizations to LLVM IR */
+ LLVMRunFunctionPassManager(gallivm->passmgr, func);
+
+ if (0) {
+ if (gallivm_debug & GALLIVM_DEBUG_IR) {
+ /* Print the LLVM IR to stderr */
+ lp_debug_dump_value(func);
+ debug_printf("\n");
+ }
+ }
+}
+
+
+/**
+ * Validate a function.
+ */
+void
+gallivm_verify_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ /* Verify the LLVM IR. If invalid, dump and abort */
+#ifdef DEBUG
+ if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) {
+ lp_debug_dump_value(func);
+ assert(0);
+ return;
+ }
+#endif
+
+ gallivm_optimize_function(gallivm, func);
+
+ if (gallivm_debug & GALLIVM_DEBUG_IR) {
+ /* Print the LLVM IR to stderr */
+ lp_debug_dump_value(func);
+ debug_printf("\n");
+ }
+}
+
+
+void
+gallivm_compile_module(struct gallivm_state *gallivm)
+{
+#if HAVE_LLVM > 0x206
+ assert(!gallivm->compiled);
+#endif
+
+ /* Dump byte code to a file */
+ if (0) {
+ LLVMWriteBitcodeToFile(gallivm->module, "llvmpipe.bc");
+ debug_printf("llvmpipe.bc written\n");
+ debug_printf("Invoke as \"llc -o - llvmpipe.bc\"\n");
+ }
+
+#if USE_MCJIT
+ assert(!gallivm->engine);
+ if (!init_gallivm_engine(gallivm)) {
+ assert(0);
+ }
+#endif
+ assert(gallivm->engine);
+
+ ++gallivm->compiled;
+}
+
+
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+ LLVMValueRef func)
+{
+ void *code;
+ func_pointer jit_func;
+
+ assert(gallivm->compiled);
+ assert(gallivm->engine);
+
+ code = LLVMGetPointerToGlobal(gallivm->engine, func);
+ assert(code);
+ jit_func = pointer_to_func(code);
+
+ if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+ lp_disassemble(code);
+ }
+
+ /* Free the function body to save memory */
+ lp_func_delete_body(func);
+
+ return jit_func;
+}
+
+
+/**
+ * Free the function (and its machine code).
+ */
+void
+gallivm_free_function(struct gallivm_state *gallivm,
+ LLVMValueRef func,
+ const void *code)
+{
+#if !USE_MCJIT
+ if (code) {
+ LLVMFreeMachineCodeForFunction(gallivm->engine, func);
+ }
+
+ LLVMDeleteFunction(func);
+#endif
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 5fc0f996c64..7edea616c4e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -31,6 +31,7 @@
#include "pipe/p_compiler.h"
+#include "util/u_pointer.h" // for func_pointer
#include "lp_bld.h"
#include <llvm-c/ExecutionEngine.h>
@@ -44,6 +45,7 @@ struct gallivm_state
LLVMPassManagerRef passmgr;
LLVMContextRef context;
LLVMBuilderRef builder;
+ unsigned compiled;
};
@@ -51,35 +53,28 @@ void
lp_build_init(void);
-extern void
-lp_func_delete_body(LLVMValueRef func);
-
+struct gallivm_state *
+gallivm_create(void);
void
-gallivm_garbage_collect(struct gallivm_state *gallivm);
-
+gallivm_destroy(struct gallivm_state *gallivm);
-typedef void (*garbage_collect_callback_func)(void *cb_data);
void
-gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data);
+gallivm_verify_function(struct gallivm_state *gallivm,
+ LLVMValueRef func);
void
-gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
- void *cb_data);
+gallivm_compile_module(struct gallivm_state *gallivm);
-
-struct gallivm_state *
-gallivm_create(void);
+func_pointer
+gallivm_jit_function(struct gallivm_state *gallivm,
+ LLVMValueRef func);
void
-gallivm_destroy(struct gallivm_state *gallivm);
-
-
-extern LLVMValueRef
-lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
- const char *Name);
+gallivm_free_function(struct gallivm_state *gallivm,
+ LLVMValueRef func,
+ const void * code);
void
lp_set_load_alignment(LLVMValueRef Inst,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
index 2323f124ae4..2bf1211bcd7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -48,6 +48,8 @@
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
+#include "lp_bld_type.h"
+#include "lp_bld_pack.h"
LLVMValueRef
@@ -129,6 +131,95 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
}
+/**
+ * Call intrinsic with arguments adapted to intrinsic vector length.
+ *
+ * Split vectors which are too large for the hw, or expand them if they
+ * are too small, so a caller calling a function which might use intrinsics
+ * doesn't need to do splitting/expansion on its own.
+ * This only supports intrinsics where src and dst types match.
+ */
+LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+ const char *name,
+ struct lp_type src_type,
+ unsigned intr_size,
+ LLVMValueRef a,
+ LLVMValueRef b)
+{
+ unsigned i;
+ struct lp_type intrin_type = src_type;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef anative, bnative;
+ unsigned intrin_length = intr_size / src_type.width;
+
+ intrin_type.length = intrin_length;
+
+ if (intrin_length > src_type.length) {
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef constvec, tmp;
+
+ for (i = 0; i < src_type.length; i++) {
+ elems[i] = lp_build_const_int32(gallivm, i);
+ }
+ for (; i < intrin_length; i++) {
+ elems[i] = i32undef;
+ }
+ if (src_type.length == 1) {
+ LLVMTypeRef elem_type = lp_build_elem_type(gallivm, intrin_type);
+ a = LLVMBuildBitCast(builder, a, LLVMVectorType(elem_type, 1), "");
+ b = LLVMBuildBitCast(builder, b, LLVMVectorType(elem_type, 1), "");
+ }
+ constvec = LLVMConstVector(elems, intrin_length);
+ anative = LLVMBuildShuffleVector(builder, a, a, constvec, "");
+ bnative = LLVMBuildShuffleVector(builder, b, b, constvec, "");
+ tmp = lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, intrin_type),
+ anative, bnative);
+ if (src_type.length > 1) {
+ constvec = LLVMConstVector(elems, src_type.length);
+ return LLVMBuildShuffleVector(builder, tmp, tmp, constvec, "");
+ }
+ else {
+ return LLVMBuildExtractElement(builder, tmp, elems[0], "");
+ }
+ }
+ else if (intrin_length < src_type.length) {
+ unsigned num_vec = src_type.length / intrin_length;
+ LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+
+ /* don't support arbitrary size here as this is so yuck */
+ if (src_type.length % intrin_length) {
+ /* FIXME: This is something which should be supported
+ * but there doesn't seem to be any need for it currently
+ * so crash and burn.
+ */
+ debug_printf("%s: should handle arbitrary vector size\n",
+ __FUNCTION__);
+ assert(0);
+ return NULL;
+ }
+
+ for (i = 0; i < num_vec; i++) {
+ anative = lp_build_extract_range(gallivm, a, i*intrin_length,
+ intrin_length);
+ bnative = lp_build_extract_range(gallivm, b, i*intrin_length,
+ intrin_length);
+ tmp[i] = lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, intrin_type),
+ anative, bnative);
+ }
+ return lp_build_concat(gallivm, tmp, intrin_type, num_vec);
+ }
+ else {
+ return lp_build_intrinsic_binary(builder, name,
+ lp_build_vec_type(gallivm, src_type),
+ a, b);
+ }
+}
+
+
LLVMValueRef
lp_build_intrinsic_map(struct gallivm_state *gallivm,
const char *name,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
index b73dd700362..38c5c29c980 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_intr.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -78,6 +78,15 @@ lp_build_intrinsic_binary(LLVMBuilderRef builder,
LLVMValueRef
+lp_build_intrinsic_binary_anylength(struct gallivm_state *gallivm,
+ const char *name,
+ struct lp_type src_type,
+ unsigned intr_size,
+ LLVMValueRef a,
+ LLVMValueRef b);
+
+
+LLVMValueRef
lp_build_intrinsic_map(struct gallivm_state *gallivm,
const char *name,
LLVMTypeRef ret_type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 69796149aaa..7a4a5bb11d3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -52,8 +52,8 @@
*
* select <4 x i1> %C, %A, %B
*
- * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is not
- * supported on any backend.
+ * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
+ * supported on some backends (x86) starting with llvm 3.1.
*
* Expanding the boolean vector to full SIMD register width, as in
*
@@ -485,8 +485,10 @@ lp_build_select(struct lp_build_context *bld,
}
res = LLVMBuildSelect(builder, mask, a, b, "");
}
- else if (util_cpu_caps.has_sse4_1 &&
- type.width * type.length == 128 &&
+ else if (((util_cpu_caps.has_sse4_1 &&
+ type.width * type.length == 128) ||
+ (util_cpu_caps.has_avx &&
+ type.width * type.length == 256 && type.width >= 32)) &&
!LLVMIsConstant(a) &&
!LLVMIsConstant(b) &&
!LLVMIsConstant(mask)) {
@@ -494,8 +496,22 @@ lp_build_select(struct lp_build_context *bld,
LLVMTypeRef arg_type;
LLVMValueRef args[3];
- if (type.floating &&
- type.width == 64) {
+ /*
+ * There's only float blend in AVX but can just cast i32/i64
+ * to float.
+ */
+ if (type.width * type.length == 256) {
+ if (type.width == 64) {
+ intrinsic = "llvm.x86.avx.blendv.pd.256";
+ arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
+ }
+ else {
+ intrinsic = "llvm.x86.avx.blendv.ps.256";
+ arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
+ }
+ }
+ else if (type.floating &&
+ type.width == 64) {
intrinsic = "llvm.x86.sse41.blendvpd";
arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
} else if (type.floating &&
@@ -591,3 +607,35 @@ lp_build_select_aos(struct lp_build_context *bld,
return lp_build_select(bld, mask_vec, a, b);
}
}
+
+
+/**
+ * Return (scalar-cast)val ? true : false;
+ */
+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+ unsigned real_length,
+ LLVMValueRef val)
+{
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMTypeRef scalar_type;
+ LLVMTypeRef true_type;
+
+ assert(real_length <= bld->type.length);
+
+ true_type = LLVMIntTypeInContext(bld->gallivm->context,
+ bld->type.width * real_length);
+ scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
+ bld->type.width * bld->type.length);
+ val = LLVMBuildBitCast(builder, val, scalar_type, "");
+ /*
+ * We're always using native types so we can use intrinsics.
+ * However, if we don't do per-element calculations, we must ensure
+ * the excess elements aren't used since they may contain garbage.
+ */
+ if (real_length < bld->type.length) {
+ val = LLVMBuildTrunc(builder, val, true_type, "");
+ }
+ return LLVMBuildICmp(builder, LLVMIntNE,
+ val, LLVMConstNull(true_type), "");
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
index ef33a653682..64c0a1f5946 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -82,4 +82,9 @@ lp_build_select_aos(struct lp_build_context *bld,
LLVMValueRef b);
+LLVMValueRef
+lp_build_any_true_range(struct lp_build_context *bld,
+ unsigned real_length,
+ LLVMValueRef val);
+
#endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 6c4586c4212..dd2c6120afb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -26,6 +26,12 @@
**************************************************************************/
+/**
+ * The purpose of this module is to expose LLVM functionality not available
+ * through the C++ bindings.
+ */
+
+
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS
#endif
@@ -41,11 +47,24 @@
#include <llvm/Target/TargetOptions.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
#include <llvm/ExecutionEngine/JITEventListener.h>
+#if HAVE_LLVM >= 0x0301
+#include <llvm/ADT/Triple.h>
+#include <llvm/ExecutionEngine/JITMemoryManager.h>
+#endif
#include <llvm/Support/CommandLine.h>
#include <llvm/Support/PrettyStackTrace.h>
+#if HAVE_LLVM >= 0x0300
+#include <llvm/Support/TargetSelect.h>
+#else /* HAVE_LLVM < 0x0300 */
+#include <llvm/Target/TargetSelect.h>
+#endif /* HAVE_LLVM < 0x0300 */
+
#include "pipe/p_config.h"
#include "util/u_debug.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_misc.h"
/**
@@ -99,6 +118,9 @@ lp_set_target_options(void)
#if defined(DEBUG) || defined(PROFILE)
llvm::NoFramePointerElim = true;
+#if HAVE_LLVM >= 0x0208
+ llvm::NoFramePointerElimNonLeaf = true;
+#endif
#endif
llvm::NoExcessFPPrecision = false;
@@ -146,6 +168,30 @@ lp_set_target_options(void)
* shared object where the gallium driver resides.
*/
llvm::DisablePrettyStackTrace = true;
+
+ // If we have a native target, initialize it to ensure it is linked in and
+ // usable by the JIT.
+ llvm::InitializeNativeTarget();
+
+#if HAVE_LLVM >= 0x0208
+ llvm::InitializeNativeTargetAsmPrinter();
+#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ LLVMInitializeX86AsmPrinter();
+#elif defined(PIPE_ARCH_ARM)
+ LLVMInitializeARMAsmPrinter();
+#elif defined(PIPE_ARCH_PPC)
+ LLVMInitializePowerPCAsmPrinter();
+#endif
+
+#if HAVE_LLVM >= 0x0207
+# if HAVE_LLVM >= 0x0301
+ llvm::InitializeNativeTargetDisassembler();
+# elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ LLVMInitializeX86Disassembler();
+# elif defined(PIPE_ARCH_ARM)
+ LLVMInitializeARMDisassembler();
+# endif
+#endif
}
@@ -165,6 +211,7 @@ lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name));
}
+
extern "C"
void
lp_set_load_alignment(LLVMValueRef Inst,
@@ -180,3 +227,67 @@ lp_set_store_alignment(LLVMValueRef Inst,
{
llvm::unwrap<llvm::StoreInst>(Inst)->setAlignment(Align);
}
+
+
+#if HAVE_LLVM >= 0x301
+
+/**
+ * Same as LLVMCreateJITCompilerForModule, but using MCJIT and enabling AVX
+ * feature where available.
+ *
+ * See also:
+ * - llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+ * - llvm/tools/lli/lli.cpp
+ * - http://markmail.org/message/ttkuhvgj4cxxy2on#query:+page:1+mid:aju2dggerju3ivd3+state:results
+ */
+extern "C"
+LLVMBool
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+ LLVMModuleRef M,
+ unsigned OptLevel,
+ char **OutError)
+{
+ using namespace llvm;
+
+ std::string Error;
+ EngineBuilder builder(unwrap(M));
+ builder.setEngineKind(EngineKind::JIT)
+ .setErrorStr(&Error)
+ .setOptLevel((CodeGenOpt::Level)OptLevel);
+
+ builder.setUseMCJIT(true);
+
+ llvm::SmallVector<std::string, 1> MAttrs;
+ if (util_cpu_caps.has_avx) {
+ /*
+ * AVX feature is not automatically detected from CPUID by the X86 target
+ * yet, because the old (yet default) JIT engine is not capable of
+ * emitting the opcodes. But as we're using MCJIT here, it is safe to
+ * add this attribute.
+ */
+ MAttrs.push_back("+avx");
+ builder.setMAttrs(MAttrs);
+ }
+ builder.setJITMemoryManager(JITMemoryManager::CreateDefaultMemManager());
+
+ ExecutionEngine *JIT;
+#if 0
+ JIT = builder.create();
+#else
+ /*
+ * Workaround http://llvm.org/bugs/show_bug.cgi?id=12833
+ */
+ StringRef MArch = "";
+ StringRef MCPU = "";
+ Triple TT(unwrap(M)->getTargetTriple());
+ JIT = builder.create(builder.selectTarget(TT, MArch, MCPU, MAttrs));
+#endif
+ if (JIT) {
+ *OutJIT = wrap(JIT);
+ return 0;
+ }
+ *OutError = strdup(Error.c_str());
+ return 1;
+}
+
+#endif /* HAVE_LLVM >= 0x301 */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.h b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
new file mode 100644
index 00000000000..4f80b38280c
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.h
@@ -0,0 +1,70 @@
+/**************************************************************************
+ *
+ * Copyright 2012 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_MISC_H
+#define LP_BLD_MISC_H
+
+
+#include "lp_bld.h"
+#include <llvm-c/ExecutionEngine.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+extern void
+lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
+
+extern void
+lp_set_target_options(void);
+
+
+extern void
+lp_func_delete_body(LLVMValueRef func);
+
+
+extern LLVMValueRef
+lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
+ const char *Name);
+
+extern int
+lp_build_create_mcjit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
+ LLVMModuleRef M,
+ unsigned OptLevel,
+ char **OutError);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* !LP_BLD_MISC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index fde6bb594f1..b18f7841ccb 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -69,6 +69,7 @@
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
+#include "util/u_memory.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -76,6 +77,7 @@
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
+#include "lp_bld_swizzle.h"
/**
@@ -101,6 +103,30 @@ lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
return LLVMConstVector(elems, n);
}
+/**
+ * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
+ * See comment above lp_build_interleave2_half for more details.
+ */
+static LLVMValueRef
+lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
+ unsigned n, unsigned lo_hi)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i, j;
+
+ assert(n <= LP_MAX_VECTOR_LENGTH);
+ assert(lo_hi < 2);
+
+ for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
+ if (i == (n / 2))
+ j += n / 4;
+
+ elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
+ elems[i + 1] = lp_build_const_int32(gallivm, n + j);
+ }
+
+ return LLVMConstVector(elems, n);
+}
/**
* Build shuffle vectors that match PACKxx instructions.
@@ -119,6 +145,71 @@ lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
return LLVMConstVector(elems, n);
}
+/**
+ * Return a vector with elements src[start:start+size]
+ * Most useful for getting half the values out of a 256bit sized vector,
+ * otherwise may cause data rearrangement to happen.
+ */
+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ unsigned start,
+ unsigned size)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i;
+
+ assert(size <= Elements(elems));
+
+ for (i = 0; i < size; ++i)
+ elems[i] = lp_build_const_int32(gallivm, i + start);
+
+ if (size == 1) {
+ return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(elems, size), "");
+ }
+}
+
+/**
+ * Concatenates several (must be a power of 2) vectors (of same type)
+ * into a larger one.
+ * Most useful for building up a 256bit sized vector out of two 128bit ones.
+ */
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+ LLVMValueRef src[],
+ struct lp_type src_type,
+ unsigned num_vectors)
+{
+ unsigned new_length, i;
+ LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+ assert(src_type.length * num_vectors <= Elements(shuffles));
+ assert(util_is_power_of_two(num_vectors));
+
+ new_length = src_type.length;
+
+ for (i = 0; i < num_vectors; i++)
+ tmp[i] = src[i];
+
+ while (num_vectors > 1) {
+ num_vectors >>= 1;
+ new_length <<= 1;
+ for (i = 0; i < new_length; i++) {
+ shuffles[i] = lp_build_const_int32(gallivm, i);
+ }
+ for (i = 0; i < num_vectors; i++) {
+ tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
+ LLVMConstVector(shuffles, new_length), "");
+ }
+ }
+
+ return tmp[0];
+}
/**
* Interleave vector elements.
@@ -139,6 +230,40 @@ lp_build_interleave2(struct gallivm_state *gallivm,
return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}
+/**
+ * Interleave vector elements but with 256 bit,
+ * treats it as interleave with 2 concatenated 128 bit vectors.
+ *
+ * This differs from lp_build_interleave2, as that function would do the following (for lo):
+ * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction.
+ *
+ *
+ * An example interleave 8x float with 8x float on AVX 256bit unpack:
+ * a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
+ *
+ * Equivalent to interleaving 2x 128 bit vectors
+ * a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
+ *
+ * So interleave-lo would result in:
+ * a0 b0 a1 b1 a4 b4 a5 b5
+ *
+ * And interleave-hi would result in:
+ * a2 b2 a3 b3 a6 b6 a7 b7
+ */
+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ unsigned lo_hi)
+{
+ if (type.length * type.width == 256) {
+ LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
+ return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
+ } else {
+ return lp_build_interleave2(gallivm, type, a, b, lo_hi);
+ }
+}
/**
* Double the bit width.
@@ -237,9 +362,9 @@ lp_build_unpack(struct gallivm_state *gallivm,
* Non-interleaved pack.
*
* This will move values as
- *
- * lo = __ l0 __ l1 __ l2 __.. __ ln
- * hi = __ h0 __ h1 __ h2 __.. __ hn
+ * (LSB) (MSB)
+ * lo = l0 __ l1 __ l2 __.. __ ln __
+ * hi = h0 __ h1 __ h2 __.. __ hn __
* res = l0 l1 l2 .. ln h0 h1 h2 .. hn
*
* This will only change the number of bits the values are represented, not the
@@ -257,12 +382,14 @@ lp_build_pack2(struct gallivm_state *gallivm,
LLVMValueRef hi)
{
LLVMBuilderRef builder = gallivm->builder;
-#if HAVE_LLVM < 0x0207
- LLVMTypeRef src_vec_type = lp_build_vec_type(gallivm, src_type);
-#endif
LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
LLVMValueRef shuffle;
LLVMValueRef res = NULL;
+ struct lp_type intr_type = dst_type;
+
+#if HAVE_LLVM < 0x0207
+ intr_type = src_type;
+#endif
assert(!src_type.floating);
assert(!dst_type.floating);
@@ -270,50 +397,81 @@ lp_build_pack2(struct gallivm_state *gallivm,
assert(src_type.length * 2 == dst_type.length);
/* Check for special cases first */
- if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
+ if(util_cpu_caps.has_sse2 && src_type.width * src_type.length >= 128) {
+ const char *intrinsic = NULL;
+
switch(src_type.width) {
case 32:
if(dst_type.sign) {
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
-#endif
+ intrinsic = "llvm.x86.sse2.packssdw.128";
}
else {
if (util_cpu_caps.has_sse4_1) {
- return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
- }
- else {
- /* use generic shuffle below */
- res = NULL;
+ intrinsic = "llvm.x86.sse41.packusdw";
+#if HAVE_LLVM < 0x0207
+ /* llvm < 2.7 has inconsistent signatures except for packusdw */
+ intr_type = dst_type;
+#endif
}
}
break;
-
case 16:
- if(dst_type.sign)
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
-#endif
- else
-#if HAVE_LLVM >= 0x0207
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
-#else
- res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
-#endif
- break;
-
- default:
- assert(0);
- return LLVMGetUndef(dst_vec_type);
+ if (dst_type.sign) {
+ intrinsic = "llvm.x86.sse2.packsswb.128";
+ }
+ else {
+ intrinsic = "llvm.x86.sse2.packuswb.128";
+ }
break;
+ /* default uses generic shuffle below */
}
-
- if (res) {
- res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+ if (intrinsic) {
+ if (src_type.width * src_type.length == 128) {
+ LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
+ res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
+ if (dst_vec_type != intr_vec_type) {
+ res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+ }
+ }
+ else {
+ int num_split = src_type.width * src_type.length / 128;
+ int i;
+ int nlen = 128 / src_type.width;
+ struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
+ struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
+ LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
+ LLVMValueRef tmplo, tmphi;
+ LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
+ LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
+
+ assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
+
+ for (i = 0; i < num_split / 2; i++) {
+ tmplo = lp_build_extract_range(gallivm,
+ lo, i*nlen*2, nlen);
+ tmphi = lp_build_extract_range(gallivm,
+ lo, i*nlen*2 + nlen, nlen);
+ tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
+ nintr_vec_type, tmplo, tmphi);
+ if (ndst_vec_type != nintr_vec_type) {
+ tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
+ }
+ }
+ for (i = 0; i < num_split / 2; i++) {
+ tmplo = lp_build_extract_range(gallivm,
+ hi, i*nlen*2, nlen);
+ tmphi = lp_build_extract_range(gallivm,
+ hi, i*nlen*2 + nlen, nlen);
+ tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
+ nintr_vec_type,
+ tmplo, tmphi);
+ if (ndst_vec_type != nintr_vec_type) {
+ tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
+ ndst_vec_type, "");
+ }
+ }
+ res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
+ }
return res;
}
}
@@ -357,8 +515,9 @@ lp_build_packs2(struct gallivm_state *gallivm,
/* All X86 SSE non-interleaved pack instructions take signed inputs and
* saturate them, so no need to clamp for those cases. */
if(util_cpu_caps.has_sse2 &&
- src_type.width * src_type.length == 128 &&
- src_type.sign)
+ src_type.width * src_type.length >= 128 &&
+ src_type.sign &&
+ (src_type.width == 32 || src_type.width == 16))
clamp = FALSE;
if(clamp) {
@@ -395,7 +554,6 @@ lp_build_pack(struct gallivm_state *gallivm,
LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
unsigned i;
-
/* Register width must remain constant */
assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
@@ -487,21 +645,44 @@ lp_build_resize(struct gallivm_state *gallivm,
/*
* Register width remains constant -- use vector packing intrinsics
*/
-
tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
}
else {
- /*
- * Do it element-wise.
- */
-
- assert(src_type.length == dst_type.length);
- tmp[0] = lp_build_undef(gallivm, dst_type);
- for (i = 0; i < dst_type.length; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
- val = LLVMBuildTrunc(builder, val, lp_build_elem_type(gallivm, dst_type), "");
- tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+ if (src_type.width / dst_type.width > num_srcs) {
+ /*
+ * First change src vectors size (with shuffle) so they have the
+ * same size as the destination vector, then pack normally.
+ * Note: cannot use cast/extract because llvm generates atrocious code.
+ */
+ unsigned size_ratio = (src_type.width * src_type.length) /
+ (dst_type.length * dst_type.width);
+ unsigned new_length = src_type.length / size_ratio;
+
+ for (i = 0; i < size_ratio * num_srcs; i++) {
+ unsigned start_index = (i % size_ratio) * new_length;
+ tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
+ start_index, new_length);
+ }
+ num_srcs *= size_ratio;
+ src_type.length = new_length;
+ tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
+ }
+ else {
+ /*
+ * Truncate bit width but expand vector size - first pack
+ * then expand simply because this should be more AVX-friendly
+ * for the cases we probably hit.
+ */
+ unsigned size_ratio = (dst_type.width * dst_type.length) /
+ (src_type.length * src_type.width);
+ unsigned num_pack_srcs = num_srcs / size_ratio;
+ dst_type.length = dst_type.length / size_ratio;
+
+ for (i = 0; i < size_ratio; i++) {
+ tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
+ &src[i*num_pack_srcs], num_pack_srcs);
+ }
+ tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
}
}
}
@@ -522,19 +703,24 @@ lp_build_resize(struct gallivm_state *gallivm,
/*
* Do it element-wise.
*/
+ assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+ for (i = 0; i < num_dsts; i++) {
+ tmp[i] = lp_build_undef(gallivm, dst_type);
+ }
- assert(src_type.length == dst_type.length);
- tmp[0] = lp_build_undef(gallivm, dst_type);
- for (i = 0; i < dst_type.length; ++i) {
- LLVMValueRef index = lp_build_const_int32(gallivm, i);
- LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+ for (i = 0; i < src_type.length; ++i) {
+ unsigned j = i / dst_type.length;
+ LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
+ LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
+ LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
if (src_type.sign && dst_type.sign) {
val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
} else {
val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
}
- tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+ tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
}
}
}
@@ -554,3 +740,38 @@ lp_build_resize(struct gallivm_state *gallivm,
}
+/**
+ * Expands src vector from src.length to dst_length.
+ * New elements past src.length are undef (only use this when the extra
+ * elements will be overwritten or ignored by the caller).
+ */
+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+                    LLVMValueRef src,
+                    struct lp_type src_type,
+                    unsigned dst_length)
+{
+   LLVMValueRef undef = LLVMGetUndef(lp_build_vec_type(gallivm, src_type));
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(dst_length <= Elements(elems));
+   /* >= (not >) so the equal-length no-op path below doesn't trip in
+    * debug builds */
+   assert(dst_length >= src_type.length);
+
+   if (src_type.length == dst_length)
+      return src;
+
+   /* If it's a single scalar type, no need to reinvent the wheel */
+   if (src_type.length == 1) {
+      return lp_build_broadcast(gallivm, LLVMVectorType(lp_build_elem_type(gallivm, src_type), dst_length), src);
+   }
+
+   /* All elements from src vector */
+   for (i = 0; i < src_type.length; ++i)
+      elems[i] = lp_build_const_int32(gallivm, i);
+
+   /* Undef fill remaining space */
+   for (i = src_type.length; i < dst_length; ++i)
+      elems[i] = lp_build_const_int32(gallivm, src_type.length);
+
+   /* Combine the two vectors */
+   return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index d58da4f01b3..73f299cca11 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -44,6 +44,12 @@
struct lp_type;
+LLVMValueRef
+lp_build_interleave2_half(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ unsigned lo_hi);
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
@@ -69,6 +75,17 @@ lp_build_unpack(struct gallivm_state *gallivm,
LLVMValueRef src,
LLVMValueRef *dst, unsigned num_dsts);
+LLVMValueRef
+lp_build_extract_range(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ unsigned start,
+ unsigned size);
+
+LLVMValueRef
+lp_build_concat(struct gallivm_state *gallivm,
+ LLVMValueRef src[],
+ struct lp_type src_type,
+ unsigned num_vectors);
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
@@ -102,4 +119,10 @@ lp_build_resize(struct gallivm_state *gallivm,
LLVMValueRef *dst, unsigned num_dsts);
+LLVMValueRef
+lp_build_pad_vector(struct gallivm_state *gallivm,
+ LLVMValueRef src,
+ struct lp_type src_type,
+ unsigned dst_length);
+
#endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index b0a5bc0267f..b1ba7c72655 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -26,6 +26,7 @@
**************************************************************************/
+#include "u_cpu_detect.h"
#include "lp_bld_type.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
@@ -77,34 +78,82 @@ lp_build_ddy(struct lp_build_context *bld,
return lp_build_sub(bld, a_bottom, a_top);
}
-
+/*
+ * To be able to handle multiple quads at once in texture sampling and
+ * do lod calculations per quad, it is necessary to get the per-quad
+ * derivatives into the lp_build_rho function.
+ * For 8-wide vectors the packed derivative values for 3 coords would
+ * look like this (this scales to an arbitrary, multiple-of-4, vector size):
+ * ds1dx ds1dy dt1dx dt1dy ds2dx ds2dy dt2dx dt2dy
+ * dr1dx dr1dy _____ _____ dr2dx dr2dy _____ _____
+ * The second vector will be unused for 1d and 2d textures.
+ */
LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+ LLVMValueRef a)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef idx_left = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
- LLVMValueRef idx_right = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_RIGHT);
- LLVMValueRef a_left = LLVMBuildExtractElement(builder, a, idx_left, "left");
- LLVMValueRef a_right = LLVMBuildExtractElement(builder, a, idx_right, "right");
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef vec1, vec2;
+
+ /* same packing as _twocoord, but can use aos swizzle helper */
+
+ /*
+ * XXX could make swizzle1 a noop swizzle by using right top/bottom
+ * pair for ddy
+ */
+ static const unsigned char swizzle1[] = {
+ LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_BOTTOM_LEFT,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+
+ vec1 = lp_build_swizzle_aos(bld, a, swizzle1);
+ vec2 = lp_build_swizzle_aos(bld, a, swizzle2);
+
if (bld->type.floating)
- return LLVMBuildFSub(builder, a_right, a_left, "ddx");
+ return LLVMBuildFSub(builder, vec2, vec1, "ddxddy");
else
- return LLVMBuildSub(builder, a_right, a_left, "ddx");
+ return LLVMBuildSub(builder, vec2, vec1, "ddxddy");
}
LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
- LLVMValueRef a)
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+ LLVMValueRef a, LLVMValueRef b)
{
- LLVMBuilderRef builder = bld->gallivm->builder;
- LLVMValueRef idx_top = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_TOP_LEFT);
- LLVMValueRef idx_bottom = lp_build_const_int32(bld->gallivm, LP_BLD_QUAD_BOTTOM_LEFT);
- LLVMValueRef a_top = LLVMBuildExtractElement(builder, a, idx_top, "top");
- LLVMValueRef a_bottom = LLVMBuildExtractElement(builder, a, idx_bottom, "bottom");
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH/4];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH/4];
+ LLVMValueRef vec1, vec2;
+ unsigned length, num_quads, i;
+
+ /* XXX: do hsub version */
+ length = bld->type.length;
+ num_quads = length / 4;
+ for (i = 0; i < num_quads; i++) {
+ unsigned s1 = 4 * i;
+ unsigned s2 = 4 * i + length;
+ shuffles1[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+ shuffles1[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s1);
+ shuffles1[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+ shuffles1[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_LEFT + s2);
+ shuffles2[4*i + 0] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s1);
+ shuffles2[4*i + 1] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s1);
+ shuffles2[4*i + 2] = lp_build_const_int32(gallivm, LP_BLD_QUAD_TOP_RIGHT + s2);
+ shuffles2[4*i + 3] = lp_build_const_int32(gallivm, LP_BLD_QUAD_BOTTOM_LEFT + s2);
+ }
+ vec1 = LLVMBuildShuffleVector(builder, a, b,
+ LLVMConstVector(shuffles1, length), "");
+ vec2 = LLVMBuildShuffleVector(builder, a, b,
+ LLVMConstVector(shuffles2, length), "");
if (bld->type.floating)
- return LLVMBuildFSub(builder, a_bottom, a_top, "ddy");
+ return LLVMBuildFSub(builder, vec2, vec1, "ddxddyddxddy");
else
- return LLVMBuildSub(builder, a_bottom, a_top, "ddy");
+ return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy");
}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
index b7992912927..be6a1efc396 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
@@ -78,19 +78,15 @@ lp_build_ddy(struct lp_build_context *bld,
/*
- * Scalar derivatives.
- *
- * Same as getting the first value of above.
+ * Packed derivatives (one derivative for each direction per quad)
*/
-
LLVMValueRef
-lp_build_scalar_ddx(struct lp_build_context *bld,
- LLVMValueRef a);
-
+lp_build_packed_ddx_ddy_twocoord(struct lp_build_context *bld,
+ LLVMValueRef a, LLVMValueRef b);
LLVMValueRef
-lp_build_scalar_ddy(struct lp_build_context *bld,
- LLVMValueRef a);
+lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
+ LLVMValueRef a);
#endif /* LP_BLD_QUAD_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index d966788d74e..85211161f3c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -44,6 +44,8 @@
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
/*
@@ -175,67 +177,89 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
/**
* Generate code to compute coordinate gradient (rho).
- * \param ddx partial derivatives of (s, t, r, q) with respect to X
- * \param ddy partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
*
- * XXX: The resulting rho is scalar, so we ignore all but the first element of
- * derivatives that are passed by the shader.
+ * The resulting rho is scalar per quad.
*/
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4])
+ const struct lp_derivatives *derivs)
{
+ struct gallivm_state *gallivm = bld->gallivm;
struct lp_build_context *int_size_bld = &bld->int_size_bld;
struct lp_build_context *float_size_bld = &bld->float_size_bld;
struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
+ const LLVMValueRef *ddx_ddy = derivs->ddx_ddy;
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
- LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
- LLVMValueRef rho_x, rho_y;
LLVMValueRef rho_vec;
LLVMValueRef int_size, float_size;
LLVMValueRef rho;
LLVMValueRef first_level, first_level_vec;
+ LLVMValueRef abs_ddx_ddy[2];
+ unsigned length = coord_bld->type.length;
+ unsigned num_quads = length / 4;
+ unsigned i;
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef rho_xvec, rho_yvec;
+
+ abs_ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+ if (dims > 2) {
+ abs_ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
+ }
- dsdx = ddx[0];
- dsdy = ddy[0];
-
- if (dims <= 1) {
- rho_x = dsdx;
- rho_y = dsdy;
+ if (dims == 1) {
+ static const unsigned char swizzle1[] = {
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+ rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
+ }
+ else if (dims == 2) {
+ static const unsigned char swizzle1[] = {
+ 0, 2,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, 3,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_xvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle1);
+ rho_yvec = lp_build_swizzle_aos(coord_bld, abs_ddx_ddy[0], swizzle2);
}
else {
- rho_x = float_size_bld->undef;
- rho_y = float_size_bld->undef;
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, dsdx, index0, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, dsdy, index0, "");
-
- dtdx = ddx[1];
- dtdy = ddy[1];
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, dtdx, index1, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, dtdy, index1, "");
-
- if (dims >= 3) {
- drdx = ddx[2];
- drdy = ddy[2];
-
- rho_x = LLVMBuildInsertElement(builder, rho_x, drdx, index2, "");
- rho_y = LLVMBuildInsertElement(builder, rho_y, drdy, index2, "");
+ LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
+ assert(dims == 3);
+ for (i = 0; i < num_quads; i++) {
+ shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
+ shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
+ shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
+ shuffles1[4*i + 3] = i32undef;
+ shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
+ shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
+ shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 1);
+ shuffles2[4*i + 3] = i32undef;
}
+ rho_xvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+ LLVMConstVector(shuffles1, length), "");
+ rho_yvec = LLVMBuildShuffleVector(builder, abs_ddx_ddy[0], abs_ddx_ddy[1],
+ LLVMConstVector(shuffles2, length), "");
}
- rho_x = lp_build_abs(float_size_bld, rho_x);
- rho_y = lp_build_abs(float_size_bld, rho_y);
-
- rho_vec = lp_build_max(float_size_bld, rho_x, rho_y);
+ rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
@@ -243,22 +267,77 @@ lp_build_rho(struct lp_build_sample_context *bld,
int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
float_size = lp_build_int_to_float(float_size_bld, int_size);
- rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
+ if (bld->coord_type.length > 4) {
+ /* expand size to each quad */
+ if (dims > 1) {
+ /* could use some broadcast_vector helper for this? */
+ int num_quads = bld->coord_type.length / 4;
+ LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
+ for (i = 0; i < num_quads; i++) {
+ src[i] = float_size;
+ }
+ float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
+ }
+ else {
+ float_size = lp_build_broadcast_scalar(coord_bld, float_size);
+ }
+ rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);
- if (dims <= 1) {
- rho = rho_vec;
+ if (dims <= 1) {
+ rho = rho_vec;
+ }
+ else {
+ if (dims >= 2) {
+ static const unsigned char swizzle1[] = {
+ 0, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ static const unsigned char swizzle2[] = {
+ 1, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ LLVMValueRef rho_s, rho_t, rho_r;
+
+ rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
+ rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
+
+ rho = lp_build_max(coord_bld, rho_s, rho_t);
+
+ if (dims >= 3) {
+ static const unsigned char swizzle3[] = {
+ 2, LP_BLD_SWIZZLE_DONTCARE,
+ LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+ };
+ rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3);
+ rho = lp_build_max(coord_bld, rho, rho_r);
+ }
+ }
+ }
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ perquadf_bld->type, rho);
}
else {
- if (dims >= 2) {
- LLVMValueRef rho_s, rho_t, rho_r;
+ if (dims <= 1) {
+ rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+ }
+ rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
- rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
- rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
+ if (dims <= 1) {
+ rho = rho_vec;
+ }
+ else {
+ if (dims >= 2) {
+ LLVMValueRef rho_s, rho_t, rho_r;
+
+ rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
+ rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
- rho = lp_build_max(float_bld, rho_s, rho_t);
- if (dims >= 3) {
- rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
- rho = lp_build_max(float_bld, rho, rho_r);
+ rho = lp_build_max(float_bld, rho_s, rho_t);
+
+ if (dims >= 3) {
+ rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
+ rho = lp_build_max(float_bld, rho, rho_r);
+ }
}
}
}
@@ -396,22 +475,20 @@ lp_build_brilinear_rho(struct lp_build_context *bld,
/**
* Generate code to compute texture level of detail (lambda).
- * \param ddx partial derivatives of (s, t, r, q) with respect to X
- * \param ddy partial derivatives of (s, t, r, q) with respect to Y
+ * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
* \param lod_bias optional float vector with the shader lod bias
* \param explicit_lod optional float vector with the explicit lod
* \param width scalar int texture width
* \param height scalar int texture height
* \param depth scalar int texture depth
*
- * XXX: The resulting lod is scalar, so ignore all but the first element of
- * derivatives, lod_bias, etc that are passed by the shader.
+ * The resulting lod is scalar per quad, so only the first value per quad
+ * passed in from lod_bias, explicit_lod is used.
*/
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
unsigned mip_filter,
@@ -420,11 +497,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
{
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
LLVMValueRef lod;
- *out_lod_ipart = bld->int_bld.zero;
- *out_lod_fpart = bld->float_bld.zero;
+ *out_lod_ipart = bld->perquadi_bld.zero;
+ *out_lod_fpart = perquadf_bld->zero;
if (bld->static_state->min_max_lod_equal) {
/* User is forcing sampling from a particular mipmap level.
@@ -433,21 +510,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
LLVMValueRef min_lod =
bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);
- lod = min_lod;
+ lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
}
else {
- LLVMValueRef sampler_lod_bias =
- bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
- LLVMValueRef index0 = lp_build_const_int32(bld->gallivm, 0);
-
if (explicit_lod) {
- lod = LLVMBuildExtractElement(builder, explicit_lod,
- index0, "");
+ lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ perquadf_bld->type, explicit_lod);
}
else {
LLVMValueRef rho;
- rho = lp_build_rho(bld, unit, ddx, ddy);
+ rho = lp_build_rho(bld, unit, derivs);
/*
* Compute lod = log2(rho)
@@ -465,66 +538,72 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
- *out_lod_ipart = lp_build_ilog2(float_bld, rho);
- *out_lod_fpart = bld->float_bld.zero;
+ *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
+ *out_lod_fpart = perquadf_bld->zero;
return;
}
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
- lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR,
+ lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
out_lod_ipart, out_lod_fpart);
return;
}
}
if (0) {
- lod = lp_build_log2(float_bld, rho);
+ lod = lp_build_log2(perquadf_bld, rho);
}
else {
- lod = lp_build_fast_log2(float_bld, rho);
+ lod = lp_build_fast_log2(perquadf_bld, rho);
}
/* add shader lod bias */
if (lod_bias) {
- lod_bias = LLVMBuildExtractElement(builder, lod_bias,
- index0, "");
+ lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ perquadf_bld->type, lod_bias);
lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
}
}
/* add sampler lod bias */
- if (bld->static_state->lod_bias_non_zero)
+ if (bld->static_state->lod_bias_non_zero) {
+ LLVMValueRef sampler_lod_bias =
+ bld->dynamic_state->lod_bias(bld->dynamic_state, bld->gallivm, unit);
+ sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
+ sampler_lod_bias);
lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
-
+ }
/* clamp lod */
if (bld->static_state->apply_max_lod) {
LLVMValueRef max_lod =
bld->dynamic_state->max_lod(bld->dynamic_state, bld->gallivm, unit);
+ max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);
- lod = lp_build_min(float_bld, lod, max_lod);
+ lod = lp_build_min(perquadf_bld, lod, max_lod);
}
if (bld->static_state->apply_min_lod) {
LLVMValueRef min_lod =
bld->dynamic_state->min_lod(bld->dynamic_state, bld->gallivm, unit);
+ min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
- lod = lp_build_max(float_bld, lod, min_lod);
+ lod = lp_build_max(perquadf_bld, lod, min_lod);
}
}
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
- lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR,
+ lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
out_lod_ipart, out_lod_fpart);
}
else {
- lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart);
+ lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
}
lp_build_name(*out_lod_fpart, "lod_fpart");
}
else {
- *out_lod_ipart = lp_build_iround(float_bld, lod);
+ *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
}
lp_build_name(*out_lod_ipart, "lod_ipart");
@@ -536,8 +615,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
/**
* For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
* mipmap level index.
- * Note: this is all scalar code.
- * \param lod scalar float texture level of detail
+ * Note: this is all scalar per quad code.
+ * \param lod_ipart int texture level of detail
* \param level_out returns integer
*/
void
@@ -546,26 +625,27 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
LLVMValueRef lod_ipart,
LLVMValueRef *level_out)
{
- struct lp_build_context *int_bld = &bld->int_bld;
+ struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
LLVMValueRef first_level, last_level, level;
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
last_level = bld->dynamic_state->last_level(bld->dynamic_state,
bld->gallivm, unit);
+ first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+ last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
- /* convert float lod to integer */
- level = lp_build_add(int_bld, lod_ipart, first_level);
+ level = lp_build_add(perquadi_bld, lod_ipart, first_level);
/* clamp level to legal range of levels */
- *level_out = lp_build_clamp(int_bld, level, first_level, last_level);
+ *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
}
/**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes. Later, we'll sample from those
- * two mipmap levels and interpolate between them.
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
+ * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
+ * Later, we'll sample from those two mipmap levels and interpolate between them.
*/
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
@@ -576,20 +656,21 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
LLVMValueRef *level1_out)
{
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context *int_bld = &bld->int_bld;
- struct lp_build_context *float_bld = &bld->float_bld;
+ struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
+ struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
LLVMValueRef first_level, last_level;
LLVMValueRef clamp_min;
LLVMValueRef clamp_max;
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
-
- *level0_out = lp_build_add(int_bld, lod_ipart, first_level);
- *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
-
last_level = bld->dynamic_state->last_level(bld->dynamic_state,
bld->gallivm, unit);
+ first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
+ last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
+
+ *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
+ *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);
/*
* Clamp both *level0_out and *level1_out to [first_level, last_level], with
@@ -597,6 +678,15 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
* ends in the process.
*/
+ /*
+ * This code (vector select in particular) only works with llvm 3.1
+ * (if there's more than one quad, with x86 backend). Might consider
+ * converting to our lp_bld_logic helpers.
+ */
+#if HAVE_LLVM < 0x0301
+ assert(perquadi_bld->type.length == 1);
+#endif
+
/* *level0_out < first_level */
clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
*level0_out, first_level,
@@ -609,7 +699,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
first_level, *level1_out, "");
*lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
- float_bld->zero, *lod_fpart_inout, "");
+ perquadf_bld->zero, *lod_fpart_inout, "");
/* *level0_out >= last_level */
clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
@@ -623,7 +713,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
last_level, *level1_out, "");
*lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
- float_bld->zero, *lod_fpart_inout, "");
+ perquadf_bld->zero, *lod_fpart_inout, "");
lp_build_name(*level0_out, "sampler%u_miplevel0", unit);
lp_build_name(*level1_out, "sampler%u_miplevel1", unit);
@@ -651,15 +741,6 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
}
-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
- int level)
-{
- LLVMValueRef lvl = lp_build_const_int32(bld->gallivm, level);
- return lp_build_get_mipmap_level(bld, lvl);
-}
-
-
/**
* Codegen equivalent for u_minify().
* Return max(1, base_size >> level);
@@ -748,8 +829,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
* bld->int_size_type or bld->float_size_type)
* @param coord_type type of the texture size vector (either
* bld->int_coord_type or bld->coord_type)
- * @param int_size vector with the integer texture size (width, height,
- * depth)
+ * @param size vector with the texture size (width, height, depth)
*/
void
lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
@@ -788,7 +868,7 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
/**
* Unnormalize coords.
*
- * @param int_size vector with the integer texture size (width, height, depth)
+ * @param flt_size vector with the float texture size (width, height, depth)
*/
void
lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
@@ -823,7 +903,18 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
-lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+ /* ima = +0.5 / abs(coord); */
+ LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+ LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
+ return ima;
+}
+
+/** Helper used by lp_build_cube_lookup() */
+static LLVMValueRef
+lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
/* ima = -0.5 / abs(coord); */
LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
@@ -832,9 +923,12 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
return ima;
}
-
/**
* Helper used by lp_build_cube_lookup()
+ * FIXME: the sign here can also be 0.
+ * Arithmetically this could definitely make a difference. Either
+ * fix the comment or use other (simpler) sign function, not sure
+ * which one it should be.
* \param sign scalar +1 or -1
* \param coord float vector
* \param ima float vector
@@ -898,58 +992,186 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
LLVMValueRef *face_s,
LLVMValueRef *face_t)
{
- struct lp_build_context *float_bld = &bld->float_bld;
struct lp_build_context *coord_bld = &bld->coord_bld;
LLVMBuilderRef builder = bld->gallivm->builder;
+ struct gallivm_state *gallivm = bld->gallivm;
LLVMValueRef rx, ry, rz;
- LLVMValueRef arx, ary, arz;
- LLVMValueRef c25 = lp_build_const_float(bld->gallivm, 0.25);
- LLVMValueRef arx_ge_ary, arx_ge_arz;
- LLVMValueRef ary_ge_arx, ary_ge_arz;
- LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
-
- assert(bld->coord_bld.type.length == 4);
+ LLVMValueRef tmp[4], rxyz, arxyz;
/*
* Use the average of the four pixel's texcoords to choose the face.
+ * Slight simplification just calculate the sum, skip scaling.
*/
- rx = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, s));
- ry = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, t));
- rz = lp_build_mul(float_bld, c25,
- lp_build_sum_vector(&bld->coord_bld, r));
+ tmp[0] = s;
+ tmp[1] = t;
+ tmp[2] = r;
+ rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
+ arxyz = lp_build_abs(&bld->coord_bld, rxyz);
+
+ if (coord_bld->type.length > 4) {
+ struct lp_build_context *cint_bld = &bld->int_coord_bld;
+ struct lp_type intctype = cint_bld->type;
+ LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign;
+ LLVMValueRef arxs, arys, arzs;
+ LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary;
+ LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
+ LLVMValueRef ryneg, rzneg;
+ LLVMValueRef ma, ima;
+ LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
+ LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
+ 1 << (intctype.width - 1));
+ LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
+ intctype.width -1);
+ LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
+ LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
+ LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
+
+ assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
+ assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
+ assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
+
+ rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
+ ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
+ rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
+ ryneg = LLVMBuildXor(builder, ry, signmask, "");
+ rzneg = LLVMBuildXor(builder, rz, signmask, "");
+
+ /* the sign bit comes from the averaged vector (per quad),
+ * as does the decision which face to use */
+ signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
+ signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");
+
+ arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0);
+ arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1);
+ arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2);
- arx = lp_build_abs(float_bld, rx);
- ary = lp_build_abs(float_bld, ry);
- arz = lp_build_abs(float_bld, rz);
+ /*
+ * select x if x >= y else select y
+ * select previous result if y >= max(x,y) else select z
+ */
+ arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys);
+ maxarxsarys = lp_build_max(coord_bld, arxs, arys);
+ arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs);
- /*
- * Compare sign/magnitude of rx,ry,rz to determine face
- */
- arx_ge_ary = LLVMBuildFCmp(builder, LLVMRealUGE, arx, ary, "");
- arx_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, arx, arz, "");
- ary_ge_arx = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arx, "");
- ary_ge_arz = LLVMBuildFCmp(builder, LLVMRealUGE, ary, arz, "");
+ /*
+ * compute all possible new s/t coords
+ * snewx = signrx * -rz;
+ * tnewx = -ry;
+ * snewy = rx;
+ * tnewy = signry * rz;
+ * snewz = signrz * rx;
+ * tnewz = -ry;
+ */
+ signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0);
+ snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
+ tnewx = ryneg;
+
+ signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1);
+ snewy = rx;
+ tnewy = LLVMBuildXor(builder, signrys, rz, "");
+
+ signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2);
+ snewz = LLVMBuildXor(builder, signrzs, rx, "");
+ tnewz = ryneg;
+
+ /* XXX on x86 unclear if we should cast the values back to float
+ * or not - on some cpus (nehalem) pblendvb has twice the throughput
+ * of blendvps though on others there just might be domain
+ * transition penalties when using it (this depends on what llvm
+ * will chose for the bit ops above so there appears no "right way",
+ * but given the boatload of selects let's just use the int type).
+ *
+ * Unfortunately we also need the sign bit of the summed coords.
+ */
+ *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy);
+ *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy);
+ ma = lp_build_select(coord_bld, arx_ge_ary, s, t);
+ *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey);
+ sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys);
+
+ *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz);
+ *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz);
+ ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r);
+ *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez);
+ sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs);
+
+ *face_s = LLVMBuildBitCast(builder, *face_s,
+ lp_build_vec_type(gallivm, coord_bld->type), "");
+ *face_t = LLVMBuildBitCast(builder, *face_t,
+ lp_build_vec_type(gallivm, coord_bld->type), "");
+
+ /* add +1 for neg face */
+ /* XXX with AVX probably want to use another select here -
+ * as long as we ensure vblendvps gets used we can actually
+ * skip the comparison and just use sign as a "mask" directly.
+ */
+ sign = LLVMBuildLShr(builder, sign, signshift, "");
+ *face = LLVMBuildOr(builder, *face, sign, "face");
- arx_ge_ary_arz = LLVMBuildAnd(builder, arx_ge_ary, arx_ge_arz, "");
- ary_ge_arx_arz = LLVMBuildAnd(builder, ary_ge_arx, ary_ge_arz, "");
+ ima = lp_build_cube_imapos(coord_bld, ma);
+
+ *face_s = lp_build_mul(coord_bld, *face_s, ima);
+ *face_s = lp_build_add(coord_bld, *face_s, posHalf);
+ *face_t = lp_build_mul(coord_bld, *face_t, ima);
+ *face_t = lp_build_add(coord_bld, *face_t, posHalf);
+ }
- {
+ else {
struct lp_build_if_state if_ctx;
LLVMValueRef face_s_var;
LLVMValueRef face_t_var;
LLVMValueRef face_var;
-
- face_s_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_s_var");
- face_t_var = lp_build_alloca(bld->gallivm, bld->coord_bld.vec_type, "face_t_var");
- face_var = lp_build_alloca(bld->gallivm, bld->int_bld.vec_type, "face_var");
-
- lp_build_if(&if_ctx, bld->gallivm, arx_ge_ary_arz);
+ LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+ LLVMValueRef shuffles[4];
+ LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
+ LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
+ struct lp_build_context *float_bld = &bld->float_bld;
+
+ assert(bld->coord_bld.type.length == 4);
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ shuffles[2] = lp_build_const_int32(gallivm, 0);
+ shuffles[3] = lp_build_const_int32(gallivm, 1);
+ arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+ shuffles[0] = lp_build_const_int32(gallivm, 1);
+ shuffles[1] = lp_build_const_int32(gallivm, 0);
+ shuffles[2] = lp_build_const_int32(gallivm, 2);
+ shuffles[3] = lp_build_const_int32(gallivm, 2);
+ aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
+ arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);
+
+ shuffles[0] = lp_build_const_int32(gallivm, 0);
+ shuffles[1] = lp_build_const_int32(gallivm, 1);
+ arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+ LLVMConstVector(shuffles, 2), "");
+ shuffles[0] = lp_build_const_int32(gallivm, 2);
+ shuffles[1] = lp_build_const_int32(gallivm, 3);
+ arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
+ LLVMConstVector(shuffles, 2), "");
+ arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");
+
+ arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+ lp_build_const_int32(gallivm, 0), "");
+ arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
+ lp_build_const_int32(gallivm, 0), "");
+ ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
+ lp_build_const_int32(gallivm, 1), "");
+ ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
+ lp_build_const_int32(gallivm, 0), "");
+ face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
+ face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
+ face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");
+
+ lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
{
/* +/- X face */
- LLVMValueRef sign = lp_build_sgn(float_bld, rx);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+ LLVMValueRef sign, ima;
+ rx = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 0), "");
+ /* +/- X face */
+ sign = lp_build_sgn(float_bld, rx);
+ ima = lp_build_cube_imaneg(coord_bld, s);
*face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
*face = lp_build_cube_face(bld, rx,
@@ -963,11 +1185,14 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
{
struct lp_build_if_state if_ctx2;
- lp_build_if(&if_ctx2, bld->gallivm, ary_ge_arx_arz);
+ lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
{
+ LLVMValueRef sign, ima;
/* +/- Y face */
- LLVMValueRef sign = lp_build_sgn(float_bld, ry);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+ ry = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 1), "");
+ sign = lp_build_sgn(float_bld, ry);
+ ima = lp_build_cube_imaneg(coord_bld, t);
*face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
*face = lp_build_cube_face(bld, ry,
@@ -980,8 +1205,11 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
lp_build_else(&if_ctx2);
{
/* +/- Z face */
- LLVMValueRef sign = lp_build_sgn(float_bld, rz);
- LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+ LLVMValueRef sign, ima;
+ rz = LLVMBuildExtractElement(builder, rxyz,
+ lp_build_const_int32(gallivm, 2), "");
+ sign = lp_build_sgn(float_bld, rz);
+ ima = lp_build_cube_imaneg(coord_bld, r);
*face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
*face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
*face = lp_build_cube_face(bld, rz,
@@ -999,6 +1227,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
*face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
*face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
*face = LLVMBuildLoad(builder, face_var, "face");
+ *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
}
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index dad138abee0..0f3d8ae6cb5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -52,6 +52,15 @@ struct lp_build_context;
/**
+ * Helper struct holding all derivatives needed for sampling
+ */
+struct lp_derivatives
+{
+ LLVMValueRef ddx_ddy[2];
+};
+
+
+/**
* Sampler static state.
*
* These are the bits of state from pipe_resource and pipe_sampler_state that
@@ -192,6 +201,9 @@ struct lp_build_sample_context
/* See texture_dims() */
unsigned dims;
+ /** SIMD vector width */
+ unsigned vector_width;
+
/** regular scalar float type */
struct lp_type float_type;
struct lp_build_context float_bld;
@@ -199,7 +211,7 @@ struct lp_build_sample_context
/** float vector type */
struct lp_build_context float_vec_bld;
- /** regular scalar float type */
+ /** regular scalar int type */
struct lp_type int_type;
struct lp_build_context int_bld;
@@ -223,10 +235,15 @@ struct lp_build_sample_context
struct lp_type texel_type;
struct lp_build_context texel_bld;
+ /** Float per-quad type */
+ struct lp_type perquadf_type;
+ struct lp_build_context perquadf_bld;
+
+ /** Int per-quad type */
+ struct lp_type perquadi_type;
+ struct lp_build_context perquadi_bld;
+
/* Common dynamic state values */
- LLVMValueRef width;
- LLVMValueRef height;
- LLVMValueRef depth;
LLVMValueRef row_stride_array;
LLVMValueRef img_stride_array;
LLVMValueRef data_array;
@@ -305,8 +322,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
unsigned unit,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
unsigned mip_filter,
@@ -331,10 +347,6 @@ LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
LLVMValueRef level);
-LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
- int level);
-
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
@@ -402,22 +414,35 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias,
LLVMValueRef explicit_lod,
LLVMValueRef texel_out[4]);
+
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef coord_f,
+ LLVMValueRef length_i,
+ LLVMValueRef length_f,
+ LLVMValueRef *coord0_i,
+ LLVMValueRef *weight_f);
+
+
void
lp_build_size_query_soa(struct gallivm_state *gallivm,
const struct lp_sampler_static_state *static_state,
struct lp_sampler_dynamic_state *dynamic_state,
+ struct lp_type int_type,
unsigned unit,
LLVMValueRef explicit_lod,
LLVMValueRef *sizes_out);
void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_coords,
+ const LLVMValueRef *coords,
LLVMValueRef texel_out[4]);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 74858bc9718..ad1b29cf096 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -27,7 +27,7 @@
/**
* @file
- * Texture sampling -- SoA.
+ * Texture sampling -- AoS.
*
* @author Jose Fonseca <jfonseca@vmware.com>
* @author Brian Paul <brianp@vmware.com>
@@ -40,6 +40,7 @@
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -75,6 +76,7 @@ static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
unsigned block_length,
LLVMValueRef coord,
+ LLVMValueRef coord_f,
LLVMValueRef length,
LLVMValueRef stride,
boolean is_pot,
@@ -93,10 +95,11 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
if(is_pot)
coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- coord = LLVMBuildAdd(builder, coord, bias, "");
- coord = LLVMBuildURem(builder, coord, length, "");
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
+ coord = lp_build_fract_safe(coord_bld, coord_f);
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ coord = lp_build_itrunc(coord_bld, coord);
}
break;
@@ -121,6 +124,56 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
/**
+ * Build LLVM code for texture coord wrapping, for nearest filtering,
+ * for float texcoords.
+ * \param coord the incoming texcoord (s,t,r or q)
+ * \param length the texture size along one dimension
+ * \param is_pot if TRUE, length is a power of two
+ * \param wrap_mode one of PIPE_TEX_WRAP_x
+ * \param icoord the texcoord after wrapping, as int
+ */
+static void
+lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
+ LLVMValueRef coord,
+ LLVMValueRef length,
+ boolean is_pot,
+ unsigned wrap_mode,
+ LLVMValueRef *icoord)
+{
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMValueRef length_minus_one;
+
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ /* take fraction, unnormalize */
+ coord = lp_build_fract_safe(coord_bld, coord);
+ coord = lp_build_mul(coord_bld, coord, length);
+ *icoord = lp_build_itrunc(coord_bld, coord);
+ break;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
+ if (bld->static_state->normalized_coords) {
+ /* scale coord to length */
+ coord = lp_build_mul(coord_bld, coord, length);
+ }
+ coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
+ length_minus_one);
+ *icoord = lp_build_itrunc(coord_bld, coord);
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP:
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ default:
+ assert(0);
+ }
+}
+
+
+/**
* Build LLVM code for texture coord wrapping, for linear filtering,
* for scaled integer texcoords.
* \param block_length is the length of the pixel block along the
@@ -139,6 +192,8 @@ static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
unsigned block_length,
LLVMValueRef coord0,
+ LLVMValueRef *weight_i,
+ LLVMValueRef coord_f,
LLVMValueRef length,
LLVMValueRef stride,
boolean is_pot,
@@ -153,58 +208,85 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
LLVMValueRef length_minus_one;
LLVMValueRef lmask, umask, mask;
- if (block_length != 1) {
- /*
- * If the pixel block covers more than one pixel then there is no easy
- * way to calculate offset1 relative to offset0. Instead, compute them
- * independently.
- */
-
- LLVMValueRef coord1;
-
- lp_build_sample_wrap_nearest_int(bld,
- block_length,
- coord0,
- length,
- stride,
- is_pot,
- wrap_mode,
- offset0, i0);
-
- coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /*
+ * If the pixel block covers more than one pixel then there is no easy
+ * way to calculate offset1 relative to offset0. Instead, compute them
+ * independently. Otherwise, try to compute offset0 and offset1 with
+ * a single stride multiplication.
+ */
- lp_build_sample_wrap_nearest_int(bld,
- block_length,
- coord1,
- length,
- stride,
- is_pot,
- wrap_mode,
- offset1, i1);
+ length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+ if (block_length != 1) {
+ LLVMValueRef coord1;
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ if (is_pot) {
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
+ coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
+ }
+ else {
+ LLVMValueRef mask;
+ LLVMValueRef weight;
+ LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
+ lp_build_coord_repeat_npot_linear(bld, coord_f,
+ length, length_f,
+ &coord0, &weight);
+ mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
+ PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+ coord1 = LLVMBuildAnd(builder,
+ lp_build_add(int_coord_bld, coord0,
+ int_coord_bld->one),
+ mask, "");
+ weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
+ *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
+ }
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
+ length_minus_one);
+ coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
+ length_minus_one);
+ break;
+
+ case PIPE_TEX_WRAP_CLAMP:
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ default:
+ assert(0);
+ coord0 = int_coord_bld->zero;
+ coord1 = int_coord_bld->zero;
+ break;
+ }
+ lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
+ offset0, i0);
+ lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
+ offset1, i1);
return;
}
- /*
- * Scalar pixels -- try to compute offset0 and offset1 with a single stride
- * multiplication.
- */
-
*i0 = int_coord_bld->zero;
*i1 = int_coord_bld->zero;
- length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
-
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
if (is_pot) {
coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
}
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- coord0 = LLVMBuildAdd(builder, coord0, bias, "");
- coord0 = LLVMBuildURem(builder, coord0, length, "");
+ LLVMValueRef weight;
+ LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
+ lp_build_coord_repeat_npot_linear(bld, coord_f,
+ length, length_f,
+ &coord0, &weight);
+ weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
+ *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
}
mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
@@ -217,6 +299,11 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
break;
case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ /* XXX this might be slower than the separate path
+ * on some newer cpus. With sse41 this is 8 instructions vs. 7
+ * - at least on SNB this is almost certainly slower since
+ * min/max are cheaper than selects, and the muls aren't bad.
+ */
lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
@@ -249,6 +336,176 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
/**
+ * Build LLVM code for texture coord wrapping, for linear filtering,
+ * for float texcoords.
+ * \param block_length is the length of the pixel block along the
+ * coordinate axis
+ * \param coord the incoming texcoord (s,t,r or q)
+ * \param length the texture size along one dimension
+ * \param is_pot if TRUE, length is a power of two
+ * \param wrap_mode one of PIPE_TEX_WRAP_x
+ * \param coord0 the first texcoord after wrapping, as int
+ * \param coord1 the second texcoord after wrapping, as int
+ * \param weight the filter weight as int (0-255)
+ * \param force_nearest if this coord actually uses nearest filtering
+ */
+static void
+lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
+ unsigned block_length,
+ LLVMValueRef coord,
+ LLVMValueRef length,
+ boolean is_pot,
+ unsigned wrap_mode,
+ LLVMValueRef *coord0,
+ LLVMValueRef *coord1,
+ LLVMValueRef *weight,
+ unsigned force_nearest)
+{
+ struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
+
+ switch(wrap_mode) {
+ case PIPE_TEX_WRAP_REPEAT:
+ if (is_pot) {
+ /* mul by size and subtract 0.5 */
+ coord = lp_build_mul(coord_bld, coord, length);
+ if (!force_nearest)
+ coord = lp_build_sub(coord_bld, coord, half);
+ *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
+ *coord1 = lp_build_ifloor(coord_bld, *coord1);
+ /* repeat wrap */
+ length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
+ *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
+ *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
+ }
+ else {
+ LLVMValueRef mask;
+ /* wrap with normalized floats is just fract */
+ coord = lp_build_fract(coord_bld, coord);
+ /* unnormalize */
+ coord = lp_build_mul(coord_bld, coord, length);
+ /*
+ * we avoided the 0.5/length division, have to fix up wrong
+ * edge cases with selects
+ */
+ *coord1 = lp_build_add(coord_bld, coord, half);
+ coord = lp_build_sub(coord_bld, coord, half);
+ *weight = lp_build_fract(coord_bld, coord);
+ mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+ PIPE_FUNC_LESS, coord, coord_bld->zero);
+ *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
+ *coord0 = lp_build_itrunc(coord_bld, *coord0);
+ mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+ PIPE_FUNC_LESS, *coord1, length);
+ *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
+ *coord1 = lp_build_itrunc(coord_bld, *coord1);
+ }
+ break;
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ if (bld->static_state->normalized_coords) {
+ /* mul by tex size */
+ coord = lp_build_mul(coord_bld, coord, length);
+ }
+ /* subtract 0.5 */
+ if (!force_nearest) {
+ coord = lp_build_sub(coord_bld, coord, half);
+ }
+ /* clamp to [0, length - 1] */
+ coord = lp_build_min(coord_bld, coord, length_minus_one);
+ coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+ *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
+ /* coord1 = min(coord1, length-1) */
+ *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
+ *coord1 = lp_build_itrunc(coord_bld, *coord1);
+ break;
+ default:
+ assert(0);
+ *coord0 = int_coord_bld->zero;
+ *coord1 = int_coord_bld->zero;
+ *weight = coord_bld->zero;
+ break;
+ }
+ *weight = lp_build_mul_imm(coord_bld, *weight, 256);
+ *weight = lp_build_itrunc(coord_bld, *weight);
+ return;
+}
+
+
+/**
+ * Fetch texels for image with nearest sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
+ LLVMValueRef data_ptr,
+ LLVMValueRef offset,
+ LLVMValueRef x_subcoord,
+ LLVMValueRef y_subcoord,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ /*
+ * Fetch the pixels as 4 x 32bit (rgba order might differ):
+ *
+ * rgba0 rgba1 rgba2 rgba3
+ *
+ * bit cast them into 16 x u8
+ *
+ * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+ *
+ * unpack them into two 8 x i16:
+ *
+ * r0 g0 b0 a0 r1 g1 b1 a1
+ * r2 g2 b2 a2 r3 g3 b3 a3
+ *
+ * The higher 8 bits of the resulting elements will be zero.
+ */
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ LLVMValueRef rgba8;
+ struct lp_build_context h16, u8n;
+ LLVMTypeRef u8n_vec_type;
+
+ lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+ lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
+ u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+
+ if (util_format_is_rgba8_variant(bld->format_desc)) {
+ /*
+ * Given the format is a rgba8, just read the pixels as is,
+ * without any swizzling. Swizzling will be done later.
+ */
+ rgba8 = lp_build_gather(bld->gallivm,
+ bld->texel_type.length,
+ bld->format_desc->block.bits,
+ bld->texel_type.width,
+ data_ptr, offset);
+
+ rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ }
+ else {
+ rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
+ bld->format_desc,
+ u8n.type,
+ data_ptr, offset,
+ x_subcoord,
+ y_subcoord);
+ }
+
+ /* Expand one 4*rgba8 to two 2*rgba16 */
+ lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
+ rgba8,
+ colors_lo, colors_hi);
+}
+
+
+/**
* Sample a single texture image with nearest sampling.
* If sampling a cube texture, r = cube face in [0,5].
* Return filtered color as two vectors of 16-bit fixed point values.
@@ -267,21 +524,19 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
{
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context i32, h16, u8n;
- LLVMTypeRef i32_vec_type, u8n_vec_type;
+ struct lp_build_context i32;
+ LLVMTypeRef i32_vec_type;
LLVMValueRef i32_c8;
LLVMValueRef width_vec, height_vec, depth_vec;
LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
+ LLVMValueRef s_float, t_float = NULL, r_float = NULL;
LLVMValueRef x_stride;
LLVMValueRef x_offset, offset;
LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
- lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32));
- lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16));
- lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8));
+ lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
- u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
lp_build_extract_image_sizes(bld,
bld->int_size_type,
@@ -291,6 +546,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
&height_vec,
&depth_vec);
+ s_float = s; t_float = t; r_float = r;
+
if (bld->static_state->normalized_coords) {
LLVMValueRef scaled_size;
LLVMValueRef flt_size;
@@ -334,7 +591,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
/* Do texcoord wrapping, compute texel offset */
lp_build_sample_wrap_nearest_int(bld,
bld->format_desc->block.width,
- s_ipart, width_vec, x_stride,
+ s_ipart, s_float,
+ width_vec, x_stride,
bld->static_state->pot_width,
bld->static_state->wrap_s,
&x_offset, &x_subcoord);
@@ -343,7 +601,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef y_offset;
lp_build_sample_wrap_nearest_int(bld,
bld->format_desc->block.height,
- t_ipart, height_vec, row_stride_vec,
+ t_ipart, t_float,
+ height_vec, row_stride_vec,
bld->static_state->pot_height,
bld->static_state->wrap_t,
&y_offset, &y_subcoord);
@@ -352,7 +611,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
LLVMValueRef z_offset;
lp_build_sample_wrap_nearest_int(bld,
1, /* block length (depth) */
- r_ipart, depth_vec, img_stride_vec,
+ r_ipart, r_float,
+ depth_vec, img_stride_vec,
bld->static_state->pot_depth,
bld->static_state->wrap_r,
&z_offset, &z_subcoord);
@@ -366,6 +626,196 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
}
}
+ lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ colors_lo, colors_hi);
+}
+
+
+/**
+ * Sample a single texture image with nearest sampling.
+ * If sampling a cube texture, r = cube face in [0,5].
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ * Does address calcs (except offsets) with floats.
+ * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
+ */
+static void
+lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
+ LLVMValueRef int_size,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+ {
+ const unsigned dims = bld->dims;
+ LLVMValueRef width_vec, height_vec, depth_vec;
+ LLVMValueRef offset;
+ LLVMValueRef x_subcoord, y_subcoord;
+ LLVMValueRef x_icoord, y_icoord, z_icoord;
+ LLVMValueRef flt_size;
+
+ flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
+
+ lp_build_extract_image_sizes(bld,
+ bld->float_size_type,
+ bld->coord_type,
+ flt_size,
+ &width_vec,
+ &height_vec,
+ &depth_vec);
+
+ /* Do texcoord wrapping */
+ lp_build_sample_wrap_nearest_float(bld,
+ s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s,
+ &x_icoord);
+
+ if (dims >= 2) {
+ lp_build_sample_wrap_nearest_float(bld,
+ t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t,
+ &y_icoord);
+
+ if (dims >= 3) {
+ lp_build_sample_wrap_nearest_float(bld,
+ r, depth_vec,
+ bld->static_state->pot_depth,
+ bld->static_state->wrap_r,
+ &z_icoord);
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ z_icoord = r;
+ }
+ }
+
+ /*
+ * From here on we deal with ints, and we should split up the 256bit
+ * vectors manually for better generated code.
+ */
+
+ /*
+ * compute texel offsets -
+ * cannot do offset calc with floats, difficult for block-based formats,
+ * and not enough precision anyway.
+ */
+ lp_build_sample_offset(&bld->int_coord_bld,
+ bld->format_desc,
+ x_icoord, y_icoord,
+ z_icoord,
+ row_stride_vec, img_stride_vec,
+ &offset,
+ &x_subcoord, &y_subcoord);
+
+ lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ colors_lo, colors_hi);
+}
+
+
+/**
+ * Fetch texels for image with linear sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef data_ptr,
+ LLVMValueRef offset[2][2][2],
+ LLVMValueRef x_subcoord[2],
+ LLVMValueRef y_subcoord[2],
+ LLVMValueRef s_fpart,
+ LLVMValueRef t_fpart,
+ LLVMValueRef r_fpart,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ const unsigned dims = bld->dims;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ struct lp_build_context h16, u8n;
+ LLVMTypeRef h16_vec_type, u8n_vec_type;
+ LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
+ LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef shuffle_lo, shuffle_hi;
+ LLVMValueRef s_fpart_lo, s_fpart_hi;
+ LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL;
+ LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL;
+ LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
+ LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
+ LLVMValueRef packed_lo, packed_hi;
+ unsigned i, j, k;
+ unsigned numj, numk;
+
+ lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
+ lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
+ h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
+ u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
+
+ /*
+ * Transform 4 x i32 in
+ *
+ * s_fpart = {s0, s1, s2, s3}
+ *
+ * into 8 x i16
+ *
+ * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+ *
+ * into two 8 x i16
+ *
+ * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
+ * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+ *
+ * and likewise for t_fpart. There is no risk of loosing precision here
+ * since the fractional parts only use the lower 8bits.
+ */
+ s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+ if (dims >= 2)
+ t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+ if (dims >= 3)
+ r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+
+ for (j = 0; j < h16.type.length; j += 4) {
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ unsigned subindex = 0;
+#else
+ unsigned subindex = 1;
+#endif
+ LLVMValueRef index;
+
+ index = LLVMConstInt(elem_type, j/2 + subindex, 0);
+ for (i = 0; i < 4; ++i)
+ shuffles_lo[j + i] = index;
+
+ index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+ for (i = 0; i < 4; ++i)
+ shuffles_hi[j + i] = index;
+ }
+
+ shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
+ shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+
+ s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+ shuffle_lo, "");
+ s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+ shuffle_hi, "");
+ if (dims >= 2) {
+ t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+ shuffle_lo, "");
+ t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+ shuffle_hi, "");
+ }
+ if (dims >= 3) {
+ r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+ shuffle_lo, "");
+ r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+ shuffle_hi, "");
+ }
+
/*
* Fetch the pixels as 4 x 32bit (rgba order might differ):
*
@@ -382,38 +832,129 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
*
* The higher 8 bits of the resulting elements will be zero.
*/
- {
- LLVMValueRef rgba8;
+ numj = 1 + (dims >= 2);
+ numk = 1 + (dims >= 3);
- if (util_format_is_rgba8_variant(bld->format_desc)) {
- /*
- * Given the format is a rgba8, just read the pixels as is,
- * without any swizzling. Swizzling will be done later.
- */
- rgba8 = lp_build_gather(bld->gallivm,
- bld->texel_type.length,
- bld->format_desc->block.bits,
- bld->texel_type.width,
- data_ptr, offset);
+ for (k = 0; k < numk; k++) {
+ for (j = 0; j < numj; j++) {
+ for (i = 0; i < 2; i++) {
+ LLVMValueRef rgba8;
+
+ if (util_format_is_rgba8_variant(bld->format_desc)) {
+ /*
+ * Given the format is a rgba8, just read the pixels as is,
+ * without any swizzling. Swizzling will be done later.
+ */
+ rgba8 = lp_build_gather(bld->gallivm,
+ bld->texel_type.length,
+ bld->format_desc->block.bits,
+ bld->texel_type.width,
+ data_ptr, offset[k][j][i]);
+
+ rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ }
+ else {
+ rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
+ bld->format_desc,
+ u8n.type,
+ data_ptr, offset[k][j][i],
+ x_subcoord[i],
+ y_subcoord[j]);
+ }
- rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+ /* Expand one 4*rgba8 to two 2*rgba16 */
+ lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
+ rgba8,
+ &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
+ }
}
- else {
- rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
- bld->format_desc,
- u8n.type,
- data_ptr, offset,
- x_subcoord,
- y_subcoord);
+ }
+
+ /*
+ * Linear interpolation with 8.8 fixed point.
+ */
+ if (bld->static_state->force_nearest_s) {
+ /* special case 1-D lerp */
+ packed_lo = lp_build_lerp(&h16,
+ t_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ t_fpart_hi,
+ neighbors_hi[0][1][0],
+ neighbors_hi[0][1][0]);
+ }
+ else if (bld->static_state->force_nearest_t) {
+ /* special case 1-D lerp */
+ packed_lo = lp_build_lerp(&h16,
+ s_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ s_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1]);
+ }
+ else {
+ /* general 1/2/3-D lerping */
+ if (dims == 1) {
+ packed_lo = lp_build_lerp(&h16,
+ s_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1]);
+
+ packed_hi = lp_build_lerp(&h16,
+ s_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1]);
}
+ else {
+ /* 2-D lerp */
+ packed_lo = lp_build_lerp_2d(&h16,
+ s_fpart_lo, t_fpart_lo,
+ neighbors_lo[0][0][0],
+ neighbors_lo[0][0][1],
+ neighbors_lo[0][1][0],
+ neighbors_lo[0][1][1]);
+
+ packed_hi = lp_build_lerp_2d(&h16,
+ s_fpart_hi, t_fpart_hi,
+ neighbors_hi[0][0][0],
+ neighbors_hi[0][0][1],
+ neighbors_hi[0][1][0],
+ neighbors_hi[0][1][1]);
+
+ if (dims >= 3) {
+ LLVMValueRef packed_lo2, packed_hi2;
+
+ /* lerp in the second z slice */
+ packed_lo2 = lp_build_lerp_2d(&h16,
+ s_fpart_lo, t_fpart_lo,
+ neighbors_lo[1][0][0],
+ neighbors_lo[1][0][1],
+ neighbors_lo[1][1][0],
+ neighbors_lo[1][1][1]);
- /* Expand one 4*rgba8 to two 2*rgba16 */
- lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
- rgba8,
- colors_lo, colors_hi);
+ packed_hi2 = lp_build_lerp_2d(&h16,
+ s_fpart_hi, t_fpart_hi,
+ neighbors_hi[1][0][0],
+ neighbors_hi[1][0][1],
+ neighbors_hi[1][1][0],
+ neighbors_hi[1][1][1]);
+ /* interp between two z slices */
+ packed_lo = lp_build_lerp(&h16, r_fpart_lo,
+ packed_lo, packed_lo2);
+ packed_hi = lp_build_lerp(&h16, r_fpart_hi,
+ packed_hi, packed_hi2);
+ }
+ }
}
-}
+ *colors_lo = packed_lo;
+ *colors_hi = packed_hi;
+}
/**
* Sample a single texture image with (bi-)(tri-)linear sampling.
@@ -433,33 +974,24 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
{
const unsigned dims = bld->dims;
LLVMBuilderRef builder = bld->gallivm->builder;
- struct lp_build_context i32, h16, u8n;
- LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+ struct lp_build_context i32;
+ LLVMTypeRef i32_vec_type;
LLVMValueRef i32_c8, i32_c128, i32_c255;
LLVMValueRef width_vec, height_vec, depth_vec;
- LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
- LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL;
- LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL;
+ LLVMValueRef s_ipart, s_fpart, s_float;
+ LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
+ LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
LLVMValueRef x_stride, y_stride, z_stride;
LLVMValueRef x_offset0, x_offset1;
LLVMValueRef y_offset0, y_offset1;
LLVMValueRef z_offset0, z_offset1;
LLVMValueRef offset[2][2][2]; /* [z][y][x] */
LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
- LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
- LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
- LLVMValueRef packed_lo, packed_hi;
unsigned x, y, z;
- unsigned i, j, k;
- unsigned numj, numk;
- lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32));
- lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16));
- lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8));
+ lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
- h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
- u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
lp_build_extract_image_sizes(bld,
bld->int_size_type,
@@ -469,6 +1001,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
&height_vec,
&depth_vec);
+ s_float = s; t_float = t; r_float = r;
+
if (bld->static_state->normalized_coords) {
LLVMValueRef scaled_size;
LLVMValueRef flt_size;
@@ -533,7 +1067,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
/* do texcoord wrapping and compute texel offsets */
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.width,
- s_ipart, width_vec, x_stride,
+ s_ipart, &s_fpart, s_float,
+ width_vec, x_stride,
bld->static_state->pot_width,
bld->static_state->wrap_s,
&x_offset0, &x_offset1,
@@ -548,7 +1083,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
if (dims >= 2) {
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.height,
- t_ipart, height_vec, y_stride,
+ t_ipart, &t_fpart, t_float,
+ height_vec, y_stride,
bld->static_state->pot_height,
bld->static_state->wrap_t,
&y_offset0, &y_offset1,
@@ -567,7 +1103,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
if (dims >= 3) {
lp_build_sample_wrap_linear_int(bld,
bld->format_desc->block.height,
- r_ipart, depth_vec, z_stride,
+ r_ipart, &r_fpart, r_float,
+ depth_vec, z_stride,
bld->static_state->pot_depth,
bld->static_state->wrap_r,
&z_offset0, &z_offset1,
@@ -593,212 +1130,175 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
}
}
- /*
- * Transform 4 x i32 in
- *
- * s_fpart = {s0, s1, s2, s3}
- *
- * into 8 x i16
- *
- * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
- *
- * into two 8 x i16
- *
- * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
- * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
- *
- * and likewise for t_fpart. There is no risk of loosing precision here
- * since the fractional parts only use the lower 8bits.
- */
- s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
- if (dims >= 2)
- t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
- if (dims >= 3)
- r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+ lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ s_fpart, t_fpart, r_fpart,
+ colors_lo, colors_hi);
+}
- {
- LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
- LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
- LLVMValueRef shuffle_lo;
- LLVMValueRef shuffle_hi;
- for (j = 0; j < h16.type.length; j += 4) {
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
- unsigned subindex = 0;
-#else
- unsigned subindex = 1;
-#endif
- LLVMValueRef index;
+/**
+ * Sample a single texture image with (bi-)(tri-)linear sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ * Does address calcs (except offsets) with floats.
+ * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
+ */
+static void
+lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
+ LLVMValueRef int_size,
+ LLVMValueRef row_stride_vec,
+ LLVMValueRef img_stride_vec,
+ LLVMValueRef data_ptr,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef *colors_lo,
+ LLVMValueRef *colors_hi)
+{
+ const unsigned dims = bld->dims;
+ LLVMValueRef width_vec, height_vec, depth_vec;
+ LLVMValueRef s_fpart;
+ LLVMValueRef t_fpart = NULL;
+ LLVMValueRef r_fpart = NULL;
+ LLVMValueRef x_stride, y_stride, z_stride;
+ LLVMValueRef x_offset0, x_offset1;
+ LLVMValueRef y_offset0, y_offset1;
+ LLVMValueRef z_offset0, z_offset1;
+ LLVMValueRef offset[2][2][2]; /* [z][y][x] */
+ LLVMValueRef x_subcoord[2], y_subcoord[2];
+ LLVMValueRef flt_size;
+ LLVMValueRef x_icoord0, x_icoord1;
+ LLVMValueRef y_icoord0, y_icoord1;
+ LLVMValueRef z_icoord0, z_icoord1;
+ unsigned x, y, z;
- index = LLVMConstInt(elem_type, j/2 + subindex, 0);
- for (i = 0; i < 4; ++i)
- shuffles_lo[j + i] = index;
+ flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
- index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
- for (i = 0; i < 4; ++i)
- shuffles_hi[j + i] = index;
- }
+ lp_build_extract_image_sizes(bld,
+ bld->float_size_type,
+ bld->coord_type,
+ flt_size,
+ &width_vec,
+ &height_vec,
+ &depth_vec);
- shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
- shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+ /* do texcoord wrapping and compute texel offsets */
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.width,
+ s, width_vec,
+ bld->static_state->pot_width,
+ bld->static_state->wrap_s,
+ &x_icoord0, &x_icoord1,
+ &s_fpart,
+ bld->static_state->force_nearest_s);
+
+ if (dims >= 2) {
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.height,
+ t, height_vec,
+ bld->static_state->pot_height,
+ bld->static_state->wrap_t,
+ &y_icoord0, &y_icoord1,
+ &t_fpart,
+ bld->static_state->force_nearest_t);
- s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
- shuffle_lo, "");
- s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
- shuffle_hi, "");
- if (dims >= 2) {
- t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
- shuffle_lo, "");
- t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
- shuffle_hi, "");
- }
if (dims >= 3) {
- r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
- shuffle_lo, "");
- r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
- shuffle_hi, "");
+ lp_build_sample_wrap_linear_float(bld,
+ bld->format_desc->block.height,
+ r, depth_vec,
+ bld->static_state->pot_depth,
+ bld->static_state->wrap_r,
+ &z_icoord0, &z_icoord1,
+ &r_fpart, 0);
}
}
/*
- * Fetch the pixels as 4 x 32bit (rgba order might differ):
- *
- * rgba0 rgba1 rgba2 rgba3
- *
- * bit cast them into 16 x u8
- *
- * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
- *
- * unpack them into two 8 x i16:
- *
- * r0 g0 b0 a0 r1 g1 b1 a1
- * r2 g2 b2 a2 r3 g3 b3 a3
- *
- * The higher 8 bits of the resulting elements will be zero.
+ * From here on we deal with ints, and we should split up the 256bit
+ * vectors manually for better generated code.
*/
- numj = 1 + (dims >= 2);
- numk = 1 + (dims >= 3);
- for (k = 0; k < numk; k++) {
- for (j = 0; j < numj; j++) {
- for (i = 0; i < 2; i++) {
- LLVMValueRef rgba8;
-
- if (util_format_is_rgba8_variant(bld->format_desc)) {
- /*
- * Given the format is a rgba8, just read the pixels as is,
- * without any swizzling. Swizzling will be done later.
- */
- rgba8 = lp_build_gather(bld->gallivm,
- bld->texel_type.length,
- bld->format_desc->block.bits,
- bld->texel_type.width,
- data_ptr, offset[k][j][i]);
-
- rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
- }
- else {
- rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
- bld->format_desc,
- u8n.type,
- data_ptr, offset[k][j][i],
- x_subcoord[i],
- y_subcoord[j]);
- }
-
- /* Expand one 4*rgba8 to two 2*rgba16 */
- lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
- rgba8,
- &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
- }
- }
- }
+ /* get pixel, row and image strides */
+ x_stride = lp_build_const_vec(bld->gallivm,
+ bld->int_coord_bld.type,
+ bld->format_desc->block.bits/8);
+ y_stride = row_stride_vec;
+ z_stride = img_stride_vec;
/*
- * Linear interpolation with 8.8 fixed point.
+ * compute texel offset -
+ * cannot do offset calc with floats, difficult for block-based formats,
+ * and not enough precision anyway.
*/
- if (bld->static_state->force_nearest_s) {
- /* special case 1-D lerp */
- packed_lo = lp_build_lerp(&h16,
- t_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
-
- packed_hi = lp_build_lerp(&h16,
- t_fpart_hi,
- neighbors_hi[0][1][0],
- neighbors_hi[0][1][0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.width,
+ x_icoord0, x_stride,
+ &x_offset0, &x_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.width,
+ x_icoord1, x_stride,
+ &x_offset1, &x_subcoord[1]);
+ for (z = 0; z < 2; z++) {
+ for (y = 0; y < 2; y++) {
+ offset[z][y][0] = x_offset0;
+ offset[z][y][1] = x_offset1;
+ }
}
- else if (bld->static_state->force_nearest_t) {
- /* special case 1-D lerp */
- packed_lo = lp_build_lerp(&h16,
- s_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
- packed_hi = lp_build_lerp(&h16,
- s_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1]);
+ if (dims >= 2) {
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.height,
+ y_icoord0, y_stride,
+ &y_offset0, &y_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ bld->format_desc->block.height,
+ y_icoord1, y_stride,
+ &y_offset1, &y_subcoord[1]);
+ for (z = 0; z < 2; z++) {
+ for (x = 0; x < 2; x++) {
+ offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
+ offset[z][0][x], y_offset0);
+ offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
+ offset[z][1][x], y_offset1);
+ }
+ }
}
- else {
- /* general 1/2/3-D lerping */
- if (dims == 1) {
- packed_lo = lp_build_lerp(&h16,
- s_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1]);
- packed_hi = lp_build_lerp(&h16,
- s_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1]);
+ if (dims >= 3) {
+ LLVMValueRef z_subcoord[2];
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ 1,
+ z_icoord0, z_stride,
+ &z_offset0, &z_subcoord[0]);
+ lp_build_sample_partial_offset(&bld->int_coord_bld,
+ 1,
+ z_icoord1, z_stride,
+ &z_offset1, &z_subcoord[1]);
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[0][y][x], z_offset0);
+ offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[1][y][x], z_offset1);
+ }
}
- else {
- /* 2-D lerp */
- packed_lo = lp_build_lerp_2d(&h16,
- s_fpart_lo, t_fpart_lo,
- neighbors_lo[0][0][0],
- neighbors_lo[0][0][1],
- neighbors_lo[0][1][0],
- neighbors_lo[0][1][1]);
-
- packed_hi = lp_build_lerp_2d(&h16,
- s_fpart_hi, t_fpart_hi,
- neighbors_hi[0][0][0],
- neighbors_hi[0][0][1],
- neighbors_hi[0][1][0],
- neighbors_hi[0][1][1]);
-
- if (dims >= 3) {
- LLVMValueRef packed_lo2, packed_hi2;
-
- /* lerp in the second z slice */
- packed_lo2 = lp_build_lerp_2d(&h16,
- s_fpart_lo, t_fpart_lo,
- neighbors_lo[1][0][0],
- neighbors_lo[1][0][1],
- neighbors_lo[1][1][0],
- neighbors_lo[1][1][1]);
-
- packed_hi2 = lp_build_lerp_2d(&h16,
- s_fpart_hi, t_fpart_hi,
- neighbors_hi[1][0][0],
- neighbors_hi[1][0][1],
- neighbors_hi[1][1][0],
- neighbors_hi[1][1][1]);
- /* interp between two z slices */
- packed_lo = lp_build_lerp(&h16, r_fpart_lo,
- packed_lo, packed_lo2);
- packed_hi = lp_build_lerp(&h16, r_fpart_hi,
- packed_hi, packed_hi2);
+ }
+ else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+ LLVMValueRef z_offset;
+ z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ /* The r coord is the cube face in [0,5] */
+ offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
+ offset[0][y][x], z_offset);
}
}
}
- *colors_lo = packed_lo;
- *colors_hi = packed_hi;
+ lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
+ x_subcoord, y_subcoord,
+ s_fpart, t_fpart, r_fpart,
+ colors_lo, colors_hi);
}
@@ -824,10 +1324,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
LLVMBuilderRef builder = bld->gallivm->builder;
LLVMValueRef size0;
LLVMValueRef size1;
- LLVMValueRef row_stride0_vec;
- LLVMValueRef row_stride1_vec;
- LLVMValueRef img_stride0_vec;
- LLVMValueRef img_stride1_vec;
+ LLVMValueRef row_stride0_vec = NULL;
+ LLVMValueRef row_stride1_vec = NULL;
+ LLVMValueRef img_stride0_vec = NULL;
+ LLVMValueRef img_stride1_vec = NULL;
LLVMValueRef data_ptr0;
LLVMValueRef data_ptr1;
LLVMValueRef colors0_lo, colors0_hi;
@@ -838,20 +1338,39 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
&size0,
&row_stride0_vec, &img_stride0_vec);
data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
- if (img_filter == PIPE_TEX_FILTER_NEAREST) {
- lp_build_sample_image_nearest(bld,
- size0,
- row_stride0_vec, img_stride0_vec,
- data_ptr0, s, t, r,
- &colors0_lo, &colors0_hi);
+ if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest_afloat(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
+ else {
+ assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+ lp_build_sample_image_linear_afloat(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
}
else {
- assert(img_filter == PIPE_TEX_FILTER_LINEAR);
- lp_build_sample_image_linear(bld,
- size0,
- row_stride0_vec, img_stride0_vec,
- data_ptr0, s, t, r,
- &colors0_lo, &colors0_hi);
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
+ else {
+ assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+ lp_build_sample_image_linear(bld,
+ size0,
+ row_stride0_vec, img_stride0_vec,
+ data_ptr0, s, t, r,
+ &colors0_lo, &colors0_hi);
+ }
}
/* Store the first level's colors in the output variables */
@@ -859,74 +1378,138 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
LLVMBuildStore(builder, colors0_hi, colors_hi_var);
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
- LLVMValueRef h16_scale = lp_build_const_float(bld->gallivm, 256.0);
- LLVMTypeRef i32_type = LLVMIntTypeInContext(bld->gallivm->context, 32);
+ LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
+ bld->perquadf_bld.type, 256.0);
+ LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type);
struct lp_build_if_state if_ctx;
LLVMValueRef need_lerp;
+ unsigned num_quads = bld->coord_bld.type.length / 4;
+ unsigned i;
- lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, "");
- lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16");
+ lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
+ lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
/* need_lerp = lod_fpart > 0 */
- need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
- lod_fpart, LLVMConstNull(i32_type),
- "need_lerp");
+ if (num_quads == 1) {
+ need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
+ lod_fpart, bld->perquadi_bld.zero,
+ "need_lerp");
+ }
+ else {
+ /*
+ * We'll do mip filtering if any of the quads need it.
+ * It might be better to split the vectors here and only fetch/filter
+ * quads which need it.
+ */
+ /*
+ * We need to clamp lod_fpart here since we can get negative
+ * values which would screw up filtering if not all
+ * lod_fpart values have same sign.
+ * We can however then skip the greater than comparison.
+ */
+ lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart,
+ bld->perquadi_bld.zero);
+ need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart);
+ }
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
{
struct lp_build_context h16_bld;
- lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16));
+ lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
/* sample the second mipmap level */
lp_build_mipmap_level_sizes(bld, ilevel1,
&size1,
&row_stride1_vec, &img_stride1_vec);
data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
- if (img_filter == PIPE_TEX_FILTER_NEAREST) {
- lp_build_sample_image_nearest(bld,
- size1,
- row_stride1_vec, img_stride1_vec,
- data_ptr1, s, t, r,
- &colors1_lo, &colors1_hi);
+
+ if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest_afloat(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
+ else {
+ lp_build_sample_image_linear_afloat(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
}
else {
- lp_build_sample_image_linear(bld,
- size1,
- row_stride1_vec, img_stride1_vec,
- data_ptr1, s, t, r,
- &colors1_lo, &colors1_hi);
+ if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+ lp_build_sample_image_nearest(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
+ else {
+ lp_build_sample_image_linear(bld,
+ size1,
+ row_stride1_vec, img_stride1_vec,
+ data_ptr1, s, t, r,
+ &colors1_lo, &colors1_hi);
+ }
}
/* interpolate samples from the two mipmap levels */
- lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
- lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
+ if (num_quads == 1) {
+ lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
+ lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
#if HAVE_LLVM == 0x208
- /* This is a work-around for a bug in LLVM 2.8.
- * Evidently, something goes wrong in the construction of the
- * lod_fpart short[8] vector. Adding this no-effect shuffle seems
- * to force the vector to be properly constructed.
- * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
- */
- {
- LLVMValueRef shuffles[8], shuffle;
- int i;
- assert(h16_bld.type.length <= Elements(shuffles));
- for (i = 0; i < h16_bld.type.length; i++)
- shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
- shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
- lod_fpart = LLVMBuildShuffleVector(builder,
- lod_fpart, lod_fpart,
- shuffle, "");
- }
+ /* This is a work-around for a bug in LLVM 2.8.
+ * Evidently, something goes wrong in the construction of the
+ * lod_fpart short[8] vector. Adding this no-effect shuffle seems
+ * to force the vector to be properly constructed.
+ * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
+ */
+ {
+ LLVMValueRef shuffles[8], shuffle;
+ assert(h16_bld.type.length <= Elements(shuffles));
+ for (i = 0; i < h16_bld.type.length; i++)
+ shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
+ shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
+ lod_fpart = LLVMBuildShuffleVector(builder,
+ lod_fpart, lod_fpart,
+ shuffle, "");
+ }
#endif
- colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
- colors0_lo, colors1_lo);
- colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
- colors0_hi, colors1_hi);
+ colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
+ colors0_lo, colors1_lo);
+ colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
+ colors0_hi, colors1_hi);
+ }
+ else {
+ LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16];
+ struct lp_type perquadi16_type = bld->perquadi_bld.type;
+ perquadi16_type.width /= 2;
+ perquadi16_type.length *= 2;
+ lod_fpart = LLVMBuildBitCast(builder, lod_fpart,
+ lp_build_vec_type(bld->gallivm,
+ perquadi16_type), "");
+ /* XXX this only works for exactly 2 quads. More quads need shuffle */
+ assert(num_quads == 2);
+ for (i = 0; i < num_quads; i++) {
+ LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2);
+ lod_parts[i] = lp_build_extract_broadcast(bld->gallivm,
+ perquadi16_type,
+ h16_bld.type,
+ lod_fpart,
+ indexi2);
+ }
+ colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0],
+ colors0_lo, colors1_lo);
+ colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1],
+ colors0_hi, colors1_hi);
+ }
LLVMBuildStore(builder, colors0_lo, colors_lo_var);
LLVMBuildStore(builder, colors0_hi, colors_hi_var);
@@ -948,10 +1531,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
LLVMValueRef texel_out[4])
{
struct lp_build_context *int_bld = &bld->int_bld;
@@ -960,14 +1543,9 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
const unsigned min_filter = bld->static_state->min_img_filter;
const unsigned mag_filter = bld->static_state->mag_img_filter;
const unsigned dims = bld->dims;
- LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
- LLVMValueRef ilevel0, ilevel1 = NULL;
LLVMValueRef packed, packed_lo, packed_hi;
LLVMValueRef unswizzled[4];
- LLVMValueRef face_ddx[4], face_ddy[4];
struct lp_build_context h16_bld;
- LLVMValueRef first_level;
- LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0);
/* we only support the common/simple wrap modes at this time */
assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
@@ -978,81 +1556,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
/* make 16-bit fixed-pt builder context */
- lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16));
-
- /* cube face selection, compute pre-face coords, etc. */
- if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
- LLVMValueRef face, face_s, face_t;
- lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
- s = face_s; /* vec */
- t = face_t; /* vec */
- /* use 'r' to indicate cube face */
- r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
-
- /* recompute ddx, ddy using the new (s,t) face texcoords */
- face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
- face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
- face_ddx[2] = NULL;
- face_ddx[3] = NULL;
- face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
- face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
- face_ddy[2] = NULL;
- face_ddy[3] = NULL;
- ddx = face_ddx;
- ddy = face_ddy;
- }
-
- /*
- * Compute the level of detail (float).
- */
- if (min_filter != mag_filter ||
- mip_filter != PIPE_TEX_MIPFILTER_NONE) {
- /* Need to compute lod either to choose mipmap levels or to
- * distinguish between minification/magnification with one mipmap level.
- */
- lp_build_lod_selector(bld, unit, ddx, ddy,
- lod_bias, explicit_lod,
- mip_filter,
- &lod_ipart, &lod_fpart);
- } else {
- lod_ipart = i32t_zero;
- }
-
- /*
- * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
- */
- switch (mip_filter) {
- default:
- assert(0 && "bad mip_filter value in lp_build_sample_aos()");
- /* fall-through */
- case PIPE_TEX_MIPFILTER_NONE:
- /* always use mip level 0 */
- if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
- /* XXX this is a work-around for an apparent bug in LLVM 2.7.
- * We should be able to set ilevel0 = const(0) but that causes
- * bad x86 code to be emitted.
- */
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
- }
- else {
- first_level = bld->dynamic_state->first_level(bld->dynamic_state,
- bld->gallivm, unit);
- ilevel0 = first_level;
- }
- break;
- case PIPE_TEX_MIPFILTER_NEAREST:
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
- break;
- case PIPE_TEX_MIPFILTER_LINEAR:
- assert(lod_ipart);
- assert(lod_fpart);
- lp_build_linear_mip_levels(bld, unit,
- lod_ipart, &lod_fpart,
- &ilevel0, &ilevel1);
- break;
- }
+ lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
/*
* Get/interpolate texture colors.
@@ -1062,7 +1566,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi");
if (min_filter == mag_filter) {
- /* no need to distinquish between minification and magnification */
+ /* no need to distinguish between minification and magnification */
lp_build_sample_mipmap(bld,
min_filter, mip_filter,
s, t, r,
@@ -1106,7 +1610,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
* into 'packed'
*/
packed = lp_build_pack2(bld->gallivm,
- h16_bld.type, lp_type_unorm(8),
+ h16_bld.type, lp_type_unorm(8, bld->vector_width),
LLVMBuildLoad(builder, packed_lo, ""),
LLVMBuildLoad(builder, packed_hi, ""));
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
index 5d9ecac4d50..55b3bc1c09a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
@@ -46,10 +46,10 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
LLVMValueRef s,
LLVMValueRef t,
LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
LLVMValueRef texel_out[4]);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 73dc3e77083..aaef7970635 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -41,6 +41,7 @@
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
@@ -57,6 +58,7 @@
#include "lp_bld_sample_aos.h"
#include "lp_bld_struct.h"
#include "lp_bld_quad.h"
+#include "lp_bld_pack.h"
/**
@@ -221,6 +223,41 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
/**
+ * Helper to compute the first coord and the weight for
+ * linear wrap repeat npot textures
+ */
+void
+lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
+ LLVMValueRef coord_f,
+ LLVMValueRef length_i,
+ LLVMValueRef length_f,
+ LLVMValueRef *coord0_i,
+ LLVMValueRef *weight_f)
+{
+ struct lp_build_context *coord_bld = &bld->coord_bld;
+ struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+ LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
+ LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
+ int_coord_bld->one);
+ LLVMValueRef mask;
+ /* wrap with normalized floats is just fract */
+ coord_f = lp_build_fract(coord_bld, coord_f);
+ /* mul by size and subtract 0.5 */
+ coord_f = lp_build_mul(coord_bld, coord_f, length_f);
+ coord_f = lp_build_sub(coord_bld, coord_f, half);
+ /*
+ * we avoided the 0.5/length division before the repeat wrap,
+ * now need to fix up edge cases with selects
+ */
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
+ mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
+ PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
+ *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
+}
+
+
+/**
* Build LLVM code for texture wrap mode for linear filtering.
* \param x0_out returns first integer texcoord
* \param x1_out returns second integer texcoord
@@ -246,28 +283,27 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
- /* mul by size and subtract 0.5 */
- coord = lp_build_mul(coord_bld, coord, length_f);
- coord = lp_build_sub(coord_bld, coord, half);
- /* convert to int, compute lerp weight */
- lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
- /* repeat wrap */
if (is_pot) {
+ /* mul by size and subtract 0.5 */
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ coord = lp_build_sub(coord_bld, coord, half);
+ /* convert to int, compute lerp weight */
+ lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+ /* repeat wrap */
coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
}
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
LLVMValueRef mask;
- coord0 = LLVMBuildAdd(builder, coord0, bias, "");
- coord0 = LLVMBuildURem(builder, coord0, length, "");
- mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
+ lp_build_coord_repeat_npot_linear(bld, coord,
+ length, length_f,
+ &coord0, &weight);
+ mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
coord1 = LLVMBuildAnd(builder,
- lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
- mask, "");
+ lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
+ mask, "");
}
break;
@@ -444,15 +480,16 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
switch(wrap_mode) {
case PIPE_TEX_WRAP_REPEAT:
- coord = lp_build_mul(coord_bld, coord, length_f);
- icoord = lp_build_ifloor(coord_bld, coord);
- if (is_pot)
+ if (is_pot) {
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ icoord = lp_build_ifloor(coord_bld, coord);
icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
+ }
else {
- /* Add a bias to the texcoord to handle negative coords */
- LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
- icoord = LLVMBuildAdd(builder, icoord, bias, "");
- icoord = LLVMBuildURem(builder, icoord, length, "");
+ /* take fraction, unnormalize */
+ coord = lp_build_fract_safe(coord_bld, coord);
+ coord = lp_build_mul(coord_bld, coord, length_f);
+ icoord = lp_build_itrunc(coord_bld, coord);
}
break;
@@ -473,7 +510,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
break;
case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */
+ /* Note: this is the same as CLAMP_TO_EDGE, except min = -1 */
{
LLVMValueRef min, max;
@@ -873,12 +910,32 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
struct lp_build_if_state if_ctx;
LLVMValueRef need_lerp;
+ unsigned num_quads = bld->coord_bld.type.length / 4;
/* need_lerp = lod_fpart > 0 */
- need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
- lod_fpart,
- bld->float_bld.zero,
- "need_lerp");
+ if (num_quads == 1) {
+ need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
+ lod_fpart, bld->perquadf_bld.zero,
+ "need_lerp");
+ }
+ else {
+ /*
+ * We'll do mip filtering if any of the quads need it.
+ * It might be better to split the vectors here and only fetch/filter
+ * quads which need it.
+ */
+ /*
+ * We unfortunately need to clamp lod_fpart here since we can get
+ * negative values which would screw up filtering if not all
+ * lod_fpart values have same sign.
+ */
+ lod_fpart = lp_build_max(&bld->perquadf_bld, lod_fpart,
+ bld->perquadf_bld.zero);
+ need_lerp = lp_build_compare(bld->gallivm, bld->perquadf_bld.type,
+ PIPE_FUNC_GREATER,
+ lod_fpart, bld->perquadf_bld.zero);
+ need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, need_lerp);
+ }
lp_build_if(&if_ctx, bld->gallivm, need_lerp);
{
@@ -904,7 +961,10 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
/* interpolate samples from the two mipmap levels */
- lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart);
+ lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
+ bld->perquadf_bld.type,
+ bld->texel_bld.type,
+ lod_fpart);
for (chan = 0; chan < 4; chan++) {
colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
@@ -916,37 +976,28 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
}
}
-
-
/**
- * General texture sampling codegen.
- * This function handles texture sampling for all texture targets (1D,
- * 2D, 3D, cube) and all filtering modes.
+ * Calculate cube face, lod, mip levels.
*/
static void
-lp_build_sample_general(struct lp_build_sample_context *bld,
- unsigned unit,
- LLVMValueRef s,
- LLVMValueRef t,
- LLVMValueRef r,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
- LLVMValueRef lod_bias, /* optional */
- LLVMValueRef explicit_lod, /* optional */
- LLVMValueRef *colors_out)
+lp_build_sample_common(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef *s,
+ LLVMValueRef *t,
+ LLVMValueRef *r,
+ const struct lp_derivatives *derivs,
+ LLVMValueRef lod_bias, /* optional */
+ LLVMValueRef explicit_lod, /* optional */
+ LLVMValueRef *lod_ipart,
+ LLVMValueRef *lod_fpart,
+ LLVMValueRef *ilevel0,
+ LLVMValueRef *ilevel1)
{
- struct lp_build_context *int_bld = &bld->int_bld;
- LLVMBuilderRef builder = bld->gallivm->builder;
const unsigned mip_filter = bld->static_state->min_mip_filter;
const unsigned min_filter = bld->static_state->min_img_filter;
const unsigned mag_filter = bld->static_state->mag_img_filter;
- LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
- LLVMValueRef ilevel0, ilevel1 = NULL;
- LLVMValueRef face_ddx[4], face_ddy[4];
- LLVMValueRef texels[4];
LLVMValueRef first_level;
- LLVMValueRef i32t_zero = lp_build_const_int32(bld->gallivm, 0);
- unsigned chan;
+ struct lp_derivatives face_derivs;
/*
printf("%s mip %d min %d mag %d\n", __FUNCTION__,
@@ -958,23 +1009,16 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
*/
if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
LLVMValueRef face, face_s, face_t;
- lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
- s = face_s; /* vec */
- t = face_t; /* vec */
+ lp_build_cube_lookup(bld, *s, *t, *r, &face, &face_s, &face_t);
+ *s = face_s; /* vec */
+ *t = face_t; /* vec */
/* use 'r' to indicate cube face */
- r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+ *r = face; /* vec */
/* recompute ddx, ddy using the new (s,t) face texcoords */
- face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
- face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
- face_ddx[2] = NULL;
- face_ddx[3] = NULL;
- face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
- face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
- face_ddy[2] = NULL;
- face_ddy[3] = NULL;
- ddx = face_ddx;
- ddy = face_ddy;
+ face_derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, *s, *t);
+ face_derivs.ddx_ddy[1] = NULL;
+ derivs = &face_derivs;
}
/*
@@ -985,12 +1029,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
/* Need to compute lod either to choose mipmap levels or to
* distinguish between minification/magnification with one mipmap level.
*/
- lp_build_lod_selector(bld, unit, ddx, ddy,
+ lp_build_lod_selector(bld, unit, derivs,
lod_bias, explicit_lod,
mip_filter,
- &lod_ipart, &lod_fpart);
+ lod_ipart, lod_fpart);
} else {
- lod_ipart = i32t_zero;
+ *lod_ipart = bld->perquadi_bld.zero;
}
/*
@@ -1006,28 +1050,56 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
/* XXX this is a work-around for an apparent bug in LLVM 2.7.
* We should be able to set ilevel0 = const(0) but that causes
* bad x86 code to be emitted.
+ * XXX should probably disable that on other llvm versions.
*/
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+ assert(*lod_ipart);
+ lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
}
else {
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, unit);
- ilevel0 = first_level;
+ first_level = lp_build_broadcast_scalar(&bld->perquadi_bld, first_level);
+ *ilevel0 = first_level;
}
break;
case PIPE_TEX_MIPFILTER_NEAREST:
- assert(lod_ipart);
- lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+ assert(*lod_ipart);
+ lp_build_nearest_mip_level(bld, unit, *lod_ipart, ilevel0);
break;
case PIPE_TEX_MIPFILTER_LINEAR:
- assert(lod_ipart);
- assert(lod_fpart);
+ assert(*lod_ipart);
+ assert(*lod_fpart);
lp_build_linear_mip_levels(bld, unit,
- lod_ipart, &lod_fpart,
- &ilevel0, &ilevel1);
+ *lod_ipart, lod_fpart,
+ ilevel0, ilevel1);
break;
}
+}
+
+/**
+ * General texture sampling codegen.
+ * This function handles texture sampling for all texture targets (1D,
+ * 2D, 3D, cube) and all filtering modes.
+ */
+static void
+lp_build_sample_general(struct lp_build_sample_context *bld,
+ unsigned unit,
+ LLVMValueRef s,
+ LLVMValueRef t,
+ LLVMValueRef r,
+ LLVMValueRef lod_ipart,
+ LLVMValueRef lod_fpart,
+ LLVMValueRef ilevel0,
+ LLVMValueRef ilevel1,
+ LLVMValueRef *colors_out)
+{
+ struct lp_build_context *int_bld = &bld->int_bld;
+ LLVMBuilderRef builder = bld->gallivm->builder;
+ const unsigned mip_filter = bld->static_state->min_mip_filter;
+ const unsigned min_filter = bld->static_state->min_img_filter;
+ const unsigned mag_filter = bld->static_state->mag_img_filter;
+ LLVMValueRef texels[4];
+ unsigned chan;
/*
* Get/interpolate texture colors.
@@ -1039,7 +1111,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
}
if (min_filter == mag_filter) {
- /* no need to distinquish between minification and magnification */
+ /* no need to distinguish between minification and magnification */
lp_build_sample_mipmap(bld, unit,
min_filter, mip_filter,
s, t, r,
@@ -1135,7 +1207,10 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
* For debugging.
*/
void
-lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
+lp_build_sample_nop(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_coords,
+ const LLVMValueRef *coords,
LLVMValueRef texel_out[4])
{
LLVMValueRef one = lp_build_one(gallivm, type);
@@ -1152,8 +1227,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm, struct lp_type type,
* 'texel' will return a vector of four LLVMValueRefs corresponding to
* R, G, B, A.
* \param type vector float type to use for coords, etc.
- * \param ddx partial derivatives of (s,t,r,q) with respect to x
- * \param ddy partial derivatives of (s,t,r,q) with respect to y
+ * \param derivs partial derivatives of (s,t,r,q) with respect to x and y
*/
void
lp_build_sample_soa(struct gallivm_state *gallivm,
@@ -1163,8 +1237,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef ddx[4],
- const LLVMValueRef ddy[4],
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef texel_out[4])
@@ -1173,10 +1246,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
struct lp_build_sample_context bld;
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef tex_width, tex_height, tex_depth;
LLVMValueRef s;
LLVMValueRef t;
LLVMValueRef r;
- struct lp_type float_vec_type;
if (0) {
enum pipe_format fmt = static_state->format;
@@ -1193,6 +1266,8 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld.format_desc = util_format_description(static_state->format);
bld.dims = dims;
+ bld.vector_width = lp_type_width(type);
+
bld.float_type = lp_type_float(32);
bld.int_type = lp_type_int(32);
bld.coord_type = type;
@@ -1201,22 +1276,26 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld.float_size_type.length = dims > 1 ? 4 : 1;
bld.int_size_type = lp_int_type(bld.float_size_type);
bld.texel_type = type;
-
- float_vec_type = lp_type_float_vec(32);
+ bld.perquadf_type = type;
+ /* we want native vector size to be able to use our intrinsics */
+ bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
+ bld.perquadi_type = lp_int_type(bld.perquadf_type);
lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
- lp_build_context_init(&bld.float_vec_bld, gallivm, float_vec_type);
+ lp_build_context_init(&bld.float_vec_bld, gallivm, type);
lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
+ lp_build_context_init(&bld.perquadf_bld, gallivm, bld.perquadf_type);
+ lp_build_context_init(&bld.perquadi_bld, gallivm, bld.perquadi_type);
/* Get the dynamic state */
- bld.width = dynamic_state->width(dynamic_state, gallivm, unit);
- bld.height = dynamic_state->height(dynamic_state, gallivm, unit);
- bld.depth = dynamic_state->depth(dynamic_state, gallivm, unit);
+ tex_width = dynamic_state->width(dynamic_state, gallivm, unit);
+ tex_height = dynamic_state->height(dynamic_state, gallivm, unit);
+ tex_depth = dynamic_state->depth(dynamic_state, gallivm, unit);
bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm, unit);
bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm, unit);
bld.data_array = dynamic_state->data_ptr(dynamic_state, gallivm, unit);
@@ -1228,37 +1307,40 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
/* width, height, depth as single int vector */
if (dims <= 1) {
- bld.int_size = bld.width;
+ bld.int_size = tex_width;
}
else {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef,
- bld.width, LLVMConstInt(i32t, 0, 0), "");
+ tex_width, LLVMConstInt(i32t, 0, 0), "");
if (dims >= 2) {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
- bld.height, LLVMConstInt(i32t, 1, 0), "");
+ tex_height, LLVMConstInt(i32t, 1, 0), "");
if (dims >= 3) {
bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
- bld.depth, LLVMConstInt(i32t, 2, 0), "");
+ tex_depth, LLVMConstInt(i32t, 2, 0), "");
}
}
}
if (0) {
/* For debug: no-op texture sampling */
- lp_build_sample_nop(gallivm, bld.texel_type, texel_out);
- }
- else if (util_format_fits_8unorm(bld.format_desc) &&
- lp_is_simple_wrap_mode(static_state->wrap_s) &&
- lp_is_simple_wrap_mode(static_state->wrap_t)) {
- /* do sampling/filtering with fixed pt arithmetic */
- lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy,
- lod_bias, explicit_lod,
+ lp_build_sample_nop(gallivm,
+ bld.texel_type,
+ num_coords,
+ coords,
texel_out);
}
-
else {
+ LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
+ LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
+ unsigned num_quads = type.length / 4;
+ const unsigned mip_filter = bld.static_state->min_mip_filter;
+ boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
+ lp_is_simple_wrap_mode(static_state->wrap_s) &&
+ lp_is_simple_wrap_mode(static_state->wrap_t);
+
if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
- util_format_fits_8unorm(bld.format_desc)) {
+ !use_aos && util_format_fits_8unorm(bld.format_desc)) {
debug_printf("%s: using floating point linear filtering for %s\n",
__FUNCTION__, bld.format_desc->short_name);
debug_printf(" min_img %d mag_img %d mip %d wraps %d wrapt %d\n",
@@ -1269,9 +1351,203 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
static_state->wrap_t);
}
- lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy,
- lod_bias, explicit_lod,
- texel_out);
+ lp_build_sample_common(&bld, unit,
+ &s, &t, &r,
+ derivs, lod_bias, explicit_lod,
+ &lod_ipart, &lod_fpart,
+ &ilevel0, &ilevel1);
+
+ /*
+ * we only try 8-wide sampling with soa as it appears to
+ * be a loss with aos with AVX.
+ */
+ if (num_quads == 1 || (mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+ !use_aos)) {
+
+ if (num_quads > 1) {
+ LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+ /* These parameters are the same for all quads */
+ lod_ipart = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+ ilevel0 = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+ }
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld, unit,
+ s, t, r,
+ lod_ipart, lod_fpart,
+ ilevel0, ilevel1,
+ texel_out);
+ }
+
+ else {
+ lp_build_sample_general(&bld, unit,
+ s, t, r,
+ lod_ipart, lod_fpart,
+ ilevel0, ilevel1,
+ texel_out);
+ }
+ }
+ else {
+ struct lp_build_if_state if_ctx;
+ LLVMValueRef notsame_levels, notsame;
+ LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+ LLVMValueRef texels[4];
+ LLVMValueRef texelout[4];
+ unsigned j;
+
+ texels[0] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texr");
+ texels[1] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texg");
+ texels[2] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texb");
+ texels[3] = lp_build_alloca(gallivm, bld.texel_bld.vec_type, "texa");
+
+ /* only build the if-statement if we MAY split, otherwise always split */
+ if (!use_aos) {
+ notsame = lp_build_extract_broadcast(gallivm,
+ bld.perquadi_bld.type,
+ bld.perquadi_bld.type,
+ ilevel0, index0);
+ notsame = lp_build_sub(&bld.perquadi_bld, ilevel0, notsame);
+ notsame_levels = lp_build_any_true_range(&bld.perquadi_bld, num_quads,
+ notsame);
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ notsame = lp_build_extract_broadcast(gallivm,
+ bld.perquadi_bld.type,
+ bld.perquadi_bld.type,
+ ilevel1, index0);
+ notsame = lp_build_sub(&bld.perquadi_bld, ilevel1, notsame);
+ notsame = lp_build_any_true_range(&bld.perquadi_bld, num_quads, notsame);
+ notsame_levels = LLVMBuildOr(builder, notsame_levels, notsame, "");
+ }
+ lp_build_if(&if_ctx, gallivm, notsame_levels);
+ }
+
+ {
+ struct lp_build_sample_context bld4;
+ struct lp_type type4 = type;
+ unsigned i;
+ LLVMValueRef texelout4[4];
+ LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
+
+ type4.length = 4;
+
+ /* Setup our build context */
+ memset(&bld4, 0, sizeof bld4);
+ bld4.gallivm = bld.gallivm;
+ bld4.static_state = bld.static_state;
+ bld4.dynamic_state = bld.dynamic_state;
+ bld4.format_desc = bld.format_desc;
+ bld4.dims = bld.dims;
+ bld4.row_stride_array = bld.row_stride_array;
+ bld4.img_stride_array = bld.img_stride_array;
+ bld4.data_array = bld.data_array;
+ bld4.int_size = bld.int_size;
+
+ bld4.vector_width = lp_type_width(type4);
+
+ bld4.float_type = lp_type_float(32);
+ bld4.int_type = lp_type_int(32);
+ bld4.coord_type = type4;
+ bld4.int_coord_type = lp_int_type(type4);
+ bld4.float_size_type = lp_type_float(32);
+ bld4.float_size_type.length = dims > 1 ? 4 : 1;
+ bld4.int_size_type = lp_int_type(bld4.float_size_type);
+ bld4.texel_type = type4;
+ bld4.perquadf_type = type4;
+ /* we want native vector size to be able to use our intrinsics */
+ bld4.perquadf_type.length = 1;
+ bld4.perquadi_type = lp_int_type(bld4.perquadf_type);
+
+ lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
+ lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
+ lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
+ lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
+ lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
+ lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
+ lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
+ lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
+ lp_build_context_init(&bld4.perquadf_bld, gallivm, bld4.perquadf_type);
+ lp_build_context_init(&bld4.perquadi_bld, gallivm, bld4.perquadi_type);
+
+ for (i = 0; i < num_quads; i++) {
+ LLVMValueRef s4, t4, r4;
+ LLVMValueRef lod_iparts, lod_fparts = NULL;
+ LLVMValueRef ilevel0s, ilevel1s = NULL;
+ LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
+
+ s4 = lp_build_extract_range(gallivm, s, 4*i, 4);
+ t4 = lp_build_extract_range(gallivm, t, 4*i, 4);
+ r4 = lp_build_extract_range(gallivm, r, 4*i, 4);
+ lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, indexi, "");
+ ilevel0s = LLVMBuildExtractElement(builder, ilevel0, indexi, "");
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ ilevel1s = LLVMBuildExtractElement(builder, ilevel1, indexi, "");
+ lod_fparts = LLVMBuildExtractElement(builder, lod_fpart, indexi, "");
+ }
+
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld4, unit,
+ s4, t4, r4,
+ lod_iparts, lod_fparts,
+ ilevel0s, ilevel1s,
+ texelout4);
+ }
+
+ else {
+ lp_build_sample_general(&bld4, unit,
+ s4, t4, r4,
+ lod_iparts, lod_fparts,
+ ilevel0s, ilevel1s,
+ texelout4);
+ }
+ for (j = 0; j < 4; j++) {
+ texelouttmp[j][i] = texelout4[j];
+ }
+ }
+ for (j = 0; j < 4; j++) {
+ texelout[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
+ LLVMBuildStore(builder, texelout[j], texels[j]);
+ }
+ }
+ if (!use_aos) {
+ LLVMValueRef ilevel0s, lod_iparts, ilevel1s = NULL;
+
+ lp_build_else(&if_ctx);
+
+ /* These parameters are the same for all quads */
+ lod_iparts = LLVMBuildExtractElement(builder, lod_ipart, index0, "");
+ ilevel0s = LLVMBuildExtractElement(builder, ilevel0, index0, "");
+ if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+ ilevel1s = LLVMBuildExtractElement(builder, ilevel1, index0, "");
+ }
+
+ if (use_aos) {
+ /* do sampling/filtering with fixed pt arithmetic */
+ lp_build_sample_aos(&bld, unit,
+ s, t, r,
+ lod_iparts, lod_fpart,
+ ilevel0s, ilevel1s,
+ texelout);
+ }
+
+ else {
+ lp_build_sample_general(&bld, unit,
+ s, t, r,
+ lod_iparts, lod_fpart,
+ ilevel0s, ilevel1s,
+ texelout);
+ }
+ for (j = 0; j < 4; j++) {
+ LLVMBuildStore(builder, texelout[j], texels[j]);
+ }
+
+ lp_build_endif(&if_ctx);
+ }
+
+ for (j = 0; j < 4; j++) {
+ texel_out[j] = LLVMBuildLoad(builder, texels[j], "");
+ }
+ }
}
lp_build_sample_compare(&bld, r, texel_out);
@@ -1283,6 +1559,7 @@ void
lp_build_size_query_soa(struct gallivm_state *gallivm,
const struct lp_sampler_static_state *static_state,
struct lp_sampler_dynamic_state *dynamic_state,
+ struct lp_type int_type,
unsigned unit,
LLVMValueRef explicit_lod,
LLVMValueRef *sizes_out)
@@ -1311,7 +1588,9 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
return;
}
- lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32));
+ assert(!int_type.floating);
+
+ lp_build_context_init(&bld_int_vec, gallivm, lp_type_int_vec(32, 128));
if (explicit_lod) {
LLVMValueRef first_level;
@@ -1345,7 +1624,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
size = lp_build_minify(&bld_int_vec, size, lod);
for (i=0; i < dims; i++) {
- sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, bld_int_vec.type,
+ sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec.type, int_type,
size,
lp_build_const_int32(gallivm, i));
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 5d4406812c7..641c960431d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -40,6 +40,7 @@
#include "lp_bld_init.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"
LLVMValueRef
@@ -95,7 +96,7 @@ lp_build_broadcast_scalar(struct lp_build_context *bld,
/**
- * Combined extract and broadcast (or a mere shuffle when the two types match)
+ * Combined extract and broadcast (mere shuffle in most cases)
*/
LLVMValueRef
lp_build_extract_broadcast(struct gallivm_state *gallivm,
@@ -132,9 +133,9 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
}
}
else {
- if (dst_type.length == src_type.length) {
+ if (dst_type.length > 1) {
/*
- * Special shuffle of the same size.
+ * shuffle - result can be of different length.
*/
LLVMValueRef shuffle;
@@ -142,28 +143,14 @@ lp_build_extract_broadcast(struct gallivm_state *gallivm,
LLVMVectorType(i32t, dst_type.length),
index);
res = LLVMBuildShuffleVector(gallivm->builder, vector,
- LLVMGetUndef(lp_build_vec_type(gallivm, dst_type)),
+ LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
shuffle, "");
}
else {
- LLVMValueRef scalar;
- scalar = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
- if (dst_type.length == 1) {
- /*
- * Trivial extract scalar from vector.
- */
-
- res = scalar;
- }
- else {
- /*
- * General case of different sized vectors.
- */
-
- res = lp_build_broadcast(gallivm,
- lp_build_vec_type(gallivm, dst_type),
- vector);
- }
+ /*
+ * Trivial extract scalar from vector.
+ */
+ res = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
}
}
@@ -290,6 +277,8 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
return bld->zero;
case PIPE_SWIZZLE_ONE:
return bld->one;
+ case LP_BLD_SWIZZLE_DONTCARE:
+ return bld->undef;
default:
assert(0);
return bld->undef;
@@ -319,21 +308,26 @@ lp_build_swizzle_aos(struct lp_build_context *bld,
case PIPE_SWIZZLE_BLUE:
case PIPE_SWIZZLE_ALPHA:
shuffle = j + swizzles[i];
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
break;
case PIPE_SWIZZLE_ZERO:
shuffle = type.length + 0;
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
if (!aux[0]) {
aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0);
}
break;
case PIPE_SWIZZLE_ONE:
shuffle = type.length + 1;
+ shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
if (!aux[1]) {
aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0);
}
break;
+ case LP_BLD_SWIZZLE_DONTCARE:
+ shuffles[j + i] = LLVMGetUndef(i32t);
+ break;
}
- shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
}
}
@@ -508,3 +502,127 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
lp_build_swizzle_soa(bld, unswizzled, swizzles, values);
}
+
+
+/**
+ * Transpose from AOS <-> SOA
+ *
+ * @param single_type_lp type of pixels
+ * @param src the 4 * n pixel input
+ * @param dst the 4 * n pixel output
+ */
+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+ struct lp_type single_type_lp,
+ const LLVMValueRef src[4],
+ LLVMValueRef dst[4])
+{
+ struct lp_type double_type_lp = single_type_lp;
+ LLVMTypeRef single_type;
+ LLVMTypeRef double_type;
+ LLVMValueRef t0, t1, t2, t3;
+
+ double_type_lp.length >>= 1;
+ double_type_lp.width <<= 1;
+
+ double_type = lp_build_vec_type(gallivm, double_type_lp);
+ single_type = lp_build_vec_type(gallivm, single_type_lp);
+
+ /* Interleave x, y, z, w -> xy and zw */
+ t0 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 0);
+ t1 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 0);
+ t2 = lp_build_interleave2_half(gallivm, single_type_lp, src[0], src[1], 1);
+ t3 = lp_build_interleave2_half(gallivm, single_type_lp, src[2], src[3], 1);
+
+ /* Cast to double width type for second interleave */
+ t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0");
+ t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1");
+ t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2");
+ t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3");
+
+ /* Interleave xy, zw -> xyzw */
+ dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0);
+ dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1);
+ dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0);
+ dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1);
+
+ /* Cast back to original single width type */
+ dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0");
+ dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1");
+ dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2");
+ dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3");
+}
+
+
+/**
+ * Pack first element of aos values,
+ * pad out to destination size.
+ * i.e. x1 _ _ _ x2 _ _ _ will become x1 x2 _ _
+ */
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src)
+{
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef undef = LLVMGetUndef(i32t);
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ unsigned num_src = src_type.length / 4;
+ unsigned num_dst = dst_type.length;
+ unsigned i;
+
+ assert(num_src <= num_dst);
+
+ for (i = 0; i < num_src; i++) {
+ shuffles[i] = LLVMConstInt(i32t, i * 4, 0);
+ }
+ for (i = num_src; i < num_dst; i++) {
+ shuffles[i] = undef;
+ }
+
+ if (num_dst == 1) {
+ return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], "");
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(shuffles, num_dst), "");
+ }
+}
+
+
+/**
+ * Unpack and broadcast packed aos values consisting of only the
+ * first value, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2
+ */
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src)
+{
+ LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ unsigned num_dst = dst_type.length;
+ unsigned num_src = dst_type.length / 4;
+ unsigned i;
+
+ assert(num_dst / 4 <= src_type.length);
+
+ for (i = 0; i < num_src; i++) {
+ shuffles[i*4] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+1] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+2] = LLVMConstInt(i32t, i, 0);
+ shuffles[i*4+3] = LLVMConstInt(i32t, i, 0);
+ }
+
+ if (num_src == 1) {
+ return lp_build_extract_broadcast(gallivm, src_type, dst_type,
+ src, shuffles[0]);
+ }
+ else {
+ return LLVMBuildShuffleVector(gallivm->builder, src, src,
+ LLVMConstVector(shuffles, num_dst), "");
+ }
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index c366a65103e..0bf4ce988a2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -44,6 +44,9 @@ struct lp_type;
struct lp_build_context;
+#define LP_BLD_SWIZZLE_DONTCARE 0xFF
+
+
LLVMValueRef
lp_build_broadcast(struct gallivm_state *gallivm,
LLVMTypeRef vec_type,
@@ -103,4 +106,25 @@ lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
const unsigned char swizzles[4]);
+void
+lp_build_transpose_aos(struct gallivm_state *gallivm,
+ struct lp_type type,
+ const LLVMValueRef src[4],
+ LLVMValueRef dst[4]);
+
+
+LLVMValueRef
+lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src);
+
+
+LLVMValueRef
+lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ const LLVMValueRef src);
+
+
#endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 4423bc5dedd..e292420a61a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -60,6 +60,7 @@ struct tgsi_token;
struct tgsi_shader_info;
struct lp_build_mask_context;
struct gallivm_state;
+struct lp_derivatives;
enum lp_build_tex_modifier {
@@ -174,8 +175,7 @@ struct lp_build_sampler_soa
unsigned unit,
unsigned num_coords,
const LLVMValueRef *coords,
- const LLVMValueRef *ddx,
- const LLVMValueRef *ddy,
+ const struct lp_derivatives *derivs,
LLVMValueRef lod_bias, /* optional */
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *texel);
@@ -183,6 +183,7 @@ struct lp_build_sampler_soa
void
(*emit_size_query)( const struct lp_build_sampler_soa *sampler,
struct gallivm_state *gallivm,
+ struct lp_type type,
unsigned unit,
LLVMValueRef explicit_lod, /* optional */
LLVMValueRef *sizes_out);
@@ -197,8 +198,7 @@ struct lp_build_sampler_aos
unsigned target, /* TGSI_TEXTURE_* */
unsigned unit,
LLVMValueRef coords,
- LLVMValueRef ddx,
- LLVMValueRef ddy,
+ const struct lp_derivatives derivs,
enum lp_build_tex_modifier modifier);
};
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 24bc13a9be8..0666bba7fbd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -56,6 +56,7 @@
#include "lp_bld_quad.h"
#include "lp_bld_tgsi.h"
#include "lp_bld_debug.h"
+#include "lp_bld_sample.h"
/**
@@ -363,6 +364,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
LLVMValueRef coords;
LLVMValueRef ddx;
LLVMValueRef ddy;
+ struct lp_derivatives derivs;
if (!bld->sampler) {
_debug_printf("warning: found texture instruction but no sampler generator supplied\n");
@@ -373,7 +375,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
coords = lp_build_emit_fetch( &bld->bld_base, inst, 0 , LP_CHAN_ALL);
- if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+ if (0 && modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
ddx = lp_build_emit_fetch( &bld->bld_base, inst, 1 , LP_CHAN_ALL);
ddy = lp_build_emit_fetch( &bld->bld_base, inst, 2 , LP_CHAN_ALL);
unit = inst->Src[3].Register.Index;
@@ -383,8 +385,8 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
ddy = lp_build_ddy( &bld->bld_base.base, coords );
#else
/* TODO */
- ddx = bld->bld_base.base.one;
- ddy = bld->bld_base.base.one;
+ derivs.ddx_ddy[0] = bld->bld_base.base.one;
+ derivs.ddx_ddy[1] = bld->bld_base.base.one;
#endif
unit = inst->Src[1].Register.Index;
}
@@ -392,7 +394,7 @@ emit_tex(struct lp_build_tgsi_aos_context *bld,
return bld->sampler->emit_fetch_texel(bld->sampler,
&bld->bld_base.base,
target, unit,
- coords, ddx, ddy,
+ coords, derivs,
modifier);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index d9faaf20273..85a4401b534 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -62,6 +62,7 @@
#include "lp_bld_limits.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
+#include "lp_bld_sample.h"
static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
@@ -763,7 +764,7 @@ emit_fetch_temporary(
else {
LLVMValueRef temp_ptr;
if (stype != TGSI_TYPE_FLOAT && stype != TGSI_TYPE_UNTYPED) {
- LLVMTypeRef itype = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
+ LLVMTypeRef itype = LLVMPointerType(bld->bld_base.int_bld.vec_type, 0);
LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
swizzle);
temp_ptr = LLVMBuildBitCast(builder, tint_ptr, itype, "");
@@ -1068,7 +1069,7 @@ emit_store_chan(
switch (dtype) {
case TGSI_TYPE_UNSIGNED:
case TGSI_TYPE_SIGNED: {
- LLVMTypeRef itype = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+ LLVMTypeRef itype = bld_base->int_bld.vec_type;
LLVMTypeRef ivtype = LLVMPointerType(itype, 0);
LLVMValueRef tint_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index,
chan_index);
@@ -1141,13 +1142,14 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
LLVMValueRef *texel)
{
LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+ struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
unsigned unit;
LLVMValueRef lod_bias, explicit_lod;
LLVMValueRef oow = NULL;
LLVMValueRef coords[3];
- LLVMValueRef ddx[3];
- LLVMValueRef ddy[3];
+ struct lp_derivatives derivs;
unsigned num_coords;
+ unsigned dims;
unsigned i;
if (!bld->sampler) {
@@ -1158,26 +1160,42 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
return;
}
+ derivs.ddx_ddy[0] = bld->bld_base.base.undef;
+ derivs.ddx_ddy[1] = bld->bld_base.base.undef;
+
switch (inst->Texture.Texture) {
case TGSI_TEXTURE_1D:
num_coords = 1;
+ dims = 1;
break;
case TGSI_TEXTURE_1D_ARRAY:
+ num_coords = 2;
+ dims = 1;
+ break;
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
num_coords = 2;
+ dims = 2;
break;
case TGSI_TEXTURE_SHADOW1D:
case TGSI_TEXTURE_SHADOW1D_ARRAY:
+ num_coords = 3;
+ dims = 1;
+ break;
case TGSI_TEXTURE_SHADOW2D:
case TGSI_TEXTURE_SHADOWRECT:
case TGSI_TEXTURE_2D_ARRAY:
- case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
num_coords = 3;
+ dims = 2;
+ break;
+ case TGSI_TEXTURE_3D:
+ num_coords = 3;
+ dims = 3;
break;
case TGSI_TEXTURE_SHADOW2D_ARRAY:
num_coords = 4;
+ dims = 2;
break;
default:
assert(0);
@@ -1212,31 +1230,66 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
}
if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
- LLVMValueRef index0 = lp_build_const_int32(bld->bld_base.base.gallivm, 0);
- for (i = 0; i < num_coords; i++) {
- LLVMValueRef src1 = lp_build_emit_fetch( &bld->bld_base, inst, 1, i );
- LLVMValueRef src2 = lp_build_emit_fetch( &bld->bld_base, inst, 2, i );
- ddx[i] = LLVMBuildExtractElement(builder, src1, index0, "");
- ddy[i] = LLVMBuildExtractElement(builder, src2, index0, "");
+ LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ LLVMValueRef ddxdyonec[3];
+ unsigned length = bld->bld_base.base.type.length;
+ unsigned num_quads = length / 4;
+ unsigned dim;
+ unsigned quad;
+
+ for (dim = 0; dim < dims; ++dim) {
+ LLVMValueRef srcx = lp_build_emit_fetch( &bld->bld_base, inst, 1, dim );
+ LLVMValueRef srcy = lp_build_emit_fetch( &bld->bld_base, inst, 2, dim );
+ for (quad = 0; quad < num_quads; ++quad) {
+ unsigned s1 = 4*quad;
+ unsigned s2 = 4*quad + length;
+ shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+ shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s2);
+ shuffles[4*quad + 2] = i32undef;
+ shuffles[4*quad + 3] = i32undef;
+ }
+ ddxdyonec[dim] = LLVMBuildShuffleVector(builder, srcx, srcy,
+ LLVMConstVector(shuffles, length), "");
+ }
+ if (dims == 1) {
+ derivs.ddx_ddy[0] = ddxdyonec[0];
+ }
+ else if (dims >= 2) {
+ for (quad = 0; quad < num_quads; ++quad) {
+ unsigned s1 = 4*quad;
+ unsigned s2 = 4*quad + length;
+ shuffles[4*quad + 0] = lp_build_const_int32(gallivm, s1);
+ shuffles[4*quad + 1] = lp_build_const_int32(gallivm, s1 + 1);
+ shuffles[4*quad + 2] = lp_build_const_int32(gallivm, s2);
+ shuffles[4*quad + 3] = lp_build_const_int32(gallivm, s2 + 1);
+ }
+ derivs.ddx_ddy[0] = LLVMBuildShuffleVector(builder, ddxdyonec[0], ddxdyonec[1],
+ LLVMConstVector(shuffles, length), "");
+ if (dims == 3) {
+ derivs.ddx_ddy[1] = ddxdyonec[2];
+ }
}
unit = inst->Src[3].Register.Index;
} else {
- for (i = 0; i < num_coords; i++) {
- ddx[i] = lp_build_scalar_ddx( &bld->bld_base.base, coords[i] );
- ddy[i] = lp_build_scalar_ddy( &bld->bld_base.base, coords[i] );
+ if (dims == 1) {
+ derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[0]);
+ }
+ else if (dims >= 2) {
+ derivs.ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(&bld->bld_base.base,
+ coords[0], coords[1]);
+ if (dims == 3) {
+ derivs.ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(&bld->bld_base.base, coords[2]);
+ }
}
unit = inst->Src[1].Register.Index;
}
- for (i = num_coords; i < 3; i++) {
- ddx[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
- ddy[i] = LLVMGetUndef(bld->bld_base.base.elem_type);
- }
bld->sampler->emit_fetch_texel(bld->sampler,
bld->bld_base.base.gallivm,
bld->bld_base.base.type,
unit, num_coords, coords,
- ddx, ddy,
+ &derivs,
lod_bias, explicit_lod,
texel);
}
@@ -1310,6 +1363,7 @@ emit_txq( struct lp_build_tgsi_soa_context *bld,
bld->sampler->emit_size_query(bld->sampler,
bld->bld_base.base.gallivm,
+ bld->bld_base.int_bld.type,
inst->Src[1].Register.Index,
explicit_lod,
sizes_out);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
index 413e69bedac..6c3aa38bfb1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -38,6 +38,9 @@ lp_build_elem_type(struct gallivm_state *gallivm, struct lp_type type)
{
if (type.floating) {
switch(type.width) {
+ case 16:
+ return LLVMIntTypeInContext(gallivm->context, 16);
+ break;
case 32:
return LLVMFloatTypeInContext(gallivm->context);
break;
@@ -85,6 +88,10 @@ lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type)
if (type.floating) {
switch(type.width) {
+ case 16:
+ if(elem_kind != LLVMIntegerTypeKind)
+ return FALSE;
+ break;
case 32:
if(elem_kind != LLVMFloatTypeKind)
return FALSE;
@@ -168,27 +175,6 @@ lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type)
/**
- * Build int32[4] vector type
- */
-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm)
-{
- struct lp_type t;
- LLVMTypeRef type;
-
- memset(&t, 0, sizeof(t));
- t.floating = FALSE; /* floating point values */
- t.sign = TRUE; /* values are signed */
- t.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */
- t.width = 32; /* 32-bit int */
- t.length = 4; /* 4 elements per vector */
-
- type = lp_build_int_elem_type(gallivm, t);
- return LLVMVectorType(type, t.length);
-}
-
-
-/**
* Create element of vector type
*/
struct lp_type
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index f11a190e7cc..75310e05f3e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -40,21 +40,35 @@
#include "pipe/p_compiler.h"
#include "gallivm/lp_bld.h"
+/**
+ * Native SIMD architecture width available at runtime.
+ *
+ * Using this width should give the best performance,
+ * and it determines the necessary alignment of vector variables.
+ */
+extern unsigned lp_native_vector_width;
+/**
+ * Maximum supported vector width (not necessarily supported at run-time).
+ *
+ * Should only be used when lp_native_vector_width isn't available,
+ * i.e. sizing/alignment of non-malloced variables.
+ */
+#define LP_MAX_VECTOR_WIDTH 256
/**
- * Native SIMD register width.
+ * Minimum vector alignment for static variable alignment
*
- * 128 for all architectures we care about.
+ * It should always be a constant equal to LP_MAX_VECTOR_WIDTH/8. An
+ * expression is non-portable.
*/
-#define LP_NATIVE_VECTOR_WIDTH 128
+#define LP_MIN_VECTOR_ALIGN 32
/**
* Several functions can only cope with vectors of length up to this value.
* You may need to increase that value if you want to represent bigger vectors.
*/
-#define LP_MAX_VECTOR_LENGTH 16
-
+#define LP_MAX_VECTOR_LENGTH (LP_MAX_VECTOR_WIDTH/8)
/**
* The LLVM type system can't conveniently express all the things we care about
@@ -151,6 +165,13 @@ struct lp_build_context
};
+static INLINE unsigned
+lp_type_width(struct lp_type type)
+{
+ return type.width * type.length;
+}
+
+
/** Create scalar float type */
static INLINE struct lp_type
lp_type_float(unsigned width)
@@ -169,7 +190,7 @@ lp_type_float(unsigned width)
/** Create vector of float type */
static INLINE struct lp_type
-lp_type_float_vec(unsigned width)
+lp_type_float_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
@@ -177,7 +198,7 @@ lp_type_float_vec(unsigned width)
res_type.floating = TRUE;
res_type.sign = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -200,14 +221,14 @@ lp_type_int(unsigned width)
/** Create vector int type */
static INLINE struct lp_type
-lp_type_int_vec(unsigned width)
+lp_type_int_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.sign = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -229,34 +250,34 @@ lp_type_uint(unsigned width)
/** Create vector uint type */
static INLINE struct lp_type
-lp_type_uint_vec(unsigned width)
+lp_type_uint_vec(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_unorm(unsigned width)
+lp_type_unorm(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.norm = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_fixed(unsigned width)
+lp_type_fixed(unsigned width, unsigned total_width)
{
struct lp_type res_type;
@@ -264,21 +285,21 @@ lp_type_fixed(unsigned width)
res_type.sign = TRUE;
res_type.fixed = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
static INLINE struct lp_type
-lp_type_ufixed(unsigned width)
+lp_type_ufixed(unsigned width, unsigned total_width)
{
struct lp_type res_type;
memset(&res_type, 0, sizeof res_type);
res_type.fixed = TRUE;
res_type.width = width;
- res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+ res_type.length = total_width / width;
return res_type;
}
@@ -312,10 +333,6 @@ LLVMTypeRef
lp_build_int_vec_type(struct gallivm_state *gallivm, struct lp_type type);
-LLVMTypeRef
-lp_build_int32_vec4_type(struct gallivm_state *gallivm);
-
-
static INLINE struct lp_type
lp_float32_vec4_type(void)
{
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 856e8d7a0ef..b44d9d9a0fe 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -35,9 +35,16 @@
#ifndef _UTIL_CPU_DETECT_H
#define _UTIL_CPU_DETECT_H
+
#include "pipe/p_compiler.h"
#include "pipe/p_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
struct util_cpu_caps {
unsigned nr_cpus;
@@ -66,4 +73,9 @@ util_cpu_caps;
void util_cpu_detect(void);
+#ifdef __cplusplus
+}
+#endif
+
+
#endif /* _UTIL_CPU_DETECT_H */