634 files changed, 58797 insertions, 39918 deletions
diff --git a/src/gallium/SConscript b/src/gallium/SConscript
index 89c69d7205e..8be84cddbe7 100644
--- a/src/gallium/SConscript
+++ b/src/gallium/SConscript
@@ -23,6 +23,7 @@ SConscript([
 	'auxiliary/pipebuffer/SConscript',
 	'auxiliary/indices/SConscript',
 	'auxiliary/rbug/SConscript',
+	'auxiliary/vl/SConscript',
 ])
 
 for driver in env['drivers']:
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index b04e98bfa12..69630e98bae 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -31,7 +31,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 
 #ifdef	__cplusplus
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 78953bccfc6..6c1cb48e8b8 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -104,7 +104,7 @@ static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
    unsigned clipped = 0;
    unsigned j;
 
-   if (0) debug_printf("%s\n");
+   if (0) debug_printf("%s\n", __FUNCTION__);
 
    for (j = 0; j < count; j++) {
       float *position = out->data[pos];
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 645d7cccbae..88bc790b621 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -537,19 +537,10 @@ static struct x86_reg fetch_src( struct aos_compilation *cp,
    unsigned abs = 0;
 
    for (i = 0; i < 4; i++) {
-      unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
+      unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
       unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
 
-      switch (swizzle) {
-      case TGSI_EXTSWIZZLE_ZERO:
-      case TGSI_EXTSWIZZLE_ONE:
-         AOS_ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
-         break;
-
-      default:
-         swz |= (swizzle & 0x3) << (i * 2);
-         break;
-      }
+      swz |= (swizzle & 0x3) << (i * 2);
 
       switch (neg) {
       case TGSI_UTIL_SIGN_TOGGLE:
@@ -632,23 +623,10 @@ static void x87_fld_src( struct aos_compilation *cp,
                                                 src->SrcRegister.File, 
                                                 src->SrcRegister.Index);
 
-   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
+   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
    unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
 
-   switch (swizzle) {
-   case TGSI_EXTSWIZZLE_ZERO:
-      x87_fldz( cp->func );
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      x87_fld1( cp->func );
-      break;
-
-   default:
-      x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
-      break;
-   }
-   
+   x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
 
    switch (neg) {
    case TGSI_UTIL_SIGN_TOGGLE:
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 2590546cb4a..4ef372233f0 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -46,7 +46,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 0d303634847..2ef4293d4d7 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -42,7 +42,7 @@
 #endif
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 #include "util/u_debug.h"
 #include "pipe/p_thread.h"
 #include "util/u_memory.h"
@@ -540,9 +540,9 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(!fenced_buf->fence);
       debug_printf("%10p %7u %7u\n",
-                   fenced_buf,
+                   (void *) fenced_buf,
                    fenced_buf->base.base.size,
-                   fenced_buf->base.base.reference.count);
+                   p_atomic_read(&fenced_buf->base.base.reference.count));
       curr = next; 
       next = curr->next;
    }
@@ -554,10 +554,10 @@ fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
       debug_printf("%10p %7u %7u %10p %s\n",
-                   fenced_buf,
+                   (void *) fenced_buf,
                    fenced_buf->base.base.size,
-                   fenced_buf->base.base.reference.count,
-                   fenced_buf->fence,
+                   p_atomic_read(&fenced_buf->base.base.reference.count),
+                   (void *) fenced_buf->fence,
                    signaled == 0 ? "y" : "n");
       curr = next; 
       next = curr->next;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
index 39ab8e722c1..8c8d7130781 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -51,7 +51,7 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 1b4df28c707..6e3214ca9c9 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -350,7 +350,7 @@ pb_debug_manager_dump(struct pb_debug_manager *mgr)
       buf = LIST_ENTRY(struct pb_debug_buffer, curr, head);
 
       debug_printf("buffer = %p\n", buf);
-      debug_printf("    .size = %p\n", buf->base.base.size);
+      debug_printf("    .size = 0x%x\n", buf->base.base.size);
       debug_backtrace_dump(buf->create_backtrace, PB_DEBUG_CREATE_BACKTRACE);
       
       curr = next; 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index e7352e90db9..d21910d0bf0 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -37,7 +37,6 @@
  */
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
 #include "util/u_debug.h"
 #include "pipe/p_thread.h"
 #include "pipe/p_defines.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.c b/src/gallium/auxiliary/pipebuffer/pb_validate.c
index 150fd506181..ce40c0cf0e6 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_validate.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_validate.c
@@ -34,7 +34,7 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 #include "util/u_memory.h"
 #include "util/u_debug.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.h b/src/gallium/auxiliary/pipebuffer/pb_validate.h
index dfb84df1cef..3c93f30f201 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_validate.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_validate.h
@@ -37,7 +37,7 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/gallium/auxiliary/rbug/README b/src/gallium/auxiliary/rbug/README
index 33d76371de4..d984067893c 100644
--- a/src/gallium/auxiliary/rbug/README
+++ b/src/gallium/auxiliary/rbug/README
@@ -16,6 +16,10 @@ for information about applications look in:
 
 progs/rbug/README
 
+for a GUI see:
+
+  http://cgit.freedesktop.org/mesa/rbug-gui
+
 
 --
 Jakob Bornecrantz <jakob@vmware.com>
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index e0cfc54420e..4fa10e2f7e3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -472,9 +472,9 @@ tgsi_default_full_instruction( void )
    unsigned i;
 
    full_instruction.Instruction = tgsi_default_instruction();
-   full_instruction.InstructionExtNv = tgsi_default_instruction_ext_nv();
    full_instruction.InstructionExtLabel = tgsi_default_instruction_ext_label();
    full_instruction.InstructionExtTexture = tgsi_default_instruction_ext_texture();
+   full_instruction.InstructionExtPredicate = tgsi_default_instruction_ext_predicate();
    for( i = 0;  i < TGSI_FULL_MAX_DST_REGISTERS; i++ ) {
       full_instruction.FullDstRegisters[i] = tgsi_default_full_dst_register();
    }
@@ -512,34 +512,6 @@ tgsi_build_full_instruction(
       header );
    prev_token = (struct tgsi_token  *) instruction;
 
-   if( tgsi_compare_instruction_ext_nv(
-         full_inst->InstructionExtNv,
-         tgsi_default_instruction_ext_nv() ) ) {
-      struct tgsi_instruction_ext_nv *instruction_ext_nv;
-
-      if( maxsize <= size )
-         return 0;
-      instruction_ext_nv =
-         (struct  tgsi_instruction_ext_nv *) &tokens[size];
-      size++;
-
-      *instruction_ext_nv  = tgsi_build_instruction_ext_nv(
-         full_inst->InstructionExtNv.Precision,
-         full_inst->InstructionExtNv.CondDstIndex,
-         full_inst->InstructionExtNv.CondFlowIndex,
-         full_inst->InstructionExtNv.CondMask,
-         full_inst->InstructionExtNv.CondSwizzleX,
-         full_inst->InstructionExtNv.CondSwizzleY,
-         full_inst->InstructionExtNv.CondSwizzleZ,
-         full_inst->InstructionExtNv.CondSwizzleW,
-         full_inst->InstructionExtNv.CondDstUpdate,
-         full_inst->InstructionExtNv.CondFlowEnable,
-         prev_token,
-         instruction,
-         header );
-      prev_token = (struct tgsi_token  *) instruction_ext_nv;
-   }
-
    if( tgsi_compare_instruction_ext_label(
          full_inst->InstructionExtLabel,
          tgsi_default_instruction_ext_label() ) ) {
@@ -578,6 +550,29 @@ tgsi_build_full_instruction(
       prev_token = (struct tgsi_token  *) instruction_ext_texture;
    }
 
+   if (tgsi_compare_instruction_ext_predicate(full_inst->InstructionExtPredicate,
+                                              tgsi_default_instruction_ext_predicate())) {
+      struct tgsi_instruction_ext_predicate *instruction_ext_predicate;
+
+      if (maxsize <= size) {
+         return 0;
+      }
+      instruction_ext_predicate = (struct tgsi_instruction_ext_predicate *)&tokens[size];
+      size++;
+
+      *instruction_ext_predicate =
+         tgsi_build_instruction_ext_predicate(full_inst->InstructionExtPredicate.SrcIndex,
+                                              full_inst->InstructionExtPredicate.Negate,
+                                              full_inst->InstructionExtPredicate.SwizzleX,
+                                              full_inst->InstructionExtPredicate.SwizzleY,
+                                              full_inst->InstructionExtPredicate.SwizzleZ,
+                                              full_inst->InstructionExtPredicate.SwizzleW,
+                                              prev_token,
+                                              instruction,
+                                              header);
+      prev_token = (struct tgsi_token *)instruction_ext_predicate;
+   }
+
    for( i = 0;  i <   full_inst->Instruction.NumDstRegs; i++ ) {
       const struct tgsi_full_dst_register *reg = &full_inst->FullDstRegisters[i];
       struct tgsi_dst_register *dst_register;
@@ -597,30 +592,6 @@ tgsi_build_full_instruction(
          header );
       prev_token = (struct tgsi_token  *) dst_register;
 
-      if( tgsi_compare_dst_register_ext_concode(
-            reg->DstRegisterExtConcode,
-            tgsi_default_dst_register_ext_concode() ) ) {
-         struct tgsi_dst_register_ext_concode *dst_register_ext_concode;
-
-         if( maxsize <= size )
-            return 0;
-         dst_register_ext_concode =
-            (struct  tgsi_dst_register_ext_concode *) &tokens[size];
-         size++;
-
-         *dst_register_ext_concode =   tgsi_build_dst_register_ext_concode(
-            reg->DstRegisterExtConcode.CondMask,
-            reg->DstRegisterExtConcode.CondSwizzleX,
-            reg->DstRegisterExtConcode.CondSwizzleY,
-            reg->DstRegisterExtConcode.CondSwizzleZ,
-            reg->DstRegisterExtConcode.CondSwizzleW,
-            reg->DstRegisterExtConcode.CondSrcIndex,
-            prev_token,
-            instruction,
-            header );
-         prev_token = (struct tgsi_token  *) dst_register_ext_concode;
-      }
-
       if( tgsi_compare_dst_register_ext_modulate(
             reg->DstRegisterExtModulate,
             tgsi_default_dst_register_ext_modulate() ) ) {
@@ -687,40 +658,6 @@ tgsi_build_full_instruction(
          header );
       prev_token = (struct tgsi_token  *) src_register;
 
-      if( tgsi_compare_src_register_ext_swz(
-            reg->SrcRegisterExtSwz,
-            tgsi_default_src_register_ext_swz() ) ) {
-         struct tgsi_src_register_ext_swz *src_register_ext_swz;
-
-         /* Use of the extended swizzle requires the simple swizzle to be identity.
-          */
-         assert( reg->SrcRegister.SwizzleX == TGSI_SWIZZLE_X );
-         assert( reg->SrcRegister.SwizzleY == TGSI_SWIZZLE_Y );
-         assert( reg->SrcRegister.SwizzleZ == TGSI_SWIZZLE_Z );
-         assert( reg->SrcRegister.SwizzleW == TGSI_SWIZZLE_W );
-         assert( reg->SrcRegister.Negate == FALSE );
-
-         if( maxsize <= size )
-            return 0;
-         src_register_ext_swz =
-            (struct  tgsi_src_register_ext_swz *) &tokens[size];
-         size++;
-
-         *src_register_ext_swz = tgsi_build_src_register_ext_swz(
-            reg->SrcRegisterExtSwz.ExtSwizzleX,
-            reg->SrcRegisterExtSwz.ExtSwizzleY,
-            reg->SrcRegisterExtSwz.ExtSwizzleZ,
-            reg->SrcRegisterExtSwz.ExtSwizzleW,
-            reg->SrcRegisterExtSwz.NegateX,
-            reg->SrcRegisterExtSwz.NegateY,
-            reg->SrcRegisterExtSwz.NegateZ,
-            reg->SrcRegisterExtSwz.NegateW,
-            prev_token,
-            instruction,
-            header );
-         prev_token = (struct tgsi_token  *) src_register_ext_swz;
-      }
-
       if( tgsi_compare_src_register_ext_mod(
             reg->SrcRegisterExtMod,
             tgsi_default_src_register_ext_mod() ) ) {
@@ -809,29 +746,6 @@ tgsi_build_full_instruction(
    return size;
 }
 
-struct tgsi_instruction_ext_nv
-tgsi_default_instruction_ext_nv( void )
-{
-   struct tgsi_instruction_ext_nv instruction_ext_nv;
-
-   instruction_ext_nv.Type = TGSI_INSTRUCTION_EXT_TYPE_NV;
-   instruction_ext_nv.Precision = TGSI_PRECISION_DEFAULT;
-   instruction_ext_nv.CondDstIndex = 0;
-   instruction_ext_nv.CondFlowIndex = 0;
-   instruction_ext_nv.CondMask = TGSI_CC_TR;
-   instruction_ext_nv.CondSwizzleX = TGSI_SWIZZLE_X;
-   instruction_ext_nv.CondSwizzleY = TGSI_SWIZZLE_Y;
-   instruction_ext_nv.CondSwizzleZ = TGSI_SWIZZLE_Z;
-   instruction_ext_nv.CondSwizzleW = TGSI_SWIZZLE_W;
-   instruction_ext_nv.CondDstUpdate = 0;
-   instruction_ext_nv.CondFlowEnable = 0;
-   instruction_ext_nv.Padding = 0;
-   instruction_ext_nv.Extended = 0;
-
-   return instruction_ext_nv;
-}
-
-
 /** test for inequality of 32-bit values pointed to by a and b */
 static INLINE boolean
 compare32(const void *a, const void *b)
@@ -839,53 +753,6 @@ compare32(const void *a, const void *b)
    return *((uint32_t *) a) != *((uint32_t *) b);
 }
 
-
-unsigned
-tgsi_compare_instruction_ext_nv(
-   struct tgsi_instruction_ext_nv a,
-   struct tgsi_instruction_ext_nv b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return compare32(&a, &b);
-}
-
-struct tgsi_instruction_ext_nv
-tgsi_build_instruction_ext_nv(
-   unsigned precision,
-   unsigned cond_dst_index,
-   unsigned cond_flow_index,
-   unsigned cond_mask,
-   unsigned cond_swizzle_x,
-   unsigned cond_swizzle_y,
-   unsigned cond_swizzle_z,
-   unsigned cond_swizzle_w,
-   unsigned cond_dst_update,
-   unsigned cond_flow_enable,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_instruction_ext_nv instruction_ext_nv;
-
-   instruction_ext_nv = tgsi_default_instruction_ext_nv();
-   instruction_ext_nv.Precision = precision;
-   instruction_ext_nv.CondDstIndex = cond_dst_index;
-   instruction_ext_nv.CondFlowIndex = cond_flow_index;
-   instruction_ext_nv.CondMask = cond_mask;
-   instruction_ext_nv.CondSwizzleX = cond_swizzle_x;
-   instruction_ext_nv.CondSwizzleY = cond_swizzle_y;
-   instruction_ext_nv.CondSwizzleZ = cond_swizzle_z;
-   instruction_ext_nv.CondSwizzleW = cond_swizzle_w;
-   instruction_ext_nv.CondDstUpdate = cond_dst_update;
-   instruction_ext_nv.CondFlowEnable = cond_flow_enable;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return instruction_ext_nv;
-}
-
 struct tgsi_instruction_ext_label
 tgsi_default_instruction_ext_label( void )
 {
@@ -968,6 +835,60 @@ tgsi_build_instruction_ext_texture(
    return instruction_ext_texture;
 }
 
+struct tgsi_instruction_ext_predicate
+tgsi_default_instruction_ext_predicate(void)
+{
+   struct tgsi_instruction_ext_predicate instruction_ext_predicate;
+
+   instruction_ext_predicate.Type = TGSI_INSTRUCTION_EXT_TYPE_PREDICATE;
+   instruction_ext_predicate.SwizzleX = TGSI_SWIZZLE_X;
+   instruction_ext_predicate.SwizzleY = TGSI_SWIZZLE_Y;
+   instruction_ext_predicate.SwizzleZ = TGSI_SWIZZLE_Z;
+   instruction_ext_predicate.SwizzleW = TGSI_SWIZZLE_W;
+   instruction_ext_predicate.Negate = 0;
+   instruction_ext_predicate.SrcIndex = 0;
+   instruction_ext_predicate.Padding = 0;
+   instruction_ext_predicate.Extended = 0;
+
+   return instruction_ext_predicate;
+}
+
+unsigned
+tgsi_compare_instruction_ext_predicate(struct tgsi_instruction_ext_predicate a,
+                                       struct tgsi_instruction_ext_predicate b)
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return compare32(&a, &b);
+}
+
+struct tgsi_instruction_ext_predicate
+tgsi_build_instruction_ext_predicate(unsigned index,
+                                     unsigned negate,
+                                     unsigned swizzleX,
+                                     unsigned swizzleY,
+                                     unsigned swizzleZ,
+                                     unsigned swizzleW,
+                                     struct tgsi_token *prev_token,
+                                     struct tgsi_instruction *instruction,
+                                     struct tgsi_header *header)
+{
+   struct tgsi_instruction_ext_predicate instruction_ext_predicate;
+
+   instruction_ext_predicate = tgsi_default_instruction_ext_predicate();
+   instruction_ext_predicate.SwizzleX = swizzleX;
+   instruction_ext_predicate.SwizzleY = swizzleY;
+   instruction_ext_predicate.SwizzleZ = swizzleZ;
+   instruction_ext_predicate.SwizzleW = swizzleW;
+   instruction_ext_predicate.Negate = negate;
+   instruction_ext_predicate.SrcIndex = index;
+
+   prev_token->Extended = 1;
+   instruction_grow(instruction, header);
+
+   return instruction_ext_predicate;
+}
+
 struct tgsi_src_register
 tgsi_default_src_register( void )
 {
@@ -1033,7 +954,6 @@ tgsi_default_full_src_register( void )
    struct tgsi_full_src_register full_src_register;
 
    full_src_register.SrcRegister = tgsi_default_src_register();
-   full_src_register.SrcRegisterExtSwz = tgsi_default_src_register_ext_swz();
    full_src_register.SrcRegisterExtMod = tgsi_default_src_register_ext_mod();
    full_src_register.SrcRegisterInd = tgsi_default_src_register();
    full_src_register.SrcRegisterDim = tgsi_default_dimension();
@@ -1042,76 +962,6 @@ tgsi_default_full_src_register( void )
    return full_src_register;
 }
 
-struct tgsi_src_register_ext_swz
-tgsi_default_src_register_ext_swz( void )
-{
-   struct tgsi_src_register_ext_swz src_register_ext_swz;
-
-   src_register_ext_swz.Type = TGSI_SRC_REGISTER_EXT_TYPE_SWZ;
-   src_register_ext_swz.ExtSwizzleX = TGSI_EXTSWIZZLE_X;
-   src_register_ext_swz.ExtSwizzleY = TGSI_EXTSWIZZLE_Y;
-   src_register_ext_swz.ExtSwizzleZ = TGSI_EXTSWIZZLE_Z;
-   src_register_ext_swz.ExtSwizzleW = TGSI_EXTSWIZZLE_W;
-   src_register_ext_swz.NegateX = 0;
-   src_register_ext_swz.NegateY = 0;
-   src_register_ext_swz.NegateZ = 0;
-   src_register_ext_swz.NegateW = 0;
-   src_register_ext_swz.Padding = 0;
-   src_register_ext_swz.Extended = 0;
-
-   return src_register_ext_swz;
-}
-
-unsigned
-tgsi_compare_src_register_ext_swz(
-   struct tgsi_src_register_ext_swz a,
-   struct tgsi_src_register_ext_swz b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return compare32(&a, &b);
-}
-
-struct tgsi_src_register_ext_swz
-tgsi_build_src_register_ext_swz(
-   unsigned ext_swizzle_x,
-   unsigned ext_swizzle_y,
-   unsigned ext_swizzle_z,
-   unsigned ext_swizzle_w,
-   unsigned negate_x,
-   unsigned negate_y,
-   unsigned negate_z,
-   unsigned negate_w,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_src_register_ext_swz src_register_ext_swz;
-
-   assert( ext_swizzle_x <= TGSI_EXTSWIZZLE_ONE );
-   assert( ext_swizzle_y <= TGSI_EXTSWIZZLE_ONE );
-   assert( ext_swizzle_z <= TGSI_EXTSWIZZLE_ONE );
-   assert( ext_swizzle_w <= TGSI_EXTSWIZZLE_ONE );
-   assert( negate_x <= 1 );
-   assert( negate_y <= 1 );
-   assert( negate_z <= 1 );
-   assert( negate_w <= 1 );
-
-   src_register_ext_swz = tgsi_default_src_register_ext_swz();
-   src_register_ext_swz.ExtSwizzleX = ext_swizzle_x;
-   src_register_ext_swz.ExtSwizzleY = ext_swizzle_y;
-   src_register_ext_swz.ExtSwizzleZ = ext_swizzle_z;
-   src_register_ext_swz.ExtSwizzleW = ext_swizzle_w;
-   src_register_ext_swz.NegateX = negate_x;
-   src_register_ext_swz.NegateY = negate_y;
-   src_register_ext_swz.NegateZ = negate_z;
-   src_register_ext_swz.NegateW = negate_w;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return src_register_ext_swz;
-}
 
 struct tgsi_src_register_ext_mod
 tgsi_default_src_register_ext_mod( void )
@@ -1253,77 +1103,12 @@ tgsi_default_full_dst_register( void )
 
    full_dst_register.DstRegister = tgsi_default_dst_register();
    full_dst_register.DstRegisterInd = tgsi_default_src_register();
-   full_dst_register.DstRegisterExtConcode =
-      tgsi_default_dst_register_ext_concode();
    full_dst_register.DstRegisterExtModulate =
       tgsi_default_dst_register_ext_modulate();
 
    return full_dst_register;
 }
 
-struct tgsi_dst_register_ext_concode
-tgsi_default_dst_register_ext_concode( void )
-{
-   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
-
-   dst_register_ext_concode.Type = TGSI_DST_REGISTER_EXT_TYPE_CONDCODE;
-   dst_register_ext_concode.CondMask = TGSI_CC_TR;
-   dst_register_ext_concode.CondSwizzleX = TGSI_SWIZZLE_X;
-   dst_register_ext_concode.CondSwizzleY = TGSI_SWIZZLE_Y;
-   dst_register_ext_concode.CondSwizzleZ = TGSI_SWIZZLE_Z;
-   dst_register_ext_concode.CondSwizzleW = TGSI_SWIZZLE_W;
-   dst_register_ext_concode.CondSrcIndex = 0;
-   dst_register_ext_concode.Padding = 0;
-   dst_register_ext_concode.Extended = 0;
-
-   return dst_register_ext_concode;
-}
-
-unsigned
-tgsi_compare_dst_register_ext_concode(
-   struct tgsi_dst_register_ext_concode a,
-   struct tgsi_dst_register_ext_concode b )
-{
-   a.Padding = b.Padding = 0;
-   a.Extended = b.Extended = 0;
-   return compare32(&a, &b);
-}
-
-struct tgsi_dst_register_ext_concode
-tgsi_build_dst_register_ext_concode(
-   unsigned cc,
-   unsigned swizzle_x,
-   unsigned swizzle_y,
-   unsigned swizzle_z,
-   unsigned swizzle_w,
-   int index,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header )
-{
-   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
-
-   assert( cc <= TGSI_CC_FL );
-   assert( swizzle_x <= TGSI_SWIZZLE_W );
-   assert( swizzle_y <= TGSI_SWIZZLE_W );
-   assert( swizzle_z <= TGSI_SWIZZLE_W );
-   assert( swizzle_w <= TGSI_SWIZZLE_W );
-   assert( index >= -32768 && index <= 32767 );
-
-   dst_register_ext_concode = tgsi_default_dst_register_ext_concode();
-   dst_register_ext_concode.CondMask = cc;
-   dst_register_ext_concode.CondSwizzleX = swizzle_x;
-   dst_register_ext_concode.CondSwizzleY = swizzle_y;
-   dst_register_ext_concode.CondSwizzleZ = swizzle_z;
-   dst_register_ext_concode.CondSwizzleW = swizzle_w;
-   dst_register_ext_concode.CondSrcIndex = index;
-
-   prev_token->Extended = 1;
-   instruction_grow( instruction, header );
-
-   return dst_register_ext_concode;
-}
-
 struct tgsi_dst_register_ext_modulate
 tgsi_default_dst_register_ext_modulate( void )
 {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h
index 17d977b0597..669712eb8f9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.h
@@ -157,30 +157,6 @@ tgsi_build_full_instruction(
    struct tgsi_header *header,
    unsigned maxsize );
 
-struct tgsi_instruction_ext_nv
-tgsi_default_instruction_ext_nv( void );
-
-unsigned
-tgsi_compare_instruction_ext_nv(
-   struct tgsi_instruction_ext_nv a,
-   struct tgsi_instruction_ext_nv b );
-
-struct tgsi_instruction_ext_nv
-tgsi_build_instruction_ext_nv(
-   unsigned precision,
-   unsigned cond_dst_index,
-   unsigned cond_flow_index,
-   unsigned cond_mask,
-   unsigned cond_swizzle_x,
-   unsigned cond_swizzle_y,
-   unsigned cond_swizzle_z,
-   unsigned cond_swizzle_w,
-   unsigned cond_dst_update,
-   unsigned cond_flow_enable,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
 struct tgsi_instruction_ext_label
 tgsi_default_instruction_ext_label( void );
 
@@ -211,6 +187,24 @@ tgsi_build_instruction_ext_texture(
    struct tgsi_instruction *instruction,
    struct tgsi_header *header );
 
+struct tgsi_instruction_ext_predicate
+tgsi_default_instruction_ext_predicate(void);
+
+unsigned
+tgsi_compare_instruction_ext_predicate(struct tgsi_instruction_ext_predicate a,
+                                       struct tgsi_instruction_ext_predicate b);
+
+struct tgsi_instruction_ext_predicate
+tgsi_build_instruction_ext_predicate(unsigned index,
+                                     unsigned negate,
+                                     unsigned swizzleX,
+                                     unsigned swizzleY,
+                                     unsigned swizzleZ,
+                                     unsigned swizzleW,
+                                     struct tgsi_token *prev_token,
+                                     struct tgsi_instruction *instruction,
+                                     struct tgsi_header *header);
+
 struct tgsi_src_register
 tgsi_default_src_register( void );
 
@@ -231,28 +225,6 @@ tgsi_build_src_register(
 struct tgsi_full_src_register
 tgsi_default_full_src_register( void );
 
-struct tgsi_src_register_ext_swz
-tgsi_default_src_register_ext_swz( void );
-
-unsigned
-tgsi_compare_src_register_ext_swz(
-   struct tgsi_src_register_ext_swz a,
-   struct tgsi_src_register_ext_swz b );
-
-struct tgsi_src_register_ext_swz
-tgsi_build_src_register_ext_swz(
-   unsigned ext_swizzle_x,
-   unsigned ext_swizzle_y,
-   unsigned ext_swizzle_z,
-   unsigned ext_swizzle_w,
-   unsigned negate_x,
-   unsigned negate_y,
-   unsigned negate_z,
-   unsigned negate_w,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
 struct tgsi_src_register_ext_mod
 tgsi_default_src_register_ext_mod( void );
 
@@ -297,26 +269,6 @@ tgsi_build_dst_register(
 struct tgsi_full_dst_register
 tgsi_default_full_dst_register( void );
 
-struct tgsi_dst_register_ext_concode
-tgsi_default_dst_register_ext_concode( void );
-
-unsigned
-tgsi_compare_dst_register_ext_concode(
-   struct tgsi_dst_register_ext_concode a,
-   struct tgsi_dst_register_ext_concode b );
-
-struct tgsi_dst_register_ext_concode
-tgsi_build_dst_register_ext_concode(
-   unsigned cc,
-   unsigned swizzle_x,
-   unsigned swizzle_y,
-   unsigned swizzle_z,
-   unsigned swizzle_w,
-   int index,
-   struct tgsi_token *prev_token,
-   struct tgsi_instruction *instruction,
-   struct tgsi_header *header );
-
 struct tgsi_dst_register_ext_modulate
 tgsi_default_dst_register_ext_modulate( void );
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 111d95b6665..d16e64f9c59 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -100,7 +100,8 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "SAMP",
    "ADDR",
    "IMM",
-   "LOOP"
+   "LOOP",
+   "PRED"
 };
 
 static const char *interpolate_names[] =
@@ -148,15 +149,6 @@ static const char *texture_names[] =
    "SHADOWRECT"
 };
 
-static const char *extswizzle_names[] =
-{
-   "x",
-   "y",
-   "z",
-   "w",
-   "0",
-   "1"
-};
 
 static const char *modulate_names[TGSI_MODULATE_COUNT] =
 {
@@ -446,24 +438,6 @@ iter_instruction(
          ENM( src->SrcRegister.SwizzleZ, swizzle_names );
          ENM( src->SrcRegister.SwizzleW, swizzle_names );
       }
-      if (src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X ||
-          src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y ||
-          src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z ||
-          src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W) {
-         CHR( '.' );
-         if (src->SrcRegisterExtSwz.NegateX)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleX, extswizzle_names );
-         if (src->SrcRegisterExtSwz.NegateY)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleY, extswizzle_names );
-         if (src->SrcRegisterExtSwz.NegateZ)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, extswizzle_names );
-         if (src->SrcRegisterExtSwz.NegateW)
-            TXT("-");
-         ENM( src->SrcRegisterExtSwz.ExtSwizzleW, extswizzle_names );
-      }
 
       if (src->SrcRegisterExtMod.Complement)
          CHR( ')' );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
index 4a9c02b1413..4648051e29e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
@@ -79,7 +79,8 @@ static const char *TGSI_FILES[TGSI_FILE_COUNT] =
    "FILE_SAMPLER",
    "FILE_ADDRESS",
    "FILE_IMMEDIATE",
-   "FILE_LOOP"
+   "FILE_LOOP",
+   "FILE_PREDICATE"
 };
 
 static const char *TGSI_INTERPOLATES[] =
@@ -114,32 +115,11 @@ static const char *TGSI_SATS[] =
 
 static const char *TGSI_INSTRUCTION_EXTS[] =
 {
-   "INSTRUCTION_EXT_TYPE_NV",
+   "",
    "INSTRUCTION_EXT_TYPE_LABEL",
    "INSTRUCTION_EXT_TYPE_TEXTURE"
 };
 
-static const char *TGSI_PRECISIONS[] =
-{
-   "PRECISION_DEFAULT",
-   "PRECISION_FLOAT32",
-   "PRECISION_FLOAT16",
-   "PRECISION_FIXED12"
-};
-
-static const char *TGSI_CCS[] =
-{
-   "CC_GT",
-   "CC_EQ",
-   "CC_LT",
-   "CC_UN",
-   "CC_GE",
-   "CC_LE",
-   "CC_NE",
-   "CC_TR",
-   "CC_FL"
-};
-
 static const char *TGSI_SWIZZLES[] =
 {
    "SWIZZLE_X",
@@ -163,20 +143,10 @@ static const char *TGSI_TEXTURES[] =
 
 static const char *TGSI_SRC_REGISTER_EXTS[] =
 {
-   "SRC_REGISTER_EXT_TYPE_SWZ",
+   "",
    "SRC_REGISTER_EXT_TYPE_MOD"
 };
 
-static const char *TGSI_EXTSWIZZLES[] =
-{
-   "EXTSWIZZLE_X",
-   "EXTSWIZZLE_Y",
-   "EXTSWIZZLE_Z",
-   "EXTSWIZZLE_W",
-   "EXTSWIZZLE_ZERO",
-   "EXTSWIZZLE_ONE"
-};
-
 static const char *TGSI_WRITEMASKS[] =
 {
    "0",
@@ -199,7 +169,7 @@ static const char *TGSI_WRITEMASKS[] =
 
 static const char *TGSI_DST_REGISTER_EXTS[] =
 {
-   "DST_REGISTER_EXT_TYPE_CONDCODE",
+   "",
    "DST_REGISTER_EXT_TYPE_MODULATE"
 };
 
@@ -327,60 +297,6 @@ dump_instruction_verbose(
       UIX( inst->Instruction.Padding );
    }
 
-   if( deflt || tgsi_compare_instruction_ext_nv( inst->InstructionExtNv, fi->InstructionExtNv ) ) {
-      EOL();
-      TXT( "\nType          : " );
-      ENM( inst->InstructionExtNv.Type, TGSI_INSTRUCTION_EXTS );
-      if( deflt || fi->InstructionExtNv.Precision != inst->InstructionExtNv.Precision ) {
-         TXT( "\nPrecision     : " );
-         ENM( inst->InstructionExtNv.Precision, TGSI_PRECISIONS );
-      }
-      if( deflt || fi->InstructionExtNv.CondDstIndex != inst->InstructionExtNv.CondDstIndex ) {
-         TXT( "\nCondDstIndex  : " );
-         UID( inst->InstructionExtNv.CondDstIndex );
-      }
-      if( deflt || fi->InstructionExtNv.CondFlowIndex != inst->InstructionExtNv.CondFlowIndex ) {
-         TXT( "\nCondFlowIndex : " );
-         UID( inst->InstructionExtNv.CondFlowIndex );
-      }
-      if( deflt || fi->InstructionExtNv.CondMask != inst->InstructionExtNv.CondMask ) {
-         TXT( "\nCondMask      : " );
-         ENM( inst->InstructionExtNv.CondMask, TGSI_CCS );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleX != inst->InstructionExtNv.CondSwizzleX ) {
-         TXT( "\nCondSwizzleX  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleX, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleY != inst->InstructionExtNv.CondSwizzleY ) {
-         TXT( "\nCondSwizzleY  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleY, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleZ != inst->InstructionExtNv.CondSwizzleZ ) {
-         TXT( "\nCondSwizzleZ  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleZ, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondSwizzleW != inst->InstructionExtNv.CondSwizzleW ) {
-         TXT( "\nCondSwizzleW  : " );
-         ENM( inst->InstructionExtNv.CondSwizzleW, TGSI_SWIZZLES );
-      }
-      if( deflt || fi->InstructionExtNv.CondDstUpdate != inst->InstructionExtNv.CondDstUpdate ) {
-         TXT( "\nCondDstUpdate : " );
-         UID( inst->InstructionExtNv.CondDstUpdate );
-      }
-      if( deflt || fi->InstructionExtNv.CondFlowEnable != inst->InstructionExtNv.CondFlowEnable ) {
-         TXT( "\nCondFlowEnable: " );
-         UID( inst->InstructionExtNv.CondFlowEnable );
-      }
-      if( ignored ) {
-         TXT( "\nPadding       : " );
-         UIX( inst->InstructionExtNv.Padding );
-         if( deflt || fi->InstructionExtNv.Extended != inst->InstructionExtNv.Extended ) {
-            TXT( "\nExtended      : " );
-            UID( inst->InstructionExtNv.Extended );
-         }
-      }
-   }
-
    if( deflt || tgsi_compare_instruction_ext_label( inst->InstructionExtLabel, fi->InstructionExtLabel ) ) {
       EOL();
       TXT( "\nType    : " );
@@ -451,44 +367,6 @@ dump_instruction_verbose(
          }
       }
 
-      if( deflt || tgsi_compare_dst_register_ext_concode( dst->DstRegisterExtConcode, fd->DstRegisterExtConcode ) ) {
-         EOL();
-         TXT( "\nType        : " );
-         ENM( dst->DstRegisterExtConcode.Type, TGSI_DST_REGISTER_EXTS );
-         if( deflt || fd->DstRegisterExtConcode.CondMask != dst->DstRegisterExtConcode.CondMask ) {
-            TXT( "\nCondMask    : " );
-            ENM( dst->DstRegisterExtConcode.CondMask, TGSI_CCS );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleX != dst->DstRegisterExtConcode.CondSwizzleX ) {
-            TXT( "\nCondSwizzleX: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleX, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleY != dst->DstRegisterExtConcode.CondSwizzleY ) {
-            TXT( "\nCondSwizzleY: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleY, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleZ != dst->DstRegisterExtConcode.CondSwizzleZ ) {
-            TXT( "\nCondSwizzleZ: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleZ, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSwizzleW != dst->DstRegisterExtConcode.CondSwizzleW ) {
-            TXT( "\nCondSwizzleW: " );
-            ENM( dst->DstRegisterExtConcode.CondSwizzleW, TGSI_SWIZZLES );
-         }
-         if( deflt || fd->DstRegisterExtConcode.CondSrcIndex != dst->DstRegisterExtConcode.CondSrcIndex ) {
-            TXT( "\nCondSrcIndex: " );
-            UID( dst->DstRegisterExtConcode.CondSrcIndex );
-         }
-         if( ignored ) {
-            TXT( "\nPadding     : " );
-            UIX( dst->DstRegisterExtConcode.Padding );
-            if( deflt || fd->DstRegisterExtConcode.Extended != dst->DstRegisterExtConcode.Extended ) {
-               TXT( "\nExtended    : " );
-               UID( dst->DstRegisterExtConcode.Extended );
-            }
-         }
-      }
-
       if( deflt || tgsi_compare_dst_register_ext_modulate( dst->DstRegisterExtModulate, fd->DstRegisterExtModulate ) ) {
          EOL();
          TXT( "\nType    : " );
@@ -556,52 +434,6 @@ dump_instruction_verbose(
          }
       }
 
-      if( deflt || tgsi_compare_src_register_ext_swz( src->SrcRegisterExtSwz, fs->SrcRegisterExtSwz ) ) {
-         EOL();
-         TXT( "\nType       : " );
-         ENM( src->SrcRegisterExtSwz.Type, TGSI_SRC_REGISTER_EXTS );
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleX != src->SrcRegisterExtSwz.ExtSwizzleX ) {
-            TXT( "\nExtSwizzleX: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleX, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleY != src->SrcRegisterExtSwz.ExtSwizzleY ) {
-            TXT( "\nExtSwizzleY: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleY, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleZ != src->SrcRegisterExtSwz.ExtSwizzleZ ) {
-            TXT( "\nExtSwizzleZ: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleW != src->SrcRegisterExtSwz.ExtSwizzleW ) {
-            TXT( "\nExtSwizzleW: " );
-            ENM( src->SrcRegisterExtSwz.ExtSwizzleW, TGSI_EXTSWIZZLES );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateX != src->SrcRegisterExtSwz.NegateX ) {
-            TXT( "\nNegateX   : " );
-            UID( src->SrcRegisterExtSwz.NegateX );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateY != src->SrcRegisterExtSwz.NegateY ) {
-            TXT( "\nNegateY   : " );
-            UID( src->SrcRegisterExtSwz.NegateY );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateZ != src->SrcRegisterExtSwz.NegateZ ) {
-            TXT( "\nNegateZ   : " );
-            UID( src->SrcRegisterExtSwz.NegateZ );
-         }
-         if( deflt || fs->SrcRegisterExtSwz.NegateW != src->SrcRegisterExtSwz.NegateW ) {
-            TXT( "\nNegateW   : " );
-            UID( src->SrcRegisterExtSwz.NegateW );
-         }
-         if( ignored ) {
-            TXT( "\nPadding   : " );
-            UIX( src->SrcRegisterExtSwz.Padding );
-            if( deflt || fs->SrcRegisterExtSwz.Extended != src->SrcRegisterExtSwz.Extended ) {
-               TXT( "\nExtended   : " );
-               UID( src->SrcRegisterExtSwz.Extended );
-            }
-         }
-      }
-
       if( deflt || tgsi_compare_src_register_ext_mod( src->SrcRegisterExtMod, fs->SrcRegisterExtMod ) ) {
          EOL();
          TXT( "\nType     : " );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index c79c56debd6..b7569e74d4b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -107,6 +107,7 @@
 #define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
 #define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
 #define TEMP_R0            TGSI_EXEC_TEMP_R0
+#define TEMP_P0            TGSI_EXEC_TEMP_P0
 
 #define IS_CHANNEL_ENABLED(INST, CHAN)\
    ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
@@ -210,9 +211,8 @@ tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
          uint channelsWritten = 0x0;
          FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
             /* check if we're reading a channel that's been written */
-            uint swizzle = tgsi_util_get_full_src_register_extswizzle(&inst->FullSrcRegisters[i], chan);
-            if (swizzle <= TGSI_SWIZZLE_W &&
-                (channelsWritten & (1 << swizzle))) {
+            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->FullSrcRegisters[i], chan);
+            if (channelsWritten & (1 << swizzle)) {
                return TRUE;
             }
 
@@ -338,7 +338,7 @@ tgsi_exec_machine_bind_shader(
             /* XXX we only handle SOA dependencies properly for MOV/SWZ
              * at this time!
              */
-            if (opcode != TGSI_OPCODE_MOV && opcode != TGSI_OPCODE_SWZ) {
+            if (opcode != TGSI_OPCODE_MOV) {
                debug_printf("Warning: SOA dependency in instruction"
                             " is not handled:\n");
                tgsi_dump_instruction(&parse.FullToken.FullInstruction,
@@ -1130,10 +1130,10 @@ fetch_src_file_channel(
    union tgsi_exec_channel *chan )
 {
    switch( swizzle ) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
       switch( file ) {
       case TGSI_FILE_CONSTANT:
          assert(mach->Consts);
@@ -1188,6 +1188,17 @@ fetch_src_file_channel(
          chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
          break;
 
+      case TGSI_FILE_PREDICATE: /* predicates live in Temps[TEMP_P0], not in Addrs */
+         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
+         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
+         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
+         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
+         chan->u[0] = mach->Temps[TEMP_P0].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Temps[TEMP_P0].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Temps[TEMP_P0].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Temps[TEMP_P0].xyzw[swizzle].u[3];
+         break;
+
       case TGSI_FILE_OUTPUT:
          /* vertex/fragment output vars can be read too */
          chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
@@ -1201,14 +1212,6 @@ fetch_src_file_channel(
       }
       break;
 
-   case TGSI_EXTSWIZZLE_ZERO:
-      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
-      break;
-
    default:
       assert( 0 );
    }
@@ -1367,7 +1370,7 @@ fetch_source(
        */
    }
 
-   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
    fetch_src_file_channel(
       mach,
       reg->SrcRegister.File,
@@ -1475,119 +1478,17 @@ store_dest(
       dst = &mach->Addrs[index].xyzw[chan_index];
       break;
 
+   case TGSI_FILE_PREDICATE: /* must match the slot read in fetch_src_file_channel */
+      index = reg->DstRegister.Index;
+      assert(index < TGSI_EXEC_NUM_PREDS);
+      dst = &mach->Temps[TEMP_P0 + index].xyzw[chan_index];
+      break;
+
    default:
       assert( 0 );
       return;
    }
 
-   if (inst->InstructionExtNv.CondFlowEnable) {
-      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
-      uint swizzle;
-      uint shift;
-      uint mask;
-      uint test;
-
-      /* Only CC0 supported.
-       */
-      assert( inst->InstructionExtNv.CondFlowIndex < 1 );
-
-      switch (chan_index) {
-      case CHAN_X:
-         swizzle = inst->InstructionExtNv.CondSwizzleX;
-         break;
-      case CHAN_Y:
-         swizzle = inst->InstructionExtNv.CondSwizzleY;
-         break;
-      case CHAN_Z:
-         swizzle = inst->InstructionExtNv.CondSwizzleZ;
-         break;
-      case CHAN_W:
-         swizzle = inst->InstructionExtNv.CondSwizzleW;
-         break;
-      default:
-         assert( 0 );
-         return;
-      }
-
-      switch (swizzle) {
-      case TGSI_SWIZZLE_X:
-         shift = TGSI_EXEC_CC_X_SHIFT;
-         mask = TGSI_EXEC_CC_X_MASK;
-         break;
-      case TGSI_SWIZZLE_Y:
-         shift = TGSI_EXEC_CC_Y_SHIFT;
-         mask = TGSI_EXEC_CC_Y_MASK;
-         break;
-      case TGSI_SWIZZLE_Z:
-         shift = TGSI_EXEC_CC_Z_SHIFT;
-         mask = TGSI_EXEC_CC_Z_MASK;
-         break;
-      case TGSI_SWIZZLE_W:
-         shift = TGSI_EXEC_CC_W_SHIFT;
-         mask = TGSI_EXEC_CC_W_MASK;
-         break;
-      default:
-         assert( 0 );
-         return;
-      }
-
-      switch (inst->InstructionExtNv.CondMask) {
-      case TGSI_CC_GT:
-         test = ~(TGSI_EXEC_CC_GT << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_EQ:
-         test = ~(TGSI_EXEC_CC_EQ << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_LT:
-         test = ~(TGSI_EXEC_CC_LT << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_GE:
-         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_LE:
-         test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_NE:
-         test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask;
-         for (i = 0; i < QUAD_SIZE; i++)
-            if (cc->u[i] & test)
-               execmask &= ~(1 << i);
-         break;
-
-      case TGSI_CC_TR:
-         break;
-
-      case TGSI_CC_FL:
-         for (i = 0; i < QUAD_SIZE; i++)
-            execmask &= ~(1 << i);
-         break;
-
-      default:
-         assert( 0 );
-         return;
-      }
-   }
-
    switch (inst->Instruction.Saturate) {
    case TGSI_SAT_NONE:
       for (i = 0; i < QUAD_SIZE; i++)
@@ -1622,51 +1523,6 @@ store_dest(
    default:
       assert( 0 );
    }
-
-   if (inst->InstructionExtNv.CondDstUpdate) {
-      union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C];
-      uint shift;
-      uint mask;
-
-      /* Only CC0 supported.
-       */
-      assert( inst->InstructionExtNv.CondDstIndex < 1 );
-
-      switch (chan_index) {
-      case CHAN_X:
-         shift = TGSI_EXEC_CC_X_SHIFT;
-         mask = ~TGSI_EXEC_CC_X_MASK;
-         break;
-      case CHAN_Y:
-         shift = TGSI_EXEC_CC_Y_SHIFT;
-         mask = ~TGSI_EXEC_CC_Y_MASK;
-         break;
-      case CHAN_Z:
-         shift = TGSI_EXEC_CC_Z_SHIFT;
-         mask = ~TGSI_EXEC_CC_Z_MASK;
-         break;
-      case CHAN_W:
-         shift = TGSI_EXEC_CC_W_SHIFT;
-         mask = ~TGSI_EXEC_CC_W_MASK;
-         break;
-      default:
-         assert( 0 );
-         return;
-      }
-
-      for (i = 0; i < QUAD_SIZE; i++)
-         if (execmask & (1 << i)) {
-            cc->u[i] &= mask;
-            if (dst->f[i] < 0.0f)
-               cc->u[i] |= TGSI_EXEC_CC_LT << shift;
-            else if (dst->f[i] > 0.0f)
-               cc->u[i] |= TGSI_EXEC_CC_GT << shift;
-            else if (dst->f[i] == 0.0f)
-               cc->u[i] |= TGSI_EXEC_CC_EQ << shift;
-            else
-               cc->u[i] |= TGSI_EXEC_CC_UN << shift;
-         }
-   }
 }
 
 #define FETCH(VAL,INDEX,CHAN)\
@@ -1689,10 +1545,8 @@ exec_kil(struct tgsi_exec_machine *mach,
    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
    union tgsi_exec_channel r[1];
 
-   /* This mask stores component bits that were already tested. Note that
-    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+   /* This mask stores component bits that were already tested. */
+   uniquemask = 0;
 
    for (chan_index = 0; chan_index < 4; chan_index++)
    {
@@ -1700,7 +1554,7 @@ exec_kil(struct tgsi_exec_machine *mach,
       uint i;
 
       /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle (
+      swizzle = tgsi_util_get_full_src_register_swizzle (
                         &inst->FullSrcRegisters[0],
                         chan_index);
 
@@ -1728,32 +1582,8 @@ exec_kilp(struct tgsi_exec_machine *mach,
 {
    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
 
-   if (inst->InstructionExtNv.CondFlowEnable) {
-      uint swizzle[4];
-      uint chan_index;
-
-      kilmask = 0x0;
-
-      swizzle[0] = inst->InstructionExtNv.CondSwizzleX;
-      swizzle[1] = inst->InstructionExtNv.CondSwizzleY;
-      swizzle[2] = inst->InstructionExtNv.CondSwizzleZ;
-      swizzle[3] = inst->InstructionExtNv.CondSwizzleW;
-
-      for (chan_index = 0; chan_index < 4; chan_index++)
-      {
-         uint i;
-
-         for (i = 0; i < 4; i++) {
-            /* TODO: evaluate the condition code */
-            if (0)
-               kilmask |= 1 << i;
-         }
-      }
-   }
-   else {
-      /* "unconditional" kil */
-      kilmask = mach->ExecMask;
-   }
+   /* "unconditional" kil */
+   kilmask = mach->ExecMask;
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
 }
 
@@ -1981,8 +1811,8 @@ exec_declaration(
             break;
 
          default:
-            eval = NULL;
             assert( 0 );
+            return;
          }
 
          if( mask == TGSI_WRITEMASK_XYZW ) {
@@ -2031,7 +1861,6 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
       if (inst->Flags & SOA_DEPENDENCY_FLAG) {
          /* Do all fetches into temp regs, then do all stores to avoid
           * intermediate/accidental clobbering.  This could be done all the
@@ -3223,22 +3052,6 @@ exec_instruction(
       /* no-op */
       break;
 
-   case TGSI_OPCODE_NOISE1:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE2:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE3:
-      assert( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE4:
-      assert( 0 );
-      break;
-
    case TGSI_OPCODE_NOP:
       break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index c72f76809d4..471f591dd6e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -168,13 +168,18 @@ struct tgsi_exec_labels
 
 #define TGSI_EXEC_TEMP_ADDR         (TGSI_EXEC_NUM_TEMPS + 8)
 #define TGSI_EXEC_NUM_ADDRS         1
-#define TGSI_EXEC_NUM_TEMP_EXTRAS   9
 
+/* predicate register */
+#define TGSI_EXEC_TEMP_P0           (TGSI_EXEC_NUM_TEMPS + 9)
+#define TGSI_EXEC_NUM_PREDS         1
 
+#define TGSI_EXEC_NUM_TEMP_EXTRAS   10
 
-#define TGSI_EXEC_MAX_COND_NESTING  20
-#define TGSI_EXEC_MAX_LOOP_NESTING  20
-#define TGSI_EXEC_MAX_CALL_NESTING  20
+
+
+#define TGSI_EXEC_MAX_COND_NESTING  32
+#define TGSI_EXEC_MAX_LOOP_NESTING  32
+#define TGSI_EXEC_MAX_CALL_NESTING  32
 
 /* The maximum number of input attributes per vertex. For 2D
  * input register files, this is the stride between two 1D
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 17af4cb7ad2..be375cabb8b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -134,10 +134,10 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 0, 0, 0, 0, 1, "BGNSUB", TGSI_OPCODE_BGNSUB },
    { 0, 0, 0, 1, 1, 0, "ENDLOOP", TGSI_OPCODE_ENDLOOP },
    { 0, 0, 0, 0, 1, 0, "ENDSUB", TGSI_OPCODE_ENDSUB },
-   { 1, 1, 0, 0, 0, 0, "NOISE1", TGSI_OPCODE_NOISE1 },
-   { 1, 1, 0, 0, 0, 0, "NOISE2", TGSI_OPCODE_NOISE2 },
-   { 1, 1, 0, 0, 0, 0, "NOISE3", TGSI_OPCODE_NOISE3 },
-   { 1, 1, 0, 0, 0, 0, "NOISE4", TGSI_OPCODE_NOISE4 },
+   { 0, 0, 0, 0, 0, 0, "", 103 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, "", 104 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, "", 105 },     /* removed */
+   { 0, 0, 0, 0, 0, 0, "", 106 },     /* removed */
    { 0, 0, 0, 0, 0, 0, "NOP", TGSI_OPCODE_NOP },
    { 0, 0, 0, 0, 0, 0, "", 108 },     /* removed */
    { 0, 0, 0, 0, 0, 0, "", 109 },     /* removed */
@@ -149,7 +149,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 1, 0, 0, 0, 0, "BREAKC", TGSI_OPCODE_BREAKC },
    { 0, 1, 0, 0, 0, 0, "KIL", TGSI_OPCODE_KIL },
    { 0, 0, 0, 0, 0, 0, "END", TGSI_OPCODE_END },
-   { 1, 1, 0, 0, 0, 0, "SWZ", TGSI_OPCODE_SWZ }
+   { 0, 0, 0, 0, 0, 0, "", 118 }      /* removed */
 };
 
 const struct tgsi_opcode_info *
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index e7bcf4bf754..b34263da489 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -139,10 +139,6 @@ OP00_LBL(BGNLOOP)
 OP00(BGNSUB)
 OP00_LBL(ENDLOOP)
 OP00(ENDSUB)
-OP11(NOISE1)
-OP11(NOISE2)
-OP11(NOISE3)
-OP11(NOISE4)
 OP00(NOP)
 OP11(NRM4)
 OP01(CALLNZ)
@@ -150,7 +146,6 @@ OP01(IFC)
 OP01(BREAKC)
 OP01(KIL)
 OP00(END)
-OP11(SWZ)
 
 
 #undef OP00
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 4870f82b6bd..83f9df1183e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -181,10 +181,6 @@ tgsi_parse_token(
          next_token( ctx, &token );
 
          switch( token.Type ) {
-         case TGSI_INSTRUCTION_EXT_TYPE_NV:
-            copy_token(&inst->InstructionExtNv, &token);
-            break;
-
          case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
             copy_token(&inst->InstructionExtLabel, &token);
             break;
@@ -193,6 +189,10 @@ tgsi_parse_token(
             copy_token(&inst->InstructionExtTexture, &token);
             break;
 
+         case TGSI_INSTRUCTION_EXT_TYPE_PREDICATE:
+            copy_token(&inst->InstructionExtPredicate, &token);
+            break;
+
          default:
             assert( 0 );
          }
@@ -220,11 +220,6 @@ tgsi_parse_token(
             next_token( ctx, &token );
 
             switch( token.Type ) {
-            case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
-               copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode,
-                          &token);
-               break;
-
             case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
                copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate,
                           &token);
@@ -264,11 +259,6 @@ tgsi_parse_token(
             next_token( ctx, &token );
 
             switch( token.Type ) {
-            case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
-               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz,
-                          &token);
-               break;
-
             case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
                copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod,
                           &token);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index a26ee5ba862..76f1676d85d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -49,14 +49,12 @@ struct tgsi_full_dst_register
 {
    struct tgsi_dst_register               DstRegister;
    struct tgsi_src_register               DstRegisterInd;
-   struct tgsi_dst_register_ext_concode   DstRegisterExtConcode;
    struct tgsi_dst_register_ext_modulate  DstRegisterExtModulate;
 };
 
 struct tgsi_full_src_register
 {
    struct tgsi_src_register         SrcRegister;
-   struct tgsi_src_register_ext_swz SrcRegisterExtSwz;
    struct tgsi_src_register_ext_mod SrcRegisterExtMod;
    struct tgsi_src_register         SrcRegisterInd;
    struct tgsi_dimension            SrcRegisterDim;
@@ -82,9 +80,9 @@ struct tgsi_full_immediate
 struct tgsi_full_instruction
 {
    struct tgsi_instruction             Instruction;
-   struct tgsi_instruction_ext_nv      InstructionExtNv;
    struct tgsi_instruction_ext_label   InstructionExtLabel;
    struct tgsi_instruction_ext_texture InstructionExtTexture;
+   struct tgsi_instruction_ext_predicate InstructionExtPredicate;
    struct tgsi_full_dst_register       FullDstRegisters[TGSI_FULL_MAX_DST_REGISTERS];
    struct tgsi_full_src_register       FullSrcRegisters[TGSI_FULL_MAX_SRC_REGISTERS];
    uint Flags;  /**< user-defined usage */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 4b1c7d4e01b..617fd7f6be1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -283,14 +283,14 @@ emit_fetch(struct gen_context *gen,
            const struct tgsi_full_src_register *reg,
            const unsigned chan_index)
 {
-   uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+   uint swizzle = tgsi_util_get_full_src_register_swizzle(reg, chan_index);
    int dst_vec = -1;
 
    switch (swizzle) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
       switch (reg->SrcRegister.File) {
       case TGSI_FILE_INPUT:
          {
@@ -349,16 +349,6 @@ emit_fetch(struct gen_context *gen,
          assert( 0 );
       }
       break;
-   case TGSI_EXTSWIZZLE_ZERO:
-      ppc_vzero(gen->f, dst_vec);
-      break;
-   case TGSI_EXTSWIZZLE_ONE:
-      {
-         int one_vec = gen_one_vec(gen);
-         dst_vec = ppc_allocate_vec_register(gen->f);
-         ppc_vmove(gen->f, dst_vec, one_vec);
-      }
-      break;
    default:
       assert( 0 );
    }
@@ -418,8 +408,8 @@ equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a,
       return FALSE;
    if (a->SrcRegister.Index != b->SrcRegister.Index)
       return FALSE;
-   swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a);
-   swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b);
+   swz_a = tgsi_util_get_full_src_register_swizzle(a, chan_a);
+   swz_b = tgsi_util_get_full_src_register_swizzle(b, chan_b);
    if (swz_a != swz_b)
       return FALSE;
    sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a);
@@ -635,7 +625,6 @@ emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
          ppc_vlogefp(gen->f, v1, v0);      /* v1 = log2(v0) */
          break;
       case TGSI_OPCODE_MOV:
-      case TGSI_OPCODE_SWZ:
          if (v0 != v1)
             ppc_vmove(gen->f, v1, v0);
          break;
@@ -1119,7 +1108,6 @@ emit_instruction(struct gen_context *gen,
 
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
    case TGSI_OPCODE_ABS:
    case TGSI_OPCODE_FLR:
    case TGSI_OPCODE_FRC:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 8a13885da9b..36e27ea52f4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -141,7 +141,8 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "SAMP",
    "ADDR",
    "IMM",
-   "LOOP"
+   "LOOP",
+   "PRED"
 };
 
 static boolean
@@ -358,7 +359,7 @@ epilog(
 
 boolean
 tgsi_sanity_check(
-   struct tgsi_token *tokens )
+   const struct tgsi_token *tokens )
 {
    struct sanity_check_ctx ctx;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
index ca45e94c7ad..52263ff8832 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
@@ -40,7 +40,7 @@ extern "C" {
  */
 boolean
 tgsi_sanity_check(
-   struct tgsi_token *tokens );
+   const struct tgsi_token *tokens );
 
 #if defined __cplusplus
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index c535788819f..f9c16f1b6cb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -132,6 +132,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                if (file == TGSI_FILE_INPUT) {
                   info->input_semantic_name[reg] = (ubyte)fulldecl->Semantic.SemanticName;
                   info->input_semantic_index[reg] = (ubyte)fulldecl->Semantic.SemanticIndex;
+                  info->input_interpolate[reg] = (ubyte)fulldecl->Declaration.Interpolate;
                   info->num_inputs++;
                }
                else if (file == TGSI_FILE_OUTPUT) {
@@ -227,11 +228,6 @@ tgsi_is_passthrough_shader(const struct tgsi_token *tokens)
                 src->SrcRegister.SwizzleZ != TGSI_SWIZZLE_Z ||
                 src->SrcRegister.SwizzleW != TGSI_SWIZZLE_W ||
 
-                src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X ||
-                src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y ||
-                src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z ||
-                src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W ||
-
                 dst->DstRegister.WriteMask != TGSI_WRITEMASK_XYZW)
             {
                tgsi_parse_free(&parse);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 2c1a75bc812..8a7ee0c7e4f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -45,6 +45,7 @@ struct tgsi_shader_info
    ubyte num_outputs;
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 1e719940ec5..a96fc94c7ad 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -39,8 +39,9 @@
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
-#include "tgsi_exec.h"
-#include "tgsi_sse2.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_sse2.h"
 
 #include "rtasm/rtasm_x86sse.h"
 
@@ -1259,13 +1260,13 @@ emit_fetch(
    const struct tgsi_full_src_register *reg,
    const unsigned chan_index )
 {
-   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
 
    switch (swizzle) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
       switch (reg->SrcRegister.File) {
       case TGSI_FILE_CONSTANT:
          emit_const(
@@ -1307,22 +1308,6 @@ emit_fetch(
       }
       break;
 
-   case TGSI_EXTSWIZZLE_ZERO:
-      emit_tempf(
-         func,
-         xmm,
-         TGSI_EXEC_TEMP_00000000_I,
-         TGSI_EXEC_TEMP_00000000_C );
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      emit_tempf(
-         func,
-         xmm,
-         TEMP_ONE_I,
-         TEMP_ONE_C );
-      break;
-
    default:
       assert( 0 );
    }
@@ -1360,6 +1345,32 @@ emit_store(
    const struct tgsi_full_instruction *inst,
    unsigned chan_index )
 {
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      sse_maxps(
+         func,
+         make_xmm( xmm ),
+         get_temp(
+            TGSI_EXEC_TEMP_00000000_I,
+            TGSI_EXEC_TEMP_00000000_C ) );
+
+      sse_minps(
+         func,
+         make_xmm( xmm ),
+         get_temp(
+            TGSI_EXEC_TEMP_ONE_I,
+            TGSI_EXEC_TEMP_ONE_C ) );
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+
+
    switch( reg->DstRegister.File ) {
    case TGSI_FILE_OUTPUT:
       emit_output(
@@ -1388,19 +1399,6 @@ emit_store(
    default:
       assert( 0 );
    }
-
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      /* assert( 0 ); */
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
-      break;
-   }
 }
 
 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
@@ -1568,13 +1566,13 @@ emit_kil(
    /* This mask stores component bits that were already tested. Note that
     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
     * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+   uniquemask = 0;
 
    FOR_EACH_CHANNEL( chan_index ) {
       unsigned swizzle;
 
       /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle(
+      swizzle = tgsi_util_get_full_src_register_swizzle(
          reg,
          chan_index );
 
@@ -1747,14 +1745,6 @@ emit_instruction(
    if (indirect_temp_reference(inst))
       return FALSE;
 
-   /* we don't handle saturation/clamping yet */
-   if (inst->Instruction.Saturate != TGSI_SAT_NONE)
-      return FALSE;
-
-   /* need to use extra temps to fix SOA dependencies : */
-   if (tgsi_check_soa_dependencies(inst))
-      return FALSE;
-
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_ARL:
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -1766,10 +1756,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         STORE( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 4 + chan_index, 0, chan_index );
+      }
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 4 + chan_index, 0, chan_index );
       }
       break;
 
@@ -1847,7 +1838,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
       FETCH( func, *inst, 0, 0, CHAN_X );
       emit_rcp( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -1856,7 +1846,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
       FETCH( func, *inst, 0, 0, CHAN_X );
       emit_abs( func, 0 );
       emit_rsqrt( func, 1, 0 );
@@ -1954,7 +1943,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
       emit_mul( func, 0, 1 );
@@ -1972,7 +1960,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_DP4:
-   /* TGSI_OPCODE_DOT4 */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
       emit_mul( func, 0, 1 );
@@ -2043,17 +2030,14 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
       emit_setcc( func, inst, cc_LessThan );
       break;
 
    case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
       emit_setcc( func, inst, cc_NotLessThan );
       break;
 
    case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          FETCH( func, *inst, 1, 1, chan_index );
@@ -2283,7 +2267,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SEQ:
-      return 0;
+      emit_setcc( func, inst, cc_Equal );
       break;
 
    case TGSI_OPCODE_SFL:
@@ -2291,7 +2275,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SGT:
-      return 0;
+      emit_setcc( func, inst, cc_NotLessThanEqual );
       break;
 
    case TGSI_OPCODE_SIN:
@@ -2303,11 +2287,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SLE:
-      return 0;
+      emit_setcc( func, inst, cc_LessThanEqual );
       break;
 
    case TGSI_OPCODE_SNE:
-      return 0;
+      emit_setcc( func, inst, cc_NotEqual );
       break;
 
    case TGSI_OPCODE_STR:
@@ -2371,7 +2355,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-   /* TGSI_OPCODE_SGN */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          emit_sgn( func, 0, 0 );
@@ -2929,6 +2912,21 @@ tgsi_emit_sse2(
                          parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                          "vertex shader" : "fragment shader");
 	 }
+
+         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
+            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
+
+            /* XXX: we only handle src/dst aliasing in a few opcodes
+             * currently.  Need to use an additional temporary to hold
+             * the result in the cases where the code is too opaque to
+             * fix.
+             */
+            if (opcode != TGSI_OPCODE_MOV) {
+               debug_printf("Warning: src/dst aliasing in instruction"
+                            " is not handled:\n");
+               tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
+            }
+         }
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index d438450b1e4..d2b03ffb2fc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -232,7 +232,8 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "SAMP",
    "ADDR",
    "IMM",
-   "LOOP"
+   "LOOP",
+   "PRED"
 };
 
 static boolean
@@ -538,13 +539,11 @@ static boolean
 parse_optional_swizzle(
    struct translate_ctx *ctx,
    uint swizzle[4],
-   boolean *parsed_swizzle,
-   boolean *parsed_extswizzle )
+   boolean *parsed_swizzle )
 {
    const char *cur = ctx->cur;
 
    *parsed_swizzle = FALSE;
-   *parsed_extswizzle = FALSE;
 
    eat_opt_white( &cur );
    if (*cur == '.') {
@@ -562,15 +561,8 @@ parse_optional_swizzle(
          else if (uprcase( *cur ) == 'W')
             swizzle[i] = TGSI_SWIZZLE_W;
          else {
-            if (*cur == '0')
-               swizzle[i] = TGSI_EXTSWIZZLE_ZERO;
-            else if (*cur == '1')
-               swizzle[i] = TGSI_EXTSWIZZLE_ONE;
-            else {
-               report_error( ctx, "Expected register swizzle component `x', `y', `z', `w', `0' or `1'" );
-               return FALSE;
-            }
-            *parsed_extswizzle = TRUE;
+	    report_error( ctx, "Expected register swizzle component `x', `y', `z' or `w'" );
+	    return FALSE;
          }
          cur++;
       }
@@ -595,7 +587,6 @@ parse_src_operand(
    uint swizzle[4];
    boolean parsed_ext_negate_paren = FALSE;
    boolean parsed_swizzle;
-   boolean parsed_extswizzle;
 
    if (*ctx->cur == '-') {
       cur = ctx->cur;
@@ -690,16 +681,8 @@ parse_src_operand(
 
    /* Parse optional swizzle.
     */
-   if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle, &parsed_extswizzle )) {
-      if (parsed_extswizzle) {
-         assert( parsed_swizzle );
-
-         src->SrcRegisterExtSwz.ExtSwizzleX = swizzle[0];
-         src->SrcRegisterExtSwz.ExtSwizzleY = swizzle[1];
-         src->SrcRegisterExtSwz.ExtSwizzleZ = swizzle[2];
-         src->SrcRegisterExtSwz.ExtSwizzleW = swizzle[3];
-      }
-      else if (parsed_swizzle) {
+   if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle )) {
+      if (parsed_swizzle) {
          src->SrcRegister.SwizzleX = swizzle[0];
          src->SrcRegister.SwizzleY = swizzle[1];
          src->SrcRegister.SwizzleZ = swizzle[2];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index f7096bd8e2c..3f752e9352f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -29,8 +29,10 @@
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_sanity.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
@@ -45,18 +47,14 @@ union tgsi_any_token {
    struct tgsi_immediate imm;
    union  tgsi_immediate_data imm_data;
    struct tgsi_instruction insn;
-   struct tgsi_instruction_ext_nv insn_ext_nv;
    struct tgsi_instruction_ext_label insn_ext_label;
    struct tgsi_instruction_ext_texture insn_ext_texture;
    struct tgsi_instruction_ext_predicate insn_ext_predicate;
    struct tgsi_src_register src;
-   struct tgsi_src_register_ext_swz src_ext_swz;
    struct tgsi_src_register_ext_mod src_ext_mod;
    struct tgsi_dimension dim;
    struct tgsi_dst_register dst;
-   struct tgsi_dst_register_ext_concode dst_ext_code;
    struct tgsi_dst_register_ext_modulate dst_ext_mod;
-   struct tgsi_dst_register_ext_predicate dst_ext_pred;
    unsigned value;
 };
 
@@ -70,9 +68,11 @@ struct ureg_tokens {
 
 #define UREG_MAX_INPUT PIPE_MAX_ATTRIBS
 #define UREG_MAX_OUTPUT PIPE_MAX_ATTRIBS
+#define UREG_MAX_CONSTANT_RANGE 32
 #define UREG_MAX_IMMEDIATE 32
 #define UREG_MAX_TEMP 256
 #define UREG_MAX_ADDR 2
+#define UREG_MAX_PRED 1
 
 #define DOMAIN_DECL 0
 #define DOMAIN_INSN 1
@@ -86,8 +86,10 @@ struct ureg_program
       unsigned semantic_name;
       unsigned semantic_index;
       unsigned interp;
-   } input[UREG_MAX_INPUT];
-   unsigned nr_inputs;
+   } fs_input[UREG_MAX_INPUT];
+   unsigned nr_fs_inputs;
+
+   unsigned vs_inputs[UREG_MAX_INPUT/32];
 
    struct {
       unsigned semantic_name;
@@ -107,9 +109,14 @@ struct ureg_program
    unsigned temps_active[UREG_MAX_TEMP / 32];
    unsigned nr_temps;
 
-   unsigned nr_addrs;
+   struct {
+      unsigned first;
+      unsigned last;
+   } constant_range[UREG_MAX_CONSTANT_RANGE];
+   unsigned nr_constant_ranges;
 
-   unsigned nr_constants;
+   unsigned nr_addrs;
+   unsigned nr_preds;
    unsigned nr_instructions;
 
    struct ureg_tokens domain[2];
@@ -119,6 +126,9 @@ static union tgsi_any_token error_tokens[32];
 
 static void tokens_error( struct ureg_tokens *tokens )
 {
+   if (tokens->tokens && tokens->tokens != error_tokens)
+      FREE(tokens->tokens);
+
    tokens->tokens = error_tokens;
    tokens->size = Elements(error_tokens);
    tokens->count = 0;
@@ -130,8 +140,9 @@ static void tokens_expand( struct ureg_tokens *tokens,
 {
    unsigned old_size = tokens->size * sizeof(unsigned);
 
-   if (tokens->tokens == error_tokens)
-      goto fail;
+   if (tokens->tokens == error_tokens) {
+      return;
+   }
 
    while (tokens->count + count > tokens->size) {
       tokens->size = (1 << ++tokens->order);
@@ -140,13 +151,9 @@ static void tokens_expand( struct ureg_tokens *tokens,
    tokens->tokens = REALLOC(tokens->tokens, 
                             old_size,
                             tokens->size * sizeof(unsigned));
-   if (tokens->tokens == NULL) 
-      goto fail;
-
-   return;
-          
-fail:
-   tokens_error(tokens);
+   if (tokens->tokens == NULL) {
+      tokens_error(tokens);
+   }
 }
 
 static void set_bad( struct ureg_program *ureg )
@@ -196,9 +203,13 @@ ureg_dst_register( unsigned file,
    dst.IndirectIndex = 0;
    dst.IndirectSwizzle = 0;
    dst.Saturate  = 0;
+   dst.Predicate = 0;
+   dst.PredNegate = 0;
+   dst.PredSwizzleX = TGSI_SWIZZLE_X;
+   dst.PredSwizzleY = TGSI_SWIZZLE_Y;
+   dst.PredSwizzleZ = TGSI_SWIZZLE_Z;
+   dst.PredSwizzleW = TGSI_SWIZZLE_W;
    dst.Index     = index;
-   dst.Pad1      = 0;
-   dst.Pad2      = 0;
 
    return dst;
 }
@@ -228,25 +239,25 @@ ureg_src_register( unsigned file,
 
 
 
-static struct ureg_src 
-ureg_DECL_input( struct ureg_program *ureg,
-                 unsigned name,
-                 unsigned index,
-                 unsigned interp_mode )
+struct ureg_src 
+ureg_DECL_fs_input( struct ureg_program *ureg,
+                    unsigned name,
+                    unsigned index,
+                    unsigned interp_mode )
 {
    unsigned i;
 
-   for (i = 0; i < ureg->nr_inputs; i++) {
-      if (ureg->input[i].semantic_name == name &&
-          ureg->input[i].semantic_index == index) 
+   for (i = 0; i < ureg->nr_fs_inputs; i++) {
+      if (ureg->fs_input[i].semantic_name == name &&
+          ureg->fs_input[i].semantic_index == index) 
          goto out;
    }
 
-   if (ureg->nr_inputs < UREG_MAX_INPUT) {
-      ureg->input[i].semantic_name = name;
-      ureg->input[i].semantic_index = index;
-      ureg->input[i].interp = interp_mode;
-      ureg->nr_inputs++;
+   if (ureg->nr_fs_inputs < UREG_MAX_INPUT) {
+      ureg->fs_input[i].semantic_name = name;
+      ureg->fs_input[i].semantic_index = index;
+      ureg->fs_input[i].interp = interp_mode;
+      ureg->nr_fs_inputs++;
    }
    else {
       set_bad( ureg );
@@ -257,25 +268,14 @@ out:
 }
 
 
-
-struct ureg_src 
-ureg_DECL_fs_input( struct ureg_program *ureg,
-                    unsigned name,
-                    unsigned index,
-                    unsigned interp )
-{
-   assert(ureg->processor == TGSI_PROCESSOR_FRAGMENT);
-   return ureg_DECL_input( ureg, name, index, interp );
-}
-
-
 struct ureg_src 
 ureg_DECL_vs_input( struct ureg_program *ureg,
-                    unsigned name,
                     unsigned index )
 {
    assert(ureg->processor == TGSI_PROCESSOR_VERTEX);
-   return ureg_DECL_input( ureg, name, index, TGSI_INTERPOLATE_CONSTANT );
+   assert(index < UREG_MAX_INPUT);   /* vs_inputs holds UREG_MAX_INPUT bits */
+   ureg->vs_inputs[index/32] |= 1u << (index % 32);   /* unsigned: bit 31 is valid */
+   return ureg_src_register( TGSI_FILE_INPUT, index );
 }
 
 
@@ -313,9 +313,57 @@ out:
  * value or manage any constant_buffer contents -- that's the
  * resposibility of the calling code.
  */
-struct ureg_src ureg_DECL_constant(struct ureg_program *ureg )
+struct ureg_src ureg_DECL_constant(struct ureg_program *ureg, 
+                                   unsigned index )
 {
-   return ureg_src_register( TGSI_FILE_CONSTANT, ureg->nr_constants++ );
+   unsigned minconst = index, maxconst = index;
+   unsigned i;
+
+   /* Inside existing range?
+    */
+   for (i = 0; i < ureg->nr_constant_ranges; i++) {
+      if (ureg->constant_range[i].first <= index &&
+          ureg->constant_range[i].last >= index)
+         goto out;
+   }
+
+   /* Extend existing range?
+    */
+   for (i = 0; i < ureg->nr_constant_ranges; i++) {
+      if (ureg->constant_range[i].last == index - 1) {
+         ureg->constant_range[i].last = index;
+         goto out;
+      }
+
+      if (ureg->constant_range[i].first == index + 1) {
+         ureg->constant_range[i].first = index;
+         goto out;
+      }
+
+      minconst = MIN2(minconst, ureg->constant_range[i].first);
+      maxconst = MAX2(maxconst, ureg->constant_range[i].last);
+   }
+
+   /* Room for a new single-entry range?  If so use it and skip the collapse. */
+   if (ureg->nr_constant_ranges < UREG_MAX_CONSTANT_RANGE) {
+      i = ureg->nr_constant_ranges++;
+      ureg->constant_range[i].first = index;
+      ureg->constant_range[i].last = index;
+      goto out;
+   }
+
+   /* Collapse all ranges down to one:
+    */
+   i = 0;
+   ureg->constant_range[0].first = minconst;
+   ureg->constant_range[0].last = maxconst;
+   ureg->nr_constant_ranges = 1;
+
+out:
+   assert(i < ureg->nr_constant_ranges);
+   assert(ureg->constant_range[i].first <= index);
+   assert(ureg->constant_range[i].last >= index);
+   return ureg_src_register( TGSI_FILE_CONSTANT, index );
 }
 
 
@@ -369,6 +417,19 @@ struct ureg_dst ureg_DECL_address( struct ureg_program *ureg )
    return ureg_dst_register( TGSI_FILE_ADDRESS, 0 );
 }
 
+/* Allocate a new predicate register.
+ */
+struct ureg_dst
+ureg_DECL_predicate(struct ureg_program *ureg)
+{
+   if (ureg->nr_preds < UREG_MAX_PRED) {
+      return ureg_dst_register(TGSI_FILE_PREDICATE, ureg->nr_preds++);
+   }
+
+   assert(0);
+   return ureg_dst_register(TGSI_FILE_PREDICATE, 0);
+}
+
 /* Allocate a new sampler.
  */
 struct ureg_src ureg_DECL_sampler( struct ureg_program *ureg,
@@ -566,17 +627,40 @@ ureg_emit_dst( struct ureg_program *ureg,
 }
 
 
+static void validate( unsigned opcode,
+                      unsigned nr_dst,
+                      unsigned nr_src )
+{
+#ifdef DEBUG
+   const struct tgsi_opcode_info *info = tgsi_get_opcode_info( opcode );
+   assert(info);
+   if(info) {
+      assert(nr_dst == info->num_dst);
+      assert(nr_src == info->num_src);
+   }
+#endif
+}
 
-unsigned
+struct ureg_emit_insn_result
 ureg_emit_insn(struct ureg_program *ureg,
                unsigned opcode,
                boolean saturate,
+               boolean predicate,
+               boolean pred_negate,
+               unsigned pred_swizzle_x,
+               unsigned pred_swizzle_y,
+               unsigned pred_swizzle_z,
+               unsigned pred_swizzle_w,
                unsigned num_dst,
                unsigned num_src )
 {
    union tgsi_any_token *out;
+   uint count = predicate ? 2 : 1;
+   struct ureg_emit_insn_result result;
 
-   out = get_tokens( ureg, DOMAIN_INSN, 1 );
+   validate( opcode, num_dst, num_src );
+   
+   out = get_tokens( ureg, DOMAIN_INSN, count );
    out[0].value = 0;
    out[0].insn.Type = TGSI_TOKEN_TYPE_INSTRUCTION;
    out[0].insn.NrTokens = 0;
@@ -585,17 +669,34 @@ ureg_emit_insn(struct ureg_program *ureg,
    out[0].insn.NumDstRegs = num_dst;
    out[0].insn.NumSrcRegs = num_src;
    out[0].insn.Padding = 0;
-   out[0].insn.Extended = 0;
-   
+
+   result.insn_token = ureg->domain[DOMAIN_INSN].count - count;
+
+   if (predicate) {
+      out[0].insn.Extended = 1;
+      out[1].insn_ext_predicate = tgsi_default_instruction_ext_predicate();
+      out[1].insn_ext_predicate.Negate = pred_negate;
+      out[1].insn_ext_predicate.SwizzleX = pred_swizzle_x;
+      out[1].insn_ext_predicate.SwizzleY = pred_swizzle_y;
+      out[1].insn_ext_predicate.SwizzleZ = pred_swizzle_z;
+      out[1].insn_ext_predicate.SwizzleW = pred_swizzle_w;
+
+      result.extended_token = result.insn_token + 1;
+   } else {
+      out[0].insn.Extended = 0;
+
+      result.extended_token = result.insn_token;
+   }
+
    ureg->nr_instructions++;
    
-   return ureg->domain[DOMAIN_INSN].count - 1;
+   return result;
 }
 
 
 void
 ureg_emit_label(struct ureg_program *ureg,
-                unsigned insn_token,
+                unsigned extended_token,
                 unsigned *label_token )
 {
    union tgsi_any_token *out, *insn;
@@ -604,9 +705,9 @@ ureg_emit_label(struct ureg_program *ureg,
       return;
 
    out = get_tokens( ureg, DOMAIN_INSN, 1 );
-   insn = retrieve_token( ureg, DOMAIN_INSN, insn_token );
+   insn = retrieve_token( ureg, DOMAIN_INSN, extended_token );
 
-   insn->insn.Extended = 1;
+   insn->token.Extended = 1;
 
    out[0].value = 0;
    out[0].insn_ext_label.Type = TGSI_INSTRUCTION_EXT_TYPE_LABEL;
@@ -640,15 +741,15 @@ ureg_fixup_label(struct ureg_program *ureg,
 
 void
 ureg_emit_texture(struct ureg_program *ureg,
-                  unsigned insn_token,
+                  unsigned extended_token,
                   unsigned target )
 {
    union tgsi_any_token *out, *insn;
 
    out = get_tokens( ureg, DOMAIN_INSN, 1 );
-   insn = retrieve_token( ureg, DOMAIN_INSN, insn_token );
+   insn = retrieve_token( ureg, DOMAIN_INSN, extended_token );
 
-   insn->insn.Extended = 1;
+   insn->token.Extended = 1;
 
    out[0].value = 0;
    out[0].insn_ext_texture.Type = TGSI_INSTRUCTION_EXT_TYPE_TEXTURE;
@@ -675,23 +776,83 @@ ureg_insn(struct ureg_program *ureg,
           const struct ureg_src *src,
           unsigned nr_src )
 {
-   unsigned insn, i;
+   struct ureg_emit_insn_result insn;
+   unsigned i;
    boolean saturate;
+   boolean predicate;
+   boolean negate = FALSE;        /* defaults passed on when dst[0] is unpredicated */
+   unsigned swizzle[4] = { 0 };   /* only overwritten below if predicate is set */
 
-#ifdef DEBUG
-   {
-      const struct tgsi_opcode_info *info = tgsi_get_opcode_info( opcode );
-      assert(info);
-      if(info) {
-         assert(nr_dst == info->num_dst);
-         assert(nr_src == info->num_src);
-      }
+   saturate = nr_dst ? dst[0].Saturate : FALSE;
+   predicate = nr_dst ? dst[0].Predicate : FALSE;
+   if (predicate) {
+      negate = dst[0].PredNegate;
+      swizzle[0] = dst[0].PredSwizzleX;
+      swizzle[1] = dst[0].PredSwizzleY;
+      swizzle[2] = dst[0].PredSwizzleZ;
+      swizzle[3] = dst[0].PredSwizzleW;
    }
-#endif
-   
+
+   insn = ureg_emit_insn(ureg,
+                         opcode,
+                         saturate,
+                         predicate,
+                         negate,
+                         swizzle[0],
+                         swizzle[1],
+                         swizzle[2],
+                         swizzle[3],
+                         nr_dst,
+                         nr_src);
+
+   for (i = 0; i < nr_dst; i++)
+      ureg_emit_dst( ureg, dst[i] );
+
+   for (i = 0; i < nr_src; i++)
+      ureg_emit_src( ureg, src[i] );
+
+   ureg_fixup_insn_size( ureg, insn.insn_token );
+}
+
+void
+ureg_tex_insn(struct ureg_program *ureg,
+              unsigned opcode,
+              const struct ureg_dst *dst,
+              unsigned nr_dst,
+              unsigned target,
+              const struct ureg_src *src,
+              unsigned nr_src )
+{
+   struct ureg_emit_insn_result insn;
+   unsigned i;
+   boolean saturate;
+   boolean predicate;
+   boolean negate = FALSE;        /* defaults passed on when dst[0] is unpredicated */
+   unsigned swizzle[4] = { 0 };   /* only overwritten below if predicate is set */
+
    saturate = nr_dst ? dst[0].Saturate : FALSE;
+   predicate = nr_dst ? dst[0].Predicate : FALSE;
+   if (predicate) {
+      negate = dst[0].PredNegate;
+      swizzle[0] = dst[0].PredSwizzleX;
+      swizzle[1] = dst[0].PredSwizzleY;
+      swizzle[2] = dst[0].PredSwizzleZ;
+      swizzle[3] = dst[0].PredSwizzleW;
+   }
+
+   insn = ureg_emit_insn(ureg,
+                         opcode,
+                         saturate,
+                         predicate,
+                         negate,
+                         swizzle[0],
+                         swizzle[1],
+                         swizzle[2],
+                         swizzle[3],
+                         nr_dst,
+                         nr_src);
 
-   insn = ureg_emit_insn( ureg, opcode, saturate, nr_dst, nr_src );
+   ureg_emit_texture( ureg, insn.extended_token, target );
 
    for (i = 0; i < nr_dst; i++)
       ureg_emit_dst( ureg, dst[i] );
@@ -699,7 +860,38 @@ ureg_insn(struct ureg_program *ureg,
    for (i = 0; i < nr_src; i++)
       ureg_emit_src( ureg, src[i] );
 
-   ureg_fixup_insn_size( ureg, insn );
+   ureg_fixup_insn_size( ureg, insn.insn_token );
+}
+
+
+void
+ureg_label_insn(struct ureg_program *ureg,
+                unsigned opcode,
+                const struct ureg_src *src,
+                unsigned nr_src,
+                unsigned *label_token )
+{
+   struct ureg_emit_insn_result insn;
+   unsigned i;
+
+   insn = ureg_emit_insn(ureg,
+                         opcode,
+                         FALSE,
+                         FALSE,
+                         FALSE,
+                         TGSI_SWIZZLE_X,
+                         TGSI_SWIZZLE_Y,
+                         TGSI_SWIZZLE_Z,
+                         TGSI_SWIZZLE_W,
+                         0,
+                         nr_src);
+
+   ureg_emit_label( ureg, insn.extended_token, label_token );
+
+   for (i = 0; i < nr_src; i++)
+      ureg_emit_src( ureg, src[i] );
+
+   ureg_fixup_insn_size( ureg, insn.insn_token );
 }
 
 
@@ -777,13 +969,22 @@ static void emit_decls( struct ureg_program *ureg )
 {
    unsigned i;
 
-   for (i = 0; i < ureg->nr_inputs; i++) {
-      emit_decl( ureg, 
-                 TGSI_FILE_INPUT, 
-                 i,
-                 ureg->input[i].semantic_name,
-                 ureg->input[i].semantic_index,
-                 ureg->input[i].interp );
+   if (ureg->processor == TGSI_PROCESSOR_VERTEX) {
+      for (i = 0; i < UREG_MAX_INPUT; i++) {
+         if (ureg->vs_inputs[i/32] & (1u << (i%32))) {
+            emit_decl_range( ureg, TGSI_FILE_INPUT, i, 1 );
+         }
+      }
+   }
+   else {
+      for (i = 0; i < ureg->nr_fs_inputs; i++) {
+         emit_decl( ureg, 
+                    TGSI_FILE_INPUT, 
+                    i,
+                    ureg->fs_input[i].semantic_name,
+                    ureg->fs_input[i].semantic_index,
+                    ureg->fs_input[i].interp );
+      }
    }
 
    for (i = 0; i < ureg->nr_outputs; i++) {
@@ -801,10 +1002,13 @@ static void emit_decls( struct ureg_program *ureg )
                        ureg->sampler[i].Index, 1 );
    }
 
-   if (ureg->nr_constants) {
-      emit_decl_range( ureg,
-                       TGSI_FILE_CONSTANT,
-                       0, ureg->nr_constants );
+   if (ureg->nr_constant_ranges) {
+      for (i = 0; i < ureg->nr_constant_ranges; i++)
+         emit_decl_range( ureg,
+                          TGSI_FILE_CONSTANT,
+                          ureg->constant_range[i].first, 
+                          (ureg->constant_range[i].last + 1 -
+                           ureg->constant_range[i].first) );
    }
 
    if (ureg->nr_temps) {
@@ -819,6 +1023,13 @@ static void emit_decls( struct ureg_program *ureg )
                        0, ureg->nr_addrs );
    }
 
+   if (ureg->nr_preds) {
+      emit_decl_range(ureg,
+                      TGSI_FILE_PREDICATE,
+                      0,
+                      ureg->nr_preds);
+   }
+
    for (i = 0; i < ureg->nr_immediates; i++) {
       emit_immediate( ureg,
                       ureg->immediate[i].v );
@@ -890,6 +1101,15 @@ const struct tgsi_token *ureg_finalize( struct ureg_program *ureg )
                    ureg->domain[DOMAIN_DECL].count);
       tgsi_dump( tokens, 0 );
    }
+
+#if DEBUG
+   if (tokens && !tgsi_sanity_check(tokens)) {
+      debug_printf("tgsi_ureg.c, sanity check failed on generated tokens:\n");
+      tgsi_dump(tokens, 0);
+      assert(0);
+   }
+#endif
+
    
    return tokens;
 }
@@ -911,6 +1131,25 @@ void *ureg_create_shader( struct ureg_program *ureg,
 }
 
 
+/* Finalize the program and hand its token buffer over to the caller. */
+const struct tgsi_token *ureg_get_tokens( struct ureg_program *ureg,
+                                          unsigned *nr_tokens )
+{
+   const struct tgsi_token *tokens;
+
+   ureg_finalize(ureg);
+   tokens = &ureg->domain[DOMAIN_DECL].tokens[0].token;
+
+   /* Report the tokens actually emitted (count), not the capacity (size). */
+   if (nr_tokens)
+      *nr_tokens = ureg->domain[DOMAIN_DECL].count;
+   /* Detach the buffer so this program no longer references it. */
+   ureg->domain[DOMAIN_DECL].tokens = NULL;
+   ureg->domain[DOMAIN_DECL].size = 0;
+   ureg->domain[DOMAIN_DECL].order = 0;
+   ureg->domain[DOMAIN_DECL].count = 0;
+   return tokens;
+}
 
 
 struct ureg_program *ureg_create( unsigned processor )
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index acbca59040c..dae42911947 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -67,9 +67,13 @@ struct ureg_dst
    unsigned WriteMask   : 4;  /* TGSI_WRITEMASK_ */
    unsigned Indirect    : 1;  /* BOOL */
    unsigned Saturate    : 1;  /* BOOL */
+   unsigned Predicate   : 1;
+   unsigned PredNegate  : 1;  /* BOOL */
+   unsigned PredSwizzleX: 2;  /* TGSI_SWIZZLE_ */
+   unsigned PredSwizzleY: 2;  /* TGSI_SWIZZLE_ */
+   unsigned PredSwizzleZ: 2;  /* TGSI_SWIZZLE_ */
+   unsigned PredSwizzleW: 2;  /* TGSI_SWIZZLE_ */
    int      Index       : 16; /* SINT */
-   unsigned Pad1        : 5;
-   unsigned Pad2        : 1;  /* BOOL */
    int      IndirectIndex   : 16; /* SINT */
    int      IndirectSwizzle : 2;  /* TGSI_SWIZZLE_ */
 };
@@ -82,10 +86,21 @@ ureg_create( unsigned processor );
 const struct tgsi_token *
 ureg_finalize( struct ureg_program * );
 
+/* Create and return a shader:
+ */
 void *
 ureg_create_shader( struct ureg_program *,
                     struct pipe_context *pipe );
 
+
+/* Alternately, return the built token stream and hand ownership of
+ * that memory to the caller:
+ */
+const struct tgsi_token *
+ureg_get_tokens( struct ureg_program *ureg,
+                 unsigned *nr_tokens );
+
+
 void 
 ureg_destroy( struct ureg_program * );
 
@@ -116,8 +131,7 @@ ureg_DECL_fs_input( struct ureg_program *,
 
 struct ureg_src
 ureg_DECL_vs_input( struct ureg_program *,
-                    unsigned semantic_name,
-                    unsigned semantic_index );
+                    unsigned index );
 
 struct ureg_dst
 ureg_DECL_output( struct ureg_program *,
@@ -130,7 +144,8 @@ ureg_DECL_immediate( struct ureg_program *,
                      unsigned nr );
 
 struct ureg_src
-ureg_DECL_constant( struct ureg_program * );
+ureg_DECL_constant( struct ureg_program *,
+                    unsigned index );
 
 struct ureg_dst
 ureg_DECL_temporary( struct ureg_program * );
@@ -142,6 +157,9 @@ ureg_release_temporary( struct ureg_program *ureg,
 struct ureg_dst
 ureg_DECL_address( struct ureg_program * );
 
+struct ureg_dst
+ureg_DECL_predicate(struct ureg_program *);
+
 /* Supply an index to the sampler declaration as this is the hook to
  * the external pipe_sampler state.  Users of this function probably
  * don't want just any sampler, but a specific one which they've set
@@ -233,14 +251,43 @@ ureg_insn(struct ureg_program *ureg,
           unsigned nr_src );
 
 
+void
+ureg_tex_insn(struct ureg_program *ureg,
+              unsigned opcode,
+              const struct ureg_dst *dst,
+              unsigned nr_dst,
+              unsigned target,
+              const struct ureg_src *src,
+              unsigned nr_src );
+
+
+void
+ureg_label_insn(struct ureg_program *ureg,
+                unsigned opcode,
+                const struct ureg_src *src,
+                unsigned nr_src,
+                unsigned *label);
+
+
 /***********************************************************************
  * Internal instruction helpers, don't call these directly:
  */
 
-unsigned
+struct ureg_emit_insn_result {
+   unsigned insn_token;       /**< Used to fixup insn size. */
+   unsigned extended_token;   /**< Used to set the Extended bit, usually the same as insn_token. */
+};
+
+struct ureg_emit_insn_result
 ureg_emit_insn(struct ureg_program *ureg,
                unsigned opcode,
                boolean saturate,
+               boolean predicate,
+               boolean pred_negate,
+               unsigned pred_swizzle_x,
+               unsigned pred_swizzle_y,
+               unsigned pred_swizzle_z,
+               unsigned pred_swizzle_w,
                unsigned num_dst,
                unsigned num_src );
 
@@ -271,7 +318,17 @@ ureg_fixup_insn_size(struct ureg_program *ureg,
 static INLINE void ureg_##op( struct ureg_program *ureg )       \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
-   unsigned insn = ureg_emit_insn( ureg, opcode, FALSE, 0, 0 ); \
+   unsigned insn = ureg_emit_insn(ureg,                         \
+                                  opcode,                       \
+                                  FALSE,                        \
+                                  FALSE,                        \
+                                  FALSE,                        \
+                                  TGSI_SWIZZLE_X,               \
+                                  TGSI_SWIZZLE_Y,               \
+                                  TGSI_SWIZZLE_Z,               \
+                                  TGSI_SWIZZLE_W,               \
+                                  0,                            \
+                                  0).insn_token;                \
    ureg_fixup_insn_size( ureg, insn );                          \
 }
 
@@ -280,7 +337,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
                               struct ureg_src src )             \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
-   unsigned insn = ureg_emit_insn( ureg, opcode, FALSE, 0, 1 ); \
+   unsigned insn = ureg_emit_insn(ureg,                         \
+                                  opcode,                       \
+                                  FALSE,                        \
+                                  FALSE,                        \
+                                  FALSE,                        \
+                                  TGSI_SWIZZLE_X,               \
+                                  TGSI_SWIZZLE_Y,               \
+                                  TGSI_SWIZZLE_Z,               \
+                                  TGSI_SWIZZLE_W,               \
+                                  0,                            \
+                                  1).insn_token;                \
    ureg_emit_src( ureg, src );                                  \
    ureg_fixup_insn_size( ureg, insn );                          \
 }
@@ -290,9 +357,20 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
                               unsigned *label_token )           \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
-   unsigned insn = ureg_emit_insn( ureg, opcode, FALSE, 0, 0 ); \
-   ureg_emit_label( ureg, insn, label_token );                  \
-   ureg_fixup_insn_size( ureg, insn );                          \
+   struct ureg_emit_insn_result insn;                           \
+   insn = ureg_emit_insn(ureg,                                  \
+                         opcode,                                \
+                         FALSE,                                 \
+                         FALSE,                                 \
+                         FALSE,                                 \
+                         TGSI_SWIZZLE_X,                        \
+                         TGSI_SWIZZLE_Y,                        \
+                         TGSI_SWIZZLE_Z,                        \
+                         TGSI_SWIZZLE_W,                        \
+                         0,                                     \
+                         0);                                    \
+   ureg_emit_label( ureg, insn.extended_token, label_token );   \
+   ureg_fixup_insn_size( ureg, insn.insn_token );               \
 }
 
 #define OP01_LBL( op )                                          \
@@ -301,10 +379,21 @@ static INLINE void ureg_##op( struct ureg_program *ureg,        \
                               unsigned *label_token )          \
 {                                                               \
    unsigned opcode = TGSI_OPCODE_##op;                          \
-   unsigned insn = ureg_emit_insn( ureg, opcode, FALSE, 0, 1 ); \
-   ureg_emit_label( ureg, insn, label_token );                  \
+   struct ureg_emit_insn_result insn;                           \
+   insn = ureg_emit_insn(ureg,                                  \
+                         opcode,                                \
+                         FALSE,                                 \
+                         FALSE,                                 \
+                         FALSE,                                 \
+                         TGSI_SWIZZLE_X,                        \
+                         TGSI_SWIZZLE_Y,                        \
+                         TGSI_SWIZZLE_Z,                        \
+                         TGSI_SWIZZLE_W,                        \
+                         0,                                     \
+                         1);                                    \
+   ureg_emit_label( ureg, insn.extended_token, label_token );   \
    ureg_emit_src( ureg, src );                                  \
-   ureg_fixup_insn_size( ureg, insn );                          \
+   ureg_fixup_insn_size( ureg, insn.insn_token );               \
 }
 
 #define OP10( op )                                                      \
@@ -312,7 +401,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_dst dst )                     \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 0 );  \
+   unsigned insn = ureg_emit_insn(ureg,                                 \
+                                  opcode,                               \
+                                  dst.Saturate,                         \
+                                  dst.Predicate,                        \
+                                  dst.PredNegate,                       \
+                                  dst.PredSwizzleX,                     \
+                                  dst.PredSwizzleY,                     \
+                                  dst.PredSwizzleZ,                     \
+                                  dst.PredSwizzleW,                     \
+                                  1,                                    \
+                                  0).insn_token;                        \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_fixup_insn_size( ureg, insn );                                  \
 }
@@ -324,7 +423,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src )                     \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 1 );  \
+   unsigned insn = ureg_emit_insn(ureg,                                 \
+                                  opcode,                               \
+                                  dst.Saturate,                         \
+                                  dst.Predicate,                        \
+                                  dst.PredNegate,                       \
+                                  dst.PredSwizzleX,                     \
+                                  dst.PredSwizzleY,                     \
+                                  dst.PredSwizzleZ,                     \
+                                  dst.PredSwizzleW,                     \
+                                  1,                                    \
+                                  1).insn_token;                        \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src );                                          \
    ureg_fixup_insn_size( ureg, insn );                                  \
@@ -337,7 +446,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src1 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 2 );  \
+   unsigned insn = ureg_emit_insn(ureg,                                 \
+                                  opcode,                               \
+                                  dst.Saturate,                         \
+                                  dst.Predicate,                        \
+                                  dst.PredNegate,                       \
+                                  dst.PredSwizzleX,                     \
+                                  dst.PredSwizzleY,                     \
+                                  dst.PredSwizzleZ,                     \
+                                  dst.PredSwizzleW,                     \
+                                  1,                                    \
+                                  2).insn_token;                        \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
@@ -352,12 +471,23 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src1 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 2 );  \
-   ureg_emit_texture( ureg, insn, target );                             \
+   struct ureg_emit_insn_result insn;                                   \
+   insn = ureg_emit_insn(ureg,                                          \
+                         opcode,                                        \
+                         dst.Saturate,                                  \
+                         dst.Predicate,                                 \
+                         dst.PredNegate,                                \
+                         dst.PredSwizzleX,                              \
+                         dst.PredSwizzleY,                              \
+                         dst.PredSwizzleZ,                              \
+                         dst.PredSwizzleW,                              \
+                         1,                                             \
+                         2);                                            \
+   ureg_emit_texture( ureg, insn.extended_token, target );              \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
-   ureg_fixup_insn_size( ureg, insn );                                  \
+   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
 }
 
 #define OP13( op )                                                      \
@@ -368,7 +498,17 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src2 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 3 );  \
+   unsigned insn = ureg_emit_insn(ureg,                                 \
+                                  opcode,                               \
+                                  dst.Saturate,                         \
+                                  dst.Predicate,                        \
+                                  dst.PredNegate,                       \
+                                  dst.PredSwizzleX,                     \
+                                  dst.PredSwizzleY,                     \
+                                  dst.PredSwizzleZ,                     \
+                                  dst.PredSwizzleW,                     \
+                                  1,                                    \
+                                  3).insn_token;                        \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
@@ -386,14 +526,25 @@ static INLINE void ureg_##op( struct ureg_program *ureg,                \
                               struct ureg_src src3 )                    \
 {                                                                       \
    unsigned opcode = TGSI_OPCODE_##op;                                  \
-   unsigned insn = ureg_emit_insn( ureg, opcode, dst.Saturate, 1, 4 );  \
-   ureg_emit_texture( ureg, insn, target );                             \
+   struct ureg_emit_insn_result insn;                                   \
+   insn = ureg_emit_insn(ureg,                                          \
+                         opcode,                                        \
+                         dst.Saturate,                                  \
+                         dst.Predicate,                                 \
+                         dst.PredNegate,                                \
+                         dst.PredSwizzleX,                              \
+                         dst.PredSwizzleY,                              \
+                         dst.PredSwizzleZ,                              \
+                         dst.PredSwizzleW,                              \
+                         1,                                             \
+                         4);                                            \
+   ureg_emit_texture( ureg, insn.extended_token, target );              \
    ureg_emit_dst( ureg, dst );                                          \
    ureg_emit_src( ureg, src0 );                                         \
    ureg_emit_src( ureg, src1 );                                         \
    ureg_emit_src( ureg, src2 );                                         \
    ureg_emit_src( ureg, src3 );                                         \
-   ureg_fixup_insn_size( ureg, insn );                                  \
+   ureg_fixup_insn_size( ureg, insn.insn_token );                       \
 }
 
 
@@ -468,6 +619,24 @@ ureg_saturate( struct ureg_dst reg )
    return reg;
 }
 
+static INLINE struct ureg_dst
+ureg_predicate(struct ureg_dst reg,
+               boolean negate,
+               unsigned swizzle_x,
+               unsigned swizzle_y,
+               unsigned swizzle_z,
+               unsigned swizzle_w)
+{  /* Returns a copy of reg flagged as predicated, carrying the given negate flag and swizzles. */
+   assert(reg.File != TGSI_FILE_NULL);   /* precondition: not the NULL-file register */
+   reg.Predicate = 1;                     /* turn the predicate flag on */
+   reg.PredNegate = negate;
+   reg.PredSwizzleX = swizzle_x;          /* per-channel predicate swizzles, stored as given */
+   reg.PredSwizzleY = swizzle_y;
+   reg.PredSwizzleZ = swizzle_z;
+   reg.PredSwizzleW = swizzle_w;
+   return reg;                            /* reg was passed by value: the caller's dst is not modified */
+}
+
 static INLINE struct ureg_dst 
 ureg_dst_indirect( struct ureg_dst reg, struct ureg_src addr )
 {
@@ -501,9 +670,13 @@ ureg_dst( struct ureg_src src )
    dst.IndirectIndex = src.IndirectIndex;
    dst.IndirectSwizzle = src.IndirectSwizzle;
    dst.Saturate  = 0;
+   dst.Predicate = 0;
+   dst.PredNegate = 0;
+   dst.PredSwizzleX = TGSI_SWIZZLE_X;
+   dst.PredSwizzleY = TGSI_SWIZZLE_Y;
+   dst.PredSwizzleZ = TGSI_SWIZZLE_Z;
+   dst.PredSwizzleW = TGSI_SWIZZLE_W;
    dst.Index     = src.Index;
-   dst.Pad1      = 0;
-   dst.Pad2      = 0;
 
    return dst;
 }
@@ -542,9 +715,13 @@ ureg_dst_undef( void )
    dst.IndirectIndex = 0;
    dst.IndirectSwizzle = 0;
    dst.Saturate  = 0;
+   dst.Predicate = 0;
+   dst.PredNegate = 0;
+   dst.PredSwizzleX = TGSI_SWIZZLE_X;
+   dst.PredSwizzleY = TGSI_SWIZZLE_Y;
+   dst.PredSwizzleZ = TGSI_SWIZZLE_Z;
+   dst.PredSwizzleW = TGSI_SWIZZLE_W;
    dst.Index     = 0;
-   dst.Pad1      = 0;
-   dst.Pad2      = 0;
 
    return dst;
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 71f8a6ca401..4dee1be9e8c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -69,59 +69,15 @@ tgsi_util_get_src_register_swizzle(
    return 0;
 }
 
-unsigned
-tgsi_util_get_src_register_extswizzle(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      return reg->ExtSwizzleX;
-   case 1:
-      return reg->ExtSwizzleY;
-   case 2:
-      return reg->ExtSwizzleZ;
-   case 3:
-      return reg->ExtSwizzleW;
-   default:
-      assert( 0 );
-   }
-   return 0;
-}
 
 unsigned
-tgsi_util_get_full_src_register_extswizzle(
+tgsi_util_get_full_src_register_swizzle(
    const struct tgsi_full_src_register  *reg,
    unsigned component )
 {
-   unsigned swizzle;
-
-   /*
-    * First, calculate the extended swizzle for a given channel. This will give
-    * us either a channel index into the simple swizzle or a constant 1 or 0.
-    */
-   swizzle = tgsi_util_get_src_register_extswizzle(
-      &reg->SrcRegisterExtSwz,
+   return tgsi_util_get_src_register_swizzle(
+      &reg->SrcRegister,
       component );
-
-   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
-   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
-   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
-   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
-
-   /*
-    * Second, calculate the simple swizzle for the unswizzled channel index.
-    * Leave the constants intact, they are not affected by the simple swizzle.
-    */
-   if( swizzle <= TGSI_SWIZZLE_W ) {
-      swizzle = tgsi_util_get_src_register_swizzle(
-         &reg->SrcRegister,
-         swizzle );
-   }
-
-   return swizzle;
 }
 
 void
@@ -148,74 +104,6 @@ tgsi_util_set_src_register_swizzle(
    }
 }
 
-void
-tgsi_util_set_src_register_extswizzle(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned swizzle,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      reg->ExtSwizzleX = swizzle;
-      break;
-   case 1:
-      reg->ExtSwizzleY = swizzle;
-      break;
-   case 2:
-      reg->ExtSwizzleZ = swizzle;
-      break;
-   case 3:
-      reg->ExtSwizzleW = swizzle;
-      break;
-   default:
-      assert( 0 );
-   }
-}
-
-unsigned
-tgsi_util_get_src_register_extnegate(
-   const  struct tgsi_src_register_ext_swz *reg,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      return reg->NegateX;
-   case 1:
-      return reg->NegateY;
-   case 2:
-      return reg->NegateZ;
-   case 3:
-      return reg->NegateW;
-   default:
-      assert( 0 );
-   }
-   return 0;
-}
-
-void
-tgsi_util_set_src_register_extnegate(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned negate,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      reg->NegateX = negate;
-      break;
-   case 1:
-      reg->NegateY = negate;
-      break;
-   case 2:
-      reg->NegateZ = negate;
-      break;
-   case 3:
-      reg->NegateW = negate;
-      break;
-   default:
-      assert( 0 );
-   }
-}
-
 unsigned
 tgsi_util_get_full_src_register_sign_mode(
    const struct  tgsi_full_src_register *reg,
@@ -239,9 +127,7 @@ tgsi_util_get_full_src_register_sign_mode(
       unsigned negate;
 
       negate = reg->SrcRegister.Negate;
-      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
-         negate = !negate;
-      }
+
       if( reg->SrcRegisterExtMod.Negate ) {
          negate = !negate;
       }
@@ -262,11 +148,6 @@ tgsi_util_set_full_src_register_sign_mode(
    struct tgsi_full_src_register *reg,
    unsigned sign_mode )
 {
-   reg->SrcRegisterExtSwz.NegateX = 0;
-   reg->SrcRegisterExtSwz.NegateY = 0;
-   reg->SrcRegisterExtSwz.NegateZ = 0;
-   reg->SrcRegisterExtSwz.NegateW = 0;
-
    switch (sign_mode)
    {
    case TGSI_UTIL_SIGN_CLEAR:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h
index 21eb656327e..19ee2e7cf2a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -33,7 +33,6 @@ extern "C" {
 #endif
 
 struct tgsi_src_register;
-struct tgsi_src_register_ext_swz;
 struct tgsi_full_src_register;
 
 void *
@@ -45,13 +44,9 @@ tgsi_util_get_src_register_swizzle(
    const struct tgsi_src_register *reg,
    unsigned component );
 
-unsigned
-tgsi_util_get_src_register_extswizzle(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component);
 
 unsigned
-tgsi_util_get_full_src_register_extswizzle(
+tgsi_util_get_full_src_register_swizzle(
    const struct tgsi_full_src_register *reg,
    unsigned component );
 
@@ -61,23 +56,6 @@ tgsi_util_set_src_register_swizzle(
    unsigned swizzle,
    unsigned component );
 
-void
-tgsi_util_set_src_register_extswizzle(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned swizzle,
-   unsigned component );
-
-unsigned
-tgsi_util_get_src_register_extnegate(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component );
-
-void
-tgsi_util_set_src_register_extnegate(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned negate,
-   unsigned component );
-
 #define TGSI_UTIL_SIGN_CLEAR    0   /* Force positive */
 #define TGSI_UTIL_SIGN_SET      1   /* Force negative */
 #define TGSI_UTIL_SIGN_TOGGLE   2   /* Negate */
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index ae8d330a787..1d8bb55bbd6 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
 	u_debug_stack.c \
 	u_blit.c \
 	u_cache.c \
+	u_cpu_detect.c \
 	u_draw_quad.c \
 	u_format.c \
 	u_format_access.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index 28a5ab42569..8d99106d0b8 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -24,10 +24,10 @@ util = env.ConvenienceLibrary(
 		'u_bitmask.c',
 		'u_blit.c',
 		'u_cache.c',
+		'u_cpu_detect.c',
 		'u_debug.c',
 		'u_debug_dump.c',
 		'u_debug_memory.c',
-		'u_debug_profile.c',
 		'u_debug_stack.c',
 		'u_debug_symbol.c',
 		'u_draw_quad.c',
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index c516317d701..50386425995 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -46,6 +46,7 @@
 #include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_surface.h"
+#include "util/u_rect.h"
 
 #include "cso_cache/cso_context.h"
 
@@ -182,47 +183,7 @@ get_next_slot( struct blit_state *ctx )
 }
                                
 
-/**
- * Setup vertex data for the textured quad we'll draw.
- * Note: y=0=top
- */
-static unsigned
-setup_vertex_data(struct blit_state *ctx,
-                  float x0, float y0, float x1, float y1, float z)
-{
-   unsigned offset;
-
-   ctx->vertices[0][0][0] = x0;
-   ctx->vertices[0][0][1] = y0;
-   ctx->vertices[0][0][2] = z;
-   ctx->vertices[0][1][0] = 0.0f; /*s*/
-   ctx->vertices[0][1][1] = 0.0f; /*t*/
-
-   ctx->vertices[1][0][0] = x1;
-   ctx->vertices[1][0][1] = y0;
-   ctx->vertices[1][0][2] = z;
-   ctx->vertices[1][1][0] = 1.0f; /*s*/
-   ctx->vertices[1][1][1] = 0.0f; /*t*/
-
-   ctx->vertices[2][0][0] = x1;
-   ctx->vertices[2][0][1] = y1;
-   ctx->vertices[2][0][2] = z;
-   ctx->vertices[2][1][0] = 1.0f;
-   ctx->vertices[2][1][1] = 1.0f;
 
-   ctx->vertices[3][0][0] = x0;
-   ctx->vertices[3][0][1] = y1;
-   ctx->vertices[3][0][2] = z;
-   ctx->vertices[3][1][0] = 0.0f;
-   ctx->vertices[3][1][1] = 1.0f;
-
-   offset = get_next_slot( ctx );
-
-   pipe_buffer_write(ctx->pipe->screen, ctx->vbuf,
-                     offset, sizeof(ctx->vertices), ctx->vertices);
-
-   return offset;
-}
 
 
 /**
@@ -315,15 +276,13 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 {
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_screen *screen = pipe->screen;
-   struct pipe_texture texTemp, *tex;
-   struct pipe_surface *texSurf;
+   struct pipe_texture *tex = NULL;
    struct pipe_framebuffer_state fb;
    const int srcW = abs(srcX1 - srcX0);
    const int srcH = abs(srcY1 - srcY0);
-   const int srcLeft = MIN2(srcX0, srcX1);
-   const int srcTop = MIN2(srcY0, srcY1);
    unsigned offset;
    boolean overlap;
+   float s0, t0, s1, t1;
 
    assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
           filter == PIPE_TEX_MIPFILTER_LINEAR);
@@ -343,7 +302,8 @@ util_blit_pixels_writemask(struct blit_state *ctx,
     * no overlapping.
     * Filter mode should not matter since there's no stretching.
     */
-   if (dst->format == src->format &&
+   if (pipe->surface_copy &&
+       dst->format == src->format &&
        srcX0 < srcX1 &&
        dstX0 < dstX1 &&
        srcY0 < srcY1 &&
@@ -358,54 +318,83 @@ util_blit_pixels_writemask(struct blit_state *ctx,
       return;
    }
    
-   if (srcLeft != srcX0) {
-      /* left-right flip */
-      int tmp = dstX0;
-      dstX0 = dstX1;
-      dstX1 = tmp;
-   }
-
-   if (srcTop != srcY0) {
-      /* up-down flip */
-      int tmp = dstY0;
-      dstY0 = dstY1;
-      dstY1 = tmp;
-   }
-
    assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D,
                                       PIPE_TEXTURE_USAGE_RENDER_TARGET, 0));
 
-   /*
-    * XXX for now we're always creating a temporary texture.
-    * Strictly speaking that's not always needed.
+   /* Create a temporary texture when src and dest alias or when src
+    * is anything other than a single-level 2d texture.
+    * 
+    * This can still be improved upon.
     */
+   if (util_same_surface(src, dst) ||
+       src->texture->target != PIPE_TEXTURE_2D ||
+       src->texture->last_level != 0)
+   {
+      struct pipe_texture texTemp;
+      struct pipe_surface *texSurf;
+      const int srcLeft = MIN2(srcX0, srcX1);
+      const int srcTop = MIN2(srcY0, srcY1);
+
+      if (srcLeft != srcX0) {
+         /* left-right flip */
+         int tmp = dstX0;
+         dstX0 = dstX1;
+         dstX1 = tmp;
+      }
+
+      if (srcTop != srcY0) {
+         /* up-down flip */
+         int tmp = dstY0;
+         dstY0 = dstY1;
+         dstY1 = tmp;
+      }
+
+      /* create temp texture */
+      memset(&texTemp, 0, sizeof(texTemp));
+      texTemp.target = PIPE_TEXTURE_2D;
+      texTemp.format = src->format;
+      texTemp.last_level = 0;
+      texTemp.width[0] = srcW;
+      texTemp.height[0] = srcH;
+      texTemp.depth[0] = 1;
+      pf_get_block(src->format, &texTemp.block);
+
+      tex = screen->texture_create(screen, &texTemp);
+      if (!tex)
+         return;
+
+      texSurf = screen->get_tex_surface(screen, tex, 0, 0, 0, 
+                                        PIPE_BUFFER_USAGE_GPU_WRITE);
+
+      /* load temp texture */
+      if (pipe->surface_copy) {
+         pipe->surface_copy(pipe,
+                            texSurf, 0, 0,   /* dest */
+                            src, srcLeft, srcTop, /* src */
+                            srcW, srcH);     /* size */
+      } else {
+         util_surface_copy(pipe, FALSE,
+                           texSurf, 0, 0,   /* dest */
+                           src, srcLeft, srcTop, /* src */
+                           srcW, srcH);     /* size */
+      }
+
+      /* free the surface, update the texture if necessary.
+       */
+      pipe_surface_reference(&texSurf, NULL);
+      s0 = 0.0f; 
+      s1 = 1.0f;
+      t0 = 0.0f;
+      t1 = 1.0f;
+   }
+   else {
+      pipe_texture_reference(&tex, src->texture);
+      s0 = srcX0 / (float)tex->width[0];
+      s1 = srcX1 / (float)tex->width[0];
+      t0 = srcY0 / (float)tex->height[0];
+      t1 = srcY1 / (float)tex->height[0];
+   }
 
-   /* create temp texture */
-   memset(&texTemp, 0, sizeof(texTemp));
-   texTemp.target = PIPE_TEXTURE_2D;
-   texTemp.format = src->format;
-   texTemp.last_level = 0;
-   texTemp.width[0] = srcW;
-   texTemp.height[0] = srcH;
-   texTemp.depth[0] = 1;
-   pf_get_block(src->format, &texTemp.block);
-
-   tex = screen->texture_create(screen, &texTemp);
-   if (!tex)
-      return;
-
-   texSurf = screen->get_tex_surface(screen, tex, 0, 0, 0, 
-                                     PIPE_BUFFER_USAGE_GPU_WRITE);
-
-   /* load temp texture */
-   pipe->surface_copy(pipe,
-                      texSurf, 0, 0,   /* dest */
-                      src, srcLeft, srcTop, /* src */
-                      srcW, srcH);     /* size */
-
-   /* free the surface, update the texture if necessary.
-    */
-   pipe_surface_reference(&texSurf, NULL);
 
    /* save state (restored below) */
    cso_save_blend(ctx->cso);
@@ -447,9 +436,12 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    cso_set_framebuffer(ctx->cso, &fb);
 
    /* draw quad */
-   offset = setup_vertex_data(ctx,
-                              (float) dstX0, (float) dstY0, 
-                              (float) dstX1, (float) dstY1, z);
+   offset = setup_vertex_data_tex(ctx,
+                                  (float) dstX0, (float) dstY0, 
+                                  (float) dstX1, (float) dstY1,
+                                  s0, t0,
+                                  s1, t1,
+                                  z);
 
    util_draw_vertex_buffer(ctx->pipe, ctx->vbuf, offset,
                            PIPE_PRIM_TRIANGLE_FAN,
diff --git a/src/gallium/auxiliary/util/u_clear.h b/src/gallium/auxiliary/util/u_clear.h
index 7c16b32cf9f..1e65a035aed 100644
--- a/src/gallium/auxiliary/util/u_clear.h
+++ b/src/gallium/auxiliary/util/u_clear.h
@@ -32,6 +32,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
 #include "util/u_pack_color.h"
+#include "util/u_rect.h"
 
 
 /**
@@ -48,13 +49,22 @@ util_clear(struct pipe_context *pipe,
       unsigned color;
 
       util_pack_color(rgba, ps->format, &color);
-      pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color);
+      if (pipe->surface_fill) {
+         pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color);
+      } else {
+         util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color);
+      }
    }
 
    if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
       struct pipe_surface *ps = framebuffer->zsbuf;
 
-      pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height,
-                         util_pack_z_stencil(ps->format, depth, stencil));
+      if (pipe->surface_fill) {
+         pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height,
+                            util_pack_z_stencil(ps->format, depth, stencil));
+      } else {
+         util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height,
+                           util_pack_z_stencil(ps->format, depth, stencil));
+      }
    }
 }
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index f78706f4470..a08241971ca 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -24,23 +24,21 @@
  * 
  **************************************************************************/
 
-/*
- * Based on the work of Eric Anholt <anholt@FreeBSD.org>
+/**
+ * @file
+ * CPU feature detection.
+ *
+ * @author Dennis Smit
+ * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
  */
 
-/* FIXME: clean this entire file up */
+#include "pipe/p_config.h"
 
+#include "u_debug.h"
 #include "u_cpu_detect.h"
 
-#ifdef __linux__
-#define OS_LINUX
-#endif
-#ifdef WIN32
-#define OS_WIN32
-#endif
-
-#if defined(ARCH_POWERPC)
-#if defined(OS_DARWIN)
+#if defined(PIPE_ARCH_PPC)
+#if defined(PIPE_OS_DARWIN)
 #include <sys/sysctl.h>
 #else
 #include <signal.h>
@@ -48,140 +46,147 @@
 #endif
 #endif
 
-#if defined(OS_NETBSD) || defined(OS_OPENBSD)
+#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
 #include <sys/param.h>
 #include <sys/sysctl.h>
 #include <machine/cpu.h>
 #endif
 
-#if defined(OS_FREEBSD)
+#if defined(PIPE_OS_FREEBSD)
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
 
-#if defined(OS_LINUX)
+#if defined(PIPE_OS_LINUX)
 #include <signal.h>
 #endif
 
-#if defined(OS_WIN32)
-#include <windows.h>
+#ifdef PIPE_OS_UNIX
+#include <unistd.h>
 #endif
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
+#if defined(PIPE_OS_WINDOWS)
+#include <windows.h>
+#if defined(PIPE_CC_MSVC)
+#include <intrin.h>
+#endif
+#endif
 
 
-static struct cpu_detect_caps __cpu_detect_caps;
-static int __cpu_detect_initialized = 0;
+struct util_cpu_caps util_cpu_caps;
 
 static int has_cpuid(void);
-static int cpuid(unsigned int ax, unsigned int *p);
+
+#if defined(PIPE_ARCH_X86)
 
 /* The sigill handlers */
-#if defined(ARCH_X86) /* x86 (linux katmai handler check thing) */
-#if defined(OS_LINUX) && defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)
-static void sigill_handler_sse(int signal, struct sigcontext sc)
+#if defined(PIPE_OS_LINUX) /*&& defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)*/
+static void
+sigill_handler_sse(int signal, struct sigcontext sc)
 {
-	/* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1"
-	 * instructions are 3 bytes long.  We must increment the instruction
-	 * pointer manually to avoid repeated execution of the offending
-	 * instruction.
-	 *
-	 * If the SIGILL is caused by a divide-by-zero when unmasked
-	 * exceptions aren't supported, the SIMD FPU status and control
-	 * word will be restored at the end of the test, so we don't need
-	 * to worry about doing it here.  Besides, we may not be able to...
-	 */
-	sc.eip += 3;
-
-	__cpu_detect_caps.hasSSE=0;
+   /* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1"
+    * instructions are 3 bytes long.  We must increment the instruction
+    * pointer manually to avoid repeated execution of the offending
+    * instruction.
+    *
+    * If the SIGILL is caused by a divide-by-zero when unmasked
+    * exceptions aren't supported, the SIMD FPU status and control
+    * word will be restored at the end of the test, so we don't need
+    * to worry about doing it here.  Besides, we may not be able to...
+    */
+   sc.eip += 3;
+
+   util_cpu_caps.has_sse=0;
 }
 
-static void sigfpe_handler_sse(int signal, struct sigcontext sc)
+static void
+sigfpe_handler_sse(int signal, struct sigcontext sc)
 {
-	if (sc.fpstate->magic != 0xffff) {
-		/* Our signal context has the extended FPU state, so reset the
-		 * divide-by-zero exception mask and clear the divide-by-zero
-		 * exception bit.
-		 */
-		sc.fpstate->mxcsr |= 0x00000200;
-		sc.fpstate->mxcsr &= 0xfffffffb;
-	} else {
-		/* If we ever get here, we're completely hosed.
-		*/
-	}
+   if (sc.fpstate->magic != 0xffff) {
+      /* Our signal context has the extended FPU state, so reset the
+       * divide-by-zero exception mask and clear the divide-by-zero
+       * exception bit.
+       */
+      sc.fpstate->mxcsr |= 0x00000200;
+      sc.fpstate->mxcsr &= 0xfffffffb;
+   } else {
+      /* If we ever get here, we're completely hosed.
+      */
+   }
 }
-#endif
-#endif /* OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */
+#endif /* PIPE_OS_LINUX */
 
-#if defined(OS_WIN32)
-LONG CALLBACK win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
+#if defined(PIPE_OS_WINDOWS)
+static LONG CALLBACK
+win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
 {
-	if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){
-		ep->ContextRecord->Eip +=3;
-		__cpu_detect_caps.hasSSE=0;
-		return EXCEPTION_CONTINUE_EXECUTION;
-	}
-	return EXCEPTION_CONTINUE_SEARCH;
+   if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){
+      ep->ContextRecord->Eip +=3;
+      util_cpu_caps.has_sse=0;
+      return EXCEPTION_CONTINUE_EXECUTION;
+   }
+   return EXCEPTION_CONTINUE_SEARCH;
 }
-#endif /* OS_WIN32 */
+#endif /* PIPE_OS_WINDOWS */
 
+#endif /* PIPE_ARCH_X86 */
 
-#if defined(ARCH_POWERPC) && !defined(OS_DARWIN)
-static sigjmp_buf __lv_powerpc_jmpbuf;
-static volatile sig_atomic_t __lv_powerpc_canjump = 0;
 
-static void sigill_handler (int sig);
+#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_DARWIN)
+static jmp_buf  __lv_powerpc_jmpbuf;
+static volatile sig_atomic_t __lv_powerpc_canjump = 0;
 
-static void sigill_handler (int sig)
+static void
+sigill_handler(int sig)
 {
-	if (!__lv_powerpc_canjump) {
-		signal (sig, SIG_DFL);
-		raise (sig);
-	}
+   if (!__lv_powerpc_canjump) {
+      signal (sig, SIG_DFL);
+      raise (sig);
+   }
 
-	__lv_powerpc_canjump = 0;
-	siglongjmp(__lv_powerpc_jmpbuf, 1);
+   __lv_powerpc_canjump = 0;
+   longjmp(__lv_powerpc_jmpbuf, 1);
 }
+#endif
 
-static void check_os_altivec_support(void)
+#if defined(PIPE_ARCH_PPC)
+static void
+check_os_altivec_support(void)
 {
-#if defined(OS_DARWIN)
-	int sels[2] = {CTL_HW, HW_VECTORUNIT};
-	int has_vu = 0;
-	int len = sizeof (has_vu);
-	int err;
-
-	err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
-
-	if (err == 0) {
-		if (has_vu != 0) {
-			__cpu_detect_caps.hasAltiVec = 1;
-		}
-	}
-#else /* !OS_DARWIN */
-	/* no Darwin, do it the brute-force way */
-	/* this is borrowed from the libmpeg2 library */
-	signal(SIGILL, sigill_handler);
-	if (sigsetjmp(__lv_powerpc_jmpbuf, 1)) {
-		signal(SIGILL, SIG_DFL);
-	} else {
-		__lv_powerpc_canjump = 1;
-
-		__asm __volatile
-			("mtspr 256, %0\n\t"
-			 "vand %%v0, %%v0, %%v0"
-			 :
-			 : "r" (-1));
-
-		signal(SIGILL, SIG_DFL);
-		__cpu_detect_caps.hasAltiVec = 1;
-	}
-#endif
+#if defined(PIPE_OS_DARWIN)
+   int sels[2] = {CTL_HW, HW_VECTORUNIT};
+   int has_vu = 0;
+   int len = sizeof (has_vu);
+   int err;
+
+   err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+   if (err == 0) {
+      if (has_vu != 0) {
+         util_cpu_caps.has_altivec = 1;
+      }
+   }
+#else /* !PIPE_OS_DARWIN */
+   /* no Darwin, do it the brute-force way */
+   /* this is borrowed from the libmpeg2 library */
+   signal(SIGILL, sigill_handler);
+   if (setjmp(__lv_powerpc_jmpbuf)) {
+      signal(SIGILL, SIG_DFL);
+   } else {
+      __lv_powerpc_canjump = 1;
+
+      __asm __volatile
+         ("mtspr 256, %0\n\t"
+          "vand %%v0, %%v0, %%v0"
+          :
+          : "r" (-1));
+
+      signal(SIGILL, SIG_DFL);
+      util_cpu_caps.has_altivec = 1;
+   }
+#endif /* PIPE_OS_DARWIN */
 }
-#endif
+#endif /* PIPE_ARCH_PPC */
 
 /* If we're running on a processor that can do SSE, let's see if we
  * are allowed to or not.  This will catch 2.4.0 or later kernels that
@@ -189,318 +194,327 @@ static void check_os_altivec_support(void)
  * and RedHat patched 2.2 kernels that have broken exception handling
  * support for user space apps that do SSE.
  */
-static void check_os_katmai_support(void)
+#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
+static void
+check_os_katmai_support(void)
 {
-#if defined(ARCH_X86)
-#if defined(OS_FREEBSD)
-	int has_sse=0, ret;
-	int len = sizeof (has_sse);
-
-	ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
-	if (ret || !has_sse)
-		__cpu_detect_caps.hasSSE=0;
-
-#elif defined(OS_NETBSD) || defined(OS_OPENBSD)
-	int has_sse, has_sse2, ret, mib[2];
-	int varlen;
-
-	mib[0] = CTL_MACHDEP;
-	mib[1] = CPU_SSE;
-	varlen = sizeof (has_sse);
-
-	ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
-	if (ret < 0 || !has_sse) {
-		__cpu_detect_caps.hasSSE = 0;
-	} else {
-		__cpu_detect_caps.hasSSE = 1;
-	}
-
-	mib[1] = CPU_SSE2;
-	varlen = sizeof (has_sse2);
-	ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
-	if (ret < 0 || !has_sse2) {
-		__cpu_detect_caps.hasSSE2 = 0;
-	} else {
-		__cpu_detect_caps.hasSSE2 = 1;
-	}
-	__cpu_detect_caps.hasSSE = 0; /* FIXME ?!?!? */
-
-#elif defined(OS_WIN32)
-	LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
-	if (__cpu_detect_caps.hasSSE) {
-		exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
-		__asm __volatile ("xorps %xmm0, %xmm0");
-		SetUnhandledExceptionFilter(exc_fil);
-	}
-#elif defined(OS_LINUX)
-	struct sigaction saved_sigill;
-	struct sigaction saved_sigfpe;
-
-	/* Save the original signal handlers.
-	*/
-	sigaction(SIGILL, NULL, &saved_sigill);
-	sigaction(SIGFPE, NULL, &saved_sigfpe);
-
-	signal(SIGILL, (void (*)(int))sigill_handler_sse);
-	signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
-
-	/* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
-	 * supports the extended FPU save and restore required for SSE.  If
-	 * we execute an SSE instruction on a PIII and get a SIGILL, the OS
-	 * doesn't support Streaming SIMD Exceptions, even if the processor
-	 * does.
-	 */
-	if (__cpu_detect_caps.hasSSE) {
-		__asm __volatile ("xorps %xmm1, %xmm0");
-	}
-
-	/* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
-	 * it supports unmasked SIMD FPU exceptions.  If we unmask the
-	 * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
-	 * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
-	 * as expected, we're okay but we need to clean up after it.
-	 *
-	 * Are we being too stringent in our requirement that the OS support
-	 * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
-	 * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
-	 * doesn't even support them.  We at least know the user-space SSE
-	 * support is good in kernels that do support unmasked exceptions,
-	 * and therefore to be safe I'm going to leave this test in here.
-	 */
-	if (__cpu_detect_caps.hasSSE) {
-           /* test_os_katmai_exception_support(); */
-	}
-
-	/* Restore the original signal handlers.
-	*/
-	sigaction(SIGILL, &saved_sigill, NULL);
-	sigaction(SIGFPE, &saved_sigfpe, NULL);
+#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_OS_FREEBSD)
+   int has_sse=0, ret;
+   int len = sizeof (has_sse);
+
+   ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
+   if (ret || !has_sse)
+      util_cpu_caps.has_sse=0;
+
+#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
+   int has_sse, has_sse2, ret, mib[2];
+   int varlen;
+
+   mib[0] = CTL_MACHDEP;
+   mib[1] = CPU_SSE;
+   varlen = sizeof (has_sse);
+
+   ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
+   if (ret < 0 || !has_sse) {
+      util_cpu_caps.has_sse = 0;
+   } else {
+      util_cpu_caps.has_sse = 1;
+   }
+
+   mib[1] = CPU_SSE2;
+   varlen = sizeof (has_sse2);
+   ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
+   if (ret < 0 || !has_sse2) {
+      util_cpu_caps.has_sse2 = 0;
+   } else {
+      util_cpu_caps.has_sse2 = 1;
+   }
+   util_cpu_caps.has_sse = 0; /* FIXME ?!?!? */
+
+#elif defined(PIPE_OS_WINDOWS)
+   LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
+   if (util_cpu_caps.has_sse) {
+      exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
+#if defined(PIPE_CC_GCC)
+      __asm __volatile ("xorps %xmm0, %xmm0");
+#elif defined(PIPE_CC_MSVC)
+      __asm {
+          xorps xmm0, xmm0        /* executing SSE instruction */
+      }
+#else
+#error Unsupported compiler
+#endif
+      SetUnhandledExceptionFilter(exc_fil);
+   }
+#elif defined(PIPE_OS_LINUX)
+   struct sigaction saved_sigill;
+   struct sigaction saved_sigfpe;
+
+   /* Save the original signal handlers.
+   */
+   sigaction(SIGILL, NULL, &saved_sigill);
+   sigaction(SIGFPE, NULL, &saved_sigfpe);
+
+   signal(SIGILL, (void (*)(int))sigill_handler_sse);
+   signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
+
+   /* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
+    * supports the extended FPU save and restore required for SSE.  If
+    * we execute an SSE instruction on a PIII and get a SIGILL, the OS
+    * doesn't support Streaming SIMD Exceptions, even if the processor
+    * does.
+    */
+   if (util_cpu_caps.has_sse) {
+      __asm __volatile ("xorps %xmm1, %xmm0");
+   }
+
+   /* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
+    * it supports unmasked SIMD FPU exceptions.  If we unmask the
+    * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
+    * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
+    * as expected, we're okay but we need to clean up after it.
+    *
+    * Are we being too stringent in our requirement that the OS support
+    * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
+    * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
+    * doesn't even support them.  We at least know the user-space SSE
+    * support is good in kernels that do support unmasked exceptions,
+    * and therefore to be safe I'm going to leave this test in here.
+    */
+   if (util_cpu_caps.has_sse) {
+      /* test_os_katmai_exception_support(); */
+   }
+
+   /* Restore the original signal handlers.
+   */
+   sigaction(SIGILL, &saved_sigill, NULL);
+   sigaction(SIGFPE, &saved_sigfpe, NULL);
 
 #else
-	/* We can't use POSIX signal handling to test the availability of
-	 * SSE, so we disable it by default.
-	 */
-	__cpu_detect_caps.hasSSE = 0;
+   /* We can't use POSIX signal handling to test the availability of
+    * SSE, so we disable it by default.
+    */
+   util_cpu_caps.has_sse = 0;
 #endif /* __linux__ */
 #endif
+
+#if defined(PIPE_ARCH_X86_64)
+   util_cpu_caps.has_sse = 1;
+#endif
 }
 
 
 static int has_cpuid(void)
 {
-#if defined(ARCH_X86)
-	int a, c;
-
-	__asm __volatile
-		("pushf\n"
-		 "popl %0\n"
-		 "movl %0, %1\n"
-		 "xorl $0x200000, %0\n"
-		 "push %0\n"
-		 "popf\n"
-		 "pushf\n"
-		 "popl %0\n"
-		 : "=a" (a), "=c" (c)
-		 :
-		 : "cc");
-
-	return a != c;
+#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_CC_GCC)
+   int a, c;
+
+   __asm __volatile
+      ("pushf\n"
+       "popl %0\n"
+       "movl %0, %1\n"
+       "xorl $0x200000, %0\n"
+       "push %0\n"
+       "popf\n"
+       "pushf\n"
+       "popl %0\n"
+       : "=a" (a), "=c" (c)
+       :
+       : "cc");
+
+   return a != c;
+#else
+   /* FIXME */
+   return 1;
+#endif
+#elif defined(PIPE_ARCH_X86_64)
+   return 1;
 #else
-	return 0;
+   return 0;
 #endif
 }
 
-static int cpuid(unsigned int ax, unsigned int *p)
+
+/**
+ * @sa cpuid.h included in gcc-4.3 onwards.
+ * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
+ */
+static INLINE void
+cpuid(uint32_t ax, uint32_t *p)
 {
-#if defined(ARCH_X86)
-	unsigned int flags;
-
-	__asm __volatile
-		("movl %%ebx, %%esi\n\t"
-		 "cpuid\n\t"
-		 "xchgl %%ebx, %%esi"
-		 : "=a" (p[0]), "=S" (p[1]),
-		 "=c" (p[2]), "=d" (p[3])
-		 : "0" (ax));
-
-	return 0;
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
+   __asm __volatile (
+     "xchgl %%ebx, %1\n\t"
+     "cpuid\n\t"
+     "xchgl %%ebx, %1"
+     : "=a" (p[0]),
+       "=S" (p[1]),
+       "=c" (p[2]),
+       "=d" (p[3])
+     : "0" (ax)
+   );
+#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
+   __asm __volatile (
+     "cpuid\n\t"
+     : "=a" (p[0]),
+       "=b" (p[1]),
+       "=c" (p[2]),
+       "=d" (p[3])
+     : "0" (ax)
+   );
+#elif defined(PIPE_CC_MSVC)
+   __cpuid(p, ax);
 #else
-	return -1;
+   p[0] = 0;
+   p[1] = 0;
+   p[2] = 0;
+   p[3] = 0;
 #endif
 }
+#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
 
-void cpu_detect_initialize()
+void
+util_cpu_detect(void)
 {
-	unsigned int regs[4];
-	unsigned int regs2[4];
-
-	int mib[2], ncpu;
-	int len;
-
-	memset(&__cpu_detect_caps, 0, sizeof (struct cpu_detect_caps));
-
-	/* Check for arch type */
-#if defined(ARCH_MIPS)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_MIPS;
-#elif defined(ARCH_ALPHA)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_ALPHA;
-#elif defined(ARCH_SPARC)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_SPARC;
-#elif defined(ARCH_X86)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_X86;
-#elif defined(ARCH_POWERPC)
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_POWERPC;
+   static boolean util_cpu_detect_initialized = FALSE;
+
+   if(util_cpu_detect_initialized)
+      return;
+
+   memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
+
+   /* Check for arch type */
+#if defined(PIPE_ARCH_MIPS)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_MIPS;
+#elif defined(PIPE_ARCH_ALPHA)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_ALPHA;
+#elif defined(PIPE_ARCH_SPARC)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_SPARC;
+#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_X86;
+   util_cpu_caps.little_endian = 1;
+#elif defined(PIPE_ARCH_PPC)
+   util_cpu_caps.arch = UTIL_CPU_ARCH_POWERPC;
+   util_cpu_caps.little_endian = 0;
 #else
-	__cpu_detect_caps.type = CPU_DETECT_TYPE_OTHER;
+   util_cpu_caps.arch = UTIL_CPU_ARCH_UNKNOWN;
 #endif
 
-	/* Count the number of CPUs in system */
-#if !defined(OS_WIN32) && !defined(OS_UNKNOWN) && defined(_SC_NPROCESSORS_ONLN)
-	__cpu_detect_caps.nrcpu = sysconf(_SC_NPROCESSORS_ONLN);
-	if (__cpu_detect_caps.nrcpu == -1)
-		__cpu_detect_caps.nrcpu = 1;
-
-#elif defined(OS_NETBSD) || defined(OS_FREEBSD) || defined(OS_OPENBSD)
-
-	mib[0] = CTL_HW;
-	mib[1] = HW_NCPU;
-
-	len = sizeof (ncpu);
-	sysctl(mib, 2, &ncpu, &len, NULL, 0);
-	__cpu_detect_caps.nrcpu = ncpu;
-
+   /* Count the number of CPUs in system */
+#if defined(PIPE_OS_WINDOWS)
+   {
+      SYSTEM_INFO system_info;
+      GetSystemInfo(&system_info);
+      util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors;
+   }
+#elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
+   util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+   if (util_cpu_caps.nr_cpus == -1)
+      util_cpu_caps.nr_cpus = 1;
+#elif defined(PIPE_OS_BSD)
+   {
+      int mib[2], ncpu;
+      size_t len;
+
+      mib[0] = CTL_HW;
+      mib[1] = HW_NCPU;
+
+      len = sizeof (ncpu);
+      sysctl(mib, 2, &ncpu, &len, NULL, 0);
+      util_cpu_caps.nr_cpus = ncpu;
+   }
 #else
-	__cpu_detect_caps.nrcpu = 1;
+   util_cpu_caps.nr_cpus = 1;
 #endif
 
-#if defined(ARCH_X86)
-	/* No cpuid, old 486 or lower */
-	if (has_cpuid() == 0)
-		return;
-
-	__cpu_detect_caps.cacheline = 32;
-
-	/* Get max cpuid level */
-	cpuid(0x00000000, regs);
-
-	if (regs[0] >= 0x00000001) {
-		unsigned int cacheline;
-
-		cpuid (0x00000001, regs2);
-
-		__cpu_detect_caps.x86cpuType = (regs2[0] >> 8) & 0xf;
-		if (__cpu_detect_caps.x86cpuType == 0xf)
-		    __cpu_detect_caps.x86cpuType = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */
-
-		/* general feature flags */
-		__cpu_detect_caps.hasTSC  = (regs2[3] & (1 << 8  )) >>  8; /* 0x0000010 */
-		__cpu_detect_caps.hasMMX  = (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
-		__cpu_detect_caps.hasSSE  = (regs2[3] & (1 << 25 )) >> 25; /* 0x2000000 */
-		__cpu_detect_caps.hasSSE2 = (regs2[3] & (1 << 26 )) >> 26; /* 0x4000000 */
-		__cpu_detect_caps.hasSSE3 = (regs2[2] & (1));	       /* 0x0000001 */
-		__cpu_detect_caps.hasSSSE3 = (regs2[2] & (1 << 9 )) >> 9;   /* 0x0000020 */
-		__cpu_detect_caps.hasMMX2 = __cpu_detect_caps.hasSSE; /* SSE cpus supports mmxext too */
-
-		cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
-		if (cacheline > 0)
-			__cpu_detect_caps.cacheline = cacheline;
-	}
-
-	cpuid(0x80000000, regs);
-
-	if (regs[0] >= 0x80000001) {
-
-		cpuid(0x80000001, regs2);
-
-		__cpu_detect_caps.hasMMX  |= (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
-		__cpu_detect_caps.hasMMX2 |= (regs2[3] & (1 << 22 )) >> 22; /* 0x400000 */
-		__cpu_detect_caps.has3DNow    = (regs2[3] & (1 << 31 )) >> 31; /* 0x80000000 */
-		__cpu_detect_caps.has3DNowExt = (regs2[3] & (1 << 30 )) >> 30;
-	}
-
-	if (regs[0] >= 0x80000006) {
-		cpuid(0x80000006, regs2);
-		__cpu_detect_caps.cacheline = regs2[2] & 0xFF;
-	}
-
-
-#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_CYGWIN) || defined(OS_OPENBSD)
-	if (__cpu_detect_caps.hasSSE)
-		check_os_katmai_support();
-
-	if (!__cpu_detect_caps.hasSSE) {
-		__cpu_detect_caps.hasSSE2 = 0;
-		__cpu_detect_caps.hasSSE3 = 0;
-		__cpu_detect_caps.hasSSSE3 = 0;
-	}
-#else
-	__cpu_detect_caps.hasSSE = 0;
-	__cpu_detect_caps.hasSSE2 = 0;
-	__cpu_detect_caps.hasSSE3 = 0;
-	__cpu_detect_caps.hasSSSE3 = 0;
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if (has_cpuid()) {
+      uint32_t regs[4];
+      uint32_t regs2[4];
+
+      util_cpu_caps.cacheline = 32;
+
+      /* Get max cpuid level */
+      cpuid(0x00000000, regs);
+
+      if (regs[0] >= 0x00000001) {
+         unsigned int cacheline;
+
+         cpuid (0x00000001, regs2);
+
+         util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
+         if (util_cpu_caps.x86_cpu_type == 0xf)
+             util_cpu_caps.x86_cpu_type = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */
+
+         /* general feature flags */
+         util_cpu_caps.has_tsc    = (regs2[3] & (1 << 4  )) >>  4; /* 0x0000010 */
+         util_cpu_caps.has_mmx    = (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
+         util_cpu_caps.has_sse    = (regs2[3] & (1 << 25 )) >> 25; /* 0x2000000 */
+         util_cpu_caps.has_sse2   = (regs2[3] & (1 << 26 )) >> 26; /* 0x4000000 */
+         util_cpu_caps.has_sse3   = (regs2[2] & (1));          /* 0x0000001 */
+         util_cpu_caps.has_ssse3  = (regs2[2] & (1 << 9 )) >> 9;   /* 0x0000200 */
+         util_cpu_caps.has_sse4_1 = (regs2[2] & (1 << 19)) >> 19;
+         util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
+
+         cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
+         if (cacheline > 0)
+            util_cpu_caps.cacheline = cacheline;
+      }
+
+      cpuid(0x80000000, regs);
+
+      if (regs[0] >= 0x80000001) {
+
+         cpuid(0x80000001, regs2);
+
+         util_cpu_caps.has_mmx  |= (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
+         util_cpu_caps.has_mmx2 |= (regs2[3] & (1 << 22 )) >> 22; /* 0x400000 */
+         util_cpu_caps.has_3dnow    = (regs2[3] & (1u << 31 )) >> 31; /* 0x80000000 */
+         util_cpu_caps.has_3dnow_ext = (regs2[3] & (1 << 30 )) >> 30;
+      }
+
+      if (regs[0] >= 0x80000006) {
+         cpuid(0x80000006, regs2);
+         util_cpu_caps.cacheline = regs2[2] & 0xFF;
+      }
+
+      if (util_cpu_caps.has_sse)
+         check_os_katmai_support();
+
+      if (!util_cpu_caps.has_sse) {
+         util_cpu_caps.has_sse2 = 0;
+         util_cpu_caps.has_sse3 = 0;
+         util_cpu_caps.has_ssse3 = 0;
+         util_cpu_caps.has_sse4_1 = 0;
+      }
+   }
+#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+
+#if defined(PIPE_ARCH_PPC)
+   check_os_altivec_support();
+#endif /* PIPE_ARCH_PPC */
+
+#ifdef DEBUG
+   debug_printf("util_cpu_caps.arch = %i\n", util_cpu_caps.arch);
+   debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
+
+   debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
+   debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
+
+   debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
+   debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
+   debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
+   debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
+   debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
+   debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
+   debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
+   debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
+   debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
+   debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
+   debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
 #endif
-#endif /* ARCH_X86 */
-
-#if defined(ARCH_POWERPC)
-	check_os_altivec_support();
-#endif /* ARCH_POWERPC */
-
-	__cpu_detect_initialized = 1;
-}
-
-struct cpu_detect_caps *cpu_detect_get_caps()
-{
-	return &__cpu_detect_caps;
-}
-
-/* The getters and setters for feature flags */
-int cpu_detect_get_tsc()
-{
-	return __cpu_detect_caps.hasTSC;
-}
-
-int cpu_detect_get_mmx()
-{
-	return __cpu_detect_caps.hasMMX;
-}
-
-int cpu_detect_get_mmx2()
-{
-	return __cpu_detect_caps.hasMMX2;
-}
 
-int cpu_detect_get_sse()
-{
-	return __cpu_detect_caps.hasSSE;
-}
-
-int cpu_detect_get_sse2()
-{
-	return __cpu_detect_caps.hasSSE2;
-}
-
-int cpu_detect_get_sse3()
-{
-	return __cpu_detect_caps.hasSSE3;
-}
-
-int cpu_detect_get_ssse3()
-{
-	return __cpu_detect_caps.hasSSSE3;
+   util_cpu_detect_initialized = TRUE;
 }
-
-int cpu_detect_get_3dnow()
-{
-	return __cpu_detect_caps.has3DNow;
-}
-
-int cpu_detect_get_3dnow2()
-{
-	return __cpu_detect_caps.has3DNowExt;
-}
-
-int cpu_detect_get_altivec()
-{
-	return __cpu_detect_caps.hasAltiVec;
-}
-
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 1612d49286a..4b3dc39c342 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -24,55 +24,55 @@
  *
  ***************************************************************************/
 
-/*
- * Based on the work of Eric Anholt <anholt@FreeBSD.org>
+/**
+ * @file
+ * CPU feature detection.
+ *
+ * @author Dennis Smit
+ * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
  */
 
-#ifndef _CPU_DETECT_H
-#define _CPU_DETECT_H
+#ifndef _UTIL_CPU_DETECT_H
+#define _UTIL_CPU_DETECT_H
+
+#include "pipe/p_compiler.h"
 
-typedef enum {
-	CPU_DETECT_TYPE_MIPS,
-	CPU_DETECT_TYPE_ALPHA,
-	CPU_DETECT_TYPE_SPARC,
-	CPU_DETECT_TYPE_X86,
-	CPU_DETECT_TYPE_POWERPC,
-	CPU_DETECT_TYPE_OTHER
-} cpu_detect_type;
+enum util_cpu_arch {
+   UTIL_CPU_ARCH_UNKNOWN = 0,
+   UTIL_CPU_ARCH_MIPS,
+   UTIL_CPU_ARCH_ALPHA,
+   UTIL_CPU_ARCH_SPARC,
+   UTIL_CPU_ARCH_X86,
+   UTIL_CPU_ARCH_POWERPC
+};
 
-struct cpu_detect_caps {
-	cpu_detect_type	type;
-	int		nrcpu;
+struct util_cpu_caps {
+   enum util_cpu_arch arch;
+   unsigned nr_cpus;
 
-	/* Feature flags */
-	int		x86cpuType;
-	int		cacheline;
+   /* Feature flags */
+   int x86_cpu_type;
+   unsigned cacheline;
 
-	int		hasTSC;
-	int		hasMMX;
-	int		hasMMX2;
-	int		hasSSE;
-	int		hasSSE2;
-	int		hasSSE3;
-	int		hasSSSE3;
-	int		has3DNow;
-	int		has3DNowExt;
-	int		hasAltiVec;
+   unsigned little_endian:1;
+
+   unsigned has_tsc:1;
+   unsigned has_mmx:1;
+   unsigned has_mmx2:1;
+   unsigned has_sse:1;
+   unsigned has_sse2:1;
+   unsigned has_sse3:1;
+   unsigned has_ssse3:1;
+   unsigned has_sse4_1:1;
+   unsigned has_3dnow:1;
+   unsigned has_3dnow_ext:1;
+   unsigned has_altivec:1;
 };
 
-/* prototypes */
-void cpu_detect_initialize(void);
-struct cpu_detect_caps *cpu_detect_get_caps(void);
+extern struct util_cpu_caps
+util_cpu_caps;
+
+void util_cpu_detect(void);
 
-int cpu_detect_get_tsc(void);
-int cpu_detect_get_mmx(void);
-int cpu_detect_get_mmx2(void);
-int cpu_detect_get_sse(void);
-int cpu_detect_get_sse2(void);
-int cpu_detect_get_sse3(void);
-int cpu_detect_get_ssse3(void);
-int cpu_detect_get_3dnow(void);
-int cpu_detect_get_3dnow2(void);
-int cpu_detect_get_altivec(void);
 
-#endif /* _CPU_DETECT_H */
+#endif /* _UTIL_CPU_DETECT_H */
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 1380d98d7ee..abd834c741a 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -65,6 +65,11 @@ extern "C" {
 #define __FUNCTION__ "???"
 #endif
 
+#if defined(__GNUC__)
+#define _util_printf_format(fmt, list) __attribute__ ((format (printf, fmt, list)))
+#else
+#define _util_printf_format(fmt, list)
+#endif
 
 void _debug_vprintf(const char *format, va_list ap);
    
@@ -82,14 +87,17 @@ _debug_printf(const char *format, ...)
 /**
  * Print debug messages.
  *
- * The actual channel used to output debug message is platform specific. To 
- * avoid misformating or truncation, follow these rules of thumb:   
+ * The actual channel used to output debug messages is platform specific. To
+ * avoid misformatting or truncation, follow these rules of thumb:
  * - output whole lines
- * - avoid outputing large strings (512 bytes is the current maximum length 
+ * - avoid outputting large strings (512 bytes is the current maximum length
  * that is guaranteed to be printed in all platforms)
  */
 #if !defined(PIPE_OS_HAIKU)
 static INLINE void
+debug_printf(const char *format, ...) _util_printf_format(1,2);
+
+static INLINE void
 debug_printf(const char *format, ...)
 {
 #ifdef DEBUG
@@ -173,11 +181,14 @@ void _debug_assert_fail(const char *expr,
  * 
  * Do not expect that the assert call terminates -- errors must be handled 
  * regardless of assert behavior.
+ *
+ * For non debug builds the assert macro will expand to a no-op, so do not
+ * call functions with side effects in the assert expression.
  */
 #ifdef DEBUG
 #define debug_assert(expr) ((expr) ? (void)0 : _debug_assert_fail(#expr, __FILE__, __LINE__, __FUNCTION__))
 #else
-#define debug_assert(expr) ((void)(expr))
+#define debug_assert(expr) ((void)0)
 #endif
 
 
@@ -340,17 +351,6 @@ void
 debug_memory_end(unsigned long beginning);
 
 
-#if defined(PROFILE) && defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-
-void
-debug_profile_start(void);
-
-void 
-debug_profile_stop(void);
-
-#endif
-
-
 #ifdef DEBUG
 struct pipe_surface;
 struct pipe_transfer;
diff --git a/src/gallium/auxiliary/util/u_debug_dump.c b/src/gallium/auxiliary/util/u_debug_dump.c
index 6bdecde048e..09866880aea 100644
--- a/src/gallium/auxiliary/util/u_debug_dump.c
+++ b/src/gallium/auxiliary/util/u_debug_dump.c
@@ -187,3 +187,83 @@ debug_dump_func_short_names[] = {
 };
 
 DEFINE_DEBUG_DUMP_CONTINUOUS(func)
+
+
+static const char *
+debug_dump_tex_target_names[] = {
+   "PIPE_TEXTURE_1D",
+   "PIPE_TEXTURE_2D",
+   "PIPE_TEXTURE_3D",
+   "PIPE_TEXTURE_CUBE"
+};
+
+static const char *
+debug_dump_tex_target_short_names[] = {
+   "1d",
+   "2d",
+   "3d",
+   "cube"
+};
+
+DEFINE_DEBUG_DUMP_CONTINUOUS(tex_target)
+
+
+static const char *
+debug_dump_tex_wrap_names[] = {
+   "PIPE_TEX_WRAP_REPEAT",
+   "PIPE_TEX_WRAP_CLAMP",
+   "PIPE_TEX_WRAP_CLAMP_TO_EDGE",
+   "PIPE_TEX_WRAP_CLAMP_TO_BORDER",
+   "PIPE_TEX_WRAP_MIRROR_REPEAT",
+   "PIPE_TEX_WRAP_MIRROR_CLAMP",
+   "PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE",
+   "PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER"
+};
+
+static const char *
+debug_dump_tex_wrap_short_names[] = {
+   "repeat",
+   "clamp",
+   "clamp_to_edge",
+   "clamp_to_border",
+   "mirror_repeat",
+   "mirror_clamp",
+   "mirror_clamp_to_edge",
+   "mirror_clamp_to_border"
+};
+
+DEFINE_DEBUG_DUMP_CONTINUOUS(tex_wrap)
+
+
+static const char *
+debug_dump_tex_mipfilter_names[] = {
+   "PIPE_TEX_MIPFILTER_NEAREST",
+   "PIPE_TEX_MIPFILTER_LINEAR",
+   "PIPE_TEX_MIPFILTER_NONE"
+};
+
+static const char *
+debug_dump_tex_mipfilter_short_names[] = {
+   "nearest",
+   "linear",
+   "none"
+};
+
+DEFINE_DEBUG_DUMP_CONTINUOUS(tex_mipfilter)
+
+
+static const char *
+debug_dump_tex_filter_names[] = {
+   "PIPE_TEX_FILTER_NEAREST",
+   "PIPE_TEX_FILTER_LINEAR",
+   "PIPE_TEX_FILTER_ANISO"
+};
+
+static const char *
+debug_dump_tex_filter_short_names[] = {
+   "nearest",
+   "linear",
+   "aniso"
+};
+
+DEFINE_DEBUG_DUMP_CONTINUOUS(tex_filter)
diff --git a/src/gallium/auxiliary/util/u_debug_dump.h b/src/gallium/auxiliary/util/u_debug_dump.h
index 102935559c1..19b130ad183 100644
--- a/src/gallium/auxiliary/util/u_debug_dump.h
+++ b/src/gallium/auxiliary/util/u_debug_dump.h
@@ -54,6 +54,18 @@ debug_dump_blend_func(unsigned value, boolean shortened);
 const char *
 debug_dump_func(unsigned value, boolean shortened);
 
+const char *
+debug_dump_tex_target(unsigned value, boolean shortened);
+
+const char *
+debug_dump_tex_wrap(unsigned value, boolean shortened);
+
+const char *
+debug_dump_tex_mipfilter(unsigned value, boolean shortened);
+
+const char *
+debug_dump_tex_filter(unsigned value, boolean shortened);
+
 
 /* FIXME: Move the other debug_dump_xxx functions out of u_debug.h into here. */
 
diff --git a/src/gallium/auxiliary/util/u_debug_profile.c b/src/gallium/auxiliary/util/u_debug_profile.c
deleted file mode 100644
index d765b501445..00000000000
--- a/src/gallium/auxiliary/util/u_debug_profile.c
+++ /dev/null
@@ -1,320 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * @file
- * Poor-man profiling.
- * 
- * @author José Fonseca <jrfonseca@tungstengraphics.com>
- * 
- * @sa http://blogs.msdn.com/joshpoley/archive/2008/03/12/poor-man-s-profiler.aspx
- * @sa http://www.johnpanzer.com/aci_cuj/index.html
- */
-
-#include "pipe/p_config.h" 
-
-#if defined(PROFILE) && defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-
-#include <windows.h>
-#include <winddi.h>
-
-#include "util/u_debug.h" 
-#include "util/u_string.h" 
-
-
-#define PROFILE_TABLE_SIZE (1024*1024)
-#define FILE_NAME_SIZE 256
-
-struct debug_profile_entry
-{
-   uintptr_t caller;
-   uintptr_t callee;
-   uint64_t samples;
-};
-
-static unsigned long enabled = 0;
-
-static WCHAR wFileName[FILE_NAME_SIZE] = L"\\??\\c:\\00000000.prof";
-static ULONG_PTR iFile = 0;
-
-static struct debug_profile_entry *table = NULL;
-static unsigned long free_table_entries = 0;
-static unsigned long max_table_entries = 0;
-
-uint64_t start_stamp = 0;
-uint64_t end_stamp = 0;
-
-
-static void
-debug_profile_entry(uintptr_t caller, uintptr_t callee, uint64_t samples)
-{
-   unsigned hash = ( caller + callee ) & PROFILE_TABLE_SIZE - 1;
-   
-   while(1) {
-      if(table[hash].caller == 0 && table[hash].callee == 0) {
-         table[hash].caller = caller;
-         table[hash].callee = callee;
-         table[hash].samples = samples;
-         --free_table_entries;
-         break;
-      }
-      else if(table[hash].caller == caller && table[hash].callee == callee) {
-         table[hash].samples += samples;
-         break;
-      }
-      else {
-         ++hash;
-      }
-   }
-}
-
-
-static uintptr_t caller_stack[1024];
-static unsigned last_caller = 0;
-
-
-static int64_t delta(void) {
-   int64_t result = end_stamp - start_stamp;
-   if(result > UINT64_C(0xffffffff))
-      result = 0;
-   return result;
-}
-
-
-static void __cdecl 
-debug_profile_enter(uintptr_t callee)
-{
-   uintptr_t caller = last_caller ? caller_stack[last_caller - 1] : 0;
-                
-   if (caller)
-      debug_profile_entry(caller, 0, delta());
-   debug_profile_entry(caller, callee, 1);
-   caller_stack[last_caller++] = callee;
-}
-
-
-static void __cdecl
-debug_profile_exit(uintptr_t callee)
-{
-   debug_profile_entry(callee, 0, delta());
-   if(last_caller)
-      --last_caller;
-}
-   
-   
-/**
- * Called at the start of every method or function.
- * 
- * @sa http://msdn.microsoft.com/en-us/library/c63a9b7h.aspx
- */
-void __declspec(naked) __cdecl 
-_penter(void) {
-   _asm {
-      push eax
-      mov eax, [enabled]
-      test eax, eax
-      jz skip
-
-      push edx
-      
-      rdtsc
-      mov dword ptr [end_stamp], eax
-      mov dword ptr [end_stamp+4], edx
-
-      xor eax, eax
-      mov [enabled], eax
-
-      mov eax, [esp+8]
-
-      push ebx
-      push ecx
-      push ebp
-      push edi
-      push esi
-
-      push eax
-      call debug_profile_enter
-      add esp, 4
-
-      pop esi
-      pop edi
-      pop ebp
-      pop ecx
-      pop ebx
-
-      mov eax, 1
-      mov [enabled], eax 
-
-      rdtsc
-      mov dword ptr [start_stamp], eax
-      mov dword ptr [start_stamp+4], edx
-      
-      pop edx
-skip:
-      pop eax
-      ret
-   }
-}
-
-
-/**
- * Called at the end of Calls the end of every method or function.
- * 
- * @sa http://msdn.microsoft.com/en-us/library/xc11y76y.aspx
- */
-void __declspec(naked) __cdecl 
-_pexit(void) {
-   _asm {
-      push eax
-      mov eax, [enabled]
-      test eax, eax
-      jz skip
-
-      push edx
-      
-      rdtsc
-      mov dword ptr [end_stamp], eax
-      mov dword ptr [end_stamp+4], edx
-
-      xor eax, eax
-      mov [enabled], eax
-
-      mov eax, [esp+8]
-
-      push ebx
-      push ecx
-      push ebp
-      push edi
-      push esi
-
-      push eax
-      call debug_profile_exit
-      add esp, 4
-
-      pop esi
-      pop edi
-      pop ebp
-      pop ecx
-      pop ebx
-
-      mov eax, 1
-      mov [enabled], eax 
-
-      rdtsc
-      mov dword ptr [start_stamp], eax
-      mov dword ptr [start_stamp+4], edx
-      
-      pop edx
-skip:
-      pop eax
-      ret
-   }
-}
-
-
-/**
- * Reference function for calibration. 
- */
-void __declspec(naked) 
-__debug_profile_reference(void) {
-   _asm {
-      call _penter
-      call _pexit
-      ret
-   }
-}
-
-
-void
-debug_profile_start(void)
-{
-   WCHAR *p;
-
-   /* increment starting from the less significant digit */
-   p = &wFileName[14];
-   while(1) {
-      if(*p == '9') {
-         *p-- = '0';
-      }
-      else {
-         *p += 1;
-         break;
-      }
-   }
-
-   table = EngMapFile(wFileName, 
-                      PROFILE_TABLE_SIZE*sizeof(struct debug_profile_entry), 
-                      &iFile);
-   if(table) {
-      unsigned i;
-      
-      free_table_entries = max_table_entries = PROFILE_TABLE_SIZE;
-      memset(table, 0, PROFILE_TABLE_SIZE*sizeof(struct debug_profile_entry));
-      
-      table[0].caller = (uintptr_t)&__debug_profile_reference;
-      table[0].callee = 0;
-      table[0].samples = 0;
-      --free_table_entries;
-
-      _asm {
-         push edx
-         push eax
-      
-         rdtsc
-         mov dword ptr [start_stamp], eax
-         mov dword ptr [start_stamp+4], edx
-         
-         pop edx
-         pop eax
-      }
-
-      last_caller = 0;
-      
-      enabled = 1;
-
-      for(i = 0; i < 8; ++i) {
-         _asm {
-            call __debug_profile_reference
-         }
-      }
-   }
-}
-
-
-void 
-debug_profile_stop(void)
-{
-   enabled = 0;
-
-   if(iFile)
-      EngUnmapFile(iFile);
-   iFile = 0;
-   table = NULL;
-   free_table_entries = max_table_entries = 0;
-}
-
-#endif /* PROFILE */
diff --git a/src/gallium/auxiliary/util/u_fifo.h b/src/gallium/auxiliary/util/u_fifo.h
new file mode 100644
index 00000000000..9e007de1ada
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_fifo.h
@@ -0,0 +1,123 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_FIFO_H
+#define U_FIFO_H
+
+#include "util/u_memory.h"
+
+/**
+ * Fixed-capacity FIFO of void pointers, stored as a ring buffer.
+ *
+ * The slot array is not a struct member: it lives in the same allocation,
+ * immediately after this header (see u_fifo_create).
+ */
+struct util_fifo
+{
+   size_t head;   /**< slot of the newest entry (advanced by u_fifo_add) */
+   size_t tail;   /**< slot last handed out (advanced by u_fifo_pop) */
+   size_t num;    /**< number of entries currently queued */
+   size_t size;   /**< capacity, in entries */
+};
+
+/**
+ * Allocate an empty fifo able to hold up to @p size pointers.
+ *
+ * @return the new fifo, or NULL if the allocation failed.
+ */
+static INLINE struct util_fifo *
+u_fifo_create(size_t size)
+{
+   struct util_fifo *fifo;
+   fifo = MALLOC(sizeof(*fifo) + size * sizeof(void*));
+   if (!fifo)
+      return NULL;
+
+   fifo->head = 0;
+   fifo->tail = 0;
+   fifo->num = 0;
+   fifo->size = size;
+
+   return fifo;
+}
+
+/**
+ * Append @p ptr to the fifo.
+ *
+ * @return TRUE on success, FALSE if the fifo is already full.
+ */
+static INLINE boolean
+u_fifo_add(struct util_fifo *fifo, void *ptr)
+{
+   void **array = (void**)&fifo[1];
+   if (fifo->num >= fifo->size)
+      return FALSE;
+
+   /* advance (with wrap-around) first, then store into the new head slot */
+   if (++fifo->head >= fifo->size)
+      fifo->head = 0;
+
+   array[fifo->head] = ptr;
+
+   ++fifo->num;
+
+   return TRUE;
+}
+
+/**
+ * Remove the oldest entry from the fifo and store it in @p *ptr.
+ *
+ * @return TRUE on success, FALSE if the fifo is empty (@p *ptr untouched).
+ */
+static INLINE boolean
+u_fifo_pop(struct util_fifo *fifo, void **ptr)
+{
+   void **array = (void**)&fifo[1];
+
+   if (!fifo->num)
+      return FALSE;
+
+   /* mirror of u_fifo_add: advance (with wrap-around) first, then load */
+   if (++fifo->tail >= fifo->size)
+      fifo->tail = 0;
+
+   *ptr = array[fifo->tail];
+
+   /* an entry left the queue, so the count must go down, not up */
+   --fifo->num;
+
+   return TRUE;
+}
+
+/**
+ * Free the fifo.  The queued pointers themselves are not freed.
+ */
+static INLINE void
+u_fifo_destroy(struct util_fifo *fifo)
+{
+   FREE(fifo);
+}
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 00a46d0cc48..f1bf94f17dd 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -2,7 +2,7 @@ PIPE_FORMAT_A8R8G8B8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , zyxw,
 PIPE_FORMAT_X8R8G8B8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , zyx1, rgb
 PIPE_FORMAT_B8G8R8A8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , yzwx, rgb
 PIPE_FORMAT_B8G8R8X8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , yzw1, rgb
-PIPE_FORMAT_A1R5G5B5_UNORM        , arith , 1, 1, un1 , un5 , un5 , un5 , zyxw, rgb
+PIPE_FORMAT_A1R5G5B5_UNORM        , arith , 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb
 PIPE_FORMAT_A4R4G4B4_UNORM        , arith , 1, 1, un4 , un4 , un4 , un4 , zyxw, rgb
 PIPE_FORMAT_R5G6B5_UNORM          , arith , 1, 1, un5 , un6 , un5 ,     , zyx1, rgb
 PIPE_FORMAT_A2B10G10R10_UNORM     , arith , 1, 1, un10, un10, un10, un2 , xyzw, rgb
@@ -14,10 +14,10 @@ PIPE_FORMAT_L16_UNORM             , arith , 1, 1, un16,     ,     ,     , xxx1,
 PIPE_FORMAT_Z16_UNORM             , array , 1, 1, un16,     ,     ,     , x___, zs 
 PIPE_FORMAT_Z32_UNORM             , array , 1, 1, un32,     ,     ,     , x___, zs 
 PIPE_FORMAT_Z32_FLOAT             , array , 1, 1, f32 ,     ,     ,     , x___, zs 
-PIPE_FORMAT_S8Z24_UNORM           , arith , 1, 1, un8 , un24,     ,     , yx__, zs 
-PIPE_FORMAT_Z24S8_UNORM           , arith , 1, 1, un24, un8 ,     ,     , xy__, zs 
-PIPE_FORMAT_X8Z24_UNORM           , arith , 1, 1, un8 , un24,     ,     , y___, zs 
-PIPE_FORMAT_Z24X8_UNORM           , arith , 1, 1, un24, un8 ,     ,     , x___, zs 
+PIPE_FORMAT_S8Z24_UNORM           , arith , 1, 1, un24, un8 ,     ,     , xy__, zs 
+PIPE_FORMAT_Z24S8_UNORM           , arith , 1, 1, un8 , un24,     ,     , yx__, zs 
+PIPE_FORMAT_X8Z24_UNORM           , arith , 1, 1, un24, un8 ,     ,     , x___, zs 
+PIPE_FORMAT_Z24X8_UNORM           , arith , 1, 1, un8 , un24,     ,     , y___, zs 
 PIPE_FORMAT_S8_UNORM              , array , 1, 1, un8 ,     ,     ,     , _x__, zs 
 PIPE_FORMAT_R64_FLOAT             , array , 1, 1, f64 ,     ,     ,     , x001, rgb
 PIPE_FORMAT_R64G64_FLOAT          , array , 1, 1, f64 , f64 ,     ,     , xy01, rgb
diff --git a/src/gallium/auxiliary/util/u_hash_table.c b/src/gallium/auxiliary/util/u_hash_table.c
index 8c2a8f454cc..5604e3ac374 100644
--- a/src/gallium/auxiliary/util/u_hash_table.c
+++ b/src/gallium/auxiliary/util/u_hash_table.c
@@ -47,7 +47,7 @@
 #include "util/u_hash_table.h"
 
 
-struct hash_table
+struct util_hash_table
 {
    struct cso_hash *cso;   
    
@@ -61,27 +61,27 @@ struct hash_table
 };
 
 
-struct hash_table_item
+struct util_hash_table_item
 {
    void *key;
    void *value;
 };
 
 
-static INLINE struct hash_table_item *
-hash_table_item(struct cso_hash_iter iter)
+static INLINE struct util_hash_table_item *
+util_hash_table_item(struct cso_hash_iter iter)
 {
-   return (struct hash_table_item *)cso_hash_iter_data(iter);
+   return (struct util_hash_table_item *)cso_hash_iter_data(iter);
 }
 
 
-struct hash_table *
-hash_table_create(unsigned (*hash)(void *key),
-                  int (*compare)(void *key1, void *key2))
+struct util_hash_table *
+util_hash_table_create(unsigned (*hash)(void *key),
+                       int (*compare)(void *key1, void *key2))
 {
-   struct hash_table *ht;
+   struct util_hash_table *ht;
    
-   ht = MALLOC_STRUCT(hash_table);
+   ht = MALLOC_STRUCT(util_hash_table);
    if(!ht)
       return NULL;
    
@@ -99,16 +99,16 @@ hash_table_create(unsigned (*hash)(void *key),
 
 
 static INLINE struct cso_hash_iter
-hash_table_find_iter(struct hash_table *ht,
-                     void *key, 
-                     unsigned key_hash)
+util_hash_table_find_iter(struct util_hash_table *ht,
+                          void *key,
+                          unsigned key_hash)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
    
    iter = cso_hash_find(ht->cso, key_hash);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_iter_data(iter);
+      item = (struct util_hash_table_item *)cso_hash_iter_data(iter);
       if (!ht->compare(item->key, key))
          break;
       iter = cso_hash_iter_next(iter);
@@ -118,17 +118,17 @@ hash_table_find_iter(struct hash_table *ht,
 }
 
 
-static INLINE struct hash_table_item *
-hash_table_find_item(struct hash_table *ht,
-                     void *key, 
-                     unsigned key_hash)
+static INLINE struct util_hash_table_item *
+util_hash_table_find_item(struct util_hash_table *ht,
+                          void *key,
+                          unsigned key_hash)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
    
    iter = cso_hash_find(ht->cso, key_hash);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_iter_data(iter);
+      item = (struct util_hash_table_item *)cso_hash_iter_data(iter);
       if (!ht->compare(item->key, key))
          return item;
       iter = cso_hash_iter_next(iter);
@@ -139,12 +139,12 @@ hash_table_find_item(struct hash_table *ht,
 
 
 enum pipe_error
-hash_table_set(struct hash_table *ht,
-               void *key,
-               void *value)
+util_hash_table_set(struct util_hash_table *ht,
+                    void *key,
+                    void *value)
 {
    unsigned key_hash;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
    struct cso_hash_iter iter;
 
    assert(ht);
@@ -153,14 +153,14 @@ hash_table_set(struct hash_table *ht,
 
    key_hash = ht->hash(key);
 
-   item = hash_table_find_item(ht, key, key_hash);
+   item = util_hash_table_find_item(ht, key, key_hash);
    if(item) {
       /* TODO: key/value destruction? */
       item->value = value;
       return PIPE_OK;
    }
    
-   item = MALLOC_STRUCT(hash_table_item);
+   item = MALLOC_STRUCT(util_hash_table_item);
    if(!item)
       return PIPE_ERROR_OUT_OF_MEMORY;
    
@@ -178,11 +178,11 @@ hash_table_set(struct hash_table *ht,
 
 
 void *
-hash_table_get(struct hash_table *ht, 
-               void *key)
+util_hash_table_get(struct util_hash_table *ht,
+                    void *key)
 {
    unsigned key_hash;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
 
    assert(ht);
    if (!ht)
@@ -190,7 +190,7 @@ hash_table_get(struct hash_table *ht,
 
    key_hash = ht->hash(key);
 
-   item = hash_table_find_item(ht, key, key_hash);
+   item = util_hash_table_find_item(ht, key, key_hash);
    if(!item)
       return NULL;
    
@@ -199,12 +199,12 @@ hash_table_get(struct hash_table *ht,
 
 
 void
-hash_table_remove(struct hash_table *ht, 
-                  void *key)
+util_hash_table_remove(struct util_hash_table *ht,
+                       void *key)
 {
    unsigned key_hash;
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
 
    assert(ht);
    if (!ht)
@@ -212,11 +212,11 @@ hash_table_remove(struct hash_table *ht,
 
    key_hash = ht->hash(key);
 
-   iter = hash_table_find_iter(ht, key, key_hash);
+   iter = util_hash_table_find_iter(ht, key, key_hash);
    if(cso_hash_iter_is_null(iter))
       return;
    
-   item = hash_table_item(iter);
+   item = util_hash_table_item(iter);
    assert(item);
    FREE(item);
    
@@ -225,10 +225,10 @@ hash_table_remove(struct hash_table *ht,
 
 
 void 
-hash_table_clear(struct hash_table *ht)
+util_hash_table_clear(struct util_hash_table *ht)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
 
    assert(ht);
    if (!ht)
@@ -236,7 +236,7 @@ hash_table_clear(struct hash_table *ht)
 
    iter = cso_hash_first_node(ht->cso);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_take(ht->cso, cso_hash_iter_key(iter));
+      item = (struct util_hash_table_item *)cso_hash_take(ht->cso, cso_hash_iter_key(iter));
       FREE(item);
       iter = cso_hash_first_node(ht->cso);
    }
@@ -244,12 +244,13 @@ hash_table_clear(struct hash_table *ht)
 
 
 enum pipe_error
-hash_table_foreach(struct hash_table *ht,
-                   enum pipe_error (*callback)(void *key, void *value, void *data),
-                   void *data)
+util_hash_table_foreach(struct util_hash_table *ht,
+                     enum pipe_error (*callback)
+                        (void *key, void *value, void *data),
+                     void *data)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
    enum pipe_error result;
 
    assert(ht);
@@ -258,7 +259,7 @@ hash_table_foreach(struct hash_table *ht,
 
    iter = cso_hash_first_node(ht->cso);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_iter_data(iter);
+      item = (struct util_hash_table_item *)cso_hash_iter_data(iter);
       result = callback(item->key, item->value, data);
       if(result != PIPE_OK)
 	 return result;
@@ -270,10 +271,10 @@ hash_table_foreach(struct hash_table *ht,
 
 
 void
-hash_table_destroy(struct hash_table *ht)
+util_hash_table_destroy(struct util_hash_table *ht)
 {
    struct cso_hash_iter iter;
-   struct hash_table_item *item;
+   struct util_hash_table_item *item;
 
    assert(ht);
    if (!ht)
@@ -281,7 +282,7 @@ hash_table_destroy(struct hash_table *ht)
 
    iter = cso_hash_first_node(ht->cso);
    while (!cso_hash_iter_is_null(iter)) {
-      item = (struct hash_table_item *)cso_hash_iter_data(iter);
+      item = (struct util_hash_table_item *)cso_hash_iter_data(iter);
       FREE(item);
       iter = cso_hash_iter_next(iter);
    }
diff --git a/src/gallium/auxiliary/util/u_hash_table.h b/src/gallium/auxiliary/util/u_hash_table.h
index feee881582e..51ec10a8041 100644
--- a/src/gallium/auxiliary/util/u_hash_table.h
+++ b/src/gallium/auxiliary/util/u_hash_table.h
@@ -35,7 +35,7 @@
 #define U_HASH_TABLE_H_
 
 
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 
 #ifdef __cplusplus
@@ -46,7 +46,7 @@ extern "C" {
 /**
  * Generic purpose hash table.
  */
-struct hash_table;
+struct util_hash_table;
 
 
 /**
@@ -55,37 +55,38 @@ struct hash_table;
  * @param hash hash function
  * @param compare should return 0 for two equal keys.
  */
-struct hash_table *
-hash_table_create(unsigned (*hash)(void *key),
-                  int (*compare)(void *key1, void *key2));
+struct util_hash_table *
+util_hash_table_create(unsigned (*hash)(void *key),
+                       int (*compare)(void *key1, void *key2));
 
 
 enum pipe_error
-hash_table_set(struct hash_table *ht,
-               void *key,
-               void *value);
+util_hash_table_set(struct util_hash_table *ht,
+                    void *key,
+                    void *value);
 
 void *
-hash_table_get(struct hash_table *ht, 
-               void *key);
+util_hash_table_get(struct util_hash_table *ht,
+                    void *key);
 
 
 void
-hash_table_remove(struct hash_table *ht, 
-                  void *key);
+util_hash_table_remove(struct util_hash_table *ht,
+                       void *key);
 
 
 void
-hash_table_clear(struct hash_table *ht);
+util_hash_table_clear(struct util_hash_table *ht);
 
 
 enum pipe_error
-hash_table_foreach(struct hash_table *ht,
-                   enum pipe_error (*callback)(void *key, void *value, void *data),
-                   void *data);
+util_hash_table_foreach(struct util_hash_table *ht,
+                        enum pipe_error (*callback)
+                        (void *key, void *value, void *data),
+                        void *data);
 
 void
-hash_table_destroy(struct hash_table *ht);
+util_hash_table_destroy(struct util_hash_table *ht);
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
index 508a2ee0634..c4b9eb3d9b7 100644
--- a/src/gallium/auxiliary/util/u_keymap.c
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -28,7 +28,7 @@
 /**
  * Key lookup/associative container.
  *
- * Like Jose's u_hash_table, based on CSO cache code for now.
+ * Like Jose's util_hash_table, based on CSO cache code for now.
  *
  * Author: Brian Paul
  */
@@ -36,7 +36,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 
 #include "cso_cache/cso_hash.h"
 
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 4c6c2bc00e1..75b075f160d 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -283,6 +283,14 @@ util_fast_pow(float x, float y)
    return util_fast_exp2(util_fast_log2(x) * y);
 }
 
+/* Note that this counts zero as a power of two.
+ */
+static INLINE boolean
+util_is_power_of_two( unsigned v )
+{
+   return (v & (v-1)) == 0;
+}
+
 
 /**
  * Floor(x), returned as int.
@@ -341,10 +349,22 @@ util_is_inf_or_nan(float x)
 
 
 /**
+ * Test whether x is a power of two.  Note that zero also passes this test.
+ */
+static INLINE boolean
+util_is_pot(unsigned x)
+{
+   return (x & (x - 1)) == 0;
+}
+
+
+/**
  * Find first bit set in word.  Least significant bit is 1.
  * Return 0 if no bits set.
  */
-#if defined(_MSC_VER) && _MSC_VER >= 1300
+#if defined(_MSC_VER) && _MSC_VER >= 1300 && (_M_IX86 || _M_AMD64 || _M_IA64)
+unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask);
+#pragma intrinsic(_BitScanForward)
 static INLINE
 unsigned long ffs( unsigned long u )
 {
@@ -451,6 +471,26 @@ util_logbase2(unsigned n)
 
 
 /**
+ * Returns the smallest power of two >= x (1 for x == 0, 0 if none fits).
+ */
+static INLINE unsigned
+util_next_power_of_two(unsigned x)
+{
+   unsigned i;
+
+   if (x == 0)
+      return 1;
+
+   --x;   /* so that an exact power of two maps to itself */
+
+   for (i = 1; i < sizeof(unsigned) * 8; i <<= 1)
+      x |= x >> i;   /* smear the highest set bit into every lower bit */
+
+   return x + 1;
+}
+
+
+/**
  * Clamp X to [MIN, MAX].
  * This is a macro to allow float, int, uint, etc. types.
  */
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 4b75d4ba1d0..82f83702d1e 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -39,13 +39,20 @@ u_mmDumpMemInfo(const struct mem_block *heap)
    }
    else {
       const struct mem_block *p;
+      int total_used = 0, total_free = 0;
 
       for (p = heap->next; p != heap; p = p->next) {
 	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n", p->ofs, p->size,
                       p->free ? 'F':'.',
                       p->reserved ? 'R':'.');
+         if (p->free)
+            total_free += p->size;
+         else
+            total_used += p->size;
       }
 
+      debug_printf("\nMemory stats: total = %d, used = %d, free = %d\n",
+                   total_used + total_free, total_used, total_free);
       debug_printf("\nFree list:\n");
 
       for (p = heap->next_free; p != heap; p = p->next_free) {
diff --git a/src/gallium/auxiliary/util/u_network.c b/src/gallium/auxiliary/util/u_network.c
index bc4b7584067..6269c72e121 100644
--- a/src/gallium/auxiliary/util/u_network.c
+++ b/src/gallium/auxiliary/util/u_network.c
@@ -6,7 +6,7 @@
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  include <winsock2.h>
 #  include <windows.h>
-#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
 #  include <sys/socket.h>
 #  include <netinet/in.h>
 #  include <unistd.h>
@@ -54,7 +54,7 @@ u_socket_close(int s)
    if (s < 0)
       return;
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
    shutdown(s, SHUT_RDWR);
    close(s);
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
@@ -169,7 +169,7 @@ u_socket_listen_on_port(uint16_t portnum)
 void
 u_socket_block(int s, boolean block)
 {
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
    int old = fcntl(s, F_GETFL, 0);
    if (old == -1)
       return;
diff --git a/src/gallium/auxiliary/util/u_network.h b/src/gallium/auxiliary/util/u_network.h
index 8c778f492ca..0aa898b9676 100644
--- a/src/gallium/auxiliary/util/u_network.h
+++ b/src/gallium/auxiliary/util/u_network.h
@@ -6,7 +6,7 @@
 
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  define PIPE_HAVE_SOCKETS
-#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
 #  define PIPE_HAVE_SOCKETS
 #endif
 
diff --git a/src/gallium/auxiliary/util/u_simple_screen.c b/src/gallium/auxiliary/util/u_simple_screen.c
index f01296b40fc..52382990155 100644
--- a/src/gallium/auxiliary/util/u_simple_screen.c
+++ b/src/gallium/auxiliary/util/u_simple_screen.c
@@ -52,8 +52,7 @@ pass_user_buffer_create(struct pipe_screen *screen,
                         unsigned bytes)
 {
    struct pipe_buffer *buffer =
-      screen->winsys->user_buffer_create(screen->winsys,
-                                             ptr, bytes);
+      screen->winsys->user_buffer_create(screen->winsys, ptr, bytes);
 
    buffer->screen = screen;
 
@@ -69,9 +68,8 @@ pass_surface_buffer_create(struct pipe_screen *screen,
                            unsigned *stride)
 {
    struct pipe_buffer *buffer =
-      screen->winsys->surface_buffer_create(screen->winsys,
-                                                width, height,
-                                                format, usage, tex_usage, stride);
+      screen->winsys->surface_buffer_create(screen->winsys, width, height,
+                                            format, usage, tex_usage, stride);
 
    buffer->screen = screen;
 
@@ -83,8 +81,7 @@ pass_buffer_map(struct pipe_screen *screen,
                 struct pipe_buffer *buf,
                 unsigned usage)
 {
-   return screen->winsys->buffer_map(screen->winsys,
-                                     buf, usage);
+   return screen->winsys->buffer_map(screen->winsys, buf, usage);
 }
 
 static void
@@ -106,8 +103,7 @@ pass_flush_frontbuffer(struct pipe_screen *screen,
                        struct pipe_surface *surf,
                        void *context_private)
 {
-   screen->winsys->flush_frontbuffer(screen->winsys,
-                                     surf, context_private);
+   screen->winsys->flush_frontbuffer(screen->winsys, surf, context_private);
 }
 
 static void
@@ -115,8 +111,7 @@ pass_fence_reference(struct pipe_screen *screen,
                      struct pipe_fence_handle **ptr,
                      struct pipe_fence_handle *fence)
 {
-   screen->winsys->fence_reference(screen->winsys,
-                                   ptr, fence);
+   screen->winsys->fence_reference(screen->winsys, ptr, fence);
 }
 
 static int
@@ -124,8 +119,7 @@ pass_fence_signalled(struct pipe_screen *screen,
                      struct pipe_fence_handle *fence,
                      unsigned flag)
 {
-   return screen->winsys->fence_signalled(screen->winsys,
-                                          fence, flag);
+   return screen->winsys->fence_signalled(screen->winsys, fence, flag);
 }
 
 static int
@@ -133,11 +127,11 @@ pass_fence_finish(struct pipe_screen *screen,
                   struct pipe_fence_handle *fence,
                   unsigned flag)
 {
-   return screen->winsys->fence_finish(screen->winsys,
-                                       fence, flag);
+   return screen->winsys->fence_finish(screen->winsys, fence, flag);
 }
 
-void u_simple_screen_init(struct pipe_screen *screen)
+void
+u_simple_screen_init(struct pipe_screen *screen)
 {
    screen->buffer_create = pass_buffer_create;
    screen->user_buffer_create = pass_user_buffer_create;
@@ -152,7 +146,8 @@ void u_simple_screen_init(struct pipe_screen *screen)
    screen->fence_finish = pass_fence_finish;
 }
 
-const char* u_simple_screen_winsys_name(struct pipe_screen *screen)
+const char *
+u_simple_screen_winsys_name(struct pipe_screen *screen)
 {
    return screen->winsys->get_name(screen->winsys);
 }
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index ab754296fa8..1c8b157d91f 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -34,14 +34,8 @@
 
 
 #include "pipe/p_context.h"
-#include "util/u_debug.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_screen.h"
 #include "pipe/p_shader_tokens.h"
-
-#include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
-
 #include "tgsi/tgsi_ureg.h"
 
 
@@ -67,9 +61,7 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
       struct ureg_src src;
       struct ureg_dst dst;
 
-      src = ureg_DECL_vs_input( ureg,
-                                semantic_names[i],
-                                semantic_indexes[i]);
+      src = ureg_DECL_vs_input( ureg, i );
       
       dst = ureg_DECL_output( ureg,
                               semantic_names[i],
@@ -116,7 +108,15 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
                            TGSI_SEMANTIC_COLOR,
                            0 );
 
-   ureg_TEX( ureg, out, TGSI_TEXTURE_2D, tex, sampler );
+   if (writemask != TGSI_WRITEMASK_XYZW) {
+      struct ureg_src imm = ureg_imm4f( ureg, 0, 0, 0, 1 );
+
+      ureg_MOV( ureg, out, imm );
+   }
+
+   ureg_TEX( ureg, 
+             ureg_writemask(out, writemask),
+             TGSI_TEXTURE_2D, tex, sampler );
    ureg_END( ureg );
 
    return ureg_create_shader_and_destroy( ureg, pipe );
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 1235a67d264..8a22f584bee 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -170,7 +170,7 @@ x8r8g8b8_get_tile_rgba(const unsigned *src,
          pRow[0] = ubyte_to_float((pixel >> 16) & 0xff);
          pRow[1] = ubyte_to_float((pixel >>  8) & 0xff);
          pRow[2] = ubyte_to_float((pixel >>  0) & 0xff);
-         pRow[3] = ubyte_to_float(0xff);
+         pRow[3] = 1.0F;
       }
       p += dst_stride;
    }
@@ -394,6 +394,52 @@ r5g6b5_put_tile_rgba(ushort *dst,
 
 
 
+/*** PIPE_FORMAT_R8G8B8_UNORM ***/
+
+static void
+r8g8b8_get_tile_rgba(const ubyte *src,
+                     unsigned w, unsigned h,
+                     float *p,
+                     unsigned dst_stride)
+{
+   /* src is read as tightly packed 3-byte RGB texels; alpha is forced to 1 */
+   unsigned row, col;
+   for (row = 0; row < h; row++, p += dst_stride) {
+      float *texel = p;
+      for (col = 0; col < w; col++) {
+         const ubyte *rgb = src;
+         src += 3;
+         *texel++ = ubyte_to_float(rgb[0]);
+         *texel++ = ubyte_to_float(rgb[1]);
+         *texel++ = ubyte_to_float(rgb[2]);
+         *texel++ = 1.0f;
+      }
+   }
+}
+
+
+static void
+r8g8b8_put_tile_rgba(ubyte *dst,
+                     unsigned w, unsigned h,
+                     const float *p,
+                     unsigned src_stride)
+{
+   /* packs RGB into 3 bytes per texel; the float alpha channel is dropped */
+   unsigned row, col;
+
+   for (row = 0; row < h; row++, p += src_stride) {
+      const float *rgba = p;
+      for (col = 0; col < w; col++) {
+         *dst++ = float_to_ubyte(rgba[0]);
+         *dst++ = float_to_ubyte(rgba[1]);
+         *dst++ = float_to_ubyte(rgba[2]);
+         rgba += 4;
+      }
+   }
+}
+
+
+
 /*** PIPE_FORMAT_Z16_UNORM ***/
 
 /**
@@ -1106,6 +1152,9 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_R5G6B5_UNORM:
       r5g6b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
       break;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      r8g8b8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
+      break;
    case PIPE_FORMAT_L8_UNORM:
       l8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
       break;
@@ -1222,6 +1271,9 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
    case PIPE_FORMAT_R5G6B5_UNORM:
       r5g6b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
       break;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      r8g8b8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
+      break;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
       assert(0);
       break;
@@ -1400,7 +1452,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
    case PIPE_FORMAT_S8Z24_UNORM:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
-         assert(pt->usage == PIPE_TRANSFER_READ_WRITE);
+         assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
          for (i = 0; i < h; i++) {
             for (j = 0; j < w; j++) {
                /* convert 32-bit Z to 24-bit Z, preserve stencil */
@@ -1427,7 +1479,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
    case PIPE_FORMAT_Z24S8_UNORM:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
-         assert(pt->usage == PIPE_TRANSFER_READ_WRITE);
+         assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
          for (i = 0; i < h; i++) {
             for (j = 0; j < w; j++) {
                /* convert 32-bit Z to 24-bit Z, preserve stencil */
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index eb635c9f149..975ee89c455 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -29,7 +29,7 @@
  * coalescing small buffers into larger ones.
  */
 
-#include "pipe/p_error.h"
+#include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_screen.h"
 #include "util/u_memory.h"
diff --git a/src/gallium/auxiliary/vl/Makefile b/src/gallium/auxiliary/vl/Makefile
new file mode 100644
index 00000000000..4314c1e8d69
--- /dev/null
+++ b/src/gallium/auxiliary/vl/Makefile
@@ -0,0 +1,13 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = vl
+
+C_SOURCES = \
+	vl_bitstream_parser.c \
+	vl_mpeg12_mc_renderer.c \
+	vl_compositor.c \
+	vl_csc.c \
+	vl_shader_build.c
+
+include ../../Makefile.template
diff --git a/src/gallium/auxiliary/vl/SConscript b/src/gallium/auxiliary/vl/SConscript
new file mode 100644
index 00000000000..aed69f5efed
--- /dev/null
+++ b/src/gallium/auxiliary/vl/SConscript
@@ -0,0 +1,13 @@
+Import('*')
+
+vl = env.ConvenienceLibrary(
+	target = 'vl',
+	source = [
+		'vl_bitstream_parser.c',
+		'vl_mpeg12_mc_renderer.c',
+		'vl_compositor.c',
+		'vl_csc.c',
+		'vl_shader_build.c',
+	])
+
+auxiliaries.insert(0, vl)
diff --git a/src/gallium/auxiliary/vl/vl_bitstream_parser.c b/src/gallium/auxiliary/vl/vl_bitstream_parser.c
new file mode 100644
index 00000000000..3193ea5f41c
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_bitstream_parser.c
@@ -0,0 +1,167 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_bitstream_parser.h"
+#include <assert.h>
+#include <limits.h>
+#include <util/u_memory.h>
+
+static unsigned
+grab_bits(unsigned cursor, unsigned how_many_bits, unsigned bitstream_elt)
+{
+   /* Keep the how_many_bits bits starting at bit `cursor`, right-aligned. */
+   const unsigned word_bits = sizeof(unsigned) * CHAR_BIT;
+   const unsigned top_pad = word_bits - how_many_bits - cursor;
+
+   assert(cursor < word_bits && cursor + how_many_bits <= word_bits);
+   assert(how_many_bits > 0 && how_many_bits <= word_bits);
+   return (bitstream_elt << top_pad) >> (top_pad + cursor);
+}
+
+static unsigned
+show_bits(unsigned cursor, unsigned how_many_bits, const unsigned *bitstream)
+{
+   /* Read a field at absolute bit offset `cursor`; it may straddle two words. */
+   const unsigned word_bits = sizeof(unsigned) * CHAR_BIT;
+   const unsigned word = cursor / word_bits;
+   const unsigned bit = cursor % word_bits;
+   unsigned low_count, low, high;
+
+   assert(bitstream);
+   if (bit + how_many_bits <= word_bits)
+      return grab_bits(bit, how_many_bits, bitstream[word]);
+
+   low_count = word_bits - bit;
+   low = grab_bits(bit, low_count, bitstream[word]);
+   high = grab_bits(0, how_many_bits - low_count, bitstream[word + 1]);
+   return low | high << low_count;
+}
+
+bool vl_bitstream_parser_init(struct vl_bitstream_parser *parser,
+                              unsigned num_bitstreams,
+                              const void **bitstreams,
+                              const unsigned *sizes)
+{
+   /* Only the caller's pointers are stored; reading starts at bit 0 of stream 0. */
+   assert(parser);
+   assert(num_bitstreams);
+   assert(bitstreams);
+   assert(sizes);
+
+   parser->cursor = 0;
+   parser->cur_bitstream = 0;
+   parser->sizes = sizes;
+   parser->bitstreams = (const unsigned**)bitstreams;
+   parser->num_bitstreams = num_bitstreams;
+   return true;
+}
+
+void vl_bitstream_parser_cleanup(struct vl_bitstream_parser *parser)
+{
+   assert(parser);   /* nothing is released here; this only validates the argument */
+}
+
+unsigned
+vl_bitstream_parser_get_bits(struct vl_bitstream_parser *parser,
+                             unsigned how_many_bits)
+{
+   /* Consuming read: peek at the next how_many_bits bits, then skip them. */
+   unsigned peeked;
+
+   assert(parser);
+
+   peeked = vl_bitstream_parser_show_bits(parser, how_many_bits);
+   vl_bitstream_parser_forward(parser, how_many_bits);
+
+   return peeked;
+}
+
+unsigned
+vl_bitstream_parser_show_bits(struct vl_bitstream_parser *parser,
+                              unsigned how_many_bits)
+{	/* Peek at the next how_many_bits bits; the parser's own position is not written. */
+   unsigned bits = 0;
+   unsigned shift = 0;   /* bit position in the result for the next piece */
+   unsigned cursor;
+   unsigned cur_bitstream;
+
+   assert(parser);
+
+   cursor = parser->cursor;   /* local copies are advanced instead of the parser fields */
+   cur_bitstream = parser->cur_bitstream;
+
+   while (1) {
+      unsigned bits_left = parser->sizes[cur_bitstream] * CHAR_BIT - cursor;   /* sizes[] is scaled by CHAR_BIT to get bits */
+      unsigned bits_to_show = how_many_bits > bits_left ? bits_left : how_many_bits;
+
+      bits |= show_bits(cursor, bits_to_show,
+                        parser->bitstreams[cur_bitstream]) << shift;
+		
+      if (how_many_bits > bits_to_show) {   /* the request continues in the next stream */
+         how_many_bits -= bits_to_show;
+         cursor = 0;
+         ++cur_bitstream;   /* NOTE(review): not checked against num_bitstreams */
+         shift += bits_to_show;
+      }
+      else
+         break;
+   }
+
+   return bits;
+}
+
+void vl_bitstream_parser_forward(struct vl_bitstream_parser *parser,
+                                 unsigned how_many_bits)
+{  /* Advances the read position by how_many_bits, moving on to later streams as needed. */
+   assert(parser);
+   assert(how_many_bits);
+
+   parser->cursor += how_many_bits;   /* cursor is a bit offset within the current stream */
+
+   while (parser->cursor > parser->sizes[parser->cur_bitstream] * CHAR_BIT) {
+      parser->cursor -= parser->sizes[parser->cur_bitstream++] * CHAR_BIT;   /* carry the overshoot into the next stream */
+      assert(parser->cur_bitstream < parser->num_bitstreams);
+   }   /* NOTE(review): strict '>' leaves cursor == stream length after an exact read - confirm intended */
+}
+
+void vl_bitstream_parser_rewind(struct vl_bitstream_parser *parser,
+                                unsigned how_many_bits)
+{
+   /* Moves the cursor back by how_many_bits, crossing into earlier streams. */
+   signed c;
+
+   assert(parser);
+   assert(how_many_bits);
+
+   c = parser->cursor - how_many_bits;   /* negative => target lies before this stream */
+   while (c < 0) {
+      /* borrow the length of the PREVIOUS stream, not the current one */
+      assert(parser->cur_bitstream > 0);
+      c += parser->sizes[--parser->cur_bitstream] * CHAR_BIT;
+   }
+   parser->cursor = (unsigned)c;
+}
diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/auxiliary/vl/vl_bitstream_parser.h
index dfa7ff3b1d1..30ec743fa75 100644
--- a/src/gallium/drivers/softpipe/sp_quad_occlusion.c
+++ b/src/gallium/auxiliary/vl/vl_bitstream_parser.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Younes Manton.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,61 +25,39 @@
  * 
  **************************************************************************/
 
+#ifndef vl_bitstream_parser_h
+#define vl_bitstream_parser_h
 
-/**
- * \brief  Quad occlusion counter stage
- * \author  Brian Paul
- */
+#include "pipe/p_compiler.h"
 
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-
-static unsigned count_bits( unsigned val )
-{
-   unsigned i;
-
-   for (i = 0; val ; val >>= 1)
-      i += (val & 1);
-
-   return i;
-}
-
-static void
-occlusion_count_quad(struct quad_stage *qs, struct quad_header *quad)
+struct vl_bitstream_parser
 {
-   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned num_bitstreams;
+   const unsigned **bitstreams;
+   const unsigned *sizes;
+   unsigned cur_bitstream;
+   unsigned cursor;
+};
 
-   softpipe->occlusion_count += count_bits(quad->inout.mask);
+bool vl_bitstream_parser_init(struct vl_bitstream_parser *parser,
+                              unsigned num_bitstreams,
+                              const void **bitstreams,
+                              const unsigned *sizes);
 
-   qs->next->run(qs->next, quad);
-}
+void vl_bitstream_parser_cleanup(struct vl_bitstream_parser *parser);
 
+unsigned
+vl_bitstream_parser_get_bits(struct vl_bitstream_parser *parser,
+                             unsigned how_many_bits);
 
-static void occlusion_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void occlusion_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
+unsigned
+vl_bitstream_parser_show_bits(struct vl_bitstream_parser *parser,
+                              unsigned how_many_bits);
 
-struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
+void vl_bitstream_parser_forward(struct vl_bitstream_parser *parser,
+                                 unsigned how_many_bits);
 
-   stage->softpipe = softpipe;
-   stage->begin = occlusion_begin;
-   stage->run = occlusion_count_quad;
-   stage->destroy = occlusion_destroy;
+void vl_bitstream_parser_rewind(struct vl_bitstream_parser *parser,
+                                unsigned how_many_bits);
 
-   return stage;
-}
+#endif /* vl_bitstream_parser_h */
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
new file mode 100644
index 00000000000..cda6dc134a0
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -0,0 +1,536 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_compositor.h"
+#include <assert.h>
+#include <pipe/p_context.h>
+#include <pipe/p_inlines.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include <util/u_memory.h>
+#include "vl_csc.h"
+#include "vl_shader_build.h"
+
+struct vertex2f   /* two packed floats; element type of surface_verts */
+{
+   float x, y;
+};
+
+struct vertex4f   /* four packed floats; member type of vertex_shader_consts */
+{
+   float x, y, z, w;
+};
+
+struct vertex_shader_consts   /* layout of vs_const_buf; member order follows c0..c3 in create_vert_shader */
+{
+   struct vertex4f dst_scale;   /* c0: scales the unit quad to the destination rect */
+   struct vertex4f dst_trans;   /* c1: offsets the quad to the destination position */
+   struct vertex4f src_scale;   /* c2: scales the unit texcoords to the source rect */
+   struct vertex4f src_trans;   /* c3: offsets the texcoords to the source position */
+};
+
+struct fragment_shader_consts   /* layout of fs_const_buf */
+{
+   float matrix[16];   /* colour conversion matrix; create_frag_shader reads it as c0-c3, one dp4 per output channel */
+};
+
+/*
+ * Unit quad in normalized coords, ordered for a 4-vertex triangle strip.
+ * The vertex shader scales and translates it onto the destination area.
+ */
+static const struct vertex2f surface_verts[4] =
+{
+   {0.0f, 0.0f},
+   {0.0f, 1.0f},
+   {1.0f, 0.0f},
+   {1.0f, 1.0f}
+};
+
+/*
+ * Texcoords for the quad above; this is an alias of the position array, not a copy.
+ * TODO: Duplicate these in the shader, no need to create a buffer.
+ */
+static const struct vertex2f *surface_texcoords = surface_verts;
+
+static void
+create_vert_shader(struct vl_compositor *c)
+{  /* Emits the compositor's vertex shader as raw TGSI tokens and stores the handle in c->vertex_shader. */
+   const unsigned max_tokens = 50;   /* capacity of the token buffer allocated below */
+
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;   /* index of the next unwritten token */
+
+   unsigned i;
+
+   assert(c);
+
+   tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));   /* NOTE(review): result is not checked */
+   *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header*)&tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;   /* tokens 0-2 hold the version, header and processor written above */
+
+   /*
+    * decl i0             ; Vertex pos
+    * decl i1             ; Vertex texcoords
+    */
+   for (i = 0; i < 2; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl c0             ; Scaling vector to scale vertex pos rect to destination size
+    * decl c1             ; Translation vector to move vertex pos rect into position
+    * decl c2             ; Scaling vector to scale texcoord rect to source size
+    * decl c3             ; Translation vector to move texcoord rect into position
+    */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl o0             ; Vertex pos
+    * decl o1             ; Vertex texcoords
+    */
+   for (i = 0; i < 2; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* decl t0, t1 (no instruction emitted below names the temporary file) */
+   decl = vl_decl_temps(0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * mad o0, i0, c0, c1  ; Scale and translate unit output rect to destination size and pos
+    * mad o1, i1, c2, c3  ; Scale and translate unit texcoord rect to source size and pos
+    */
+   for (i = 0; i < 2; ++i) {
+      inst = vl_inst4(TGSI_OPCODE_MAD, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i, TGSI_FILE_CONSTANT, i * 2, TGSI_FILE_CONSTANT, i * 2 + 1);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;   /* only the tokens field of vs is assigned */
+   c->vertex_shader = c->pipe->create_vs_state(c->pipe, &vs);
+   FREE(tokens);   /* NOTE(review): assumes create_vs_state does not keep this pointer - confirm */
+}
+
+static void
+create_frag_shader(struct vl_compositor *c)
+{  /* Emits the compositor's fragment shader as raw TGSI tokens and stores the handle in c->fragment_shader. */
+   const unsigned max_tokens = 50;   /* capacity of the token buffer allocated below */
+
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;   /* index of the next unwritten token */
+
+   unsigned i;
+
+   assert(c);
+
+   tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));   /* NOTE(review): result is not checked */
+   *(struct tgsi_version*)&tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header*)&tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   ti = 3;   /* tokens 0-2 hold the version, header and processor written above */
+
+   /* decl i0             ; Texcoords for s0 */
+   decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl c0-c3          ; CSC matrix c0-c3
+    */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0             ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0 */
+   decl = vl_decl_temps(0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl s0             ; Sampler for tex containing picture to display */
+   decl = vl_decl_samplers(0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* tex2d t0, i0, s0    ; Read src pixel */
+   inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * dp4 o0.x, t0, c0    ; Multiply pixel by the color conversion matrix
+    * dp4 o0.y, t0, c1
+    * dp4 o0.z, t0, c2
+    * dp4 o0.w, t0, c3
+    */
+   for (i = 0; i < 4; ++i) {
+      inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i);
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;   /* one output channel per iteration: x, y, z, w */
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+	
+   assert(ti <= max_tokens);
+
+   fs.tokens = tokens;   /* only the tokens field of fs is assigned */
+   c->fragment_shader = c->pipe->create_fs_state(c->pipe, &fs);
+   FREE(tokens);   /* NOTE(review): assumes create_fs_state does not keep this pointer - confirm */
+}
+
+static bool
+init_pipe_state(struct vl_compositor *c)
+{
+   /* Sets the fixed framebuffer fields and creates the linear-filter sampler. */
+   struct pipe_sampler_state sampler;
+
+   assert(c);
+
+   c->fb_state.nr_cbufs = 1;
+   c->fb_state.zsbuf = NULL;
+
+   /*
+    * Zero the whole state first: the fields not assigned below (prefilter,
+    * lod_bias, min_lod, max_lod, border_color, max_anisotropy) would
+    * otherwise reach the driver with indeterminate stack contents.
+    */
+   memset(&sampler, 0, sizeof(sampler));
+   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+   sampler.compare_func = PIPE_FUNC_ALWAYS;
+   sampler.normalized_coords = 1;
+   c->sampler = c->pipe->create_sampler_state(c->pipe, &sampler);
+   return true;
+}
+
+static void cleanup_pipe_state(struct vl_compositor *c)
+{
+   assert(c);
+	
+   c->pipe->delete_sampler_state(c->pipe, c->sampler);   /* the sampler created in init_pipe_state */
+}
+
+static bool
+init_shaders(struct vl_compositor *c)
+{
+   assert(c);
+
+   create_vert_shader(c);   /* fills c->vertex_shader */
+   create_frag_shader(c);   /* fills c->fragment_shader */
+
+   return true;   /* NOTE(review): the created shader handles are not inspected; always succeeds */
+}
+
+static void cleanup_shaders(struct vl_compositor *c)
+{
+   assert(c);
+	
+   c->pipe->delete_vs_state(c->pipe, c->vertex_shader);     /* handle from create_vert_shader */
+   c->pipe->delete_fs_state(c->pipe, c->fragment_shader);   /* handle from create_frag_shader */
+}
+
+static bool
+init_buffers(struct vl_compositor *c)
+{  /* Creates and fills both vertex buffers, creates both constant buffers, uploads a default CSC matrix. */
+   struct fragment_shader_consts fsc;   /* scratch storage for the default matrix */
+
+   assert(c);
+	
+   /*
+    * Create our vertex buffer and vertex buffer element
+    * VB contains 4 vertices that render a quad covering the entire window
+    * to display a rendered surface
+    * Quad is rendered as a tri strip
+    */
+   c->vertex_bufs[0].stride = sizeof(struct vertex2f);
+   c->vertex_bufs[0].max_index = 3;   /* 4 vertices: indices 0..3 */
+   c->vertex_bufs[0].buffer_offset = 0;
+   c->vertex_bufs[0].buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_VERTEX,
+      sizeof(struct vertex2f) * 4
+   );
+
+   memcpy
+   (
+      pipe_buffer_map(c->pipe->screen, c->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      surface_verts,
+      sizeof(struct vertex2f) * 4
+   );   /* NOTE(review): neither the created buffer nor the mapping is checked for NULL */
+
+   pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[0].buffer);
+
+   c->vertex_elems[0].src_offset = 0;
+   c->vertex_elems[0].vertex_buffer_index = 0;   /* positions come from the first buffer */
+   c->vertex_elems[0].nr_components = 2;
+   c->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /*
+    * Create our texcoord buffer and texcoord buffer element
+    * Texcoord buffer contains the TCs for mapping the rendered surface to the 4 vertices
+    */
+   c->vertex_bufs[1].stride = sizeof(struct vertex2f);
+   c->vertex_bufs[1].max_index = 3;   /* 4 vertices: indices 0..3 */
+   c->vertex_bufs[1].buffer_offset = 0;
+   c->vertex_bufs[1].buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_VERTEX,
+      sizeof(struct vertex2f) * 4
+   );
+
+   memcpy
+   (
+      pipe_buffer_map(c->pipe->screen, c->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      surface_texcoords,
+      sizeof(struct vertex2f) * 4
+   );   /* surface_texcoords aliases surface_verts, so both buffers hold the same data */
+
+   pipe_buffer_unmap(c->pipe->screen, c->vertex_bufs[1].buffer);
+
+   c->vertex_elems[1].src_offset = 0;
+   c->vertex_elems[1].vertex_buffer_index = 1;   /* texcoords come from the second buffer */
+   c->vertex_elems[1].nr_components = 2;
+   c->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /*
+    * Create our vertex shader's constant buffer
+    * Const buffer contains scaling and translation vectors
+    */
+   c->vs_const_buf.buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
+      sizeof(struct vertex_shader_consts)
+   );   /* left unfilled here; vl_compositor_render writes it on every call */
+
+   /*
+    * Create our fragment shader's constant buffer
+    * Const buffer contains the color conversion matrix and bias vectors
+    */
+   c->fs_const_buf.buffer = pipe_buffer_create
+   (
+      c->pipe->screen,
+      1,
+      PIPE_BUFFER_USAGE_CONSTANT,
+      sizeof(struct fragment_shader_consts)
+   );
+
+   vl_csc_get_matrix(VL_CSC_COLOR_STANDARD_IDENTITY, NULL, true, fsc.matrix);   /* default matrix; presumably a pass-through - see vl_csc.c */
+
+   vl_compositor_set_csc_matrix(c, fsc.matrix);   /* copies it into fs_const_buf */
+
+   return true;   /* NOTE(review): there is no failure path; success is always reported */
+}
+
+static void
+cleanup_buffers(struct vl_compositor *c)
+{
+   /* Resets the four buffer pointers set up by init_buffers() to NULL. */
+   assert(c);
+
+   /* vertex position and texcoord buffers */
+   pipe_buffer_reference(&c->vertex_bufs[0].buffer, NULL);
+   pipe_buffer_reference(&c->vertex_bufs[1].buffer, NULL);
+   /* shader constant buffers */
+   pipe_buffer_reference(&c->vs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&c->fs_const_buf.buffer, NULL);
+}
+
+bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe)
+{
+   /* Stages are set up in order; a failing stage unwinds the earlier ones. */
+   assert(compositor);
+   memset(compositor, 0, sizeof *compositor);
+   compositor->pipe = pipe;
+
+   if (!init_pipe_state(compositor))
+      goto fail;
+   if (!init_shaders(compositor))
+      goto fail_pipe_state;
+   if (!init_buffers(compositor))
+      goto fail_shaders;
+   return true;
+
+fail_shaders:
+   cleanup_shaders(compositor);
+fail_pipe_state:
+   cleanup_pipe_state(compositor);
+fail:
+   return false;
+}
+
+void vl_compositor_cleanup(struct vl_compositor *compositor)
+{  /* Tears down in the reverse order of vl_compositor_init. */
+   assert(compositor);
+	
+   cleanup_buffers(compositor);
+   cleanup_shaders(compositor);
+   cleanup_pipe_state(compositor);
+}
+
+void vl_compositor_render(struct vl_compositor          *compositor,
+                          /*struct pipe_texture         *backround,
+                          struct pipe_video_rect        *backround_area,*/
+                          struct pipe_texture           *src_surface,
+                          enum pipe_mpeg12_picture_type picture_type,
+                          /*unsigned                    num_past_surfaces,
+                          struct pipe_texture           *past_surfaces,
+                          unsigned                      num_future_surfaces,
+                          struct pipe_texture           *future_surfaces,*/
+                          struct pipe_video_rect        *src_area,
+                          struct pipe_texture           *dst_surface,
+                          struct pipe_video_rect        *dst_area,
+                          /*unsigned                      num_layers,
+                          struct pipe_texture           *layers,
+                          struct pipe_video_rect        *layer_src_areas,
+                          struct pipe_video_rect        *layer_dst_areas*/
+                          struct pipe_fence_handle      **fence)
+{  /* Draws src_area of src_surface into dst_area of dst_surface as one textured quad, then flushes. */
+   struct vertex_shader_consts *vs_consts;
+
+   assert(compositor);
+   assert(src_surface);
+   assert(src_area);
+   assert(dst_surface);
+   assert(dst_area);
+   assert(picture_type == PIPE_MPEG12_PICTURE_TYPE_FRAME);   /* no other picture type is accepted */
+
+   compositor->fb_state.width = dst_surface->width[0];
+   compositor->fb_state.height = dst_surface->height[0];
+   compositor->fb_state.cbufs[0] = compositor->pipe->screen->get_tex_surface
+   (
+      compositor->pipe->screen,
+      dst_surface,
+      0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
+   );   /* released again by pipe_surface_reference at the end of this function */
+
+   compositor->viewport.scale[0] = compositor->fb_state.width;
+   compositor->viewport.scale[1] = compositor->fb_state.height;
+   compositor->viewport.scale[2] = 1;
+   compositor->viewport.scale[3] = 1;
+   compositor->viewport.translate[0] = 0;
+   compositor->viewport.translate[1] = 0;
+   compositor->viewport.translate[2] = 0;
+   compositor->viewport.translate[3] = 0;
+
+   compositor->scissor.maxx = compositor->fb_state.width;   /* minx/miny keep the zero from the memset in vl_compositor_init */
+   compositor->scissor.maxy = compositor->fb_state.height;
+
+   compositor->pipe->set_framebuffer_state(compositor->pipe, &compositor->fb_state);
+   compositor->pipe->set_viewport_state(compositor->pipe, &compositor->viewport);
+   compositor->pipe->set_scissor_state(compositor->pipe, &compositor->scissor);
+   compositor->pipe->bind_sampler_states(compositor->pipe, 1, &compositor->sampler);
+   compositor->pipe->set_sampler_textures(compositor->pipe, 1, &src_surface);
+   compositor->pipe->bind_vs_state(compositor->pipe, compositor->vertex_shader);
+   compositor->pipe->bind_fs_state(compositor->pipe, compositor->fragment_shader);
+   compositor->pipe->set_vertex_buffers(compositor->pipe, 2, compositor->vertex_bufs);
+   compositor->pipe->set_vertex_elements(compositor->pipe, 2, compositor->vertex_elems);
+   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_VERTEX, 0, &compositor->vs_const_buf);
+   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_FRAGMENT, 0, &compositor->fs_const_buf);
+
+   vs_consts = pipe_buffer_map
+   (
+      compositor->pipe->screen,
+      compositor->vs_const_buf.buffer,
+      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+   );   /* NOTE(review): the mapping is written through without a NULL check */
+
+   vs_consts->dst_scale.x = dst_area->w / (float)compositor->fb_state.cbufs[0]->width;   /* rect size as a fraction of the target */
+   vs_consts->dst_scale.y = dst_area->h / (float)compositor->fb_state.cbufs[0]->height;
+   vs_consts->dst_scale.z = 1;
+   vs_consts->dst_scale.w = 1;
+   vs_consts->dst_trans.x = dst_area->x / (float)compositor->fb_state.cbufs[0]->width;   /* rect origin as a fraction of the target */
+   vs_consts->dst_trans.y = dst_area->y / (float)compositor->fb_state.cbufs[0]->height;
+   vs_consts->dst_trans.z = 0;
+   vs_consts->dst_trans.w = 0;
+
+   vs_consts->src_scale.x = src_area->w / (float)src_surface->width[0];   /* same, relative to the source texture size */
+   vs_consts->src_scale.y = src_area->h / (float)src_surface->height[0];
+   vs_consts->src_scale.z = 1;
+   vs_consts->src_scale.w = 1;
+   vs_consts->src_trans.x = src_area->x / (float)src_surface->width[0];
+   vs_consts->src_trans.y = src_area->y / (float)src_surface->height[0];
+   vs_consts->src_trans.z = 0;
+   vs_consts->src_trans.w = 0;
+
+   pipe_buffer_unmap(compositor->pipe->screen, compositor->vs_const_buf.buffer);
+
+   compositor->pipe->draw_arrays(compositor->pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);   /* the 4 quad vertices as one strip */
+   compositor->pipe->flush(compositor->pipe, PIPE_FLUSH_RENDER_CACHE, fence);   /* the caller's fence pointer is handed to flush */
+
+   pipe_surface_reference(&compositor->fb_state.cbufs[0], NULL);
+}
+
+void vl_compositor_set_csc_matrix(struct vl_compositor *compositor, const float *mat)
+{
+   /* Copies sizeof(struct fragment_shader_consts) bytes from mat into the FS constant buffer. */
+   void *dst;
+
+   assert(compositor);
+
+   dst = pipe_buffer_map(compositor->pipe->screen, compositor->fs_const_buf.buffer,
+                         PIPE_BUFFER_USAGE_CPU_WRITE);
+   memcpy(dst, mat, sizeof(struct fragment_shader_consts));
+
+   pipe_buffer_unmap(compositor->pipe->screen, compositor->fs_const_buf.buffer);
+}
diff --git a/src/gallium/auxiliary/vl/vl_compositor.h b/src/gallium/auxiliary/vl/vl_compositor.h
new file mode 100644
index 00000000000..f441901a751
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_compositor.h
@@ -0,0 +1,77 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef vl_compositor_h
+#define vl_compositor_h
+
+#include <pipe/p_compiler.h>
+#include <pipe/p_state.h>
+#include <pipe/p_video_state.h>
+
+struct pipe_context;
+struct pipe_texture;
+
+struct vl_compositor
+{
+   struct pipe_context *pipe;
+
+   struct pipe_framebuffer_state fb_state;
+   void *sampler;
+   void *vertex_shader;
+   void *fragment_shader;
+   struct pipe_viewport_state viewport;
+   struct pipe_scissor_state scissor;
+   struct pipe_vertex_buffer vertex_bufs[2];
+   struct pipe_vertex_element vertex_elems[2];
+   struct pipe_constant_buffer vs_const_buf, fs_const_buf;
+};
+
+bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe);
+
+void vl_compositor_cleanup(struct vl_compositor *compositor);
+
+void vl_compositor_render(struct vl_compositor          *compositor,
+                          /*struct pipe_texture         *background,
+                          struct pipe_video_rect        *background_area,*/
+                          struct pipe_texture           *src_surface,
+                          enum pipe_mpeg12_picture_type picture_type,
+                          /*unsigned                    num_past_surfaces,
+                          struct pipe_texture           *past_surfaces,
+                          unsigned                      num_future_surfaces,
+                          struct pipe_texture           *future_surfaces,*/
+                          struct pipe_video_rect        *src_area,
+                          struct pipe_texture           *dst_surface,
+                          struct pipe_video_rect        *dst_area,
+                          /*unsigned                      num_layers,
+                          struct pipe_texture           *layers,
+                          struct pipe_video_rect        *layer_src_areas,
+                          struct pipe_video_rect        *layer_dst_areas,*/
+                          struct pipe_fence_handle      **fence);
+
+void vl_compositor_set_csc_matrix(struct vl_compositor *compositor, const float *mat);
+
+#endif /* vl_compositor_h */
diff --git a/src/gallium/auxiliary/vl/vl_csc.c b/src/gallium/auxiliary/vl/vl_csc.c
new file mode 100644
index 00000000000..5ecc43a5fa3
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_csc.c
@@ -0,0 +1,206 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_csc.h"
+#include <util/u_math.h>
+#include <util/u_debug.h>
+
+/*
+ * Color space conversion formulas
+ *
+ * To convert YCbCr to RGB,
+ *    vec4  ycbcr, rgb
+ *    mat44 csc
+ *    rgb = csc * ycbcr
+ *
+ * To calculate the color space conversion matrix csc with ProcAmp adjustments,
+ *    mat44 csc, cstd, procamp, bias
+ *    csc = cstd * (procamp * bias)
+ *
+ * Where cstd is a matrix corresponding to one of the color standards (BT.601, BT.709, etc)
+ * adjusted for the kind of YCbCr -> RGB mapping wanted (1:1, full),
+ * bias is a matrix corresponding to the kind of YCbCr -> RGB mapping wanted (1:1, full)
+ *
+ * To calculate procamp,
+ *    mat44 procamp, hue, saturation, brightness, contrast
+ *    procamp = brightness * (saturation * (contrast * hue))
+ * Alternatively,
+ *    procamp = saturation * (brightness * (contrast * hue))
+ *
+ * contrast
+ * [ c, 0, 0, 0]
+ * [ 0, c, 0, 0]
+ * [ 0, 0, c, 0]
+ * [ 0, 0, 0, 1]
+ *
+ * brightness
+ * [ 1, 0, 0, b]
+ * [ 0, 1, 0, 0]
+ * [ 0, 0, 1, 0]
+ * [ 0, 0, 0, 1]
+ *
+ * saturation
+ * [ 1, 0, 0, 0]
+ * [ 0, s, 0, 0]
+ * [ 0, 0, s, 0]
+ * [ 0, 0, 0, 1]
+ *
+ * hue
+ * [ 1,       0,      0, 0]
+ * [ 0,  cos(h), sin(h), 0]
+ * [ 0, -sin(h), cos(h), 0]
+ * [ 0,       0,      0, 1]
+ *
+ * procamp
+ * [ c,           0,          0, b]
+ * [ 0,  c*s*cos(h), c*s*sin(h), 0]
+ * [ 0, -c*s*sin(h), c*s*cos(h), 0]
+ * [ 0,           0,          0, 1]
+ *
+ * bias
+ * [ 1, 0, 0,  ybias]
+ * [ 0, 1, 0, cbbias]
+ * [ 0, 0, 1, crbias]
+ * [ 0, 0, 0,      1]
+ *
+ * csc
+ * [ c*cstd[ 0], c*cstd[ 1]*s*cos(h) - c*cstd[ 2]*s*sin(h), c*cstd[ 2]*s*cos(h) + c*cstd[ 1]*s*sin(h), cstd[ 3] + cstd[ 0]*(b + c*ybias) + cstd[ 1]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[ 2]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
+ * [ c*cstd[ 4], c*cstd[ 5]*s*cos(h) - c*cstd[ 6]*s*sin(h), c*cstd[ 6]*s*cos(h) + c*cstd[ 5]*s*sin(h), cstd[ 7] + cstd[ 4]*(b + c*ybias) + cstd[ 5]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[ 6]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
+ * [ c*cstd[ 8], c*cstd[ 9]*s*cos(h) - c*cstd[10]*s*sin(h), c*cstd[10]*s*cos(h) + c*cstd[ 9]*s*sin(h), cstd[11] + cstd[ 8]*(b + c*ybias) + cstd[ 9]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[10]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
+ * [ c*cstd[12], c*cstd[13]*s*cos(h) - c*cstd[14]*s*sin(h), c*cstd[14]*s*cos(h) + c*cstd[13]*s*sin(h), cstd[15] + cstd[12]*(b + c*ybias) + cstd[13]*(c*cbbias*s*cos(h) + c*crbias*s*sin(h)) + cstd[14]*(c*crbias*s*cos(h) - c*cbbias*s*sin(h))]
+ */
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const float bt_601[16] =
+{
+   1.0f,  0.0f,    1.371f, 0.0f,
+   1.0f, -0.336f, -0.698f, 0.0f,
+   1.0f,  1.732f,  0.0f,   0.0f,
+   0.0f,  0.0f,    0.0f,   1.0f
+};
+
+/*
+ * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+static const float bt_601_full[16] =
+{
+   1.164f,  0.0f,    1.596f, 0.0f,
+   1.164f, -0.391f, -0.813f, 0.0f,
+   1.164f,  2.018f,  0.0f,   0.0f,
+   0.0f,    0.0f,    0.0f,   1.0f
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [16,235]
+ */
+static const float bt_709[16] =
+{
+   1.0f,  0.0f,    1.540f, 0.0f,
+   1.0f, -0.183f, -0.459f, 0.0f,
+   1.0f,  1.816f,  0.0f,   0.0f,
+   0.0f,  0.0f,    0.0f,   1.0f
+};
+
+/*
+ * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
+ * Y is in [16,235], Cb and Cr are in [16,240]
+ * R, G, and B are in [0,255]
+ */
+static const float bt_709_full[16] =
+{
+   1.164f,  0.0f,    1.793f, 0.0f,
+   1.164f, -0.213f, -0.534f, 0.0f,
+   1.164f,  2.115f,  0.0f,   0.0f,
+   0.0f,    0.0f,    0.0f,   1.0f
+};
+
+static const float identity[16] =
+{
+   1.0f, 0.0f, 0.0f, 0.0f,
+   0.0f, 1.0f, 0.0f, 0.0f,
+   0.0f, 0.0f, 1.0f, 0.0f,
+   0.0f, 0.0f, 0.0f, 1.0f
+};
+
+/* Fills matrix[16], laid out row by row, with csc = cstd * (procamp * bias); a NULL procamp selects neutral adjustments. */
+void vl_csc_get_matrix(enum VL_CSC_COLOR_STANDARD cs, struct vl_procamp *procamp,
+                       bool full_range, float *matrix)
+{
+   float ybias = full_range ? -16.0f/255.0f : 0.0f;
+   float cbbias = -128.0f/255.0f;
+   float crbias = -128.0f/255.0f;
+   float c = procamp ? procamp->contrast : 1.0f;
+   float s = procamp ? procamp->saturation : 1.0f;
+   float b = procamp ? procamp->brightness : 0.0f;
+   float h = procamp ? procamp->hue : 0.0f;
+   float cos_h = cosf(h), sin_h = sinf(h); /* evaluated once; keeps every row in single precision */
+   const float *cstd;
+
+   assert(matrix);
+
+   switch (cs) {
+      case VL_CSC_COLOR_STANDARD_BT_601:
+         cstd = full_range ? &bt_601_full[0] : &bt_601[0];
+         break;
+      case VL_CSC_COLOR_STANDARD_BT_709:
+         cstd = full_range ? &bt_709_full[0] : &bt_709[0];
+         break;
+      case VL_CSC_COLOR_STANDARD_IDENTITY: /* procamp and full_range are not applied on this path */
+      default: /* any other value trips the assert below, then falls back to identity */
+         assert(cs == VL_CSC_COLOR_STANDARD_IDENTITY);
+         memcpy(matrix, &identity[0], sizeof(float) * 16);
+         return;
+   }
+   /* One row of the product per group below; the expanded formulas are derived in the comment at the top of this file. */
+   matrix[ 0] = c*cstd[ 0];
+   matrix[ 1] = c*cstd[ 1]*s*cos_h - c*cstd[ 2]*s*sin_h;
+   matrix[ 2] = c*cstd[ 2]*s*cos_h + c*cstd[ 1]*s*sin_h;
+   matrix[ 3] = cstd[ 3] + cstd[ 0]*(b + c*ybias) + cstd[ 1]*(c*cbbias*s*cos_h + c*crbias*s*sin_h) + cstd[ 2]*(c*crbias*s*cos_h - c*cbbias*s*sin_h);
+
+   matrix[ 4] = c*cstd[ 4];
+   matrix[ 5] = c*cstd[ 5]*s*cos_h - c*cstd[ 6]*s*sin_h;
+   matrix[ 6] = c*cstd[ 6]*s*cos_h + c*cstd[ 5]*s*sin_h;
+   matrix[ 7] = cstd[ 7] + cstd[ 4]*(b + c*ybias) + cstd[ 5]*(c*cbbias*s*cos_h + c*crbias*s*sin_h) + cstd[ 6]*(c*crbias*s*cos_h - c*cbbias*s*sin_h);
+
+   matrix[ 8] = c*cstd[ 8];
+   matrix[ 9] = c*cstd[ 9]*s*cos_h - c*cstd[10]*s*sin_h;
+   matrix[10] = c*cstd[10]*s*cos_h + c*cstd[ 9]*s*sin_h;
+   matrix[11] = cstd[11] + cstd[ 8]*(b + c*ybias) + cstd[ 9]*(c*cbbias*s*cos_h + c*crbias*s*sin_h) + cstd[10]*(c*crbias*s*cos_h - c*cbbias*s*sin_h);
+
+   matrix[12] = c*cstd[12];
+   matrix[13] = c*cstd[13]*s*cos_h - c*cstd[14]*s*sin_h;
+   matrix[14] = c*cstd[14]*s*cos_h + c*cstd[13]*s*sin_h;
+   matrix[15] = cstd[15] + cstd[12]*(b + c*ybias) + cstd[13]*(c*cbbias*s*cos_h + c*crbias*s*sin_h) + cstd[14]*(c*crbias*s*cos_h - c*cbbias*s*sin_h);
+}
diff --git a/src/gallium/drivers/i965simple/brw_screen.h b/src/gallium/auxiliary/vl/vl_csc.h
index d3c70387e61..722ca35f339 100644
--- a/src/gallium/drivers/i965simple/brw_screen.h
+++ b/src/gallium/auxiliary/vl/vl_csc.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Younes Manton.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,44 +25,29 @@
  * 
  **************************************************************************/
 
+#ifndef vl_csc_h
+#define vl_csc_h
 
-#ifndef BRW_SCREEN_H
-#define BRW_SCREEN_H
+#include <pipe/p_compiler.h>
 
-
-#include "pipe/p_screen.h"
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/**
- * Subclass of pipe_screen
- */
-struct brw_screen
+struct vl_procamp
 {
-   struct pipe_screen screen;
-
-   uint pci_id;
+   float brightness;
+   float contrast;
+   float saturation;
+   float hue;
 };
 
-
-/** cast wrapper */
-static INLINE struct brw_screen *
-brw_screen(struct pipe_screen *pscreen)
+enum VL_CSC_COLOR_STANDARD
 {
-   return (struct brw_screen *) pscreen;
-}
-
-
-extern struct pipe_screen *
-brw_create_screen(struct pipe_winsys *winsys, uint pci_id);
-
+   VL_CSC_COLOR_STANDARD_IDENTITY,
+   VL_CSC_COLOR_STANDARD_BT_601,
+   VL_CSC_COLOR_STANDARD_BT_709
+};
 
-#ifdef __cplusplus
-}
-#endif
+void vl_csc_get_matrix(enum VL_CSC_COLOR_STANDARD cs,
+                       struct vl_procamp *procamp,
+                       bool full_range,
+                       float *matrix);
 
-#endif /* BRW_SCREEN_H */
+#endif /* vl_csc_h */
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
new file mode 100644
index 00000000000..c4ba69817f9
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
@@ -0,0 +1,1660 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_mpeg12_mc_renderer.h"
+#include <assert.h>
+#include <pipe/p_context.h>
+#include <pipe/p_inlines.h>
+#include <util/u_math.h>
+#include <util/u_memory.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+#include "vl_shader_build.h"
+
+#define DEFAULT_BUF_ALIGNMENT 1
+#define MACROBLOCK_WIDTH 16
+#define MACROBLOCK_HEIGHT 16
+#define BLOCK_WIDTH 8
+#define BLOCK_HEIGHT 8
+#define ZERO_BLOCK_NIL -1.0f
+#define ZERO_BLOCK_IS_NIL(zb) ((zb).x < 0.0f)
+
+struct vertex2f
+{
+   float x, y;
+};
+
+struct vertex4f
+{
+   float x, y, z, w;
+};
+
+struct vertex_shader_consts
+{
+   struct vertex4f denorm;
+};
+
+struct fragment_shader_consts
+{
+   struct vertex4f multiplier;
+   struct vertex4f div;
+};
+
+/*
+ * Multiplier renormalizes block samples from 16 bits to 12 bits.
+ * Divider is used when calculating Y % 2 for choosing top or bottom
+ * field for P or B macroblocks.
+ * TODO: Use immediates.
+ */
+static const struct fragment_shader_consts fs_consts = {
+   {32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},
+   {0.5f, 2.0f, 0.0f, 0.0f}
+};
+
+struct vert_stream_0
+{
+   struct vertex2f pos;
+   struct vertex2f luma_tc;
+   struct vertex2f cb_tc;
+   struct vertex2f cr_tc;
+};
+
+enum MACROBLOCK_TYPE
+{
+   MACROBLOCK_TYPE_INTRA,
+   MACROBLOCK_TYPE_FWD_FRAME_PRED,
+   MACROBLOCK_TYPE_FWD_FIELD_PRED,
+   MACROBLOCK_TYPE_BKWD_FRAME_PRED,
+   MACROBLOCK_TYPE_BKWD_FIELD_PRED,
+   MACROBLOCK_TYPE_BI_FRAME_PRED,
+   MACROBLOCK_TYPE_BI_FIELD_PRED,
+
+   NUM_MACROBLOCK_TYPES
+};
+
+static void
+create_intra_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 50;
+
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0              ; Vertex pos
+    * decl i1              ; Luma texcoords
+    * decl i2              ; Chroma Cb texcoords
+    * decl i3              ; Chroma Cr texcoords
+    */
+   for (i = 0; i < 4; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl o0              ; Vertex pos
+    * decl o1              ; Luma texcoords
+    * decl o2              ; Chroma Cb texcoords
+    * decl o3              ; Chroma Cr texcoords
+    */
+   for (i = 0; i < 4; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * mov o0, i0           ; Move input vertex pos to output
+    * mov o1, i1           ; Move input luma texcoords to output
+    * mov o2, i2           ; Move input chroma Cb texcoords to output
+    * mov o3, i3           ; Move input chroma Cr texcoords to output
+    */
+   for (i = 0; i < 4; ++i) {
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   r->i_vs = r->pipe->create_vs_state(r->pipe, &vs);
+   free(tokens);
+}
+
+static void
+create_intra_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   ti = 3;
+
+   /*
+    * decl i0                      ; Luma texcoords
+    * decl i1                      ; Chroma Cb texcoords
+    * decl i2                      ; Chroma Cr texcoords
+    */
+   for (i = 0; i < 3; ++i) {
+      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0                      ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0, t1 */
+   decl = vl_decl_temps(0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl s0                      ; Sampler for luma texture
+    * decl s1                      ; Sampler for chroma Cb texture
+    * decl s2                      ; Sampler for chroma Cr texture
+    */
+   for (i = 0; i < 3; ++i) {
+      decl = vl_decl_samplers(i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * tex2d t1, i0, s0             ; Read texel from luma texture
+    * mov t0.x, t1.x               ; Move luma sample into .x component
+    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
+    * mov t0.y, t1.x               ; Move Cb sample into .y component
+    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
+    * mov t0.z, t1.x               ; Move Cr sample into .z component
+    */
+   for (i = 0; i < 3; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* mul o0, t0, c0               ; Rescale texel to correct range */
+   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   fs.tokens = tokens;
+   r->i_fs = r->pipe->create_fs_state(r->pipe, &fs);
+   free(tokens);
+}
+
+static void
+create_frame_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0              ; Vertex pos
+    * decl i1              ; Luma texcoords
+    * decl i2              ; Chroma Cb texcoords
+    * decl i3              ; Chroma Cr texcoords
+    * decl i4              ; Ref surface top field texcoords
+    * decl i5              ; Ref surface bottom field texcoords (unused, packed in the same stream)
+    */
+   for (i = 0; i < 6; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl o0              ; Vertex pos
+    * decl o1              ; Luma texcoords
+    * decl o2              ; Chroma Cb texcoords
+    * decl o3              ; Chroma Cr texcoords
+    * decl o4              ; Ref macroblock texcoords
+    */
+   for (i = 0; i < 5; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * mov o0, i0           ; Move input vertex pos to output
+    * mov o1, i1           ; Move input luma texcoords to output
+    * mov o2, i2           ; Move input chroma Cb texcoords to output
+    * mov o3, i3           ; Move input chroma Cr texcoords to output
+    */
+   for (i = 0; i < 4; ++i) {
+        inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+        ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* add o4, i0, i4       ; Translate vertex pos by motion vec to form ref macroblock texcoords */
+   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, 4);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   r->p_vs[0] = r->pipe->create_vs_state(r->pipe, &vs);
+   free(tokens);
+}
+
+static void
+create_field_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false);
+}
+
+static void
+create_frame_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   ti = 3;
+
+   /*
+    * decl i0                      ; Luma texcoords
+    * decl i1                      ; Chroma Cb texcoords
+    * decl i2                      ; Chroma Cr texcoords
+    * decl i3                      ; Ref macroblock texcoords
+    */
+   for (i = 0; i < 4; ++i) {
+      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* decl c0                      ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl o0                      ; Fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /* decl t0, t1 */
+   decl = vl_decl_temps(0, 1);
+   ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+
+   /*
+    * decl s0                      ; Sampler for luma texture
+    * decl s1                      ; Sampler for chroma Cb texture
+    * decl s2                      ; Sampler for chroma Cr texture
+    * decl s3                      ; Sampler for ref surface texture
+    */
+   for (i = 0; i < 4; ++i) {
+      decl = vl_decl_samplers(i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * tex2d t1, i0, s0             ; Read texel from luma texture
+    * mov t0.x, t1.x               ; Move luma sample into .x component
+    * tex2d t1, i1, s1             ; Read texel from chroma Cb texture
+    * mov t0.y, t1.x               ; Move Cb sample into .y component
+    * tex2d t1, i2, s2             ; Read texel from chroma Cr texture
+    * mov t0.z, t1.x               ; Move Cr sample into .z component
+    */
+   for (i = 0; i < 3; ++i) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* mul t0, t0, c0               ; Rescale texel to correct range */
+   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* tex2d t1, i3, s3             ; Read texel from ref macroblock */
+   inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 3, TGSI_FILE_SAMPLER, 3);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* add o0, t0, t1               ; Add ref and differential to form final output */
+   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   fs.tokens = tokens;
+   r->p_fs[0] = r->pipe->create_fs_state(r->pipe, &fs);
+   free(tokens);
+}
+
+static void
+create_field_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(false);
+}
+
+static void
+create_frame_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state vs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+
+   unsigned ti;
+
+   unsigned i;
+
+   assert(r);
+
+   tokens = (struct tgsi_token *) malloc(max_tokens * sizeof(struct tgsi_token));
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
+
+   ti = 3;
+
+   /*
+    * decl i0              ; Vertex pos
+    * decl i1              ; Luma texcoords
+    * decl i2              ; Chroma Cb texcoords
+    * decl i3              ; Chroma Cr texcoords
+    * decl i4              ; First ref macroblock top field texcoords
+    * decl i5              ; First ref macroblock bottom field texcoords (unused, packed in the same stream)
+    * decl i6              ; Second ref macroblock top field texcoords
+    * decl i7              ; Second ref macroblock bottom field texcoords (unused, packed in the same stream)
+    */
+   for (i = 0; i < 8; i++) {
+      decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * decl o0              ; Vertex pos
+    * decl o1              ; Luma texcoords
+    * decl o2              ; Chroma Cb texcoords
+    * decl o3              ; Chroma Cr texcoords
+    * decl o4              ; First ref macroblock texcoords
+    * decl o5              ; Second ref macroblock texcoords
+    */
+   for (i = 0; i < 6; i++) {
+      decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
+      ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * mov o0, i0           ; Move input vertex pos to output
+    * mov o1, i1           ; Move input luma texcoords to output
+    * mov o2, i2           ; Move input chroma Cb texcoords to output
+    * mov o3, i3           ; Move input chroma Cr texcoords to output
+    */
+   for (i = 0; i < 4; ++i) {
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /*
+    * add o4, i0, i4       ; Translate vertex pos by motion vec to form first ref macroblock texcoords
+    * add o5, i0, i6       ; Translate vertex pos by motion vec to form second ref macroblock texcoords
+    */
+   for (i = 0; i < 2; ++i) {
+      inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, (i + 2) * 2);
+      ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+   }
+
+   /* end */
+   inst = vl_end();
+   ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
+
+   assert(ti <= max_tokens);
+
+   vs.tokens = tokens;
+   r->b_vs[0] = r->pipe->create_vs_state(r->pipe, &vs);
+   free(tokens);
+}
+
+/*
+ * Placeholder for the vertex shader of field-predicted, bi-directional
+ * macroblocks.  Nothing is built yet; the function only trips an assertion.
+ */
+static void
+create_field_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   (void) r;
+
+   assert(false);
+}
+
+/*
+ * Assemble the TGSI fragment shader for frame-predicted, bi-directional
+ * macroblocks and store the driver's shader object in r->b_fs[0].
+ *
+ * The generated program samples the luma/Cb/Cr planes into one temporary,
+ * scales that by c0, samples both reference surfaces, blends them with the
+ * weight in c1.x and adds the blend to the scaled differential.
+ *
+ * The token array is a scratch buffer: it is released once the driver has
+ * been handed the shader.
+ */
+static void
+create_frame_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   const unsigned max_tokens = 100;
+
+   struct pipe_shader_state fs;
+   struct tgsi_token *tokens;
+   struct tgsi_header *header;
+   struct tgsi_full_declaration decl;
+   struct tgsi_full_instruction inst;
+   unsigned used;       /* Number of tokens written so far */
+   unsigned n;
+
+   assert(r);
+
+   tokens = malloc(max_tokens * sizeof *tokens);
+
+   /* Fixed three-token preamble: version, header, processor type */
+   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();
+   header = (struct tgsi_header *) &tokens[1];
+   *header = tgsi_build_header();
+   *(struct tgsi_processor *) &tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
+
+   used = 3;
+
+   /*
+    * Inputs:
+    *   i0  luma texcoords
+    *   i1  chroma Cb texcoords
+    *   i2  chroma Cr texcoords
+    *   i3  first ref macroblock texcoords
+    *   i4  second ref macroblock texcoords
+    */
+   for (n = 0; n < 5; ++n) {
+      decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, n + 1, n, n, TGSI_INTERPOLATE_LINEAR);
+      used += tgsi_build_full_declaration(&decl, &tokens[used], header, max_tokens - used);
+   }
+
+   /*
+    * Constants:
+    *   c0  scaling factor, rescales 16-bit snorm to 9-bit snorm
+    *   c1  constant 1/2 in .x, used as weight to blend past and future texels
+    */
+   decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
+   used += tgsi_build_full_declaration(&decl, &tokens[used], header, max_tokens - used);
+
+   /* Output: o0 is the fragment color */
+   decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
+   used += tgsi_build_full_declaration(&decl, &tokens[used], header, max_tokens - used);
+
+   /* Temporaries: t0-t2 */
+   decl = vl_decl_temps(0, 2);
+   used += tgsi_build_full_declaration(&decl, &tokens[used], header, max_tokens - used);
+
+   /*
+    * Samplers:
+    *   s0  luma texture
+    *   s1  chroma Cb texture
+    *   s2  chroma Cr texture
+    *   s3  first ref surface texture
+    *   s4  second ref surface texture
+    */
+   for (n = 0; n < 5; ++n) {
+      decl = vl_decl_samplers(n, n);
+      used += tgsi_build_full_declaration(&decl, &tokens[used], header, max_tokens - used);
+   }
+
+   /*
+    * For each of the three planes:
+    *   tex2d t1, i<n>, s<n>     ; fetch the plane's texel
+    *   mov   t0.<xyz[n]>, t1.x  ; pack it into component n of t0
+    */
+   for (n = 0; n < 3; ++n) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, n, TGSI_FILE_SAMPLER, n);
+      used += tgsi_build_full_instruction(&inst, &tokens[used], header, max_tokens - used);
+
+      inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+      inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+      inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << n;
+      used += tgsi_build_full_instruction(&inst, &tokens[used], header, max_tokens - used);
+   }
+
+   /* mul t0, t0, c0 ; bring the differential into the correct range */
+   inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
+   used += tgsi_build_full_instruction(&inst, &tokens[used], header, max_tokens - used);
+
+   /*
+    * tex2d t1, i3, s3 ; texel from the first ref macroblock
+    * tex2d t2, i4, s4 ; texel from the second ref macroblock
+    */
+   for (n = 0; n < 2; ++n) {
+      inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, n + 1, TGSI_FILE_INPUT, n + 3, TGSI_FILE_SAMPLER, n + 3);
+      used += tgsi_build_full_instruction(&inst, &tokens[used], header, max_tokens - used);
+   }
+
+   /* lerp t1, c1.x, t1, t2 ; blend the two reference texels */
+   inst = vl_inst4(TGSI_OPCODE_LRP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
+   inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
+   used += tgsi_build_full_instruction(&inst, &tokens[used], header, max_tokens - used);
+
+   /* add o0, t0, t1 ; blended reference plus differential is the result */
+   inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
+   used += tgsi_build_full_instruction(&inst, &tokens[used], header, max_tokens - used);
+
+   /* end */
+   inst = vl_end();
+   used += tgsi_build_full_instruction(&inst, &tokens[used], header, max_tokens - used);
+
+   assert(used <= max_tokens);
+
+   fs.tokens = tokens;
+   r->b_fs[0] = r->pipe->create_fs_state(r->pipe, &fs);
+   free(tokens);
+}
+
+/*
+ * Placeholder for the fragment shader of field-predicted, bi-directional
+ * macroblocks.  Nothing is built yet; the function only trips an assertion.
+ */
+static void
+create_field_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
+{
+   (void) r;
+
+   assert(false);
+}
+
+/*
+ * For each of the first three textures in r->textures.all, obtain a write
+ * transfer spanning the texture's full level-0 width and height and map it.
+ * The transfer goes into r->tex_transfer[] and the mapped pointer into
+ * r->texels[].
+ */
+static void
+xfer_buffers_map(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned plane;
+
+   assert(r);
+
+   for (plane = 0; plane < 3; ++plane) {
+      struct pipe_texture *tex = r->textures.all[plane];
+
+      r->tex_transfer[plane] =
+         r->pipe->screen->get_tex_transfer(r->pipe->screen, tex,
+                                           0, 0, 0, PIPE_TRANSFER_WRITE, 0, 0,
+                                           tex->width[0], tex->height[0]);
+
+      r->texels[plane] =
+         r->pipe->screen->transfer_map(r->pipe->screen, r->tex_transfer[plane]);
+   }
+}
+
+/*
+ * Unmap and then destroy each of the three transfers held in
+ * r->tex_transfer[].
+ */
+static void
+xfer_buffers_unmap(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned plane = 0;
+
+   assert(r);
+
+   while (plane < 3) {
+      r->pipe->screen->transfer_unmap(r->pipe->screen, r->tex_transfer[plane]);
+      r->pipe->screen->tex_transfer_destroy(r->tex_transfer[plane]);
+      ++plane;
+   }
+}
+
+/*
+ * Fill in the viewport, scissor and framebuffer templates kept in the
+ * renderer and create the five sampler state objects (luma, Cb, Cr and the
+ * two reference surfaces) in r->samplers.all[].
+ *
+ * All three templates use the picture size, rounded up to a power of two in
+ * each dimension when r->pot_buffers is set.
+ *
+ * Always returns true.
+ */
+static bool
+init_pipe_state(struct vl_mpeg12_mc_renderer *r)
+{
+   struct pipe_sampler_state sampler;
+   unsigned filters[5];
+   unsigned buf_width;
+   unsigned buf_height;
+   unsigned i;
+
+   assert(r);
+
+   buf_width = r->pot_buffers ?
+      util_next_power_of_two(r->picture_width) : r->picture_width;
+   buf_height = r->pot_buffers ?
+      util_next_power_of_two(r->picture_height) : r->picture_height;
+
+   r->viewport.scale[0] = buf_width;
+   r->viewport.scale[1] = buf_height;
+   r->viewport.scale[2] = 1;
+   r->viewport.scale[3] = 1;
+   r->viewport.translate[0] = 0;
+   r->viewport.translate[1] = 0;
+   r->viewport.translate[2] = 0;
+   r->viewport.translate[3] = 0;
+
+   /* NOTE(review): scissor.minx/miny are not written here; presumably they are already zero -- confirm. */
+   r->scissor.maxx = buf_width;
+   r->scissor.maxy = buf_height;
+
+   r->fb_state.width = buf_width;
+   r->fb_state.height = buf_height;
+   r->fb_state.nr_cbufs = 1;
+   r->fb_state.zsbuf = NULL;
+
+   /* Luma filter */
+   filters[0] = PIPE_TEX_FILTER_NEAREST;
+   /* Chroma filters */
+   if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_444 ||
+       r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
+      filters[1] = PIPE_TEX_FILTER_NEAREST;
+      filters[2] = PIPE_TEX_FILTER_NEAREST;
+   }
+   else {
+      filters[1] = PIPE_TEX_FILTER_LINEAR;
+      filters[2] = PIPE_TEX_FILTER_LINEAR;
+   }
+   /* Fwd, bkwd ref filters */
+   filters[3] = PIPE_TEX_FILTER_LINEAR;
+   filters[4] = PIPE_TEX_FILTER_LINEAR;
+
+   /*
+    * Zero the template first: the fields that are not assigned below
+    * (prefilter, shadow_ambient, lod_bias, max_lod, border_color,
+    * max_anisotropy) would otherwise reach the driver holding whatever
+    * happened to be on the stack.
+    */
+   memset(&sampler, 0, sizeof(sampler));
+   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
+   sampler.compare_func = PIPE_FUNC_ALWAYS;
+   sampler.normalized_coords = 1;
+   sampler.min_lod = 0;
+
+   /* Only the image filter differs from one sampler to the next */
+   for (i = 0; i < 5; ++i) {
+      sampler.min_img_filter = filters[i];
+      sampler.mag_img_filter = filters[i];
+      r->samplers.all[i] = r->pipe->create_sampler_state(r->pipe, &sampler);
+   }
+
+   return true;
+}
+
+/*
+ * Delete the five sampler state objects held in r->samplers.all[].
+ */
+static void
+cleanup_pipe_state(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned s;
+
+   assert(r);
+
+   for (s = 0; s < 5; ++s) {
+      r->pipe->delete_sampler_state(r->pipe, r->samplers.all[s]);
+   }
+}
+
+/*
+ * Build every shader the renderer currently uses: the intra pair, the
+ * frame-predicted pair and the frame bi-predicted pair.  The field-based
+ * variants are not created here.
+ *
+ * Always returns true.
+ */
+static bool
+init_shaders(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(r);
+
+   /* Intra-coded macroblocks */
+   create_intra_vert_shader(r);
+   create_intra_frag_shader(r);
+
+   /* Single-reference, frame-based prediction */
+   create_frame_pred_vert_shader(r);
+   create_frame_pred_frag_shader(r);
+
+   /* Two-reference, frame-based prediction */
+   create_frame_bi_pred_vert_shader(r);
+   create_frame_bi_pred_frag_shader(r);
+
+   return true;
+}
+
+/*
+ * Delete the shader objects held in r->i_vs, r->i_fs and in slot 0 of
+ * r->p_vs, r->p_fs, r->b_vs and r->b_fs.
+ */
+static void
+cleanup_shaders(struct vl_mpeg12_mc_renderer *r)
+{
+   assert(r);
+
+   /* Intra */
+   r->pipe->delete_vs_state(r->pipe, r->i_vs);
+   r->pipe->delete_fs_state(r->pipe, r->i_fs);
+
+   /* Frame prediction, one reference */
+   r->pipe->delete_vs_state(r->pipe, r->p_vs[0]);
+   r->pipe->delete_fs_state(r->pipe, r->p_fs[0]);
+
+   /* Frame prediction, two references */
+   r->pipe->delete_vs_state(r->pipe, r->b_vs[0]);
+   r->pipe->delete_fs_state(r->pipe, r->b_fs[0]);
+}
+
+/*
+ * Allocate everything the renderer needs per batch:
+ *   - the CPU-side macroblock queue (r->macroblock_buf),
+ *   - the luma, Cb and Cr textures,
+ *   - three vertex buffers (position/texcoords and one per reference surface)
+ *     together with the eight vertex element descriptions that index them,
+ *   - the vertex and fragment shader constant buffers; the fragment constants
+ *     are uploaded from fs_consts right away.
+ *
+ * A batch holds one row of macroblocks, or the whole picture when
+ * r->bufmode is VL_MPEG12_MC_RENDERER_BUFFER_PICTURE.
+ *
+ * Returns false if the macroblock queue cannot be allocated (nothing else has
+ * been created at that point), true otherwise.
+ */
+static bool
+init_buffers(struct vl_mpeg12_mc_renderer *r)
+{
+   struct pipe_texture template;
+
+   const unsigned mbw =
+      align(r->picture_width, MACROBLOCK_WIDTH) / MACROBLOCK_WIDTH;
+   const unsigned mbh =
+      align(r->picture_height, MACROBLOCK_HEIGHT) / MACROBLOCK_HEIGHT;
+
+   unsigned i;
+
+   assert(r);
+
+   r->macroblocks_per_batch =
+      mbw * (r->bufmode == VL_MPEG12_MC_RENDERER_BUFFER_PICTURE ? mbh : 1);
+   r->num_macroblocks = 0;
+   r->macroblock_buf = MALLOC(r->macroblocks_per_batch * sizeof(struct pipe_mpeg12_macroblock));
+   if (!r->macroblock_buf)
+      return false;
+
+   memset(&template, 0, sizeof(struct pipe_texture));
+   template.target = PIPE_TEXTURE_2D;
+   /* TODO: Accommodate HW that can't do this and also for cases when this isn't precise enough */
+   template.format = PIPE_FORMAT_R16_SNORM;
+   template.last_level = 0;
+   template.width[0] = r->pot_buffers ?
+      util_next_power_of_two(r->picture_width) : r->picture_width;
+   template.height[0] = r->pot_buffers ?
+      util_next_power_of_two(r->picture_height) : r->picture_height;
+   template.depth[0] = 1;
+   pf_get_block(template.format, &template.block);
+   template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_DYNAMIC;
+
+   /* Luma plane: full picture size */
+   r->textures.individual.y = r->pipe->screen->texture_create(r->pipe->screen, &template);
+
+   /* Chroma planes: shrink the template according to the chroma format */
+   if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420) {
+      template.width[0] = r->pot_buffers ?
+         util_next_power_of_two(r->picture_width / 2) :
+         r->picture_width / 2;
+      template.height[0] = r->pot_buffers ?
+         util_next_power_of_two(r->picture_height / 2) :
+         r->picture_height / 2;
+   }
+   else if (r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_422) {
+      /*
+       * NOTE(review): 4:2:2 video normally halves the chroma width and keeps
+       * the full height; this halves the height instead.  Left as is until
+       * the rest of the 4:2:2 path can be checked -- confirm.
+       */
+      template.height[0] = r->pot_buffers ?
+         util_next_power_of_two(r->picture_height / 2) :
+         r->picture_height / 2;
+   }
+
+   r->textures.individual.cb =
+      r->pipe->screen->texture_create(r->pipe->screen, &template);
+   r->textures.individual.cr =
+      r->pipe->screen->texture_create(r->pipe->screen, &template);
+
+   /* Stream 0: four vertex2f per vertex, 24 vertices per macroblock */
+   r->vertex_bufs.individual.ycbcr.stride = sizeof(struct vertex2f) * 4;
+   r->vertex_bufs.individual.ycbcr.max_index = 24 * r->macroblocks_per_batch - 1;
+   r->vertex_bufs.individual.ycbcr.buffer_offset = 0;
+   r->vertex_bufs.individual.ycbcr.buffer = pipe_buffer_create
+   (
+      r->pipe->screen,
+      DEFAULT_BUF_ALIGNMENT,
+      PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
+      sizeof(struct vertex2f) * 4 * 24 * r->macroblocks_per_batch
+   );
+
+   /* Streams 1 and 2: two vertex2f per vertex, one stream per reference */
+   for (i = 1; i < 3; ++i) {
+      r->vertex_bufs.all[i].stride = sizeof(struct vertex2f) * 2;
+      r->vertex_bufs.all[i].max_index = 24 * r->macroblocks_per_batch - 1;
+      r->vertex_bufs.all[i].buffer_offset = 0;
+      r->vertex_bufs.all[i].buffer = pipe_buffer_create
+      (
+         r->pipe->screen,
+         DEFAULT_BUF_ALIGNMENT,
+         PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
+         sizeof(struct vertex2f) * 2 * 24 * r->macroblocks_per_batch
+      );
+   }
+
+   /* Position element */
+   r->vertex_elems[0].src_offset = 0;
+   r->vertex_elems[0].vertex_buffer_index = 0;
+   r->vertex_elems[0].nr_components = 2;
+   r->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Luma texcoord element */
+   r->vertex_elems[1].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[1].vertex_buffer_index = 0;
+   r->vertex_elems[1].nr_components = 2;
+   r->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Chroma Cb texcoord element (shader input i2) */
+   r->vertex_elems[2].src_offset = sizeof(struct vertex2f) * 2;
+   r->vertex_elems[2].vertex_buffer_index = 0;
+   r->vertex_elems[2].nr_components = 2;
+   r->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Chroma Cr texcoord element (shader input i3) */
+   r->vertex_elems[3].src_offset = sizeof(struct vertex2f) * 3;
+   r->vertex_elems[3].vertex_buffer_index = 0;
+   r->vertex_elems[3].nr_components = 2;
+   r->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* First ref surface top field texcoord element */
+   r->vertex_elems[4].src_offset = 0;
+   r->vertex_elems[4].vertex_buffer_index = 1;
+   r->vertex_elems[4].nr_components = 2;
+   r->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* First ref surface bottom field texcoord element */
+   r->vertex_elems[5].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[5].vertex_buffer_index = 1;
+   r->vertex_elems[5].nr_components = 2;
+   r->vertex_elems[5].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Second ref surface top field texcoord element */
+   r->vertex_elems[6].src_offset = 0;
+   r->vertex_elems[6].vertex_buffer_index = 2;
+   r->vertex_elems[6].nr_components = 2;
+   r->vertex_elems[6].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   /* Second ref surface bottom field texcoord element */
+   r->vertex_elems[7].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[7].vertex_buffer_index = 2;
+   r->vertex_elems[7].nr_components = 2;
+   r->vertex_elems[7].src_format = PIPE_FORMAT_R32G32_FLOAT;
+
+   r->vs_const_buf.buffer = pipe_buffer_create
+   (
+      r->pipe->screen,
+      DEFAULT_BUF_ALIGNMENT,
+      PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
+      sizeof(struct vertex_shader_consts)
+   );
+
+   r->fs_const_buf.buffer = pipe_buffer_create
+   (
+      r->pipe->screen,
+      DEFAULT_BUF_ALIGNMENT,
+      PIPE_BUFFER_USAGE_CONSTANT, sizeof(struct fragment_shader_consts)
+   );
+
+   /* The fragment constants never change, so upload them once here */
+   memcpy
+   (
+      pipe_buffer_map(r->pipe->screen, r->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
+      &fs_consts, sizeof(struct fragment_shader_consts)
+   );
+
+   pipe_buffer_unmap(r->pipe->screen, r->fs_const_buf.buffer);
+
+   return true;
+}
+
+/*
+ * Drop the renderer's references to its constant buffers, its three vertex
+ * buffers and its first three textures, then free the macroblock queue.
+ */
+static void
+cleanup_buffers(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned n;
+
+   assert(r);
+
+   /* Shader constant buffers */
+   pipe_buffer_reference(&r->vs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&r->fs_const_buf.buffer, NULL);
+
+   /* Vertex streams */
+   for (n = 0; n < 3; ++n) {
+      pipe_buffer_reference(&r->vertex_bufs.all[n].buffer, NULL);
+   }
+
+   /* Textures */
+   for (n = 0; n < 3; ++n) {
+      pipe_texture_reference(&r->textures.all[n], NULL);
+   }
+
+   FREE(r->macroblock_buf);
+}
+
+/*
+ * Translate a macroblock's (mb_type, mo_type) pair into the renderer's own
+ * MACROBLOCK_TYPE.  Intra macroblocks map to MACROBLOCK_TYPE_INTRA; every
+ * predicted type maps to its FRAME_PRED variant when mo_type is
+ * PIPE_MPEG12_MOTION_TYPE_FRAME and to its FIELD_PRED variant otherwise.
+ *
+ * An unknown mb_type trips an assertion and yields -1.
+ */
+static enum MACROBLOCK_TYPE
+get_macroblock_type(struct pipe_mpeg12_macroblock *mb)
+{
+   bool frame_motion;
+
+   assert(mb);
+
+   frame_motion = mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME;
+
+   switch (mb->mb_type) {
+      case PIPE_MPEG12_MACROBLOCK_TYPE_INTRA:
+         return MACROBLOCK_TYPE_INTRA;
+      case PIPE_MPEG12_MACROBLOCK_TYPE_FWD:
+         if (frame_motion)
+            return MACROBLOCK_TYPE_FWD_FRAME_PRED;
+         return MACROBLOCK_TYPE_FWD_FIELD_PRED;
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BKWD:
+         if (frame_motion)
+            return MACROBLOCK_TYPE_BKWD_FRAME_PRED;
+         return MACROBLOCK_TYPE_BKWD_FIELD_PRED;
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BI:
+         if (frame_motion)
+            return MACROBLOCK_TYPE_BI_FRAME_PRED;
+         return MACROBLOCK_TYPE_BI_FIELD_PRED;
+      default:
+         assert(0);
+   }
+
+   /* Unreachable */
+   return -1;
+}
+
+/*
+ * SET_BLOCK_QUAD: write the six vertices (two triangles) of an axis-aligned
+ * quad into member `fld` of (vb)[0..5].  (x0, y0) is one corner and (hx, hy)
+ * the quad's extent; the vertices are emitted in the order
+ *   (x0, y0) (x0, y0+hy) (x0+hx, y0) (x0+hx, y0) (x0, y0+hy) (x0+hx, y0+hy).
+ * Arguments are evaluated more than once.
+ */
+#define SET_BLOCK_QUAD(vb, fld, x0, y0, hx, hy)                                  \
+   do {                                                                          \
+      (vb)[0].fld.x = (x0);          (vb)[0].fld.y = (y0);                       \
+      (vb)[1].fld.x = (x0);          (vb)[1].fld.y = (y0) + (hy);                \
+      (vb)[2].fld.x = (x0) + (hx);   (vb)[2].fld.y = (y0);                       \
+      (vb)[3].fld.x = (x0) + (hx);   (vb)[3].fld.y = (y0);                       \
+      (vb)[4].fld.x = (x0);          (vb)[4].fld.y = (y0) + (hy);                \
+      (vb)[5].fld.x = (x0) + (hx);   (vb)[5].fld.y = (y0) + (hy);                \
+   } while (0)
+
+/*
+ * SET_BLOCK: fill the position and the luma/Cb/Cr texcoords of the six
+ * vertices at (vb)[0..5] for one block of a macroblock.
+ *
+ *   vb            first of six consecutive vertices to write
+ *   cbp           coded block pattern bits of the macroblock
+ *   mbx, mby      macroblock coordinates
+ *   unitx, unity  size of one macroblock in output coordinates
+ *   ofsx, ofsy    offset of this block inside the macroblock
+ *   hx, hy        extent of this block
+ *   lm, cbm, crm  cbp masks selecting this block's luma, Cb and Cr data
+ *   use_zb        when true, a plane whose cbp bit is clear takes its
+ *                 texcoords from zb instead of from the block position
+ *   zb            three texcoord origins, indexed 0 = luma, 1 = Cb, 2 = Cr
+ *
+ * The position always follows the block.  Each texcoord set follows the block
+ * when use_zb is false or the plane's cbp bit is set, and otherwise starts at
+ * the matching zb entry.  Arguments are evaluated more than once.
+ */
+#define SET_BLOCK(vb, cbp, mbx, mby, unitx, unity, ofsx, ofsy, hx, hy, lm, cbm, crm, use_zb, zb) \
+   do {                                                                          \
+      SET_BLOCK_QUAD(vb, pos,                                                    \
+                     (mbx) * (unitx) + (ofsx), (mby) * (unity) + (ofsy),         \
+                     hx, hy);                                                    \
+                                                                                 \
+      if (!(use_zb) || ((cbp) & (lm))) {                                         \
+         SET_BLOCK_QUAD(vb, luma_tc,                                             \
+                        (mbx) * (unitx) + (ofsx), (mby) * (unity) + (ofsy),      \
+                        hx, hy);                                                 \
+      }                                                                          \
+      else {                                                                     \
+         SET_BLOCK_QUAD(vb, luma_tc, (zb)[0].x, (zb)[0].y, hx, hy);              \
+      }                                                                          \
+                                                                                 \
+      if (!(use_zb) || ((cbp) & (cbm))) {                                        \
+         SET_BLOCK_QUAD(vb, cb_tc,                                               \
+                        (mbx) * (unitx) + (ofsx), (mby) * (unity) + (ofsy),      \
+                        hx, hy);                                                 \
+      }                                                                          \
+      else {                                                                     \
+         SET_BLOCK_QUAD(vb, cb_tc, (zb)[1].x, (zb)[1].y, hx, hy);                \
+      }                                                                          \
+                                                                                 \
+      if (!(use_zb) || ((cbp) & (crm))) {                                        \
+         SET_BLOCK_QUAD(vb, cr_tc,                                               \
+                        (mbx) * (unitx) + (ofsx), (mby) * (unity) + (ofsy),      \
+                        hx, hy);                                                 \
+      }                                                                          \
+      else {                                                                     \
+         SET_BLOCK_QUAD(vb, cr_tc, (zb)[2].x, (zb)[2].y, hx, hy);                \
+      }                                                                          \
+   } while (0)
+
+/*
+ * Generate the vertex data for one macroblock.
+ *
+ *   r         renderer; supplies the texel size, empty-block policy and
+ *             zero-block origins
+ *   mb        macroblock to emit
+ *   pos       slot of this macroblock inside the batch; every slot owns 24
+ *             vertices in each stream
+ *   ycbcr_vb  mapped stream 0 (position and luma/Cb/Cr texcoords)
+ *   ref_vb    mapped reference streams; ref_vb[0] is required for every
+ *             predicted type, ref_vb[1] additionally for bi-directional ones
+ *
+ * The cases fall through on purpose: a bi-directional macroblock writes the
+ * second reference stream, then the first reference stream, then the common
+ * position/texcoord stream; forward and backward macroblocks start at the
+ * second step and intra macroblocks at the third.
+ *
+ * Each reference stream holds two vertex2f per vertex.  The first always
+ * receives mo_vec[0]; the second receives mo_vec[1] unless the motion type is
+ * PIPE_MPEG12_MOTION_TYPE_FRAME, in which case it is left untouched.
+ */
+static void
+gen_macroblock_verts(struct vl_mpeg12_mc_renderer *r,
+                     struct pipe_mpeg12_macroblock *mb, unsigned pos,
+                     struct vertex2f *dummy_unused_never, struct vertex2f **ref_vb);
+
+static void
+gen_macroblock_verts(struct vl_mpeg12_mc_renderer *r,
+                     struct pipe_mpeg12_macroblock *mb, unsigned pos,
+                     struct vert_stream_0 *ycbcr_vb, struct vertex2f **ref_vb)
+{
+   /* Zeroed so that a motion type that sets only mo_vec[0] never leaks garbage */
+   struct vertex2f mo_vec[2] = { { 0 } };
+
+   unsigned i;
+
+   assert(r);
+   assert(mb);
+   assert(ycbcr_vb);
+   assert(pos < r->macroblocks_per_batch);
+
+   switch (mb->mb_type) {
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BI:
+      {
+         struct vertex2f *vb;
+
+         assert(ref_vb && ref_vb[1]);
+
+         vb = ref_vb[1] + pos * 2 * 24;
+
+         /* pmv values are scaled by 0.5 and the texel size to get texcoord offsets */
+         mo_vec[0].x = mb->pmv[0][1][0] * 0.5f * r->surface_tex_inv_size.x;
+         mo_vec[0].y = mb->pmv[0][1][1] * 0.5f * r->surface_tex_inv_size.y;
+
+         if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME) {
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+            }
+         }
+         else {
+            mo_vec[1].x = mb->pmv[1][1][0] * 0.5f * r->surface_tex_inv_size.x;
+            mo_vec[1].y = mb->pmv[1][1][1] * 0.5f * r->surface_tex_inv_size.y;
+
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+               vb[i + 1].x = mo_vec[1].x;
+               vb[i + 1].y = mo_vec[1].y;
+            }
+         }
+
+         /* fall-through */
+      }
+      case PIPE_MPEG12_MACROBLOCK_TYPE_FWD:
+      case PIPE_MPEG12_MACROBLOCK_TYPE_BKWD:
+      {
+         struct vertex2f *vb;
+
+         assert(ref_vb && ref_vb[0]);
+
+         vb = ref_vb[0] + pos * 2 * 24;
+
+         /* Backward macroblocks read pmv[..][1], all others pmv[..][0] */
+         if (mb->mb_type == PIPE_MPEG12_MACROBLOCK_TYPE_BKWD) {
+            mo_vec[0].x = mb->pmv[0][1][0] * 0.5f * r->surface_tex_inv_size.x;
+            mo_vec[0].y = mb->pmv[0][1][1] * 0.5f * r->surface_tex_inv_size.y;
+
+            if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FIELD) {
+               mo_vec[1].x = mb->pmv[1][1][0] * 0.5f * r->surface_tex_inv_size.x;
+               mo_vec[1].y = mb->pmv[1][1][1] * 0.5f * r->surface_tex_inv_size.y;
+            }
+         }
+         else {
+            mo_vec[0].x = mb->pmv[0][0][0] * 0.5f * r->surface_tex_inv_size.x;
+            mo_vec[0].y = mb->pmv[0][0][1] * 0.5f * r->surface_tex_inv_size.y;
+
+            if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FIELD) {
+               mo_vec[1].x = mb->pmv[1][0][0] * 0.5f * r->surface_tex_inv_size.x;
+               mo_vec[1].y = mb->pmv[1][0][1] * 0.5f * r->surface_tex_inv_size.y;
+            }
+         }
+
+         /*
+          * Select on the motion type (mo_type).  This used to test mb_type
+          * against a motion-type constant, which picked the branch by
+          * macroblock type instead.
+          */
+         if (mb->mo_type == PIPE_MPEG12_MOTION_TYPE_FRAME) {
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+            }
+         }
+         else {
+            for (i = 0; i < 24 * 2; i += 2) {
+               vb[i].x = mo_vec[0].x;
+               vb[i].y = mo_vec[0].y;
+               vb[i + 1].x = mo_vec[1].x;
+               vb[i + 1].y = mo_vec[1].y;
+            }
+         }
+
+         /* fall-through */
+      }
+      case PIPE_MPEG12_MACROBLOCK_TYPE_INTRA:
+      {
+         /* Size of a whole macroblock and of one quarter block */
+         const struct vertex2f unit =
+         {
+            r->surface_tex_inv_size.x * MACROBLOCK_WIDTH,
+            r->surface_tex_inv_size.y * MACROBLOCK_HEIGHT
+         };
+         const struct vertex2f half =
+         {
+            r->surface_tex_inv_size.x * (MACROBLOCK_WIDTH / 2),
+            r->surface_tex_inv_size.y * (MACROBLOCK_HEIGHT / 2)
+         };
+         const bool use_zb = r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE;
+
+         struct vert_stream_0 *vb = ycbcr_vb + pos * 24;
+
+         /*
+          * Four quarter blocks of six vertices each.  The luma cbp mask
+          * differs per quarter (32, 16, 8, 4); the chroma masks (2 and 1)
+          * are shared by all four.
+          */
+         SET_BLOCK(vb, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, 0, 0, half.x, half.y,
+                   32, 2, 1, use_zb, r->zero_block);
+
+         SET_BLOCK(vb + 6, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, half.x, 0, half.x, half.y,
+                   16, 2, 1, use_zb, r->zero_block);
+
+         SET_BLOCK(vb + 12, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, 0, half.y, half.x, half.y,
+                   8, 2, 1, use_zb, r->zero_block);
+
+         SET_BLOCK(vb + 18, mb->cbp, mb->mbx, mb->mby,
+                   unit.x, unit.y, half.x, half.y, half.x, half.y,
+                   4, 2, 1, use_zb, r->zero_block);
+
+         break;
+      }
+      default:
+         assert(0);
+   }
+}
+
+/*
+ * Turn the queued macroblocks into vertex data, grouped by macroblock type.
+ *
+ * On return num_macroblocks[t] has been incremented once for every queued
+ * macroblock of type t.  The first slot of each type is the sum of the
+ * counts of all lower-numbered types, so macroblocks of one type end up in
+ * consecutive slots of the vertex buffers.
+ *
+ * All three vertex buffers are mapped for writing for the duration of the
+ * call.
+ */
+static void
+gen_macroblock_stream(struct vl_mpeg12_mc_renderer *r,
+                      unsigned *num_macroblocks)
+{
+   unsigned next_slot[NUM_MACROBLOCK_TYPES];
+   struct vert_stream_0 *ycbcr_vb;
+   struct vertex2f *ref_vb[2];
+   unsigned running;
+   unsigned n;
+
+   assert(r);
+   assert(num_macroblocks);
+
+   /* Pass 1: histogram of macroblock types */
+   for (n = 0; n < r->num_macroblocks; ++n) {
+      ++num_macroblocks[get_macroblock_type(&r->macroblock_buf[n])];
+   }
+
+   /* Starting slot of each type: counts of all preceding types */
+   running = 0;
+   for (n = 0; n < NUM_MACROBLOCK_TYPES; ++n) {
+      next_slot[n] = running;
+      running += num_macroblocks[n];
+   }
+
+   ycbcr_vb = (struct vert_stream_0 *)pipe_buffer_map
+   (
+      r->pipe->screen,
+      r->vertex_bufs.individual.ycbcr.buffer,
+      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+   );
+
+   for (n = 0; n < 2; ++n) {
+      ref_vb[n] = (struct vertex2f *)pipe_buffer_map
+      (
+         r->pipe->screen,
+         r->vertex_bufs.individual.ref[n].buffer,
+         PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+      );
+   }
+
+   /* Pass 2: emit each macroblock into the next free slot of its type */
+   for (n = 0; n < r->num_macroblocks; ++n) {
+      enum MACROBLOCK_TYPE type = get_macroblock_type(&r->macroblock_buf[n]);
+
+      gen_macroblock_verts(r, &r->macroblock_buf[n], next_slot[type]++,
+                           ycbcr_vb, ref_vb);
+   }
+
+   pipe_buffer_unmap(r->pipe->screen, r->vertex_bufs.individual.ycbcr.buffer);
+   for (n = 0; n < 2; ++n) {
+      pipe_buffer_unmap(r->pipe->screen, r->vertex_bufs.individual.ref[n].buffer);
+   }
+}
+
+static void
+flush(struct vl_mpeg12_mc_renderer *r)
+{
+   unsigned num_macroblocks[NUM_MACROBLOCK_TYPES] = { 0 };
+   unsigned vb_start = 0;
+   struct vertex_shader_consts *vs_consts;
+   unsigned i;
+
+   assert(r);
+   assert(r->num_macroblocks == r->macroblocks_per_batch);
+
+   gen_macroblock_stream(r, num_macroblocks);
+
+   r->fb_state.cbufs[0] = r->pipe->screen->get_tex_surface
+   (
+      r->pipe->screen, r->surface,
+      0, 0, 0, PIPE_BUFFER_USAGE_GPU_WRITE
+   );
+
+   r->pipe->set_framebuffer_state(r->pipe, &r->fb_state);
+   r->pipe->set_viewport_state(r->pipe, &r->viewport);
+   r->pipe->set_scissor_state(r->pipe, &r->scissor);
+
+   vs_consts = pipe_buffer_map
+   (
+      r->pipe->screen, r->vs_const_buf.buffer,
+      PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
+   );
+
+   vs_consts->denorm.x = r->surface->width[0];
+   vs_consts->denorm.y = r->surface->height[0];
+
+   pipe_buffer_unmap(r->pipe->screen, r->vs_const_buf.buffer);
+
+   r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_VERTEX, 0,
+                                &r->vs_const_buf);
+   r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_FRAGMENT, 0,
+                                &r->fs_const_buf);
+
+   if (num_macroblocks[MACROBLOCK_TYPE_INTRA] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 1, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 4, r->vertex_elems);
+      r->pipe->set_sampler_textures(r->pipe, 3, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 3, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->i_vs);
+      r->pipe->bind_fs_state(r->pipe, r->i_fs);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_INTRA] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_INTRA] * 24;
+   }
+
+   if (num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[0]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[0]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_FWD_FRAME_PRED] * 24;
+   }
+
+   if (false /*num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] > 0 */ ) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[1]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[1]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_FWD_FIELD_PRED] * 24;
+   }
+
+   if (num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[0]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[0]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BKWD_FRAME_PRED] * 24;
+   }
+
+   if (false /*num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] > 0 */ ) {
+      r->pipe->set_vertex_buffers(r->pipe, 2, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 6, r->vertex_elems);
+      r->textures.individual.ref[0] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 4, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 4, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->p_vs[1]);
+      r->pipe->bind_fs_state(r->pipe, r->p_fs[1]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BKWD_FIELD_PRED] * 24;
+   }
+
+   if (num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] > 0) {
+      r->pipe->set_vertex_buffers(r->pipe, 3, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 8, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->textures.individual.ref[1] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 5, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 5, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->b_vs[0]);
+      r->pipe->bind_fs_state(r->pipe, r->b_fs[0]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BI_FRAME_PRED] * 24;
+   }
+
+   if (false /*num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] > 0 */ ) {
+      r->pipe->set_vertex_buffers(r->pipe, 3, r->vertex_bufs.all);
+      r->pipe->set_vertex_elements(r->pipe, 8, r->vertex_elems);
+      r->textures.individual.ref[0] = r->past;
+      r->textures.individual.ref[1] = r->future;
+      r->pipe->set_sampler_textures(r->pipe, 5, r->textures.all);
+      r->pipe->bind_sampler_states(r->pipe, 5, r->samplers.all);
+      r->pipe->bind_vs_state(r->pipe, r->b_vs[1]);
+      r->pipe->bind_fs_state(r->pipe, r->b_fs[1]);
+
+      r->pipe->draw_arrays(r->pipe, PIPE_PRIM_TRIANGLES, vb_start,
+                           num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] * 24);
+      vb_start += num_macroblocks[MACROBLOCK_TYPE_BI_FIELD_PRED] * 24;
+   }
+
+   r->pipe->flush(r->pipe, PIPE_FLUSH_RENDER_CACHE, r->fence);
+   pipe_surface_reference(&r->fb_state.cbufs[0], NULL);
+
+   if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE)
+      for (i = 0; i < 3; ++i)
+         r->zero_block[i].x = ZERO_BLOCK_NIL;
+
+   r->num_macroblocks = 0;
+}
+
+/* Copy one frame-coded block: source row N is written to destination row N. */
+static void
+grab_frame_coded_block(short *src, short *dst, unsigned dst_pitch)
+{
+   unsigned row;
+   assert(src);
+   assert(dst);
+   /* src rows are packed BLOCK_WIDTH apart; dst rows are dst_pitch shorts apart */
+   for (row = 0; row < BLOCK_HEIGHT; ++row)
+      memcpy(&dst[row * dst_pitch], &src[row * BLOCK_WIDTH], BLOCK_WIDTH * 2);
+}
+
+/* Copy one field-coded block: source row N is written to destination row 2N. */
+static void
+grab_field_coded_block(short *src, short *dst, unsigned dst_pitch)
+{
+   unsigned row;
+   assert(src);
+   assert(dst);
+   /* Doubling the pitch skips every other dst row, which is left untouched here */
+   for (row = 0; row < BLOCK_HEIGHT; ++row)
+      memcpy(&dst[row * dst_pitch * 2], &src[row * BLOCK_WIDTH], BLOCK_WIDTH * 2);
+}
+
+/* Clear a BLOCK_HEIGHT-row block at dst; rows are dst_pitch shorts apart. */
+static void
+fill_zero_block(short *dst, unsigned dst_pitch)
+{
+   unsigned row;
+   assert(dst);
+   /* BLOCK_WIDTH * 2 is a byte count (the 2 is presumably sizeof(short)) */
+   for (row = 0; row < BLOCK_HEIGHT; ++row)
+      memset(&dst[row * dst_pitch], 0, BLOCK_WIDTH * 2);
+}
+
+static void
+grab_blocks(struct vl_mpeg12_mc_renderer *r, unsigned mbx, unsigned mby,
+            enum pipe_mpeg12_dct_type dct_type, unsigned cbp, short *blocks)
+{
+   unsigned tex_pitch;
+   short *texels;
+   unsigned tb = 0, sb = 0;
+   unsigned mbpx = mbx * MACROBLOCK_WIDTH, mbpy = mby * MACROBLOCK_HEIGHT;
+   unsigned x, y;
+   /* Copies this macroblock's coded blocks (chosen by cbp bits 5..0) into r->texels[0..2] */
+   assert(r);
+   assert(blocks);
+
+   tex_pitch = r->tex_transfer[0]->stride / r->tex_transfer[0]->block.size;
+   texels = r->texels[0] + mbpy * tex_pitch + mbpx;
+   /* Plane 0: 2x2 grid of blocks; tb walks the grid, sb counts blocks consumed from 'blocks' */
+   for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x, ++tb) {
+         if ((cbp >> (5 - tb)) & 1) {
+            if (dct_type == PIPE_MPEG12_DCT_TYPE_FRAME) {
+               grab_frame_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT,
+                                      texels + y * tex_pitch * BLOCK_WIDTH +
+                                      x * BLOCK_WIDTH, tex_pitch);
+            }
+            else {
+               grab_field_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT,
+                                      texels + y * tex_pitch + x * BLOCK_WIDTH,
+                                      tex_pitch);
+            }
+
+            ++sb;
+         }
+         else if (r->eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE) {
+            if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL ||
+                ZERO_BLOCK_IS_NIL(r->zero_block[0])) {
+               fill_zero_block(texels + y * tex_pitch * BLOCK_WIDTH + x * BLOCK_WIDTH, tex_pitch);
+               if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
+                  r->zero_block[0].x = (mbpx + x * 8) * r->surface_tex_inv_size.x;
+                  r->zero_block[0].y = (mbpy + y * 8) * r->surface_tex_inv_size.y;
+               }
+            }
+         }
+      }
+   }
+
+   /* TODO: Implement 422, 444 */
+   assert(r->chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420);
+   /* Planes 1 and 2 are addressed at half the macroblock's pixel position */
+   mbpx /= 2;
+   mbpy /= 2;
+
+   for (tb = 0; tb < 2; ++tb) {
+      tex_pitch = r->tex_transfer[tb + 1]->stride / r->tex_transfer[tb + 1]->block.size;
+      texels = r->texels[tb + 1] + mbpy * tex_pitch + mbpx;
+      /* These planes always use the frame-coded copy, whatever dct_type says */
+      if ((cbp >> (1 - tb)) & 1) {
+         grab_frame_coded_block(blocks + sb * BLOCK_WIDTH * BLOCK_HEIGHT, texels, tex_pitch);
+         ++sb;
+      }
+      else if (r->eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE) {
+         if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL ||
+             ZERO_BLOCK_IS_NIL(r->zero_block[tb + 1])) {
+            fill_zero_block(texels, tex_pitch);
+            if (r->eb_handling == VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE) {
+               r->zero_block[tb + 1].x = (mbpx << 1) * r->surface_tex_inv_size.x;
+               r->zero_block[tb + 1].y = (mbpy << 1) * r->surface_tex_inv_size.y;
+            }
+         }
+      }
+   }
+}
+
+/* Queue one macroblock: store a copy of its header and copy its blocks. */
+static void
+grab_macroblock(struct vl_mpeg12_mc_renderer *r,
+                struct pipe_mpeg12_macroblock *mb)
+{
+   assert(r);
+   assert(mb);
+   assert(r->num_macroblocks < r->macroblocks_per_batch);
+
+   /* Struct assignment into the next free slot of the batch array */
+   r->macroblock_buf[r->num_macroblocks] = *mb;
+   grab_blocks(r, mb->mbx, mb->mby, mb->dct_type, mb->cbp, mb->blocks);
+   /* Count the slot only after both the header and the blocks are stored */
+   r->num_macroblocks++;
+}
+
+/* Reset *renderer, then create its pipe state, shaders and buffers; false on failure. */
+bool
+vl_mpeg12_mc_renderer_init(struct vl_mpeg12_mc_renderer *renderer,
+                           struct pipe_context *pipe,
+                           unsigned picture_width,
+                           unsigned picture_height,
+                           enum pipe_video_chroma_format chroma_format,
+                           enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode,
+                           enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling,
+                           bool pot_buffers)
+{
+   unsigned plane;
+
+   assert(renderer);
+   assert(pipe);
+   /* TODO: Implement other policies */
+   assert(bufmode == VL_MPEG12_MC_RENDERER_BUFFER_PICTURE);
+   /* TODO: Implement this */
+   /* XXX: XFER_ALL sampling issue at block edges when using bilinear filtering */
+   assert(eb_handling != VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE);
+   /* TODO: Non-pot buffers untested, probably doesn't work without changes to texcoord generation, vert shader, etc */
+   assert(pot_buffers);
+
+   memset(renderer, 0, sizeof *renderer);
+   renderer->pipe = pipe;
+   renderer->picture_width = picture_width;
+   renderer->picture_height = picture_height;
+   renderer->chroma_format = chroma_format;
+   renderer->bufmode = bufmode;
+   renderer->eb_handling = eb_handling;
+   renderer->pot_buffers = pot_buffers;
+   /* Each stage that fails unwinds the stages before it, newest first */
+   if (!init_pipe_state(renderer))
+      goto fail;
+   if (!init_shaders(renderer))
+      goto fail_pipe_state;
+   if (!init_buffers(renderer))
+      goto fail_shaders;
+
+   renderer->surface = NULL;
+   renderer->past = NULL;
+   renderer->future = NULL;
+   renderer->num_macroblocks = 0;
+   for (plane = 0; plane < 3; ++plane)
+      renderer->zero_block[plane].x = ZERO_BLOCK_NIL;
+   xfer_buffers_map(renderer);
+   return true;
+
+fail_shaders:
+   cleanup_shaders(renderer);
+fail_pipe_state:
+   cleanup_pipe_state(renderer);
+fail:
+   return false;
+}
+
+void
+vl_mpeg12_mc_renderer_cleanup(struct vl_mpeg12_mc_renderer *renderer)
+{
+   assert(renderer);
+   /* Tear-down: unmap the transfer buffers, then run the three cleanup_* helpers */
+   xfer_buffers_unmap(renderer);
+   /* NOTE(review): this order differs from the unwind order used in init - confirm it does not matter */
+   cleanup_pipe_state(renderer);
+   cleanup_shaders(renderer);
+   cleanup_buffers(renderer);
+}
+
+/* Queue macroblocks for 'surface'. A pending batch is flushed first when the
+ * target surface changes, and again every time the batch becomes full. */
+void
+vl_mpeg12_mc_renderer_render_macroblocks(struct vl_mpeg12_mc_renderer *renderer,
+                                         struct pipe_texture *surface,
+                                         struct pipe_texture *past,
+                                         struct pipe_texture *future,
+                                         unsigned num_macroblocks,
+                                         struct pipe_mpeg12_macroblock *mpeg12_macroblocks,
+                                         struct pipe_fence_handle **fence)
+{
+   bool new_surface = false;
+
+   assert(renderer);
+   assert(surface);
+   assert(num_macroblocks);
+   assert(mpeg12_macroblocks);
+
+   if (renderer->surface) {
+      if (surface != renderer->surface) {
+         if (renderer->num_macroblocks > 0) {
+            xfer_buffers_unmap(renderer);
+            flush(renderer);
+            /* Map again, as the full-batch path below does, before more texels are written */
+            xfer_buffers_map(renderer);
+         }
+         new_surface = true;
+      }
+      /* If the surface we're rendering hasn't changed the ref frames shouldn't change. */
+      assert(surface != renderer->surface || renderer->past == past);
+      assert(surface != renderer->surface || renderer->future == future);
+   }
+   else
+      new_surface = true;
+
+   if (new_surface) {
+      renderer->surface = surface;
+      renderer->past = past;
+      renderer->future = future;
+      renderer->fence = fence;
+      renderer->surface_tex_inv_size.x = 1.0f / surface->width[0];
+      renderer->surface_tex_inv_size.y = 1.0f / surface->height[0];
+   }
+
+   while (num_macroblocks) {
+      unsigned left_in_batch = renderer->macroblocks_per_batch - renderer->num_macroblocks;
+      unsigned num_to_submit = MIN2(num_macroblocks, left_in_batch);
+      unsigned i;
+
+      for (i = 0; i < num_to_submit; ++i) {
+         assert(mpeg12_macroblocks[i].base.codec == PIPE_VIDEO_CODEC_MPEG12);
+         grab_macroblock(renderer, &mpeg12_macroblocks[i]);
+      }
+      /* Step past what was queued so a following batch doesn't resubmit it */
+      mpeg12_macroblocks += num_to_submit;
+      num_macroblocks -= num_to_submit;
+      if (renderer->num_macroblocks == renderer->macroblocks_per_batch) {
+         xfer_buffers_unmap(renderer);
+         flush(renderer);
+         xfer_buffers_map(renderer);
+         /* Next time we get this surface it may have new ref frames */
+         renderer->surface = NULL;
+      }
+   }
+}
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
new file mode 100644
index 00000000000..64184337a06
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
@@ -0,0 +1,121 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef vl_mpeg12_mc_renderer_h
+#define vl_mpeg12_mc_renderer_h
+
+#include <pipe/p_compiler.h>
+#include <pipe/p_state.h>
+#include <pipe/p_video_state.h>
+
+struct pipe_context;
+struct pipe_video_surface;
+struct pipe_macroblock;
+
+/* A slice is video-width (rounded up to a multiple of macroblock width) x macroblock height */
+enum VL_MPEG12_MC_RENDERER_BUFFER_MODE
+{
+   VL_MPEG12_MC_RENDERER_BUFFER_SLICE,  /* Saves memory at the cost of smaller batches; init asserts it is not requested yet */
+   VL_MPEG12_MC_RENDERER_BUFFER_PICTURE /* Larger batches, more memory; the only mode init accepts so far */
+};
+/* How blocks whose cbp bit is clear ("empty" blocks) are handled while a macroblock is grabbed */
+enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK
+{
+   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ALL, /* Every empty block is zero-filled: waste of memory bandwidth */
+   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE, /* One zero block per plane is filled and its position recorded. Can only do point-filtering when interpolating subsampled chroma channels */
+   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_NONE /* Needs conditional texel fetch! Rejected by init's assert for now */
+};
+
+struct vl_mpeg12_mc_renderer
+{
+   struct pipe_context *pipe;
+   unsigned picture_width;
+   unsigned picture_height;
+   enum pipe_video_chroma_format chroma_format;
+   enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode;
+   enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling;
+   bool pot_buffers;
+   unsigned macroblocks_per_batch;
+   /* The fields above, except macroblocks_per_batch, are copied from init's arguments; pipe state follows */
+   struct pipe_viewport_state viewport;
+   struct pipe_scissor_state scissor;
+   struct pipe_constant_buffer vs_const_buf;
+   struct pipe_constant_buffer fs_const_buf;
+   struct pipe_framebuffer_state fb_state;
+   struct pipe_vertex_element vertex_elems[8];
+   /* Sampler state handles: 'all' is what gets bound, 'individual' names the same slots */
+   union
+   {
+      void *all[5];
+      struct { void *y, *cb, *cr, *ref[2]; } individual;
+   } samplers;
+   /* Shader handles; for p_* and b_*, flush binds [0] for frame and [1] for field prediction */
+   void *i_vs, *p_vs[2], *b_vs[2];
+   void *i_fs, *p_fs[2], *b_fs[2];
+   /* Textures to sample: flush stores past/future into ref[] before binding 'all' */
+   union
+   {
+      struct pipe_texture *all[5];
+      struct { struct pipe_texture *y, *cb, *cr, *ref[2]; } individual;
+   } textures;
+   /* Vertex buffers; flush binds the first 2 or, for bi-directional prediction, all 3 */
+   union
+   {
+      struct pipe_vertex_buffer all[3];
+      struct { struct pipe_vertex_buffer ycbcr, ref[2]; } individual;
+   } vertex_bufs;
+   /* Current target surface, its reference frames, and the batch being accumulated */
+   struct pipe_texture *surface, *past, *future;
+   struct pipe_fence_handle **fence;
+   unsigned num_macroblocks;
+   struct pipe_mpeg12_macroblock *macroblock_buf;
+   struct pipe_transfer *tex_transfer[3];
+   short *texels[3];
+   struct { float x, y; } surface_tex_inv_size;
+   struct { float x, y; } zero_block[3];
+};
+
+/* Set up *renderer for pictures of the given size; returns false on failure. */
+bool vl_mpeg12_mc_renderer_init(struct vl_mpeg12_mc_renderer *renderer, struct pipe_context *pipe,
+                                unsigned picture_width, unsigned picture_height,
+                                enum pipe_video_chroma_format chroma_format,
+                                enum VL_MPEG12_MC_RENDERER_BUFFER_MODE bufmode,
+                                enum VL_MPEG12_MC_RENDERER_EMPTY_BLOCK eb_handling,
+                                bool pot_buffers);
+
+/* Unmap the renderer's buffers and release what vl_mpeg12_mc_renderer_init() set up. */
+void vl_mpeg12_mc_renderer_cleanup(struct vl_mpeg12_mc_renderer *renderer);
+
+/* Queue macroblocks to be rendered to 'surface' using the past/future reference frames. */
+void vl_mpeg12_mc_renderer_render_macroblocks(struct vl_mpeg12_mc_renderer *renderer,
+                                              struct pipe_texture *surface,
+                                              struct pipe_texture *past, struct pipe_texture *future,
+                                              unsigned num_macroblocks,
+                                              struct pipe_mpeg12_macroblock *mpeg12_macroblocks,
+                                              struct pipe_fence_handle **fence);
+
+#endif /* vl_mpeg12_mc_renderer_h */
diff --git a/src/gallium/auxiliary/vl/vl_shader_build.c b/src/gallium/auxiliary/vl/vl_shader_build.c
new file mode 100644
index 00000000000..faa20a903cd
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_shader_build.c
@@ -0,0 +1,242 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "vl_shader_build.h"
+#include <assert.h>
+#include <tgsi/tgsi_parse.h>
+#include <tgsi/tgsi_build.h>
+
+/* Build a declaration for input registers first..last tagged with a semantic. */
+struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration input = tgsi_default_full_declaration();
+
+   input.Declaration.File = TGSI_FILE_INPUT;
+   input.DeclarationRange.First = first;
+   input.DeclarationRange.Last = last;
+   input.Declaration.Semantic = 1;   /* presumably flags the Semantic fields below as present */
+   input.Semantic.SemanticName = name;
+   input.Semantic.SemanticIndex = index;
+   return input;
+}
+
+/* Like vl_decl_input(), but also records how the input is interpolated. */
+struct tgsi_full_declaration vl_decl_interpolated_input
+(
+   unsigned int name,
+   unsigned int index,
+   unsigned int first,
+   unsigned int last,
+   int interpolation
+)
+{
+   struct tgsi_full_declaration input = tgsi_default_full_declaration();
+
+   /* Only these three interpolation modes are accepted */
+   assert(interpolation == TGSI_INTERPOLATE_CONSTANT ||
+          interpolation == TGSI_INTERPOLATE_LINEAR ||
+          interpolation == TGSI_INTERPOLATE_PERSPECTIVE);
+
+   /* Register file, interpolation and range first, then the semantic tag */
+   input.Declaration.File = TGSI_FILE_INPUT;
+   input.Declaration.Interpolate = interpolation;
+   input.DeclarationRange.First = first;
+   input.DeclarationRange.Last = last;
+   input.Declaration.Semantic = 1;
+   input.Semantic.SemanticName = name;
+   input.Semantic.SemanticIndex = index;
+
+   return input;
+}
+
+/* Build a declaration for constant registers first..last tagged with a semantic. */
+struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration constants = tgsi_default_full_declaration();
+
+   constants.Declaration.File = TGSI_FILE_CONSTANT;
+   constants.DeclarationRange.First = first;
+   constants.DeclarationRange.Last = last;
+   constants.Declaration.Semantic = 1;
+   constants.Semantic.SemanticName = name;
+   constants.Semantic.SemanticIndex = index;
+   return constants;
+}
+
+/* Build a declaration for output registers first..last tagged with a semantic. */
+struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration output = tgsi_default_full_declaration();
+
+   output.Declaration.File = TGSI_FILE_OUTPUT;
+   output.DeclarationRange.First = first;
+   output.DeclarationRange.Last = last;
+   output.Declaration.Semantic = 1;
+   output.Semantic.SemanticName = name;
+   output.Semantic.SemanticIndex = index;
+   return output;
+}
+
+/* Build a declaration for temporary registers first..last (no semantic is set). */
+struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration temps = tgsi_default_full_declaration();
+
+   temps.Declaration.File = TGSI_FILE_TEMPORARY;
+   temps.DeclarationRange.First = first;
+   temps.DeclarationRange.Last = last;
+
+   return temps;
+}
+
+/* Build a declaration for sampler registers first..last (no semantic is set). */
+struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last)
+{
+   struct tgsi_full_declaration samplers = tgsi_default_full_declaration();
+
+   samplers.Declaration.File = TGSI_FILE_SAMPLER;
+   samplers.DeclarationRange.First = first;
+   samplers.DeclarationRange.Last = last;
+
+   return samplers;
+}
+
+/* Build an instruction with one destination and one source register. */
+struct tgsi_full_instruction vl_inst2
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src_file,
+   unsigned int src_index
+)
+{
+   struct tgsi_full_instruction insn = tgsi_default_full_instruction();
+
+   insn.Instruction.Opcode = opcode;
+   insn.Instruction.NumDstRegs = 1;
+   insn.Instruction.NumSrcRegs = 1;
+   insn.FullDstRegisters[0].DstRegister.File = dst_file;
+   insn.FullDstRegisters[0].DstRegister.Index = dst_index;
+   insn.FullSrcRegisters[0].SrcRegister.File = src_file;
+   insn.FullSrcRegisters[0].SrcRegister.Index = src_index;
+   return insn;
+}
+
+/* Build an instruction with one destination and two source registers. */
+struct tgsi_full_instruction vl_inst3
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+)
+{
+   struct tgsi_full_instruction insn = tgsi_default_full_instruction();
+
+   insn.Instruction.Opcode = opcode;
+   insn.Instruction.NumDstRegs = 1;
+   insn.Instruction.NumSrcRegs = 2;
+   insn.FullDstRegisters[0].DstRegister.File = dst_file;
+   insn.FullDstRegisters[0].DstRegister.Index = dst_index;
+   insn.FullSrcRegisters[0].SrcRegister.File = src1_file;
+   insn.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+   insn.FullSrcRegisters[1].SrcRegister.File = src2_file;
+   insn.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+   return insn;
+}
+
+/* Build a TGSI_OPCODE_TEX instruction; tex is stored in InstructionExtTexture.Texture. */
+struct tgsi_full_instruction vl_tex
+(
+   int tex,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index
+)
+{
+   struct tgsi_full_instruction insn = tgsi_default_full_instruction();
+
+   insn.Instruction.Opcode = TGSI_OPCODE_TEX;
+   insn.InstructionExtTexture.Texture = tex;
+   insn.Instruction.NumDstRegs = 1;
+   insn.Instruction.NumSrcRegs = 2;
+   insn.FullDstRegisters[0].DstRegister.File = dst_file;
+   insn.FullDstRegisters[0].DstRegister.Index = dst_index;
+   insn.FullSrcRegisters[0].SrcRegister.File = src1_file;
+   insn.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+   insn.FullSrcRegisters[1].SrcRegister.File = src2_file;
+   insn.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+   return insn;
+}
+
+/* Build an instruction with one destination and three source registers. */
+struct tgsi_full_instruction vl_inst4
+(
+   int opcode,
+   enum tgsi_file_type dst_file,
+   unsigned int dst_index,
+   enum tgsi_file_type src1_file,
+   unsigned int src1_index,
+   enum tgsi_file_type src2_file,
+   unsigned int src2_index,
+   enum tgsi_file_type src3_file,
+   unsigned int src3_index
+)
+{
+   struct tgsi_full_instruction insn = tgsi_default_full_instruction();
+
+   insn.Instruction.Opcode = opcode;
+   insn.Instruction.NumDstRegs = 1;
+   insn.Instruction.NumSrcRegs = 3;
+   insn.FullDstRegisters[0].DstRegister.File = dst_file;
+   insn.FullDstRegisters[0].DstRegister.Index = dst_index;
+   insn.FullSrcRegisters[0].SrcRegister.File = src1_file;
+   insn.FullSrcRegisters[0].SrcRegister.Index = src1_index;
+   insn.FullSrcRegisters[1].SrcRegister.File = src2_file;
+   insn.FullSrcRegisters[1].SrcRegister.Index = src2_index;
+   insn.FullSrcRegisters[2].SrcRegister.File = src3_file;
+   insn.FullSrcRegisters[2].SrcRegister.Index = src3_index;
+   return insn;
+}
+
+/* Build a TGSI_OPCODE_END instruction with no destination or source registers. */
+struct tgsi_full_instruction vl_end(void)
+{
+   struct tgsi_full_instruction insn = tgsi_default_full_instruction();
+
+   insn.Instruction.NumSrcRegs = 0;
+   insn.Instruction.NumDstRegs = 0;
+   insn.Instruction.Opcode = TGSI_OPCODE_END;
+   return insn;
+}
diff --git a/src/gallium/auxiliary/vl/vl_shader_build.h b/src/gallium/auxiliary/vl/vl_shader_build.h
new file mode 100644
index 00000000000..5da71f8e136
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_shader_build.h
@@ -0,0 +1,88 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef vl_shader_build_h
+#define vl_shader_build_h
+
+#include <pipe/p_shader_tokens.h>
+
+/* Declaration builders. name/index give the semantic, first..last the
+ * register range; each returns the finished declaration by value. */
+struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_interpolated_input
+(
+   unsigned int name, unsigned int index,
+   unsigned int first, unsigned int last,
+   int interpolation
+);
+struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
+
+/* Temporaries and samplers are declared by register range only. */
+struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last);
+struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last);
+
+/* Instruction builders: vl_inst2, vl_inst3 and vl_inst4 take one destination
+ * register followed by one, two or three source registers respectively. */
+struct tgsi_full_instruction vl_inst2
+(
+   int opcode,
+   enum tgsi_file_type dst_file, unsigned int dst_index,
+   enum tgsi_file_type src_file, unsigned int src_index
+);
+
+struct tgsi_full_instruction vl_inst3
+(
+   int opcode,
+   enum tgsi_file_type dst_file, unsigned int dst_index,
+   enum tgsi_file_type src1_file, unsigned int src1_index,
+   enum tgsi_file_type src2_file, unsigned int src2_index
+);
+
+struct tgsi_full_instruction vl_inst4
+(
+   int opcode,
+   enum tgsi_file_type dst_file, unsigned int dst_index,
+   enum tgsi_file_type src1_file, unsigned int src1_index,
+   enum tgsi_file_type src2_file, unsigned int src2_index,
+   enum tgsi_file_type src3_file, unsigned int src3_index
+);
+
+/* TGSI_OPCODE_TEX with one destination and two sources; tex is stored in
+ * the instruction's InstructionExtTexture.Texture field. */
+struct tgsi_full_instruction vl_tex
+(
+   int tex,
+   enum tgsi_file_type dst_file, unsigned int dst_index,
+   enum tgsi_file_type src1_file, unsigned int src1_index,
+   enum tgsi_file_type src2_file, unsigned int src2_index
+);
+
+/* TGSI_OPCODE_END: no destination or source registers. */
+struct tgsi_full_instruction vl_end(void);
+
+#endif
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index b6b2f885af5..19e3ab08440 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -231,7 +231,7 @@ static boolean
 is_register_src(struct codegen *gen, int channel,
                 const struct tgsi_full_src_register *src)
 {
-   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   int swizzle = tgsi_util_get_full_src_register_swizzle(src, channel);
    int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
 
    if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) {
@@ -271,23 +271,14 @@ get_src_reg(struct codegen *gen,
             const struct tgsi_full_src_register *src)
 {
    int reg = -1;
-   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   int swizzle = tgsi_util_get_full_src_register_swizzle(src, channel);
    boolean reg_is_itemp = FALSE;
    uint sign_op;
 
    assert(swizzle >= TGSI_SWIZZLE_X);
-   assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
+   assert(swizzle <= TGSI_SWIZZLE_W);
 
-   if (swizzle == TGSI_EXTSWIZZLE_ONE) {
-      /* Load const one float and early out */
-      reg = get_const_one_reg(gen);
-   }
-   else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
-      /* Load const zero float and early out */
-      reg = get_itemp(gen);
-      spe_xor(gen->f, reg, reg, reg);
-   }
-   else {
+   {
       int index = src->SrcRegister.Index;
 
       assert(swizzle < 4);
@@ -1758,7 +1749,6 @@ emit_instruction(struct codegen *gen,
    case TGSI_OPCODE_ARL:
       return emit_ARL(gen, inst);
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
       return emit_MOV(gen, inst);
    case TGSI_OPCODE_ADD:
    case TGSI_OPCODE_SUB:
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index bd48ce70050..d185c6b8497 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -41,7 +41,7 @@
 static const char *
 cell_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -64,8 +64,6 @@ cell_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 6a63a0e6ced..ae4c61efb3b 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -389,22 +389,14 @@ cell_transfer_map(struct pipe_screen *screen, struct pipe_transfer *transfer)
    const uint texWidth = pt->width[level];
    const uint texHeight = pt->height[level];
    const uint stride = ct->stride[level];
-   unsigned flags = 0x0;
    unsigned size;
 
    assert(transfer->texture);
 
-   if (transfer->usage != PIPE_TRANSFER_READ) {
-      flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
-   }
-
-   if (transfer->usage != PIPE_TRANSFER_WRITE) {
-      flags |= PIPE_BUFFER_USAGE_CPU_READ;
-   }
-
    if (!ct->mapped) {
       /* map now */
-      ct->mapped = pipe_buffer_map(screen, ct->buffer, flags);
+      ct->mapped = pipe_buffer_map(screen, ct->buffer,
+                                   pipe_transfer_buffer_flags(transfer));
    }
 
    /*
@@ -417,8 +409,7 @@ cell_transfer_map(struct pipe_screen *screen, struct pipe_transfer *transfer)
    if (!ctrans->map)
       return NULL; /* out of memory */
 
-   if (transfer->usage == PIPE_TRANSFER_READ ||
-       transfer->usage == PIPE_TRANSFER_READ_WRITE) {
+   if (transfer->usage & PIPE_TRANSFER_READ) {
       /* need to untwiddle the texture to make a linear version */
       const uint bpp = pf_get_size(ct->base.format);
       if (bpp == 4) {
@@ -459,8 +450,7 @@ cell_transfer_unmap(struct pipe_screen *screen,
                                    PIPE_BUFFER_USAGE_CPU_READ);
    }
 
-   if (transfer->usage == PIPE_TRANSFER_WRITE ||
-       transfer->usage == PIPE_TRANSFER_READ_WRITE) {
+   if (transfer->usage & PIPE_TRANSFER_WRITE) {
       /* The user wrote new texture data into the mapped buffer.
        * We need to convert the new linear data into the twiddled/tiled format.
        */
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 0eaae2e451b..4c32b2d06d7 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -346,10 +346,10 @@ fetch_src_file_channel(
    union spu_exec_channel *chan )
 {
    switch( swizzle ) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
       switch( file ) {
       case TGSI_FILE_CONSTANT: {
          unsigned i;
@@ -413,14 +413,6 @@ fetch_src_file_channel(
       }
       break;
 
-   case TGSI_EXTSWIZZLE_ZERO:
-      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
-      break;
-
    default:
       ASSERT( 0 );
    }
@@ -500,7 +492,7 @@ fetch_source(
       }
    }
 
-   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
    fetch_src_file_channel(
       mach,
       reg->SrcRegister.File,
@@ -610,10 +602,8 @@ exec_kil(struct spu_exec_machine *mach,
    uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
    union spu_exec_channel r[1];
 
-   /* This mask stores component bits that were already tested. Note that
-    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
-   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+   /* This mask stores component bits that were already tested. */
+   uniquemask = 0;
 
    for (chan_index = 0; chan_index < 4; chan_index++)
    {
@@ -621,7 +611,7 @@ exec_kil(struct spu_exec_machine *mach,
       uint i;
 
       /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle (
+      swizzle = tgsi_util_get_full_src_register_swizzle (
                         &inst->FullSrcRegisters[0],
                         chan_index);
 
@@ -909,7 +899,6 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          STORE( &r[0], 0, chan_index );
@@ -1807,22 +1796,6 @@ exec_instruction(
       /* no-op */
       break;
 
-   case TGSI_OPCODE_NOISE1:
-      ASSERT( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE2:
-      ASSERT( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE3:
-      ASSERT( 0 );
-      break;
-
-   case TGSI_OPCODE_NOISE4:
-      ASSERT( 0 );
-      break;
-
    case TGSI_OPCODE_NOP:
       break;
 
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index af25dd3718a..c2c32b22d5a 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -26,104 +26,17 @@ tgsi_util_get_src_register_swizzle(
    return 0;
 }
 
-unsigned
-tgsi_util_get_src_register_extswizzle(
-   const struct tgsi_src_register_ext_swz *reg,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      return reg->ExtSwizzleX;
-   case 1:
-      return reg->ExtSwizzleY;
-   case 2:
-      return reg->ExtSwizzleZ;
-   case 3:
-      return reg->ExtSwizzleW;
-   default:
-      ASSERT( 0 );
-   }
-   return 0;
-}
 
 unsigned
-tgsi_util_get_full_src_register_extswizzle(
+tgsi_util_get_full_src_register_swizzle(
    const struct tgsi_full_src_register  *reg,
    unsigned component )
 {
-   unsigned swizzle;
-
-   /*
-    * First, calculate  the   extended swizzle for a given channel. This will give
-    * us either a channel index into the simple swizzle or  a constant 1 or   0.
-    */
-   swizzle = tgsi_util_get_src_register_extswizzle(
-      &reg->SrcRegisterExtSwz,
+   return tgsi_util_get_src_register_swizzle(
+      &reg->SrcRegister, /* pass by address, as the removed call to this helper did */
       component );
-
-   ASSERT (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
-   ASSERT (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
-   ASSERT (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
-   ASSERT (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
-   ASSERT (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
-   ASSERT (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
-
-   /*
-    * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
-    * Leave the constants intact, they are   not   affected by the   simple swizzle.
-    */
-   if( swizzle <= TGSI_SWIZZLE_W ) {
-      swizzle = tgsi_util_get_src_register_swizzle(
-         &reg->SrcRegister,
-         component );
-   }
-
-   return swizzle;
 }
 
-unsigned
-tgsi_util_get_src_register_extnegate(
-   const  struct tgsi_src_register_ext_swz *reg,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      return reg->NegateX;
-   case 1:
-      return reg->NegateY;
-   case 2:
-      return reg->NegateZ;
-   case 3:
-      return reg->NegateW;
-   default:
-      ASSERT( 0 );
-   }
-   return 0;
-}
-
-void
-tgsi_util_set_src_register_extnegate(
-   struct tgsi_src_register_ext_swz *reg,
-   unsigned negate,
-   unsigned component )
-{
-   switch( component ) {
-   case 0:
-      reg->NegateX = negate;
-      break;
-   case 1:
-      reg->NegateY = negate;
-      break;
-   case 2:
-      reg->NegateZ = negate;
-      break;
-   case 3:
-      reg->NegateW = negate;
-      break;
-   default:
-      ASSERT( 0 );
-   }
-}
 
 unsigned
 tgsi_util_get_full_src_register_sign_mode(
@@ -148,9 +61,6 @@ tgsi_util_get_full_src_register_sign_mode(
       unsigned negate;
 
       negate = reg->SrcRegister.Negate;
-      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
-         negate = !negate;
-      }
       if( reg->SrcRegisterExtMod.Negate ) {
          negate = !negate;
       }
diff --git a/src/gallium/drivers/i915simple/Makefile b/src/gallium/drivers/i915/Makefile
index fb533c17961..e33c74d02f7 100644
--- a/src/gallium/drivers/i915simple/Makefile
+++ b/src/gallium/drivers/i915/Makefile
@@ -1,7 +1,7 @@
 TOP = ../../../..
 include $(TOP)/configs/current
 
-LIBNAME = i915simple
+LIBNAME = i915
 
 C_SOURCES = \
 	i915_blit.c \
diff --git a/src/gallium/drivers/i915simple/SConscript b/src/gallium/drivers/i915/SConscript
index 778c4ed0fde..5a1c47c88db 100644
--- a/src/gallium/drivers/i915simple/SConscript
+++ b/src/gallium/drivers/i915/SConscript
@@ -2,8 +2,8 @@ Import('*')
 
 env = env.Clone()
 
-i915simple = env.ConvenienceLibrary(
-	target = 'i915simple',
+i915 = env.ConvenienceLibrary(
+	target = 'i915',
 	source = [
 		'i915_blit.c',
 		'i915_buffer.c',
@@ -27,4 +27,4 @@ i915simple = env.ConvenienceLibrary(
 		'i915_texture.c',
 	])
 
-Export('i915simple')
+Export('i915')
diff --git a/src/gallium/drivers/i915simple/i915_batch.h b/src/gallium/drivers/i915/i915_batch.h
index b813784723f..b813784723f 100644
--- a/src/gallium/drivers/i915simple/i915_batch.h
+++ b/src/gallium/drivers/i915/i915_batch.h
diff --git a/src/gallium/drivers/i915simple/i915_blit.c b/src/gallium/drivers/i915/i915_blit.c
index 83dfc335288..83dfc335288 100644
--- a/src/gallium/drivers/i915simple/i915_blit.c
+++ b/src/gallium/drivers/i915/i915_blit.c
diff --git a/src/gallium/drivers/i915simple/i915_blit.h b/src/gallium/drivers/i915/i915_blit.h
index 8ce3220cfd9..8ce3220cfd9 100644
--- a/src/gallium/drivers/i915simple/i915_blit.h
+++ b/src/gallium/drivers/i915/i915_blit.h
diff --git a/src/gallium/drivers/i915simple/i915_buffer.c b/src/gallium/drivers/i915/i915_buffer.c
index effeba12972..effeba12972 100644
--- a/src/gallium/drivers/i915simple/i915_buffer.c
+++ b/src/gallium/drivers/i915/i915_buffer.c
diff --git a/src/gallium/drivers/i915simple/i915_buffer.h b/src/gallium/drivers/i915/i915_buffer.h
index 80fda7c62fd..80fda7c62fd 100644
--- a/src/gallium/drivers/i915simple/i915_buffer.h
+++ b/src/gallium/drivers/i915/i915_buffer.h
diff --git a/src/gallium/drivers/i915simple/i915_clear.c b/src/gallium/drivers/i915/i915_clear.c
index 90530f2826f..90530f2826f 100644
--- a/src/gallium/drivers/i915simple/i915_clear.c
+++ b/src/gallium/drivers/i915/i915_clear.c
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915/i915_context.c
index b43f7352456..94c8aee30fe 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915/i915_context.c
@@ -155,15 +155,11 @@ static unsigned int
 i915_is_buffer_referenced(struct pipe_context *pipe,
                           struct pipe_buffer *buf)
 {
-   /**
-    * FIXME: Return the corrent result. We can't alays return referenced
-    *        since it causes a double flush within the vbo module.
+   /*
+    * Since we never expose hardware buffers to the state tracker
+    * they can never be referenced, so this isn't a lie
     */
-#if 0
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-#else
    return 0;
-#endif
 }
 
 
@@ -175,12 +171,19 @@ i915_is_buffer_referenced(struct pipe_context *pipe,
 static void i915_destroy(struct pipe_context *pipe)
 {
    struct i915_context *i915 = i915_context(pipe);
+   int i;
 
    draw_destroy(i915->draw);
    
    if(i915->batch)
       i915->iws->batchbuffer_destroy(i915->batch);
 
+   /* unbind framebuffer */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, NULL);
+
    FREE(i915);
 }
 
diff --git a/src/gallium/drivers/i915simple/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 234b441ce6e..234b441ce6e 100644
--- a/src/gallium/drivers/i915simple/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
diff --git a/src/gallium/drivers/i915simple/i915_debug.c b/src/gallium/drivers/i915/i915_debug.c
index ce92d1af9a7..c6e6d6fd313 100644
--- a/src/gallium/drivers/i915simple/i915_debug.c
+++ b/src/gallium/drivers/i915/i915_debug.c
@@ -851,6 +851,7 @@ static boolean i915_debug_packet( struct debug_stream *stream )
       default:
 	 return debug(stream, "", 0);
       }
+      break;
    default:
       assert(0);
       return 0;
@@ -880,7 +881,7 @@ i915_dump_batchbuffer( struct intel_batchbuffer *batch )
       return;
    }
    
-   debug_printf( "\n\nBATCH: (%d)\n", bytes / 4);
+   debug_printf( "\n\nBATCH: (%d)\n", (int)bytes / 4);
 
    while (!done &&
 	  stream.offset < bytes)
diff --git a/src/gallium/drivers/i915simple/i915_debug.h b/src/gallium/drivers/i915/i915_debug.h
index dd9b86e17b5..dd9b86e17b5 100644
--- a/src/gallium/drivers/i915simple/i915_debug.h
+++ b/src/gallium/drivers/i915/i915_debug.h
diff --git a/src/gallium/drivers/i915simple/i915_debug_fp.c b/src/gallium/drivers/i915/i915_debug_fp.c
index 9c5b117b6dd..9c5b117b6dd 100644
--- a/src/gallium/drivers/i915simple/i915_debug_fp.c
+++ b/src/gallium/drivers/i915/i915_debug_fp.c
diff --git a/src/gallium/drivers/i915simple/i915_flush.c b/src/gallium/drivers/i915/i915_flush.c
index 1582168eba5..1582168eba5 100644
--- a/src/gallium/drivers/i915simple/i915_flush.c
+++ b/src/gallium/drivers/i915/i915_flush.c
diff --git a/src/gallium/drivers/i915simple/i915_fpc.h b/src/gallium/drivers/i915/i915_fpc.h
index 2f0f99d0468..2f0f99d0468 100644
--- a/src/gallium/drivers/i915simple/i915_fpc.h
+++ b/src/gallium/drivers/i915/i915_fpc.h
diff --git a/src/gallium/drivers/i915simple/i915_fpc_emit.c b/src/gallium/drivers/i915/i915_fpc_emit.c
index b054ce41d39..b054ce41d39 100644
--- a/src/gallium/drivers/i915simple/i915_fpc_emit.c
+++ b/src/gallium/drivers/i915/i915_fpc_emit.c
diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index 89504ced276..379d47e79a3 100644
--- a/src/gallium/drivers/i915simple/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -127,7 +127,7 @@ i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
    va_start( args, msg );  
    util_vsnprintf( buffer, sizeof(buffer), msg, args );
    va_end( args );
-   debug_printf(buffer);
+   debug_printf("%s", buffer);
    debug_printf("\n");
 
    p->error = 1;
@@ -214,37 +214,19 @@ src_vector(struct i915_fp_compile *p,
       return 0;
    }
 
-   if (source->SrcRegister.Extended) {
-      src = swizzle(src,
-                    source->SrcRegisterExtSwz.ExtSwizzleX,
-                    source->SrcRegisterExtSwz.ExtSwizzleY,
-                    source->SrcRegisterExtSwz.ExtSwizzleZ,
-                    source->SrcRegisterExtSwz.ExtSwizzleW);
-   }
-   else {
-      src = swizzle(src,
-                    source->SrcRegister.SwizzleX,
-                    source->SrcRegister.SwizzleY,
-                    source->SrcRegister.SwizzleZ,
-                    source->SrcRegister.SwizzleW);
-   }
+   src = swizzle(src,
+		 source->SrcRegister.SwizzleX,
+		 source->SrcRegister.SwizzleY,
+		 source->SrcRegister.SwizzleZ,
+		 source->SrcRegister.SwizzleW);
 
 
    /* There's both negate-all-components and per-component negation.
     * Try to handle both here.
     */
    {
-      int nx = source->SrcRegisterExtSwz.NegateX;
-      int ny = source->SrcRegisterExtSwz.NegateY;
-      int nz = source->SrcRegisterExtSwz.NegateZ;
-      int nw = source->SrcRegisterExtSwz.NegateW;
-      if (source->SrcRegister.Negate) {
-         nx = !nx;
-         ny = !ny;
-         nz = !nz;
-         nw = !nw;
-      }
-      src = negate(src, nx, ny, nz, nw);
+      int n = source->SrcRegister.Negate;
+      src = negate(src, n, n, n, n);
    }
 
    /* no abs() or post-abs negation */
@@ -681,7 +663,6 @@ i915_translate_instruction(struct i915_fp_compile *p,
       break;
 
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
       emit_simple_arith(p, inst, A0_MOV, 1);
       break;
 
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915/i915_prim_emit.c
index d9a5c40ab97..d9a5c40ab97 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915/i915_prim_emit.c
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915/i915_prim_vbuf.c
index 508f4560e48..6b832140a87 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915/i915_prim_vbuf.c
@@ -44,6 +44,7 @@
 #include "pipe/p_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_fifo.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
@@ -51,6 +52,9 @@
 #include "i915_state.h"
 
 
+#undef VBUF_USE_FIFO
+#undef VBUF_MAP_BUFFER
+
 /**
  * Primitive renderer for i915.
  */
@@ -73,11 +77,25 @@ struct i915_vbuf_render {
 
    /* Stuff for the vbo */
    struct intel_buffer *vbo;
-   size_t vbo_size;
+   size_t vbo_size; /**< current size of allocated buffer */
+   size_t vbo_alloc_size; /**< minimum buffer size to allocate */
    size_t vbo_offset;
    void *vbo_ptr;
-   size_t vbo_alloc_size;
    size_t vbo_max_used;
+
+#ifndef VBUF_MAP_BUFFER
+   size_t map_used_start;
+   size_t map_used_end;
+   size_t map_size;
+#endif
+
+#ifdef VBUF_USE_FIFO
+   /* Stuff for the pool */
+   struct util_fifo *pool_fifo;
+   unsigned pool_used;
+   unsigned pool_buffer_size;
+   boolean pool_not_used;
+#endif
 };
 
 
@@ -106,33 +124,90 @@ i915_vbuf_render_get_vertex_info(struct vbuf_render *render)
 }
 
 static boolean
+i915_vbuf_render_reserve(struct i915_vbuf_render *i915_render, size_t size)
+{
+   struct i915_context *i915 = i915_render->i915;
+
+   if (i915_render->vbo_size < size + i915_render->vbo_offset)
+      return FALSE;
+
+   if (i915->vbo_flushed)
+      return FALSE;
+
+   return TRUE;
+}
+
+static void
+i915_vbuf_render_new_buf(struct i915_vbuf_render *i915_render, size_t size)
+{
+   struct i915_context *i915 = i915_render->i915;
+   struct intel_winsys *iws = i915->iws;
+
+   if (i915_render->vbo) {
+#ifdef VBUF_USE_FIFO
+      if (i915_render->pool_not_used)
+         iws->buffer_destroy(iws, i915_render->vbo);
+      else
+         u_fifo_add(i915_render->pool_fifo, i915_render->vbo);
+      i915_render->vbo = NULL;
+#else
+      iws->buffer_destroy(iws, i915_render->vbo);
+#endif
+   }
+
+   i915->vbo_flushed = 0;
+
+   i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size);
+   i915_render->vbo_offset = 0;
+
+#ifndef VBUF_MAP_BUFFER
+   if (i915_render->vbo_size > i915_render->map_size) {
+      i915_render->map_size = i915_render->vbo_size;
+      FREE(i915_render->vbo_ptr);
+      i915_render->vbo_ptr = MALLOC(i915_render->map_size);
+   }
+#endif
+
+#ifdef VBUF_USE_FIFO
+   if (i915_render->vbo_size != i915_render->pool_buffer_size) {
+      i915_render->pool_not_used = TRUE;
+      i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
+            INTEL_NEW_VERTEX);
+   } else {
+      i915_render->pool_not_used = FALSE;
+
+      if (i915_render->pool_used >= 2) {
+         FLUSH_BATCH(NULL);
+         i915->vbo_flushed = 0;
+         i915_render->pool_used = 0;
+      }
+      u_fifo_pop(i915_render->pool_fifo, (void**)&i915_render->vbo);
+   }
+#else
+   i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size,
+                                         64, INTEL_NEW_VERTEX);
+#endif
+}
+
+static boolean
 i915_vbuf_render_allocate_vertices(struct vbuf_render *render,
                                    ushort vertex_size,
                                    ushort nr_vertices)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
-   struct intel_winsys *iws = i915->iws;
    size_t size = (size_t)vertex_size * (size_t)nr_vertices;
 
    /* FIXME: handle failure */
    assert(!i915->vbo);
 
-   if (i915_render->vbo_size > size + i915_render->vbo_offset && !i915->vbo_flushed) {
-   } else {
-      i915->vbo_flushed = 0;
-      if (i915_render->vbo) {
-         iws->buffer_destroy(iws, i915_render->vbo);
-         i915_render->vbo = NULL;
-      }
-   }
-
-   if (!i915_render->vbo) {
-      i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size);
-      i915_render->vbo_offset = 0;
-      i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
-                                            INTEL_NEW_VERTEX);
-
+   if (!i915_vbuf_render_reserve(i915_render, size)) {
+#ifdef VBUF_USE_FIFO
+      /* in case we flushed, reset the number of pool buffers used */
+      if (i915->vbo_flushed)
+         i915_render->pool_used = 0;
+#endif
+      i915_vbuf_render_new_buf(i915_render, size);
    }
 
    i915_render->vertex_size = vertex_size;
@@ -153,11 +228,15 @@ i915_vbuf_render_map_vertices(struct vbuf_render *render)
    struct intel_winsys *iws = i915->iws;
 
    if (i915->vbo_flushed)
-      debug_printf("%s bad vbo flush occured stalling on hw\n");
+      debug_printf("%s bad vbo flush occured stalling on hw\n", __FUNCTION__);
 
+#ifdef VBUF_MAP_BUFFER
    i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
-
-   return (unsigned char *)i915_render->vbo_ptr + i915->vbo_offset;
+   return (unsigned char *)i915_render->vbo_ptr + i915_render->vbo_offset;
+#else
+   (void)iws;
+   return (unsigned char *)i915_render->vbo_ptr;
+#endif
 }
 
 static void
@@ -170,7 +249,17 @@ i915_vbuf_render_unmap_vertices(struct vbuf_render *render,
    struct intel_winsys *iws = i915->iws;
 
    i915_render->vbo_max_used = MAX2(i915_render->vbo_max_used, i915_render->vertex_size * (max_index + 1));
+#ifdef VBUF_MAP_BUFFER
    iws->buffer_unmap(iws, i915_render->vbo);
+#else
+   i915_render->map_used_start = i915_render->vertex_size * min_index;
+   i915_render->map_used_end = i915_render->vertex_size * (max_index + 1);
+   iws->buffer_write(iws, i915_render->vbo,
+                     i915_render->map_used_start + i915_render->vbo_offset,
+                     i915_render->map_used_end - i915_render->map_used_start,
+                     (unsigned char *)i915_render->vbo_ptr + i915_render->map_used_start);
+
+#endif
 }
 
 static boolean
@@ -344,14 +433,43 @@ i915_vbuf_render_draw_arrays(struct vbuf_render *render,
                              uint nr)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
 
    if (i915_render->fallback) {
       draw_arrays_fallback(render, start, nr);
       return;
    }
 
-   /* JB: TODO submit direct cmds */
-   draw_arrays_fallback(render, start, nr);
+   if (i915->dirty)
+      i915_update_derived(i915);
+
+   if (i915->hardware_dirty)
+      i915_emit_hardware_state(i915);
+
+   if (!BEGIN_BATCH(2, 0)) {
+      FLUSH_BATCH(NULL);
+
+      /* Make sure state is re-emitted after a flush:
+       */
+      i915_update_derived(i915);
+      i915_emit_hardware_state(i915);
+      i915->vbo_flushed = 1;
+
+      if (!BEGIN_BATCH(2, 0)) {
+         assert(0);
+         goto out;
+      }
+   }
+
+   OUT_BATCH(_3DPRIMITIVE |
+             PRIM_INDIRECT |
+             PRIM_INDIRECT_SEQUENTIAL |
+             i915_render->hwprim |
+             nr);
+   OUT_BATCH(start); /* Beginning vertex index */
+
+out:
+   return;
 }
 
 /**
@@ -504,11 +622,12 @@ i915_vbuf_render_create(struct i915_context *i915)
 {
    struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render);
    struct intel_winsys *iws = i915->iws;
+   int i;
 
    i915_render->i915 = i915;
-   
-   i915_render->base.max_vertex_buffer_bytes = 128*1024;
-   
+
+   i915_render->base.max_vertex_buffer_bytes = 16*4096;
+
    /* NOTE: it must be such that state and vertices indices fit in a single 
     * batch buffer.
     */
@@ -524,14 +643,30 @@ i915_vbuf_render_create(struct i915_context *i915)
    i915_render->base.release_vertices = i915_vbuf_render_release_vertices;
    i915_render->base.destroy = i915_vbuf_render_destroy;
 
-   i915_render->vbo_alloc_size = 128 * 4096;
-   i915_render->vbo_size = i915_render->vbo_alloc_size;
+#ifndef VBUF_MAP_BUFFER
+   i915_render->map_size = 0;
+   i915_render->map_used_start = 0;
+   i915_render->map_used_end = 0;
+#endif
+
+   i915_render->vbo = NULL;
+   i915_render->vbo_ptr = NULL;
+   i915_render->vbo_size = 0;
    i915_render->vbo_offset = 0;
-   i915_render->vbo = iws->buffer_create(iws, i915_render->vbo_size, 64,
-                                         INTEL_NEW_VERTEX);
-   /* TODO JB: is this realy needed? */
-   i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
-   iws->buffer_unmap(iws, i915_render->vbo);
+   i915_render->vbo_alloc_size = i915_render->base.max_vertex_buffer_bytes * 4;
+
+#ifdef VBUF_USE_FIFO /* same switch that guards the pool_* fields and their users */
+   i915_render->pool_used = 0; /* unsigned counter, not a flag */
+   i915_render->pool_buffer_size = i915_render->vbo_alloc_size;
+   i915_render->pool_fifo = u_fifo_create(6);
+   for (i = 0; i < 6; i++)
+      u_fifo_add(i915_render->pool_fifo,
+                 iws->buffer_create(iws, i915_render->pool_buffer_size, 64,
+                                    INTEL_NEW_VERTEX));
+#else
+   (void)i;
+   (void)iws;
+#endif
 
    return &i915_render->base;
 }
diff --git a/src/gallium/drivers/i915simple/i915_reg.h b/src/gallium/drivers/i915/i915_reg.h
index 04620fec681..04620fec681 100644
--- a/src/gallium/drivers/i915simple/i915_reg.h
+++ b/src/gallium/drivers/i915/i915_reg.h
diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 9f017a14cca..d4ee8f5339b 100644
--- a/src/gallium/drivers/i915simple/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -46,7 +46,7 @@
 static const char *
 i915_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 static const char *
@@ -101,8 +101,6 @@ i915_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 0;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
@@ -273,6 +271,7 @@ i915_create_screen(struct intel_winsys *iws, uint pci_id)
    default:
       debug_printf("%s: unknown pci id 0x%x, cannot create screen\n", 
                    __FUNCTION__, pci_id);
+      FREE(is);
       return NULL;
    }
 
diff --git a/src/gallium/drivers/i915simple/i915_screen.h b/src/gallium/drivers/i915/i915_screen.h
index 5126485caa7..5126485caa7 100644
--- a/src/gallium/drivers/i915simple/i915_screen.h
+++ b/src/gallium/drivers/i915/i915_screen.h
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 0087dfa410f..71f00bc346a 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -58,8 +58,10 @@ translate_wrap_mode(unsigned wrap)
       return TEXCOORDMODE_CLAMP_EDGE;
    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
       return TEXCOORDMODE_CLAMP_BORDER;
-//   case PIPE_TEX_WRAP_MIRRORED_REPEAT:
-//      return TEXCOORDMODE_MIRROR;
+/*
+   case PIPE_TEX_WRAP_MIRRORED_REPEAT:
+      return TEXCOORDMODE_MIRROR;
+*/
    default:
       return TEXCOORDMODE_WRAP;
    }
@@ -588,9 +590,17 @@ static void i915_set_framebuffer_state(struct pipe_context *pipe,
 				       const struct pipe_framebuffer_state *fb)
 {
    struct i915_context *i915 = i915_context(pipe);
+   int i;
+
    draw_flush(i915->draw);
 
-   i915->framebuffer = *fb; /* struct copy */
+   i915->framebuffer.width = fb->width;
+   i915->framebuffer.height = fb->height;
+   i915->framebuffer.nr_cbufs = fb->nr_cbufs;
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], fb->cbufs[i]);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, fb->zsbuf);
 
    i915->dirty |= I915_NEW_FRAMEBUFFER;
 }
diff --git a/src/gallium/drivers/i915simple/i915_state.h b/src/gallium/drivers/i915/i915_state.h
index 86c6b0027d5..86c6b0027d5 100644
--- a/src/gallium/drivers/i915simple/i915_state.h
+++ b/src/gallium/drivers/i915/i915_state.h
diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
index 178d4e8781d..178d4e8781d 100644
--- a/src/gallium/drivers/i915simple/i915_state_derived.c
+++ b/src/gallium/drivers/i915/i915_state_derived.c
diff --git a/src/gallium/drivers/i915simple/i915_state_dynamic.c b/src/gallium/drivers/i915/i915_state_dynamic.c
index 86126a5a152..86126a5a152 100644
--- a/src/gallium/drivers/i915simple/i915_state_dynamic.c
+++ b/src/gallium/drivers/i915/i915_state_dynamic.c
diff --git a/src/gallium/drivers/i915simple/i915_state_emit.c b/src/gallium/drivers/i915/i915_state_emit.c
index a3d4e3b04e5..a3d4e3b04e5 100644
--- a/src/gallium/drivers/i915simple/i915_state_emit.c
+++ b/src/gallium/drivers/i915/i915_state_emit.c
diff --git a/src/gallium/drivers/i915simple/i915_state_immediate.c b/src/gallium/drivers/i915/i915_state_immediate.c
index 8c16bb4e271..8c16bb4e271 100644
--- a/src/gallium/drivers/i915simple/i915_state_immediate.c
+++ b/src/gallium/drivers/i915/i915_state_immediate.c
diff --git a/src/gallium/drivers/i915simple/i915_state_inlines.h b/src/gallium/drivers/i915/i915_state_inlines.h
index 378de8f9c48..378de8f9c48 100644
--- a/src/gallium/drivers/i915simple/i915_state_inlines.h
+++ b/src/gallium/drivers/i915/i915_state_inlines.h
diff --git a/src/gallium/drivers/i915simple/i915_state_sampler.c b/src/gallium/drivers/i915/i915_state_sampler.c
index c5e9084d12e..c5e9084d12e 100644
--- a/src/gallium/drivers/i915simple/i915_state_sampler.c
+++ b/src/gallium/drivers/i915/i915_state_sampler.c
diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915/i915_surface.c
index ab8331f3e64..ab8331f3e64 100644
--- a/src/gallium/drivers/i915simple/i915_surface.c
+++ b/src/gallium/drivers/i915/i915_surface.c
diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915/i915_texture.c
index 6a6c6542717..286c9ace8e5 100644
--- a/src/gallium/drivers/i915simple/i915_texture.c
+++ b/src/gallium/drivers/i915/i915_texture.c
@@ -165,7 +165,7 @@ i915_scanout_layout(struct i915_texture *tex)
    struct pipe_texture *pt = &tex->base;
 
    if (pt->last_level > 0 || pt->block.size != 4)
-      return 0;
+      return FALSE;
 
    i915_miptree_set_level_info(tex, 0, 1,
                                tex->base.width[0],
@@ -191,6 +191,38 @@ i915_scanout_layout(struct i915_texture *tex)
    return TRUE;
 }
 
+/**
+ * Special case to deal with shared textures.
+ */
+static boolean
+i915_display_target_layout(struct i915_texture *tex)
+{
+   struct pipe_texture *pt = &tex->base;
+
+   if (pt->last_level > 0 || pt->block.size != 4)
+      return FALSE;
+
+   /* fallback to normal textures for small textures */
+   if (tex->base.width[0] < 240)
+      return FALSE;
+
+   i915_miptree_set_level_info(tex, 0, 1,
+                               tex->base.width[0],
+                               tex->base.height[0],
+                               1);
+   i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
+
+   tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size);
+   tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8);
+   tex->hw_tiled = INTEL_TILE_X;
+
+   debug_printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
+      tex->base.width[0], tex->base.height[0], pt->block.size,
+      tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy);
+
+   return TRUE;
+}
+
 static void
 i915_miptree_layout_2d(struct i915_texture *tex)
 {
@@ -201,6 +233,16 @@ i915_miptree_layout_2d(struct i915_texture *tex)
    unsigned nblocksx = pt->nblocksx[0];
    unsigned nblocksy = pt->nblocksy[0];
 
+   /* used for scanouts that need special layouts */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_PRIMARY)
+      if (i915_scanout_layout(tex))
+         return;
+
+   /* for shared buffers we use something very like scanout */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
+      if (i915_display_target_layout(tex))
+         return;
+
    tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
    tex->total_nblocksy = 0;
 
@@ -351,6 +393,11 @@ i945_miptree_layout_2d(struct i915_texture *tex)
       if (i915_scanout_layout(tex))
          return;
 
+   /* for shared buffers we use something very like scanout */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
+      if (i915_display_target_layout(tex))
+         return;
+
    tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
 
    /* May need to adjust pitch to accomodate the placement of
@@ -812,7 +859,7 @@ i915_transfer_map(struct pipe_screen *screen,
    char *map;
    boolean write = FALSE;
 
-   if (transfer->usage != PIPE_TRANSFER_READ)
+   if (transfer->usage & PIPE_TRANSFER_WRITE)
       write = TRUE;
 
    map = iws->buffer_map(iws, tex->buffer, write);
diff --git a/src/gallium/drivers/i915simple/i915_texture.h b/src/gallium/drivers/i915/i915_texture.h
index 51a1dd984c8..51a1dd984c8 100644
--- a/src/gallium/drivers/i915simple/i915_texture.h
+++ b/src/gallium/drivers/i915/i915_texture.h
diff --git a/src/gallium/drivers/i915simple/intel_batchbuffer.h b/src/gallium/drivers/i915/intel_batchbuffer.h
index db12dfd2ac2..db12dfd2ac2 100644
--- a/src/gallium/drivers/i915simple/intel_batchbuffer.h
+++ b/src/gallium/drivers/i915/intel_batchbuffer.h
diff --git a/src/gallium/drivers/i915simple/intel_winsys.h b/src/gallium/drivers/i915/intel_winsys.h
index f949f52a9ce..c6bf6e6f7f1 100644
--- a/src/gallium/drivers/i915simple/intel_winsys.h
+++ b/src/gallium/drivers/i915/intel_winsys.h
@@ -42,21 +42,21 @@ enum intel_buffer_usage
    INTEL_USAGE_2D_TARGET = 0x04,
    INTEL_USAGE_2D_SOURCE = 0x08,
    /* use on vertex */
-   INTEL_USAGE_VERTEX    = 0x10,
+   INTEL_USAGE_VERTEX    = 0x10
 };
 
 enum intel_buffer_type
 {
    INTEL_NEW_TEXTURE,
    INTEL_NEW_SCANOUT, /**< a texture used for scanning out from */
-   INTEL_NEW_VERTEX,
+   INTEL_NEW_VERTEX
 };
 
 enum intel_buffer_tile
 {
    INTEL_TILE_NONE,
    INTEL_TILE_X,
-   INTEL_TILE_Y,
+   INTEL_TILE_Y
 };
 
 struct intel_batchbuffer {
@@ -150,6 +150,17 @@ struct intel_winsys {
    void (*buffer_unmap)(struct intel_winsys *iws,
                         struct intel_buffer *buffer);
 
+   /**
+    * Write to a buffer.
+    *
+    * Arguments follow pipe_buffer_write.
+    */
+   int (*buffer_write)(struct intel_winsys *iws,
+                       struct intel_buffer *dst,
+                       size_t offset,
+                       size_t size,
+                       const void *data);
+
    void (*buffer_destroy)(struct intel_winsys *iws,
                           struct intel_buffer *buffer);
    /*@}*/
diff --git a/src/gallium/drivers/i965simple/Makefile b/src/gallium/drivers/i965simple/Makefile
deleted file mode 100644
index 19182afa75d..00000000000
--- a/src/gallium/drivers/i965simple/Makefile
+++ /dev/null
@@ -1,52 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = i965simple
-
-C_SOURCES = \
-	brw_blit.c \
-	brw_flush.c \
-	brw_screen.c \
-	brw_surface.c \
-	brw_cc.c \
-	brw_clip.c \
-	brw_clip_line.c \
-	brw_clip_point.c \
-	brw_clip_state.c \
-	brw_clip_tri.c \
-	brw_clip_util.c \
-	brw_context.c \
-	brw_curbe.c \
-	brw_draw.c \
-	brw_draw_upload.c \
-	brw_eu.c \
-	brw_eu_debug.c \
-	brw_eu_emit.c \
-	brw_eu_util.c \
-	brw_gs.c \
-	brw_gs_emit.c \
-	brw_gs_state.c \
-	brw_misc_state.c \
-	brw_sf.c \
-	brw_sf_emit.c \
-	brw_sf_state.c \
-	brw_state.c \
-	brw_state_batch.c \
-	brw_state_cache.c \
-	brw_state_pool.c \
-	brw_state_upload.c \
-	brw_tex_layout.c \
-	brw_urb.c \
-	brw_util.c \
-	brw_vs.c \
-	brw_vs_emit.c \
-	brw_vs_state.c \
-	brw_wm.c \
-	brw_wm_iz.c \
-	brw_wm_decl.c \
-	brw_wm_glsl.c \
-	brw_wm_sampler_state.c \
-	brw_wm_state.c \
-	brw_wm_surface_state.c
-
-include ../../Makefile.template
diff --git a/src/gallium/drivers/i965simple/SConscript b/src/gallium/drivers/i965simple/SConscript
deleted file mode 100644
index 43fc2a40052..00000000000
--- a/src/gallium/drivers/i965simple/SConscript
+++ /dev/null
@@ -1,54 +0,0 @@
-Import('*')
-
-env = env.Clone()
-
-i965simple = env.ConvenienceLibrary(
-	target = 'i965simple',
-	source = [
-		'brw_blit.c',
-		'brw_cc.c',
-		'brw_clip.c',
-		'brw_clip_line.c',
-		'brw_clip_point.c',
-		'brw_clip_state.c',
-		'brw_clip_tri.c',
-		'brw_clip_util.c',
-		'brw_context.c',
-		'brw_curbe.c',
-		'brw_draw.c',
-		'brw_draw_upload.c',
-		'brw_eu.c',
-		'brw_eu_debug.c',
-		'brw_eu_emit.c',
-		'brw_eu_util.c',
-		'brw_flush.c',
-		'brw_gs.c',
-		'brw_gs_emit.c',
-		'brw_gs_state.c',
-		'brw_misc_state.c',
-		'brw_screen.c',
-		'brw_sf.c',
-		'brw_sf_emit.c',
-		'brw_sf_state.c',
-		'brw_state.c',
-		'brw_state_batch.c',
-		'brw_state_cache.c',
-		'brw_state_pool.c',
-		'brw_state_upload.c',
-		'brw_surface.c',
-		'brw_tex_layout.c',
-		'brw_urb.c',
-		'brw_util.c',
-		'brw_vs.c',
-		'brw_vs_emit.c',
-		'brw_vs_state.c',
-		'brw_wm.c',
-		'brw_wm_decl.c',
-		'brw_wm_glsl.c',
-		'brw_wm_iz.c',
-		'brw_wm_sampler_state.c',
-		'brw_wm_state.c',
-		'brw_wm_surface_state.c',
-	])
-
-Export('i965simple')
diff --git a/src/gallium/drivers/i965simple/brw_batch.h b/src/gallium/drivers/i965simple/brw_batch.h
deleted file mode 100644
index 5f5932a4883..00000000000
--- a/src/gallium/drivers/i965simple/brw_batch.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef BRW_BATCH_H
-#define BRW_BATCH_H
-
-#include "brw_winsys.h"
-
-#define BATCH_LOCALS
-
-#define INTEL_BATCH_NO_CLIPRECTS 0x1
-#define INTEL_BATCH_CLIPRECTS    0x2
-
-#define BEGIN_BATCH( dwords, relocs ) \
-   brw->winsys->batch_start(brw->winsys, dwords, relocs)
-
-#define OUT_BATCH( dword ) \
-   brw->winsys->batch_dword(brw->winsys, dword)
-
-#define OUT_RELOC( buf, flags, delta ) \
-   brw->winsys->batch_reloc(brw->winsys, buf, flags, delta)
-
-#define ADVANCE_BATCH() \
-   brw->winsys->batch_end( brw->winsys )
-
-/* XXX: this is bogus - need proper handling for out-of-memory in batchbuffer.
- */
-#define FLUSH_BATCH(fence) do {				\
-   brw->winsys->batch_flush(brw->winsys, fence);	\
-   brw->hardware_dirty = ~0;				\
-} while (0)
-
-#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->winsys, (s), sizeof(*(s)))
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_blit.c b/src/gallium/drivers/i965simple/brw_blit.c
deleted file mode 100644
index 4d11f8d2ab8..00000000000
--- a/src/gallium/drivers/i965simple/brw_blit.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include <stdio.h>
-#include <errno.h>
-
-#include "brw_batch.h"
-#include "brw_blit.h"
-#include "brw_context.h"
-#include "brw_reg.h"
-
-#include "pipe/p_context.h"
-#include "pipe/internal/p_winsys_screen.h"
-
-#define FILE_DEBUG_FLAG DEBUG_BLIT
-
-void brw_fill_blit(struct brw_context *brw,
-                   unsigned cpp,
-                   short dst_pitch,
-                   struct pipe_buffer *dst_buffer,
-                   unsigned dst_offset,
-                   boolean dst_tiled,
-                   short x, short y,
-                   short w, short h,
-                   unsigned color)
-{
-   unsigned BR13, CMD;
-   BATCH_LOCALS;
-
-   dst_pitch *= cpp;
-
-   switch(cpp) {
-   case 1:
-   case 2:
-   case 3:
-      BR13 = (0xF0 << 16) | (1<<24);
-      CMD = XY_COLOR_BLT_CMD;
-      break;
-   case 4:
-      BR13 = (0xF0 << 16) | (1<<24) | (1<<25);
-      CMD = XY_COLOR_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-      break;
-   default:
-      return;
-   }
-
-   if (dst_tiled) {
-      CMD |= XY_DST_TILED;
-      dst_pitch /= 4;
-   }
-
-   BEGIN_BATCH(6, INTEL_BATCH_NO_CLIPRECTS);
-   OUT_BATCH( CMD );
-   OUT_BATCH( dst_pitch | BR13 );
-   OUT_BATCH( (y << 16) | x );
-   OUT_BATCH( ((y+h) << 16) | (x+w) );
-   OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE, dst_offset );
-   OUT_BATCH( color );
-   ADVANCE_BATCH();
-}
-
-static unsigned translate_raster_op(unsigned logicop)
-{
-   switch(logicop) {
-   case PIPE_LOGICOP_CLEAR: return 0x00;
-   case PIPE_LOGICOP_AND: return 0x88;
-   case PIPE_LOGICOP_AND_REVERSE: return 0x44;
-   case PIPE_LOGICOP_COPY: return 0xCC;
-   case PIPE_LOGICOP_AND_INVERTED: return 0x22;
-   case PIPE_LOGICOP_NOOP: return 0xAA;
-   case PIPE_LOGICOP_XOR: return 0x66;
-   case PIPE_LOGICOP_OR: return 0xEE;
-   case PIPE_LOGICOP_NOR: return 0x11;
-   case PIPE_LOGICOP_EQUIV: return 0x99;
-   case PIPE_LOGICOP_INVERT: return 0x55;
-   case PIPE_LOGICOP_OR_REVERSE: return 0xDD;
-   case PIPE_LOGICOP_COPY_INVERTED: return 0x33;
-   case PIPE_LOGICOP_OR_INVERTED: return 0xBB;
-   case PIPE_LOGICOP_NAND: return 0x77;
-   case PIPE_LOGICOP_SET: return 0xFF;
-   default: return 0;
-   }
-}
-
-
-/* Copy BitBlt
- */
-void brw_copy_blit(struct brw_context *brw,
-                   unsigned do_flip,
-                   unsigned cpp,
-                   short src_pitch,
-                   struct pipe_buffer *src_buffer,
-                   unsigned  src_offset,
-                   boolean src_tiled,
-                   short dst_pitch,
-                   struct pipe_buffer *dst_buffer,
-                   unsigned  dst_offset,
-                   boolean dst_tiled,
-                   short src_x, short src_y,
-                   short dst_x, short dst_y,
-                   short w, short h,
-                   unsigned logic_op)
-{
-   unsigned CMD, BR13;
-   int dst_y2 = dst_y + h;
-   int dst_x2 = dst_x + w;
-   BATCH_LOCALS;
-
-
-   DBG("%s src:buf(%d)/%d %d,%d dst:buf(%d)/%d %d,%d sz:%dx%d op:%d\n",
-       __FUNCTION__,
-       src_buffer, src_pitch, src_x, src_y,
-       dst_buffer, dst_pitch, dst_x, dst_y,
-       w,h,logic_op);
-
-   assert( logic_op - PIPE_LOGICOP_CLEAR >= 0 );
-   assert( logic_op - PIPE_LOGICOP_CLEAR < 0x10 );
-
-   src_pitch *= cpp;
-   dst_pitch *= cpp;
-
-   switch(cpp) {
-   case 1:
-   case 2:
-   case 3:
-      BR13 = (translate_raster_op(logic_op) << 16) | (1<<24);
-      CMD = XY_SRC_COPY_BLT_CMD;
-      break;
-   case 4:
-      BR13 = (translate_raster_op(logic_op) << 16) | (1<<24) |
-	  (1<<25);
-      CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-      break;
-   default:
-      return;
-   }
-
-   if (src_tiled) {
-      CMD |= XY_SRC_TILED;
-      src_pitch /= 4;
-   }
-
-   if (dst_tiled) {
-      CMD |= XY_DST_TILED;
-      dst_pitch /= 4;
-   }
-
-   if (dst_y2 < dst_y ||
-       dst_x2 < dst_x) {
-      return;
-   }
-
-   dst_pitch &= 0xffff;
-   src_pitch &= 0xffff;
-
-   /* Initial y values don't seem to work with negative pitches.  If
-    * we adjust the offsets manually (below), it seems to work fine.
-    *
-    * On the other hand, if we always adjust, the hardware doesn't
-    * know which blit directions to use, so overlapping copypixels get
-    * the wrong result.
-    */
-   if (dst_pitch > 0 && src_pitch > 0) {
-      BEGIN_BATCH(8, INTEL_BATCH_NO_CLIPRECTS);
-      OUT_BATCH( CMD );
-      OUT_BATCH( dst_pitch | BR13 );
-      OUT_BATCH( (dst_y << 16) | dst_x );
-      OUT_BATCH( (dst_y2 << 16) | dst_x2 );
-      OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE,
-		 dst_offset );
-      OUT_BATCH( (src_y << 16) | src_x );
-      OUT_BATCH( src_pitch );
-      OUT_RELOC( src_buffer, BRW_BUFFER_ACCESS_READ,
-		 src_offset );
-      ADVANCE_BATCH();
-   }
-   else {
-      BEGIN_BATCH(8, INTEL_BATCH_NO_CLIPRECTS);
-      OUT_BATCH( CMD );
-      OUT_BATCH( (dst_pitch & 0xffff) | BR13 );
-      OUT_BATCH( (0 << 16) | dst_x );
-      OUT_BATCH( (h << 16) | dst_x2 );
-      OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE,
-		 dst_offset + dst_y * dst_pitch );
-      OUT_BATCH( (src_pitch & 0xffff) );
-      OUT_RELOC( src_buffer, BRW_BUFFER_ACCESS_READ,
-		 src_offset + src_y * src_pitch );
-      ADVANCE_BATCH();
-   }
-}
-
-
-
diff --git a/src/gallium/drivers/i965simple/brw_blit.h b/src/gallium/drivers/i965simple/brw_blit.h
deleted file mode 100644
index 111c5d91d39..00000000000
--- a/src/gallium/drivers/i965simple/brw_blit.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef BRW_BLIT_H
-#define BRW_BLIT_H
-
-#include "pipe/p_compiler.h"
-
-struct pipe_buffer;
-struct brw_context;
-
-void brw_fill_blit(struct brw_context *intel,
-                   unsigned cpp,
-                   short dst_pitch,
-                   struct pipe_buffer *dst_buffer,
-                   unsigned dst_offset,
-                   boolean dst_tiled,
-                   short x, short y,
-                   short w, short h,
-                   unsigned color);
-void brw_copy_blit(struct brw_context *intel,
-                   unsigned do_flip,
-                   unsigned cpp,
-                   short src_pitch,
-                   struct pipe_buffer *src_buffer,
-                   unsigned  src_offset,
-                   boolean src_tiled,
-                   short dst_pitch,
-                   struct pipe_buffer *dst_buffer,
-                   unsigned  dst_offset,
-                   boolean dst_tiled,
-                   short src_x, short src_y,
-                   short dst_x, short dst_y,
-                   short w, short h,
-                   unsigned logic_op);
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_cc.c b/src/gallium/drivers/i965simple/brw_cc.c
deleted file mode 100644
index 3668123e2e1..00000000000
--- a/src/gallium/drivers/i965simple/brw_cc.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_util.h"
-
-
-static int brw_translate_compare_func(int func)
-{
-   switch(func) {
-   case PIPE_FUNC_NEVER:
-      return BRW_COMPAREFUNCTION_NEVER;
-   case PIPE_FUNC_LESS:
-      return BRW_COMPAREFUNCTION_LESS;
-   case PIPE_FUNC_LEQUAL:
-      return BRW_COMPAREFUNCTION_LEQUAL;
-   case PIPE_FUNC_GREATER:
-      return BRW_COMPAREFUNCTION_GREATER;
-   case PIPE_FUNC_GEQUAL:
-      return BRW_COMPAREFUNCTION_GEQUAL;
-   case PIPE_FUNC_NOTEQUAL:
-      return BRW_COMPAREFUNCTION_NOTEQUAL;
-   case PIPE_FUNC_EQUAL:
-      return BRW_COMPAREFUNCTION_EQUAL;
-   case PIPE_FUNC_ALWAYS:
-      return BRW_COMPAREFUNCTION_ALWAYS;
-   }
-
-   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
-   return BRW_COMPAREFUNCTION_ALWAYS;
-}
-
-static int brw_translate_stencil_op(int op)
-{
-   switch(op) {
-   case PIPE_STENCIL_OP_KEEP:
-      return BRW_STENCILOP_KEEP;
-   case PIPE_STENCIL_OP_ZERO:
-      return BRW_STENCILOP_ZERO;
-   case PIPE_STENCIL_OP_REPLACE:
-      return BRW_STENCILOP_REPLACE;
-   case PIPE_STENCIL_OP_INCR:
-      return BRW_STENCILOP_INCRSAT;
-   case PIPE_STENCIL_OP_DECR:
-      return BRW_STENCILOP_DECRSAT;
-   case PIPE_STENCIL_OP_INCR_WRAP:
-      return BRW_STENCILOP_INCR;
-   case PIPE_STENCIL_OP_DECR_WRAP:
-      return BRW_STENCILOP_DECR;
-   case PIPE_STENCIL_OP_INVERT:
-      return BRW_STENCILOP_INVERT;
-   default:
-      return BRW_STENCILOP_ZERO;
-   }
-}
-
-
-static int brw_translate_logic_op(int opcode)
-{
-   switch(opcode) {
-   case PIPE_LOGICOP_CLEAR:
-      return BRW_LOGICOPFUNCTION_CLEAR;
-   case PIPE_LOGICOP_AND:
-      return BRW_LOGICOPFUNCTION_AND;
-   case PIPE_LOGICOP_AND_REVERSE:
-      return BRW_LOGICOPFUNCTION_AND_REVERSE;
-   case PIPE_LOGICOP_COPY:
-      return BRW_LOGICOPFUNCTION_COPY;
-   case PIPE_LOGICOP_COPY_INVERTED:
-      return BRW_LOGICOPFUNCTION_COPY_INVERTED;
-   case PIPE_LOGICOP_AND_INVERTED:
-      return BRW_LOGICOPFUNCTION_AND_INVERTED;
-   case PIPE_LOGICOP_NOOP:
-      return BRW_LOGICOPFUNCTION_NOOP;
-   case PIPE_LOGICOP_XOR:
-      return BRW_LOGICOPFUNCTION_XOR;
-   case PIPE_LOGICOP_OR:
-      return BRW_LOGICOPFUNCTION_OR;
-   case PIPE_LOGICOP_OR_INVERTED:
-      return BRW_LOGICOPFUNCTION_OR_INVERTED;
-   case PIPE_LOGICOP_NOR:
-      return BRW_LOGICOPFUNCTION_NOR;
-   case PIPE_LOGICOP_EQUIV:
-      return BRW_LOGICOPFUNCTION_EQUIV;
-   case PIPE_LOGICOP_INVERT:
-      return BRW_LOGICOPFUNCTION_INVERT;
-   case PIPE_LOGICOP_OR_REVERSE:
-      return BRW_LOGICOPFUNCTION_OR_REVERSE;
-   case PIPE_LOGICOP_NAND:
-      return BRW_LOGICOPFUNCTION_NAND;
-   case PIPE_LOGICOP_SET:
-      return BRW_LOGICOPFUNCTION_SET;
-   default:
-      return BRW_LOGICOPFUNCTION_SET;
-   }
-}
-
-
-static void upload_cc_vp( struct brw_context *brw )
-{
-   struct brw_cc_viewport ccv;
-
-   memset(&ccv, 0, sizeof(ccv));
-
-   ccv.min_depth = 0.0;
-   ccv.max_depth = 1.0;
-
-   brw->cc.vp_gs_offset = brw_cache_data( &brw->cache[BRW_CC_VP], &ccv );
-}
-
-const struct brw_tracked_state brw_cc_vp = {
-   .dirty = {
-      .brw = BRW_NEW_SCENE,
-      .cache = 0
-   },
-   .update = upload_cc_vp
-};
-
-
-static void upload_cc_unit( struct brw_context *brw )
-{
-   struct brw_cc_unit_state cc;
-
-   memset(&cc, 0, sizeof(cc));
-
-   /* BRW_NEW_DEPTH_STENCIL */
-   if (brw->attribs.DepthStencil->stencil[0].enabled) {
-      cc.cc0.stencil_enable = brw->attribs.DepthStencil->stencil[0].enabled;
-      cc.cc0.stencil_func = brw_translate_compare_func(brw->attribs.DepthStencil->stencil[0].func);
-      cc.cc0.stencil_fail_op = brw_translate_stencil_op(brw->attribs.DepthStencil->stencil[0].fail_op);
-      cc.cc0.stencil_pass_depth_fail_op = brw_translate_stencil_op(
-         brw->attribs.DepthStencil->stencil[0].zfail_op);
-      cc.cc0.stencil_pass_depth_pass_op = brw_translate_stencil_op(
-         brw->attribs.DepthStencil->stencil[0].zpass_op);
-      cc.cc1.stencil_ref = brw->attribs.DepthStencil->stencil[0].ref_value;
-      cc.cc1.stencil_write_mask = brw->attribs.DepthStencil->stencil[0].writemask;
-      cc.cc1.stencil_test_mask = brw->attribs.DepthStencil->stencil[0].valuemask;
-
-      if (brw->attribs.DepthStencil->stencil[1].enabled) {
-	 cc.cc0.bf_stencil_enable = brw->attribs.DepthStencil->stencil[1].enabled;
-	 cc.cc0.bf_stencil_func = brw_translate_compare_func(
-            brw->attribs.DepthStencil->stencil[1].func);
-	 cc.cc0.bf_stencil_fail_op = brw_translate_stencil_op(
-            brw->attribs.DepthStencil->stencil[1].fail_op);
-	 cc.cc0.bf_stencil_pass_depth_fail_op = brw_translate_stencil_op(
-            brw->attribs.DepthStencil->stencil[1].zfail_op);
-	 cc.cc0.bf_stencil_pass_depth_pass_op = brw_translate_stencil_op(
-            brw->attribs.DepthStencil->stencil[1].zpass_op);
-	 cc.cc1.bf_stencil_ref = brw->attribs.DepthStencil->stencil[1].ref_value;
-	 cc.cc2.bf_stencil_write_mask = brw->attribs.DepthStencil->stencil[1].writemask;
-	 cc.cc2.bf_stencil_test_mask = brw->attribs.DepthStencil->stencil[1].valuemask;
-      }
-
-      /* Not really sure about this:
-       */
-      if (brw->attribs.DepthStencil->stencil[0].writemask ||
-	  brw->attribs.DepthStencil->stencil[1].writemask)
-	 cc.cc0.stencil_write_enable = 1;
-   }
-
-   /* BRW_NEW_BLEND */
-   if (brw->attribs.Blend->logicop_enable) {
-      cc.cc2.logicop_enable = 1;
-      cc.cc5.logicop_func = brw_translate_logic_op( brw->attribs.Blend->logicop_func );
-   }
-   else if (brw->attribs.Blend->blend_enable) {
-      int eqRGB = brw->attribs.Blend->rgb_func;
-      int eqA = brw->attribs.Blend->alpha_func;
-      int srcRGB = brw->attribs.Blend->rgb_src_factor;
-      int dstRGB = brw->attribs.Blend->rgb_dst_factor;
-      int srcA = brw->attribs.Blend->alpha_src_factor;
-      int dstA = brw->attribs.Blend->alpha_dst_factor;
-
-      if (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX) {
-	 srcRGB = dstRGB = PIPE_BLENDFACTOR_ONE;
-      }
-
-      if (eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX) {
-	 srcA = dstA = PIPE_BLENDFACTOR_ONE;
-      }
-
-      cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB);
-      cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB);
-      cc.cc6.blend_function = brw_translate_blend_equation( eqRGB );
-
-      cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
-      cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA);
-      cc.cc5.ia_blend_function = brw_translate_blend_equation( eqA );
-
-      cc.cc3.blend_enable = 1;
-      cc.cc3.ia_blend_enable = (srcA != srcRGB ||
-				dstA != dstRGB ||
-				eqA != eqRGB);
-   }
-   
-   /* BRW_NEW_ALPHATEST
-    */
-   if (brw->attribs.DepthStencil->alpha.enabled) {
-      cc.cc3.alpha_test = 1;
-      cc.cc3.alpha_test_func = 
-	 brw_translate_compare_func(brw->attribs.DepthStencil->alpha.func);
-
-      cc.cc7.alpha_ref.ub[0] = float_to_ubyte(brw->attribs.DepthStencil->alpha.ref_value);
-
-      cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
-   }
-
-   if (brw->attribs.Blend->dither) {
-      cc.cc5.dither_enable = 1;
-      cc.cc6.y_dither_offset = 0;
-      cc.cc6.x_dither_offset = 0;
-   }
-
-   if (brw->attribs.DepthStencil->depth.enabled) {
-      cc.cc2.depth_test = brw->attribs.DepthStencil->depth.enabled;
-      cc.cc2.depth_test_function = brw_translate_compare_func(brw->attribs.DepthStencil->depth.func);
-      cc.cc2.depth_write_enable = brw->attribs.DepthStencil->depth.writemask;
-   }
-
-   /* CACHE_NEW_CC_VP */
-   cc.cc4.cc_viewport_state_offset =  brw->cc.vp_gs_offset >> 5;
-
-   if (BRW_DEBUG & DEBUG_STATS)
-      cc.cc5.statistics_enable = 1;
-
-   brw->cc.state_gs_offset = brw_cache_data( &brw->cache[BRW_CC_UNIT], &cc );
-}
-
-const struct brw_tracked_state brw_cc_unit = {
-   .dirty = {
-      .brw = BRW_NEW_DEPTH_STENCIL | BRW_NEW_BLEND | BRW_NEW_ALPHA_TEST,
-      .cache = CACHE_NEW_CC_VP
-   },
-   .update = upload_cc_unit
-};
-
diff --git a/src/gallium/drivers/i965simple/brw_clip.c b/src/gallium/drivers/i965simple/brw_clip.c
deleted file mode 100644
index 268124cc53f..00000000000
--- a/src/gallium/drivers/i965simple/brw_clip.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_state.h"
-#include "brw_clip.h"
-
-#define FRONT_UNFILLED_BIT  0x1
-#define BACK_UNFILLED_BIT   0x2
-
-
-static void compile_clip_prog( struct brw_context *brw,
-			     struct brw_clip_prog_key *key )
-{
-   struct brw_clip_compile c;
-   const unsigned *program;
-   unsigned program_size;
-   unsigned delta;
-   unsigned i;
-
-   memset(&c, 0, sizeof(c));
-
-   /* Begin the compilation:
-    */
-   brw_init_compile(&c.func);
-
-   c.func.single_program_flow = 1;
-
-   c.key = *key;
-
-
-   /* Need to locate the two positions present in vertex + header.
-    * These are currently hardcoded:
-    */
-   c.header_position_offset = ATTR_SIZE;
-
-   for (i = 0, delta = REG_SIZE; i < PIPE_MAX_SHADER_OUTPUTS; i++)
-      if (c.key.attrs & (1<<i)) {
-	 c.offset[i] = delta;
-	 delta += ATTR_SIZE;
-      }
-
-   c.nr_attrs = brw_count_bits(c.key.attrs);
-   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
-   c.nr_bytes = c.nr_regs * REG_SIZE;
-
-   c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
-
-   /* For some reason the thread is spawned with only 4 channels
-    * unmasked.
-    */
-   brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
-
-
-   /* Would ideally have the option of producing a program which could
-    * do all three:
-    */
-   switch (key->primitive) {
-   case PIPE_PRIM_TRIANGLES:
-#if 0
-      if (key->do_unfilled)
-	 brw_emit_unfilled_clip( &c );
-      else
-#endif
-	 brw_emit_tri_clip( &c );
-      break;
-   case PIPE_PRIM_LINES:
-      brw_emit_line_clip( &c );
-      break;
-   case PIPE_PRIM_POINTS:
-      brw_emit_point_clip( &c );
-      break;
-   default:
-      assert(0);
-      return;
-   }
-
-
-
-   /* get the program
-    */
-   program = brw_get_program(&c.func, &program_size);
-
-   /* Upload
-    */
-   brw->clip.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_CLIP_PROG],
-						&c.key,
-						sizeof(c.key),
-						program,
-						program_size,
-						&c.prog_data,
-						&brw->clip.prog_data );
-}
-
-
-static boolean search_cache( struct brw_context *brw,
-			       struct brw_clip_prog_key *key )
-{
-   return brw_search_cache(&brw->cache[BRW_CLIP_PROG],
-			   key, sizeof(*key),
-			   &brw->clip.prog_data,
-			   &brw->clip.prog_gs_offset);
-}
-
-
-
-
-/* Calculate interpolants for triangle and line rasterization.
- */
-static void upload_clip_prog(struct brw_context *brw)
-{
-   struct brw_clip_prog_key key;
-
-   memset(&key, 0, sizeof(key));
-
-   /* Populate the key:
-    */
-   /* BRW_NEW_REDUCED_PRIMITIVE */
-   key.primitive = brw->reduced_primitive;
-   /* CACHE_NEW_VS_PROG */
-   key.attrs = brw->vs.prog_data->outputs_written;
-   /* BRW_NEW_RASTER */
-   key.do_flat_shading = (brw->attribs.Raster->flatshade);
-   /* BRW_NEW_CLIP */
-   key.nr_userclip = brw->attribs.Clip.nr; /* XXX */
-
-#if 0
-   key.clip_mode = BRW_CLIPMODE_NORMAL;
-
-   if (key.primitive == PIPE_PRIM_TRIANGLES) {
-      if (brw->attribs.Raster->cull_mode == PIPE_WINDING_BOTH)
-	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
-      else {
-         if (brw->attribs.Raster->fill_cw != PIPE_POLYGON_MODE_FILL ||
-             brw->attribs.Raster->fill_ccw != PIPE_POLYGON_MODE_FILL)
-            key.do_unfilled = 1;
-
-	 /* Most cases the fixed function units will handle.  Cases where
-	  * one or more polygon faces are unfilled will require help:
-	  */
-	 if (key.do_unfilled) {
-	    key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
-
-	    if (brw->attribs.Raster->offset_cw ||
-                brw->attribs.Raster->offset_ccw) {
-	       key.offset_units = brw->attribs.Raster->offset_units;
-	       key.offset_factor = brw->attribs.Raster->offset_scale;
-	    }
-            key.fill_ccw = brw->attribs.Raster->fill_ccw;
-            key.fill_cw = brw->attribs.Raster->fill_cw;
-            key.offset_ccw = brw->attribs.Raster->offset_ccw;
-            key.offset_cw = brw->attribs.Raster->offset_cw;
-            if (brw->attribs.Raster->light_twoside &&
-                key.fill_cw != CLIP_CULL)
-               key.copy_bfc_cw = 1;
-	 }
-      }
-   }
-#else
-   key.clip_mode = BRW_CLIPMODE_ACCEPT_ALL;
-#endif
-
-   if (!search_cache(brw, &key))
-      compile_clip_prog( brw, &key );
-}
-
-const struct brw_tracked_state brw_clip_prog = {
-   .dirty = {
-      .brw   = (BRW_NEW_RASTERIZER |
-		BRW_NEW_CLIP |
-		BRW_NEW_REDUCED_PRIMITIVE),
-      .cache = CACHE_NEW_VS_PROG
-   },
-   .update = upload_clip_prog
-};
diff --git a/src/gallium/drivers/i965simple/brw_clip.h b/src/gallium/drivers/i965simple/brw_clip.h
deleted file mode 100644
index d70fc094ff5..00000000000
--- a/src/gallium/drivers/i965simple/brw_clip.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#ifndef BRW_CLIP_H
-#define BRW_CLIP_H
-
-
-#include "brw_context.h"
-#include "brw_eu.h"
-
-#define MAX_VERTS (3+6+6)	
-
-/* Note that if unfilled primitives are being emitted, we have to fix
- * up polygon offset and flatshading at this point:
- */
-struct brw_clip_prog_key {
-   unsigned attrs:32;		
-   unsigned primitive:4;
-   unsigned nr_userclip:3;
-   unsigned do_flat_shading:1;
-   unsigned do_unfilled:1;
-   unsigned fill_cw:2;		/* includes cull information */
-   unsigned fill_ccw:2;		/* includes cull information */
-   unsigned offset_cw:1;
-   unsigned offset_ccw:1;
-   unsigned pad0:17;
-
-   unsigned copy_bfc_cw:1;
-   unsigned copy_bfc_ccw:1;
-   unsigned clip_mode:3;
-   unsigned pad1:27;
-   
-   float offset_factor;
-   float offset_units;
-};
-
-
-#define CLIP_LINE   0
-#define CLIP_POINT  1
-#define CLIP_FILL   2
-#define CLIP_CULL   3
-
-
-#define PRIM_MASK  (0x1f)
-
-struct brw_clip_compile {
-   struct brw_compile func;
-   struct brw_clip_prog_key key;
-   struct brw_clip_prog_data prog_data;
-   
-   struct {
-      struct brw_reg R0;
-      struct brw_reg vertex[MAX_VERTS];
-
-      struct brw_reg t;
-      struct brw_reg t0, t1;
-      struct brw_reg dp0, dp1;
-
-      struct brw_reg dpPrev;
-      struct brw_reg dp;
-      struct brw_reg loopcount;
-      struct brw_reg nr_verts;
-      struct brw_reg planemask;
-
-      struct brw_reg inlist;
-      struct brw_reg outlist;
-      struct brw_reg freelist;
-
-      struct brw_reg dir;
-      struct brw_reg tmp0, tmp1;
-      struct brw_reg offset;
-      
-      struct brw_reg fixed_planes;
-      struct brw_reg plane_equation;
-   } reg;
-
-   /* 3 different ways of expressing vertex size:
-    */
-   unsigned nr_attrs;
-   unsigned nr_regs;
-   unsigned nr_bytes;
-
-   unsigned first_tmp;
-   unsigned last_tmp;
-
-   boolean need_direction;
-
-   unsigned last_mrf;
-
-   unsigned header_position_offset;
-   unsigned offset[PIPE_MAX_ATTRIBS];
-};
-
-#define ATTR_SIZE  (4*4)
-
-/* Points are only culled, so no need for a clip routine, however it
- * works out easier to have a dummy one.
- */
-void brw_emit_unfilled_clip( struct brw_clip_compile *c );
-void brw_emit_tri_clip( struct brw_clip_compile *c );
-void brw_emit_line_clip( struct brw_clip_compile *c );
-void brw_emit_point_clip( struct brw_clip_compile *c );
-
-/* brw_clip_tri.c, for use by the unfilled clip routine:
- */
-void brw_clip_tri_init_vertices( struct brw_clip_compile *c );
-void brw_clip_tri_flat_shade( struct brw_clip_compile *c );
-void brw_clip_tri( struct brw_clip_compile *c );
-void brw_clip_tri_emit_polygon( struct brw_clip_compile *c );
-void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, 
-			      unsigned nr_verts );
-
-
-/* Utils:
- */
-
-void brw_clip_interp_vertex( struct brw_clip_compile *c,
-			     struct brw_indirect dest_ptr,
-			     struct brw_indirect v0_ptr, /* from */
-			     struct brw_indirect v1_ptr, /* to */
-			     struct brw_reg t0,
-			     boolean force_edgeflag );
-
-void brw_clip_init_planes( struct brw_clip_compile *c );
-
-void brw_clip_emit_vue(struct brw_clip_compile *c, 
-		       struct brw_indirect vert,
-		       boolean allocate,
-		       boolean eot,
-		       unsigned header);
-
-void brw_clip_kill_thread(struct brw_clip_compile *c);
-
-struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c );
-struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c );
-
-void brw_clip_copy_colors( struct brw_clip_compile *c,
-			   unsigned to, unsigned from );
-
-void brw_clip_init_clipmask( struct brw_clip_compile *c );
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_clip_line.c b/src/gallium/drivers/i965simple/brw_clip_line.c
deleted file mode 100644
index 75d9e5fcda2..00000000000
--- a/src/gallium/drivers/i965simple/brw_clip_line.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_clip.h"
-
-
-
-static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
-{
-   unsigned i = 0,j;
-
-   /* Register usage is static, precompute here:
-    */
-   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
-
-   if (c->key.nr_userclip) {
-      c->reg.fixed_planes = brw_vec4_grf(i, 0);
-      i += (6 + c->key.nr_userclip + 1) / 2;
-
-      c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
-   }
-   else
-      c->prog_data.curb_read_length = 0;
-
-
-   /* Payload vertices plus space for more generated vertices:
-    */
-   for (j = 0; j < 4; j++) {
-      c->reg.vertex[j] = brw_vec4_grf(i, 0);
-      i += c->nr_regs;
-   }
-
-   c->reg.t           = brw_vec1_grf(i, 0);
-   c->reg.t0          = brw_vec1_grf(i, 1);
-   c->reg.t1          = brw_vec1_grf(i, 2);
-   c->reg.planemask   = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
-   c->reg.plane_equation = brw_vec4_grf(i, 4);
-   i++;
-
-   c->reg.dp0         = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
-   c->reg.dp1         = brw_vec1_grf(i, 4);
-   i++;
-
-   if (!c->key.nr_userclip) {
-      c->reg.fixed_planes = brw_vec8_grf(i, 0);
-      i++;
-   }
-
-
-   c->first_tmp = i;
-   c->last_tmp = i;
-
-   c->prog_data.urb_read_length = c->nr_regs; /* ? */
-   c->prog_data.total_grf = i;
-}
-
-
-
-/* Line clipping, more or less following the following algorithm:
- *
- *  for (p=0;p<MAX_PLANES;p++) {
- *     if (clipmask & (1 << p)) {
- *        float dp0 = DOTPROD( vtx0, plane[p] );
- *        float dp1 = DOTPROD( vtx1, plane[p] );
- *
- *        if (IS_NEGATIVE(dp1)) {
- *           float t = dp1 / (dp1 - dp0);
- *           if (t > t1) t1 = t;
- *        } else {
- *           float t = dp0 / (dp0 - dp1);
- *           if (t > t0) t0 = t;
- *        }
- *
- *        if (t0 + t1 >= 1.0)
- *           return;
- *     }
- *  }
- *
- *  interp( ctx, newvtx0, vtx0, vtx1, t0 );
- *  interp( ctx, newvtx1, vtx1, vtx0, t1 );
- *
- */
-static void clip_and_emit_line( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_indirect vtx0     = brw_indirect(0, 0);
-   struct brw_indirect vtx1      = brw_indirect(1, 0);
-   struct brw_indirect newvtx0   = brw_indirect(2, 0);
-   struct brw_indirect newvtx1   = brw_indirect(3, 0);
-   struct brw_indirect plane_ptr = brw_indirect(4, 0);
-   struct brw_instruction *plane_loop;
-   struct brw_instruction *plane_active;
-   struct brw_instruction *is_negative;
-   struct brw_instruction *is_neg2;
-   struct brw_instruction *not_culled;
-   struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
-
-   brw_MOV(p, get_addr_reg(vtx0),      brw_address(c->reg.vertex[0]));
-   brw_MOV(p, get_addr_reg(vtx1),      brw_address(c->reg.vertex[1]));
-   brw_MOV(p, get_addr_reg(newvtx0),   brw_address(c->reg.vertex[2]));
-   brw_MOV(p, get_addr_reg(newvtx1),   brw_address(c->reg.vertex[3]));
-   brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
-
-   /* Note: init t0, t1 together:
-    */
-   brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0));
-
-   brw_clip_init_planes(c);
-   brw_clip_init_clipmask(c);
-
-   /* -ve rhw workaround */
-   brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-   brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
-	   brw_imm_ud(1<<20));
-   brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-   plane_loop = brw_DO(p, BRW_EXECUTE_1);
-   {
-      /* if (planemask & 1)
-       */
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-      brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1));
-
-      plane_active = brw_IF(p, BRW_EXECUTE_1);
-      {
-	 if (c->key.nr_userclip)
-	    brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
-	 else
-	    brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
-
-#if 0
-	 /* dp = DP4(vtx->position, plane)
-	  */
-	 brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
-
-	 /* if (IS_NEGATIVE(dp1))
-	  */
-	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
-	 brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
-#else
-         #warning "disabled"
-#endif
-	 is_negative = brw_IF(p, BRW_EXECUTE_1);
-	 {
-	    brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
-	    brw_math_invert(p, c->reg.t, c->reg.t);
-	    brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
-
-	    brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
-	    brw_MOV(p, c->reg.t1, c->reg.t);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	 }
-	 is_negative = brw_ELSE(p, is_negative);
-	 {
-	    /* Coming back in.  We know that both cannot be negative
-	     * because the line would have been culled in that case.
-	     */
-
-	    /* If both are positive, do nothing */
-             brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
-             is_neg2 = brw_IF(p, BRW_EXECUTE_1);
-             {
-		brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
-		brw_math_invert(p, c->reg.t, c->reg.t);
-		brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
-
-		brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
-		brw_MOV(p, c->reg.t0, c->reg.t);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	     }
-	     brw_ENDIF(p, is_neg2);
-	 }
-	 brw_ENDIF(p, is_negative);
-      }
-      brw_ENDIF(p, plane_active);
-
-      /* plane_ptr++;
-       */
-      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
-
-      /* while (planemask>>=1) != 0
-       */
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
-   }
-   brw_WHILE(p, plane_loop);
-
-   brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
-   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
-   not_culled = brw_IF(p, BRW_EXECUTE_1);
-   {
-      brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, FALSE);
-      brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, FALSE);
-
-      brw_clip_emit_vue(c, newvtx0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
-      brw_clip_emit_vue(c, newvtx1, 0, 1, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
-   }
-   brw_ENDIF(p, not_culled);
-   brw_clip_kill_thread(c);
-}
-
-
-
-void brw_emit_line_clip( struct brw_clip_compile *c )
-{
-   brw_clip_line_alloc_regs(c);
-
-   if (c->key.do_flat_shading)
-      brw_clip_copy_colors(c, 0, 1);
-
-   clip_and_emit_line(c);
-}
diff --git a/src/gallium/drivers/i965simple/brw_clip_point.c b/src/gallium/drivers/i965simple/brw_clip_point.c
deleted file mode 100644
index 6fce7210d1b..00000000000
--- a/src/gallium/drivers/i965simple/brw_clip_point.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_clip.h"
-
-
-/* Point clipping, nothing to do?
- */
-void brw_emit_point_clip( struct brw_clip_compile *c )
-{
-   /* Send an empty message to kill the thread:
-    */
-   brw_clip_tri_alloc_regs(c, 0);
-   brw_clip_kill_thread(c);
-}
diff --git a/src/gallium/drivers/i965simple/brw_clip_state.c b/src/gallium/drivers/i965simple/brw_clip_state.c
deleted file mode 100644
index 8e78dd51be9..00000000000
--- a/src/gallium/drivers/i965simple/brw_clip_state.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-
-static void upload_clip_unit( struct brw_context *brw )
-{
-   struct brw_clip_unit_state clip;
-
-   memset(&clip, 0, sizeof(clip));
-
-   /* CACHE_NEW_CLIP_PROG */
-   clip.thread0.grf_reg_count =
-      align(brw->clip.prog_data->total_grf, 16) / 16 - 1;
-   clip.thread0.kernel_start_pointer = brw->clip.prog_gs_offset >> 6;
-   clip.thread3.urb_entry_read_length = brw->clip.prog_data->urb_read_length;
-   clip.thread3.const_urb_entry_read_length = brw->clip.prog_data->curb_read_length;
-   clip.clip5.clip_mode = brw->clip.prog_data->clip_mode;
-
-   /* BRW_NEW_CURBE_OFFSETS */
-   clip.thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2;
-
-   /* BRW_NEW_URB_FENCE */
-   clip.thread4.nr_urb_entries = brw->urb.nr_clip_entries; 
-   clip.thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
-   clip.thread4.max_threads = 1; /* 2 threads */
-
-   if (BRW_DEBUG & DEBUG_STATS)
-      clip.thread4.stats_enable = 1; 
-
-   /* CONSTANT */
-   clip.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   clip.thread1.single_program_flow = 1;
-   clip.thread3.dispatch_grf_start_reg = 1;
-   clip.thread3.urb_entry_read_offset = 0;
-   clip.clip5.userclip_enable_flags = 0x7f;
-   clip.clip5.userclip_must_clip = 1;
-   clip.clip5.guard_band_enable = 0;
-   clip.clip5.viewport_z_clip_enable = 1;
-   clip.clip5.viewport_xy_clip_enable = 1;
-   clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
-   clip.clip5.api_mode = BRW_CLIP_API_OGL;   
-   clip.clip6.clipper_viewport_state_ptr = 0;
-   clip.viewport_xmin = -1;
-   clip.viewport_xmax = 1;
-   clip.viewport_ymin = -1;
-   clip.viewport_ymax = 1;
-
-   brw->clip.state_gs_offset = brw_cache_data( &brw->cache[BRW_CLIP_UNIT], &clip );
-}
-
-
-const struct brw_tracked_state brw_clip_unit = {
-   .dirty = {
-      .brw   = (BRW_NEW_CURBE_OFFSETS |
-		BRW_NEW_URB_FENCE),
-      .cache = CACHE_NEW_CLIP_PROG
-   },
-   .update = upload_clip_unit
-};
diff --git a/src/gallium/drivers/i965simple/brw_clip_tri.c b/src/gallium/drivers/i965simple/brw_clip_tri.c
deleted file mode 100644
index c5da7b825e0..00000000000
--- a/src/gallium/drivers/i965simple/brw_clip_tri.c
+++ /dev/null
@@ -1,566 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_clip.h"
-
-static struct brw_reg get_tmp( struct brw_clip_compile *c )
-{
-   struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0);
-
-   if (++c->last_tmp > c->prog_data.total_grf)
-      c->prog_data.total_grf = c->last_tmp;
-
-   return tmp;
-}
-
-static void release_tmps( struct brw_clip_compile *c )
-{
-   c->last_tmp = c->first_tmp;
-}
-
-
-void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
-			      unsigned nr_verts )
-{
-   unsigned i = 0,j;
-
-   /* Register usage is static, precompute here:
-    */
-   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
-
-   if (c->key.nr_userclip) {
-      c->reg.fixed_planes = brw_vec4_grf(i, 0);
-      i += (6 + c->key.nr_userclip + 1) / 2;
-
-      c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
-   }
-   else
-      c->prog_data.curb_read_length = 0;
-
-
-   /* Payload vertices plus space for more generated vertices:
-    */
-   for (j = 0; j < nr_verts; j++) {
-      c->reg.vertex[j] = brw_vec4_grf(i, 0);
-      i += c->nr_regs;
-   }
-
-   if (c->nr_attrs & 1) {
-      for (j = 0; j < 3; j++) {
-	 unsigned delta = c->nr_attrs*16 + 32;
-	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
-      }
-   }
-
-   c->reg.t          = brw_vec1_grf(i, 0);
-   c->reg.loopcount  = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_UD);
-   c->reg.nr_verts   = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD);
-   c->reg.planemask  = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
-   c->reg.plane_equation = brw_vec4_grf(i, 4);
-   i++;
-
-   c->reg.dpPrev     = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
-   c->reg.dp         = brw_vec1_grf(i, 4);
-   i++;
-
-   c->reg.inlist     = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
-   i++;
-
-   c->reg.outlist    = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
-   i++;
-
-   c->reg.freelist   = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
-   i++;
-
-   if (!c->key.nr_userclip) {
-      c->reg.fixed_planes = brw_vec8_grf(i, 0);
-      i++;
-   }
-
-   if (c->key.do_unfilled) {
-      c->reg.dir     = brw_vec4_grf(i, 0);
-      c->reg.offset  = brw_vec4_grf(i, 4);
-      i++;
-      c->reg.tmp0    = brw_vec4_grf(i, 0);
-      c->reg.tmp1    = brw_vec4_grf(i, 4);
-      i++;
-   }
-
-   c->first_tmp = i;
-   c->last_tmp = i;
-
-   c->prog_data.urb_read_length = c->nr_regs; /* ? */
-   c->prog_data.total_grf = i;
-}
-
-
-
-void brw_clip_tri_init_vertices( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
-   struct brw_instruction *is_rev;
-
-   /* Initial list of indices for incoming vertexes:
-    */
-   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
-   brw_CMP(p,
-	   vec1(brw_null_reg()),
-	   BRW_CONDITIONAL_EQ,
-	   tmp0,
-	   brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
-
-   /* XXX: Is there an easier way to do this?  Need to reverse every
-    * second tristrip element:  Can ignore sometimes?
-    */
-   is_rev = brw_IF(p, BRW_EXECUTE_1);
-   {
-      brw_MOV(p, get_element(c->reg.inlist, 0),  brw_address(c->reg.vertex[1]) );
-      brw_MOV(p, get_element(c->reg.inlist, 1),  brw_address(c->reg.vertex[0]) );
-      if (c->need_direction)
-	 brw_MOV(p, c->reg.dir, brw_imm_f(-1));
-   }
-   is_rev = brw_ELSE(p, is_rev);
-   {
-      brw_MOV(p, get_element(c->reg.inlist, 0),  brw_address(c->reg.vertex[0]) );
-      brw_MOV(p, get_element(c->reg.inlist, 1),  brw_address(c->reg.vertex[1]) );
-      if (c->need_direction)
-	 brw_MOV(p, c->reg.dir, brw_imm_f(1));
-   }
-   brw_ENDIF(p, is_rev);
-
-   brw_MOV(p, get_element(c->reg.inlist, 2),  brw_address(c->reg.vertex[2]) );
-   brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0));
-   brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3));
-}
-
-
-
-void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *is_poly;
-   struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
-
-   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
-   brw_CMP(p,
-	   vec1(brw_null_reg()),
-	   BRW_CONDITIONAL_EQ,
-	   tmp0,
-	   brw_imm_ud(_3DPRIM_POLYGON));
-
-   is_poly = brw_IF(p, BRW_EXECUTE_1);
-   {
-      brw_clip_copy_colors(c, 1, 0);
-      brw_clip_copy_colors(c, 2, 0);
-   }
-   is_poly = brw_ELSE(p, is_poly);
-   {
-      brw_clip_copy_colors(c, 0, 2);
-      brw_clip_copy_colors(c, 1, 2);
-   }
-   brw_ENDIF(p, is_poly);
-}
-
-
-
-/* Use mesa's clipping algorithms, translated to GEN4 assembly.
- */
-void brw_clip_tri( struct brw_clip_compile *c )
-{
-#if 0
-   struct brw_compile *p = &c->func;
-   struct brw_indirect vtx = brw_indirect(0, 0);
-   struct brw_indirect vtxPrev = brw_indirect(1, 0);
-   struct brw_indirect vtxOut = brw_indirect(2, 0);
-   struct brw_indirect plane_ptr = brw_indirect(3, 0);
-   struct brw_indirect inlist_ptr = brw_indirect(4, 0);
-   struct brw_indirect outlist_ptr = brw_indirect(5, 0);
-   struct brw_indirect freelist_ptr = brw_indirect(6, 0);
-   struct brw_instruction *plane_loop;
-   struct brw_instruction *plane_active;
-   struct brw_instruction *vertex_loop;
-   struct brw_instruction *next_test;
-   struct brw_instruction *prev_test;
-
-   brw_MOV(p, get_addr_reg(vtxPrev),     brw_address(c->reg.vertex[2]) );
-   brw_MOV(p, get_addr_reg(plane_ptr),   brw_clip_plane0_address(c));
-   brw_MOV(p, get_addr_reg(inlist_ptr),  brw_address(c->reg.inlist));
-   brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
-
-   brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) );
-
-   plane_loop = brw_DO(p, BRW_EXECUTE_1);
-   {
-      /* if (planemask & 1)
-       */
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-      brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1));
-
-      plane_active = brw_IF(p, BRW_EXECUTE_1);
-      {
-	 /* vtxOut = freelist_ptr++
-	  */
-	 brw_MOV(p, get_addr_reg(vtxOut),       get_addr_reg(freelist_ptr) );
-	 brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE));
-
-	 if (c->key.nr_userclip)
-	    brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
-	 else
-	    brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
-
-	 brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
-	 brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0));
-
-	 vertex_loop = brw_DO(p, BRW_EXECUTE_1);
-	 {
-	    /* vtx = *input_ptr;
-	     */
-	    brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0));
-
-	    /* IS_NEGATIVE(prev) */
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
-	    brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
-	    prev_test = brw_IF(p, BRW_EXECUTE_1);
-	    {
-	       /* IS_POSITIVE(next)
-		*/
-	       brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
-	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
-	       next_test = brw_IF(p, BRW_EXECUTE_1);
-	       {
-
-		  /* Coming back in.
-		   */
-		  brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp));
-		  brw_math_invert(p, c->reg.t, c->reg.t);
-		  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev);
-
-		  /* If (vtxOut == 0) vtxOut = vtxPrev
-		   */
-		  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
-		  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev) );
-		  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-		  brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, FALSE);
-
-		  /* *outlist_ptr++ = vtxOut;
-		   * nr_verts++;
-		   * vtxOut = 0;
-		   */
-		  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
-		  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
-		  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
-		  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
-	       }
-	       brw_ENDIF(p, next_test);
-
-	    }
-	    prev_test = brw_ELSE(p, prev_test);
-	    {
-	       /* *outlist_ptr++ = vtxPrev;
-		* nr_verts++;
-		*/
-	       brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev));
-	       brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
-	       brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
-
-	       /* IS_NEGATIVE(next)
-		*/
-	       brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
-	       brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
-	       next_test = brw_IF(p, BRW_EXECUTE_1);
-	       {
-		  /* Going out of bounds.  Avoid division by zero as we
-		   * know dp != dpPrev from DIFFERENT_SIGNS, above.
-		   */
-		  brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev));
-		  brw_math_invert(p, c->reg.t, c->reg.t);
-		  brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp);
-
-		  /* If (vtxOut == 0) vtxOut = vtx
-		   */
-		  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
-		  brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx) );
-		  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-		  brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, TRUE);
-
-		  /* *outlist_ptr++ = vtxOut;
-		   * nr_verts++;
-		   * vtxOut = 0;
-		   */
-		  brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
-		  brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
-		  brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
-		  brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
-	       }
-	       brw_ENDIF(p, next_test);
-	    }
-	    brw_ENDIF(p, prev_test);
-
-	    /* vtxPrev = vtx;
-	     * inlist_ptr++;
-	     */
-	    brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx));
-	    brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short)));
-
-	    /* while (--loopcount != 0)
-	     */
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-	    brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
-	 }
-	 brw_WHILE(p, vertex_loop);
-
-	 /* vtxPrev = *(outlist_ptr-1)  OR: outlist[nr_verts-1]
-	  * inlist = outlist
-	  * inlist_ptr = &inlist[0]
-	  * outlist_ptr = &outlist[0]
-	  */
-	 brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2));
-	 brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0));
-	 brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0));
-	 brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
-	 brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
-      }
-      brw_ENDIF(p, plane_active);
-
-      /* plane_ptr++;
-       */
-      brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
-
-      /* nr_verts >= 3
-       */
-      brw_CMP(p,
-	      vec1(brw_null_reg()),
-	      BRW_CONDITIONAL_GE,
-	      c->reg.nr_verts,
-	      brw_imm_ud(3));
-
-      /* && (planemask>>=1) != 0
-       */
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-      brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
-   }
-   brw_WHILE(p, plane_loop);
-#else
-         #warning "disabled"
-#endif
-}
-
-
-
-void brw_clip_tri_emit_polygon(struct brw_clip_compile *c)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *loop, *if_insn;
-
-   /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--)
-    */
-   brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
-   brw_ADD(p,
-	   c->reg.loopcount,
-	   c->reg.nr_verts,
-	   brw_imm_d(-2));
-
-   if_insn = brw_IF(p, BRW_EXECUTE_1);
-   {
-      struct brw_indirect v0 = brw_indirect(0, 0);
-      struct brw_indirect vptr = brw_indirect(1, 0);
-
-      brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist));
-      brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
-
-      brw_clip_emit_vue(c, v0, 1, 0, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_START));
-
-      brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
-      brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
-
-      loop = brw_DO(p, BRW_EXECUTE_1);
-      {
-	 brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_TRIFAN << 2));
-
-	 brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
-	 brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
-
-	 brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-	 brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
-      }
-      brw_WHILE(p, loop);
-
-      brw_clip_emit_vue(c, v0, 0, 1, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_END));
-   }
-   brw_ENDIF(p, if_insn);
-}
-
-static void do_clip_tri( struct brw_clip_compile *c )
-{
-   brw_clip_init_planes(c);
-
-   brw_clip_tri(c);
-}
-
-
-static void maybe_do_clip_tri( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *do_clip;
-
-   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
-   do_clip = brw_IF(p, BRW_EXECUTE_1);
-   {
-      do_clip_tri(c);
-   }
-   brw_ENDIF(p, do_clip);
-}
-
-static void brw_clip_test( struct brw_clip_compile *c )
-{
-#if 0
-    struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
-    struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
-    struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
-    struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
-
-    struct brw_reg v0 = get_tmp(c);
-    struct brw_reg v1 = get_tmp(c);
-    struct brw_reg v2 = get_tmp(c);
-
-    struct brw_indirect vt0 = brw_indirect(0, 0);
-    struct brw_indirect vt1 = brw_indirect(1, 0);
-    struct brw_indirect vt2 = brw_indirect(2, 0);
-
-    struct brw_compile *p = &c->func;
-
-    brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
-    brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
-    brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
-    brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS]));
-    brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS]));
-    brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS]));
-
-    /* test nearz, xmin, ymin plane */
-    brw_CMP(p, t1, BRW_CONDITIONAL_LE, negate(v0), get_element(v0, 3));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t2, BRW_CONDITIONAL_LE, negate(v1), get_element(v1, 3));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t3, BRW_CONDITIONAL_LE, negate(v2), get_element(v2, 3));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_XOR(p, t, t1, t2);
-    brw_XOR(p, t1, t2, t3);
-    brw_OR(p, t, t, t1);
-
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
-	    get_element(t, 0), brw_imm_ud(0));
-    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
-	    get_element(t, 1), brw_imm_ud(0));
-    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
-	    get_element(t, 2), brw_imm_ud(0));
-    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-    /* test farz, xmax, ymax plane */
-    brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, get_element(v0, 3));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, get_element(v1, 3));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, get_element(v2, 3));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-    brw_XOR(p, t, t1, t2);
-    brw_XOR(p, t1, t2, t3);
-    brw_OR(p, t, t, t1);
-
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
-	    get_element(t, 0), brw_imm_ud(0));
-    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
-	    get_element(t, 1), brw_imm_ud(0));
-    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
-	    get_element(t, 2), brw_imm_ud(0));
-    brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
-    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-    release_tmps(c);
-#else
-         #warning "disabled"
-#endif
-}
-
-
-void brw_emit_tri_clip( struct brw_clip_compile *c )
-{
-   struct brw_instruction *neg_rhw;
-   struct brw_compile *p = &c->func;
-   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
-   brw_clip_tri_init_vertices(c);
-   brw_clip_init_clipmask(c);
-
-   /* if -ve rhw workaround bit is set,
-      do cliptest */
-   brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-   brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
-	   brw_imm_ud(1<<20));
-   neg_rhw = brw_IF(p, BRW_EXECUTE_1);
-   {
-       brw_clip_test(c);
-   }
-   brw_ENDIF(p, neg_rhw);
-
-   /* Can't push into do_clip_tri because with polygon (or quad)
-    * flatshading, need to apply the flatshade here because we don't
-    * respect the PV when converting to trifan for emit:
-    */
-   if (c->key.do_flat_shading)
-      brw_clip_tri_flat_shade(c);
-
-   if (c->key.clip_mode == BRW_CLIPMODE_NORMAL)
-      do_clip_tri(c);
-   else
-      maybe_do_clip_tri(c);
-
-   brw_clip_tri_emit_polygon(c);
-
-   /* Send an empty message to kill the thread:
-    */
-   brw_clip_kill_thread(c);
-}
diff --git a/src/gallium/drivers/i965simple/brw_clip_unfilled.c b/src/gallium/drivers/i965simple/brw_clip_unfilled.c
deleted file mode 100644
index b774a76dd63..00000000000
--- a/src/gallium/drivers/i965simple/brw_clip_unfilled.c
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_clip.h"
-
-
-
-/* This is performed against the original triangles, so no indirection
- * required:
-BZZZT!
- */
-static void compute_tri_direction( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg e = c->reg.tmp0;
-   struct brw_reg f = c->reg.tmp1;
-   struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_HPOS]); 
-   struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset[VERT_RESULT_HPOS]); 
-   struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_HPOS]); 
-
-
-   /* Calculate the vectors of two edges of the triangle:
-    */
-   brw_ADD(p, e, v0, negate(v2)); 
-   brw_ADD(p, f, v1, negate(v2)); 
-
-   /* Take their crossproduct:
-    */
-   brw_set_access_mode(p, BRW_ALIGN_16);
-   brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3),  brw_swizzle(f,2,0,1,3));
-   brw_MAC(p, vec4(e),  negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3));
-   brw_set_access_mode(p, BRW_ALIGN_1);
-
-   brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e));
-}
-
-
-static void cull_direction( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *ccw;
-   unsigned conditional;
-
-   assert (!(c->key.fill_ccw == CLIP_CULL &&
-	     c->key.fill_cw == CLIP_CULL));
-
-   if (c->key.fill_ccw == CLIP_CULL)
-      conditional = BRW_CONDITIONAL_GE;
-   else
-      conditional = BRW_CONDITIONAL_L;
-
-   brw_CMP(p,
-	   vec1(brw_null_reg()),
-	   conditional,
-	   get_element(c->reg.dir, 2),
-	   brw_imm_f(0));
-   
-   ccw = brw_IF(p, BRW_EXECUTE_1);
-   {
-      brw_clip_kill_thread(c);
-   }
-   brw_ENDIF(p, ccw);
-}
-
-
-
-static void copy_bfc( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *ccw;
-   unsigned conditional;
-
-   /* Do we have any colors to copy? 
-    */
-   if (!(c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0]) &&
-       !(c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1]))
-      return;
-
-   /* In some wierd degnerate cases we can end up testing the
-    * direction twice, once for culling and once for bfc copying.  Oh
-    * well, that's what you get for setting wierd GL state.
-    */
-   if (c->key.copy_bfc_ccw)
-      conditional = BRW_CONDITIONAL_GE;
-   else
-      conditional = BRW_CONDITIONAL_L;
-
-   brw_CMP(p,
-	   vec1(brw_null_reg()),
-	   conditional,
-	   get_element(c->reg.dir, 2),
-	   brw_imm_f(0));
-   
-   ccw = brw_IF(p, BRW_EXECUTE_1);
-   {
-      unsigned i;
-
-      for (i = 0; i < 3; i++) {
-	 if (c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0])
-	    brw_MOV(p, 
-		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL0]),
-		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC0]));
-
-	 if (c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1])
-	    brw_MOV(p, 
-		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL1]),
-		    byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC1]));
-      }
-   }
-   brw_ENDIF(p, ccw);
-}
-
-
-
-
-/*
-  float iz	= 1.0 / dir.z;
-  float ac	= dir.x * iz;
-  float bc	= dir.y * iz;
-  offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE;
-  offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor;
-  offset *= MRD;
-*/
-static void compute_offset( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg off = c->reg.offset;
-   struct brw_reg dir = c->reg.dir;
-   
-   brw_math_invert(p, get_element(off, 2), get_element(dir, 2));
-   brw_MUL(p, vec2(off), dir, get_element(off, 2));
-
-   brw_CMP(p, 
-	   vec1(brw_null_reg()), 
-	   BRW_CONDITIONAL_GE,
-	   brw_abs(get_element(off, 0)), 
-	   brw_abs(get_element(off, 1)));
-
-   brw_SEL(p, vec1(off), brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1)));
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-   brw_MUL(p, vec1(off), off, brw_imm_f(c->key.offset_factor));
-   brw_ADD(p, vec1(off), off, brw_imm_f(c->key.offset_units));
-}
-
-
-static void merge_edgeflags( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *is_poly;
-   struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0);
-
-   brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); 
-   brw_CMP(p, 
-	   vec1(brw_null_reg()), 
-	   BRW_CONDITIONAL_EQ, 
-	   tmp0,
-	   brw_imm_ud(_3DPRIM_POLYGON));
-
-   /* Get away with using reg.vertex because we know that this is not
-    * a _3DPRIM_TRISTRIP_REVERSE:
-    */
-   is_poly = brw_IF(p, BRW_EXECUTE_1);
-   {   
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
-      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8));
-      brw_MOV(p, byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0));
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
-      brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9));
-      brw_MOV(p, byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0));
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-   }
-   brw_ENDIF(p, is_poly);
-}
-
-
-
-static void apply_one_offset( struct brw_clip_compile *c,
-			  struct brw_indirect vert )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg pos = deref_4f(vert, c->offset[VERT_RESULT_HPOS]);
-   struct brw_reg z = get_element(pos, 2);
-
-   brw_ADD(p, z, z, vec1(c->reg.offset));
-}
-
-
-
-/***********************************************************************
- * Output clipped polygon as an unfilled primitive:
- */
-static void emit_lines(struct brw_clip_compile *c,
-		       boolean do_offset)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *loop;
-   struct brw_instruction *draw_edge;
-   struct brw_indirect v0 = brw_indirect(0, 0);
-   struct brw_indirect v1 = brw_indirect(1, 0);
-   struct brw_indirect v0ptr = brw_indirect(2, 0);
-   struct brw_indirect v1ptr = brw_indirect(3, 0);
-
-   /* Need a seperate loop for offset:
-    */
-   if (do_offset) {
-      brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
-      brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
-
-      loop = brw_DO(p, BRW_EXECUTE_1);
-      {
-	 brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
-	 brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
-	    
-	 apply_one_offset(c, v0);
-	    
-	 brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
-	 brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
-      }
-      brw_WHILE(p, loop);
-   }
-
-   /* v1ptr = &inlist[nr_verts]
-    * *v1ptr = v0
-    */
-   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
-   brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
-   brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
-   brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
-   brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0));
-
-   loop = brw_DO(p, BRW_EXECUTE_1);
-   {
-      brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
-      brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2));
-      brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
-
-      /* draw edge if edgeflag != 0 */
-      brw_CMP(p, 
-	      vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, 
-	      deref_1f(v0, c->offset[VERT_RESULT_EDGE]),
-	      brw_imm_f(0));
-      draw_edge = brw_IF(p, BRW_EXECUTE_1);
-      {
-	 brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
-	 brw_clip_emit_vue(c, v1, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
-      }
-      brw_ENDIF(p, draw_edge);
-
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
-   }
-   brw_WHILE(p, loop);
-}
-
-
-
-static void emit_points(struct brw_clip_compile *c,
-			boolean do_offset )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *loop;
-   struct brw_instruction *draw_point;
-
-   struct brw_indirect v0 = brw_indirect(0, 0);
-   struct brw_indirect v0ptr = brw_indirect(2, 0);
-
-   brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
-   brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
-
-   loop = brw_DO(p, BRW_EXECUTE_1);
-   {
-      brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
-      brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
-
-      /* draw if edgeflag != 0 
-       */
-      brw_CMP(p, 
-	      vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, 
-	      deref_1f(v0, c->offset[VERT_RESULT_EDGE]),
-	      brw_imm_f(0));
-      draw_point = brw_IF(p, BRW_EXECUTE_1);
-      {
-	 if (do_offset)
-	    apply_one_offset(c, v0);
-
-	 brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END);
-      }
-      brw_ENDIF(p, draw_point);
-
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-      brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
-   }
-   brw_WHILE(p, loop);
-}
-
-
-
-
-
-
-
-static void emit_primitives( struct brw_clip_compile *c,
-			     unsigned mode, 
-			     boolean do_offset )
-{
-   switch (mode) {
-   case CLIP_FILL:
-      brw_clip_tri_emit_polygon(c);
-      break;
-
-   case CLIP_LINE:
-      emit_lines(c, do_offset);
-      break;
-
-   case CLIP_POINT:
-      emit_points(c, do_offset);
-      break;
-
-   case CLIP_CULL:
-      assert(0);
-      break;
-   }
-} 
-
-
-
-static void emit_unfilled_primitives( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *ccw;
-
-   /* Direction culling has already been done.
-    */
-   if (c->key.fill_ccw != c->key.fill_cw &&
-       c->key.fill_ccw != CLIP_CULL &&
-       c->key.fill_cw != CLIP_CULL)
-   {
-      brw_CMP(p,
-	      vec1(brw_null_reg()),
-	      BRW_CONDITIONAL_GE,
-	      get_element(c->reg.dir, 2),
-	      brw_imm_f(0));
-   
-      ccw = brw_IF(p, BRW_EXECUTE_1);
-      {
-	 emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
-      }
-      ccw = brw_ELSE(p, ccw);
-      {
-	 emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
-      }
-      brw_ENDIF(p, ccw);
-   }
-   else if (c->key.fill_cw != CLIP_CULL) {
-      emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
-   }
-   else if (c->key.fill_ccw != CLIP_CULL) { 
-      emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
-   }
-}
-
-
-
-
-static void check_nr_verts( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *if_insn;
-
-   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3));      
-   if_insn = brw_IF(p, BRW_EXECUTE_1);
-   {
-      brw_clip_kill_thread(c);
-   }
-   brw_ENDIF(p, if_insn);
-}
-
-
-void brw_emit_unfilled_clip( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *do_clip;
-   
-
-   c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) ||
-			(c->key.fill_ccw != c->key.fill_cw) ||
-			c->key.fill_ccw == CLIP_CULL ||
-			c->key.fill_cw == CLIP_CULL ||
-			c->key.copy_bfc_cw ||
-			c->key.copy_bfc_ccw);
-
-   brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
-   brw_clip_tri_init_vertices(c);
-
-   assert(c->offset[VERT_RESULT_EDGE]);
-
-   if (c->key.fill_ccw == CLIP_CULL &&
-       c->key.fill_cw == CLIP_CULL) {
-      brw_clip_kill_thread(c);
-      return;
-   }
-
-   merge_edgeflags(c);
-
-   /* Need to use the inlist indirection here: 
-    */
-   if (c->need_direction) 
-      compute_tri_direction(c);
-   
-   if (c->key.fill_ccw == CLIP_CULL ||
-       c->key.fill_cw == CLIP_CULL)
-      cull_direction(c);
-
-   if (c->key.offset_ccw ||
-       c->key.offset_cw)
-      compute_offset(c);
-
-   if (c->key.copy_bfc_ccw ||
-       c->key.copy_bfc_cw)
-      copy_bfc(c);
-
-   /* Need to do this whether we clip or not:
-    */
-   if (c->key.do_flat_shading)
-      brw_clip_tri_flat_shade(c);
-   
-   brw_clip_init_clipmask(c);
-   brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
-   do_clip = brw_IF(p, BRW_EXECUTE_1);
-   {
-      brw_clip_init_planes(c);
-      brw_clip_tri(c);
-      check_nr_verts(c);
-   }
-   brw_ENDIF(p, do_clip);
-   
-   emit_unfilled_primitives(c);
-   brw_clip_kill_thread(c);
-}
-
-
-
diff --git a/src/gallium/drivers/i965simple/brw_clip_util.c b/src/gallium/drivers/i965simple/brw_clip_util.c
deleted file mode 100644
index 6d58ceafff3..00000000000
--- a/src/gallium/drivers/i965simple/brw_clip_util.c
+++ /dev/null
@@ -1,351 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_clip.h"
-
-
-
-
-
-static struct brw_reg get_tmp( struct brw_clip_compile *c )
-{
-   struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0);
-
-   if (++c->last_tmp > c->prog_data.total_grf)
-      c->prog_data.total_grf = c->last_tmp;
-
-   return tmp;
-}
-
-static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp )
-{
-   if (tmp.nr == c->last_tmp-1)
-      c->last_tmp--;
-}
-
-
-static struct brw_reg make_plane_ud(unsigned x, unsigned y, unsigned z, unsigned w)
-{
-   return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x);
-}
-
-
-void brw_clip_init_planes( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-
-   if (!c->key.nr_userclip) {
-      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0,    0, 0xff, 1));
-      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0,    0,    1, 1));
-      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff,    0, 1));
-      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0,    1,    0, 1));
-      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff,  0,    0, 1));
-      brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1,    0,    0, 1));
-   }
-}
-
-
-
-#define W 3
-
-/* Project 'pos' to screen space (or back again), overwrite with results:
- */
-static void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
-{
-   struct brw_compile *p = &c->func;
-
-   /* calc rhw
-    */
-   brw_math_invert(p, get_element(pos, W), get_element(pos, W));
-
-   /* value.xyz *= value.rhw
-    */
-   brw_set_access_mode(p, BRW_ALIGN_16);
-   brw_MUL(p, brw_writemask(pos, TGSI_WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
-   brw_set_access_mode(p, BRW_ALIGN_1);
-}
-
-
-static void brw_clip_project_vertex( struct brw_clip_compile *c,
-				     struct brw_indirect vert_addr )
-{
-#if 0
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = get_tmp(c);
-
-   /* Fixup position.  Extract from the original vertex and re-project
-    * to screen space:
-    */
-   brw_MOV(p, tmp, deref_4f(vert_addr, c->offset[VERT_RESULT_HPOS]));
-   brw_clip_project_position(c, tmp);
-   brw_MOV(p, deref_4f(vert_addr, c->header_position_offset), tmp);
-
-   release_tmp(c, tmp);
-#else
-         #warning "disabled"
-#endif
-}
-
-
-
-
-/* Interpolate between two vertices and put the result into a0.0.
- * Increment a0.0 accordingly.
- */
-void brw_clip_interp_vertex( struct brw_clip_compile *c,
-			     struct brw_indirect dest_ptr,
-			     struct brw_indirect v0_ptr, /* from */
-			     struct brw_indirect v1_ptr, /* to */
-			     struct brw_reg t0,
-			     boolean force_edgeflag)
-{
-#if 0
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = get_tmp(c);
-   unsigned i;
-
-   /* Just copy the vertex header:
-    */
-   brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
-
-   /* Iterate over each attribute (could be done in pairs?)
-    */
-   for (i = 0; i < c->nr_attrs; i++) {
-      unsigned delta = i*16 + 32;
-
-      if (delta == c->offset[VERT_RESULT_EDGE]) {
-	 if (force_edgeflag)
-	    brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
-	 else
-	    brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta));
-      }
-      else {
-	 /* Interpolate:
-	  *
-	  *        New = attr0 + t*attr1 - t*attr0
-	  */
-	 brw_MUL(p,
-		 vec4(brw_null_reg()),
-		 deref_4f(v1_ptr, delta),
-		 t0);
-
-	 brw_MAC(p,
-		 tmp,
-		 negate(deref_4f(v0_ptr, delta)),
-		 t0);
-
-	 brw_ADD(p,
-		 deref_4f(dest_ptr, delta),
-		 deref_4f(v0_ptr, delta),
-		 tmp);
-      }
-   }
-
-   if (i & 1) {
-      unsigned delta = i*16 + 32;
-      brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
-   }
-
-   release_tmp(c, tmp);
-
-   /* Recreate the projected (NDC) coordinate in the new vertex
-    * header:
-    */
-   brw_clip_project_vertex(c, dest_ptr );
-#else
-         #warning "disabled"
-#endif
-}
-
-
-
-
-#define MAX_MRF 16
-
-void brw_clip_emit_vue(struct brw_clip_compile *c,
-		       struct brw_indirect vert,
-		       boolean allocate,
-		       boolean eot,
-		       unsigned header)
-{
-   struct brw_compile *p = &c->func;
-   unsigned start = c->last_mrf;
-
-   assert(!(allocate && eot));
-
-   /* Cycle through mrf regs - probably futile as we have to wait for
-    * the allocation response anyway.  Also, the order this function
-    * is invoked doesn't correspond to the order the instructions will
-    * be executed, so it won't have any effect in many cases.
-    */
-#if 0
-   if (start + c->nr_regs + 1 >= MAX_MRF)
-      start = 0;
-
-   c->last_mrf = start + c->nr_regs + 1;
-#endif
-
-   /* Copy the vertex from vertn into m1..mN+1:
-    */
-   brw_copy_from_indirect(p, brw_message_reg(start+1), vert, c->nr_regs);
-
-   /* Overwrite PrimType and PrimStart in the message header, for
-    * each vertex in turn:
-    */
-   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
-
-
-   /* Send each vertex as a seperate write to the urb.  This
-    * is different to the concept in brw_sf_emit.c, where
-    * subsequent writes are used to build up a single urb
-    * entry.  Each of these writes instantiates a seperate
-    * urb entry - (I think... what about 'allocate'?)
-    */
-   brw_urb_WRITE(p,
-		 allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
-		 start,
-		 c->reg.R0,
-		 allocate,
-		 1,		/* used */
-		 c->nr_regs + 1, /* msg length */
-		 allocate ? 1 : 0, /* response_length */
-		 eot,		/* eot */
-		 1,		/* writes_complete */
-		 0,		/* urb offset */
-		 BRW_URB_SWIZZLE_NONE);
-}
-
-
-
-void brw_clip_kill_thread(struct brw_clip_compile *c)
-{
-   struct brw_compile *p = &c->func;
-
-   /* Send an empty message to kill the thread and release any
-    * allocated urb entry:
-    */
-   brw_urb_WRITE(p,
-		 retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
-		 0,
-		 c->reg.R0,
-		 0,		/* allocate */
-		 0,		/* used */
-		 0, 		/* msg len */
-		 0, 		/* response len */
-		 1, 		/* eot */
-		 1,		/* writes complete */
-		 0,
-		 BRW_URB_SWIZZLE_NONE);
-}
-
-
-
-
-struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c )
-{
-   return brw_address(c->reg.fixed_planes);
-}
-
-
-struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c )
-{
-   if (c->key.nr_userclip) {
-      return brw_imm_uw(16);
-   }
-   else {
-      return brw_imm_uw(4);
-   }
-}
-
-
-/* If flatshading, distribute color from provoking vertex prior to
- * clipping.
- */
-void brw_clip_copy_colors( struct brw_clip_compile *c,
-			   unsigned to, unsigned from )
-{
-#if 0
-   struct brw_compile *p = &c->func;
-
-   if (c->offset[VERT_RESULT_COL0])
-      brw_MOV(p,
-	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL0]),
-	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL0]));
-
-   if (c->offset[VERT_RESULT_COL1])
-      brw_MOV(p,
-	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL1]),
-	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL1]));
-
-   if (c->offset[VERT_RESULT_BFC0])
-      brw_MOV(p,
-	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC0]),
-	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC0]));
-
-   if (c->offset[VERT_RESULT_BFC1])
-      brw_MOV(p,
-	      byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC1]),
-	      byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC1]));
-#else
-         #warning "disabled"
-#endif
-}
-
-
-
-void brw_clip_init_clipmask( struct brw_clip_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg incoming = get_element_ud(c->reg.R0, 2);
-
-   /* Shift so that lowest outcode bit is rightmost:
-    */
-   brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26));
-
-   if (c->key.nr_userclip) {
-      struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);
-
-      /* Rearrange userclip outcodes so that they come directly after
-       * the fixed plane bits.
-       */
-      brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));
-      brw_SHR(p, tmp, tmp, brw_imm_ud(8));
-      brw_OR(p, c->reg.planemask, c->reg.planemask, tmp);
-
-      release_tmp(c, tmp);
-   }
-}
-
diff --git a/src/gallium/drivers/i965simple/brw_context.c b/src/gallium/drivers/i965simple/brw_context.c
deleted file mode 100644
index 9b33285bc73..00000000000
--- a/src/gallium/drivers/i965simple/brw_context.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_draw.h"
-#include "brw_vs.h"
-#include "brw_tex_layout.h"
-#include "brw_winsys.h"
-
-#include "pipe/internal/p_winsys_screen.h"
-#include "pipe/p_context.h"
-#include "util/u_memory.h"
-#include "pipe/p_screen.h"
-
-
-#ifndef BRW_DEBUG
-int BRW_DEBUG = (0);
-#endif
-
-
-static void brw_destroy(struct pipe_context *pipe)
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   if(brw->winsys->destroy)
-      brw->winsys->destroy(brw->winsys);
-   
-   FREE(brw);
-}
-
-
-static void brw_clear(struct pipe_context *pipe, struct pipe_surface *ps,
-                      unsigned clearValue)
-{
-   int x, y, w, h;
-   /* FIXME: corny... */
-
-   x = 0;
-   y = 0;
-   w = ps->width;
-   h = ps->height;
-
-   pipe->surface_fill(pipe, ps, x, y, w, h, clearValue);
-}
-
-static unsigned int
-brw_is_texture_referenced( struct pipe_context *pipe,
-			   struct pipe_texture *texture,
-			   unsigned face, unsigned level)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-static unsigned int
-brw_is_buffer_referenced( struct pipe_context *pipe,
-			  struct pipe_buffer *buf)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-struct pipe_context *brw_create(struct pipe_screen *screen,
-                                struct brw_winsys *brw_winsys,
-                                unsigned pci_id)
-{
-   struct brw_context *brw;
-
-   debug_printf("%s: creating brw_context with pci id 0x%x\n",
-                __FUNCTION__, pci_id);
-
-   brw = CALLOC_STRUCT(brw_context);
-   if (brw == NULL)
-      return NULL;
-
-   brw->winsys = brw_winsys;
-   brw->pipe.winsys = screen->winsys;
-   brw->pipe.screen = screen;
-
-   brw->pipe.destroy = brw_destroy;
-   brw->pipe.clear = brw_clear;
-
-   brw->pipe.is_texture_referenced = brw_is_texture_referenced;
-   brw->pipe.is_buffer_referenced = brw_is_buffer_referenced;
-
-   brw_init_surface_functions(brw);
-   brw_init_texture_functions(brw);
-   brw_init_state_functions(brw);
-   brw_init_flush_functions(brw);
-   brw_init_draw_functions( brw );
-
-
-   brw_init_state( brw );
-
-   brw->pci_id = pci_id;
-   brw->dirty = ~0;
-   brw->hardware_dirty = ~0;
-
-   memset(&brw->wm.bind, ~0, sizeof(brw->wm.bind));
-
-   return &brw->pipe;
-}
-
diff --git a/src/gallium/drivers/i965simple/brw_context.h b/src/gallium/drivers/i965simple/brw_context.h
deleted file mode 100644
index 3079485180b..00000000000
--- a/src/gallium/drivers/i965simple/brw_context.h
+++ /dev/null
@@ -1,684 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#ifndef BRWCONTEXT_INC
-#define BRWCONTEXT_INC
-
-
-#include "pipe/p_context.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_state.h"
-
-#include "tgsi/tgsi_scan.h"
-
-#include "brw_structs.h"
-#include "brw_winsys.h"
-
-
-/* Glossary:
- *
- * URB - uniform resource buffer.  A mid-sized buffer which is
- * partitioned between the fixed function units and used for passing
- * values (vertices, primitives, constants) between them.
- *
- * CURBE - constant URB entry.  An urb region (entry) used to hold
- * constant values which the fixed function units can be instructed to
- * preload into the GRF when spawining a thread.
- *
- * VUE - vertex URB entry.  An urb entry holding a vertex and usually
- * a vertex header.  The header contains control information and
- * things like primitive type, Begin/end flags and clip codes.
- *
- * PUE - primitive URB entry.  An urb entry produced by the setup (SF)
- * unit holding rasterization and interpolation parameters.
- *
- * GRF - general register file.  One of several register files
- * addressable by programmed threads.  The inputs (r0, payload, curbe,
- * urb) of the thread are preloaded to this area before the thread is
- * spawned.  The registers are individually 8 dwords wide and suitable
- * for general usage.  Registers holding thread input values are not
- * special and may be overwritten.
- *
- * MRF - message register file.  Threads communicate (and terminate)
- * by sending messages.  Message parameters are placed in contigous
- * MRF registers.  All program output is via these messages.  URB
- * entries are populated by sending a message to the shared URB
- * function containing the new data, together with a control word,
- * often an unmodified copy of R0.
- *
- * R0 - GRF register 0.  Typically holds control information used when
- * sending messages to other threads.
- *
- * EU or GEN4 EU: The name of the programmable subsystem of the
- * i965 hardware.  Threads are executed by the EU, the registers
- * described above are part of the EU architecture.
- *
- * Fixed function units:
- *
- * CS - Command streamer.  Notional first unit, little software
- * interaction.  Holds the URB entries used for constant data, ie the
- * CURBEs.
- *
- * VF/VS - Vertex Fetch / Vertex Shader.  The fixed function part of
- * this unit is responsible for pulling vertices out of vertex buffers
- * in vram and injecting them into the processing pipe as VUEs.  If
- * enabled, it first passes them to a VS thread which is a good place
- * for the driver to implement any active vertex shader.
- *
- * GS - Geometry Shader.  This corresponds to a new DX10 concept.  If
- * enabled, incoming strips etc are passed to GS threads in individual
- * line/triangle/point units.  The GS thread may perform arbitary
- * computation and emit whatever primtives with whatever vertices it
- * chooses.  This makes GS an excellent place to implement GL's
- * unfilled polygon modes, though of course it is capable of much
- * more.  Additionally, GS is used to translate away primitives not
- * handled by latter units, including Quads and Lineloops.
- *
- * CS - Clipper.  Mesa's clipping algorithms are imported to run on
- * this unit.  The fixed function part performs cliptesting against
- * the 6 fixed clipplanes and makes descisions on whether or not the
- * incoming primitive needs to be passed to a thread for clipping.
- * User clip planes are handled via cooperation with the VS thread.
- *
- * SF - Strips Fans or Setup: Triangles are prepared for
- * rasterization.  Interpolation coefficients are calculated.
- * Flatshading and two-side lighting usually performed here.
- *
- * WM - Windower.  Interpolation of vertex attributes performed here.
- * Fragment shader implemented here.  SIMD aspects of EU taken full
- * advantage of, as pixels are processed in blocks of 16.
- *
- * CC - Color Calculator.  No EU threads associated with this unit.
- * Handles blending and (presumably) depth and stencil testing.
- */
-
-#define BRW_MAX_CURBE                    (32*16)
-
-struct brw_context;
-struct brw_winsys;
-
-
-/* Raised when we receive new state across the pipe interface:
- */
-#define BRW_NEW_VIEWPORT                0x1
-#define BRW_NEW_RASTERIZER              0x2
-#define BRW_NEW_FS                      0x4
-#define BRW_NEW_BLEND                   0x8
-#define BRW_NEW_CLIP                    0x10
-#define BRW_NEW_SCISSOR                 0x20
-#define BRW_NEW_STIPPLE                 0x40
-#define BRW_NEW_FRAMEBUFFER             0x80
-#define BRW_NEW_ALPHA_TEST              0x100
-#define BRW_NEW_DEPTH_STENCIL           0x200
-#define BRW_NEW_SAMPLER                 0x400
-#define BRW_NEW_TEXTURE                 0x800
-#define BRW_NEW_CONSTANTS               0x1000
-#define BRW_NEW_VBO                     0x2000
-#define BRW_NEW_VS                      0x4000
-
-/* Raised for other internal events:
- */
-#define BRW_NEW_URB_FENCE               0x10000
-#define BRW_NEW_PSP                     0x20000
-#define BRW_NEW_CURBE_OFFSETS           0x40000
-#define BRW_NEW_REDUCED_PRIMITIVE       0x80000
-#define BRW_NEW_PRIMITIVE               0x100000
-#define BRW_NEW_SCENE                 0x200000
-#define BRW_NEW_SF_LINKAGE              0x400000
-
-extern int BRW_DEBUG;
-
-#define DEBUG_TEXTURE	0x1
-#define DEBUG_STATE	0x2
-#define DEBUG_IOCTL	0x4
-#define DEBUG_PRIMS	0x8
-#define DEBUG_VERTS	0x10
-#define DEBUG_FALLBACKS	0x20
-#define DEBUG_VERBOSE	0x40
-#define DEBUG_DRI       0x80
-#define DEBUG_DMA       0x100
-#define DEBUG_SANITY    0x200
-#define DEBUG_SYNC      0x400
-#define DEBUG_SLEEP     0x800
-#define DEBUG_PIXEL     0x1000
-#define DEBUG_STATS     0x2000
-#define DEBUG_TILE      0x4000
-#define DEBUG_SINGLE_THREAD   0x8000
-#define DEBUG_WM        0x10000
-#define DEBUG_URB       0x20000
-#define DEBUG_VS        0x40000
-#define DEBUG_BATCH	0x80000
-#define DEBUG_BUFMGR	0x100000
-#define DEBUG_BLIT	0x200000
-#define DEBUG_REGION	0x400000
-#define DEBUG_MIPTREE	0x800000
-
-#define DBG(...) do {						\
-   if (BRW_DEBUG & FILE_DEBUG_FLAG)				\
-      debug_printf(__VA_ARGS__);				\
-} while(0)
-
-#define PRINT(...) do {						\
-   debug_printf(__VA_ARGS__);			                \
-} while(0)
-
-struct brw_state_flags {
-   unsigned cache;
-   unsigned brw;
-};
-
-
-struct brw_vertex_program {
-   struct pipe_shader_state program;
-   struct tgsi_shader_info info;
-   int id;
-};
-
-
-struct brw_fragment_program {
-   struct pipe_shader_state program;
-   struct tgsi_shader_info info;
-   
-   boolean UsesDepth; /* XXX add this to tgsi_shader_info? */
-   int id;
-};
-
-
-struct pipe_setup_linkage {
-   struct {
-      unsigned vp_output:5;
-      unsigned interp_mode:4;
-      unsigned bf_vp_output:5;
-   } fp_input[PIPE_MAX_SHADER_INPUTS];
-
-   unsigned fp_input_count:5;
-   unsigned max_vp_output:5;
-};
-   
-
-
-struct brw_texture {
-   struct pipe_texture base;
-
-   /* Derived from the above:
-    */
-   unsigned stride;
-   unsigned depth_pitch;          /* per-image on i945? */
-   unsigned total_nblocksy;
-
-   unsigned nr_images[PIPE_MAX_TEXTURE_LEVELS];
-
-   /* Explicitly store the offset of each image for each cube face or
-    * depth value.  Pretty much have to accept that hardware formats
-    * are going to be so diverse that there is no unified way to
-    * compute the offsets of depth/cube images within a mipmap level,
-    * so have to store them as a lookup table:
-    */
-   unsigned *image_offset[PIPE_MAX_TEXTURE_LEVELS];   /**< array [depth] of offsets */
-
-   /* Includes image offset tables:
-    */
-   unsigned level_offset[PIPE_MAX_TEXTURE_LEVELS];
-
-   /* The data is held here:
-    */
-   struct pipe_buffer *buffer;
-};
-
-/* Data about a particular attempt to compile a program.  Note that
- * there can be many of these, each in a different GL state
- * corresponding to a different brw_wm_prog_key struct, with different
- * compiled programs:
- */
-/* Data about a particular attempt to compile a program.  Note that
- * there can be many of these, each in a different GL state
- * corresponding to a different brw_wm_prog_key struct, with different
- * compiled programs:
- */
-
-struct brw_wm_prog_data {
-   unsigned curb_read_length;
-   unsigned urb_read_length;
-
-   unsigned first_curbe_grf;
-   unsigned total_grf;
-   unsigned total_scratch;
-
-   /* Internally generated constants for the CURBE.  These are loaded
-    * ahead of the data from the constant buffer.
-    */
-   const float internal_const[8];
-   unsigned nr_internal_consts;
-   unsigned max_const;
-
-   boolean error;
-};
-
-struct brw_sf_prog_data {
-   unsigned urb_read_length;
-   unsigned total_grf;
-
-   /* Each vertex may have upto 12 attributes, 4 components each,
-    * except WPOS which requires only 2.  (11*4 + 2) == 44 ==> 11
-    * rows.
-    *
-    * Actually we use 4 for each, so call it 12 rows.
-    */
-   unsigned urb_entry_size;
-};
-
-struct brw_clip_prog_data {
-   unsigned curb_read_length;	/* user planes? */
-   unsigned clip_mode;
-   unsigned urb_read_length;
-   unsigned total_grf;
-};
-
-struct brw_gs_prog_data {
-   unsigned urb_read_length;
-   unsigned total_grf;
-};
-
-struct brw_vs_prog_data {
-   unsigned curb_read_length;
-   unsigned urb_read_length;
-   unsigned total_grf;
-   unsigned outputs_written;
-
-   unsigned inputs_read;
-
-   unsigned max_const;
-
-   float    imm_buf[PIPE_MAX_CONSTANT][4];
-   unsigned num_imm;
-   unsigned num_consts;
-
-   /* Used for calculating urb partitions:
-    */
-   unsigned urb_entry_size;
-};
-
-
-#define BRW_MAX_TEX_UNIT 8
-#define BRW_WM_MAX_SURF BRW_MAX_TEX_UNIT + 1
-
-/* Create a fixed sized struct for caching binding tables:
- */
-struct brw_surface_binding_table {
-   unsigned surf_ss_offset[BRW_WM_MAX_SURF];
-};
-
-
-struct brw_cache;
-
-struct brw_mem_pool {
-   struct pipe_buffer *buffer;
-
-   unsigned size;
-   unsigned offset;		/* offset of first free byte */
-
-   struct brw_context *brw;
-};
-
-struct brw_cache_item {
-   unsigned hash;
-   unsigned key_size;		/* for variable-sized keys */
-   const void *key;
-
-   unsigned offset;		/* offset within pool's buffer */
-   unsigned data_size;
-
-   struct brw_cache_item *next;
-};
-
-
-
-struct brw_cache {
-   unsigned id;
-
-   const char *name;
-
-   struct brw_context *brw;
-   struct brw_mem_pool *pool;
-
-   struct brw_cache_item **items;
-   unsigned size, n_items;
-
-   unsigned key_size;		/* for fixed-size keys */
-   unsigned aux_size;
-
-   unsigned last_addr;			/* offset of active item */
-};
-
-
-
-
-/* Considered adding a member to this struct to document which flags
- * an update might raise so that ordering of the state atoms can be
- * checked or derived at runtime.  Dropped the idea in favor of having
- * a debug mode where the state is monitored for flags which are
- * raised that have already been tested against.
- */
-struct brw_tracked_state {
-   struct brw_state_flags dirty;
-   void (*update)( struct brw_context *brw );
-};
-
-
-/* Flags for brw->state.cache.
- */
-#define CACHE_NEW_CC_VP                  (1<<BRW_CC_VP)
-#define CACHE_NEW_CC_UNIT                (1<<BRW_CC_UNIT)
-#define CACHE_NEW_WM_PROG                (1<<BRW_WM_PROG)
-#define CACHE_NEW_SAMPLER_DEFAULT_COLOR  (1<<BRW_SAMPLER_DEFAULT_COLOR)
-#define CACHE_NEW_SAMPLER                (1<<BRW_SAMPLER)
-#define CACHE_NEW_WM_UNIT                (1<<BRW_WM_UNIT)
-#define CACHE_NEW_SF_PROG                (1<<BRW_SF_PROG)
-#define CACHE_NEW_SF_VP                  (1<<BRW_SF_VP)
-#define CACHE_NEW_SF_UNIT                (1<<BRW_SF_UNIT)
-#define CACHE_NEW_VS_UNIT                (1<<BRW_VS_UNIT)
-#define CACHE_NEW_VS_PROG                (1<<BRW_VS_PROG)
-#define CACHE_NEW_GS_UNIT                (1<<BRW_GS_UNIT)
-#define CACHE_NEW_GS_PROG                (1<<BRW_GS_PROG)
-#define CACHE_NEW_CLIP_VP                (1<<BRW_CLIP_VP)
-#define CACHE_NEW_CLIP_UNIT              (1<<BRW_CLIP_UNIT)
-#define CACHE_NEW_CLIP_PROG              (1<<BRW_CLIP_PROG)
-#define CACHE_NEW_SURFACE                (1<<BRW_SS_SURFACE)
-#define CACHE_NEW_SURF_BIND              (1<<BRW_SS_SURF_BIND)
-
-
-
-
-enum brw_mempool_id {
-   BRW_GS_POOL,
-   BRW_SS_POOL,
-   BRW_MAX_POOL
-};
-
-
-struct brw_cached_batch_item {
-   struct header *header;
-   unsigned sz;
-   struct brw_cached_batch_item *next;
-};
-
-
-
-/* Protect against a future where PIPE_MAX_ATTRIBS > 32.  Wouldn't life
- * be easier if C allowed arrays of packed elements?
- */
-#define ATTRIB_BIT_DWORDS  ((PIPE_MAX_ATTRIBS+31)/32)
-
-
-
-
-struct brw_vertex_info {
-   unsigned varying;  /* varying:1[PIPE_MAX_ATTRIBS] */
-   unsigned sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[PIPE_MAX_ATTRIBS] */
-};
-
-
-
-
-
-struct brw_context
-{
-   struct pipe_context pipe;
-   struct brw_winsys *winsys;
-
-   unsigned primitive;
-   unsigned reduced_primitive;
-
-   boolean emit_state_always;
-
-   struct {
-      struct brw_state_flags dirty;
-   } state;
-
-
-   struct {
-      const struct pipe_blend_state         *Blend;
-      const struct pipe_depth_stencil_alpha_state *DepthStencil;
-      const struct pipe_poly_stipple        *PolygonStipple;
-      const struct pipe_rasterizer_state    *Raster;
-      const struct pipe_sampler_state       *Samplers[PIPE_MAX_SAMPLERS];
-      const struct brw_vertex_program       *VertexProgram;
-      const struct brw_fragment_program     *FragmentProgram;
-
-      struct pipe_clip_state          Clip;
-      struct pipe_blend_color         BlendColor;
-      struct pipe_scissor_state       Scissor;
-      struct pipe_viewport_state      Viewport;
-      struct pipe_framebuffer_state   FrameBuffer;
-
-      const struct pipe_constant_buffer *Constants[2];
-      const struct brw_texture          *Texture[PIPE_MAX_SAMPLERS];
-   } attribs;
-
-   unsigned num_samplers;
-   unsigned num_textures;
-
-   struct brw_mem_pool pool[BRW_MAX_POOL];
-   struct brw_cache cache[BRW_MAX_CACHE];
-   struct brw_cached_batch_item *cached_batch_items;
-
-   struct {
-
-      /* Arrays with buffer objects to copy non-bufferobj arrays into
-       * for upload:
-       */
-      const struct pipe_vertex_buffer *vbo_array[PIPE_MAX_ATTRIBS];
-
-      struct brw_vertex_element_state inputs[PIPE_MAX_ATTRIBS];
-
-#define BRW_NR_UPLOAD_BUFS 17
-#define BRW_UPLOAD_INIT_SIZE (128*1024)
-
-      /* Summary of size and varying of active arrays, so we can check
-       * for changes to this state:
-       */
-      struct brw_vertex_info info;
-   } vb;
-
-
-   unsigned hardware_dirty;
-   unsigned dirty;
-   unsigned pci_id;
-   /* BRW_NEW_URB_ALLOCATIONS:
-    */
-   struct {
-      unsigned vsize;		/* vertex size plus header in urb registers */
-      unsigned csize;		/* constant buffer size in urb registers */
-      unsigned sfsize;		/* setup data size in urb registers */
-
-      boolean constrained;
-
-      unsigned nr_vs_entries;
-      unsigned nr_gs_entries;
-      unsigned nr_clip_entries;
-      unsigned nr_sf_entries;
-      unsigned nr_cs_entries;
-
-/*       unsigned vs_size; */
-/*       unsigned gs_size; */
-/*       unsigned clip_size; */
-/*       unsigned sf_size; */
-/*       unsigned cs_size; */
-
-      unsigned vs_start;
-      unsigned gs_start;
-      unsigned clip_start;
-      unsigned sf_start;
-      unsigned cs_start;
-   } urb;
-
-
-   /* BRW_NEW_CURBE_OFFSETS:
-    */
-   struct {
-      unsigned wm_start;
-      unsigned wm_size;
-      unsigned clip_start;
-      unsigned clip_size;
-      unsigned vs_start;
-      unsigned vs_size;
-      unsigned total_size;
-
-      unsigned gs_offset;
-
-      float *last_buf;
-      unsigned last_bufsz;
-   } curbe;
-
-   struct {
-      struct brw_vs_prog_data *prog_data;
-
-      unsigned prog_gs_offset;
-      unsigned state_gs_offset;
-   } vs;
-
-   struct {
-      struct brw_gs_prog_data *prog_data;
-
-      boolean prog_active;
-      unsigned prog_gs_offset;
-      unsigned state_gs_offset;
-   } gs;
-
-   struct {
-      struct brw_clip_prog_data *prog_data;
-
-      unsigned prog_gs_offset;
-      unsigned vp_gs_offset;
-      unsigned state_gs_offset;
-   } clip;
-
-
-   struct {
-      struct brw_sf_prog_data *prog_data;
-
-      struct pipe_setup_linkage linkage;
-
-      unsigned prog_gs_offset;
-      unsigned vp_gs_offset;
-      unsigned state_gs_offset;
-   } sf;
-
-   struct {
-      struct brw_wm_prog_data *prog_data;
-
-//      struct brw_wm_compiler *compile_data;
-
-
-      /**
-       * Array of sampler state uploaded at sampler_gs_offset of BRW_SAMPLER
-       * cache
-       */
-      struct brw_sampler_state sampler[BRW_MAX_TEX_UNIT];
-
-      unsigned render_surf;
-      unsigned nr_surfaces;
-
-      unsigned max_threads;
-      struct pipe_buffer *scratch_buffer;
-      unsigned scratch_buffer_size;
-
-      unsigned sampler_count;
-      unsigned sampler_gs_offset;
-
-      struct brw_surface_binding_table bind;
-      unsigned bind_ss_offset;
-
-      unsigned prog_gs_offset;
-      unsigned state_gs_offset;
-   } wm;
-
-
-   struct {
-      unsigned vp_gs_offset;
-      unsigned state_gs_offset;
-   } cc;
-
-
-   /* Used to give every program string a unique id
-    */
-   unsigned program_id;
-};
-
-
-#define BRW_PACKCOLOR8888(r,g,b,a)  ((r<<24) | (g<<16) | (b<<8) | a)
-
-
-/*======================================================================
- * brw_vtbl.c
- */
-void brw_do_flush( struct brw_context *brw,
-		   unsigned flags );
-
-
-/*======================================================================
- * brw_state.c
- */
-void brw_validate_state(struct brw_context *brw);
-void brw_init_state(struct brw_context *brw);
-void brw_destroy_state(struct brw_context *brw);
-
-
-/*======================================================================
- * brw_tex.c
- */
-void brwUpdateTextureState( struct brw_context *brw );
-
-
-/* brw_urb.c
- */
-void brw_upload_urb_fence(struct brw_context *brw);
-
-void brw_upload_constant_buffer_state(struct brw_context *brw);
-
-void brw_init_surface_functions(struct brw_context *brw);
-void brw_init_state_functions(struct brw_context *brw);
-void brw_init_flush_functions(struct brw_context *brw);
-void brw_init_string_functions(struct brw_context *brw);
-
-/*======================================================================
- * Inline conversion functions.  These are better-typed than the
- * macros used previously:
- */
-static inline struct brw_context *
-brw_context( struct pipe_context *ctx )
-{
-   return (struct brw_context *)ctx;
-}
-
-#endif
-
diff --git a/src/gallium/drivers/i965simple/brw_curbe.c b/src/gallium/drivers/i965simple/brw_curbe.c
deleted file mode 100644
index 904cde8e301..00000000000
--- a/src/gallium/drivers/i965simple/brw_curbe.c
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_state.h"
-#include "brw_batch.h"
-#include "brw_util.h"
-#include "brw_wm.h"
-#include "pipe/p_state.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-#define FILE_DEBUG_FLAG DEBUG_FALLBACKS
-
-/* Partition the CURBE between the various users of constant values:
- */
-static void calculate_curbe_offsets( struct brw_context *brw )
-{
-   /* CACHE_NEW_WM_PROG */
-   unsigned nr_fp_regs = align(brw->wm.prog_data->max_const, 16);
-
-   /* BRW_NEW_VERTEX_PROGRAM */
-   unsigned nr_vp_regs = align(brw->vs.prog_data->max_const, 16);
-   unsigned nr_clip_regs = 0;
-   unsigned total_regs;
-
-#if 0
-   /* BRW_NEW_CLIP ? */
-   if (brw->attribs.Transform->ClipPlanesEnabled) {
-      unsigned nr_planes = 6 + brw_count_bits(brw->attribs.Transform->ClipPlanesEnabled);
-      nr_clip_regs = align(nr_planes * 4, 16);
-   }
-#endif
-
-
-   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
-
-   /* This can happen - what to do?  Probably rather than falling
-    * back, the best thing to do is emit programs which code the
-    * constants as immediate values.  Could do this either as a static
-    * cap on WM and VS, or adaptively.
-    *
-    * Unfortunately, this is currently dependent on the results of the
-    * program generation process (in the case of wm), so this would
-    * introduce the need to re-generate programs in the event of a
-    * curbe allocation failure.
-    */
-   /* Max size is 32 - just large enough to
-    * hold the 128 parameters allowed by
-    * the fragment and vertex program
-    * api's.  It's not clear what happens
-    * when both VP and FP want to use 128
-    * parameters, though.
-    */
-   assert(total_regs <= 32);
-
-   /* Lazy resize:
-    */
-   if (nr_fp_regs > brw->curbe.wm_size ||
-       nr_vp_regs > brw->curbe.vs_size ||
-       nr_clip_regs != brw->curbe.clip_size ||
-       (total_regs < brw->curbe.total_size / 4 &&
-	brw->curbe.total_size > 16)) {
-
-      unsigned reg = 0;
-
-      /* Calculate a new layout:
-       */
-      reg = 0;
-      brw->curbe.wm_start = reg;
-      brw->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
-      brw->curbe.clip_start = reg;
-      brw->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
-      brw->curbe.vs_start = reg;
-      brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
-      brw->curbe.total_size = reg;
-
-#if 0
-      if (0)
-	 DBG("curbe wm %d+%d clip %d+%d vs %d+%d\n",
-		      brw->curbe.wm_start,
-		      brw->curbe.wm_size,
-		      brw->curbe.clip_start,
-		      brw->curbe.clip_size,
-		      brw->curbe.vs_start,
-		      brw->curbe.vs_size );
-#endif
-
-      brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS;
-   }
-}
-
-
-const struct brw_tracked_state brw_curbe_offsets = {
-   .dirty = {
-      .brw  = (BRW_NEW_CLIP |
-	       BRW_NEW_VS),
-      .cache = CACHE_NEW_WM_PROG
-   },
-   .update = calculate_curbe_offsets
-};
-
-
-
-/* Define the number of curbes within CS's urb allocation.  Multiple
- * urb entries -> multiple curbes.  These will be used by
- * fixed-function hardware in a double-buffering scheme to avoid a
- * pipeline stall each time the contents of the curbe is changed.
- */
-void brw_upload_constant_buffer_state(struct brw_context *brw)
-{
-   struct brw_constant_buffer_state cbs;
-   memset(&cbs, 0, sizeof(cbs));
-
-   /* It appears that this is the state packet for the CS unit, ie. the
-    * urb entries detailed here are housed in the CS range from the
-    * URB_FENCE command.
-    */
-   cbs.header.opcode = CMD_CONST_BUFFER_STATE;
-   cbs.header.length = sizeof(cbs)/4 - 2;
-
-   /* BRW_NEW_URB_FENCE */
-   cbs.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
-   cbs.bits0.urb_entry_size = brw->urb.csize - 1;
-
-   assert(brw->urb.nr_cs_entries);
-   BRW_CACHED_BATCH_STRUCT(brw, &cbs);
-}
-
-
-static float fixed_plane[6][4] = {
-   { 0,    0,   -1, 1 },
-   { 0,    0,    1, 1 },
-   { 0,   -1,    0, 1 },
-   { 0,    1,    0, 1 },
-   {-1,    0,    0, 1 },
-   { 1,    0,    0, 1 }
-};
-
-/* Upload a new set of constants.  Too much variability to go into the
- * cache mechanism, but maybe would benefit from a comparison against
- * the current uploaded set of constants.
- */
-static void upload_constant_buffer(struct brw_context *brw)
-{
-   struct brw_mem_pool *pool = &brw->pool[BRW_GS_POOL];
-   unsigned sz = brw->curbe.total_size;
-   unsigned bufsz = sz * sizeof(float);
-   float *buf;
-   unsigned i;
-
-
-   if (sz == 0) {
-      struct brw_constant_buffer cb;
-      cb.header.opcode = CMD_CONST_BUFFER;
-      cb.header.length = sizeof(cb)/4 - 2;
-      cb.header.valid = 0;
-      cb.bits0.buffer_length = 0;
-      cb.bits0.buffer_address = 0;
-      BRW_BATCH_STRUCT(brw, &cb);
-
-      if (brw->curbe.last_buf) {
-	 free(brw->curbe.last_buf);
-	 brw->curbe.last_buf = NULL;
-	 brw->curbe.last_bufsz  = 0;
-      }
-
-      return;
-   }
-
-   buf = (float *)malloc(bufsz);
-
-   memset(buf, 0, bufsz);
-
-   if (brw->curbe.wm_size) {
-      unsigned offset = brw->curbe.wm_start * 16;
-
-      /* First the constant buffer constants:
-       */
-      
-      /* Then any internally generated constants: 
-       */
-      for (i = 0; i < brw->wm.prog_data->nr_internal_consts; i++)
-	 buf[offset + i] = brw->wm.prog_data->internal_const[i];
-
-      assert(brw->wm.prog_data->max_const == 
-	     brw->wm.prog_data->nr_internal_consts);
-   }
-
-
-   /* The clipplanes are actually delivered to both CLIP and VS units.
-    * VS uses them to calculate the outcode bitmasks.
-    */
-   if (brw->curbe.clip_size) {
-      unsigned offset = brw->curbe.clip_start * 16;
-      unsigned j;
-
-      /* If any planes are going this way, send them all this way:
-       */
-      for (i = 0; i < 6; i++) {
-	 buf[offset + i * 4 + 0] = fixed_plane[i][0];
-	 buf[offset + i * 4 + 1] = fixed_plane[i][1];
-	 buf[offset + i * 4 + 2] = fixed_plane[i][2];
-	 buf[offset + i * 4 + 3] = fixed_plane[i][3];
-      }
-
-      /* Clip planes: BRW_NEW_CLIP:
-       */
-      for (j = 0; j < brw->attribs.Clip.nr; j++) {
-	 buf[offset + i * 4 + 0] = brw->attribs.Clip.ucp[j][0];
-	 buf[offset + i * 4 + 1] = brw->attribs.Clip.ucp[j][1];
-	 buf[offset + i * 4 + 2] = brw->attribs.Clip.ucp[j][2];
-	 buf[offset + i * 4 + 3] = brw->attribs.Clip.ucp[j][3];
-	 i++;
-      }
-   }
-
-
-   if (brw->curbe.vs_size) {
-      unsigned offset = brw->curbe.vs_start * 16;
-      /*unsigned nr = vp->max_const;*/
-      const struct pipe_constant_buffer *cbuffer = brw->attribs.Constants[0];
-      struct pipe_winsys *ws = brw->pipe.winsys;
-      /* FIXME: buffer size is num_consts + num_immediates */
-      if (brw->vs.prog_data->num_consts) {
-         /* map the vertex constant buffer and copy to curbe: */
-         void *data = ws->buffer_map(ws, cbuffer->buffer, 0);
-         /* FIXME: this is wrong. the cbuffer->buffer->size currently
-          * represents size of consts + immediates. so if we'll
-          * have both we'll copy over the end of the buffer
-          * with the subsequent memcpy */
-         memcpy(&buf[offset], data, cbuffer->buffer->size);
-         ws->buffer_unmap(ws, cbuffer->buffer);
-         offset += cbuffer->buffer->size;
-      }
-      /*immediates*/
-      if (brw->vs.prog_data->num_imm) {
-         memcpy(&buf[offset], brw->vs.prog_data->imm_buf,
-                brw->vs.prog_data->num_imm * 4 * sizeof(float));
-      }
-   }
-
-   if (1) {
-      for (i = 0; i < sz; i+=4)
-	 debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
-		      buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
-
-      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
-		   brw->curbe.last_buf, buf,
-		   bufsz, brw->curbe.last_bufsz,
-		   brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
-   }
-
-   if (brw->curbe.last_buf &&
-       bufsz == brw->curbe.last_bufsz &&
-       memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
-      free(buf);
-/*       return; */
-   }
-   else {
-      if (brw->curbe.last_buf)
-	 free(brw->curbe.last_buf);
-      brw->curbe.last_buf = buf;
-      brw->curbe.last_bufsz = bufsz;
-
-
-      if (!brw_pool_alloc(pool,
-			  bufsz,
-			  1 << 6,
-			  &brw->curbe.gs_offset)) {
-	 debug_printf("out of GS memory for curbe\n");
-	 assert(0);
-	 return;
-      }
-
-
-      /* Copy data to the buffer:
-       */
-      brw->winsys->buffer_subdata_typed(brw->winsys,
-					pool->buffer, 
-					brw->curbe.gs_offset, 
-					bufsz, 
-					buf,
-					BRW_CONSTANT_BUFFER );
-   }
-
-   /* TODO: only emit the constant_buffer packet when necessary, ie:
-      - contents have changed
-      - offset has changed
-      - hw requirements due to other packets emitted.
-   */
-   {
-      struct brw_constant_buffer cb;
-
-      memset(&cb, 0, sizeof(cb));
-
-      cb.header.opcode = CMD_CONST_BUFFER;
-      cb.header.length = sizeof(cb)/4 - 2;
-      cb.header.valid = 1;
-      cb.bits0.buffer_length = sz - 1;
-      cb.bits0.buffer_address = brw->curbe.gs_offset >> 6;
-
-      /* Because this provokes an action (ie copy the constants into the
-       * URB), it shouldn't be shortcircuited if identical to the
-       * previous time - because eg. the urb destination may have
-       * changed, or the urb contents different to last time.
-       *
-       * Note that the data referred to is actually copied internally,
-       * not just used in place according to passed pointer.
-       *
-       * It appears that the CS unit takes care of using each available
-       * URB entry (Const URB Entry == CURBE) in turn, and issuing
-       * flushes as necessary when doublebuffering of CURBEs isn't
-       * possible.
-       */
-      BRW_BATCH_STRUCT(brw, &cb);
-   }
-}
-
-/* This tracked state is unique in that the state it monitors varies
- * dynamically depending on the parameters tracked by the fragment and
- * vertex programs.  This is the template used as a starting point,
- * each context will maintain a copy of this internally and update as
- * required.
- */
-const struct brw_tracked_state brw_constant_buffer = {
-   .dirty = {
-      .brw  = (BRW_NEW_CLIP |
-	       BRW_NEW_CONSTANTS |
-	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
-	       BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
-	       BRW_NEW_CURBE_OFFSETS),
-      .cache = (CACHE_NEW_WM_PROG)
-   },
-   .update = upload_constant_buffer
-};
-
diff --git a/src/gallium/drivers/i965simple/brw_defines.h b/src/gallium/drivers/i965simple/brw_defines.h
deleted file mode 100644
index 715d2d2d011..00000000000
--- a/src/gallium/drivers/i965simple/brw_defines.h
+++ /dev/null
@@ -1,870 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#ifndef BRW_DEFINES_H
-#define BRW_DEFINES_H
-
-/*
- */
-#define MI_NOOP                              0x00
-#define MI_USER_INTERRUPT                    0x02
-#define MI_WAIT_FOR_EVENT                    0x03
-#define MI_FLUSH                             0x04
-#define MI_REPORT_HEAD                       0x07
-#define MI_ARB_ON_OFF                        0x08
-#define MI_BATCH_BUFFER_END                  0x0A
-#define MI_OVERLAY_FLIP                      0x11
-#define MI_LOAD_SCAN_LINES_INCL              0x12
-#define MI_LOAD_SCAN_LINES_EXCL              0x13
-#define MI_DISPLAY_BUFFER_INFO               0x14
-#define MI_SET_CONTEXT                       0x18
-#define MI_STORE_DATA_IMM                    0x20
-#define MI_STORE_DATA_INDEX                  0x21
-#define MI_LOAD_REGISTER_IMM                 0x22
-#define MI_STORE_REGISTER_MEM                0x24
-#define MI_BATCH_BUFFER_START                0x31
-
-#define MI_SYNCHRONOUS_FLIP                  0x0
-#define MI_ASYNCHRONOUS_FLIP                 0x1
-
-#define MI_BUFFER_SECURE                     0x0
-#define MI_BUFFER_NONSECURE                  0x1
-
-#define MI_ARBITRATE_AT_CHAIN_POINTS         0x0
-#define MI_ARBITRATE_BETWEEN_INSTS           0x1
-#define MI_NO_ARBITRATION                    0x3
-
-#define MI_CONDITION_CODE_WAIT_DISABLED      0x0
-#define MI_CONDITION_CODE_WAIT_0             0x1
-#define MI_CONDITION_CODE_WAIT_1             0x2
-#define MI_CONDITION_CODE_WAIT_2             0x3
-#define MI_CONDITION_CODE_WAIT_3             0x4
-#define MI_CONDITION_CODE_WAIT_4             0x5
-
-#define MI_DISPLAY_PIPE_A                    0x0
-#define MI_DISPLAY_PIPE_B                    0x1
-
-#define MI_DISPLAY_PLANE_A                   0x0
-#define MI_DISPLAY_PLANE_B                   0x1
-#define MI_DISPLAY_PLANE_C                   0x2
-
-#define MI_STANDARD_FLIP                                 0x0
-#define MI_ENQUEUE_FLIP_PERFORM_BASE_FRAME_NUMBER_LOAD   0x1
-#define MI_ENQUEUE_FLIP_TARGET_FRAME_NUMBER_RELATIVE     0x2
-#define MI_ENQUEUE_FLIP_ABSOLUTE_TARGET_FRAME_NUMBER     0x3
-
-#define MI_PHYSICAL_ADDRESS                  0x0
-#define MI_VIRTUAL_ADDRESS                   0x1
-
-#define MI_BUFFER_MEMORY_MAIN                0x0
-#define MI_BUFFER_MEMORY_GTT                 0x2
-#define MI_BUFFER_MEMORY_PER_PROCESS_GTT     0x3
-
-#define MI_FLIP_CONTINUE                     0x0
-#define MI_FLIP_ON                           0x1
-#define MI_FLIP_OFF                          0x2
-
-#define MI_UNTRUSTED_REGISTER_SPACE          0x0
-#define MI_TRUSTED_REGISTER_SPACE            0x1
-
-/* 3D state:
- */
-#define _3DOP_3DSTATE_PIPELINED       0x0
-#define _3DOP_3DSTATE_NONPIPELINED    0x1
-#define _3DOP_3DCONTROL               0x2
-#define _3DOP_3DPRIMITIVE             0x3
-
-#define _3DSTATE_PIPELINED_POINTERS       0x00
-#define _3DSTATE_BINDING_TABLE_POINTERS   0x01
-#define _3DSTATE_VERTEX_BUFFERS           0x08
-#define _3DSTATE_VERTEX_ELEMENTS          0x09
-#define _3DSTATE_INDEX_BUFFER             0x0A
-#define _3DSTATE_VF_STATISTICS            0x0B
-#define _3DSTATE_DRAWING_RECTANGLE            0x00
-#define _3DSTATE_CONSTANT_COLOR               0x01
-#define _3DSTATE_SAMPLER_PALETTE_LOAD         0x02
-#define _3DSTATE_CHROMA_KEY                   0x04
-#define _3DSTATE_DEPTH_BUFFER                 0x05
-#define _3DSTATE_POLY_STIPPLE_OFFSET          0x06
-#define _3DSTATE_POLY_STIPPLE_PATTERN         0x07
-#define _3DSTATE_LINE_STIPPLE                 0x08
-#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP    0x09
-#define _3DCONTROL    0x00
-#define _3DPRIMITIVE  0x00
-
-#define PIPE_CONTROL_NOWRITE          0x00
-#define PIPE_CONTROL_WRITEIMMEDIATE   0x01
-#define PIPE_CONTROL_WRITEDEPTH       0x02
-#define PIPE_CONTROL_WRITETIMESTAMP   0x03
-
-#define PIPE_CONTROL_GTTWRITE_PROCESS_LOCAL 0x00
-#define PIPE_CONTROL_GTTWRITE_GLOBAL        0x01
-
-#define _3DPRIM_POINTLIST         0x01
-#define _3DPRIM_LINELIST          0x02
-#define _3DPRIM_LINESTRIP         0x03
-#define _3DPRIM_TRILIST           0x04
-#define _3DPRIM_TRISTRIP          0x05
-#define _3DPRIM_TRIFAN            0x06
-#define _3DPRIM_QUADLIST          0x07
-#define _3DPRIM_QUADSTRIP         0x08
-#define _3DPRIM_LINELIST_ADJ      0x09
-#define _3DPRIM_LINESTRIP_ADJ     0x0A
-#define _3DPRIM_TRILIST_ADJ       0x0B
-#define _3DPRIM_TRISTRIP_ADJ      0x0C
-#define _3DPRIM_TRISTRIP_REVERSE  0x0D
-#define _3DPRIM_POLYGON           0x0E
-#define _3DPRIM_RECTLIST          0x0F
-#define _3DPRIM_LINELOOP          0x10
-#define _3DPRIM_POINTLIST_BF      0x11
-#define _3DPRIM_LINESTRIP_CONT    0x12
-#define _3DPRIM_LINESTRIP_BF      0x13
-#define _3DPRIM_LINESTRIP_CONT_BF 0x14
-#define _3DPRIM_TRIFAN_NOSTIPPLE  0x15
-
-#define _3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL 0
-#define _3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     1
-
-#define BRW_ANISORATIO_2     0
-#define BRW_ANISORATIO_4     1
-#define BRW_ANISORATIO_6     2
-#define BRW_ANISORATIO_8     3
-#define BRW_ANISORATIO_10    4
-#define BRW_ANISORATIO_12    5
-#define BRW_ANISORATIO_14    6
-#define BRW_ANISORATIO_16    7
-
-#define BRW_BLENDFACTOR_ONE                 0x1
-#define BRW_BLENDFACTOR_SRC_COLOR           0x2
-#define BRW_BLENDFACTOR_SRC_ALPHA           0x3
-#define BRW_BLENDFACTOR_DST_ALPHA           0x4
-#define BRW_BLENDFACTOR_DST_COLOR           0x5
-#define BRW_BLENDFACTOR_SRC_ALPHA_SATURATE  0x6
-#define BRW_BLENDFACTOR_CONST_COLOR         0x7
-#define BRW_BLENDFACTOR_CONST_ALPHA         0x8
-#define BRW_BLENDFACTOR_SRC1_COLOR          0x9
-#define BRW_BLENDFACTOR_SRC1_ALPHA          0x0A
-#define BRW_BLENDFACTOR_ZERO                0x11
-#define BRW_BLENDFACTOR_INV_SRC_COLOR       0x12
-#define BRW_BLENDFACTOR_INV_SRC_ALPHA       0x13
-#define BRW_BLENDFACTOR_INV_DST_ALPHA       0x14
-#define BRW_BLENDFACTOR_INV_DST_COLOR       0x15
-#define BRW_BLENDFACTOR_INV_CONST_COLOR     0x17
-#define BRW_BLENDFACTOR_INV_CONST_ALPHA     0x18
-#define BRW_BLENDFACTOR_INV_SRC1_COLOR      0x19
-#define BRW_BLENDFACTOR_INV_SRC1_ALPHA      0x1A
-
-#define BRW_BLENDFUNCTION_ADD               0
-#define BRW_BLENDFUNCTION_SUBTRACT          1
-#define BRW_BLENDFUNCTION_REVERSE_SUBTRACT  2
-#define BRW_BLENDFUNCTION_MIN               3
-#define BRW_BLENDFUNCTION_MAX               4
-
-#define BRW_ALPHATEST_FORMAT_UNORM8         0
-#define BRW_ALPHATEST_FORMAT_FLOAT32        1
-
-#define BRW_CHROMAKEY_KILL_ON_ANY_MATCH  0
-#define BRW_CHROMAKEY_REPLACE_BLACK      1
-
-#define BRW_CLIP_API_OGL     0
-#define BRW_CLIP_API_DX      1
-
-#define BRW_CLIPMODE_NORMAL              0
-#define BRW_CLIPMODE_CLIP_ALL            1
-#define BRW_CLIPMODE_CLIP_NON_REJECTED   2
-#define BRW_CLIPMODE_REJECT_ALL          3
-#define BRW_CLIPMODE_ACCEPT_ALL          4
-
-#define BRW_CLIP_NDCSPACE     0
-#define BRW_CLIP_SCREENSPACE  1
-
-#define BRW_COMPAREFUNCTION_ALWAYS       0
-#define BRW_COMPAREFUNCTION_NEVER        1
-#define BRW_COMPAREFUNCTION_LESS         2
-#define BRW_COMPAREFUNCTION_EQUAL        3
-#define BRW_COMPAREFUNCTION_LEQUAL       4
-#define BRW_COMPAREFUNCTION_GREATER      5
-#define BRW_COMPAREFUNCTION_NOTEQUAL     6
-#define BRW_COMPAREFUNCTION_GEQUAL       7
-
-#define BRW_COVERAGE_PIXELS_HALF     0
-#define BRW_COVERAGE_PIXELS_1        1
-#define BRW_COVERAGE_PIXELS_2        2
-#define BRW_COVERAGE_PIXELS_4        3
-
-#define BRW_CULLMODE_BOTH        0
-#define BRW_CULLMODE_NONE        1
-#define BRW_CULLMODE_FRONT       2
-#define BRW_CULLMODE_BACK        3
-
-#define BRW_DEFAULTCOLOR_R8G8B8A8_UNORM      0
-#define BRW_DEFAULTCOLOR_R32G32B32A32_FLOAT  1
-
-#define BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT     0
-#define BRW_DEPTHFORMAT_D32_FLOAT                1
-#define BRW_DEPTHFORMAT_D24_UNORM_S8_UINT        2
-#define BRW_DEPTHFORMAT_D16_UNORM                5
-
-#define BRW_FLOATING_POINT_IEEE_754        0
-#define BRW_FLOATING_POINT_NON_IEEE_754    1
-
-#define BRW_FRONTWINDING_CW      0
-#define BRW_FRONTWINDING_CCW     1
-
-#define BRW_SPRITE_POINT_ENABLE  16
-
-#define BRW_INDEX_BYTE     0
-#define BRW_INDEX_WORD     1
-#define BRW_INDEX_DWORD    2
-
-#define BRW_LOGICOPFUNCTION_CLEAR            0
-#define BRW_LOGICOPFUNCTION_NOR              1
-#define BRW_LOGICOPFUNCTION_AND_INVERTED     2
-#define BRW_LOGICOPFUNCTION_COPY_INVERTED    3
-#define BRW_LOGICOPFUNCTION_AND_REVERSE      4
-#define BRW_LOGICOPFUNCTION_INVERT           5
-#define BRW_LOGICOPFUNCTION_XOR              6
-#define BRW_LOGICOPFUNCTION_NAND             7
-#define BRW_LOGICOPFUNCTION_AND              8
-#define BRW_LOGICOPFUNCTION_EQUIV            9
-#define BRW_LOGICOPFUNCTION_NOOP             10
-#define BRW_LOGICOPFUNCTION_OR_INVERTED      11
-#define BRW_LOGICOPFUNCTION_COPY             12
-#define BRW_LOGICOPFUNCTION_OR_REVERSE       13
-#define BRW_LOGICOPFUNCTION_OR               14
-#define BRW_LOGICOPFUNCTION_SET              15
-
-#define BRW_MAPFILTER_NEAREST        0x0
-#define BRW_MAPFILTER_LINEAR         0x1
-#define BRW_MAPFILTER_ANISOTROPIC    0x2
-
-#define BRW_MIPFILTER_NONE        0
-#define BRW_MIPFILTER_NEAREST     1
-#define BRW_MIPFILTER_LINEAR      3
-
-#define BRW_POLYGON_FRONT_FACING     0
-#define BRW_POLYGON_BACK_FACING      1
-
-#define BRW_PREFILTER_ALWAYS     0x0
-#define BRW_PREFILTER_NEVER      0x1
-#define BRW_PREFILTER_LESS       0x2
-#define BRW_PREFILTER_EQUAL      0x3
-#define BRW_PREFILTER_LEQUAL     0x4
-#define BRW_PREFILTER_GREATER    0x5
-#define BRW_PREFILTER_NOTEQUAL   0x6
-#define BRW_PREFILTER_GEQUAL     0x7
-
-#define BRW_PROVOKING_VERTEX_0    0
-#define BRW_PROVOKING_VERTEX_1    1
-#define BRW_PROVOKING_VERTEX_2    2
-
-#define BRW_RASTRULE_UPPER_LEFT  0
-#define BRW_RASTRULE_UPPER_RIGHT 1
-/* These are listed as "Reserved, but not seen as useful"
- * in Intel documentation (page 212, "Point Rasterization Rule",
- * section 7.4 "SF Pipeline State Summary", of document
- * "Intel® 965 Express Chipset Family and Intel® G35 Express
- * Chipset Graphics Controller Programmer's Reference Manual,
- * Volume 2: 3D/Media", Revision 1.0b as of January 2008,
- * available at 
- *     http://intellinuxgraphics.org/documentation.html
- * at the time of this writing).
- *
- * These appear to be supported on at least some
- * i965-family devices, and the BRW_RASTRULE_LOWER_RIGHT
- * is useful when using OpenGL to render to a FBO
- * (which has the pixel coordinate Y orientation inverted
- * with respect to the normal OpenGL pixel coordinate system).
- */
-#define BRW_RASTRULE_LOWER_LEFT  2
-#define BRW_RASTRULE_LOWER_RIGHT 3
-
-#define BRW_RENDERTARGET_CLAMPRANGE_UNORM    0
-#define BRW_RENDERTARGET_CLAMPRANGE_SNORM    1
-#define BRW_RENDERTARGET_CLAMPRANGE_FORMAT   2
-
-#define BRW_STENCILOP_KEEP               0
-#define BRW_STENCILOP_ZERO               1
-#define BRW_STENCILOP_REPLACE            2
-#define BRW_STENCILOP_INCRSAT            3
-#define BRW_STENCILOP_DECRSAT            4
-#define BRW_STENCILOP_INCR               5
-#define BRW_STENCILOP_DECR               6
-#define BRW_STENCILOP_INVERT             7
-
-#define BRW_SURFACE_MIPMAPLAYOUT_BELOW   0
-#define BRW_SURFACE_MIPMAPLAYOUT_RIGHT   1
-
-#define BRW_SURFACEFORMAT_R32G32B32A32_FLOAT             0x000
-#define BRW_SURFACEFORMAT_R32G32B32A32_SINT              0x001
-#define BRW_SURFACEFORMAT_R32G32B32A32_UINT              0x002
-#define BRW_SURFACEFORMAT_R32G32B32A32_UNORM             0x003
-#define BRW_SURFACEFORMAT_R32G32B32A32_SNORM             0x004
-#define BRW_SURFACEFORMAT_R64G64_FLOAT                   0x005
-#define BRW_SURFACEFORMAT_R32G32B32X32_FLOAT             0x006
-#define BRW_SURFACEFORMAT_R32G32B32A32_SSCALED           0x007
-#define BRW_SURFACEFORMAT_R32G32B32A32_USCALED           0x008
-#define BRW_SURFACEFORMAT_R32G32B32_FLOAT                0x040
-#define BRW_SURFACEFORMAT_R32G32B32_SINT                 0x041
-#define BRW_SURFACEFORMAT_R32G32B32_UINT                 0x042
-#define BRW_SURFACEFORMAT_R32G32B32_UNORM                0x043
-#define BRW_SURFACEFORMAT_R32G32B32_SNORM                0x044
-#define BRW_SURFACEFORMAT_R32G32B32_SSCALED              0x045
-#define BRW_SURFACEFORMAT_R32G32B32_USCALED              0x046
-#define BRW_SURFACEFORMAT_R16G16B16A16_UNORM             0x080
-#define BRW_SURFACEFORMAT_R16G16B16A16_SNORM             0x081
-#define BRW_SURFACEFORMAT_R16G16B16A16_SINT              0x082
-#define BRW_SURFACEFORMAT_R16G16B16A16_UINT              0x083
-#define BRW_SURFACEFORMAT_R16G16B16A16_FLOAT             0x084
-#define BRW_SURFACEFORMAT_R32G32_FLOAT                   0x085
-#define BRW_SURFACEFORMAT_R32G32_SINT                    0x086
-#define BRW_SURFACEFORMAT_R32G32_UINT                    0x087
-#define BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS       0x088
-#define BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT        0x089
-#define BRW_SURFACEFORMAT_L32A32_FLOAT                   0x08A
-#define BRW_SURFACEFORMAT_R32G32_UNORM                   0x08B
-#define BRW_SURFACEFORMAT_R32G32_SNORM                   0x08C
-#define BRW_SURFACEFORMAT_R64_FLOAT                      0x08D
-#define BRW_SURFACEFORMAT_R16G16B16X16_UNORM             0x08E
-#define BRW_SURFACEFORMAT_R16G16B16X16_FLOAT             0x08F
-#define BRW_SURFACEFORMAT_A32X32_FLOAT                   0x090
-#define BRW_SURFACEFORMAT_L32X32_FLOAT                   0x091
-#define BRW_SURFACEFORMAT_I32X32_FLOAT                   0x092
-#define BRW_SURFACEFORMAT_R16G16B16A16_SSCALED           0x093
-#define BRW_SURFACEFORMAT_R16G16B16A16_USCALED           0x094
-#define BRW_SURFACEFORMAT_R32G32_SSCALED                 0x095
-#define BRW_SURFACEFORMAT_R32G32_USCALED                 0x096
-#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM                 0x0C0
-#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB            0x0C1
-#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM              0x0C2
-#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB         0x0C3
-#define BRW_SURFACEFORMAT_R10G10B10A2_UINT               0x0C4
-#define BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM       0x0C5
-#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM                 0x0C7
-#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB            0x0C8
-#define BRW_SURFACEFORMAT_R8G8B8A8_SNORM                 0x0C9
-#define BRW_SURFACEFORMAT_R8G8B8A8_SINT                  0x0CA
-#define BRW_SURFACEFORMAT_R8G8B8A8_UINT                  0x0CB
-#define BRW_SURFACEFORMAT_R16G16_UNORM                   0x0CC
-#define BRW_SURFACEFORMAT_R16G16_SNORM                   0x0CD
-#define BRW_SURFACEFORMAT_R16G16_SINT                    0x0CE
-#define BRW_SURFACEFORMAT_R16G16_UINT                    0x0CF
-#define BRW_SURFACEFORMAT_R16G16_FLOAT                   0x0D0
-#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM              0x0D1
-#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB         0x0D2
-#define BRW_SURFACEFORMAT_R11G11B10_FLOAT                0x0D3
-#define BRW_SURFACEFORMAT_R32_SINT                       0x0D6
-#define BRW_SURFACEFORMAT_R32_UINT                       0x0D7
-#define BRW_SURFACEFORMAT_R32_FLOAT                      0x0D8
-#define BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS          0x0D9
-#define BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT           0x0DA
-#define BRW_SURFACEFORMAT_L16A16_UNORM                   0x0DF
-#define BRW_SURFACEFORMAT_I24X8_UNORM                    0x0E0
-#define BRW_SURFACEFORMAT_L24X8_UNORM                    0x0E1
-#define BRW_SURFACEFORMAT_A24X8_UNORM                    0x0E2
-#define BRW_SURFACEFORMAT_I32_FLOAT                      0x0E3
-#define BRW_SURFACEFORMAT_L32_FLOAT                      0x0E4
-#define BRW_SURFACEFORMAT_A32_FLOAT                      0x0E5
-#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM                 0x0E9
-#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB            0x0EA
-#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM                 0x0EB
-#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB            0x0EC
-#define BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP             0x0ED
-#define BRW_SURFACEFORMAT_B10G10R10X2_UNORM              0x0EE
-#define BRW_SURFACEFORMAT_L16A16_FLOAT                   0x0F0
-#define BRW_SURFACEFORMAT_R32_UNORM                      0x0F1
-#define BRW_SURFACEFORMAT_R32_SNORM                      0x0F2
-#define BRW_SURFACEFORMAT_R10G10B10X2_USCALED            0x0F3
-#define BRW_SURFACEFORMAT_R8G8B8A8_SSCALED               0x0F4
-#define BRW_SURFACEFORMAT_R8G8B8A8_USCALED               0x0F5
-#define BRW_SURFACEFORMAT_R16G16_SSCALED                 0x0F6
-#define BRW_SURFACEFORMAT_R16G16_USCALED                 0x0F7
-#define BRW_SURFACEFORMAT_R32_SSCALED                    0x0F8
-#define BRW_SURFACEFORMAT_R32_USCALED                    0x0F9
-#define BRW_SURFACEFORMAT_B5G6R5_UNORM                   0x100
-#define BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB              0x101
-#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM                 0x102
-#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB            0x103
-#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM                 0x104
-#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB            0x105
-#define BRW_SURFACEFORMAT_R8G8_UNORM                     0x106
-#define BRW_SURFACEFORMAT_R8G8_SNORM                     0x107
-#define BRW_SURFACEFORMAT_R8G8_SINT                      0x108
-#define BRW_SURFACEFORMAT_R8G8_UINT                      0x109
-#define BRW_SURFACEFORMAT_R16_UNORM                      0x10A
-#define BRW_SURFACEFORMAT_R16_SNORM                      0x10B
-#define BRW_SURFACEFORMAT_R16_SINT                       0x10C
-#define BRW_SURFACEFORMAT_R16_UINT                       0x10D
-#define BRW_SURFACEFORMAT_R16_FLOAT                      0x10E
-#define BRW_SURFACEFORMAT_I16_UNORM                      0x111
-#define BRW_SURFACEFORMAT_L16_UNORM                      0x112
-#define BRW_SURFACEFORMAT_A16_UNORM                      0x113
-#define BRW_SURFACEFORMAT_L8A8_UNORM                     0x114
-#define BRW_SURFACEFORMAT_I16_FLOAT                      0x115
-#define BRW_SURFACEFORMAT_L16_FLOAT                      0x116
-#define BRW_SURFACEFORMAT_A16_FLOAT                      0x117
-#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119
-#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A
-#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB            0x11B
-#define BRW_SURFACEFORMAT_R8G8_SSCALED                   0x11C
-#define BRW_SURFACEFORMAT_R8G8_USCALED                   0x11D
-#define BRW_SURFACEFORMAT_R16_SSCALED                    0x11E
-#define BRW_SURFACEFORMAT_R16_USCALED                    0x11F
-#define BRW_SURFACEFORMAT_R8_UNORM                       0x140
-#define BRW_SURFACEFORMAT_R8_SNORM                       0x141
-#define BRW_SURFACEFORMAT_R8_SINT                        0x142
-#define BRW_SURFACEFORMAT_R8_UINT                        0x143
-#define BRW_SURFACEFORMAT_A8_UNORM                       0x144
-#define BRW_SURFACEFORMAT_I8_UNORM                       0x145
-#define BRW_SURFACEFORMAT_L8_UNORM                       0x146
-#define BRW_SURFACEFORMAT_P4A4_UNORM                     0x147
-#define BRW_SURFACEFORMAT_A4P4_UNORM                     0x148
-#define BRW_SURFACEFORMAT_R8_SSCALED                     0x149
-#define BRW_SURFACEFORMAT_R8_USCALED                     0x14A
-#define BRW_SURFACEFORMAT_R1_UINT                        0x181
-#define BRW_SURFACEFORMAT_YCRCB_NORMAL                   0x182
-#define BRW_SURFACEFORMAT_YCRCB_SWAPUVY                  0x183
-#define BRW_SURFACEFORMAT_BC1_UNORM                      0x186
-#define BRW_SURFACEFORMAT_BC2_UNORM                      0x187
-#define BRW_SURFACEFORMAT_BC3_UNORM                      0x188
-#define BRW_SURFACEFORMAT_BC4_UNORM                      0x189
-#define BRW_SURFACEFORMAT_BC5_UNORM                      0x18A
-#define BRW_SURFACEFORMAT_BC1_UNORM_SRGB                 0x18B
-#define BRW_SURFACEFORMAT_BC2_UNORM_SRGB                 0x18C
-#define BRW_SURFACEFORMAT_BC3_UNORM_SRGB                 0x18D
-#define BRW_SURFACEFORMAT_MONO8                          0x18E
-#define BRW_SURFACEFORMAT_YCRCB_SWAPUV                   0x18F
-#define BRW_SURFACEFORMAT_YCRCB_SWAPY                    0x190
-#define BRW_SURFACEFORMAT_DXT1_RGB                       0x191
-#define BRW_SURFACEFORMAT_FXT1                           0x192
-#define BRW_SURFACEFORMAT_R8G8B8_UNORM                   0x193
-#define BRW_SURFACEFORMAT_R8G8B8_SNORM                   0x194
-#define BRW_SURFACEFORMAT_R8G8B8_SSCALED                 0x195
-#define BRW_SURFACEFORMAT_R8G8B8_USCALED                 0x196
-#define BRW_SURFACEFORMAT_R64G64B64A64_FLOAT             0x197
-#define BRW_SURFACEFORMAT_R64G64B64_FLOAT                0x198
-#define BRW_SURFACEFORMAT_BC4_SNORM                      0x199
-#define BRW_SURFACEFORMAT_BC5_SNORM                      0x19A
-#define BRW_SURFACEFORMAT_R16G16B16_UNORM                0x19C
-#define BRW_SURFACEFORMAT_R16G16B16_SNORM                0x19D
-#define BRW_SURFACEFORMAT_R16G16B16_SSCALED              0x19E
-#define BRW_SURFACEFORMAT_R16G16B16_USCALED              0x19F
-
-#define BRW_SURFACERETURNFORMAT_FLOAT32  0
-#define BRW_SURFACERETURNFORMAT_S1       1
-
-#define BRW_SURFACE_1D      0
-#define BRW_SURFACE_2D      1
-#define BRW_SURFACE_3D      2
-#define BRW_SURFACE_CUBE    3
-#define BRW_SURFACE_BUFFER  4
-#define BRW_SURFACE_NULL    7
-
-#define BRW_TEXCOORDMODE_WRAP            0
-#define BRW_TEXCOORDMODE_MIRROR          1
-#define BRW_TEXCOORDMODE_CLAMP           2
-#define BRW_TEXCOORDMODE_CUBE            3
-#define BRW_TEXCOORDMODE_CLAMP_BORDER    4
-#define BRW_TEXCOORDMODE_MIRROR_ONCE     5
-
-#define BRW_THREAD_PRIORITY_NORMAL   0
-#define BRW_THREAD_PRIORITY_HIGH     1
-
-#define BRW_TILEWALK_XMAJOR                 0
-#define BRW_TILEWALK_YMAJOR                 1
-
-#define BRW_VERTEX_SUBPIXEL_PRECISION_8BITS  0
-#define BRW_VERTEX_SUBPIXEL_PRECISION_4BITS  1
-
-#define BRW_VERTEXBUFFER_ACCESS_VERTEXDATA     0
-#define BRW_VERTEXBUFFER_ACCESS_INSTANCEDATA   1
-
-#define BRW_VFCOMPONENT_NOSTORE      0
-#define BRW_VFCOMPONENT_STORE_SRC    1
-#define BRW_VFCOMPONENT_STORE_0      2
-#define BRW_VFCOMPONENT_STORE_1_FLT  3
-#define BRW_VFCOMPONENT_STORE_1_INT  4
-#define BRW_VFCOMPONENT_STORE_VID    5
-#define BRW_VFCOMPONENT_STORE_IID    6
-#define BRW_VFCOMPONENT_STORE_PID    7
-
-
-
-/* Execution Unit (EU) defines
- */
-
-#define BRW_ALIGN_1   0
-#define BRW_ALIGN_16  1
-
-#define BRW_ADDRESS_DIRECT                        0
-#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER    1
-
-#define BRW_CHANNEL_X     0
-#define BRW_CHANNEL_Y     1
-#define BRW_CHANNEL_Z     2
-#define BRW_CHANNEL_W     3
-
-#define BRW_COMPRESSION_NONE          0
-#define BRW_COMPRESSION_2NDHALF       1
-#define BRW_COMPRESSION_COMPRESSED    2
-
-#define BRW_CONDITIONAL_NONE  0
-#define BRW_CONDITIONAL_Z     1
-#define BRW_CONDITIONAL_NZ    2
-#define BRW_CONDITIONAL_EQ    1	/* Z */
-#define BRW_CONDITIONAL_NEQ   2	/* NZ */
-#define BRW_CONDITIONAL_G     3
-#define BRW_CONDITIONAL_GE    4
-#define BRW_CONDITIONAL_L     5
-#define BRW_CONDITIONAL_LE    6
-#define BRW_CONDITIONAL_C     7
-#define BRW_CONDITIONAL_O     8
-
-#define BRW_DEBUG_NONE        0
-#define BRW_DEBUG_BREAKPOINT  1
-
-#define BRW_DEPENDENCY_NORMAL         0
-#define BRW_DEPENDENCY_NOTCLEARED     1
-#define BRW_DEPENDENCY_NOTCHECKED     2
-#define BRW_DEPENDENCY_DISABLE        3
-
-#define BRW_EXECUTE_1     0
-#define BRW_EXECUTE_2     1
-#define BRW_EXECUTE_4     2
-#define BRW_EXECUTE_8     3
-#define BRW_EXECUTE_16    4
-#define BRW_EXECUTE_32    5
-
-#define BRW_HORIZONTAL_STRIDE_0   0
-#define BRW_HORIZONTAL_STRIDE_1   1
-#define BRW_HORIZONTAL_STRIDE_2   2
-#define BRW_HORIZONTAL_STRIDE_4   3
-
-#define BRW_INSTRUCTION_NORMAL    0
-#define BRW_INSTRUCTION_SATURATE  1
-
-#define BRW_MASK_ENABLE   0
-#define BRW_MASK_DISABLE  1
-
-#define BRW_OPCODE_MOV        1
-#define BRW_OPCODE_SEL        2
-#define BRW_OPCODE_NOT        4
-#define BRW_OPCODE_AND        5
-#define BRW_OPCODE_OR         6
-#define BRW_OPCODE_XOR        7
-#define BRW_OPCODE_SHR        8
-#define BRW_OPCODE_SHL        9
-#define BRW_OPCODE_RSR        10
-#define BRW_OPCODE_RSL        11
-#define BRW_OPCODE_ASR        12
-#define BRW_OPCODE_CMP        16
-#define BRW_OPCODE_JMPI       32
-#define BRW_OPCODE_IF         34
-#define BRW_OPCODE_IFF        35
-#define BRW_OPCODE_ELSE       36
-#define BRW_OPCODE_ENDIF      37
-#define BRW_OPCODE_DO         38
-#define BRW_OPCODE_WHILE      39
-#define BRW_OPCODE_BREAK      40
-#define BRW_OPCODE_CONTINUE   41
-#define BRW_OPCODE_HALT       42
-#define BRW_OPCODE_MSAVE      44
-#define BRW_OPCODE_MRESTORE   45
-#define BRW_OPCODE_PUSH       46
-#define BRW_OPCODE_POP        47
-#define BRW_OPCODE_WAIT       48
-#define BRW_OPCODE_SEND       49
-#define BRW_OPCODE_ADD        64
-#define BRW_OPCODE_MUL        65
-#define BRW_OPCODE_AVG        66
-#define BRW_OPCODE_FRC        67
-#define BRW_OPCODE_RNDU       68
-#define BRW_OPCODE_RNDD       69
-#define BRW_OPCODE_RNDE       70
-#define BRW_OPCODE_RNDZ       71
-#define BRW_OPCODE_MAC        72
-#define BRW_OPCODE_MACH       73
-#define BRW_OPCODE_LZD        74
-#define BRW_OPCODE_SAD2       80
-#define BRW_OPCODE_SADA2      81
-#define BRW_OPCODE_DP4        84
-#define BRW_OPCODE_DPH        85
-#define BRW_OPCODE_DP3        86
-#define BRW_OPCODE_DP2        87
-#define BRW_OPCODE_DPA2       88
-#define BRW_OPCODE_LINE       89
-#define BRW_OPCODE_NOP        126
-
-#define BRW_PREDICATE_NONE             0
-#define BRW_PREDICATE_NORMAL           1
-#define BRW_PREDICATE_ALIGN1_ANYV             2
-#define BRW_PREDICATE_ALIGN1_ALLV             3
-#define BRW_PREDICATE_ALIGN1_ANY2H            4
-#define BRW_PREDICATE_ALIGN1_ALL2H            5
-#define BRW_PREDICATE_ALIGN1_ANY4H            6
-#define BRW_PREDICATE_ALIGN1_ALL4H            7
-#define BRW_PREDICATE_ALIGN1_ANY8H            8
-#define BRW_PREDICATE_ALIGN1_ALL8H            9
-#define BRW_PREDICATE_ALIGN1_ANY16H           10
-#define BRW_PREDICATE_ALIGN1_ALL16H           11
-#define BRW_PREDICATE_ALIGN16_REPLICATE_X     2
-#define BRW_PREDICATE_ALIGN16_REPLICATE_Y     3
-#define BRW_PREDICATE_ALIGN16_REPLICATE_Z     4
-#define BRW_PREDICATE_ALIGN16_REPLICATE_W     5
-#define BRW_PREDICATE_ALIGN16_ANY4H           6
-#define BRW_PREDICATE_ALIGN16_ALL4H           7
-
-#define BRW_ARCHITECTURE_REGISTER_FILE    0
-#define BRW_GENERAL_REGISTER_FILE         1
-#define BRW_MESSAGE_REGISTER_FILE         2
-#define BRW_IMMEDIATE_VALUE               3
-
-#define BRW_REGISTER_TYPE_UD  0
-#define BRW_REGISTER_TYPE_D   1
-#define BRW_REGISTER_TYPE_UW  2
-#define BRW_REGISTER_TYPE_W   3
-#define BRW_REGISTER_TYPE_UB  4
-#define BRW_REGISTER_TYPE_B   5
-#define BRW_REGISTER_TYPE_VF  5	/* packed float vector, immediates only? */
-#define BRW_REGISTER_TYPE_HF  6
-#define BRW_REGISTER_TYPE_V   6	/* packed int vector, immediates only, uword dest only */
-#define BRW_REGISTER_TYPE_F   7
-
-#define BRW_ARF_NULL                  0x00
-#define BRW_ARF_ADDRESS               0x10
-#define BRW_ARF_ACCUMULATOR           0x20
-#define BRW_ARF_FLAG                  0x30
-#define BRW_ARF_MASK                  0x40
-#define BRW_ARF_MASK_STACK            0x50
-#define BRW_ARF_MASK_STACK_DEPTH      0x60
-#define BRW_ARF_STATE                 0x70
-#define BRW_ARF_CONTROL               0x80
-#define BRW_ARF_NOTIFICATION_COUNT    0x90
-#define BRW_ARF_IP                    0xA0
-
-#define BRW_AMASK   0
-#define BRW_IMASK   1
-#define BRW_LMASK   2
-#define BRW_CMASK   3
-
-
-
-#define BRW_THREAD_NORMAL     0
-#define BRW_THREAD_ATOMIC     1
-#define BRW_THREAD_SWITCH     2
-
-#define BRW_VERTICAL_STRIDE_0                 0
-#define BRW_VERTICAL_STRIDE_1                 1
-#define BRW_VERTICAL_STRIDE_2                 2
-#define BRW_VERTICAL_STRIDE_4                 3
-#define BRW_VERTICAL_STRIDE_8                 4
-#define BRW_VERTICAL_STRIDE_16                5
-#define BRW_VERTICAL_STRIDE_32                6
-#define BRW_VERTICAL_STRIDE_64                7
-#define BRW_VERTICAL_STRIDE_128               8
-#define BRW_VERTICAL_STRIDE_256               9
-#define BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL   0xF
-
-#define BRW_WIDTH_1       0
-#define BRW_WIDTH_2       1
-#define BRW_WIDTH_4       2
-#define BRW_WIDTH_8       3
-#define BRW_WIDTH_16      4
-
-#define BRW_STATELESS_BUFFER_BOUNDARY_1K      0
-#define BRW_STATELESS_BUFFER_BOUNDARY_2K      1
-#define BRW_STATELESS_BUFFER_BOUNDARY_4K      2
-#define BRW_STATELESS_BUFFER_BOUNDARY_8K      3
-#define BRW_STATELESS_BUFFER_BOUNDARY_16K     4
-#define BRW_STATELESS_BUFFER_BOUNDARY_32K     5
-#define BRW_STATELESS_BUFFER_BOUNDARY_64K     6
-#define BRW_STATELESS_BUFFER_BOUNDARY_128K    7
-#define BRW_STATELESS_BUFFER_BOUNDARY_256K    8
-#define BRW_STATELESS_BUFFER_BOUNDARY_512K    9
-#define BRW_STATELESS_BUFFER_BOUNDARY_1M      10
-#define BRW_STATELESS_BUFFER_BOUNDARY_2M      11
-
-#define BRW_POLYGON_FACING_FRONT      0
-#define BRW_POLYGON_FACING_BACK       1
-
-#define BRW_MESSAGE_TARGET_NULL               0
-#define BRW_MESSAGE_TARGET_MATH               1
-#define BRW_MESSAGE_TARGET_SAMPLER            2
-#define BRW_MESSAGE_TARGET_GATEWAY            3
-#define BRW_MESSAGE_TARGET_DATAPORT_READ      4
-#define BRW_MESSAGE_TARGET_DATAPORT_WRITE     5
-#define BRW_MESSAGE_TARGET_URB                6
-#define BRW_MESSAGE_TARGET_THREAD_SPAWNER     7
-
-#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32     0
-#define BRW_SAMPLER_RETURN_FORMAT_UINT32      2
-#define BRW_SAMPLER_RETURN_FORMAT_SINT32      3
-
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE              0
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE             0
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS        0
-#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX             1
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD        1
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD         1
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS  2
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS    2
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE    0
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE     2
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
-#define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO             2
-#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO            2
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD                3
-#define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
-#define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3
-
-#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW   0
-#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH  1
-#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS     2
-#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS     3
-#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS     4
-
-#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD     0
-#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS    2
-
-#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS   2
-#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS  3
-
-#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ          0
-#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     1
-#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ          2
-#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      3
-
-#define BRW_DATAPORT_READ_TARGET_DATA_CACHE      0
-#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE    1
-#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE   2
-
-#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE                0
-#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED     1
-#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01         2
-#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23         3
-#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01       4
-
-#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE                0
-#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE           1
-#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_BLOCK_WRITE                2
-#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE            3
-#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE              4
-#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE     5
-#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE               7
-
-#define BRW_MATH_FUNCTION_INV                              1
-#define BRW_MATH_FUNCTION_LOG                              2
-#define BRW_MATH_FUNCTION_EXP                              3
-#define BRW_MATH_FUNCTION_SQRT                             4
-#define BRW_MATH_FUNCTION_RSQ                              5
-#define BRW_MATH_FUNCTION_SIN                              6 /* was 7 */
-#define BRW_MATH_FUNCTION_COS                              7 /* was 8 */
-#define BRW_MATH_FUNCTION_SINCOS                           8 /* was 6 */
-#define BRW_MATH_FUNCTION_TAN                              9
-#define BRW_MATH_FUNCTION_POW                              10
-#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER   11
-#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT                 12
-#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER                13
-
-#define BRW_MATH_INTEGER_UNSIGNED     0
-#define BRW_MATH_INTEGER_SIGNED       1
-
-#define BRW_MATH_PRECISION_FULL        0
-#define BRW_MATH_PRECISION_PARTIAL     1
-
-#define BRW_MATH_SATURATE_NONE         0
-#define BRW_MATH_SATURATE_SATURATE     1
-
-#define BRW_MATH_DATA_VECTOR  0
-#define BRW_MATH_DATA_SCALAR  1
-
-#define BRW_URB_OPCODE_WRITE  0
-
-#define BRW_URB_SWIZZLE_NONE          0
-#define BRW_URB_SWIZZLE_INTERLEAVE    1
-#define BRW_URB_SWIZZLE_TRANSPOSE     2
-
-#define BRW_SCRATCH_SPACE_SIZE_1K     0
-#define BRW_SCRATCH_SPACE_SIZE_2K     1
-#define BRW_SCRATCH_SPACE_SIZE_4K     2
-#define BRW_SCRATCH_SPACE_SIZE_8K     3
-#define BRW_SCRATCH_SPACE_SIZE_16K    4
-#define BRW_SCRATCH_SPACE_SIZE_32K    5
-#define BRW_SCRATCH_SPACE_SIZE_64K    6
-#define BRW_SCRATCH_SPACE_SIZE_128K   7
-#define BRW_SCRATCH_SPACE_SIZE_256K   8
-#define BRW_SCRATCH_SPACE_SIZE_512K   9
-#define BRW_SCRATCH_SPACE_SIZE_1M     10
-#define BRW_SCRATCH_SPACE_SIZE_2M     11
-
-
-
-
-#define CMD_URB_FENCE                 0x6000
-#define CMD_CONST_BUFFER_STATE        0x6001
-#define CMD_CONST_BUFFER              0x6002
-
-#define CMD_STATE_BASE_ADDRESS        0x6101
-#define CMD_STATE_INSN_POINTER        0x6102
-#define CMD_PIPELINE_SELECT           0x6104
-
-#define CMD_PIPELINED_STATE_POINTERS  0x7800
-#define CMD_BINDING_TABLE_PTRS        0x7801
-#define CMD_VERTEX_BUFFER             0x7808
-#define CMD_VERTEX_ELEMENT            0x7809
-#define CMD_INDEX_BUFFER              0x780a
-#define CMD_VF_STATISTICS             0x780b
-
-#define CMD_DRAW_RECT                 0x7900
-#define CMD_BLEND_CONSTANT_COLOR      0x7901
-#define CMD_CHROMA_KEY                0x7904
-#define CMD_DEPTH_BUFFER              0x7905
-#define CMD_POLY_STIPPLE_OFFSET       0x7906
-#define CMD_POLY_STIPPLE_PATTERN      0x7907
-#define CMD_LINE_STIPPLE_PATTERN      0x7908
-#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909
-
-#define CMD_PIPE_CONTROL              0x7a00
-
-#define CMD_3D_PRIM                   0x7b00
-
-#define CMD_MI_FLUSH                  0x0200
-
-
-/* Various values from the R0 vertex header:
- */
-#define R02_PRIM_END    0x1
-#define R02_PRIM_START  0x2
-
-
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_draw.c b/src/gallium/drivers/i965simple/brw_draw.c
deleted file mode 100644
index 49d80cb41c5..00000000000
--- a/src/gallium/drivers/i965simple/brw_draw.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include <stdlib.h>
-
-#include "brw_batch.h"
-#include "brw_draw.h"
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_state.h"
-
-#include "pipe/p_context.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "util/u_prim.h"
-
-static unsigned hw_prim[PIPE_PRIM_POLYGON+1] = {
-   _3DPRIM_POINTLIST,
-   _3DPRIM_LINELIST,
-   _3DPRIM_LINELOOP,
-   _3DPRIM_LINESTRIP,
-   _3DPRIM_TRILIST,
-   _3DPRIM_TRISTRIP,
-   _3DPRIM_TRIFAN,
-   _3DPRIM_QUADLIST,
-   _3DPRIM_QUADSTRIP,
-   _3DPRIM_POLYGON
-};
-
-
-/* When the primitive changes, set a state bit and re-validate.  Not
- * the nicest and would rather deal with this by having all the
- * programs be immune to the active primitive (ie. cope with all
- * possibilities).  That may not be realistic however.
- */
-static void brw_set_prim(struct brw_context *brw, int prim)
-{
-   PRINT("PRIM: %d\n", prim);
-
-   /* Slight optimization to avoid the GS program when not needed:
-    */
-   if (prim == PIPE_PRIM_QUAD_STRIP &&
-       brw->attribs.Raster->flatshade &&
-       brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_FILL &&
-       brw->attribs.Raster->fill_ccw == PIPE_POLYGON_MODE_FILL)
-      prim = PIPE_PRIM_TRIANGLE_STRIP;
-
-   if (prim != brw->primitive) {
-      brw->primitive = prim;
-      brw->state.dirty.brw |= BRW_NEW_PRIMITIVE;
-
-      if (u_reduced_prim(prim) != brw->reduced_primitive) {
-	 brw->reduced_primitive = u_reduced_prim(prim);
-	 brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE;
-      }
-
-      brw_validate_state(brw);
-   }
-
-}
-
-
-static unsigned trim(int prim, unsigned length)
-{
-   if (prim == PIPE_PRIM_QUAD_STRIP)
-      return length > 3 ? (length - length % 2) : 0;
-   else if (prim == PIPE_PRIM_QUADS)
-      return length - length % 4;
-   else
-      return length;
-}
-
-
-
-static boolean brw_emit_prim( struct brw_context *brw,
-			      boolean indexed,
-			      unsigned start,
-			      unsigned count )
-
-{
-   struct brw_3d_primitive prim_packet;
-
-   if (BRW_DEBUG & DEBUG_PRIMS)
-      PRINT("PRIM: %d %d %d\n",  brw->primitive, start, count);
-
-   prim_packet.header.opcode = CMD_3D_PRIM;
-   prim_packet.header.length = sizeof(prim_packet)/4 - 2;
-   prim_packet.header.pad = 0;
-   prim_packet.header.topology = hw_prim[brw->primitive];
-   prim_packet.header.indexed = indexed;
-
-   prim_packet.verts_per_instance = trim(brw->primitive, count);
-   prim_packet.start_vert_location = start;
-   prim_packet.instance_count = 1;
-   prim_packet.start_instance_location = 0;
-   prim_packet.base_vert_location = 0;
-
-   if (prim_packet.verts_per_instance == 0)
-      return TRUE;
-
-   return brw_batchbuffer_data( brw->winsys,
-                                &prim_packet,
-                                sizeof(prim_packet) );
-}
-
-
-/* May fail if out of video memory for texture or vbo upload, or on
- * fallback conditions.
- */
-static boolean brw_try_draw_elements( struct pipe_context *pipe,
-				      struct pipe_buffer *index_buffer,
-				      unsigned index_size,
-				      unsigned mode,
-				      unsigned start,
-				      unsigned count )
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   /* Set the first primitive ahead of validate_state:
-    */
-   brw_set_prim(brw, mode);
-
-   /* Upload index, vertex data:
-    */
-   if (index_buffer &&
-       !brw_upload_indices( brw, index_buffer, index_size, start, count ))
-      return FALSE;
-
-   if (!brw_upload_vertex_buffers(brw))
-      return FALSE;
-
-   if (!brw_upload_vertex_elements( brw ))
-      return FALSE;
-
-   /* XXX:  Need to separate validate and upload of state.
-    */
-   if (brw->state.dirty.brw)
-      brw_validate_state( brw );
-
-   if (!brw_emit_prim(brw, index_buffer != NULL,
-                      start, count))
-      return FALSE;
-
-   return TRUE;
-}
-
-
-
-static boolean brw_draw_elements( struct pipe_context *pipe,
-				  struct pipe_buffer *indexBuffer,
-				  unsigned indexSize,
-				  unsigned mode,
-				  unsigned start,
-				  unsigned count )
-{
-   if (!brw_try_draw_elements( pipe,
-			       indexBuffer,
-			       indexSize,
-			       mode, start, count ))
-   {
-      /* flush ? */
-
-      if (!brw_try_draw_elements( pipe,
-				  indexBuffer,
-				  indexSize,
-				  mode, start,
-				  count )) {
-	 assert(0);
-	 return FALSE;
-      }
-   }
-
-   return TRUE;
-}
-
-
-
-static boolean brw_draw_arrays( struct pipe_context *pipe,
-				    unsigned mode,
-				    unsigned start,
-				    unsigned count )
-{
-   if (!brw_try_draw_elements( pipe, NULL, 0, mode, start, count )) {
-      /* flush ? */
-
-      if (!brw_try_draw_elements( pipe, NULL, 0, mode, start, count )) {
-	 assert(0);
-	 return FALSE;
-      }
-   }
-   
-   return TRUE;
-}
-
-
-
-void brw_init_draw_functions( struct brw_context *brw )
-{
-   brw->pipe.draw_arrays = brw_draw_arrays;
-   brw->pipe.draw_elements = brw_draw_elements;
-}
-
-
diff --git a/src/gallium/drivers/i965simple/brw_draw_upload.c b/src/gallium/drivers/i965simple/brw_draw_upload.c
deleted file mode 100644
index 2d9ca3f2ea0..00000000000
--- a/src/gallium/drivers/i965simple/brw_draw_upload.c
+++ /dev/null
@@ -1,300 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include <stdlib.h>
-
-#include "brw_batch.h"
-#include "brw_draw.h"
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_state.h"
-
-
-struct brw_array_state {
-   union header_union header;
-
-   struct {
-      union {
-	 struct {
-	    unsigned pitch:11;
-	    unsigned pad:15;
-	    unsigned access_type:1;
-	    unsigned vb_index:5;
-	 } bits;
-	 unsigned dword;
-      } vb0;
-
-      struct pipe_buffer *buffer;
-      unsigned offset;
-
-      unsigned max_index;
-      unsigned instance_data_step_rate;
-
-   } vb[BRW_VBP_MAX];
-};
-
-
-
-unsigned brw_translate_surface_format( unsigned id )
-{
-   switch (id) {
-   case PIPE_FORMAT_R64_FLOAT:
-      return BRW_SURFACEFORMAT_R64_FLOAT;
-   case PIPE_FORMAT_R64G64_FLOAT:
-      return BRW_SURFACEFORMAT_R64G64_FLOAT;
-   case PIPE_FORMAT_R64G64B64_FLOAT:
-      return BRW_SURFACEFORMAT_R64G64B64_FLOAT;
-   case PIPE_FORMAT_R64G64B64A64_FLOAT:
-      return BRW_SURFACEFORMAT_R64G64B64A64_FLOAT;
-
-   case PIPE_FORMAT_R32_FLOAT:
-      return BRW_SURFACEFORMAT_R32_FLOAT;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return BRW_SURFACEFORMAT_R32G32_FLOAT;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return BRW_SURFACEFORMAT_R32G32B32_FLOAT;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
-
-   case PIPE_FORMAT_R32_UNORM:
-      return BRW_SURFACEFORMAT_R32_UNORM;
-   case PIPE_FORMAT_R32G32_UNORM:
-      return BRW_SURFACEFORMAT_R32G32_UNORM;
-   case PIPE_FORMAT_R32G32B32_UNORM:
-      return BRW_SURFACEFORMAT_R32G32B32_UNORM;
-   case PIPE_FORMAT_R32G32B32A32_UNORM:
-      return BRW_SURFACEFORMAT_R32G32B32A32_UNORM;
-
-   case PIPE_FORMAT_R32_USCALED:
-      return BRW_SURFACEFORMAT_R32_USCALED;
-   case PIPE_FORMAT_R32G32_USCALED:
-      return BRW_SURFACEFORMAT_R32G32_USCALED;
-   case PIPE_FORMAT_R32G32B32_USCALED:
-      return BRW_SURFACEFORMAT_R32G32B32_USCALED;
-   case PIPE_FORMAT_R32G32B32A32_USCALED:
-      return BRW_SURFACEFORMAT_R32G32B32A32_USCALED;
-
-   case PIPE_FORMAT_R32_SNORM:
-      return BRW_SURFACEFORMAT_R32_SNORM;
-   case PIPE_FORMAT_R32G32_SNORM:
-      return BRW_SURFACEFORMAT_R32G32_SNORM;
-   case PIPE_FORMAT_R32G32B32_SNORM:
-      return BRW_SURFACEFORMAT_R32G32B32_SNORM;
-   case PIPE_FORMAT_R32G32B32A32_SNORM:
-      return BRW_SURFACEFORMAT_R32G32B32A32_SNORM;
-
-   case PIPE_FORMAT_R32_SSCALED:
-      return BRW_SURFACEFORMAT_R32_SSCALED;
-   case PIPE_FORMAT_R32G32_SSCALED:
-      return BRW_SURFACEFORMAT_R32G32_SSCALED;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-      return BRW_SURFACEFORMAT_R32G32B32_SSCALED;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return BRW_SURFACEFORMAT_R32G32B32A32_SSCALED;
-
-   case PIPE_FORMAT_R16_UNORM:
-      return BRW_SURFACEFORMAT_R16_UNORM;
-   case PIPE_FORMAT_R16G16_UNORM:
-      return BRW_SURFACEFORMAT_R16G16_UNORM;
-   case PIPE_FORMAT_R16G16B16_UNORM:
-      return BRW_SURFACEFORMAT_R16G16B16_UNORM;
-   case PIPE_FORMAT_R16G16B16A16_UNORM:
-      return BRW_SURFACEFORMAT_R16G16B16A16_UNORM;
-
-   case PIPE_FORMAT_R16_USCALED:
-      return BRW_SURFACEFORMAT_R16_USCALED;
-   case PIPE_FORMAT_R16G16_USCALED:
-      return BRW_SURFACEFORMAT_R16G16_USCALED;
-   case PIPE_FORMAT_R16G16B16_USCALED:
-      return BRW_SURFACEFORMAT_R16G16B16_USCALED;
-   case PIPE_FORMAT_R16G16B16A16_USCALED:
-      return BRW_SURFACEFORMAT_R16G16B16A16_USCALED;
-
-   case PIPE_FORMAT_R16_SNORM:
-      return BRW_SURFACEFORMAT_R16_SNORM;
-   case PIPE_FORMAT_R16G16_SNORM:
-      return BRW_SURFACEFORMAT_R16G16_SNORM;
-   case PIPE_FORMAT_R16G16B16_SNORM:
-      return BRW_SURFACEFORMAT_R16G16B16_SNORM;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      return BRW_SURFACEFORMAT_R16G16B16A16_SNORM;
-
-   case PIPE_FORMAT_R16_SSCALED:
-      return BRW_SURFACEFORMAT_R16_SSCALED;
-   case PIPE_FORMAT_R16G16_SSCALED:
-      return BRW_SURFACEFORMAT_R16G16_SSCALED;
-   case PIPE_FORMAT_R16G16B16_SSCALED:
-      return BRW_SURFACEFORMAT_R16G16B16_SSCALED;
-   case PIPE_FORMAT_R16G16B16A16_SSCALED:
-      return BRW_SURFACEFORMAT_R16G16B16A16_SSCALED;
-
-   case PIPE_FORMAT_R8_UNORM:
-      return BRW_SURFACEFORMAT_R8_UNORM;
-   case PIPE_FORMAT_R8G8_UNORM:
-      return BRW_SURFACEFORMAT_R8G8_UNORM;
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      return BRW_SURFACEFORMAT_R8G8B8_UNORM;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
-
-   case PIPE_FORMAT_R8_USCALED:
-      return BRW_SURFACEFORMAT_R8_USCALED;
-   case PIPE_FORMAT_R8G8_USCALED:
-      return BRW_SURFACEFORMAT_R8G8_USCALED;
-   case PIPE_FORMAT_R8G8B8_USCALED:
-      return BRW_SURFACEFORMAT_R8G8B8_USCALED;
-   case PIPE_FORMAT_R8G8B8A8_USCALED:
-      return BRW_SURFACEFORMAT_R8G8B8A8_USCALED;
-
-   case PIPE_FORMAT_R8_SNORM:
-      return BRW_SURFACEFORMAT_R8_SNORM;
-   case PIPE_FORMAT_R8G8_SNORM:
-      return BRW_SURFACEFORMAT_R8G8_SNORM;
-   case PIPE_FORMAT_R8G8B8_SNORM:
-      return BRW_SURFACEFORMAT_R8G8B8_SNORM;
-   case PIPE_FORMAT_R8G8B8A8_SNORM:
-      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
-
-   case PIPE_FORMAT_R8_SSCALED:
-      return BRW_SURFACEFORMAT_R8_SSCALED;
-   case PIPE_FORMAT_R8G8_SSCALED:
-      return BRW_SURFACEFORMAT_R8G8_SSCALED;
-   case PIPE_FORMAT_R8G8B8_SSCALED:
-      return BRW_SURFACEFORMAT_R8G8B8_SSCALED;
-   case PIPE_FORMAT_R8G8B8A8_SSCALED:
-      return BRW_SURFACEFORMAT_R8G8B8A8_SSCALED;
-
-   default:
-      assert(0);
-      return 0;
-   }
-}
-
-static unsigned get_index_type(int type)
-{
-   switch (type) {
-   case 1: return BRW_INDEX_BYTE;
-   case 2: return BRW_INDEX_WORD;
-   case 4: return BRW_INDEX_DWORD;
-   default: assert(0); return 0;
-   }
-}
-
-
-boolean brw_upload_vertex_buffers( struct brw_context *brw )
-{
-   struct brw_array_state vbp;
-   unsigned nr_enabled = 0;
-   unsigned i;
-
-   memset(&vbp, 0, sizeof(vbp));
-
-   /* This is a hardware limit:
-    */
-
-   for (i = 0; i < BRW_VEP_MAX; i++)
-   {
-      if (brw->vb.vbo_array[i] == NULL) {
-	 nr_enabled = i;
-	 break;
-      }
-
-      vbp.vb[i].vb0.bits.pitch = brw->vb.vbo_array[i]->stride;
-      vbp.vb[i].vb0.bits.pad = 0;
-      vbp.vb[i].vb0.bits.access_type = BRW_VERTEXBUFFER_ACCESS_VERTEXDATA;
-      vbp.vb[i].vb0.bits.vb_index = i;
-      vbp.vb[i].offset = brw->vb.vbo_array[i]->buffer_offset;
-      vbp.vb[i].buffer = brw->vb.vbo_array[i]->buffer;
-      vbp.vb[i].max_index = brw->vb.vbo_array[i]->max_index;
-   }
-
-
-   vbp.header.bits.length = (1 + nr_enabled * 4) - 2;
-   vbp.header.bits.opcode = CMD_VERTEX_BUFFER;
-
-   BEGIN_BATCH(vbp.header.bits.length+2, 0);
-   OUT_BATCH( vbp.header.dword );
-
-   for (i = 0; i < nr_enabled; i++) {
-      OUT_BATCH( vbp.vb[i].vb0.dword );
-      OUT_RELOC( vbp.vb[i].buffer,  PIPE_BUFFER_USAGE_GPU_READ,
-		 vbp.vb[i].offset);
-      OUT_BATCH( vbp.vb[i].max_index );
-      OUT_BATCH( vbp.vb[i].instance_data_step_rate );
-   }
-   ADVANCE_BATCH();
-   return TRUE;
-}
-
-
-
-boolean brw_upload_vertex_elements( struct brw_context *brw )
-{
-   struct brw_vertex_element_packet vep;
-
-   unsigned i;
-   unsigned nr_enabled = brw->attribs.VertexProgram->info.num_inputs;
-
-   memset(&vep, 0, sizeof(vep));
-
-   for (i = 0; i < nr_enabled; i++) 
-      vep.ve[i] = brw->vb.inputs[i];
-
-
-   vep.header.length = (1 + nr_enabled * sizeof(vep.ve[0])/4) - 2;
-   vep.header.opcode = CMD_VERTEX_ELEMENT;
-   brw_cached_batch_struct(brw, &vep, 4 + nr_enabled * sizeof(vep.ve[0]));
-
-   return TRUE;
-}
-
-boolean brw_upload_indices( struct brw_context *brw,
-                            const struct pipe_buffer *index_buffer,
-                            int ib_size, int start, int count)
-{
-   /* Emit the indexbuffer packet:
-    */
-   {
-      struct brw_indexbuffer ib;
-
-      memset(&ib, 0, sizeof(ib));
-
-      ib.header.bits.opcode = CMD_INDEX_BUFFER;
-      ib.header.bits.length = sizeof(ib)/4 - 2;
-      ib.header.bits.index_format = get_index_type(ib_size);
-      ib.header.bits.cut_index_enable = 0;
-
-
-      BEGIN_BATCH(4, 0);
-      OUT_BATCH( ib.header.dword );
-      OUT_RELOC( index_buffer, PIPE_BUFFER_USAGE_GPU_READ, start);
-      OUT_RELOC( index_buffer, PIPE_BUFFER_USAGE_GPU_READ, start + count);
-      OUT_BATCH( 0 );
-      ADVANCE_BATCH();
-   }
-   return TRUE;
-}
diff --git a/src/gallium/drivers/i965simple/brw_eu.c b/src/gallium/drivers/i965simple/brw_eu.c
deleted file mode 100644
index e2002d1821f..00000000000
--- a/src/gallium/drivers/i965simple/brw_eu.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-  
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_eu.h"
-
-
-
-/* How does predicate control work when execution_size != 8?  Do I
- * need to test/set for 0xffff when execution_size is 16?
- */
-void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value )
-{
-   p->current->header.predicate_control = BRW_PREDICATE_NONE;
-
-   if (value != 0xff) {
-      if (value != p->flag_value) {
-	 brw_push_insn_state(p);
-	 brw_MOV(p, brw_flag_reg(), brw_imm_uw(value));
-	 p->flag_value = value;
-	 brw_pop_insn_state(p);
-      }
-
-      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
-   }   
-}
-
-void brw_set_predicate_control( struct brw_compile *p, unsigned pc )
-{
-   p->current->header.predicate_control = pc;
-}
-
-void brw_set_conditionalmod( struct brw_compile *p, unsigned conditional )
-{
-   p->current->header.destreg__conditonalmod = conditional;
-}
-
-void brw_set_access_mode( struct brw_compile *p, unsigned access_mode )
-{
-   p->current->header.access_mode = access_mode;
-}
-
-void brw_set_compression_control( struct brw_compile *p, boolean compression_control )
-{
-   p->current->header.compression_control = compression_control;
-}
-
-void brw_set_mask_control( struct brw_compile *p, unsigned value )
-{
-   p->current->header.mask_control = value;
-}
-
-void brw_set_saturate( struct brw_compile *p, unsigned value )
-{
-   p->current->header.saturate = value;
-}
-
-void brw_push_insn_state( struct brw_compile *p )
-{
-   assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
-   memcpy(p->current+1, p->current, sizeof(struct brw_instruction));
-   p->current++;   
-}
-
-void brw_pop_insn_state( struct brw_compile *p )
-{
-   assert(p->current != p->stack);
-   p->current--;
-}
-
-
-/***********************************************************************
- */
-void brw_init_compile( struct brw_compile *p )
-{
-   p->nr_insn = 0;
-   p->current = p->stack;
-   memset(p->current, 0, sizeof(p->current[0]));
-
-   /* Some defaults?
-    */
-   brw_set_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */
-   brw_set_saturate(p, 0);
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_set_predicate_control_flag_value(p, 0xff); 
-}
-
-
-const unsigned *brw_get_program( struct brw_compile *p,
-			       unsigned *sz )
-{
-   unsigned i;
-
-   for (i = 0; i < 8; i++)
-      brw_NOP(p);
-
-   *sz = p->nr_insn * sizeof(struct brw_instruction);
-   return (const unsigned *)p->store;
-}
-
diff --git a/src/gallium/drivers/i965simple/brw_eu.h b/src/gallium/drivers/i965simple/brw_eu.h
deleted file mode 100644
index 23151ae9ed6..00000000000
--- a/src/gallium/drivers/i965simple/brw_eu.h
+++ /dev/null
@@ -1,888 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#ifndef BRW_EU_H
-#define BRW_EU_H
-
-#include "brw_structs.h"
-#include "brw_defines.h"
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_shader_tokens.h"
-
-#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
-#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
-
-#define BRW_SWIZZLE_NOOP      BRW_SWIZZLE4(0,1,2,3)
-#define BRW_SWIZZLE_XYZW      BRW_SWIZZLE4(0,1,2,3)
-#define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0)
-#define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1)
-
-
-#define REG_SIZE (8*4)
-
-
-/* These aren't hardware structs, just something useful for us to pass around:
- *
- * Align1 operation has a lot of control over input ranges.  Used in
- * WM programs to implement shaders decomposed into "channel serial"
- * or "structure of array" form:
- */
-struct brw_reg
-{
-   unsigned type:4;
-   unsigned file:2;
-   unsigned nr:8;
-   unsigned subnr:5;		/* :1 in align16 */
-   unsigned negate:1;		/* source only */
-   unsigned abs:1;		/* source only */
-   unsigned vstride:4;		/* source only */
-   unsigned width:3;		/* src only, align1 only */
-   unsigned hstride:2;   		/* src only, align1 only */
-   unsigned address_mode:1;	/* relative addressing, hopefully! */
-   unsigned pad0:1;
-
-   union {
-      struct {
-	 unsigned swizzle:8;		/* src only, align16 only */
-	 unsigned writemask:4;		/* dest only, align16 only */
-	 int  indirect_offset:10;	/* relative addressing offset */
-	 unsigned pad1:10;		/* two dwords total */
-      } bits;
-
-      float f;
-      int   d;
-      unsigned ud;
-   } dw1;
-};
-
-
-struct brw_indirect {
-   unsigned addr_subnr:4;
-   int addr_offset:10;
-   unsigned pad:18;
-};
-
-
-#define BRW_EU_MAX_INSN_STACK 5
-#define BRW_EU_MAX_INSN 1200
-
-struct brw_compile {
-   struct brw_instruction store[BRW_EU_MAX_INSN];
-   unsigned nr_insn;
-
-   /* Allow clients to push/pop instruction state:
-    */
-   struct brw_instruction stack[BRW_EU_MAX_INSN_STACK];
-   struct brw_instruction *current;
-
-   unsigned flag_value;
-   boolean single_program_flow;
-};
-
-
-
-static __inline int type_sz( unsigned type )
-{
-   switch( type ) {
-   case BRW_REGISTER_TYPE_UD:
-   case BRW_REGISTER_TYPE_D:
-   case BRW_REGISTER_TYPE_F:
-      return 4;
-   case BRW_REGISTER_TYPE_HF:
-   case BRW_REGISTER_TYPE_UW:
-   case BRW_REGISTER_TYPE_W:
-      return 2;
-   case BRW_REGISTER_TYPE_UB:
-   case BRW_REGISTER_TYPE_B:
-      return 1;
-   default:
-      return 0;
-   }
-}
-
-static __inline struct brw_reg brw_reg( unsigned file,
-					unsigned nr,
-					unsigned subnr,
-					unsigned type,
-					unsigned vstride,
-					unsigned width,
-					unsigned hstride,
-					unsigned swizzle,
-					unsigned writemask)
-{
-
-   struct brw_reg reg;
-   reg.type = type;
-   reg.file = file;
-   reg.nr = nr;
-   reg.subnr = subnr * type_sz(type);
-   reg.negate = 0;
-   reg.abs = 0;
-   reg.vstride = vstride;
-   reg.width = width;
-   reg.hstride = hstride;
-   reg.address_mode = BRW_ADDRESS_DIRECT;
-   reg.pad0 = 0;
-
-   /* Could do better: If the reg is r5.3<0;1,0>, we probably want to
-    * set swizzle and writemask to W, as the lower bits of subnr will
-    * be lost when converted to align16.  This is probably too much to
-    * keep track of as you'd want it adjusted by suboffset(), etc.
-    * Perhaps fix up when converting to align16?
-    */
-   reg.dw1.bits.swizzle = swizzle;
-   reg.dw1.bits.writemask = writemask;
-   reg.dw1.bits.indirect_offset = 0;
-   reg.dw1.bits.pad1 = 0;
-   return reg;
-}
-
-static __inline struct brw_reg brw_vec16_reg( unsigned file,
-					      unsigned nr,
-					      unsigned subnr )
-{
-   return brw_reg(file,
-		  nr,
-		  subnr,
-		  BRW_REGISTER_TYPE_F,
-		  BRW_VERTICAL_STRIDE_16,
-		  BRW_WIDTH_16,
-		  BRW_HORIZONTAL_STRIDE_1,
-		  BRW_SWIZZLE_XYZW,
-		  TGSI_WRITEMASK_XYZW);
-}
-
-static __inline struct brw_reg brw_vec8_reg( unsigned file,
-					     unsigned nr,
-					     unsigned subnr )
-{
-   return brw_reg(file,
-		  nr,
-		  subnr,
-		  BRW_REGISTER_TYPE_F,
-		  BRW_VERTICAL_STRIDE_8,
-		  BRW_WIDTH_8,
-		  BRW_HORIZONTAL_STRIDE_1,
-		  BRW_SWIZZLE_XYZW,
-		  TGSI_WRITEMASK_XYZW);
-}
-
-
-static __inline struct brw_reg brw_vec4_reg( unsigned file,
-					      unsigned nr,
-					      unsigned subnr )
-{
-   return brw_reg(file,
-		  nr,
-		  subnr,
-		  BRW_REGISTER_TYPE_F,
-		  BRW_VERTICAL_STRIDE_4,
-		  BRW_WIDTH_4,
-		  BRW_HORIZONTAL_STRIDE_1,
-		  BRW_SWIZZLE_XYZW,
-		  TGSI_WRITEMASK_XYZW);
-}
-
-
-static __inline struct brw_reg brw_vec2_reg( unsigned file,
-					      unsigned nr,
-					      unsigned subnr )
-{
-   return brw_reg(file,
-		  nr,
-		  subnr,
-		  BRW_REGISTER_TYPE_F,
-		  BRW_VERTICAL_STRIDE_2,
-		  BRW_WIDTH_2,
-		  BRW_HORIZONTAL_STRIDE_1,
-		  BRW_SWIZZLE_XYXY,
-		  TGSI_WRITEMASK_XY);
-}
-
-static __inline struct brw_reg brw_vec1_reg( unsigned file,
-					     unsigned nr,
-					     unsigned subnr )
-{
-   return brw_reg(file,
-		  nr,
-		  subnr,
-		  BRW_REGISTER_TYPE_F,
-		  BRW_VERTICAL_STRIDE_0,
-		  BRW_WIDTH_1,
-		  BRW_HORIZONTAL_STRIDE_0,
-		  BRW_SWIZZLE_XXXX,
-		  TGSI_WRITEMASK_X);
-}
-
-
-static __inline struct brw_reg retype( struct brw_reg reg,
-				       unsigned type )
-{
-   reg.type = type;
-   return reg;
-}
-
-static __inline struct brw_reg suboffset( struct brw_reg reg,
-					  unsigned delta )
-{
-   reg.subnr += delta * type_sz(reg.type);
-   return reg;
-}
-
-
-static __inline struct brw_reg offset( struct brw_reg reg,
-				       unsigned delta )
-{
-   reg.nr += delta;
-   return reg;
-}
-
-
-static __inline struct brw_reg byte_offset( struct brw_reg reg,
-					    unsigned bytes )
-{
-   unsigned newoffset = reg.nr * REG_SIZE + reg.subnr + bytes;
-   reg.nr = newoffset / REG_SIZE;
-   reg.subnr = newoffset % REG_SIZE;
-   return reg;
-}
-
-
-static __inline struct brw_reg brw_uw16_reg( unsigned file,
-					     unsigned nr,
-					     unsigned subnr )
-{
-   return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
-}
-
-static __inline struct brw_reg brw_uw8_reg( unsigned file,
-					    unsigned nr,
-					    unsigned subnr )
-{
-   return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
-}
-
-static __inline struct brw_reg brw_uw1_reg( unsigned file,
-					    unsigned nr,
-					    unsigned subnr )
-{
-   return suboffset(retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
-}
-
-static __inline struct brw_reg brw_imm_reg( unsigned type )
-{
-   return brw_reg( BRW_IMMEDIATE_VALUE,
-		   0,
-		   0,
-		   type,
-		   BRW_VERTICAL_STRIDE_0,
-		   BRW_WIDTH_1,
-		   BRW_HORIZONTAL_STRIDE_0,
-		   0,
-		   0);
-}
-
-static __inline struct brw_reg brw_imm_f( float f )
-{
-   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F);
-   imm.dw1.f = f;
-   return imm;
-}
-
-static __inline struct brw_reg brw_imm_d( int d )
-{
-   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D);
-   imm.dw1.d = d;
-   return imm;
-}
-
-static __inline struct brw_reg brw_imm_ud( unsigned ud )
-{
-   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD);
-   imm.dw1.ud = ud;
-   return imm;
-}
-
-static __inline struct brw_reg brw_imm_uw( ushort uw )
-{
-   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW);
-   imm.dw1.ud = uw;
-   return imm;
-}
-
-static __inline struct brw_reg brw_imm_w( short w )
-{
-   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W);
-   imm.dw1.d = w;
-   return imm;
-}
-
-/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type
- * numbers alias with _V and _VF below:
- */
-
-/* Vector of eight signed half-byte values:
- */
-static __inline struct brw_reg brw_imm_v( unsigned v )
-{
-   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V);
-   imm.vstride = BRW_VERTICAL_STRIDE_0;
-   imm.width = BRW_WIDTH_8;
-   imm.hstride = BRW_HORIZONTAL_STRIDE_1;
-   imm.dw1.ud = v;
-   return imm;
-}
-
-/* Vector of four 8-bit float values:
- */
-static __inline struct brw_reg brw_imm_vf( unsigned v )
-{
-   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
-   imm.vstride = BRW_VERTICAL_STRIDE_0;
-   imm.width = BRW_WIDTH_4;
-   imm.hstride = BRW_HORIZONTAL_STRIDE_1;
-   imm.dw1.ud = v;
-   return imm;
-}
-
-#define VF_ZERO 0x0
-#define VF_ONE  0x30
-#define VF_NEG  (1<<7)
-
-static __inline struct brw_reg brw_imm_vf4( unsigned v0,
-					    unsigned v1,
-					    unsigned v2,
-					    unsigned v3)
-{
-   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
-   imm.vstride = BRW_VERTICAL_STRIDE_0;
-   imm.width = BRW_WIDTH_4;
-   imm.hstride = BRW_HORIZONTAL_STRIDE_1;
-   imm.dw1.ud = ((v0 << 0) |
-		 (v1 << 8) |
-		 (v2 << 16) |
-		 (v3 << 24));
-   return imm;
-}
-
-
-static __inline struct brw_reg brw_address( struct brw_reg reg )
-{
-   return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr);
-}
-
-
-static __inline struct brw_reg brw_vec1_grf( unsigned nr,
-					       unsigned subnr )
-{
-   return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
-}
-
-static __inline struct brw_reg brw_vec8_grf( unsigned nr,
-					     unsigned subnr )
-{
-   return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
-}
-
-static __inline struct brw_reg brw_vec4_grf( unsigned nr,
-					     unsigned subnr )
-{
-   return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
-}
-
-
-static __inline struct brw_reg brw_vec2_grf( unsigned nr,
-					     unsigned subnr )
-{
-   return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
-}
-
-static __inline struct brw_reg brw_uw8_grf( unsigned nr,
-					    unsigned subnr )
-{
-   return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
-}
-
-static __inline struct brw_reg brw_null_reg( void )
-{
-   return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
-		       BRW_ARF_NULL,
-		       0);
-}
-
-static __inline struct brw_reg brw_address_reg( unsigned subnr )
-{
-   return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
-		      BRW_ARF_ADDRESS,
-		      subnr);
-}
-
-/* If/else instructions break in align16 mode if writemask & swizzle
- * aren't xyzw.  This goes against the convention for other scalar
- * regs:
- */
-static __inline struct brw_reg brw_ip_reg( void )
-{
-   return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
-		  BRW_ARF_IP,
-		  0,
-		  BRW_REGISTER_TYPE_UD,
-		  BRW_VERTICAL_STRIDE_4, /* ? */
-		  BRW_WIDTH_1,
-		  BRW_HORIZONTAL_STRIDE_0,
-		  BRW_SWIZZLE_XYZW, /* NOTE! */
-		  TGSI_WRITEMASK_XYZW); /* NOTE! */
-}
-
-static __inline struct brw_reg brw_acc_reg( void )
-{
-   return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
-		       BRW_ARF_ACCUMULATOR,
-		       0);
-}
-
-
-static __inline struct brw_reg brw_flag_reg( void )
-{
-   return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
-		      BRW_ARF_FLAG,
-		      0);
-}
-
-
-static __inline struct brw_reg brw_mask_reg( unsigned subnr )
-{
-   return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
-		      BRW_ARF_MASK,
-		      subnr);
-}
-
-static __inline struct brw_reg brw_message_reg( unsigned nr )
-{
-   return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE,
-		       nr,
-		       0);
-}
-
-
-
-
-/* This is almost always called with a numeric constant argument, so
- * make things easy to evaluate at compile time:
- */
-static __inline unsigned cvt( unsigned val )
-{
-   switch (val) {
-   case 0: return 0;
-   case 1: return 1;
-   case 2: return 2;
-   case 4: return 3;
-   case 8: return 4;
-   case 16: return 5;
-   case 32: return 6;
-   }
-   return 0;
-}
-
-static __inline struct brw_reg stride( struct brw_reg reg,
-				       unsigned vstride,
-				       unsigned width,
-				       unsigned hstride )
-{
-
-   reg.vstride = cvt(vstride);
-   reg.width = cvt(width) - 1;
-   reg.hstride = cvt(hstride);
-   return reg;
-}
-
-static __inline struct brw_reg vec16( struct brw_reg reg )
-{
-   return stride(reg, 16,16,1);
-}
-
-static __inline struct brw_reg vec8( struct brw_reg reg )
-{
-   return stride(reg, 8,8,1);
-}
-
-static __inline struct brw_reg vec4( struct brw_reg reg )
-{
-   return stride(reg, 4,4,1);
-}
-
-static __inline struct brw_reg vec2( struct brw_reg reg )
-{
-   return stride(reg, 2,2,1);
-}
-
-static __inline struct brw_reg vec1( struct brw_reg reg )
-{
-   return stride(reg, 0,1,0);
-}
-
-static __inline struct brw_reg get_element( struct brw_reg reg, unsigned elt )
-{
-   return vec1(suboffset(reg, elt));
-}
-
-static __inline struct brw_reg get_element_ud( struct brw_reg reg, unsigned elt )
-{
-   return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt));
-}
-
-
-static __inline struct brw_reg brw_swizzle( struct brw_reg reg,
-					    unsigned x,
-					    unsigned y,
-					    unsigned z,
-					    unsigned w)
-{
-   reg.dw1.bits.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(reg.dw1.bits.swizzle, x),
-				       BRW_GET_SWZ(reg.dw1.bits.swizzle, y),
-				       BRW_GET_SWZ(reg.dw1.bits.swizzle, z),
-				       BRW_GET_SWZ(reg.dw1.bits.swizzle, w));
-   return reg;
-}
-
-
-static __inline struct brw_reg brw_swizzle1( struct brw_reg reg,
-					     unsigned x )
-{
-   return brw_swizzle(reg, x, x, x, x);
-}
-
-static __inline struct brw_reg brw_writemask( struct brw_reg reg,
-					      unsigned mask )
-{
-   reg.dw1.bits.writemask &= mask;
-   return reg;
-}
-
-static __inline struct brw_reg brw_set_writemask( struct brw_reg reg,
-						  unsigned mask )
-{
-   reg.dw1.bits.writemask = mask;
-   return reg;
-}
-
-static __inline struct brw_reg negate( struct brw_reg reg )
-{
-   reg.negate ^= 1;
-   return reg;
-}
-
-static __inline struct brw_reg brw_abs( struct brw_reg reg )
-{
-   reg.abs = 1;
-   return reg;
-}
-
-/***********************************************************************
- */
-static __inline struct brw_reg brw_vec4_indirect( unsigned subnr,
-						  int offset )
-{
-   struct brw_reg reg =  brw_vec4_grf(0, 0);
-   reg.subnr = subnr;
-   reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
-   reg.dw1.bits.indirect_offset = offset;
-   return reg;
-}
-
-static __inline struct brw_reg brw_vec1_indirect( unsigned subnr,
-						  int offset )
-{
-   struct brw_reg reg =  brw_vec1_grf(0, 0);
-   reg.subnr = subnr;
-   reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
-   reg.dw1.bits.indirect_offset = offset;
-   return reg;
-}
-
-static __inline struct brw_reg deref_4f(struct brw_indirect ptr, int offset)
-{
-   return brw_vec4_indirect(ptr.addr_subnr, ptr.addr_offset + offset);
-}
-
-static __inline struct brw_reg deref_1f(struct brw_indirect ptr, int offset)
-{
-   return brw_vec1_indirect(ptr.addr_subnr, ptr.addr_offset + offset);
-}
-
-static __inline struct brw_reg deref_4b(struct brw_indirect ptr, int offset)
-{
-   return retype(deref_4f(ptr, offset), BRW_REGISTER_TYPE_B);
-}
-
-static __inline struct brw_reg deref_1uw(struct brw_indirect ptr, int offset)
-{
-   return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW);
-}
-
-static __inline struct brw_reg deref_1ud(struct brw_indirect ptr, int offset)
-{
-   return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD);
-}
-
-static __inline struct brw_reg get_addr_reg(struct brw_indirect ptr)
-{
-   return brw_address_reg(ptr.addr_subnr);
-}
-
-static __inline struct brw_indirect brw_indirect_offset( struct brw_indirect ptr, int offset )
-{
-   ptr.addr_offset += offset;
-   return ptr;
-}
-
-static __inline struct brw_indirect brw_indirect( unsigned addr_subnr, int offset )
-{
-   struct brw_indirect ptr;
-   ptr.addr_subnr = addr_subnr;
-   ptr.addr_offset = offset;
-   ptr.pad = 0;
-   return ptr;
-}
-
-static __inline struct brw_instruction *current_insn( struct brw_compile *p)
-{
-	return &p->store[p->nr_insn];
-}
-
-void brw_pop_insn_state( struct brw_compile *p );
-void brw_push_insn_state( struct brw_compile *p );
-void brw_set_mask_control( struct brw_compile *p, unsigned value );
-void brw_set_saturate( struct brw_compile *p, unsigned value );
-void brw_set_access_mode( struct brw_compile *p, unsigned access_mode );
-void brw_set_compression_control( struct brw_compile *p, boolean control );
-void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value );
-void brw_set_predicate_control( struct brw_compile *p, unsigned pc );
-void brw_set_conditionalmod( struct brw_compile *p, unsigned conditional );
-
-void brw_init_compile( struct brw_compile *p );
-const unsigned *brw_get_program( struct brw_compile *p, unsigned *sz );
-
-
-struct brw_instruction *brw_alu1( struct brw_compile *p,
-				  unsigned opcode,
-				  struct brw_reg dest,
-				  struct brw_reg src );
-
-struct brw_instruction *brw_alu2(struct brw_compile *p,
-				 unsigned opcode,
-				 struct brw_reg dest,
-				 struct brw_reg src0,
-				 struct brw_reg src1 );
-
-/* Helpers for regular instructions:
- */
-#define ALU1(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,	\
-	      struct brw_reg dest,			\
-	      struct brw_reg src0);
-
-#define ALU2(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,	\
-	      struct brw_reg dest,			\
-	      struct brw_reg src0,			\
-	      struct brw_reg src1);
-
-ALU1(MOV)
-ALU2(SEL)
-ALU1(NOT)
-ALU2(AND)
-ALU2(OR)
-ALU2(XOR)
-ALU2(SHR)
-ALU2(SHL)
-ALU2(RSR)
-ALU2(RSL)
-ALU2(ASR)
-ALU2(JMPI)
-ALU2(ADD)
-ALU2(MUL)
-ALU1(FRC)
-ALU1(RNDD)
-ALU2(MAC)
-ALU2(MACH)
-ALU1(LZD)
-ALU2(DP4)
-ALU2(DPH)
-ALU2(DP3)
-ALU2(DP2)
-ALU2(LINE)
-
-#undef ALU1
-#undef ALU2
-
-
-
-/* Helpers for SEND instruction:
- */
-void brw_urb_WRITE(struct brw_compile *p,
-		   struct brw_reg dest,
-		   unsigned msg_reg_nr,
-		   struct brw_reg src0,
-		   boolean allocate,
-		   boolean used,
-		   unsigned msg_length,
-		   unsigned response_length,
-		   boolean eot,
-		   boolean writes_complete,
-		   unsigned offset,
-		   unsigned swizzle);
-
-void brw_fb_WRITE(struct brw_compile *p,
-		   struct brw_reg dest,
-		   unsigned msg_reg_nr,
-		   struct brw_reg src0,
-		   unsigned binding_table_index,
-		   unsigned msg_length,
-		   unsigned response_length,
-		   boolean eot);
-
-void brw_SAMPLE(struct brw_compile *p,
-		struct brw_reg dest,
-		unsigned msg_reg_nr,
-		struct brw_reg src0,
-		unsigned binding_table_index,
-		unsigned sampler,
-		unsigned writemask,
-		unsigned msg_type,
-		unsigned response_length,
-		unsigned msg_length,
-		boolean eot);
-
-void brw_math_16( struct brw_compile *p,
-		  struct brw_reg dest,
-		  unsigned function,
-		  unsigned saturate,
-		  unsigned msg_reg_nr,
-		  struct brw_reg src,
-		  unsigned precision );
-
-void brw_math( struct brw_compile *p,
-	       struct brw_reg dest,
-	       unsigned function,
-	       unsigned saturate,
-	       unsigned msg_reg_nr,
-	       struct brw_reg src,
-	       unsigned data_type,
-	       unsigned precision );
-
-void brw_dp_READ_16( struct brw_compile *p,
-		     struct brw_reg dest,
-		     unsigned msg_reg_nr,
-		     unsigned scratch_offset );
-
-void brw_dp_WRITE_16( struct brw_compile *p,
-		      struct brw_reg src,
-		      unsigned msg_reg_nr,
-		      unsigned scratch_offset );
-
-/* If/else/endif.  Works by manipulating the execution flags on each
- * channel.
- */
-struct brw_instruction *brw_IF(struct brw_compile *p,
-			       unsigned execute_size);
-
-struct brw_instruction *brw_ELSE(struct brw_compile *p,
-				 struct brw_instruction *if_insn);
-
-void brw_ENDIF(struct brw_compile *p,
-	       struct brw_instruction *if_or_else_insn);
-
-
-/* DO/WHILE loops:
- */
-struct brw_instruction *brw_DO(struct brw_compile *p,
-			       unsigned execute_size);
-
-struct brw_instruction *brw_WHILE(struct brw_compile *p,
-	       struct brw_instruction *patch_insn);
-
-struct brw_instruction *brw_BREAK(struct brw_compile *p);
-struct brw_instruction *brw_CONT(struct brw_compile *p);
-/* Forward jumps:
- */
-void brw_land_fwd_jump(struct brw_compile *p,
-		       struct brw_instruction *jmp_insn);
-
-
-
-void brw_NOP(struct brw_compile *p);
-
-/* Special case: there is never a destination, execution size will be
- * taken from src0:
- */
-void brw_CMP(struct brw_compile *p,
-	     struct brw_reg dest,
-	     unsigned conditional,
-	     struct brw_reg src0,
-	     struct brw_reg src1);
-
-void brw_print_reg( struct brw_reg reg );
-
-
-/***********************************************************************
- * brw_eu_util.c:
- */
-
-void brw_copy_indirect_to_indirect(struct brw_compile *p,
-				   struct brw_indirect dst_ptr,
-				   struct brw_indirect src_ptr,
-				   unsigned count);
-
-void brw_copy_from_indirect(struct brw_compile *p,
-			    struct brw_reg dst,
-			    struct brw_indirect ptr,
-			    unsigned count);
-
-void brw_copy4(struct brw_compile *p,
-	       struct brw_reg dst,
-	       struct brw_reg src,
-	       unsigned count);
-
-void brw_copy8(struct brw_compile *p,
-	       struct brw_reg dst,
-	       struct brw_reg src,
-	       unsigned count);
-
-void brw_math_invert( struct brw_compile *p,
-		      struct brw_reg dst,
-		      struct brw_reg src);
-
-void brw_set_src1( struct brw_instruction *insn,
-                          struct brw_reg reg );
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_eu_debug.c b/src/gallium/drivers/i965simple/brw_eu_debug.c
deleted file mode 100644
index 4adfb0c02f5..00000000000
--- a/src/gallium/drivers/i965simple/brw_eu_debug.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-    
-
-#include "util/u_debug.h"
-
-#include "brw_eu.h"
-
-void brw_print_reg( struct brw_reg hwreg )
-{
-   static const char *file[] = {
-      "arf",
-      "grf",
-      "msg",
-      "imm"
-   };
-
-   static const char *type[] = {
-      "ud",
-      "d",
-      "uw",
-      "w",
-      "ub",
-      "vf",
-      "hf",
-      "f"
-   };
-
-   debug_printf("%s%s", 
-		hwreg.abs ? "abs/" : "",
-		hwreg.negate ? "-" : "");
-     
-   if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
-       hwreg.nr % 2 == 0 &&
-       hwreg.subnr == 0 &&
-       hwreg.vstride == BRW_VERTICAL_STRIDE_8 &&
-       hwreg.width == BRW_WIDTH_8 &&
-       hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
-       hwreg.type == BRW_REGISTER_TYPE_F) {
-      debug_printf("vec%d", hwreg.nr);
-   }
-   else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
-	    hwreg.vstride == BRW_VERTICAL_STRIDE_0 &&
-	    hwreg.width == BRW_WIDTH_1 &&
-	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
-	    hwreg.type == BRW_REGISTER_TYPE_F) {      
-      debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
-   }
-   else {
-      debug_printf("%s%d.%d<%d;%d,%d>:%s", 
-		   file[hwreg.file],
-		   hwreg.nr,
-		   hwreg.subnr / type_sz(hwreg.type),
-		   hwreg.vstride ? (1<<(hwreg.vstride-1)) : 0,
-		   1<<hwreg.width,
-		   hwreg.hstride ? (1<<(hwreg.hstride-1)) : 0,		
-		   type[hwreg.type]);
-   }
-}
-
-
-
diff --git a/src/gallium/drivers/i965simple/brw_eu_emit.c b/src/gallium/drivers/i965simple/brw_eu_emit.c
deleted file mode 100644
index 400a80b6fba..00000000000
--- a/src/gallium/drivers/i965simple/brw_eu_emit.c
+++ /dev/null
@@ -1,1080 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_eu.h"
-
-
-
-
-/***********************************************************************
- * Internal helper for constructing instructions
- */
-
-static void guess_execution_size( struct brw_instruction *insn,
-				  struct brw_reg reg )
-{
-   if (reg.width == BRW_WIDTH_8 &&
-       insn->header.compression_control == BRW_COMPRESSION_COMPRESSED)
-      insn->header.execution_size = BRW_EXECUTE_16;
-   else
-      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
-}
-
-
-static void brw_set_dest( struct brw_instruction *insn,
-			  struct brw_reg dest )
-{
-   insn->bits1.da1.dest_reg_file = dest.file;
-   insn->bits1.da1.dest_reg_type = dest.type;
-   insn->bits1.da1.dest_address_mode = dest.address_mode;
-
-   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
-      insn->bits1.da1.dest_reg_nr = dest.nr;
-
-      if (insn->header.access_mode == BRW_ALIGN_1) {
-	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
-	 insn->bits1.da1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1;
-      }
-      else {
-	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
-	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
-      }
-   }
-   else {
-      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
-
-      /* These are different sizes in align1 vs align16:
-       */
-      if (insn->header.access_mode == BRW_ALIGN_1) {
-	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
-	 insn->bits1.ia1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1;
-      }
-      else {
-	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
-      }
-   }
-
-   /* NEW: Set the execution size based on dest.width and
-    * insn->compression_control:
-    */
-   guess_execution_size(insn, dest);
-}
-
-static void brw_set_src0( struct brw_instruction *insn,
-		      struct brw_reg reg )
-{
-   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
-
-   insn->bits1.da1.src0_reg_file = reg.file;
-   insn->bits1.da1.src0_reg_type = reg.type;
-   insn->bits2.da1.src0_abs = reg.abs;
-   insn->bits2.da1.src0_negate = reg.negate;
-   insn->bits2.da1.src0_address_mode = reg.address_mode;
-
-   if (reg.file == BRW_IMMEDIATE_VALUE) {
-      insn->bits3.ud = reg.dw1.ud;
-
-      /* Required to set some fields in src1 as well:
-       */
-      insn->bits1.da1.src1_reg_file = 0; /* arf */
-      insn->bits1.da1.src1_reg_type = reg.type;
-   }
-   else
-   {
-      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
-	 if (insn->header.access_mode == BRW_ALIGN_1) {
-	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
-	    insn->bits2.da1.src0_reg_nr = reg.nr;
-	 }
-	 else {
-	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
-	    insn->bits2.da16.src0_reg_nr = reg.nr;
-	 }
-      }
-      else {
-	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
-
-	 if (insn->header.access_mode == BRW_ALIGN_1) {
-	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
-	 }
-	 else {
-	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
-	 }
-      }
-
-      if (insn->header.access_mode == BRW_ALIGN_1) {
-	 if (reg.width == BRW_WIDTH_1 &&
-	     insn->header.execution_size == BRW_EXECUTE_1) {
-	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
-	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
-	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
-	 }
-	 else {
-	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
-	    insn->bits2.da1.src0_width = reg.width;
-	    insn->bits2.da1.src0_vert_stride = reg.vstride;
-	 }
-      }
-      else {
-	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
-	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
-	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
-	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
-
-	 /* This is an oddity of the fact we're using the same
-	  * descriptions for registers in align_16 as align_1:
-	  */
-	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
-	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
-	 else
-	    insn->bits2.da16.src0_vert_stride = reg.vstride;
-      }
-   }
-}
-
-
-void brw_set_src1( struct brw_instruction *insn,
-			  struct brw_reg reg )
-{
-   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
-
-   insn->bits1.da1.src1_reg_file = reg.file;
-   insn->bits1.da1.src1_reg_type = reg.type;
-   insn->bits3.da1.src1_abs = reg.abs;
-   insn->bits3.da1.src1_negate = reg.negate;
-
-   /* Only src1 can be immediate in two-argument instructions.
-    */
-   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
-
-   if (reg.file == BRW_IMMEDIATE_VALUE) {
-      insn->bits3.ud = reg.dw1.ud;
-   }
-   else {
-      /* This is a hardware restriction, which may or may not be lifted
-       * in the future:
-       */
-      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
-      //assert (reg.file == BRW_GENERAL_REGISTER_FILE);
-
-      if (insn->header.access_mode == BRW_ALIGN_1) {
-	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
-	 insn->bits3.da1.src1_reg_nr = reg.nr;
-      }
-      else {
-	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
-	 insn->bits3.da16.src1_reg_nr = reg.nr;
-      }
-
-      if (insn->header.access_mode == BRW_ALIGN_1) {
-	 if (reg.width == BRW_WIDTH_1 &&
-	     insn->header.execution_size == BRW_EXECUTE_1) {
-	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
-	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
-	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
-	 }
-	 else {
-	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
-	    insn->bits3.da1.src1_width = reg.width;
-	    insn->bits3.da1.src1_vert_stride = reg.vstride;
-	 }
-      }
-      else {
-	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
-	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
-	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
-	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
-
-	 /* This is an oddity of the fact we're using the same
-	  * descriptions for registers in align_16 as align_1:
-	  */
-	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
-	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
-	 else
-	    insn->bits3.da16.src1_vert_stride = reg.vstride;
-      }
-   }
-}
-
-
-
-static void brw_set_math_message( struct brw_instruction *insn,
-				  unsigned msg_length,
-				  unsigned response_length,
-				  unsigned function,
-				  unsigned integer_type,
-				  boolean low_precision,
-				  boolean saturate,
-				  unsigned dataType )
-{
-   brw_set_src1(insn, brw_imm_d(0));
-
-   insn->bits3.math.function = function;
-   insn->bits3.math.int_type = integer_type;
-   insn->bits3.math.precision = low_precision;
-   insn->bits3.math.saturate = saturate;
-   insn->bits3.math.data_type = dataType;
-   insn->bits3.math.response_length = response_length;
-   insn->bits3.math.msg_length = msg_length;
-   insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
-   insn->bits3.math.end_of_thread = 0;
-}
-
-static void brw_set_urb_message( struct brw_instruction *insn,
-				 boolean allocate,
-				 boolean used,
-				 unsigned msg_length,
-				 unsigned response_length,
-				 boolean end_of_thread,
-				 boolean complete,
-				 unsigned offset,
-				 unsigned swizzle_control )
-{
-   brw_set_src1(insn, brw_imm_d(0));
-
-   insn->bits3.urb.opcode = 0;	/* ? */
-   insn->bits3.urb.offset = offset;
-   insn->bits3.urb.swizzle_control = swizzle_control;
-   insn->bits3.urb.allocate = allocate;
-   insn->bits3.urb.used = used;	/* ? */
-   insn->bits3.urb.complete = complete;
-   insn->bits3.urb.response_length = response_length;
-   insn->bits3.urb.msg_length = msg_length;
-   insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
-   insn->bits3.urb.end_of_thread = end_of_thread;
-}
-
-static void brw_set_dp_write_message( struct brw_instruction *insn,
-				      unsigned binding_table_index,
-				      unsigned msg_control,
-				      unsigned msg_type,
-				      unsigned msg_length,
-				      unsigned pixel_scoreboard_clear,
-				      unsigned response_length,
-				      unsigned end_of_thread )
-{
-   brw_set_src1(insn, brw_imm_d(0));
-
-   insn->bits3.dp_write.binding_table_index = binding_table_index;
-   insn->bits3.dp_write.msg_control = msg_control;
-   insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
-   insn->bits3.dp_write.msg_type = msg_type;
-   insn->bits3.dp_write.send_commit_msg = 0;
-   insn->bits3.dp_write.response_length = response_length;
-   insn->bits3.dp_write.msg_length = msg_length;
-   insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-   insn->bits3.urb.end_of_thread = end_of_thread;
-}
-
-static void brw_set_dp_read_message( struct brw_instruction *insn,
-				      unsigned binding_table_index,
-				      unsigned msg_control,
-				      unsigned msg_type,
-				      unsigned target_cache,
-				      unsigned msg_length,
-				      unsigned response_length,
-				      unsigned end_of_thread )
-{
-   brw_set_src1(insn, brw_imm_d(0));
-
-   insn->bits3.dp_read.binding_table_index = binding_table_index;
-   insn->bits3.dp_read.msg_control = msg_control;
-   insn->bits3.dp_read.msg_type = msg_type;
-   insn->bits3.dp_read.target_cache = target_cache;
-   insn->bits3.dp_read.response_length = response_length;
-   insn->bits3.dp_read.msg_length = msg_length;
-   insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ;
-   insn->bits3.dp_read.end_of_thread = end_of_thread;
-}
-
-static void brw_set_sampler_message( struct brw_instruction *insn,
-				     unsigned binding_table_index,
-				     unsigned sampler,
-				     unsigned msg_type,
-				     unsigned response_length,
-				     unsigned msg_length,
-				     boolean eot)
-{
-   brw_set_src1(insn, brw_imm_d(0));
-
-   insn->bits3.sampler.binding_table_index = binding_table_index;
-   insn->bits3.sampler.sampler = sampler;
-   insn->bits3.sampler.msg_type = msg_type;
-   insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
-   insn->bits3.sampler.response_length = response_length;
-   insn->bits3.sampler.msg_length = msg_length;
-   insn->bits3.sampler.end_of_thread = eot;
-   insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
-}
-
-
-
-static struct brw_instruction *next_insn( struct brw_compile *p,
-					  unsigned opcode )
-{
-   struct brw_instruction *insn;
-
-   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
-
-   insn = &p->store[p->nr_insn++];
-   memcpy(insn, p->current, sizeof(*insn));
-
-   /* Reset this one-shot flag:
-    */
-
-   if (p->current->header.destreg__conditonalmod) {
-      p->current->header.destreg__conditonalmod = 0;
-      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
-   }
-
-   insn->header.opcode = opcode;
-   return insn;
-}
-
-
-struct brw_instruction *brw_alu1( struct brw_compile *p,
-				  unsigned opcode,
-				  struct brw_reg dest,
-				  struct brw_reg src )
-{
-   struct brw_instruction *insn = next_insn(p, opcode);
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src);
-   return insn;
-}
-
-struct brw_instruction *brw_alu2(struct brw_compile *p,
-				 unsigned opcode,
-				 struct brw_reg dest,
-				 struct brw_reg src0,
-				 struct brw_reg src1 )
-{
-   struct brw_instruction *insn = next_insn(p, opcode);
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_src1(insn, src1);
-   return insn;
-}
-
-
-/***********************************************************************
- * Convenience routines.
- */
-#define ALU1(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,			\
-	      struct brw_reg dest,			\
-	      struct brw_reg src0)   			\
-{							\
-   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
-}
-
-#define ALU2(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,			\
-	      struct brw_reg dest,			\
-	      struct brw_reg src0,			\
-	      struct brw_reg src1)   			\
-{							\
-   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
-}
-
-
-ALU1(MOV)
-ALU2(SEL)
-ALU1(NOT)
-ALU2(AND)
-ALU2(OR)
-ALU2(XOR)
-ALU2(SHR)
-ALU2(SHL)
-ALU2(RSR)
-ALU2(RSL)
-ALU2(ASR)
-ALU2(ADD)
-ALU2(MUL)
-ALU1(FRC)
-ALU1(RNDD)
-ALU2(MAC)
-ALU2(MACH)
-ALU1(LZD)
-ALU2(DP4)
-ALU2(DPH)
-ALU2(DP3)
-ALU2(DP2)
-ALU2(LINE)
-
-
-
-
-void brw_NOP(struct brw_compile *p)
-{
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
-   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-   brw_set_src1(insn, brw_imm_ud(0x0));
-}
-
-
-
-
-
-/***********************************************************************
- * Comparisons, if/else/endif
- */
-
-struct brw_instruction *brw_JMPI(struct brw_compile *p,
-	      struct brw_reg dest,
-	      struct brw_reg src0,
-	      struct brw_reg src1)
-{
-   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
-
-   p->current->header.predicate_control = BRW_PREDICATE_NONE;
-
-   return insn;
-}
-
-/* EU takes the value from the flag register and pushes it onto some
- * sort of a stack (presumably merging with any flag value already on
- * the stack).  Within an if block, the flags at the top of the stack
- * control execution on each channel of the unit, eg. on each of the
- * 16 pixel values in our wm programs.
- *
- * When the matching 'else' instruction is reached (presumably by
- * countdown of the instruction count patched in by our ELSE/ENDIF
- * functions), the relevent flags are inverted.
- *
- * When the matching 'endif' instruction is reached, the flags are
- * popped off.  If the stack is now empty, normal execution resumes.
- *
- * No attempt is made to deal with stack overflow (14 elements?).
- */
-struct brw_instruction *brw_IF(struct brw_compile *p, unsigned execute_size)
-{
-   struct brw_instruction *insn;
-
-   if (p->single_program_flow) {
-      assert(execute_size == BRW_EXECUTE_1);
-
-      insn = next_insn(p, BRW_OPCODE_ADD);
-      insn->header.predicate_inverse = 1;
-   } else {
-      insn = next_insn(p, BRW_OPCODE_IF);
-   }
-
-   /* Override the defaults for this instruction:
-    */
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
-
-   insn->header.execution_size = execute_size;
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
-   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
-   insn->header.mask_control = BRW_MASK_ENABLE;
-
-   p->current->header.predicate_control = BRW_PREDICATE_NONE;
-
-   return insn;
-}
-
-
-struct brw_instruction *brw_ELSE(struct brw_compile *p,
-				 struct brw_instruction *if_insn)
-{
-   struct brw_instruction *insn;
-
-   if (p->single_program_flow) {
-      insn = next_insn(p, BRW_OPCODE_ADD);
-   } else {
-      insn = next_insn(p, BRW_OPCODE_ELSE);
-   }
-
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
-
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
-   insn->header.execution_size = if_insn->header.execution_size;
-   insn->header.mask_control = BRW_MASK_ENABLE;
-
-   /* Patch the if instruction to point at this instruction.
-    */
-   if (p->single_program_flow) {
-      assert(if_insn->header.opcode == BRW_OPCODE_ADD);
-
-      if_insn->bits3.ud = (insn - if_insn + 1) * 16;
-   } else {
-      assert(if_insn->header.opcode == BRW_OPCODE_IF);
-
-      if_insn->bits3.if_else.jump_count = insn - if_insn;
-      if_insn->bits3.if_else.pop_count = 1;
-      if_insn->bits3.if_else.pad0 = 0;
-   }
-
-   return insn;
-}
-
-void brw_ENDIF(struct brw_compile *p,
-	       struct brw_instruction *patch_insn)
-{
-   if (p->single_program_flow) {
-      /* In single program flow mode, there's no need to execute an ENDIF,
-       * since we don't need to do any stack operations, and if we're executing
-       * currently, we want to just continue executing.
-       */
-      struct brw_instruction *next = &p->store[p->nr_insn];
-
-      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
-
-      patch_insn->bits3.ud = (next - patch_insn) * 16;
-   } else {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
-
-      brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-      brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-      brw_set_src1(insn, brw_imm_d(0x0));
-
-      insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.execution_size = patch_insn->header.execution_size;
-      insn->header.mask_control = BRW_MASK_ENABLE;
-
-      assert(patch_insn->bits3.if_else.jump_count == 0);
-
-      /* Patch the if or else instructions to point at this or the next
-       * instruction respectively.
-       */
-      if (patch_insn->header.opcode == BRW_OPCODE_IF) {
-	 /* Automagically turn it into an IFF:
-	  */
-	 patch_insn->header.opcode = BRW_OPCODE_IFF;
-	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
-	 patch_insn->bits3.if_else.pop_count = 0;
-	 patch_insn->bits3.if_else.pad0 = 0;
-      } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) {
-	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
-	 patch_insn->bits3.if_else.pop_count = 1;
-	 patch_insn->bits3.if_else.pad0 = 0;
-      } else {
-	 assert(0);
-      }
-
-      /* Also pop item off the stack in the endif instruction:
-       */
-      insn->bits3.if_else.jump_count = 0;
-      insn->bits3.if_else.pop_count = 1;
-      insn->bits3.if_else.pad0 = 0;
-   }
-}
-
-struct brw_instruction *brw_BREAK(struct brw_compile *p)
-{
-   struct brw_instruction *insn;
-   insn = next_insn(p, BRW_OPCODE_BREAK);
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
-   insn->header.execution_size = BRW_EXECUTE_8;
-   insn->header.mask_control = BRW_MASK_DISABLE;
-   insn->bits3.if_else.pad0 = 0;
-   return insn;
-}
-
-struct brw_instruction *brw_CONT(struct brw_compile *p)
-{
-   struct brw_instruction *insn;
-   insn = next_insn(p, BRW_OPCODE_CONTINUE);
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
-   insn->header.execution_size = BRW_EXECUTE_8;
-   insn->header.mask_control = BRW_MASK_DISABLE;
-   insn->bits3.if_else.pad0 = 0;
-   return insn;
-}
-
-/* DO/WHILE loop:
- */
-struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
-{
-   if (p->single_program_flow) {
-      return &p->store[p->nr_insn];
-   } else {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
-
-      /* Override the defaults for this instruction:
-       */
-      brw_set_dest(insn, brw_null_reg());
-      brw_set_src0(insn, brw_null_reg());
-      brw_set_src1(insn, brw_null_reg());
-
-      insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.execution_size = execute_size;
-      insn->header.predicate_control = BRW_PREDICATE_NONE;
-      /* insn->header.mask_control = BRW_MASK_ENABLE; */
-      insn->header.mask_control = BRW_MASK_DISABLE;
-
-      return insn;
-   }
-}
-
-
-
-struct brw_instruction *brw_WHILE(struct brw_compile *p,
-	       struct brw_instruction *do_insn)
-{
-   struct brw_instruction *insn;
-
-   if (p->single_program_flow)
-      insn = next_insn(p, BRW_OPCODE_ADD);
-   else
-      insn = next_insn(p, BRW_OPCODE_WHILE);
-
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
-
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
-
-   if (p->single_program_flow) {
-      insn->header.execution_size = BRW_EXECUTE_1;
-
-      insn->bits3.d = (do_insn - insn) * 16;
-   } else {
-      insn->header.execution_size = do_insn->header.execution_size;
-
-      assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = do_insn - insn;
-      insn->bits3.if_else.pop_count = 0;
-      insn->bits3.if_else.pad0 = 0;
-   }
-
-/*    insn->header.mask_control = BRW_MASK_ENABLE; */
-
-   insn->header.mask_control = BRW_MASK_DISABLE;
-   p->current->header.predicate_control = BRW_PREDICATE_NONE;
-   return insn;
-}
-
-
-/* FORWARD JUMPS:
- */
-void brw_land_fwd_jump(struct brw_compile *p,
-		       struct brw_instruction *jmp_insn)
-{
-   struct brw_instruction *landing = &p->store[p->nr_insn];
-
-   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
-   assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE);
-
-   jmp_insn->bits3.ud = (landing - jmp_insn) - 1;
-}
-
-
-
-/* To integrate with the above, it makes sense that the comparison
- * instruction should populate the flag register.  It might be simpler
- * just to use the flag reg for most WM tasks?
- */
-void brw_CMP(struct brw_compile *p,
-	     struct brw_reg dest,
-	     unsigned conditional,
-	     struct brw_reg src0,
-	     struct brw_reg src1)
-{
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
-
-   insn->header.destreg__conditonalmod = conditional;
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_src1(insn, src1);
-
-/*    guess_execution_size(insn, src0); */
-
-
-   /* Make it so that future instructions will use the computed flag
-    * value until brw_set_predicate_control_flag_value() is called
-    * again.
-    */
-   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
-       dest.nr == 0) {
-      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
-      p->flag_value = 0xff;
-   }
-}
-
-
-
-/***********************************************************************
- * Helpers for the various SEND message types:
- */
-
-/* Invert 8 values
- */
-void brw_math( struct brw_compile *p,
-	       struct brw_reg dest,
-	       unsigned function,
-	       unsigned saturate,
-	       unsigned msg_reg_nr,
-	       struct brw_reg src,
-	       unsigned data_type,
-	       unsigned precision )
-{
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-   unsigned msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
-   unsigned response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
-
-   /* Example code doesn't set predicate_control for send
-    * instructions.
-    */
-   insn->header.predicate_control = 0;
-   insn->header.destreg__conditonalmod = msg_reg_nr;
-
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src);
-   brw_set_math_message(insn,
-			msg_length, response_length,
-			function,
-			BRW_MATH_INTEGER_UNSIGNED,
-			precision,
-			saturate,
-			data_type);
-}
-
-/* Use 2 send instructions to invert 16 elements
- */
-void brw_math_16( struct brw_compile *p,
-		  struct brw_reg dest,
-		  unsigned function,
-		  unsigned saturate,
-		  unsigned msg_reg_nr,
-		  struct brw_reg src,
-		  unsigned precision )
-{
-   struct brw_instruction *insn;
-   unsigned msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
-   unsigned response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
-
-   /* First instruction:
-    */
-   brw_push_insn_state(p);
-   brw_set_predicate_control_flag_value(p, 0xff);
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-
-   insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.destreg__conditonalmod = msg_reg_nr;
-
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src);
-   brw_set_math_message(insn,
-			msg_length, response_length,
-			function,
-			BRW_MATH_INTEGER_UNSIGNED,
-			precision,
-			saturate,
-			BRW_MATH_DATA_VECTOR);
-
-   /* Second instruction:
-    */
-   insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
-   insn->header.destreg__conditonalmod = msg_reg_nr+1;
-
-   brw_set_dest(insn, offset(dest,1));
-   brw_set_src0(insn, src);
-   brw_set_math_message(insn,
-			msg_length, response_length,
-			function,
-			BRW_MATH_INTEGER_UNSIGNED,
-			precision,
-			saturate,
-			BRW_MATH_DATA_VECTOR);
-
-   brw_pop_insn_state(p);
-}
-
-
-
-
-void brw_dp_WRITE_16( struct brw_compile *p,
-		      struct brw_reg src,
-		      unsigned msg_reg_nr,
-		      unsigned scratch_offset )
-{
-   {
-      brw_push_insn_state(p);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-
-      brw_MOV(p,
-	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
-	      brw_imm_d(scratch_offset));
-
-      brw_pop_insn_state(p);
-   }
-
-   {
-      unsigned msg_length = 3;
-      struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-
-      insn->header.predicate_control = 0; /* XXX */
-      insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.destreg__conditonalmod = msg_reg_nr;
-
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, src);
-
-      brw_set_dp_write_message(insn,
-			       255, /* bti */
-			       BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
-			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
-			       msg_length,
-			       0, /* pixel scoreboard */
-			       0, /* response_length */
-			       0); /* eot */
-   }
-
-}
-
-
-void brw_dp_READ_16( struct brw_compile *p,
-		      struct brw_reg dest,
-		      unsigned msg_reg_nr,
-		      unsigned scratch_offset )
-{
-   {
-      brw_push_insn_state(p);
-      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-
-      brw_MOV(p,
-	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
-	      brw_imm_d(scratch_offset));
-
-      brw_pop_insn_state(p);
-   }
-
-   {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-
-      insn->header.predicate_control = 0; /* XXX */
-      insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.destreg__conditonalmod = msg_reg_nr;
-
-      brw_set_dest(insn, dest);	/* UW? */
-      brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
-
-      brw_set_dp_read_message(insn,
-			      255, /* bti */
-			      3,  /* msg_control */
-			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-			      1, /* target cache */
-			      1, /* msg_length */
-			      2, /* response_length */
-			      0); /* eot */
-   }
-}
-
-
-void brw_fb_WRITE(struct brw_compile *p,
-		   struct brw_reg dest,
-		   unsigned msg_reg_nr,
-		   struct brw_reg src0,
-		   unsigned binding_table_index,
-		   unsigned msg_length,
-		   unsigned response_length,
-		   boolean eot)
-{
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-
-   insn->header.predicate_control = 0; /* XXX */
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
-   insn->header.destreg__conditonalmod = msg_reg_nr;
-
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_dp_write_message(insn,
-			    binding_table_index,
-			    BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */
-			    BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */
-			    msg_length,
-			    1,	/* pixel scoreboard */
-			    response_length,
-			    eot);
-}
-
-
-
-void brw_SAMPLE(struct brw_compile *p,
-		struct brw_reg dest,
-		unsigned msg_reg_nr,
-		struct brw_reg src0,
-		unsigned binding_table_index,
-		unsigned sampler,
-		unsigned writemask,
-		unsigned msg_type,
-		unsigned response_length,
-		unsigned msg_length,
-		boolean eot)
-{
-   boolean need_stall = 0;
-
-   if(writemask == 0) {
-/*       debug_printf("%s: zero writemask??\n", __FUNCTION__); */
-      return;
-   }
-
-   /* Hardware doesn't do destination dependency checking on send
-    * instructions properly.  Add a workaround which generates the
-    * dependency by other means.  In practice it seems like this bug
-    * only crops up for texture samples, and only where registers are
-    * written by the send and then written again later without being
-    * read in between.  Luckily for us, we already track that
-    * information and use it to modify the writemask for the
-    * instruction, so that is a guide for whether a workaround is
-    * needed.
-    */
-   if (writemask != TGSI_WRITEMASK_XYZW) {
-      unsigned dst_offset = 0;
-      unsigned i, newmask = 0, len = 0;
-
-      for (i = 0; i < 4; i++) {
-	 if (writemask & (1<<i))
-	    break;
-	 dst_offset += 2;
-      }
-      for (; i < 4; i++) {
-	 if (!(writemask & (1<<i)))
-	    break;
-	 newmask |= 1<<i;
-	 len++;
-      }
-
-      if (newmask != writemask) {
-	 need_stall = 1;
-/* 	 debug_printf("need stall %x %x\n", newmask , writemask); */
-      }
-      else {
-	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
-
-	 newmask = ~newmask & TGSI_WRITEMASK_XYZW;
-
-	 brw_push_insn_state(p);
-
-	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-	 brw_set_mask_control(p, BRW_MASK_DISABLE);
-
-	 brw_MOV(p, m1, brw_vec8_grf(0,0));
-  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
-
-	 brw_pop_insn_state(p);
-
-  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
-	 dest = offset(dest, dst_offset);
-	 response_length = len * 2;
-      }
-   }
-
-   {
-      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-
-      insn->header.predicate_control = 0; /* XXX */
-      insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.destreg__conditonalmod = msg_reg_nr;
-
-      brw_set_dest(insn, dest);
-      brw_set_src0(insn, src0);
-      brw_set_sampler_message(insn,
-			      binding_table_index,
-			      sampler,
-			      msg_type,
-			      response_length,
-			      msg_length,
-			      eot);
-   }
-
-   if (need_stall)
-   {
-      struct brw_reg reg = vec8(offset(dest, response_length-1));
-
-      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
-       */
-      brw_push_insn_state(p);
-      brw_set_compression_control(p, FALSE);
-      brw_MOV(p, reg, reg);
-      brw_pop_insn_state(p);
-   }
-
-}
-
-/* All these variables are pretty confusing - we might be better off
- * using bitmasks and macros for this, in the old style.  Or perhaps
- * just having the caller instantiate the fields in dword3 itself.
- */
-void brw_urb_WRITE(struct brw_compile *p,
-		   struct brw_reg dest,
-		   unsigned msg_reg_nr,
-		   struct brw_reg src0,
-		   boolean allocate,
-		   boolean used,
-		   unsigned msg_length,
-		   unsigned response_length,
-		   boolean eot,
-		   boolean writes_complete,
-		   unsigned offset,
-		   unsigned swizzle)
-{
-   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
-
-   assert(msg_length < 16);
-
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, src0);
-   brw_set_src1(insn, brw_imm_d(0));
-
-   insn->header.destreg__conditonalmod = msg_reg_nr;
-
-   brw_set_urb_message(insn,
-		       allocate,
-		       used,
-		       msg_length,
-		       response_length,
-		       eot,
-		       writes_complete,
-		       offset,
-		       swizzle);
-}
-
diff --git a/src/gallium/drivers/i965simple/brw_eu_util.c b/src/gallium/drivers/i965simple/brw_eu_util.c
deleted file mode 100644
index 3a65b141f07..00000000000
--- a/src/gallium/drivers/i965simple/brw_eu_util.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-      
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_eu.h"
-
-
-void brw_math_invert( struct brw_compile *p, 
-			     struct brw_reg dst,
-			     struct brw_reg src)
-{
-   brw_math( p, 
-	     dst,
-	     BRW_MATH_FUNCTION_INV, 
-	     BRW_MATH_SATURATE_NONE,
-	     0,
-	     src,
-	     BRW_MATH_PRECISION_FULL, 
-	     BRW_MATH_DATA_VECTOR );
-}
-
-
-
-void brw_copy4(struct brw_compile *p,
-	       struct brw_reg dst,
-	       struct brw_reg src,
-	       unsigned count)
-{
-   unsigned i;
-
-   dst = vec4(dst);
-   src = vec4(src);
-
-   for (i = 0; i < count; i++)
-   {
-      unsigned delta = i*32;
-      brw_MOV(p, byte_offset(dst, delta),    byte_offset(src, delta));
-      brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16));
-   }
-}
-
-
-void brw_copy8(struct brw_compile *p,
-	       struct brw_reg dst,
-	       struct brw_reg src,
-	       unsigned count)
-{
-   unsigned i;
-
-   dst = vec8(dst);
-   src = vec8(src);
-
-   for (i = 0; i < count; i++)
-   {
-      unsigned delta = i*32;
-      brw_MOV(p, byte_offset(dst, delta),    byte_offset(src, delta));
-   }
-}
-
-
-void brw_copy_indirect_to_indirect(struct brw_compile *p,
-				   struct brw_indirect dst_ptr,
-				   struct brw_indirect src_ptr,
-				   unsigned count)
-{
-   unsigned i;
-
-   for (i = 0; i < count; i++)
-   {
-      unsigned delta = i*32;
-      brw_MOV(p, deref_4f(dst_ptr, delta),    deref_4f(src_ptr, delta));
-      brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16));
-   }
-}
-
-
-void brw_copy_from_indirect(struct brw_compile *p,
-			    struct brw_reg dst,
-			    struct brw_indirect ptr,
-			    unsigned count)
-{
-   unsigned i;
-
-   dst = vec4(dst);
-
-   for (i = 0; i < count; i++)
-   {
-      unsigned delta = i*32;
-      brw_MOV(p, byte_offset(dst, delta),    deref_4f(ptr, delta));
-      brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16));
-   }
-}
-
-
-
-
diff --git a/src/gallium/drivers/i965simple/brw_gs.c b/src/gallium/drivers/i965simple/brw_gs.c
deleted file mode 100644
index de60868ccca..00000000000
--- a/src/gallium/drivers/i965simple/brw_gs.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_state.h"
-#include "brw_gs.h"
-
-
-
-static void compile_gs_prog( struct brw_context *brw,
-			     struct brw_gs_prog_key *key )
-{
-   struct brw_gs_compile c;
-   const unsigned *program;
-   unsigned program_size;
-
-   memset(&c, 0, sizeof(c));
-
-   c.key = *key;
-
-   /* Need to locate the two positions present in vertex + header.
-    * These are currently hardcoded:
-    */
-   c.nr_attrs = brw_count_bits(c.key.attrs);
-   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
-   c.nr_bytes = c.nr_regs * REG_SIZE;
-
-
-   /* Begin the compilation:
-    */
-   brw_init_compile(&c.func);
-
-   c.func.single_program_flow = 1;
-
-   /* For some reason the thread is spawned with only 4 channels
-    * unmasked.
-    */
-   brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
-
-
-   /* Note that primitives which don't require a GS program have
-    * already been weeded out by this stage:
-    */
-   switch (key->primitive) {
-   case PIPE_PRIM_QUADS:
-      brw_gs_quads( &c );
-      break;
-   case PIPE_PRIM_QUAD_STRIP:
-      brw_gs_quad_strip( &c );
-      break;
-   case PIPE_PRIM_LINE_LOOP:
-      brw_gs_lines( &c );
-      break;
-   case PIPE_PRIM_LINES:
-      if (key->hint_gs_always)
-	 brw_gs_lines( &c );
-      else {
-	 return;
-      }
-      break;
-   case PIPE_PRIM_TRIANGLES:
-      if (key->hint_gs_always)
-	 brw_gs_tris( &c );
-      else {
-	 return;
-      }
-      break;
-   case PIPE_PRIM_POINTS:
-      if (key->hint_gs_always)
-	 brw_gs_points( &c );
-      else {
-	 return;
-      }
-      break;
-   default:
-      return;
-   }
-
-   /* get the program
-    */
-   program = brw_get_program(&c.func, &program_size);
-
-   /* Upload
-    */
-   brw->gs.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_GS_PROG],
-					      &c.key,
-					      sizeof(c.key),
-					      program,
-					      program_size,
-					      &c.prog_data,
-					      &brw->gs.prog_data );
-}
-
-
-static boolean search_cache( struct brw_context *brw,
-			       struct brw_gs_prog_key *key )
-{
-   return brw_search_cache(&brw->cache[BRW_GS_PROG],
-			   key, sizeof(*key),
-			   &brw->gs.prog_data,
-			   &brw->gs.prog_gs_offset);
-}
-
-
-static const int gs_prim[PIPE_PRIM_POLYGON+1] = {
-   PIPE_PRIM_POINTS,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_LINE_LOOP,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_QUADS,
-   PIPE_PRIM_QUAD_STRIP,
-   PIPE_PRIM_TRIANGLES
-};
-
-static void populate_key( struct brw_context *brw,
-			  struct brw_gs_prog_key *key )
-{
-   memset(key, 0, sizeof(*key));
-
-   /* CACHE_NEW_VS_PROG */
-   key->attrs = brw->vs.prog_data->outputs_written;
-
-   /* BRW_NEW_PRIMITIVE */
-   key->primitive = gs_prim[brw->primitive];
-
-   key->hint_gs_always = 0;	/* debug code? */
-
-   key->need_gs_prog = (key->hint_gs_always ||
-			brw->primitive == PIPE_PRIM_QUADS ||
-			brw->primitive == PIPE_PRIM_QUAD_STRIP ||
-			brw->primitive == PIPE_PRIM_LINE_LOOP);
-}
-
-/* Calculate interpolants for triangle and line rasterization.
- */
-static void upload_gs_prog( struct brw_context *brw )
-{
-   struct brw_gs_prog_key key;
-
-   /* Populate the key:
-    */
-   populate_key(brw, &key);
-
-   if (brw->gs.prog_active != key.need_gs_prog) {
-      brw->state.dirty.cache |= CACHE_NEW_GS_PROG;
-      brw->gs.prog_active = key.need_gs_prog;
-   }
-
-   if (brw->gs.prog_active) {
-      if (!search_cache(brw, &key))
-	 compile_gs_prog( brw, &key );
-   }
-}
-
-
-const struct brw_tracked_state brw_gs_prog = {
-   .dirty = {
-      .brw   = BRW_NEW_PRIMITIVE,
-      .cache = CACHE_NEW_VS_PROG
-   },
-   .update = upload_gs_prog
-};
diff --git a/src/gallium/drivers/i965simple/brw_gs.h b/src/gallium/drivers/i965simple/brw_gs.h
deleted file mode 100644
index f09141c6aa1..00000000000
--- a/src/gallium/drivers/i965simple/brw_gs.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
- 
-
-#ifndef BRW_GS_H
-#define BRW_GS_H
-
-
-#include "brw_context.h"
-#include "brw_eu.h"
-
-#define MAX_GS_VERTS (4)	     
-
-struct brw_gs_prog_key {
-   unsigned attrs:32;
-   unsigned primitive:4;
-   unsigned hint_gs_always:1;
-   unsigned need_gs_prog:1;
-   unsigned pad:26;
-};
-
-struct brw_gs_compile {
-   struct brw_compile func;
-   struct brw_gs_prog_key key;
-   struct brw_gs_prog_data prog_data;
-   
-   struct {
-      struct brw_reg R0;
-      struct brw_reg vertex[MAX_GS_VERTS];
-   } reg;
-
-   /* 3 different ways of expressing vertex size:
-    */
-   unsigned nr_attrs;
-   unsigned nr_regs;
-   unsigned nr_bytes;
-};
-
-#define ATTR_SIZE  (4*4)
-
-void brw_gs_quads( struct brw_gs_compile *c );
-void brw_gs_quad_strip( struct brw_gs_compile *c );
-void brw_gs_tris( struct brw_gs_compile *c );
-void brw_gs_lines( struct brw_gs_compile *c );
-void brw_gs_points( struct brw_gs_compile *c );
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_gs_emit.c b/src/gallium/drivers/i965simple/brw_gs_emit.c
deleted file mode 100644
index c3cc90b10f8..00000000000
--- a/src/gallium/drivers/i965simple/brw_gs_emit.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_gs.h"
-
-static void brw_gs_alloc_regs( struct brw_gs_compile *c,
-			       unsigned nr_verts )
-{
-   unsigned i = 0,j;
-
-   /* Register usage is static, precompute here:
-    */
-   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
-
-   /* Payload vertices plus space for more generated vertices:
-    */
-   for (j = 0; j < nr_verts; j++) {
-      c->reg.vertex[j] = brw_vec4_grf(i, 0);
-      i += c->nr_regs;
-   }
-
-   c->prog_data.urb_read_length = c->nr_regs; 
-   c->prog_data.total_grf = i;
-}
-
-
-static void brw_gs_emit_vue(struct brw_gs_compile *c, 
-			    struct brw_reg vert,
-			    boolean last,
-			    unsigned header)
-{
-   struct brw_compile *p = &c->func;
-   boolean allocate = !last;
-
-   /* Overwrite PrimType and PrimStart in the message header, for
-    * each vertex in turn:
-    */
-   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
-
-   /* Copy the vertex from vertn into m1..mN+1:
-    */
-   brw_copy8(p, brw_message_reg(1), vert, c->nr_regs);
-
-   /* Send each vertex as a seperate write to the urb.  This is
-    * different to the concept in brw_sf_emit.c, where subsequent
-    * writes are used to build up a single urb entry.  Each of these
-    * writes instantiates a seperate urb entry, and a new one must be
-    * allocated each time.
-    */
-   brw_urb_WRITE(p, 
-		 allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
-		 0,
-		 c->reg.R0,
-		 allocate,
-		 1,		/* used */
-		 c->nr_regs + 1, /* msg length */
-		 allocate ? 1 : 0, /* response length */
-		 allocate ? 0 : 1, /* eot */
-		 1,		/* writes_complete */
-		 0,		/* urb offset */
-		 BRW_URB_SWIZZLE_NONE);
-}
-
-
-
-void brw_gs_quads( struct brw_gs_compile *c )
-{
-   brw_gs_alloc_regs(c, 4);
-   
-   /* Use polygons for correct edgeflag behaviour. Note that vertex 3
-    * is the PV for quads, but vertex 0 for polygons:
-    */
-   brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-   brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
-   brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2)); 
-   brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
-}
-
-void brw_gs_quad_strip( struct brw_gs_compile *c )
-{
-   brw_gs_alloc_regs(c, 4);
-   
-   brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-   brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
-   brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); 
-   brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
-}
-
-void brw_gs_tris( struct brw_gs_compile *c )
-{
-   brw_gs_alloc_regs(c, 3);
-   brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START));
-   brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2));
-   brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_TRILIST << 2) | R02_PRIM_END));
-}
-
-void brw_gs_lines( struct brw_gs_compile *c )
-{
-   brw_gs_alloc_regs(c, 2);
-   brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START));
-   brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END));
-}
-
-void brw_gs_points( struct brw_gs_compile *c )
-{
-   brw_gs_alloc_regs(c, 1);
-   brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END));
-}
-
-
-
-
-
-
-
-
diff --git a/src/gallium/drivers/i965simple/brw_gs_state.c b/src/gallium/drivers/i965simple/brw_gs_state.c
deleted file mode 100644
index 5b8016b2e93..00000000000
--- a/src/gallium/drivers/i965simple/brw_gs_state.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-
-
-static void upload_gs_unit( struct brw_context *brw )
-{
-   struct brw_gs_unit_state gs;
-
-   memset(&gs, 0, sizeof(gs));
-
-   /* CACHE_NEW_GS_PROG */
-   if (brw->gs.prog_active) {
-      gs.thread0.grf_reg_count =
-	 align(brw->gs.prog_data->total_grf, 16) / 16 - 1;
-      gs.thread0.kernel_start_pointer = brw->gs.prog_gs_offset >> 6;
-      gs.thread3.urb_entry_read_length = brw->gs.prog_data->urb_read_length;
-   }
-   else {
-      gs.thread0.grf_reg_count = 0;
-      gs.thread0.kernel_start_pointer = 0;
-      gs.thread3.urb_entry_read_length = 1;
-   }
-
-   /* BRW_NEW_URB_FENCE */
-   gs.thread4.nr_urb_entries = brw->urb.nr_gs_entries;
-   gs.thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
-
-   gs.thread4.max_threads = 0; /* Hardware requirement */
-
-   if (BRW_DEBUG & DEBUG_STATS)
-      gs.thread4.stats_enable = 1;
-
-   /* CONSTANT */
-   gs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   gs.thread1.single_program_flow = 1;
-   gs.thread3.dispatch_grf_start_reg = 1;
-   gs.thread3.const_urb_entry_read_offset = 0;
-   gs.thread3.const_urb_entry_read_length = 0;
-   gs.thread3.urb_entry_read_offset = 0;
-
-
-   brw->gs.state_gs_offset = brw_cache_data( &brw->cache[BRW_GS_UNIT], &gs );
-}
-
-
-const struct brw_tracked_state brw_gs_unit = {
-   .dirty = {
-      .brw   = (BRW_NEW_CURBE_OFFSETS |
-		BRW_NEW_URB_FENCE),
-      .cache = CACHE_NEW_GS_PROG
-   },
-   .update = upload_gs_unit
-};
diff --git a/src/gallium/drivers/i965simple/brw_misc_state.c b/src/gallium/drivers/i965simple/brw_misc_state.c
deleted file mode 100644
index 99ff4403a59..00000000000
--- a/src/gallium/drivers/i965simple/brw_misc_state.c
+++ /dev/null
@@ -1,488 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_batch.h"
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-
-
-
-
-
-/***********************************************************************
- * Blend color
- */
-
-static void upload_blend_constant_color(struct brw_context *brw)
-{
-   struct brw_blend_constant_color bcc;
-
-   memset(&bcc, 0, sizeof(bcc));
-   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
-   bcc.header.length = sizeof(bcc)/4-2;
-   bcc.blend_constant_color[0] = brw->attribs.BlendColor.color[0];
-   bcc.blend_constant_color[1] = brw->attribs.BlendColor.color[1];
-   bcc.blend_constant_color[2] = brw->attribs.BlendColor.color[2];
-   bcc.blend_constant_color[3] = brw->attribs.BlendColor.color[3];
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
-}
-
-
-const struct brw_tracked_state brw_blend_constant_color = {
-   .dirty = {
-      .brw = BRW_NEW_BLEND,
-      .cache = 0
-   },
-   .update = upload_blend_constant_color
-};
-
-
-/***********************************************************************
- * Drawing rectangle 
- */
-static void upload_drawing_rect(struct brw_context *brw)
-{
-   struct brw_drawrect bdr;
-
-   memset(&bdr, 0, sizeof(bdr));
-   bdr.header.opcode = CMD_DRAW_RECT;
-   bdr.header.length = sizeof(bdr)/4 - 2;
-   bdr.xmin = 0;
-   bdr.ymin = 0;
-   bdr.xmax = brw->attribs.FrameBuffer.cbufs[0]->width;
-   bdr.ymax = brw->attribs.FrameBuffer.cbufs[0]->height;
-   bdr.xorg = 0;
-   bdr.yorg = 0;
-
-   /* Can't use BRW_CACHED_BATCH_STRUCT because this is also emitted
-    * uncached in brw_draw.c:
-    */
-   BRW_BATCH_STRUCT(brw, &bdr);
-}
-
-const struct brw_tracked_state brw_drawing_rect = {
-   .dirty = {
-      .brw = BRW_NEW_SCENE,
-      .cache = 0
-   },
-   .update = upload_drawing_rect
-};
-
-/**
- * Upload the binding table pointers, which point each stage's array of surface
- * state pointers.
- *
- * The binding table pointers are relative to the surface state base address,
- * which is the BRW_SS_POOL cache buffer.
- */
-static void upload_binding_table_pointers(struct brw_context *brw)
-{
-   struct brw_binding_table_pointers btp;
-   memset(&btp, 0, sizeof(btp));
-
-   btp.header.opcode = CMD_BINDING_TABLE_PTRS;
-   btp.header.length = sizeof(btp)/4 - 2;
-   btp.vs = 0;
-   btp.gs = 0;
-   btp.clp = 0;
-   btp.sf = 0;
-   btp.wm = brw->wm.bind_ss_offset;
-
-   BRW_CACHED_BATCH_STRUCT(brw, &btp);
-}
-
-const struct brw_tracked_state brw_binding_table_pointers = {
-   .dirty = {
-      .brw = 0,
-      .cache = CACHE_NEW_SURF_BIND
-   },
-   .update = upload_binding_table_pointers,
-};
-
-
-/**
- * Upload pointers to the per-stage state.
- *
- * The state pointers in this packet are all relative to the general state
- * base address set by CMD_STATE_BASE_ADDRESS, which is the BRW_GS_POOL buffer.
- */
-static void upload_pipelined_state_pointers(struct brw_context *brw )
-{
-   struct brw_pipelined_state_pointers psp;
-   memset(&psp, 0, sizeof(psp));
-
-   psp.header.opcode = CMD_PIPELINED_STATE_POINTERS;
-   psp.header.length = sizeof(psp)/4 - 2;
-
-   psp.vs.offset = brw->vs.state_gs_offset >> 5;
-   psp.sf.offset = brw->sf.state_gs_offset >> 5;
-   psp.wm.offset = brw->wm.state_gs_offset >> 5;
-   psp.cc.offset = brw->cc.state_gs_offset >> 5;
-
-   /* GS gets turned on and off regularly.  Need to re-emit URB fence
-    * after this occurs.
-    */
-   if (brw->gs.prog_active) {
-      psp.gs.offset = brw->gs.state_gs_offset >> 5;
-      psp.gs.enable = 1;
-   }
-
-   if (0) {
-      psp.clp.offset = brw->clip.state_gs_offset >> 5;
-      psp.clp.enable = 1;
-   }
-
-
-   if (BRW_CACHED_BATCH_STRUCT(brw, &psp))
-      brw->state.dirty.brw |= BRW_NEW_PSP;
-}
-
-const struct brw_tracked_state brw_pipelined_state_pointers = {
-   .dirty = {
-      .brw = 0,
-      .cache = (CACHE_NEW_VS_UNIT |
-		CACHE_NEW_GS_UNIT |
-		CACHE_NEW_GS_PROG |
-		CACHE_NEW_CLIP_UNIT |
-		CACHE_NEW_SF_UNIT |
-		CACHE_NEW_WM_UNIT |
-		CACHE_NEW_CC_UNIT)
-   },
-   .update = upload_pipelined_state_pointers
-};
-
-static void upload_psp_urb_cbs(struct brw_context *brw )
-{
-   upload_pipelined_state_pointers(brw);
-   brw_upload_urb_fence(brw);
-   brw_upload_constant_buffer_state(brw);
-}
-
-
-const struct brw_tracked_state brw_psp_urb_cbs = {
-   .dirty = {
-      .brw = BRW_NEW_URB_FENCE,
-      .cache = (CACHE_NEW_VS_UNIT |
-		CACHE_NEW_GS_UNIT |
-		CACHE_NEW_GS_PROG |
-		CACHE_NEW_CLIP_UNIT |
-		CACHE_NEW_SF_UNIT |
-		CACHE_NEW_WM_UNIT |
-		CACHE_NEW_CC_UNIT)
-   },
-   .update = upload_psp_urb_cbs
-};
-
-/**
- * Upload the depthbuffer offset and format.
- *
- * We have to do this per state validation as we need to emit the relocation
- * in the batch buffer.
- */
-static void upload_depthbuffer(struct brw_context *brw)
-{
-   struct pipe_surface *depth_surface = brw->attribs.FrameBuffer.zsbuf;
-
-   BEGIN_BATCH(5, INTEL_BATCH_NO_CLIPRECTS);
-   OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (5 - 2));
-   if (depth_surface == NULL) {
-      OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
-		(BRW_SURFACE_NULL << 29));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-   } else {
-      unsigned int format;
-      struct brw_texture *tex = (struct brw_texture *)depth_surface->texture;
-      assert(depth_surface->block.width == 1);
-      assert(depth_surface->block.height == 1);
-      switch (depth_surface->block.size) {
-      case 2:
-	 format = BRW_DEPTHFORMAT_D16_UNORM;
-	 break;
-      case 4:
-	 if (depth_surface->format == PIPE_FORMAT_Z32_FLOAT)
-	    format = BRW_DEPTHFORMAT_D32_FLOAT;
-	 else
-	    format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
-	 break;
-      default:
-	 assert(0);
-	 return;
-      }
-
-      OUT_BATCH((depth_surface->stride - 1) |
-		(format << 18) |
-		(BRW_TILEWALK_YMAJOR << 26) |
-//		(depth_surface->region->tiled << 27) |
-		(BRW_SURFACE_2D << 29));
-      OUT_RELOC(tex->buffer,
-		PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE, 0);
-      OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) |
-		((depth_surface->stride/depth_surface->block.size - 1) << 6) |
-		((depth_surface->height - 1) << 19));
-      OUT_BATCH(0);
-   }
-   ADVANCE_BATCH();
-}
-
-const struct brw_tracked_state brw_depthbuffer = {
-   .dirty = {
-      .brw = BRW_NEW_SCENE,
-      .cache = 0
-   },
-   .update = upload_depthbuffer,
-};
-
-
-
-
-/***********************************************************************
- * Polygon stipple packet
- */
-
-static void upload_polygon_stipple(struct brw_context *brw)
-{
-   struct brw_polygon_stipple bps;
-   unsigned i;
-
-   memset(&bps, 0, sizeof(bps));
-   bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
-   bps.header.length = sizeof(bps)/4-2;
-
-   /* XXX: state tracker should send *all* state down initially!
-    */
-   if (brw->attribs.PolygonStipple)
-      for (i = 0; i < 32; i++)
-	 bps.stipple[i] = brw->attribs.PolygonStipple->stipple[31 - i]; /* invert */
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bps);
-}
-
-const struct brw_tracked_state brw_polygon_stipple = {
-   .dirty = {
-      .brw = BRW_NEW_STIPPLE,
-      .cache = 0
-   },
-   .update = upload_polygon_stipple
-};
-
-
-/***********************************************************************
- * Line stipple packet
- */
-
-static void upload_line_stipple(struct brw_context *brw)
-{
-   struct brw_line_stipple bls;
-   float tmp;
-   int tmpi;
-
-   memset(&bls, 0, sizeof(bls));
-   bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
-   bls.header.length = sizeof(bls)/4 - 2;
-
-   bls.bits0.pattern = brw->attribs.Raster->line_stipple_pattern;
-   bls.bits1.repeat_count = brw->attribs.Raster->line_stipple_factor;
-
-   tmp = 1.0 / (float) brw->attribs.Raster->line_stipple_factor;
-   tmpi = tmp * (1<<13);
-
-
-   bls.bits1.inverse_repeat_count = tmpi;
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bls);
-}
-
-const struct brw_tracked_state brw_line_stipple = {
-   .dirty = {
-      .brw = BRW_NEW_STIPPLE,
-      .cache = 0
-   },
-   .update = upload_line_stipple
-};
-
-
-/***********************************************************************
- * Misc constant state packets
- */
-
-static void upload_pipe_control(struct brw_context *brw)
-{
-   struct brw_pipe_control pc;
-
-   return;
-
-   memset(&pc, 0, sizeof(pc));
-
-   pc.header.opcode = CMD_PIPE_CONTROL;
-   pc.header.length = sizeof(pc)/4 - 2;
-   pc.header.post_sync_operation = PIPE_CONTROL_NOWRITE;
-
-   pc.header.instruction_state_cache_flush_enable = 1;
-
-   pc.bits1.dest_addr_type = PIPE_CONTROL_GTTWRITE_GLOBAL;
-
-   BRW_BATCH_STRUCT(brw, &pc);
-}
-
-const struct brw_tracked_state brw_pipe_control = {
-   .dirty = {
-      .brw = BRW_NEW_SCENE,
-      .cache = 0
-   },
-   .update = upload_pipe_control
-};
-
-
-/***********************************************************************
- * Misc invarient state packets
- */
-
-static void upload_invarient_state( struct brw_context *brw )
-{
-   {
-      struct brw_mi_flush flush;
-
-      memset(&flush, 0, sizeof(flush));      
-      flush.opcode = CMD_MI_FLUSH;
-      flush.flags = BRW_FLUSH_STATE_CACHE | BRW_FLUSH_READ_CACHE;
-      BRW_BATCH_STRUCT(brw, &flush);
-   }
-
-   {
-      /* 0x61040000  Pipeline Select */
-      /*     PipelineSelect            : 0 */
-      struct brw_pipeline_select ps;
-
-      memset(&ps, 0, sizeof(ps));
-      ps.header.opcode = CMD_PIPELINE_SELECT;
-      ps.header.pipeline_select = 0;
-      BRW_BATCH_STRUCT(brw, &ps);
-   }
-
-   {
-      struct brw_global_depth_offset_clamp gdo;
-      memset(&gdo, 0, sizeof(gdo));
-
-      /* Disable depth offset clamping.
-       */
-      gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP;
-      gdo.header.length = sizeof(gdo)/4 - 2;
-      gdo.depth_offset_clamp = 0.0;
-
-      BRW_BATCH_STRUCT(brw, &gdo);
-   }
-
-
-   /* 0x61020000  State Instruction Pointer */
-   {
-      struct brw_system_instruction_pointer sip;
-      memset(&sip, 0, sizeof(sip));
-
-      sip.header.opcode = CMD_STATE_INSN_POINTER;
-      sip.header.length = 0;
-      sip.bits0.pad = 0;
-      sip.bits0.system_instruction_pointer = 0;
-      BRW_BATCH_STRUCT(brw, &sip);
-   }
-
-
-   {
-      struct brw_vf_statistics vfs;
-      memset(&vfs, 0, sizeof(vfs));
-
-      vfs.opcode = CMD_VF_STATISTICS;
-      if (BRW_DEBUG & DEBUG_STATS)
-	 vfs.statistics_enable = 1;
-
-      BRW_BATCH_STRUCT(brw, &vfs);
-   }
-
-   
-   {
-      struct brw_polygon_stipple_offset bpso;
-      
-      memset(&bpso, 0, sizeof(bpso));
-      bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
-      bpso.header.length = sizeof(bpso)/4-2;      
-      bpso.bits0.x_offset = 0;
-      bpso.bits0.y_offset = 0;
-
-      BRW_BATCH_STRUCT(brw, &bpso);
-   }
-}
-
-const struct brw_tracked_state brw_invarient_state = {
-   .dirty = {
-      .brw = BRW_NEW_SCENE,
-      .cache = 0
-   },
-   .update = upload_invarient_state
-};
-
-/**
- * Define the base addresses which some state is referenced from.
- *
- * This allows us to avoid having to emit relocations in many places for
- * cached state, and instead emit pointers inside of large, mostly-static
- * state pools.  This comes at the expense of memory, and more expensive cache
- * misses.
- */
-static void upload_state_base_address( struct brw_context *brw )
-{
-   /* Output the structure (brw_state_base_address) directly to the
-    * batchbuffer, so we can emit relocations inline.
-    */
-   BEGIN_BATCH(6, INTEL_BATCH_NO_CLIPRECTS);
-   OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
-   OUT_RELOC(brw->pool[BRW_GS_POOL].buffer,
-	     PIPE_BUFFER_USAGE_GPU_READ,
-	     1); /* General state base address */
-   OUT_RELOC(brw->pool[BRW_SS_POOL].buffer,
-	     PIPE_BUFFER_USAGE_GPU_READ,
-	     1); /* Surface state base address */
-   OUT_BATCH(1); /* Indirect object base address */
-   OUT_BATCH(1); /* General state upper bound */
-   OUT_BATCH(1); /* Indirect object upper bound */
-   ADVANCE_BATCH();
-}
-
-
-const struct brw_tracked_state brw_state_base_address = {
-   .dirty = {
-      .brw = BRW_NEW_SCENE,
-      .cache = 0
-   },
-   .update = upload_state_base_address
-};
diff --git a/src/gallium/drivers/i965simple/brw_reg.h b/src/gallium/drivers/i965simple/brw_reg.h
deleted file mode 100644
index 9e885c3b3b7..00000000000
--- a/src/gallium/drivers/i965simple/brw_reg.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#define CMD_MI				(0x0 << 29)
-#define CMD_2D				(0x2 << 29)
-#define CMD_3D				(0x3 << 29)
-
-#define MI_BATCH_BUFFER_END		(CMD_MI | 0xA << 23)
-
-/* Stalls command execution waiting for the given events to have occurred. */
-#define MI_WAIT_FOR_EVENT               (CMD_MI | (0x3 << 23))
-#define MI_WAIT_FOR_PLANE_B_FLIP        (1<<6)
-#define MI_WAIT_FOR_PLANE_A_FLIP        (1<<2)
-
-/* Primitive dispatch on 830-945 */
-#define _3DPRIMITIVE			(CMD_3D | (0x1f << 24))
-#define PRIM_INDIRECT            (1<<23)
-#define PRIM_INLINE              (0<<23)
-#define PRIM_INDIRECT_SEQUENTIAL (0<<17)
-#define PRIM_INDIRECT_ELTS       (1<<17)
-
-#define PRIM3D_TRILIST		(0x0<<18)
-#define PRIM3D_TRISTRIP 	(0x1<<18)
-#define PRIM3D_TRISTRIP_RVRSE	(0x2<<18)
-#define PRIM3D_TRIFAN		(0x3<<18)
-#define PRIM3D_POLY		(0x4<<18)
-#define PRIM3D_LINELIST 	(0x5<<18)
-#define PRIM3D_LINESTRIP	(0x6<<18)
-#define PRIM3D_RECTLIST 	(0x7<<18)
-#define PRIM3D_POINTLIST	(0x8<<18)
-#define PRIM3D_DIB		(0x9<<18)
-#define PRIM3D_MASK		(0x1f<<18)
-
-#define XY_SETUP_BLT_CMD		(CMD_2D | (0x01 << 22) | 6)
-
-#define XY_COLOR_BLT_CMD		(CMD_2D | (0x50 << 22) | 4)
-
-#define XY_SRC_COPY_BLT_CMD             (CMD_2D | (0x53 << 22) | 6)
-
-/* BR00 */
-#define XY_BLT_WRITE_ALPHA	(1 << 21)
-#define XY_BLT_WRITE_RGB	(1 << 20)
-#define XY_SRC_TILED		(1 << 15)
-#define XY_DST_TILED		(1 << 11)
-
-/* BR13 */
-#define BR13_565		(0x1 << 24)
-#define BR13_8888		(0x3 << 24)
-
-#define FENCE_LINEAR 0
-#define FENCE_XMAJOR 1
-#define FENCE_YMAJOR 2
diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c
deleted file mode 100644
index b22e105f106..00000000000
--- a/src/gallium/drivers/i965simple/brw_screen.c
+++ /dev/null
@@ -1,246 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#include "util/u_memory.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "util/u_string.h"
-#include "util/u_simple_screen.h"
-
-#include "brw_context.h"
-#include "brw_screen.h"
-#include "brw_tex_layout.h"
-
-
-static const char *
-brw_get_vendor( struct pipe_screen *screen )
-{
-   return "Tungsten Graphics, Inc.";
-}
-
-
-static const char *
-brw_get_name( struct pipe_screen *screen )
-{
-   static char buffer[128];
-   const char *chipset;
-
-   switch (brw_screen(screen)->pci_id) {
-   case PCI_CHIP_I965_Q:
-      chipset = "Intel(R) 965Q";
-      break;
-   case PCI_CHIP_I965_G:
-   case PCI_CHIP_I965_G_1:
-      chipset = "Intel(R) 965G";
-      break;
-   case PCI_CHIP_I965_GM:
-      chipset = "Intel(R) 965GM";
-      break;
-   case PCI_CHIP_I965_GME:
-      chipset = "Intel(R) 965GME/GLE";
-      break;
-   default:
-      chipset = "unknown";
-      break;
-   }
-
-   util_snprintf(buffer, sizeof(buffer), "i965 (chipset: %s)", chipset);
-   return buffer;
-}
-
-
-static int
-brw_get_param(struct pipe_screen *screen, int param)
-{
-   switch (param) {
-   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
-      return 8;
-   case PIPE_CAP_NPOT_TEXTURES:
-      return 1;
-   case PIPE_CAP_TWO_SIDED_STENCIL:
-      return 1;
-   case PIPE_CAP_GLSL:
-      return 0;
-   case PIPE_CAP_S3TC:
-      return 0;
-   case PIPE_CAP_ANISOTROPIC_FILTER:
-      return 0;
-   case PIPE_CAP_POINT_SPRITE:
-      return 0;
-   case PIPE_CAP_MAX_RENDER_TARGETS:
-      return 1;
-   case PIPE_CAP_OCCLUSION_QUERY:
-      return 0;
-   case PIPE_CAP_TEXTURE_SHADOW_MAP:
-      return 1;
-   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 11; /* max 1024x1024 */
-   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return 8;  /* max 128x128x128 */
-   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 11; /* max 1024x1024 */
-   default:
-      return 0;
-   }
-}
-
-
-static float
-brw_get_paramf(struct pipe_screen *screen, int param)
-{
-   switch (param) {
-   case PIPE_CAP_MAX_LINE_WIDTH:
-      /* fall-through */
-   case PIPE_CAP_MAX_LINE_WIDTH_AA:
-      return 7.5;
-
-   case PIPE_CAP_MAX_POINT_WIDTH:
-      /* fall-through */
-   case PIPE_CAP_MAX_POINT_WIDTH_AA:
-      return 255.0;
-
-   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
-      return 4.0;
-
-   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
-      return 16.0;
-
-   default:
-      return 0;
-   }
-}
-
-
-static boolean
-brw_is_format_supported( struct pipe_screen *screen,
-                         enum pipe_format format, 
-                         enum pipe_texture_target target,
-                         unsigned tex_usage, 
-                         unsigned geom_flags )
-{
-#if 0
-   /* XXX: This is broken -- rewrite if still needed. */
-   static const unsigned tex_supported[] = {
-      PIPE_FORMAT_R8G8B8A8_UNORM,
-      PIPE_FORMAT_A8R8G8B8_UNORM,
-      PIPE_FORMAT_R5G6B5_UNORM,
-      PIPE_FORMAT_L8_UNORM,
-      PIPE_FORMAT_A8_UNORM,
-      PIPE_FORMAT_I8_UNORM,
-      PIPE_FORMAT_L8A8_UNORM,
-      PIPE_FORMAT_YCBCR,
-      PIPE_FORMAT_YCBCR_REV,
-      PIPE_FORMAT_S8_Z24,
-   };
-
-
-   /* Actually a lot more than this - add later:
-    */
-   static const unsigned render_supported[] = {
-      PIPE_FORMAT_A8R8G8B8_UNORM,
-      PIPE_FORMAT_R5G6B5_UNORM,
-   };
-
-   /*
-    */
-   static const unsigned z_stencil_supported[] = {
-      PIPE_FORMAT_Z16_UNORM,
-      PIPE_FORMAT_Z32_UNORM,
-      PIPE_FORMAT_S8Z24_UNORM,
-   };
-
-   switch (type) {
-   case PIPE_RENDER_FORMAT:
-      *numFormats = Elements(render_supported);
-      return render_supported;
-
-   case PIPE_TEX_FORMAT:
-      *numFormats = Elements(tex_supported);
-      return render_supported;
-
-   case PIPE_Z_STENCIL_FORMAT:
-      *numFormats = Elements(render_supported);
-      return render_supported;
-
-   default:
-      *numFormats = 0;
-      return NULL;
-   }
-#else
-   switch (format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-   case PIPE_FORMAT_R5G6B5_UNORM:
-   case PIPE_FORMAT_S8Z24_UNORM:
-      return TRUE;
-   default:
-      return FALSE;
-   };
-   return FALSE;
-#endif
-}
-
-
-static void
-brw_destroy_screen( struct pipe_screen *screen )
-{
-   struct pipe_winsys *winsys = screen->winsys;
-
-   if(winsys->destroy)
-      winsys->destroy(winsys);
-
-   FREE(screen);
-}
-
-
-/**
- * Create a new brw_screen object
- */
-struct pipe_screen *
-brw_create_screen(struct pipe_winsys *winsys, uint pci_id)
-{
-   struct brw_screen *brwscreen = CALLOC_STRUCT(brw_screen);
-
-   if (!brwscreen)
-      return NULL;
-
-   brwscreen->pci_id = pci_id;
-
-   brwscreen->screen.winsys = winsys;
-
-   brwscreen->screen.destroy = brw_destroy_screen;
-
-   brwscreen->screen.get_name = brw_get_name;
-   brwscreen->screen.get_vendor = brw_get_vendor;
-   brwscreen->screen.get_param = brw_get_param;
-   brwscreen->screen.get_paramf = brw_get_paramf;
-   brwscreen->screen.is_format_supported = brw_is_format_supported;
-
-   brw_init_screen_texture_funcs(&brwscreen->screen);
-   u_simple_screen_init(&brwscreen->screen);
-
-   return &brwscreen->screen;
-}
diff --git a/src/gallium/drivers/i965simple/brw_sf.c b/src/gallium/drivers/i965simple/brw_sf.c
deleted file mode 100644
index b82a2e143ba..00000000000
--- a/src/gallium/drivers/i965simple/brw_sf.c
+++ /dev/null
@@ -1,351 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_sf.h"
-#include "brw_state.h"
-#include "tgsi/tgsi_parse.h"
-
-
-static void compile_sf_prog( struct brw_context *brw,
-			     struct brw_sf_prog_key *key )
-{
-   struct brw_sf_compile c;
-   const unsigned *program;
-   unsigned program_size;
-
-   memset(&c, 0, sizeof(c));
-
-   /* Begin the compilation:
-    */
-   brw_init_compile(&c.func);
-
-   c.key = *key;
-
-
-   c.nr_attrs = c.key.vp_output_count;
-   c.nr_attr_regs = (c.nr_attrs+1)/2;
-
-   c.nr_setup_attrs = c.key.fp_input_count + 1; /* +1 for position */
-   c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
-
-   c.prog_data.urb_read_length = c.nr_attr_regs;
-   c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
-
-
-   /* Which primitive?  Or all three?
-    */
-   switch (key->primitive) {
-   case SF_TRIANGLES:
-      c.nr_verts = 3;
-      brw_emit_tri_setup( &c );
-      break;
-   case SF_LINES:
-      c.nr_verts = 2;
-      brw_emit_line_setup( &c );
-      break;
-   case SF_POINTS:
-      c.nr_verts = 1;
-      brw_emit_point_setup( &c );
-      break;
-
-   case SF_UNFILLED_TRIS:
-   default:
-      assert(0);
-      return;
-   }
-
-
-
-   /* get the program
-    */
-   program = brw_get_program(&c.func, &program_size);
-
-   /* Upload
-    */
-   brw->sf.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_SF_PROG],
-					      &c.key,
-					      sizeof(c.key),
-					      program,
-					      program_size,
-					      &c.prog_data,
-					      &brw->sf.prog_data );
-}
-
-
-static boolean search_cache( struct brw_context *brw,
-			       struct brw_sf_prog_key *key )
-{
-   return brw_search_cache(&brw->cache[BRW_SF_PROG],
-			   key, sizeof(*key),
-			   &brw->sf.prog_data,
-			   &brw->sf.prog_gs_offset);
-}
-
-
-/* Calculate interpolants for triangle and line rasterization.
- */
-static void upload_sf_prog( struct brw_context *brw )
-{
-   const struct brw_fragment_program *fs = brw->attribs.FragmentProgram;
-   struct brw_sf_prog_key key;
-   struct tgsi_parse_context parse;
-   int i, done = 0;
-
-
-   memset(&key, 0, sizeof(key));
-
-   /* Populate the key, noting state dependencies:
-    */
-   /* CACHE_NEW_VS_PROG */
-   key.vp_output_count = brw->vs.prog_data->outputs_written;
-
-   /* BRW_NEW_FS */
-   key.fp_input_count = brw->attribs.FragmentProgram->info.file_max[TGSI_FILE_INPUT] + 1;
-
-
-   /* BRW_NEW_REDUCED_PRIMITIVE */
-   switch (brw->reduced_primitive) {
-   case PIPE_PRIM_TRIANGLES:
-//      if (key.attrs & (1<<VERT_RESULT_EDGE))
-//	 key.primitive = SF_UNFILLED_TRIS;
-//      else
-      key.primitive = SF_TRIANGLES;
-      break;
-   case PIPE_PRIM_LINES:
-      key.primitive = SF_LINES;
-      break;
-   case PIPE_PRIM_POINTS:
-      key.primitive = SF_POINTS;
-      break;
-   }
-
-
-
-   /* Scan fp inputs to figure out what interpolation modes are
-    * required for each incoming vp output.  There is an assumption
-    * that the state tracker makes sure there is a 1:1 linkage between
-    * these sets of attributes (XXX: position??)
-    */
-   tgsi_parse_init( &parse, fs->program.tokens );
-   while( !done &&
-	  !tgsi_parse_end_of_tokens( &parse ) ) 
-   {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-	 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_INPUT) 
-	 {
-	    int first = parse.FullToken.FullDeclaration.DeclarationRange.First;
-	    int last = parse.FullToken.FullDeclaration.DeclarationRange.Last;
-	    int interp_mode = parse.FullToken.FullDeclaration.Declaration.Interpolate;
-	    //int semantic = parse.FullToken.FullDeclaration.Semantic.SemanticName;
-	    //int semantic_index = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
-
-	    debug_printf("fs input %d..%d interp mode %d\n", first, last, interp_mode);
-	    
-	    switch (interp_mode) {
-	    case TGSI_INTERPOLATE_CONSTANT:
-	       for (i = first; i <= last; i++) 
-		  key.const_mask |= (1 << i);
-	       break;
-	    case TGSI_INTERPOLATE_LINEAR:
-	       for (i = first; i <= last; i++) 
-		  key.linear_mask |= (1 << i);
-	       break;
-	    case TGSI_INTERPOLATE_PERSPECTIVE:
-	       for (i = first; i <= last; i++) 
-		  key.persp_mask |= (1 << i);
-	       break;
-	    default:
-	       break;
-	    }
-
-	    /* Also need stuff for flat shading, twosided color.
-	     */
-
-	 }
-	 break;
-      default:
-	 done = 1;
-	 break;
-      }
-   }
-
-   /* Hack: Adjust for position.  Optimize away when not required (ie
-    * for perspective interpolation).
-    */
-   key.persp_mask <<= 1;
-   key.linear_mask <<= 1; 
-   key.linear_mask |= 1;
-   key.const_mask <<= 1;
-
-   debug_printf("key.persp_mask: %x\n", key.persp_mask);
-   debug_printf("key.linear_mask: %x\n", key.linear_mask);
-   debug_printf("key.const_mask: %x\n", key.const_mask);
-
-
-//   key.do_point_sprite = brw->attribs.Point->PointSprite;
-//   key.SpriteOrigin = brw->attribs.Point->SpriteOrigin;
-
-//   key.do_flat_shading = (brw->attribs.Raster->flatshade);
-//   key.do_twoside_color = (brw->attribs.Light->Enabled && brw->attribs.Light->Model.TwoSide);
-
-//   if (key.do_twoside_color)
-//      key.frontface_ccw = (brw->attribs.Polygon->FrontFace == GL_CCW);
-
-
-   if (!search_cache(brw, &key))
-      compile_sf_prog( brw, &key );
-}
-
-
-const struct brw_tracked_state brw_sf_prog = {
-   .dirty = {
-      .brw   = (BRW_NEW_RASTERIZER |
-		BRW_NEW_REDUCED_PRIMITIVE |
-		BRW_NEW_VS |
-		BRW_NEW_FS),
-      .cache = 0,
-   },
-   .update = upload_sf_prog
-};
-
-
-
-#if 0
-/* Build a struct like the one we'd like the state tracker to pass to
- * us.
- */
-static void update_sf_linkage( struct brw_context *brw )
-{
-   const struct brw_vertex_program *vs = brw->attribs.VertexProgram;
-   const struct brw_fragment_program *fs = brw->attribs.FragmentProgram;
-   struct pipe_setup_linkage state;
-   struct tgsi_parse_context parse;
-
-   int i, j;
-   int nr_vp_outputs = 0;
-   int done = 0;
-
-   struct { 
-      unsigned semantic:8;
-      unsigned semantic_index:16;
-   } fp_semantic[32], vp_semantic[32];
-
-   memset(&state, 0, sizeof(state));
-
-   state.fp_input_count = 0;
-
-
-
-   
-
-
-   assert(state.fp_input_count == fs->program.num_inputs);
-
-      
-   /* Then scan vp outputs
-    */
-   done = 0;
-   tgsi_parse_init( &parse, vs->program.tokens );
-   while( !done &&
-	  !tgsi_parse_end_of_tokens( &parse ) ) 
-   {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-	 if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_INPUT) 
-	 {
-	    int first = parse.FullToken.FullDeclaration.DeclarationRange.First;
-	    int last = parse.FullToken.FullDeclaration.DeclarationRange.Last;
-
-	    for (i = first; i < last; i++) {
-	       vp_semantic[i].semantic = 
-		  parse.FullToken.FullDeclaration.Semantic.SemanticName;
-	       vp_semantic[i].semantic_index = 
-		  parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
-	    }
-	    
-	    assert(last > nr_vp_outputs);
-	    nr_vp_outputs = last;
-	 }
-	 break;
-      default:
-	 done = 1;
-	 break;
-      }
-   }
-
-
-   /* Now match based on semantic information.
-    */
-   for (i = 0; i< state.fp_input_count; i++) {
-      for (j = 0; j < nr_vp_outputs; j++) {
-	 if (fp_semantic[i].semantic == vp_semantic[j].semantic &&
-	     fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) {
-	    state.fp_input[i].vp_output = j;
-	 }
-      }
-      if (fp_semantic[i].semantic == TGSI_SEMANTIC_COLOR) {
-	 for (j = 0; j < nr_vp_outputs; j++) {
-	    if (TGSI_SEMANTIC_BCOLOR == vp_semantic[j].semantic &&
-		fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) {
-	       state.fp_input[i].bf_vp_output = j;
-	    }
-	 }
-      }
-   }
-
-   if (memcmp(&brw->sf.linkage, &state, sizeof(state)) != 0) {
-      brw->sf.linkage = state;
-      brw->state.dirty.brw |= BRW_NEW_SF_LINKAGE;
-   }
-}
-
-
-const struct brw_tracked_state brw_sf_linkage = {
-   .dirty = {
-      .brw   = (BRW_NEW_VS |
-		BRW_NEW_FS),
-      .cache = 0,
-   },
-   .update = update_sf_linkage
-};
-
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_sf.h b/src/gallium/drivers/i965simple/brw_sf.h
deleted file mode 100644
index b7ada475604..00000000000
--- a/src/gallium/drivers/i965simple/brw_sf.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#ifndef BRW_SF_H
-#define BRW_SF_H
-
-#include "brw_context.h"
-#include "brw_eu.h"
-
-
-#define SF_POINTS    0
-#define SF_LINES     1
-#define SF_TRIANGLES 2
-#define SF_UNFILLED_TRIS   3
-
-
-
-struct brw_sf_prog_key {
-   unsigned vp_output_count:5;
-   unsigned fp_input_count:5;
-
-   unsigned primitive:2;
-   unsigned do_twoside_color:1;
-   unsigned do_flat_shading:1;
-   unsigned frontface_ccw:1;
-   unsigned do_point_sprite:1;
-
-   /* Interpolation masks;
-    */
-   unsigned linear_mask;
-   unsigned persp_mask;
-   unsigned const_mask;
-
-
-//   int SpriteOrigin;
-};
-
-struct brw_sf_point_tex {
-	boolean CoordReplace;
-};
-
-struct brw_sf_compile {
-   struct brw_compile func;
-   struct brw_sf_prog_key key;
-   struct brw_sf_prog_data prog_data;
-
-   struct brw_reg pv;
-   struct brw_reg det;
-   struct brw_reg dx0;
-   struct brw_reg dx2;
-   struct brw_reg dy0;
-   struct brw_reg dy2;
-
-   /* z and 1/w passed in seperately:
-    */
-   struct brw_reg z[3];
-   struct brw_reg inv_w[3];
-
-   /* The vertices:
-    */
-   struct brw_reg vert[3];
-
-    /* Temporaries, allocated after last vertex reg.
-    */
-   struct brw_reg inv_det;
-   struct brw_reg a1_sub_a0;
-   struct brw_reg a2_sub_a0;
-   struct brw_reg tmp;
-
-   struct brw_reg m1Cx;
-   struct brw_reg m2Cy;
-   struct brw_reg m3C0;
-
-   unsigned nr_verts;
-   unsigned nr_attrs;
-   unsigned nr_attr_regs;
-   unsigned nr_setup_attrs;
-   unsigned nr_setup_regs;
-#if 0
-   ubyte attr_to_idx[VERT_RESULT_MAX];
-   ubyte idx_to_attr[VERT_RESULT_MAX];
-   struct brw_sf_point_tex point_attrs[VERT_RESULT_MAX];
-#endif
-};
-
-
-void brw_emit_tri_setup( struct brw_sf_compile *c );
-void brw_emit_line_setup( struct brw_sf_compile *c );
-void brw_emit_point_setup( struct brw_sf_compile *c );
-void brw_emit_point_sprite_setup( struct brw_sf_compile *c );
-void brw_emit_anyprim_setup( struct brw_sf_compile *c );
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_sf_emit.c b/src/gallium/drivers/i965simple/brw_sf_emit.c
deleted file mode 100644
index 78d6fa5e9e5..00000000000
--- a/src/gallium/drivers/i965simple/brw_sf_emit.c
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_defines.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_util.h"
-#include "brw_sf.h"
-
-
-
-/***********************************************************************
- * Triangle setup.
- */
-
-
-static void alloc_regs( struct brw_sf_compile *c )
-{
-   unsigned reg, i;
-
-   /* Values computed by fixed function unit:
-    */
-   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD);
-   c->det = brw_vec1_grf(1, 2);
-   c->dx0 = brw_vec1_grf(1, 3);
-   c->dx2 = brw_vec1_grf(1, 4);
-   c->dy0 = brw_vec1_grf(1, 5);
-   c->dy2 = brw_vec1_grf(1, 6);
-
-   /* z and 1/w passed in seperately:
-    */
-   c->z[0]     = brw_vec1_grf(2, 0);
-   c->inv_w[0] = brw_vec1_grf(2, 1);
-   c->z[1]     = brw_vec1_grf(2, 2);
-   c->inv_w[1] = brw_vec1_grf(2, 3);
-   c->z[2]     = brw_vec1_grf(2, 4);
-   c->inv_w[2] = brw_vec1_grf(2, 5);
-
-   /* The vertices:
-    */
-   reg = 3;
-   for (i = 0; i < c->nr_verts; i++) {
-      c->vert[i] = brw_vec8_grf(reg, 0);
-      reg += c->nr_attr_regs;
-   }
-
-   /* Temporaries, allocated after last vertex reg.
-    */
-   c->inv_det = brw_vec1_grf(reg, 0);  reg++;
-   c->a1_sub_a0 = brw_vec8_grf(reg, 0);  reg++;
-   c->a2_sub_a0 = brw_vec8_grf(reg, 0);  reg++;
-   c->tmp = brw_vec8_grf(reg, 0);  reg++;
-
-   /* Note grf allocation:
-    */
-   c->prog_data.total_grf = reg;
-
-
-   /* Outputs of this program - interpolation coefficients for
-    * rasterization:
-    */
-   c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0);
-   c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0);
-   c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0);
-}
-
-
-static void copy_z_inv_w( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   unsigned i;
-
-   brw_push_insn_state(p);
-
-   /* Copy both scalars with a single MOV:
-    */
-   for (i = 0; i < c->nr_verts; i++)
-      brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i]));
-
-   brw_pop_insn_state(p);
-}
-
-
-static void invert_det( struct brw_sf_compile *c)
-{
-   brw_math(&c->func,
-	    c->inv_det,
-	    BRW_MATH_FUNCTION_INV,
-	    BRW_MATH_SATURATE_NONE,
-	    0,
-	    c->det,
-	    BRW_MATH_DATA_SCALAR,
-	    BRW_MATH_PRECISION_FULL);
-
-}
-
-#define NON_PERPECTIVE_ATTRS  (FRAG_BIT_WPOS | \
-                               FRAG_BIT_COL0 | \
-			       FRAG_BIT_COL1)
-
-static boolean calculate_masks( struct brw_sf_compile *c,
-				  unsigned reg,
-				  ushort *pc,
-				  ushort *pc_persp,
-				  ushort *pc_linear)
-{
-   boolean is_last_attr = (reg == c->nr_setup_regs - 1);
-   unsigned persp_mask = c->key.persp_mask;
-   unsigned linear_mask = c->key.linear_mask;
-
-   debug_printf("persp_mask: %x\n", persp_mask);
-   debug_printf("linear_mask: %x\n", linear_mask);
-
-   *pc_persp = 0;
-   *pc_linear = 0;
-   *pc = 0xf;
-
-   if (persp_mask & (1 << (reg*2)))
-      *pc_persp = 0xf;
-
-   if (linear_mask & (1 << (reg*2)))
-      *pc_linear = 0xf;
-
-   /* Maybe only processs one attribute on the final round:
-    */
-   if (reg*2+1 < c->nr_setup_attrs) {
-      *pc |= 0xf0;
-
-      if (persp_mask & (1 << (reg*2+1)))
-	 *pc_persp |= 0xf0;
-
-      if (linear_mask & (1 << (reg*2+1)))
-	 *pc_linear |= 0xf0;
-   }
-
-   debug_printf("pc: %x\n", *pc);
-   debug_printf("pc_persp: %x\n", *pc_persp);
-   debug_printf("pc_linear: %x\n", *pc_linear);
-   
-
-   return is_last_attr;
-}
-
-
-
-void brw_emit_tri_setup( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   unsigned i;
-
-   debug_printf("%s START ==============\n", __FUNCTION__);
-
-   c->nr_verts = 3;
-   alloc_regs(c);
-   invert_det(c);
-   copy_z_inv_w(c);
-
-
-   for (i = 0; i < c->nr_setup_regs; i++)
-   {
-      /* Pair of incoming attributes:
-       */
-      struct brw_reg a0 = offset(c->vert[0], i);
-      struct brw_reg a1 = offset(c->vert[1], i);
-      struct brw_reg a2 = offset(c->vert[2], i);
-      ushort pc = 0, pc_persp = 0, pc_linear = 0;
-      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
-
-      if (pc_persp)
-      {
-	 brw_set_predicate_control_flag_value(p, pc_persp);
-	 brw_MUL(p, a0, a0, c->inv_w[0]);
-	 brw_MUL(p, a1, a1, c->inv_w[1]);
-	 brw_MUL(p, a2, a2, c->inv_w[2]);
-      }
-
-
-      /* Calculate coefficients for interpolated values:
-       */
-      if (pc_linear)
-      {
-	 brw_set_predicate_control_flag_value(p, pc_linear);
-
-	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
-	 brw_ADD(p, c->a2_sub_a0, a2, negate(a0));
-
-	 /* calculate dA/dx
-	  */
-	 brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2);
-	 brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0));
-	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
-
-	 /* calculate dA/dy
-	  */
-	 brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0);
-	 brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2));
-	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
-      }
-
-      {
-	 brw_set_predicate_control_flag_value(p, pc);
-	 /* start point for interpolation
-	  */
-	 brw_MOV(p, c->m3C0, a0);
-
-	 /* Copy m0..m3 to URB.  m0 is implicitly copied from r0 in
-	  * the send instruction:
-	  */
-	 brw_urb_WRITE(p,
-		       brw_null_reg(),
-		       0,
-		       brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
-		       0, 	/* allocate */
-		       1,	/* used */
-		       4, 	/* msg len */
-		       0,	/* response len */
-		       last,	/* eot */
-		       last, 	/* writes complete */
-		       i*4,	/* offset */
-		       BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */
-      }
-   }
-
-   debug_printf("%s DONE ==============\n", __FUNCTION__);
-
-}
-
-
-
-void brw_emit_line_setup( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   unsigned i;
-
-
-   c->nr_verts = 2;
-   alloc_regs(c);
-   invert_det(c);
-   copy_z_inv_w(c);
-
-   for (i = 0; i < c->nr_setup_regs; i++)
-   {
-      /* Pair of incoming attributes:
-       */
-      struct brw_reg a0 = offset(c->vert[0], i);
-      struct brw_reg a1 = offset(c->vert[1], i);
-      ushort pc, pc_persp, pc_linear;
-      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
-
-      if (pc_persp)
-      {
-	 brw_set_predicate_control_flag_value(p, pc_persp);
-	 brw_MUL(p, a0, a0, c->inv_w[0]);
-	 brw_MUL(p, a1, a1, c->inv_w[1]);
-      }
-
-      /* Calculate coefficients for position, color:
-       */
-      if (pc_linear) {
-	 brw_set_predicate_control_flag_value(p, pc_linear);
-
-	 brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
-
- 	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
-	 brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
-
-	 brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
-	 brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
-      }
-
-      {
-	 brw_set_predicate_control_flag_value(p, pc);
-
-	 /* start point for interpolation
-	  */
-	 brw_MOV(p, c->m3C0, a0);
-
-	 /* Copy m0..m3 to URB.
-	  */
-	 brw_urb_WRITE(p,
-		       brw_null_reg(),
-		       0,
-		       brw_vec8_grf(0, 0),
-		       0, 	/* allocate */
-		       1, 	/* used */
-		       4, 	/* msg len */
-		       0,	/* response len */
-		       last, 	/* eot */
-		       last, 	/* writes complete */
-		       i*4,	/* urb destination offset */
-		       BRW_URB_SWIZZLE_TRANSPOSE);
-      }
-   }
-}
-
-
-/* Points setup - several simplifications as all attributes are
- * constant across the face of the point (point sprites excluded!)
- */
-void brw_emit_point_setup( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   unsigned i;
-
-   c->nr_verts = 1;
-   alloc_regs(c);
-   copy_z_inv_w(c);
-
-   brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */
-   brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */
-
-   for (i = 0; i < c->nr_setup_regs; i++)
-   {
-      struct brw_reg a0 = offset(c->vert[0], i);
-      ushort pc, pc_persp, pc_linear;
-      boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
-
-      if (pc_persp)
-      {
-	 /* This seems odd as the values are all constant, but the
-	  * fragment shader will be expecting it:
-	  */
-	 brw_set_predicate_control_flag_value(p, pc_persp);
-	 brw_MUL(p, a0, a0, c->inv_w[0]);
-      }
-
-
-      /* The delta values are always zero, just send the starting
-       * coordinate.  Again, this is to fit in with the interpolation
-       * code in the fragment shader.
-       */
-      {
-	 brw_set_predicate_control_flag_value(p, pc);
-
-	 brw_MOV(p, c->m3C0, a0); /* constant value */
-
-	 /* Copy m0..m3 to URB.
-	  */
-	 brw_urb_WRITE(p,
-		       brw_null_reg(),
-		       0,
-		       brw_vec8_grf(0, 0),
-		       0, 	/* allocate */
-		       1,	/* used */
-		       4, 	/* msg len */
-		       0,	/* response len */
-		       last, 	/* eot */
-		       last, 	/* writes complete */
-		       i*4,	/* urb destination offset */
-		       BRW_URB_SWIZZLE_TRANSPOSE);
-      }
-   }
-}
diff --git a/src/gallium/drivers/i965simple/brw_sf_state.c b/src/gallium/drivers/i965simple/brw_sf_state.c
deleted file mode 100644
index 2a5de61c219..00000000000
--- a/src/gallium/drivers/i965simple/brw_sf_state.c
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-
-static void upload_sf_vp(struct brw_context *brw)
-{
-   struct brw_sf_viewport sfv;
-
-   memset(&sfv, 0, sizeof(sfv));
-
-
-   /* BRW_NEW_VIEWPORT */
-   {
-      const float *scale = brw->attribs.Viewport.scale;
-      const float *trans = brw->attribs.Viewport.translate;
-
-      sfv.viewport.m00 = scale[0];
-      sfv.viewport.m11 = scale[1];
-      sfv.viewport.m22 = scale[2]; 
-      sfv.viewport.m30 = trans[0];
-      sfv.viewport.m31 = trans[1];
-      sfv.viewport.m32 = trans[2];
-   }
-
-   /* _NEW_SCISSOR */
-   sfv.scissor.xmin = brw->attribs.Scissor.minx;
-   sfv.scissor.xmax = brw->attribs.Scissor.maxx - 1;
-   sfv.scissor.ymin = brw->attribs.Scissor.miny;
-   sfv.scissor.ymax = brw->attribs.Scissor.maxy - 1;
-
-   brw->sf.vp_gs_offset = brw_cache_data( &brw->cache[BRW_SF_VP], &sfv );
-}
-
-const struct brw_tracked_state brw_sf_vp = {
-   .dirty = {
-      .brw   = (BRW_NEW_SCISSOR |
-		BRW_NEW_VIEWPORT),
-      .cache = 0
-   },
-   .update = upload_sf_vp
-};
-
-static void upload_sf_unit( struct brw_context *brw )
-{
-   struct brw_sf_unit_state sf;
-   memset(&sf, 0, sizeof(sf));
-
-   /* CACHE_NEW_SF_PROG */
-   sf.thread0.grf_reg_count = align(brw->sf.prog_data->total_grf, 16) / 16 - 1;
-   sf.thread0.kernel_start_pointer = brw->sf.prog_gs_offset >> 6;
-   sf.thread3.urb_entry_read_length = brw->sf.prog_data->urb_read_length;
-
-   sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   sf.thread3.dispatch_grf_start_reg = 3;
-   sf.thread3.urb_entry_read_offset = 1;
-
-   /* BRW_NEW_URB_FENCE */
-   sf.thread4.nr_urb_entries = brw->urb.nr_sf_entries;
-   sf.thread4.urb_entry_allocation_size = brw->urb.sfsize - 1;
-   sf.thread4.max_threads = MIN2(12, brw->urb.nr_sf_entries / 2) - 1;
-
-   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
-      sf.thread4.max_threads = 0;
-
-   if (BRW_DEBUG & DEBUG_STATS)
-      sf.thread4.stats_enable = 1;
-
-   /* CACHE_NEW_SF_VP */
-   sf.sf5.sf_viewport_state_offset = brw->sf.vp_gs_offset >> 5;
-   sf.sf5.viewport_transform = 1;
-
-   /* BRW_NEW_RASTER */
-   if (brw->attribs.Raster->scissor)
-      sf.sf6.scissor = 1;
-
-#if 0
-   if (brw->attribs.Polygon->FrontFace == GL_CCW)
-      sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
-   else
-      sf.sf5.front_winding = BRW_FRONTWINDING_CW;
-
-
-   if (brw->attribs.Polygon->CullFlag) {
-      switch (brw->attribs.Polygon->CullFaceMode) {
-      case GL_FRONT:
-	 sf.sf6.cull_mode = BRW_CULLMODE_FRONT;
-	 break;
-      case GL_BACK:
-	 sf.sf6.cull_mode = BRW_CULLMODE_BACK;
-	 break;
-      case GL_FRONT_AND_BACK:
-	 sf.sf6.cull_mode = BRW_CULLMODE_BOTH;
-	 break;
-      default:
-	 assert(0);
-	 break;
-      }
-   }
-   else
-      sf.sf6.cull_mode = BRW_CULLMODE_NONE;
-#else
-   sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
-   sf.sf6.cull_mode = BRW_CULLMODE_NONE;
-#endif
-
-   sf.sf6.line_width = CLAMP(brw->attribs.Raster->line_width, 1.0, 5.0) * (1<<1);
-
-   sf.sf6.line_endcap_aa_region_width = 1;
-   if (brw->attribs.Raster->line_smooth)
-      sf.sf6.aa_enable = 1;
-   else if (sf.sf6.line_width <= 0x2)
-       sf.sf6.line_width = 0;
-
-   sf.sf6.point_rast_rule = 1;	/* opengl conventions */
-
-   sf.sf7.sprite_point = brw->attribs.Raster->point_sprite;
-   sf.sf7.point_size = CLAMP(brw->attribs.Raster->line_width, 1.0, 255.0) * (1<<3);
-   sf.sf7.use_point_size_state = !brw->attribs.Raster->point_size_per_vertex;
-
-   /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
-    */
-   sf.sf7.trifan_pv = 2;
-   sf.sf7.linestrip_pv = 1;
-   sf.sf7.tristrip_pv = 2;
-   sf.sf7.line_last_pixel_enable = 0;
-
-   /* Set bias for OpenGL rasterization rules:
-    */
-   sf.sf6.dest_org_vbias = 0x8;
-   sf.sf6.dest_org_hbias = 0x8;
-
-   brw->sf.state_gs_offset = brw_cache_data( &brw->cache[BRW_SF_UNIT], &sf );
-}
-
-
-const struct brw_tracked_state brw_sf_unit = {
-   .dirty = {
-      .brw   = (BRW_NEW_RASTERIZER |
-		BRW_NEW_URB_FENCE),
-      .cache = (CACHE_NEW_SF_VP |
-		CACHE_NEW_SF_PROG)
-   },
-   .update = upload_sf_unit
-};
-
-
diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c
deleted file mode 100644
index 86d877d7efd..00000000000
--- a/src/gallium/drivers/i965simple/brw_shader_info.c
+++ /dev/null
@@ -1,48 +0,0 @@
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "util/u_memory.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/tgsi_parse.h"
-
-
-/**
- * XXX this obsolete new and no longer compiled.
- */
-void brw_shader_info(const struct tgsi_token *tokens,
-		     struct brw_shader_info *info )
-{
-   struct tgsi_parse_context parse;
-   int done = 0;
-
-   tgsi_parse_init( &parse, tokens );
-
-   while( !done &&
-	  !tgsi_parse_end_of_tokens( &parse ) ) 
-   {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-      {
-	 const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
-	 unsigned last = decl->DeclarationRange.Last;
-      
-	 // Broken by crazy wpos init:
-	 //assert( info->nr_regs[decl->Declaration.File] <= last);
-
-	 info->nr_regs[decl->Declaration.File] = MAX2(info->nr_regs[decl->Declaration.File],
-						      last+1);
-	 break;
-      }
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-      default:
-	 done = 1;
-	 break;
-      }
-   }
-
-   tgsi_parse_free (&parse);
-   
-}
diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c
deleted file mode 100644
index b47f5373f3c..00000000000
--- a/src/gallium/drivers/i965simple/brw_state.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/* Authors:  Zack Rusin <zack@tungstengraphics.com>
- *           Keith Whitwell <keith@tungstengraphics.com>
- */
-
-
-#include "pipe/internal/p_winsys_screen.h"
-#include "util/u_memory.h"
-#include "pipe/p_inlines.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_parse.h"
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_state.h"
-#include "brw_draw.h"
-
-
-#define DUP( TYPE, VAL )                        \
-do {                                            \
-   struct TYPE *x = malloc(sizeof(*x));         \
-   memcpy(x, VAL, sizeof(*x) );                 \
-   return x;                                    \
-} while (0)
-
-/************************************************************************
- * Blend 
- */
-static void *
-brw_create_blend_state(struct pipe_context *pipe,
-                        const struct pipe_blend_state *blend)
-{   
-   DUP( pipe_blend_state, blend );
-}
-
-static void brw_bind_blend_state(struct pipe_context *pipe,
-                                 void *blend)
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   brw->attribs.Blend = (struct pipe_blend_state*)blend;
-   brw->state.dirty.brw |= BRW_NEW_BLEND;
-}
-
-
-static void brw_delete_blend_state(struct pipe_context *pipe, void *blend)
-{
-   free(blend);
-}
-
-static void brw_set_blend_color( struct pipe_context *pipe,
-			     const struct pipe_blend_color *blend_color )
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   brw->attribs.BlendColor = *blend_color;
-
-   brw->state.dirty.brw |= BRW_NEW_BLEND;
-}
-
-/************************************************************************
- * Sampler 
- */
-
-static void *
-brw_create_sampler_state(struct pipe_context *pipe,
-                          const struct pipe_sampler_state *sampler)
-{
-   DUP( pipe_sampler_state, sampler );
-}
-
-static void brw_bind_sampler_states(struct pipe_context *pipe,
-                                    unsigned num, void **sampler)
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   assert(num <= PIPE_MAX_SAMPLERS);
-
-   /* Check for no-op */
-   if (num == brw->num_samplers &&
-       !memcmp(brw->attribs.Samplers, sampler, num * sizeof(void *)))
-      return;
-
-   memcpy(brw->attribs.Samplers, sampler, num * sizeof(void *));
-   memset(&brw->attribs.Samplers[num], 0, (PIPE_MAX_SAMPLERS - num) *
-          sizeof(void *));
-
-   brw->num_samplers = num;
-
-   brw->state.dirty.brw |= BRW_NEW_SAMPLER;
-}
-
-static void brw_delete_sampler_state(struct pipe_context *pipe,
-                                      void *sampler)
-{
-   free(sampler);
-}
-
-
-/************************************************************************
- * Depth stencil 
- */
-
-static void *
-brw_create_depth_stencil_state(struct pipe_context *pipe,
-                           const struct pipe_depth_stencil_alpha_state *depth_stencil)
-{
-   DUP( pipe_depth_stencil_alpha_state, depth_stencil );
-}
-
-static void brw_bind_depth_stencil_state(struct pipe_context *pipe,
-                                         void *depth_stencil)
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   brw->attribs.DepthStencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil;
-
-   brw->state.dirty.brw |= BRW_NEW_DEPTH_STENCIL;
-}
-
-static void brw_delete_depth_stencil_state(struct pipe_context *pipe,
-                                           void *depth_stencil)
-{
-   free(depth_stencil);
-}
-
-/************************************************************************
- * Scissor
- */
-static void brw_set_scissor_state( struct pipe_context *pipe,
-                                 const struct pipe_scissor_state *scissor )
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   memcpy( &brw->attribs.Scissor, scissor, sizeof(*scissor) );
-   brw->state.dirty.brw |= BRW_NEW_SCISSOR;
-}
-
-
-/************************************************************************
- * Stipple
- */
-
-static void brw_set_polygon_stipple( struct pipe_context *pipe,
-                                   const struct pipe_poly_stipple *stipple )
-{
-}
-
-
-/************************************************************************
- * Fragment shader
- */
-
-static void * brw_create_fs_state(struct pipe_context *pipe,
-                                   const struct pipe_shader_state *shader)
-{
-   struct brw_fragment_program *brw_fp = CALLOC_STRUCT(brw_fragment_program);
-
-   brw_fp->program.tokens = tgsi_dup_tokens(shader->tokens);
-   brw_fp->id = brw_context(pipe)->program_id++;
-
-   tgsi_scan_shader(shader->tokens, &brw_fp->info);
-
-#if 0
-   brw_shader_info(shader->tokens,
-		   &brw_fp->info2);
-#endif
-
-   tgsi_dump(shader->tokens, 0);
-
-
-   return (void *)brw_fp;
-}
-
-static void brw_bind_fs_state(struct pipe_context *pipe, void *shader)
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   brw->attribs.FragmentProgram = (struct brw_fragment_program *)shader;
-   brw->state.dirty.brw |= BRW_NEW_FS;
-}
-
-static void brw_delete_fs_state(struct pipe_context *pipe, void *shader)
-{
-   struct brw_fragment_program *brw_fp = (struct brw_fragment_program *) shader;
-
-   FREE((void *) brw_fp->program.tokens);
-   FREE(brw_fp);
-}
-
-
-/************************************************************************
- * Vertex shader and other TNL state 
- */
-
-static void *brw_create_vs_state(struct pipe_context *pipe,
-                                 const struct pipe_shader_state *shader)
-{
-   struct brw_vertex_program *brw_vp = CALLOC_STRUCT(brw_vertex_program);
-
-   brw_vp->program.tokens = tgsi_dup_tokens(shader->tokens);
-   brw_vp->id = brw_context(pipe)->program_id++;
-
-   tgsi_scan_shader(shader->tokens, &brw_vp->info);
-
-#if 0
-   brw_shader_info(shader->tokens,
-		   &brw_vp->info2);
-#endif
-   tgsi_dump(shader->tokens, 0);
-
-   return (void *)brw_vp;
-}
-
-static void brw_bind_vs_state(struct pipe_context *pipe, void *vs)
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   brw->attribs.VertexProgram = (struct brw_vertex_program *)vs;
-   brw->state.dirty.brw |= BRW_NEW_VS;
-
-   debug_printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n");
-}
-
-static void brw_delete_vs_state(struct pipe_context *pipe, void *shader)
-{
-   struct brw_vertex_program *brw_vp = (struct brw_vertex_program *) shader;
-
-   FREE((void *) brw_vp->program.tokens);
-   FREE(brw_vp);
-}
-
-
-static void brw_set_clip_state( struct pipe_context *pipe,
-                                const struct pipe_clip_state *clip )
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   brw->attribs.Clip = *clip;
-}
-
-
-static void brw_set_viewport_state( struct pipe_context *pipe,
-				     const struct pipe_viewport_state *viewport )
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   brw->attribs.Viewport = *viewport; /* struct copy */
-   brw->state.dirty.brw |= BRW_NEW_VIEWPORT;
-
-   /* pass the viewport info to the draw module */
-   //draw_set_viewport_state(brw->draw, viewport);
-}
-
-
-static void brw_set_vertex_buffers(struct pipe_context *pipe,
-				   unsigned count,
-				   const struct pipe_vertex_buffer *buffers)
-{
-   struct brw_context *brw = brw_context(pipe);
-   memcpy(brw->vb.vbo_array, buffers, count * sizeof(buffers[0]));
-}
-
-static void brw_set_vertex_elements(struct pipe_context *pipe,
-                                    unsigned count,
-                                    const struct pipe_vertex_element *elements)
-{
-   /* flush ? */
-   struct brw_context *brw = brw_context(pipe);
-   uint i;
-
-   assert(count <= PIPE_MAX_ATTRIBS);
-
-   for (i = 0; i < count; i++) {
-      struct brw_vertex_element_state el;
-      memset(&el, 0, sizeof(el));
-
-      el.ve0.src_offset = elements[i].src_offset;
-      el.ve0.src_format = brw_translate_surface_format(elements[i].src_format);
-      el.ve0.valid = 1;
-      el.ve0.vertex_buffer_index = elements[i].vertex_buffer_index;
-
-      el.ve1.dst_offset   = i * 4;
-
-      el.ve1.vfcomponent3 = BRW_VFCOMPONENT_STORE_SRC;
-      el.ve1.vfcomponent2 = BRW_VFCOMPONENT_STORE_SRC;
-      el.ve1.vfcomponent1 = BRW_VFCOMPONENT_STORE_SRC;
-      el.ve1.vfcomponent0 = BRW_VFCOMPONENT_STORE_SRC;
-
-      switch (elements[i].nr_components) {
-      case 1: el.ve1.vfcomponent1 = BRW_VFCOMPONENT_STORE_0;
-      case 2: el.ve1.vfcomponent2 = BRW_VFCOMPONENT_STORE_0;
-      case 3: el.ve1.vfcomponent3 = BRW_VFCOMPONENT_STORE_1_FLT;
-         break;
-      }
-
-      brw->vb.inputs[i] = el;
-   }
-}
-
-
-
-/************************************************************************
- * Constant buffers
- */
-
-static void brw_set_constant_buffer(struct pipe_context *pipe,
-                                     uint shader, uint index,
-                                     const struct pipe_constant_buffer *buf)
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   assert(buf == 0 || index == 0);
-
-   brw->attribs.Constants[shader] = buf;
-   brw->state.dirty.brw |= BRW_NEW_CONSTANTS;
-}
-
-
-/************************************************************************
- * Texture surfaces
- */
-
-
-static void brw_set_sampler_textures(struct pipe_context *pipe,
-                                     unsigned num,
-                                     struct pipe_texture **texture)
-{
-   struct brw_context *brw = brw_context(pipe);
-   uint i;
-
-   assert(num <= PIPE_MAX_SAMPLERS);
-
-   /* Check for no-op */
-   if (num == brw->num_textures &&
-       !memcmp(brw->attribs.Texture, texture, num *
-               sizeof(struct pipe_texture *)))
-      return;
-
-   for (i = 0; i < num; i++)
-      pipe_texture_reference((struct pipe_texture **) &brw->attribs.Texture[i],
-                             texture[i]);
-
-   for (i = num; i < brw->num_textures; i++)
-      pipe_texture_reference((struct pipe_texture **) &brw->attribs.Texture[i],
-                             NULL);
-
-   brw->num_textures = num;
-
-   brw->state.dirty.brw |= BRW_NEW_TEXTURE;
-}
-
-
-/************************************************************************
- * Render targets, etc
- */
-
-static void brw_set_framebuffer_state(struct pipe_context *pipe,
-				       const struct pipe_framebuffer_state *fb)
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   brw->attribs.FrameBuffer = *fb; /* struct copy */
-
-   brw->state.dirty.brw |= BRW_NEW_FRAMEBUFFER;
-}
-
-
-
-/************************************************************************
- * Rasterizer state
- */
-
-static void *
-brw_create_rasterizer_state(struct pipe_context *pipe,
-                             const struct pipe_rasterizer_state *rasterizer)
-{
-   DUP(pipe_rasterizer_state, rasterizer);
-}
-
-static void brw_bind_rasterizer_state( struct pipe_context *pipe,
-                                        void *setup )
-{
-   struct brw_context *brw = brw_context(pipe);
-
-   brw->attribs.Raster = (struct pipe_rasterizer_state *)setup;
-
-   /* Also pass-through to draw module:
-    */
-   //draw_set_rasterizer_state(brw->draw, setup);
-
-   brw->state.dirty.brw |= BRW_NEW_RASTERIZER;
-}
-
-static void brw_delete_rasterizer_state(struct pipe_context *pipe,
-                                         void *setup)
-{
-   free(setup);
-}
-
-
-
-void
-brw_init_state_functions( struct brw_context *brw )
-{
-   brw->pipe.create_blend_state = brw_create_blend_state;
-   brw->pipe.bind_blend_state = brw_bind_blend_state;
-   brw->pipe.delete_blend_state = brw_delete_blend_state;
-
-   brw->pipe.create_sampler_state = brw_create_sampler_state;
-   brw->pipe.bind_sampler_states = brw_bind_sampler_states;
-   brw->pipe.delete_sampler_state = brw_delete_sampler_state;
-
-   brw->pipe.create_depth_stencil_alpha_state = brw_create_depth_stencil_state;
-   brw->pipe.bind_depth_stencil_alpha_state = brw_bind_depth_stencil_state;
-   brw->pipe.delete_depth_stencil_alpha_state = brw_delete_depth_stencil_state;
-
-   brw->pipe.create_rasterizer_state = brw_create_rasterizer_state;
-   brw->pipe.bind_rasterizer_state = brw_bind_rasterizer_state;
-   brw->pipe.delete_rasterizer_state = brw_delete_rasterizer_state;
-   brw->pipe.create_fs_state = brw_create_fs_state;
-   brw->pipe.bind_fs_state = brw_bind_fs_state;
-   brw->pipe.delete_fs_state = brw_delete_fs_state;
-   brw->pipe.create_vs_state = brw_create_vs_state;
-   brw->pipe.bind_vs_state = brw_bind_vs_state;
-   brw->pipe.delete_vs_state = brw_delete_vs_state;
-
-   brw->pipe.set_blend_color = brw_set_blend_color;
-   brw->pipe.set_clip_state = brw_set_clip_state;
-   brw->pipe.set_constant_buffer = brw_set_constant_buffer;
-   brw->pipe.set_framebuffer_state = brw_set_framebuffer_state;
-
-//   brw->pipe.set_feedback_state = brw_set_feedback_state;
-//   brw->pipe.set_feedback_buffer = brw_set_feedback_buffer;
-
-   brw->pipe.set_polygon_stipple = brw_set_polygon_stipple;
-   brw->pipe.set_scissor_state = brw_set_scissor_state;
-   brw->pipe.set_sampler_textures = brw_set_sampler_textures;
-   brw->pipe.set_viewport_state = brw_set_viewport_state;
-   brw->pipe.set_vertex_buffers = brw_set_vertex_buffers;
-   brw->pipe.set_vertex_elements = brw_set_vertex_elements;
-}
diff --git a/src/gallium/drivers/i965simple/brw_state.h b/src/gallium/drivers/i965simple/brw_state.h
deleted file mode 100644
index de0a6371b84..00000000000
--- a/src/gallium/drivers/i965simple/brw_state.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-    
-
-#ifndef BRW_STATE_H
-#define BRW_STATE_H
-
-#include "brw_context.h"
-#include "brw_winsys.h"
-
-
-const struct brw_tracked_state brw_blend_constant_color;
-const struct brw_tracked_state brw_cc_unit;
-const struct brw_tracked_state brw_cc_vp;
-const struct brw_tracked_state brw_clip_prog;
-const struct brw_tracked_state brw_clip_unit;
-const struct brw_tracked_state brw_constant_buffer_state;
-const struct brw_tracked_state brw_constant_buffer;
-const struct brw_tracked_state brw_curbe_offsets;
-const struct brw_tracked_state brw_invarient_state;
-const struct brw_tracked_state brw_gs_prog;
-const struct brw_tracked_state brw_gs_unit;
-const struct brw_tracked_state brw_drawing_rect;
-const struct brw_tracked_state brw_line_stipple;
-const struct brw_tracked_state brw_pipelined_state_pointers;
-const struct brw_tracked_state brw_binding_table_pointers;
-const struct brw_tracked_state brw_depthbuffer;
-const struct brw_tracked_state brw_polygon_stipple_offset;
-const struct brw_tracked_state brw_polygon_stipple;
-const struct brw_tracked_state brw_program_parameters;
-const struct brw_tracked_state brw_recalculate_urb_fence;
-const struct brw_tracked_state brw_sf_prog;
-const struct brw_tracked_state brw_sf_unit;
-const struct brw_tracked_state brw_sf_vp;
-const struct brw_tracked_state brw_state_base_address;
-const struct brw_tracked_state brw_urb_fence;
-const struct brw_tracked_state brw_vertex_state;
-const struct brw_tracked_state brw_vs_prog;
-const struct brw_tracked_state brw_vs_unit;
-const struct brw_tracked_state brw_wm_prog;
-const struct brw_tracked_state brw_wm_samplers;
-const struct brw_tracked_state brw_wm_surfaces;
-const struct brw_tracked_state brw_wm_unit;
-
-const struct brw_tracked_state brw_psp_urb_cbs;
-
-const struct brw_tracked_state brw_active_vertprog;
-const struct brw_tracked_state brw_tnl_vertprog;
-const struct brw_tracked_state brw_pipe_control;
-
-const struct brw_tracked_state brw_clear_surface_cache;
-const struct brw_tracked_state brw_clear_batch_cache;
-
-/***********************************************************************
- * brw_state_cache.c
- */
-unsigned brw_cache_data(struct brw_cache *cache,
-		      const void *data );
-
-unsigned brw_cache_data_sz(struct brw_cache *cache,
-			 const void *data,
-			 unsigned data_sz);
-
-unsigned brw_upload_cache( struct brw_cache *cache,
-			 const void *key,
-			 unsigned key_sz,
-			 const void *data,
-			 unsigned data_sz,
-			 const void *aux,
-			 void *aux_return );
-
-boolean brw_search_cache( struct brw_cache *cache,
-			    const void *key,
-			    unsigned key_size,
-			    void *aux_return,
-			    unsigned *offset_return);
-
-void brw_init_caches( struct brw_context *brw );
-void brw_destroy_caches( struct brw_context *brw );
-
-static inline struct pipe_buffer *brw_cache_buffer(struct brw_context *brw,
-                                                          enum brw_cache_id id)
-{
-   return brw->cache[id].pool->buffer;
-}
-
-/***********************************************************************
- * brw_state_batch.c
- */
-#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
-
-boolean brw_cached_batch_struct( struct brw_context *brw,
-				   const void *data,
-				   unsigned sz );
-
-void brw_destroy_batch_cache( struct brw_context *brw );
-
-
-/***********************************************************************
- * brw_state_pool.c
- */
-void brw_init_pools( struct brw_context *brw );
-void brw_destroy_pools( struct brw_context *brw );
-
-boolean brw_pool_alloc( struct brw_mem_pool *pool,
-			  unsigned size,
-			  unsigned alignment,
-			  unsigned *offset_return);
-
-void brw_pool_fence( struct brw_context *brw,
-		     struct brw_mem_pool *pool,
-		     unsigned fence );
-
-
-void brw_pool_check_wrap( struct brw_context *brw,
-			  struct brw_mem_pool *pool );
-
-void brw_clear_all_caches( struct brw_context *brw );
-void brw_invalidate_pools( struct brw_context *brw );
-void brw_clear_batch_cache_flush( struct brw_context *brw );
-
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_state_batch.c b/src/gallium/drivers/i965simple/brw_state_batch.c
deleted file mode 100644
index 43a1c89fc40..00000000000
--- a/src/gallium/drivers/i965simple/brw_state_batch.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_state.h"
-#include "brw_winsys.h"
-
-#include "util/u_memory.h"
-
-/* A facility similar to the data caching code above, which aims to
- * prevent identical commands being issued repeatedly.
- */
-boolean brw_cached_batch_struct( struct brw_context *brw,
-                                 const void *data,
-                                 unsigned sz )
-{
-   struct brw_cached_batch_item *item = brw->cached_batch_items;
-   struct header *newheader = (struct header *)data;
-
-   if (brw->emit_state_always) {
-      brw_batchbuffer_data(brw->winsys, data, sz);
-      return TRUE;
-   }
-
-   while (item) {
-      if (item->header->opcode == newheader->opcode) {
-	 if (item->sz == sz && memcmp(item->header, newheader, sz) == 0)
-	    return FALSE;
-	 if (item->sz != sz) {
-	    FREE(item->header);
-	    item->header = MALLOC(sz);
-	    item->sz = sz;
-	 }
-	 goto emit;
-      }
-      item = item->next;
-   }
-
-   assert(!item);
-   item = CALLOC_STRUCT(brw_cached_batch_item);
-   item->header = MALLOC(sz);
-   item->sz = sz;
-   item->next = brw->cached_batch_items;
-   brw->cached_batch_items = item;
-
-emit:
-   memcpy(item->header, newheader, sz);
-   brw_batchbuffer_data(brw->winsys, data, sz);
-   return TRUE;
-}
-
-static void clear_batch_cache( struct brw_context *brw )
-{
-   struct brw_cached_batch_item *item = brw->cached_batch_items;
-
-   while (item) {
-      struct brw_cached_batch_item *next = item->next;
-      free((void *)item->header);
-      free(item);
-      item = next;
-   }
-
-   brw->cached_batch_items = NULL;
-
-
-   brw_clear_all_caches(brw);
-
-   brw_invalidate_pools(brw);
-}
-
-void brw_clear_batch_cache_flush( struct brw_context *brw )
-{
-   clear_batch_cache(brw);
-
-/*    brw_do_flush(brw, BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE); */
-
-   brw->state.dirty.brw |= ~0;
-   brw->state.dirty.cache |= ~0;
-}
-
-
-
-void brw_destroy_batch_cache( struct brw_context *brw )
-{
-   clear_batch_cache(brw);
-}
diff --git a/src/gallium/drivers/i965simple/brw_state_cache.c b/src/gallium/drivers/i965simple/brw_state_cache.c
deleted file mode 100644
index 094248fa691..00000000000
--- a/src/gallium/drivers/i965simple/brw_state_cache.c
+++ /dev/null
@@ -1,443 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_state.h"
-
-#include "brw_wm.h"
-#include "brw_vs.h"
-#include "brw_clip.h"
-#include "brw_sf.h"
-#include "brw_gs.h"
-
-#include "util/u_memory.h"
-
-
-
-/***********************************************************************
- * Check cache for uploaded version of struct, else upload new one.
- * Fail when memory is exhausted.
- *
- * XXX: FIXME: Currently search is so slow it would be quicker to
- * regenerate the data every time...
- */
-
-static unsigned hash_key( const void *key, unsigned key_size )
-{
-   unsigned *ikey = (unsigned *)key;
-   unsigned hash = 0, i;
-
-   assert(key_size % 4 == 0);
-
-   /* I'm sure this can be improved on:
-    */
-   for (i = 0; i < key_size/4; i++)
-      hash ^= ikey[i];
-
-   return hash;
-}
-
-static struct brw_cache_item *search_cache( struct brw_cache *cache,
-					     unsigned hash,
-					     const void *key,
-					     unsigned key_size)
-{
-   struct brw_cache_item *c;
-
-   for (c = cache->items[hash % cache->size]; c; c = c->next) {
-      if (c->hash == hash &&
-	  c->key_size == key_size &&
-	  memcmp(c->key, key, key_size) == 0)
-	 return c;
-   }
-
-   return NULL;
-}
-
-
-static void rehash( struct brw_cache *cache )
-{
-   struct brw_cache_item **items;
-   struct brw_cache_item *c, *next;
-   unsigned size, i;
-
-   size = cache->size * 3;
-   items = (struct brw_cache_item**) MALLOC(size * sizeof(*items));
-   memset(items, 0, size * sizeof(*items));
-
-   for (i = 0; i < cache->size; i++)
-      for (c = cache->items[i]; c; c = next) {
-	 next = c->next;
-	 c->next = items[c->hash % size];
-	 items[c->hash % size] = c;
-      }
-
-   FREE(cache->items);
-   cache->items = items;
-   cache->size = size;
-}
-
-
-boolean brw_search_cache( struct brw_cache *cache,
-			    const void *key,
-			    unsigned key_size,
-			    void *aux_return,
-			    unsigned *offset_return)
-{
-   struct brw_cache_item *item;
-   unsigned addr = 0;
-   unsigned hash = hash_key(key, key_size);
-
-   item = search_cache(cache, hash, key, key_size);
-
-   if (item) {
-      if (aux_return)
-	 *(void **)aux_return = (void *)((char *)item->key + item->key_size);
-
-      *offset_return = addr = item->offset;
-   }
-
-   if (item == NULL || addr != cache->last_addr) {
-      cache->brw->state.dirty.cache |= 1<<cache->id;
-      cache->last_addr = addr;
-   }
-
-   return item != NULL;
-}
-
-unsigned brw_upload_cache( struct brw_cache *cache,
-			 const void *key,
-			 unsigned key_size,
-			 const void *data,
-			 unsigned data_size,
-			 const void *aux,
-			 void *aux_return )
-{
-   unsigned offset;
-   struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
-   unsigned hash = hash_key(key, key_size);
-   void *tmp = MALLOC(key_size + cache->aux_size);
-
-   if (!brw_pool_alloc(cache->pool, data_size, 1 << 6, &offset)) {
-      /* Should not be possible:
-       */
-      debug_printf("brw_pool_alloc failed\n");
-      exit(1);
-   }
-
-   memcpy(tmp, key, key_size);
-
-   if (cache->aux_size)
-      memcpy(tmp+key_size, aux, cache->aux_size);
-
-   item->key = tmp;
-   item->hash = hash;
-   item->key_size = key_size;
-   item->offset = offset;
-   item->data_size = data_size;
-
-   if (++cache->n_items > cache->size * 1.5)
-      rehash(cache);
-
-   hash %= cache->size;
-   item->next = cache->items[hash];
-   cache->items[hash] = item;
-
-   if (aux_return) {
-      assert(cache->aux_size);
-      *(void **)aux_return = (void *)((char *)item->key + item->key_size);
-   }
-
-   if (BRW_DEBUG & DEBUG_STATE)
-      debug_printf("upload %s: %d bytes to pool buffer %p offset %x\n",
-             cache->name, 
-	     data_size,
-             (void*)cache->pool->buffer,
-             offset);
-
-   /* Copy data to the buffer:
-    */
-   cache->brw->winsys->buffer_subdata_typed(cache->brw->winsys,
-					    cache->pool->buffer, 
-					    offset, 
-					    data_size, 
-					    data,
-					    cache->id);
-
-   cache->brw->state.dirty.cache |= 1<<cache->id;
-   cache->last_addr = offset;
-
-   return offset;
-}
-
-/* This doesn't really work with aux data.  Use search/upload instead
- */
-unsigned brw_cache_data_sz(struct brw_cache *cache,
-			 const void *data,
-			 unsigned data_size)
-{
-   unsigned addr;
-
-   if (!brw_search_cache(cache, data, data_size, NULL, &addr)) {
-      addr = brw_upload_cache(cache,
-			      data, data_size,
-			      data, data_size,
-			      NULL, NULL);
-   }
-
-   return addr;
-}
-
-unsigned brw_cache_data(struct brw_cache *cache,
-		      const void *data)
-{
-   return brw_cache_data_sz(cache, data, cache->key_size);
-}
-
-enum pool_type {
-   DW_SURFACE_STATE,
-   DW_GENERAL_STATE
-};
-
-static void brw_init_cache( struct brw_context *brw,
-			    const char *name,
-			    unsigned id,
-			    unsigned key_size,
-			    unsigned aux_size,
-			    enum pool_type pool_type)
-{
-   struct brw_cache *cache = &brw->cache[id];
-   cache->brw = brw;
-   cache->id = id;
-   cache->name = name;
-   cache->items = NULL;
-
-   cache->size = 7;
-   cache->n_items = 0;
-   cache->items = (struct brw_cache_item **)
-      CALLOC(cache->size, sizeof(struct brw_cache_item));
-
-
-   cache->key_size = key_size;
-   cache->aux_size = aux_size;
-   switch (pool_type) {
-   case DW_GENERAL_STATE: cache->pool = &brw->pool[BRW_GS_POOL]; break;
-   case DW_SURFACE_STATE: cache->pool = &brw->pool[BRW_SS_POOL]; break;
-   default: assert(0); break;
-   }
-}
-
-void brw_init_caches( struct brw_context *brw )
-{
-
-   brw_init_cache(brw,
-		  "CC_VP",
-		  BRW_CC_VP,
-		  sizeof(struct brw_cc_viewport),
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "CC_UNIT",
-		  BRW_CC_UNIT,
-		  sizeof(struct brw_cc_unit_state),
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "WM_PROG",
-		  BRW_WM_PROG,
-		  sizeof(struct brw_wm_prog_key),
-		  sizeof(struct brw_wm_prog_data),
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "SAMPLER_DEFAULT_COLOR",
-		  BRW_SAMPLER_DEFAULT_COLOR,
-		  sizeof(struct brw_sampler_default_color),
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "SAMPLER",
-		  BRW_SAMPLER,
-		  0,		/* variable key/data size */
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "WM_UNIT",
-		  BRW_WM_UNIT,
-		  sizeof(struct brw_wm_unit_state),
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "SF_PROG",
-		  BRW_SF_PROG,
-		  sizeof(struct brw_sf_prog_key),
-		  sizeof(struct brw_sf_prog_data),
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "SF_VP",
-		  BRW_SF_VP,
-		  sizeof(struct brw_sf_viewport),
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "SF_UNIT",
-		  BRW_SF_UNIT,
-		  sizeof(struct brw_sf_unit_state),
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "VS_UNIT",
-		  BRW_VS_UNIT,
-		  sizeof(struct brw_vs_unit_state),
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "VS_PROG",
-		  BRW_VS_PROG,
-		  sizeof(struct brw_vs_prog_key),
-		  sizeof(struct brw_vs_prog_data),
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "CLIP_UNIT",
-		  BRW_CLIP_UNIT,
-		  sizeof(struct brw_clip_unit_state),
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "CLIP_PROG",
-		  BRW_CLIP_PROG,
-		  sizeof(struct brw_clip_prog_key),
-		  sizeof(struct brw_clip_prog_data),
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "GS_UNIT",
-		  BRW_GS_UNIT,
-		  sizeof(struct brw_gs_unit_state),
-		  0,
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "GS_PROG",
-		  BRW_GS_PROG,
-		  sizeof(struct brw_gs_prog_key),
-		  sizeof(struct brw_gs_prog_data),
-		  DW_GENERAL_STATE);
-
-   brw_init_cache(brw,
-		  "SS_SURFACE",
-		  BRW_SS_SURFACE,
-		  sizeof(struct brw_surface_state),
-		  0,
-		  DW_SURFACE_STATE);
-
-   brw_init_cache(brw,
-		  "SS_SURF_BIND",
-		  BRW_SS_SURF_BIND,
-		  sizeof(struct brw_surface_binding_table),
-		  0,
-		  DW_SURFACE_STATE);
-}
-
-
-/* When we lose hardware context, need to invalidate the surface cache
- * as these structs must be explicitly re-uploaded.  They are subject
- * to fixup by the memory manager as they contain absolute agp
- * offsets, so we need to ensure there is a fresh version of the
- * struct available to receive the fixup.
- *
- * XXX: Need to ensure that there aren't two versions of a surface or
- * bufferobj with different backing data active in the same buffer at
- * once?  Otherwise the cache could confuse them.  Maybe better not to
- * cache at all?
- *
- * --> Isn't this the same as saying need to ensure batch is flushed
- *         before new data is uploaded to an existing buffer?  We
- *         already try to make sure of that.
- */
-static void clear_cache( struct brw_cache *cache )
-{
-   struct brw_cache_item *c, *next;
-   unsigned i;
-
-   for (i = 0; i < cache->size; i++) {
-      for (c = cache->items[i]; c; c = next) {
-	 next = c->next;
-	 free((void *)c->key);
-	 free(c);
-      }
-      cache->items[i] = NULL;
-   }
-
-   cache->n_items = 0;
-}
-
-void brw_clear_all_caches( struct brw_context *brw )
-{
-   int i;
-
-   if (BRW_DEBUG & DEBUG_STATE)
-      debug_printf("%s\n", __FUNCTION__);
-
-   for (i = 0; i < BRW_MAX_CACHE; i++)
-      clear_cache(&brw->cache[i]);
-
-   if (brw->curbe.last_buf) {
-      FREE(brw->curbe.last_buf);
-      brw->curbe.last_buf = NULL;
-   }
-
-   brw->state.dirty.brw |= ~0;
-   brw->state.dirty.cache |= ~0;
-}
-
-
-
-
-
-void brw_destroy_caches( struct brw_context *brw )
-{
-   unsigned i;
-
-   for (i = 0; i < BRW_MAX_CACHE; i++)
-      clear_cache(&brw->cache[i]);
-}
diff --git a/src/gallium/drivers/i965simple/brw_state_pool.c b/src/gallium/drivers/i965simple/brw_state_pool.c
deleted file mode 100644
index e91263cb1ff..00000000000
--- a/src/gallium/drivers/i965simple/brw_state_pool.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-/** @file brw_state_pool.c
- * Implements the state pool allocator.
- *
- * For the 965, we create two state pools for state cache entries.  Objects
- * will be allocated into the pools depending on which state base address
- * their pointer is relative to in other 965 state.
- *
- * The state pools are relatively simple: As objects are allocated, increment
- * the offset to allocate space.  When the pool is "full" (rather, close to
- * full), we reset the pool and reset the state cache entries that point into
- * the pool.
- */
-
-#include "pipe/internal/p_winsys_screen.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "pipe/p_inlines.h"
-#include "brw_context.h"
-#include "brw_state.h"
-
-boolean brw_pool_alloc( struct brw_mem_pool *pool,
-			  unsigned size,
-			  unsigned alignment,
-			  unsigned *offset_return)
-{
-   unsigned fixup = align(pool->offset, alignment) - pool->offset;
-
-   size = align(size, 4);
-
-   if (pool->offset + fixup + size >= pool->size) {
-      debug_printf("%s failed\n", __FUNCTION__);
-      assert(0);
-      exit(0);
-   }
-
-   pool->offset += fixup;
-   *offset_return = pool->offset;
-   pool->offset += size;
-
-   return TRUE;
-}
-
-static
-void brw_invalidate_pool( struct brw_mem_pool *pool )
-{
-   if (BRW_DEBUG & DEBUG_STATE)
-      debug_printf("\n\n\n %s \n\n\n", __FUNCTION__);
-
-   pool->offset = 0;
-
-   brw_clear_all_caches(pool->brw);
-}
-
-
-static void brw_init_pool( struct brw_context *brw,
-			   unsigned pool_id,
-			   unsigned size )
-{
-   struct brw_mem_pool *pool = &brw->pool[pool_id];
-
-   pool->size = size;
-   pool->brw = brw;
-
-   pool->buffer = pipe_buffer_create(brw->pipe.screen,
-                                     4096,
-                                     0 /*  DRM_BO_FLAG_MEM_TT */,
-                                     size);
-}
-
-static void brw_destroy_pool( struct brw_context *brw,
-			      unsigned pool_id )
-{
-   struct brw_mem_pool *pool = &brw->pool[pool_id];
-
-   pipe_buffer_reference( pool->brw->pipe.screen,
-			  &pool->buffer,
-			  NULL );
-}
-
-
-void brw_pool_check_wrap( struct brw_context *brw,
-			  struct brw_mem_pool *pool )
-{
-   if (pool->offset > (pool->size * 3) / 4) {
-      brw->state.dirty.brw |= BRW_NEW_SCENE;
-   }
-
-}
-
-void brw_init_pools( struct brw_context *brw )
-{
-   brw_init_pool(brw, BRW_GS_POOL, 0x80000);
-   brw_init_pool(brw, BRW_SS_POOL, 0x80000);
-}
-
-void brw_destroy_pools( struct brw_context *brw )
-{
-   brw_destroy_pool(brw, BRW_GS_POOL);
-   brw_destroy_pool(brw, BRW_SS_POOL);
-}
-
-
-void brw_invalidate_pools( struct brw_context *brw )
-{
-   brw_invalidate_pool(&brw->pool[BRW_GS_POOL]);
-   brw_invalidate_pool(&brw->pool[BRW_SS_POOL]);
-}
diff --git a/src/gallium/drivers/i965simple/brw_state_upload.c b/src/gallium/drivers/i965simple/brw_state_upload.c
deleted file mode 100644
index bac9161b5f1..00000000000
--- a/src/gallium/drivers/i965simple/brw_state_upload.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_state.h"
-
-#include "util/u_memory.h"
-
-/* This is used to initialize brw->state.atoms[].  We could use this
- * list directly except for a single atom, brw_constant_buffer, which
- * has a .dirty value which changes according to the parameters of the
- * current fragment and vertex programs, and so cannot be a static
- * value.
- */
-const struct brw_tracked_state *atoms[] =
-{
-   &brw_vs_prog,
-   &brw_gs_prog,
-   &brw_clip_prog,
-   &brw_sf_prog,
-   &brw_wm_prog,
-
-   /* Once all the programs are done, we know how large urb entry
-    * sizes need to be and can decide if we need to change the urb
-    * layout.
-    */
-   &brw_curbe_offsets,
-   &brw_recalculate_urb_fence,
-
-
-   &brw_cc_vp,
-   &brw_cc_unit,
-
-   &brw_wm_surfaces,		/* must do before samplers */
-   &brw_wm_samplers,
-
-   &brw_wm_unit,
-   &brw_sf_vp,
-   &brw_sf_unit,
-   &brw_vs_unit,		/* always required, enabled or not */
-   &brw_clip_unit,
-   &brw_gs_unit,
-
-   /* Command packets:
-    */
-   &brw_invarient_state,
-   &brw_state_base_address,
-   &brw_pipe_control,
-
-   &brw_binding_table_pointers,
-   &brw_blend_constant_color,
-
-   &brw_drawing_rect,
-   &brw_depthbuffer,
-
-   &brw_polygon_stipple,
-   &brw_line_stipple,
-
-   &brw_psp_urb_cbs,
-
-   &brw_constant_buffer
-};
-
-
-void brw_init_state( struct brw_context *brw )
-{
-   brw_init_pools(brw);
-   brw_init_caches(brw);
-
-   brw->state.dirty.brw = ~0;
-   brw->emit_state_always = 0;
-}
-
-
-void brw_destroy_state( struct brw_context *brw )
-{
-   brw_destroy_caches(brw);
-   brw_destroy_batch_cache(brw);
-   brw_destroy_pools(brw);
-}
-
-/***********************************************************************
- */
-
-static boolean check_state( const struct brw_state_flags *a,
-			      const struct brw_state_flags *b )
-{
-   return ((a->brw & b->brw) ||
-	   (a->cache & b->cache));
-}
-
-static void accumulate_state( struct brw_state_flags *a,
-			      const struct brw_state_flags *b )
-{
-   a->brw |= b->brw;
-   a->cache |= b->cache;
-}
-
-
-static void xor_states( struct brw_state_flags *result,
-			     const struct brw_state_flags *a,
-			      const struct brw_state_flags *b )
-{
-   result->brw = a->brw ^ b->brw;
-   result->cache = a->cache ^ b->cache;
-}
-
-
-/***********************************************************************
- * Emit all state:
- */
-void brw_validate_state( struct brw_context *brw )
-{
-   struct brw_state_flags *state = &brw->state.dirty;
-   unsigned i;
-
-   if (brw->emit_state_always) 
-      state->brw |= ~0;
-
-   if (state->cache == 0 &&
-       state->brw == 0)
-      return;
-
-   if (brw->state.dirty.brw & BRW_NEW_SCENE)
-      brw_clear_batch_cache_flush(brw);
-
-   if (BRW_DEBUG) {
-      /* Debug version which enforces various sanity checks on the
-       * state flags which are generated and checked to help ensure
-       * state atoms are ordered correctly in the list.
-       */
-      struct brw_state_flags examined, prev;
-      memset(&examined, 0, sizeof(examined));
-      prev = *state;
-
-      for (i = 0; i < Elements(atoms); i++) {
-	 const struct brw_tracked_state *atom = atoms[i];
-	 struct brw_state_flags generated;
-
-	 assert(atom->dirty.brw ||
-		atom->dirty.cache);
-	 assert(atom->update);
-
-	 if (check_state(state, &atom->dirty)) {
-	    atom->update( brw );
-	 }
-
-	 accumulate_state(&examined, &atom->dirty);
-
-	 /* generated = (prev ^ state)
-	  * if (examined & generated)
-	  *     fail;
-	  */
-	 xor_states(&generated, &prev, state);
-	 assert(!check_state(&examined, &generated));
-	 prev = *state;
-      }
-   }
-   else {
-      for (i = 0; i < Elements(atoms); i++) {
-	 const struct brw_tracked_state *atom = atoms[i];
-
-	 assert(atom->dirty.brw ||
-		atom->dirty.cache);
-	 assert(atom->update);
-
-	 if (check_state(state, &atom->dirty))
-	    atom->update( brw );
-      }
-   }
-
-   memset(state, 0, sizeof(*state));
-}
diff --git a/src/gallium/drivers/i965simple/brw_structs.h b/src/gallium/drivers/i965simple/brw_structs.h
deleted file mode 100644
index bbb087e95d6..00000000000
--- a/src/gallium/drivers/i965simple/brw_structs.h
+++ /dev/null
@@ -1,1348 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#ifndef BRW_STRUCTS_H
-#define BRW_STRUCTS_H
-
-#include "pipe/p_compiler.h"
-
-/* Command packets:
- */
-struct header
-{
-   unsigned length:16;
-   unsigned opcode:16;
-};
-
-
-union header_union
-{
-   struct header bits;
-   unsigned dword;
-};
-
-struct brw_3d_control
-{
-   struct
-   {
-      unsigned length:8;
-      unsigned notify_enable:1;
-      unsigned pad:3;
-      unsigned wc_flush_enable:1;
-      unsigned depth_stall_enable:1;
-      unsigned operation:2;
-      unsigned opcode:16;
-   } header;
-
-   struct
-   {
-      unsigned pad:2;
-      unsigned dest_addr_type:1;
-      unsigned dest_addr:29;
-   } dest;
-
-   unsigned dword2;
-   unsigned dword3;
-};
-
-
-struct brw_3d_primitive
-{
-   struct
-   {
-      unsigned length:8;
-      unsigned pad:2;
-      unsigned topology:5;
-      unsigned indexed:1;
-      unsigned opcode:16;
-   } header;
-
-   unsigned verts_per_instance;
-   unsigned start_vert_location;
-   unsigned instance_count;
-   unsigned start_instance_location;
-   unsigned base_vert_location;
-};
-
-/* These seem to be passed around as function args, so it works out
- * better to keep them as #defines:
- */
-#define BRW_FLUSH_READ_CACHE           0x1
-#define BRW_FLUSH_STATE_CACHE          0x2
-#define BRW_INHIBIT_FLUSH_RENDER_CACHE 0x4
-#define BRW_FLUSH_SNAPSHOT_COUNTERS    0x8
-
-struct brw_mi_flush
-{
-   unsigned flags:4;
-   unsigned pad:12;
-   unsigned opcode:16;
-};
-
-struct brw_vf_statistics
-{
-   unsigned statistics_enable:1;
-   unsigned pad:15;
-   unsigned opcode:16;
-};
-
-
-
-struct brw_binding_table_pointers
-{
-   struct header header;
-   unsigned vs;
-   unsigned gs;
-   unsigned clp;
-   unsigned sf;
-   unsigned wm;
-};
-
-
-struct brw_blend_constant_color
-{
-   struct header header;
-   float blend_constant_color[4];
-};
-
-
-struct brw_depthbuffer
-{
-   union header_union header;
-
-   union {
-      struct {
-	 unsigned pitch:18;
-	 unsigned format:3;
-	 unsigned pad:4;
-	 unsigned depth_offset_disable:1;
-	 unsigned tile_walk:1;
-	 unsigned tiled_surface:1;
-	 unsigned pad2:1;
-	 unsigned surface_type:3;
-      } bits;
-      unsigned dword;
-   } dword1;
-
-   unsigned dword2_base_addr;
-
-   union {
-      struct {
-	 unsigned pad:1;
-	 unsigned mipmap_layout:1;
-	 unsigned lod:4;
-	 unsigned width:13;
-	 unsigned height:13;
-      } bits;
-      unsigned dword;
-   } dword3;
-
-   union {
-      struct {
-	 unsigned pad:12;
-	 unsigned min_array_element:9;
-	 unsigned depth:11;
-      } bits;
-      unsigned dword;
-   } dword4;
-};
-
-struct brw_drawrect
-{
-   struct header header;
-   unsigned xmin:16;
-   unsigned ymin:16;
-   unsigned xmax:16;
-   unsigned ymax:16;
-   unsigned xorg:16;
-   unsigned yorg:16;
-};
-
-
-
-
-struct brw_global_depth_offset_clamp
-{
-   struct header header;
-   float depth_offset_clamp;
-};
-
-struct brw_indexbuffer
-{
-   union {
-      struct
-      {
-	 unsigned length:8;
-	 unsigned index_format:2;
-	 unsigned cut_index_enable:1;
-	 unsigned pad:5;
-	 unsigned opcode:16;
-      } bits;
-      unsigned dword;
-
-   } header;
-
-   unsigned buffer_start;
-   unsigned buffer_end;
-};
-
-
-struct brw_line_stipple
-{
-   struct header header;
-
-   struct
-   {
-      unsigned pattern:16;
-      unsigned pad:16;
-   } bits0;
-
-   struct
-   {
-      unsigned repeat_count:9;
-      unsigned pad:7;
-      unsigned inverse_repeat_count:16;
-   } bits1;
-};
-
-
-struct brw_pipelined_state_pointers
-{
-   struct header header;
-
-   struct {
-      unsigned pad:5;
-      unsigned offset:27;
-   } vs;
-
-   struct
-   {
-      unsigned enable:1;
-      unsigned pad:4;
-      unsigned offset:27;
-   } gs;
-
-   struct
-   {
-      unsigned enable:1;
-      unsigned pad:4;
-      unsigned offset:27;
-   } clp;
-
-   struct
-   {
-      unsigned pad:5;
-      unsigned offset:27;
-   } sf;
-
-   struct
-   {
-      unsigned pad:5;
-      unsigned offset:27;
-   } wm;
-
-   struct
-   {
-      unsigned pad:5;
-      unsigned offset:27; /* KW: check me! */
-   } cc;
-};
-
-
-struct brw_polygon_stipple_offset
-{
-   struct header header;
-
-   struct {
-      unsigned y_offset:5;
-      unsigned pad:3;
-      unsigned x_offset:5;
-      unsigned pad0:19;
-   } bits0;
-};
-
-
-
-struct brw_polygon_stipple
-{
-   struct header header;
-   unsigned stipple[32];
-};
-
-
-
-struct brw_pipeline_select
-{
-   struct
-   {
-      unsigned pipeline_select:1;
-      unsigned pad:15;
-      unsigned opcode:16;
-   } header;
-};
-
-
-struct brw_pipe_control
-{
-   struct
-   {
-      unsigned length:8;
-      unsigned notify_enable:1;
-      unsigned pad:2;
-      unsigned instruction_state_cache_flush_enable:1;
-      unsigned write_cache_flush_enable:1;
-      unsigned depth_stall_enable:1;
-      unsigned post_sync_operation:2;
-
-      unsigned opcode:16;
-   } header;
-
-   struct
-   {
-      unsigned pad:2;
-      unsigned dest_addr_type:1;
-      unsigned dest_addr:29;
-   } bits1;
-
-   unsigned data0;
-   unsigned data1;
-};
-
-
-struct brw_urb_fence
-{
-   struct
-   {
-      unsigned length:8;
-      unsigned vs_realloc:1;
-      unsigned gs_realloc:1;
-      unsigned clp_realloc:1;
-      unsigned sf_realloc:1;
-      unsigned vfe_realloc:1;
-      unsigned cs_realloc:1;
-      unsigned pad:2;
-      unsigned opcode:16;
-   } header;
-
-   struct
-   {
-      unsigned vs_fence:10;
-      unsigned gs_fence:10;
-      unsigned clp_fence:10;
-      unsigned pad:2;
-   } bits0;
-
-   struct
-   {
-      unsigned sf_fence:10;
-      unsigned vf_fence:10;
-      unsigned cs_fence:10;
-      unsigned pad:2;
-   } bits1;
-};
-
-struct brw_constant_buffer_state /* previously brw_command_streamer */
-{
-   struct header header;
-
-   struct
-   {
-      unsigned nr_urb_entries:3;
-      unsigned pad:1;
-      unsigned urb_entry_size:5;
-      unsigned pad0:23;
-   } bits0;
-};
-
-struct brw_constant_buffer
-{
-   struct
-   {
-      unsigned length:8;
-      unsigned valid:1;
-      unsigned pad:7;
-      unsigned opcode:16;
-   } header;
-
-   struct
-   {
-      unsigned buffer_length:6;
-      unsigned buffer_address:26;
-   } bits0;
-};
-
-struct brw_state_base_address
-{
-   struct header header;
-
-   struct
-   {
-      unsigned modify_enable:1;
-      unsigned pad:4;
-      unsigned general_state_address:27;
-   } bits0;
-
-   struct
-   {
-      unsigned modify_enable:1;
-      unsigned pad:4;
-      unsigned surface_state_address:27;
-   } bits1;
-
-   struct
-   {
-      unsigned modify_enable:1;
-      unsigned pad:4;
-      unsigned indirect_object_state_address:27;
-   } bits2;
-
-   struct
-   {
-      unsigned modify_enable:1;
-      unsigned pad:11;
-      unsigned general_state_upper_bound:20;
-   } bits3;
-
-   struct
-   {
-      unsigned modify_enable:1;
-      unsigned pad:11;
-      unsigned indirect_object_state_upper_bound:20;
-   } bits4;
-};
-
-struct brw_state_prefetch
-{
-   struct header header;
-
-   struct
-   {
-      unsigned prefetch_count:3;
-      unsigned pad:3;
-      unsigned prefetch_pointer:26;
-   } bits0;
-};
-
-struct brw_system_instruction_pointer
-{
-   struct header header;
-
-   struct
-   {
-      unsigned pad:4;
-      unsigned system_instruction_pointer:28;
-   } bits0;
-};
-
-
-
-
-/* State structs for the various fixed function units:
- */
-
-
-struct thread0
-{
-   unsigned pad0:1;
-   unsigned grf_reg_count:3;
-   unsigned pad1:2;
-   unsigned kernel_start_pointer:26;
-};
-
-struct thread1
-{
-   unsigned ext_halt_exception_enable:1;
-   unsigned sw_exception_enable:1;
-   unsigned mask_stack_exception_enable:1;
-   unsigned timeout_exception_enable:1;
-   unsigned illegal_op_exception_enable:1;
-   unsigned pad0:3;
-   unsigned depth_coef_urb_read_offset:6;	/* WM only */
-   unsigned pad1:2;
-   unsigned floating_point_mode:1;
-   unsigned thread_priority:1;
-   unsigned binding_table_entry_count:8;
-   unsigned pad3:5;
-   unsigned single_program_flow:1;
-};
-
-struct thread2
-{
-   unsigned per_thread_scratch_space:4;
-   unsigned pad0:6;
-   unsigned scratch_space_base_pointer:22;
-};
-
-
-struct thread3
-{
-   unsigned dispatch_grf_start_reg:4;
-   unsigned urb_entry_read_offset:6;
-   unsigned pad0:1;
-   unsigned urb_entry_read_length:6;
-   unsigned pad1:1;
-   unsigned const_urb_entry_read_offset:6;
-   unsigned pad2:1;
-   unsigned const_urb_entry_read_length:6;
-   unsigned pad3:1;
-};
-
-
-
-struct brw_clip_unit_state
-{
-   struct thread0 thread0;
-   struct
-   {
-      unsigned pad0:7;
-      unsigned sw_exception_enable:1;
-      unsigned pad1:3;
-      unsigned mask_stack_exception_enable:1;
-      unsigned pad2:1;
-      unsigned illegal_op_exception_enable:1;
-      unsigned pad3:2;
-      unsigned floating_point_mode:1;
-      unsigned thread_priority:1;
-      unsigned binding_table_entry_count:8;
-      unsigned pad4:5;
-      unsigned single_program_flow:1;
-   } thread1;
-
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct
-   {
-      unsigned pad0:9;
-      unsigned gs_output_stats:1; /* not always */
-      unsigned stats_enable:1;
-      unsigned nr_urb_entries:7;
-      unsigned pad1:1;
-      unsigned urb_entry_allocation_size:5;
-      unsigned pad2:1;
-      unsigned max_threads:1; 	/* may be less */
-      unsigned pad3:6;
-   } thread4;
-
-   struct
-   {
-      unsigned pad0:13;
-      unsigned clip_mode:3;
-      unsigned userclip_enable_flags:8;
-      unsigned userclip_must_clip:1;
-      unsigned pad1:1;
-      unsigned guard_band_enable:1;
-      unsigned viewport_z_clip_enable:1;
-      unsigned viewport_xy_clip_enable:1;
-      unsigned vertex_position_space:1;
-      unsigned api_mode:1;
-      unsigned pad2:1;
-   } clip5;
-
-   struct
-   {
-      unsigned pad0:5;
-      unsigned clipper_viewport_state_ptr:27;
-   } clip6;
-
-
-   float viewport_xmin;
-   float viewport_xmax;
-   float viewport_ymin;
-   float viewport_ymax;
-};
-
-
-
-struct brw_cc_unit_state
-{
-   struct
-   {
-      unsigned pad0:3;
-      unsigned bf_stencil_pass_depth_pass_op:3;
-      unsigned bf_stencil_pass_depth_fail_op:3;
-      unsigned bf_stencil_fail_op:3;
-      unsigned bf_stencil_func:3;
-      unsigned bf_stencil_enable:1;
-      unsigned pad1:2;
-      unsigned stencil_write_enable:1;
-      unsigned stencil_pass_depth_pass_op:3;
-      unsigned stencil_pass_depth_fail_op:3;
-      unsigned stencil_fail_op:3;
-      unsigned stencil_func:3;
-      unsigned stencil_enable:1;
-   } cc0;
-
-
-   struct
-   {
-      unsigned bf_stencil_ref:8;
-      unsigned stencil_write_mask:8;
-      unsigned stencil_test_mask:8;
-      unsigned stencil_ref:8;
-   } cc1;
-
-
-   struct
-   {
-      unsigned logicop_enable:1;
-      unsigned pad0:10;
-      unsigned depth_write_enable:1;
-      unsigned depth_test_function:3;
-      unsigned depth_test:1;
-      unsigned bf_stencil_write_mask:8;
-      unsigned bf_stencil_test_mask:8;
-   } cc2;
-
-
-   struct
-   {
-      unsigned pad0:8;
-      unsigned alpha_test_func:3;
-      unsigned alpha_test:1;
-      unsigned blend_enable:1;
-      unsigned ia_blend_enable:1;
-      unsigned pad1:1;
-      unsigned alpha_test_format:1;
-      unsigned pad2:16;
-   } cc3;
-
-   struct
-   {
-      unsigned pad0:5;
-      unsigned cc_viewport_state_offset:27;
-   } cc4;
-
-   struct
-   {
-      unsigned pad0:2;
-      unsigned ia_dest_blend_factor:5;
-      unsigned ia_src_blend_factor:5;
-      unsigned ia_blend_function:3;
-      unsigned statistics_enable:1;
-      unsigned logicop_func:4;
-      unsigned pad1:11;
-      unsigned dither_enable:1;
-   } cc5;
-
-   struct
-   {
-      unsigned clamp_post_alpha_blend:1;
-      unsigned clamp_pre_alpha_blend:1;
-      unsigned clamp_range:2;
-      unsigned pad0:11;
-      unsigned y_dither_offset:2;
-      unsigned x_dither_offset:2;
-      unsigned dest_blend_factor:5;
-      unsigned src_blend_factor:5;
-      unsigned blend_function:3;
-   } cc6;
-
-   struct {
-      union {
-	 float f;
-	 ubyte ub[4];
-      } alpha_ref;
-   } cc7;
-};
-
-
-
-struct brw_sf_unit_state
-{
-   struct thread0 thread0;
-   struct thread1 thread1;
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct
-   {
-      unsigned pad0:10;
-      unsigned stats_enable:1;
-      unsigned nr_urb_entries:7;
-      unsigned pad1:1;
-      unsigned urb_entry_allocation_size:5;
-      unsigned pad2:1;
-      unsigned max_threads:6;
-      unsigned pad3:1;
-   } thread4;
-
-   struct
-   {
-      unsigned front_winding:1;
-      unsigned viewport_transform:1;
-      unsigned pad0:3;
-      unsigned sf_viewport_state_offset:27;
-   } sf5;
-
-   struct
-   {
-      unsigned pad0:9;
-      unsigned dest_org_vbias:4;
-      unsigned dest_org_hbias:4;
-      unsigned scissor:1;
-      unsigned disable_2x2_trifilter:1;
-      unsigned disable_zero_pix_trifilter:1;
-      unsigned point_rast_rule:2;
-      unsigned line_endcap_aa_region_width:2;
-      unsigned line_width:4;
-      unsigned fast_scissor_disable:1;
-      unsigned cull_mode:2;
-      unsigned aa_enable:1;
-   } sf6;
-
-   struct
-   {
-      unsigned point_size:11;
-      unsigned use_point_size_state:1;
-      unsigned subpixel_precision:1;
-      unsigned sprite_point:1;
-      unsigned pad0:11;
-      unsigned trifan_pv:2;
-      unsigned linestrip_pv:2;
-      unsigned tristrip_pv:2;
-      unsigned line_last_pixel_enable:1;
-   } sf7;
-
-};
-
-
-struct brw_gs_unit_state
-{
-   struct thread0 thread0;
-   struct thread1 thread1;
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct
-   {
-      unsigned pad0:10;
-      unsigned stats_enable:1;
-      unsigned nr_urb_entries:7;
-      unsigned pad1:1;
-      unsigned urb_entry_allocation_size:5;
-      unsigned pad2:1;
-      unsigned max_threads:1;
-      unsigned pad3:6;
-   } thread4;
-
-   struct
-   {
-      unsigned sampler_count:3;
-      unsigned pad0:2;
-      unsigned sampler_state_pointer:27;
-   } gs5;
-
-
-   struct
-   {
-      unsigned max_vp_index:4;
-      unsigned pad0:26;
-      unsigned reorder_enable:1;
-      unsigned pad1:1;
-   } gs6;
-};
-
-
-struct brw_vs_unit_state
-{
-   struct thread0 thread0;
-   struct thread1 thread1;
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct
-   {
-      unsigned pad0:10;
-      unsigned stats_enable:1;
-      unsigned nr_urb_entries:7;
-      unsigned pad1:1;
-      unsigned urb_entry_allocation_size:5;
-      unsigned pad2:1;
-      unsigned max_threads:4;
-      unsigned pad3:3;
-   } thread4;
-
-   struct
-   {
-      unsigned sampler_count:3;
-      unsigned pad0:2;
-      unsigned sampler_state_pointer:27;
-   } vs5;
-
-   struct
-   {
-      unsigned vs_enable:1;
-      unsigned vert_cache_disable:1;
-      unsigned pad0:30;
-   } vs6;
-};
-
-
-struct brw_wm_unit_state
-{
-   struct thread0 thread0;
-   struct thread1 thread1;
-   struct thread2 thread2;
-   struct thread3 thread3;
-
-   struct {
-      unsigned stats_enable:1;
-      unsigned pad0:1;
-      unsigned sampler_count:3;
-      unsigned sampler_state_pointer:27;
-   } wm4;
-
-   struct
-   {
-      unsigned enable_8_pix:1;
-      unsigned enable_16_pix:1;
-      unsigned enable_32_pix:1;
-      unsigned pad0:7;
-      unsigned legacy_global_depth_bias:1;
-      unsigned line_stipple:1;
-      unsigned depth_offset:1;
-      unsigned polygon_stipple:1;
-      unsigned line_aa_region_width:2;
-      unsigned line_endcap_aa_region_width:2;
-      unsigned early_depth_test:1;
-      unsigned thread_dispatch_enable:1;
-      unsigned program_uses_depth:1;
-      unsigned program_computes_depth:1;
-      unsigned program_uses_killpixel:1;
-      unsigned legacy_line_rast: 1;
-      unsigned pad1:1;
-      unsigned max_threads:6;
-      unsigned pad2:1;
-   } wm5;
-
-   float global_depth_offset_constant;
-   float global_depth_offset_scale;
-};
-
-struct brw_sampler_default_color {
-   float color[4];
-};
-
-struct brw_sampler_state
-{
-
-   struct
-   {
-      unsigned shadow_function:3;
-      unsigned lod_bias:11;
-      unsigned min_filter:3;
-      unsigned mag_filter:3;
-      unsigned mip_filter:2;
-      unsigned base_level:5;
-      unsigned pad:1;
-      unsigned lod_preclamp:1;
-      unsigned default_color_mode:1;
-      unsigned pad0:1;
-      unsigned disable:1;
-   } ss0;
-
-   struct
-   {
-      unsigned r_wrap_mode:3;
-      unsigned t_wrap_mode:3;
-      unsigned s_wrap_mode:3;
-      unsigned pad:3;
-      unsigned max_lod:10;
-      unsigned min_lod:10;
-   } ss1;
-
-
-   struct
-   {
-      unsigned pad:5;
-      unsigned default_color_pointer:27;
-   } ss2;
-
-   struct
-   {
-      unsigned pad:19;
-      unsigned max_aniso:3;
-      unsigned chroma_key_mode:1;
-      unsigned chroma_key_index:2;
-      unsigned chroma_key_enable:1;
-      unsigned monochrome_filter_width:3;
-      unsigned monochrome_filter_height:3;
-   } ss3;
-};
-
-
-struct brw_clipper_viewport
-{
-   float xmin;
-   float xmax;
-   float ymin;
-   float ymax;
-};
-
-struct brw_cc_viewport
-{
-   float min_depth;
-   float max_depth;
-};
-
-struct brw_sf_viewport
-{
-   struct {
-      float m00;
-      float m11;
-      float m22;
-      float m30;
-      float m31;
-      float m32;
-   } viewport;
-
-   struct {
-      short xmin;
-      short ymin;
-      short xmax;
-      short ymax;
-   } scissor;
-};
-
-/* Documented in the subsystem/shared-functions/sampler chapter...
- */
-struct brw_surface_state
-{
-   struct {
-      unsigned cube_pos_z:1;
-      unsigned cube_neg_z:1;
-      unsigned cube_pos_y:1;
-      unsigned cube_neg_y:1;
-      unsigned cube_pos_x:1;
-      unsigned cube_neg_x:1;
-      unsigned pad:4;
-      unsigned mipmap_layout_mode:1;
-      unsigned vert_line_stride_ofs:1;
-      unsigned vert_line_stride:1;
-      unsigned color_blend:1;
-      unsigned writedisable_blue:1;
-      unsigned writedisable_green:1;
-      unsigned writedisable_red:1;
-      unsigned writedisable_alpha:1;
-      unsigned surface_format:9;
-      unsigned data_return_format:1;
-      unsigned pad0:1;
-      unsigned surface_type:3;
-   } ss0;
-
-   struct {
-      unsigned base_addr;
-   } ss1;
-
-   struct {
-      unsigned pad:2;
-      unsigned mip_count:4;
-      unsigned width:13;
-      unsigned height:13;
-   } ss2;
-
-   struct {
-      unsigned tile_walk:1;
-      unsigned tiled_surface:1;
-      unsigned pad:1;
-      unsigned pitch:18;
-      unsigned depth:11;
-   } ss3;
-
-   struct {
-      unsigned pad:19;
-      unsigned min_array_elt:9;
-      unsigned min_lod:4;
-   } ss4;
-};
-
-
-
-struct brw_vertex_buffer_state
-{
-   struct {
-      unsigned pitch:11;
-      unsigned pad:15;
-      unsigned access_type:1;
-      unsigned vb_index:5;
-   } vb0;
-
-   unsigned start_addr;
-   unsigned max_index;
-#if 1
-   unsigned instance_data_step_rate; /* not included for sequential/random vertices? */
-#endif
-};
-
-#define BRW_VBP_MAX 17
-
-struct brw_vb_array_state {
-   struct header header;
-   struct brw_vertex_buffer_state vb[BRW_VBP_MAX];
-};
-
-
-struct brw_vertex_element_state
-{
-   struct
-   {
-      unsigned src_offset:11;
-      unsigned pad:5;
-      unsigned src_format:9;
-      unsigned pad0:1;
-      unsigned valid:1;
-      unsigned vertex_buffer_index:5;
-   } ve0;
-
-   struct
-   {
-      unsigned dst_offset:8;
-      unsigned pad:8;
-      unsigned vfcomponent3:4;
-      unsigned vfcomponent2:4;
-      unsigned vfcomponent1:4;
-      unsigned vfcomponent0:4;
-   } ve1;
-};
-
-#define BRW_VEP_MAX 18
-
-struct brw_vertex_element_packet {
-   struct header header;
-   struct brw_vertex_element_state ve[BRW_VEP_MAX]; /* note: less than _TNL_ATTRIB_MAX */
-};
-
-
-struct brw_urb_immediate {
-   unsigned opcode:4;
-   unsigned offset:6;
-   unsigned swizzle_control:2;
-   unsigned pad:1;
-   unsigned allocate:1;
-   unsigned used:1;
-   unsigned complete:1;
-   unsigned response_length:4;
-   unsigned msg_length:4;
-   unsigned msg_target:4;
-   unsigned pad1:3;
-   unsigned end_of_thread:1;
-};
-
-/* Instruction format for the execution units:
- */
-
-struct brw_instruction
-{
-   struct
-   {
-      unsigned opcode:7;
-      unsigned pad:1;
-      unsigned access_mode:1;
-      unsigned mask_control:1;
-      unsigned dependency_control:2;
-      unsigned compression_control:2;
-      unsigned thread_control:2;
-      unsigned predicate_control:4;
-      unsigned predicate_inverse:1;
-      unsigned execution_size:3;
-      unsigned destreg__conditonalmod:4; /* destreg - send, conditionalmod - others */
-      unsigned pad0:2;
-      unsigned debug_control:1;
-      unsigned saturate:1;
-   } header;
-
-   union {
-      struct
-      {
-	 unsigned dest_reg_file:2;
-	 unsigned dest_reg_type:3;
-	 unsigned src0_reg_file:2;
-	 unsigned src0_reg_type:3;
-	 unsigned src1_reg_file:2;
-	 unsigned src1_reg_type:3;
-	 unsigned pad:1;
-	 unsigned dest_subreg_nr:5;
-	 unsigned dest_reg_nr:8;
-	 unsigned dest_horiz_stride:2;
-	 unsigned dest_address_mode:1;
-      } da1;
-
-      struct
-      {
-	 unsigned dest_reg_file:2;
-	 unsigned dest_reg_type:3;
-	 unsigned src0_reg_file:2;
-	 unsigned src0_reg_type:3;
-	 unsigned pad:6;
-	 int dest_indirect_offset:10;	/* offset against the deref'd address reg */
-	 unsigned dest_subreg_nr:3; /* subnr for the address reg a0.x */
-	 unsigned dest_horiz_stride:2;
-	 unsigned dest_address_mode:1;
-      } ia1;
-
-      struct
-      {
-	 unsigned dest_reg_file:2;
-	 unsigned dest_reg_type:3;
-	 unsigned src0_reg_file:2;
-	 unsigned src0_reg_type:3;
-	 unsigned src1_reg_file:2;
-	 unsigned src1_reg_type:3;
-	 unsigned pad0:1;
-	 unsigned dest_writemask:4;
-	 unsigned dest_subreg_nr:1;
-	 unsigned dest_reg_nr:8;
-	 unsigned pad1:2;
-	 unsigned dest_address_mode:1;
-      } da16;
-
-      struct
-      {
-	 unsigned dest_reg_file:2;
-	 unsigned dest_reg_type:3;
-	 unsigned src0_reg_file:2;
-	 unsigned src0_reg_type:3;
-	 unsigned pad0:6;
-	 unsigned dest_writemask:4;
-	 int dest_indirect_offset:6;
-	 unsigned dest_subreg_nr:3;
-	 unsigned pad1:2;
-	 unsigned dest_address_mode:1;
-      } ia16;
-   } bits1;
-
-
-   union {
-      struct
-      {
-	 unsigned src0_subreg_nr:5;
-	 unsigned src0_reg_nr:8;
-	 unsigned src0_abs:1;
-	 unsigned src0_negate:1;
-	 unsigned src0_address_mode:1;
-	 unsigned src0_horiz_stride:2;
-	 unsigned src0_width:3;
-	 unsigned src0_vert_stride:4;
-	 unsigned flag_reg_nr:1;
-	 unsigned pad:6;
-      } da1;
-
-      struct
-      {
-	 int src0_indirect_offset:10;
-	 unsigned src0_subreg_nr:3;
-	 unsigned src0_abs:1;
-	 unsigned src0_negate:1;
-	 unsigned src0_address_mode:1;
-	 unsigned src0_horiz_stride:2;
-	 unsigned src0_width:3;
-	 unsigned src0_vert_stride:4;
-	 unsigned flag_reg_nr:1;
-	 unsigned pad:6;
-      } ia1;
-
-      struct
-      {
-	 unsigned src0_swz_x:2;
-	 unsigned src0_swz_y:2;
-	 unsigned src0_subreg_nr:1;
-	 unsigned src0_reg_nr:8;
-	 unsigned src0_abs:1;
-	 unsigned src0_negate:1;
-	 unsigned src0_address_mode:1;
-	 unsigned src0_swz_z:2;
-	 unsigned src0_swz_w:2;
-	 unsigned pad0:1;
-	 unsigned src0_vert_stride:4;
-	 unsigned flag_reg_nr:1;
-	 unsigned pad1:6;
-      } da16;
-
-      struct
-      {
-	 unsigned src0_swz_x:2;
-	 unsigned src0_swz_y:2;
-	 int src0_indirect_offset:6;
-	 unsigned src0_subreg_nr:3;
-	 unsigned src0_abs:1;
-	 unsigned src0_negate:1;
-	 unsigned src0_address_mode:1;
-	 unsigned src0_swz_z:2;
-	 unsigned src0_swz_w:2;
-	 unsigned pad0:1;
-	 unsigned src0_vert_stride:4;
-	 unsigned flag_reg_nr:1;
-	 unsigned pad1:6;
-      } ia16;
-
-   } bits2;
-
-   union
-   {
-      struct
-      {
-	 unsigned src1_subreg_nr:5;
-	 unsigned src1_reg_nr:8;
-	 unsigned src1_abs:1;
-	 unsigned src1_negate:1;
-	 unsigned pad:1;
-	 unsigned src1_horiz_stride:2;
-	 unsigned src1_width:3;
-	 unsigned src1_vert_stride:4;
-	 unsigned pad0:7;
-      } da1;
-
-      struct
-      {
-	 unsigned src1_swz_x:2;
-	 unsigned src1_swz_y:2;
-	 unsigned src1_subreg_nr:1;
-	 unsigned src1_reg_nr:8;
-	 unsigned src1_abs:1;
-	 unsigned src1_negate:1;
-	 unsigned pad0:1;
-	 unsigned src1_swz_z:2;
-	 unsigned src1_swz_w:2;
-	 unsigned pad1:1;
-	 unsigned src1_vert_stride:4;
-	 unsigned pad2:7;
-      } da16;
-
-      struct
-      {
-	 int  src1_indirect_offset:10;
-	 unsigned src1_subreg_nr:3;
-	 unsigned src1_abs:1;
-	 unsigned src1_negate:1;
-	 unsigned pad0:1;
-	 unsigned src1_horiz_stride:2;
-	 unsigned src1_width:3;
-	 unsigned src1_vert_stride:4;
-	 unsigned flag_reg_nr:1;
-	 unsigned pad1:6;
-      } ia1;
-
-      struct
-      {
-	 unsigned src1_swz_x:2;
-	 unsigned src1_swz_y:2;
-	 int  src1_indirect_offset:6;
-	 unsigned src1_subreg_nr:3;
-	 unsigned src1_abs:1;
-	 unsigned src1_negate:1;
-	 unsigned pad0:1;
-	 unsigned src1_swz_z:2;
-	 unsigned src1_swz_w:2;
-	 unsigned pad1:1;
-	 unsigned src1_vert_stride:4;
-	 unsigned flag_reg_nr:1;
-	 unsigned pad2:6;
-      } ia16;
-
-
-      struct
-      {
-	 int  jump_count:16;	/* note: signed */
-	 unsigned  pop_count:4;
-	 unsigned  pad0:12;
-      } if_else;
-
-      struct {
-	 unsigned function:4;
-	 unsigned int_type:1;
-	 unsigned precision:1;
-	 unsigned saturate:1;
-	 unsigned data_type:1;
-	 unsigned pad0:8;
-	 unsigned response_length:4;
-	 unsigned msg_length:4;
-	 unsigned msg_target:4;
-	 unsigned pad1:3;
-	 unsigned end_of_thread:1;
-      } math;
-
-      struct {
-	 unsigned binding_table_index:8;
-	 unsigned sampler:4;
-	 unsigned return_format:2;
-	 unsigned msg_type:2;
-	 unsigned response_length:4;
-	 unsigned msg_length:4;
-	 unsigned msg_target:4;
-	 unsigned pad1:3;
-	 unsigned end_of_thread:1;
-      } sampler;
-
-      struct brw_urb_immediate urb;
-
-      struct {
-	 unsigned binding_table_index:8;
-	 unsigned msg_control:4;
-	 unsigned msg_type:2;
-	 unsigned target_cache:2;
-	 unsigned response_length:4;
-	 unsigned msg_length:4;
-	 unsigned msg_target:4;
-	 unsigned pad1:3;
-	 unsigned end_of_thread:1;
-      } dp_read;
-
-      struct {
-	 unsigned binding_table_index:8;
-	 unsigned msg_control:3;
-	 unsigned pixel_scoreboard_clear:1;
-	 unsigned msg_type:3;
-	 unsigned send_commit_msg:1;
-	 unsigned response_length:4;
-	 unsigned msg_length:4;
-	 unsigned msg_target:4;
-	 unsigned pad1:3;
-	 unsigned end_of_thread:1;
-      } dp_write;
-
-      struct {
-	 unsigned pad:16;
-	 unsigned response_length:4;
-	 unsigned msg_length:4;
-	 unsigned msg_target:4;
-	 unsigned pad1:3;
-	 unsigned end_of_thread:1;
-      } generic;
-
-      int d;
-      unsigned ud;
-   } bits3;
-};
-
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c
deleted file mode 100644
index 724a69b2eee..00000000000
--- a/src/gallium/drivers/i965simple/brw_surface.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "brw_blit.h"
-#include "brw_context.h"
-#include "brw_state.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "util/u_tile.h"
-#include "util/u_rect.h"
-
-
-
-/* Assumes all values are within bounds -- no checking at this level -
- * do it higher up if required.
- */
-static void
-brw_surface_copy(struct pipe_context *pipe,
-                 struct pipe_surface *dst,
-                 unsigned dstx, unsigned dsty,
-                 struct pipe_surface *src,
-                 unsigned srcx, unsigned srcy, unsigned width, unsigned height)
-{
-   assert( dst != src );
-   assert( dst->block.size == src->block.size );
-   assert( dst->block.width == src->block.height );
-   assert( dst->block.height == src->block.height );
-
-   if (0) {
-      void *dst_map = pipe->screen->surface_map( pipe->screen,
-                                                 dst,
-                                                 PIPE_BUFFER_USAGE_CPU_WRITE );
-      
-      const void *src_map = pipe->screen->surface_map( pipe->screen,
-                                                       src,
-                                                       PIPE_BUFFER_USAGE_CPU_READ );
-      
-      util_copy_rect(dst_map,
-                     &dst->block,
-                     dst->stride,
-                     dstx, dsty,
-                     width, height,
-                     src_map,
-                     src->stride,
-                     srcx, srcy);
-
-      pipe->screen->surface_unmap(pipe->screen, src);
-      pipe->screen->surface_unmap(pipe->screen, dst);
-   }
-   else {
-      struct brw_texture *dst_tex = (struct brw_texture *)dst->texture;
-      struct brw_texture *src_tex = (struct brw_texture *)src->texture;
-      assert(dst->block.width == 1);
-      assert(dst->block.height == 1);
-      brw_copy_blit(brw_context(pipe),
-                    FALSE,
-                    dst->block.size,
-                    (short) src->stride/src->block.size, src_tex->buffer, src->offset, FALSE,
-                    (short) dst->stride/dst->block.size, dst_tex->buffer, dst->offset, FALSE,
-                    (short) srcx, (short) srcy, (short) dstx, (short) dsty,
-                    (short) width, (short) height, PIPE_LOGICOP_COPY);
-   }
-}
-
-
-static void
-brw_surface_fill(struct pipe_context *pipe,
-                 struct pipe_surface *dst,
-                 unsigned dstx, unsigned dsty,
-                 unsigned width, unsigned height, unsigned value)
-{
-   if (0) {
-      void *dst_map = pipe->screen->surface_map( pipe->screen,
-                                                 dst,
-                                                 PIPE_BUFFER_USAGE_CPU_WRITE );
-
-      util_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value);
-
-      pipe->screen->surface_unmap(pipe->screen, dst);
-   }
-   else {
-      struct brw_texture *tex = (struct brw_texture *)dst->texture;
-      assert(dst->block.width == 1);
-      assert(dst->block.height == 1);
-      brw_fill_blit(brw_context(pipe),
-                    dst->block.size,
-                    (short) dst->stride/dst->block.size, 
-                    tex->buffer, dst->offset, FALSE,
-                    (short) dstx, (short) dsty,
-                    (short) width, (short) height,
-                    value);
-   }
-}
-
-
-void
-brw_init_surface_functions(struct brw_context *brw)
-{
-   brw->pipe.surface_copy  = brw_surface_copy;
-   brw->pipe.surface_fill  = brw_surface_fill;
-}
diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c
deleted file mode 100644
index 998ffaeac4a..00000000000
--- a/src/gallium/drivers/i965simple/brw_tex_layout.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-/* Code to layout images in a mipmap tree for i965.
- */
-
-#include "pipe/p_state.h"
-#include "pipe/p_context.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "brw_context.h"
-#include "brw_tex_layout.h"
-
-
-#define FILE_DEBUG_FLAG DEBUG_TEXTURE
-
-#if 0
-unsigned intel_compressed_alignment(unsigned internalFormat)
-{
-    unsigned alignment = 4;
-
-    switch (internalFormat) {
-    case GL_COMPRESSED_RGB_FXT1_3DFX:
-    case GL_COMPRESSED_RGBA_FXT1_3DFX:
-        alignment = 8;
-        break;
-
-    default:
-        break;
-    }
-
-    return alignment;
-}
-#endif
-
-
-static void intel_miptree_set_image_offset(struct brw_texture *tex,
-                                           unsigned level,
-                                           unsigned img,
-                                           unsigned x, unsigned y)
-{
-   struct pipe_texture *pt = &tex->base;
-   if (img == 0 && level == 0)
-      assert(x == 0 && y == 0);
-   assert(img < tex->nr_images[level]);
-
-   tex->image_offset[level][img] = y * tex->stride + x * pt->block.size;
-}
-
-static void intel_miptree_set_level_info(struct brw_texture *tex,
-                                         unsigned level,
-                                         unsigned nr_images,
-                                         unsigned x, unsigned y,
-                                         unsigned w, unsigned h, unsigned d)
-{
-   struct pipe_texture *pt = &tex->base;
-
-   assert(level < PIPE_MAX_TEXTURE_LEVELS);
-
-   pt->width[level] = w;
-   pt->height[level] = h;
-   pt->depth[level] = d;
-   
-   pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w);
-   pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h);
-
-   tex->level_offset[level] = y * tex->stride + x * tex->base.block.size;
-   tex->nr_images[level] = nr_images;
-
-   /*
-   DBG("%s level %d size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
-       level, w, h, d, x, y, tex->level_offset[level]);
-   */
-
-   /* Not sure when this would happen, but anyway: 
-    */
-   if (tex->image_offset[level]) {
-      FREE(tex->image_offset[level]);
-      tex->image_offset[level] = NULL;
-   }
-
-   assert(nr_images);
-   assert(!tex->image_offset[level]);
-
-   tex->image_offset[level] = (unsigned *) MALLOC(nr_images * sizeof(unsigned));
-   tex->image_offset[level][0] = 0;
-}
-
-static void i945_miptree_layout_2d(struct brw_texture *tex)
-{
-   struct pipe_texture *pt = &tex->base;
-   const int align_x = 2, align_y = 4;
-   unsigned level;
-   unsigned x = 0;
-   unsigned y = 0;
-   unsigned width = pt->width[0];
-   unsigned height = pt->height[0];
-   unsigned nblocksx = pt->nblocksx[0];
-   unsigned nblocksy = pt->nblocksy[0];
-
-   tex->stride = align(pt->nblocksx[0] * pt->block.size, 4);
-
-   /* May need to adjust pitch to accomodate the placement of
-    * the 2nd mipmap level.  This occurs when the alignment
-    * constraints of mipmap placement push the right edge of the
-    * 2nd mipmap level out past the width of its parent.
-    */
-   if (pt->last_level > 0) {
-      unsigned mip1_nblocksx 
-	 = align(pf_get_nblocksx(&pt->block, minify(width)), align_x)
-         + pf_get_nblocksx(&pt->block, minify(minify(width)));
-
-      if (mip1_nblocksx > nblocksx)
-	 tex->stride = mip1_nblocksx * pt->block.size;
-   }
-
-   /* Pitch must be a whole number of dwords
-    */
-   tex->stride = align(tex->stride, 64);
-   tex->total_nblocksy = 0;
-
-   for (level = 0; level <= pt->last_level; level++) {
-      intel_miptree_set_level_info(tex, level, 1, x, y, width,
-				   height, 1);
-
-      nblocksy = align(nblocksy, align_y);
-
-      /* Because the images are packed better, the final offset
-       * might not be the maximal one:
-       */
-      tex->total_nblocksy = MAX2(tex->total_nblocksy, y + nblocksy);
-
-      /* Layout_below: step right after second mipmap level.
-       */
-      if (level == 1) {
-	 x += align(nblocksx, align_x);
-      }
-      else {
-	 y += nblocksy;
-      }
-
-      width  = minify(width);
-      height = minify(height);
-      nblocksx = pf_get_nblocksx(&pt->block, width);
-      nblocksy = pf_get_nblocksy(&pt->block, height);
-   }
-}
-
-static boolean brw_miptree_layout(struct brw_texture *tex)
-{
-   struct pipe_texture *pt = &tex->base;
-   /* XXX: these vary depending on image format:
-    */
-/*    int align_w = 4; */
-
-   switch (pt->target) {
-   case PIPE_TEXTURE_CUBE:
-   case PIPE_TEXTURE_3D: {
-      unsigned width  = pt->width[0];
-      unsigned height = pt->height[0];
-      unsigned depth = pt->depth[0];
-      unsigned nblocksx = pt->nblocksx[0];
-      unsigned nblocksy = pt->nblocksy[0];
-      unsigned pack_x_pitch, pack_x_nr;
-      unsigned pack_y_pitch;
-      unsigned level;
-      unsigned align_h = 2;
-      unsigned align_w = 4;
-
-      tex->total_nblocksy = 0;
-
-      tex->stride = align(pt->nblocksx[0], 4);
-      pack_y_pitch = align(pt->nblocksy[0], align_h);
-
-      pack_x_pitch = tex->stride / pt->block.size;
-      pack_x_nr = 1;
-
-      for (level = 0; level <= pt->last_level; level++) {
-	 unsigned nr_images = pt->target == PIPE_TEXTURE_3D ? depth : 6;
-	 int x = 0;
-	 int y = 0;
-	 uint q, j;
-
-	 intel_miptree_set_level_info(tex, level, nr_images,
-				      0, tex->total_nblocksy,
-				      width, height, depth);
-
-	 for (q = 0; q < nr_images;) {
-	    for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
-	       intel_miptree_set_image_offset(tex, level, q, x, y);
-	       x += pack_x_pitch;
-	    }
-
-	    x = 0;
-	    y += pack_y_pitch;
-	 }
-
-
-	 tex->total_nblocksy += y;
-	 width  = minify(width);
-	 height = minify(height);
-	 depth  = minify(depth);
-         nblocksx = pf_get_nblocksx(&pt->block, width);
-         nblocksy = pf_get_nblocksy(&pt->block, height);
-
-         if (pf_is_compressed(pt->format)) {
-            pack_y_pitch = (height + 3) / 4;
-
-            if (pack_x_pitch > align(width, align_w)) {
-               pack_x_pitch = align(width, align_w);
-               pack_x_nr <<= 1;
-            }
-         } else {
-            if (pack_x_pitch > 4) {
-               pack_x_pitch >>= 1;
-               pack_x_nr <<= 1;
-               assert(pack_x_pitch * pack_x_nr * pt->block.size <= tex->stride);
-            }
-
-            if (pack_y_pitch > 2) {
-               pack_y_pitch >>= 1;
-               pack_y_pitch = align(pack_y_pitch, align_h);
-            }
-         }
-
-      }
-      break;
-   }
-
-   default:
-      i945_miptree_layout_2d(tex);
-      break;
-   }
-#if 0
-   PRINT("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
-       pt->pitch,
-       pt->total_nblocksy,
-       pt->block.size,
-       pt->stride * pt->total_nblocksy );
-#endif
-
-   return TRUE;
-}
-
-
-static struct pipe_texture *
-brw_texture_create_screen(struct pipe_screen *screen,
-                          const struct pipe_texture *templat)
-{
-   struct brw_texture *tex = CALLOC_STRUCT(brw_texture);
-
-   if (tex) {
-      tex->base = *templat;
-      pipe_reference_init(&tex->base.reference, 1);
-
-      tex->base.nblocksx[0] = pf_get_nblocksx(&tex->base.block, tex->base.width[0]);
-      tex->base.nblocksy[0] = pf_get_nblocksy(&tex->base.block, tex->base.height[0]);
-   
-      if (brw_miptree_layout(tex))
-	 tex->buffer = screen->buffer_create(screen, 64,
-                                          PIPE_BUFFER_USAGE_PIXEL,
-                                          tex->stride *
-                                          tex->total_nblocksy);
-
-      if (!tex->buffer) {
-	 FREE(tex);
-         return NULL;
-      }
-   }
-
-   return &tex->base;
-}
-
-
-static void
-brw_texture_destroy_screen(struct pipe_texture *pt)
-{
-   struct brw_texture *tex = (struct brw_texture *)pt;
-   uint i;
-
-   /*
-     DBG("%s deleting %p\n", __FUNCTION__, (void *) tex);
-   */
-
-   pipe_buffer_reference(&tex->buffer, NULL);
-
-   for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++)
-      if (tex->image_offset[i])
-         free(tex->image_offset[i]);
-
-   free(tex);
-}
-
-
-static struct pipe_surface *
-brw_get_tex_surface_screen(struct pipe_screen *screen,
-                           struct pipe_texture *pt,
-                           unsigned face, unsigned level, unsigned zslice)
-{
-   struct brw_texture *tex = (struct brw_texture *)pt;
-   struct pipe_surface *ps;
-   unsigned offset;  /* in bytes */
-
-   offset = tex->level_offset[level];
-
-   if (pt->target == PIPE_TEXTURE_CUBE) {
-      offset += tex->image_offset[level][face];
-   }
-   else if (pt->target == PIPE_TEXTURE_3D) {
-      offset += tex->image_offset[level][zslice];
-   }
-   else {
-      assert(face == 0);
-      assert(zslice == 0);
-   }
-
-   ps = CALLOC_STRUCT(pipe_surface);
-   if (ps) {
-      pipe_reference_init(&ps->reference, 1);
-      pipe_texture_reference(&ps->texture, pt);
-      ps->format = pt->format;
-      ps->width = pt->width[level];
-      ps->height = pt->height[level];
-      ps->block = pt->block;
-      ps->nblocksx = pt->nblocksx[level];
-      ps->nblocksy = pt->nblocksy[level];
-      ps->stride = tex->stride;
-      ps->offset = offset;
-   }
-   return ps;
-}
-
-
-void
-brw_init_texture_functions(struct brw_context *brw)
-{
-//   brw->pipe.texture_update = brw_texture_update;
-}
-
-
-void
-brw_init_screen_texture_funcs(struct pipe_screen *screen)
-{
-   screen->texture_create  = brw_texture_create_screen;
-   screen->texture_destroy = brw_texture_destroy_screen;
-   screen->get_tex_surface = brw_get_tex_surface_screen;
-}
-
diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.h b/src/gallium/drivers/i965simple/brw_tex_layout.h
deleted file mode 100644
index a6b6ba81460..00000000000
--- a/src/gallium/drivers/i965simple/brw_tex_layout.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
-
-
-#ifndef BRW_TEX_LAYOUT_H
-#define BRW_TEX_LAYOUT_H
-
-
-struct brw_context;
-struct pipe_screen;
-
-
-extern void
-brw_init_texture_functions(struct brw_context *brw);
-
-extern void
-brw_init_screen_texture_funcs(struct pipe_screen *screen);
-
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_urb.c b/src/gallium/drivers/i965simple/brw_urb.c
deleted file mode 100644
index 101a4367b90..00000000000
--- a/src/gallium/drivers/i965simple/brw_urb.c
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-//#include "brw_state.h"
-#include "brw_batch.h"
-#include "brw_defines.h"
-
-#define VS 0
-#define GS 1
-#define CLP 2
-#define SF 3
-#define CS 4
-
-/* XXX: Are the min_entry_size numbers useful?
- * XXX: Verify min_nr_entries, esp for VS.
- * XXX: Verify SF min_entry_size.
- */
-static const struct {
-   unsigned min_nr_entries;
-   unsigned preferred_nr_entries;
-   unsigned min_entry_size;
-   unsigned max_entry_size;
-} limits[CS+1] = {
-   { 8, 32, 1, 5 },			/* vs */
-   { 4, 8,  1, 5 },			/* gs */
-   { 6, 8,  1, 5 },			/* clp */
-   { 1, 8,  1, 12 },		        /* sf */
-   { 1, 4,  1, 32 }			/* cs */
-};
-
-
-static boolean check_urb_layout( struct brw_context *brw )
-{
-   brw->urb.vs_start = 0;
-   brw->urb.gs_start = brw->urb.nr_vs_entries * brw->urb.vsize;
-   brw->urb.clip_start = brw->urb.gs_start + brw->urb.nr_gs_entries * brw->urb.vsize;
-   brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize;
-   brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize;
-
-   return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= 256;
-}
-
-/* Most minimal update, forces re-emit of URB fence packet after GS
- * unit turned on/off.
- */
-static void recalculate_urb_fence( struct brw_context *brw )
-{
-   unsigned csize = brw->curbe.total_size;
-   unsigned vsize = brw->vs.prog_data->urb_entry_size;
-   unsigned sfsize = brw->sf.prog_data->urb_entry_size;
-
-   if (csize < limits[CS].min_entry_size)
-      csize = limits[CS].min_entry_size;
-
-   if (vsize < limits[VS].min_entry_size)
-      vsize = limits[VS].min_entry_size;
-
-   if (sfsize < limits[SF].min_entry_size)
-      sfsize = limits[SF].min_entry_size;
-
-   if (brw->urb.vsize < vsize ||
-       brw->urb.sfsize < sfsize ||
-       brw->urb.csize < csize ||
-       (brw->urb.constrained && (brw->urb.vsize > brw->urb.vsize ||
-				 brw->urb.sfsize > brw->urb.sfsize ||
-				 brw->urb.csize > brw->urb.csize))) {
-
-
-      brw->urb.csize = csize;
-      brw->urb.sfsize = sfsize;
-      brw->urb.vsize = vsize;
-
-      brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
-      brw->urb.nr_gs_entries = limits[GS].preferred_nr_entries;
-      brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries;
-      brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
-      brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries;
-
-      if (!check_urb_layout(brw)) {
-	 brw->urb.nr_vs_entries = limits[VS].min_nr_entries;
-	 brw->urb.nr_gs_entries = limits[GS].min_nr_entries;
-	 brw->urb.nr_clip_entries = limits[CLP].min_nr_entries;
-	 brw->urb.nr_sf_entries = limits[SF].min_nr_entries;
-	 brw->urb.nr_cs_entries = limits[CS].min_nr_entries;
-
-	 brw->urb.constrained = 1;
-
-	 if (!check_urb_layout(brw)) {
-	    /* This is impossible, given the maximal sizes of urb
-	     * entries and the values for minimum nr of entries
-	     * provided above.
-	     */
-	    debug_printf("couldn't calculate URB layout!\n");
-	    exit(1);
-	 }
-
-	 if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
-	    debug_printf("URB CONSTRAINED\n");
-      }
-      else
-	 brw->urb.constrained = 0;
-
-      if (BRW_DEBUG & DEBUG_URB)
-	 debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
-		      brw->urb.vs_start,
-		      brw->urb.gs_start,
-		      brw->urb.clip_start,
-		      brw->urb.sf_start,
-		      brw->urb.cs_start,
-		      256);
-
-      brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
-   }
-}
-
-
-const struct brw_tracked_state brw_recalculate_urb_fence = {
-   .dirty = {
-      .brw = BRW_NEW_CURBE_OFFSETS,
-      .cache = (CACHE_NEW_VS_PROG |
-		CACHE_NEW_SF_PROG)
-   },
-   .update = recalculate_urb_fence
-};
-
-
-
-
-
-void brw_upload_urb_fence(struct brw_context *brw)
-{
-   struct brw_urb_fence uf;
-   memset(&uf, 0, sizeof(uf));
-
-   uf.header.opcode = CMD_URB_FENCE;
-   uf.header.length = sizeof(uf)/4-2;
-   uf.header.vs_realloc = 1;
-   uf.header.gs_realloc = 1;
-   uf.header.clp_realloc = 1;
-   uf.header.sf_realloc = 1;
-   uf.header.vfe_realloc = 1;
-   uf.header.cs_realloc = 1;
-
-   /* The ordering below is correct, not the layout in the
-    * instruction.
-    *
-    * There are 256 urb reg pairs in total.
-    */
-   uf.bits0.vs_fence  = brw->urb.gs_start;
-   uf.bits0.gs_fence  = brw->urb.clip_start;
-   uf.bits0.clp_fence = brw->urb.sf_start;
-   uf.bits1.sf_fence  = brw->urb.cs_start;
-   uf.bits1.cs_fence  = 256;
-
-   BRW_BATCH_STRUCT(brw, &uf);
-}
diff --git a/src/gallium/drivers/i965simple/brw_util.c b/src/gallium/drivers/i965simple/brw_util.c
deleted file mode 100644
index 42391d7c8c5..00000000000
--- a/src/gallium/drivers/i965simple/brw_util.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_util.h"
-#include "brw_defines.h"
-
-#include "pipe/p_defines.h"
-
-unsigned brw_count_bits( unsigned val )
-{
-   unsigned i;
-   for (i = 0; val ; val >>= 1)
-      if (val & 1)
-	 i++;
-   return i;
-}
-
-
-unsigned brw_translate_blend_equation( int mode )
-{
-   switch (mode) {
-   case PIPE_BLEND_ADD:
-      return BRW_BLENDFUNCTION_ADD;
-   case PIPE_BLEND_MIN:
-      return BRW_BLENDFUNCTION_MIN;
-   case PIPE_BLEND_MAX:
-      return BRW_BLENDFUNCTION_MAX;
-   case PIPE_BLEND_SUBTRACT:
-      return BRW_BLENDFUNCTION_SUBTRACT;
-   case PIPE_BLEND_REVERSE_SUBTRACT:
-      return BRW_BLENDFUNCTION_REVERSE_SUBTRACT;
-   default:
-      assert(0);
-      return BRW_BLENDFUNCTION_ADD;
-   }
-}
-
-unsigned brw_translate_blend_factor( int factor )
-{
-   switch(factor) {
-   case PIPE_BLENDFACTOR_ZERO:
-      return BRW_BLENDFACTOR_ZERO;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
-      return BRW_BLENDFACTOR_SRC_ALPHA;
-   case PIPE_BLENDFACTOR_ONE:
-      return BRW_BLENDFACTOR_ONE;
-   case PIPE_BLENDFACTOR_SRC_COLOR:
-      return BRW_BLENDFACTOR_SRC_COLOR;
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      return BRW_BLENDFACTOR_INV_SRC_COLOR;
-   case PIPE_BLENDFACTOR_DST_COLOR:
-      return BRW_BLENDFACTOR_DST_COLOR;
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:
-      return BRW_BLENDFACTOR_INV_DST_COLOR;
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      return BRW_BLENDFACTOR_INV_SRC_ALPHA;
-   case PIPE_BLENDFACTOR_DST_ALPHA:
-      return BRW_BLENDFACTOR_DST_ALPHA;
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      return BRW_BLENDFACTOR_INV_DST_ALPHA;
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      return BRW_BLENDFACTOR_SRC_ALPHA_SATURATE;
-   case PIPE_BLENDFACTOR_CONST_COLOR:
-      return BRW_BLENDFACTOR_CONST_COLOR;
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      return BRW_BLENDFACTOR_INV_CONST_COLOR;
-   case PIPE_BLENDFACTOR_CONST_ALPHA:
-      return BRW_BLENDFACTOR_CONST_ALPHA;
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      return BRW_BLENDFACTOR_INV_CONST_ALPHA;
-   default:
-      assert(0);
-      return BRW_BLENDFACTOR_ZERO;
-   }
-}
diff --git a/src/gallium/drivers/i965simple/brw_util.h b/src/gallium/drivers/i965simple/brw_util.h
deleted file mode 100644
index d60e5934dba..00000000000
--- a/src/gallium/drivers/i965simple/brw_util.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-          
-
-#ifndef BRW_UTIL_H
-#define BRW_UTIL_H
-
-#include "pipe/p_state.h"
-
-extern unsigned brw_count_bits( unsigned val );
-extern unsigned brw_translate_blend_factor( int factor );
-extern unsigned brw_translate_blend_equation( int mode );
-
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_vs.c b/src/gallium/drivers/i965simple/brw_vs.c
deleted file mode 100644
index 92327e896db..00000000000
--- a/src/gallium/drivers/i965simple/brw_vs.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_vs.h"
-#include "brw_util.h"
-#include "brw_state.h"
-
-
-static void do_vs_prog( struct brw_context *brw,
-			const struct brw_vertex_program *vp,
-			struct brw_vs_prog_key *key )
-{
-   unsigned program_size;
-   const unsigned *program;
-   struct brw_vs_compile c;
-
-   memset(&c, 0, sizeof(c));
-   memcpy(&c.key, key, sizeof(*key));
-
-   brw_init_compile(&c.func);
-   c.vp = vp;
-
-   c.prog_data.outputs_written = vp->info.num_outputs;
-   c.prog_data.inputs_read = vp->info.num_inputs;
-
-#if 0
-   if (c.key.copy_edgeflag) {
-      c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE;
-      c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
-   }
-#endif
-
-   /* Emit GEN4 code.
-    */
-   brw_vs_emit(&c);
-
-   /* get the program
-    */
-   program = brw_get_program(&c.func, &program_size);
-
-   /*
-    */
-   brw->vs.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_VS_PROG],
-					      &c.key,
-					      sizeof(c.key),
-					      program,
-					      program_size,
-					      &c.prog_data,
-					      &brw->vs.prog_data);
-}
-
-
-static void brw_upload_vs_prog( struct brw_context *brw )
-{
-   struct brw_vs_prog_key key;
-   const struct brw_vertex_program *vp = brw->attribs.VertexProgram;
-
-   assert(vp);
-
-   memset(&key, 0, sizeof(key));
-
-   /* Just upload the program verbatim for now.  Always send it all
-    * the inputs it asks for, whether they are varying or not.
-    */
-   key.program_string_id = vp->id;
-   key.nr_userclip = brw->attribs.Clip.nr;
-   key.copy_edgeflag = (brw->attribs.Raster->fill_cw != PIPE_POLYGON_MODE_FILL ||
-			brw->attribs.Raster->fill_ccw != PIPE_POLYGON_MODE_FILL);
-
-   /* Make an early check for the key.
-    */
-   if (brw_search_cache(&brw->cache[BRW_VS_PROG],
-			&key, sizeof(key),
-			&brw->vs.prog_data,
-			&brw->vs.prog_gs_offset))
-       return;
-
-   do_vs_prog(brw, vp, &key);
-}
-
-
-/* See brw_vs.c:
- */
-const struct brw_tracked_state brw_vs_prog = {
-   .dirty = {
-      .brw   = BRW_NEW_VS,
-      .cache = 0
-   },
-   .update = brw_upload_vs_prog
-};
diff --git a/src/gallium/drivers/i965simple/brw_vs.h b/src/gallium/drivers/i965simple/brw_vs.h
deleted file mode 100644
index 070f9dfcaef..00000000000
--- a/src/gallium/drivers/i965simple/brw_vs.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#ifndef BRW_VS_H
-#define BRW_VS_H
-
-
-#include "brw_context.h"
-#include "brw_eu.h"
-
-
-struct brw_vs_prog_key {
-   unsigned program_string_id;
-   unsigned nr_userclip:4;
-   unsigned copy_edgeflag:1;
-   unsigned know_w_is_one:1;
-   unsigned pad:26;
-};
-
-
-struct brw_vs_compile {
-   struct brw_compile func;
-   struct brw_vs_prog_key key;
-   struct brw_vs_prog_data prog_data;
-
-   const struct brw_vertex_program *vp;
-
-   unsigned nr_inputs;
-
-   unsigned first_output;
-   unsigned nr_outputs;
-
-   unsigned first_tmp;
-   unsigned last_tmp;
-
-   struct brw_reg r0;
-   struct brw_reg r1;
-   struct brw_reg regs[12][128];
-   struct brw_reg tmp;
-   struct brw_reg stack;
-
-   struct {
-       boolean used_in_src;
-       struct brw_reg reg;
-   } output_regs[128];
-
-   struct brw_reg userplane[6];
-
-};
-
-void brw_vs_emit( struct brw_vs_compile *c );
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_vs_emit.c b/src/gallium/drivers/i965simple/brw_vs_emit.c
deleted file mode 100644
index 3ee82d95b3a..00000000000
--- a/src/gallium/drivers/i965simple/brw_vs_emit.c
+++ /dev/null
@@ -1,1330 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_context.h"
-#include "brw_vs.h"
-
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/tgsi_parse.h"
-
-struct brw_prog_info {
-   unsigned num_temps;
-   unsigned num_addrs;
-   unsigned num_consts;
-
-   unsigned writes_psize;
-
-   unsigned pos_idx;
-   unsigned result_edge_idx;
-   unsigned edge_flag_idx;
-   unsigned psize_idx;
-};
-
-/* Do things as simply as possible.  Allocate and populate all regs
- * ahead of time.
- */
-static void brw_vs_alloc_regs( struct brw_vs_compile *c,
-                               struct brw_prog_info *info )
-{
-   unsigned i, reg = 0, mrf;
-   unsigned nr_params;
-
-   /* r0 -- reserved as usual
-    */
-   c->r0 = brw_vec8_grf(reg, 0); reg++;
-
-   /* User clip planes from curbe:
-    */
-   if (c->key.nr_userclip) {
-      for (i = 0; i < c->key.nr_userclip; i++) {
-	 c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1);
-      }
-
-      /* Deal with curbe alignment:
-       */
-      reg += ((6+c->key.nr_userclip+3)/4)*2;
-   }
-
-   /* Vertex program parameters from curbe:
-    */
-   nr_params = c->prog_data.max_const;
-   for (i = 0; i < nr_params; i++) {
-      c->regs[TGSI_FILE_CONSTANT][i] = stride(brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
-   }
-   reg += (nr_params+1)/2;
-   c->prog_data.curb_read_length = reg - 1;
-
-
-
-   /* Allocate input regs:
-    */
-   c->nr_inputs = c->vp->info.num_inputs;
-   for (i = 0; i < c->nr_inputs; i++) {
-	 c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
-	 reg++;
-   }
-
-
-   /* Allocate outputs: TODO: could organize the non-position outputs
-    * to go straight into message regs.
-    */
-   c->nr_outputs = 0;
-   c->first_output = reg;
-   mrf = 4;
-   for (i = 0; i < c->vp->info.num_outputs; i++) {
-      c->nr_outputs++;
-#if 0
-      if (i == VERT_RESULT_HPOS) {
-         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
-         reg++;
-      }
-      else if (i == VERT_RESULT_PSIZ) {
-         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
-         reg++;
-         mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
-      }
-      else {
-         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
-         mrf++;
-      }
-#else
-      /*treat pos differently for now */
-      if (i == info->pos_idx) {
-         c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0);
-         reg++;
-      } else {
-         c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf);
-         mrf++;
-      }
-#endif
-   }
-
-   /* Allocate program temporaries:
-    */
-   for (i = 0; i < info->num_temps; i++) {
-      c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
-      reg++;
-   }
-
-   /* Address reg(s).  Don't try to use the internal address reg until
-    * deref time.
-    */
-   for (i = 0; i < info->num_addrs; i++) {
-      c->regs[TGSI_FILE_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
-                                               reg,
-                                               0,
-                                               BRW_REGISTER_TYPE_D,
-                                               BRW_VERTICAL_STRIDE_8,
-                                               BRW_WIDTH_8,
-                                               BRW_HORIZONTAL_STRIDE_1,
-                                               BRW_SWIZZLE_XXXX,
-                                               TGSI_WRITEMASK_X);
-      reg++;
-   }
-
-   for (i = 0; i < 128; i++) {
-      if (c->output_regs[i].used_in_src) {
-         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
-         reg++;
-      }
-   }
-
-   c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
-   reg += 2;
-
-
-   /* Some opcodes need an internal temporary:
-    */
-   c->first_tmp = reg;
-   c->last_tmp = reg;		/* for allocation purposes */
-
-   /* Each input reg holds data from two vertices.  The
-    * urb_read_length is the number of registers read from *each*
-    * vertex urb, so is half the amount:
-    */
-   c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
-
-   c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
-   c->prog_data.total_grf = reg;
-}
-
-
-static struct brw_reg get_tmp( struct brw_vs_compile *c )
-{
-   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
-
-   if (++c->last_tmp > c->prog_data.total_grf)
-      c->prog_data.total_grf = c->last_tmp;
-
-   return tmp;
-}
-
-static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
-{
-   if (tmp.nr == c->last_tmp-1)
-      c->last_tmp--;
-}
-
-static void release_tmps( struct brw_vs_compile *c )
-{
-   c->last_tmp = c->first_tmp;
-}
-
-
-static void unalias1( struct brw_vs_compile *c,
-		      struct brw_reg dst,
-		      struct brw_reg arg0,
-		      void (*func)( struct brw_vs_compile *,
-				    struct brw_reg,
-				    struct brw_reg ))
-{
-   if (dst.file == arg0.file && dst.nr == arg0.nr) {
-      struct brw_compile *p = &c->func;
-      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
-      func(c, tmp, arg0);
-      brw_MOV(p, dst, tmp);
-   }
-   else {
-      func(c, dst, arg0);
-   }
-}
-
-static void unalias2( struct brw_vs_compile *c,
-		      struct brw_reg dst,
-		      struct brw_reg arg0,
-		      struct brw_reg arg1,
-		      void (*func)( struct brw_vs_compile *,
-				    struct brw_reg,
-				    struct brw_reg,
-				    struct brw_reg ))
-{
-   if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
-       (dst.file == arg1.file && dst.nr == arg1.nr)) {
-      struct brw_compile *p = &c->func;
-      struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
-      func(c, tmp, arg0, arg1);
-      brw_MOV(p, dst, tmp);
-   }
-   else {
-      func(c, dst, arg0, arg1);
-   }
-}
-
-static void emit_sop( struct brw_compile *p,
-                      struct brw_reg dst,
-                      struct brw_reg arg0,
-                      struct brw_reg arg1,
-		      unsigned cond)
-{
-   brw_push_insn_state(p);
-   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-   brw_MOV(p, dst, brw_imm_f(1.0f));
-   brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-   brw_MOV(p, dst, brw_imm_f(0.0f));
-   brw_pop_insn_state(p);
-}
-
-static void emit_seq( struct brw_compile *p,
-                      struct brw_reg dst,
-                      struct brw_reg arg0,
-                      struct brw_reg arg1 )
-{
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
-}
-
-static void emit_sne( struct brw_compile *p,
-                      struct brw_reg dst,
-                      struct brw_reg arg0,
-                      struct brw_reg arg1 )
-{
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
-}
-static void emit_slt( struct brw_compile *p,
-		      struct brw_reg dst,
-		      struct brw_reg arg0,
-		      struct brw_reg arg1 )
-{
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
-}
-
-static void emit_sle( struct brw_compile *p,
-		      struct brw_reg dst,
-		      struct brw_reg arg0,
-		      struct brw_reg arg1 )
-{
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
-}
-
-static void emit_sgt( struct brw_compile *p,
-		      struct brw_reg dst,
-		      struct brw_reg arg0,
-		      struct brw_reg arg1 )
-{
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
-}
-
-static void emit_sge( struct brw_compile *p,
-		      struct brw_reg dst,
-		      struct brw_reg arg0,
-		      struct brw_reg arg1 )
-{
-  emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
-}
-
-static void emit_max( struct brw_compile *p,
-		      struct brw_reg dst,
-		      struct brw_reg arg0,
-		      struct brw_reg arg1 )
-{
-   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
-   brw_SEL(p, dst, arg1, arg0);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-}
-
-static void emit_min( struct brw_compile *p,
-		      struct brw_reg dst,
-		      struct brw_reg arg0,
-		      struct brw_reg arg1 )
-{
-   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
-   brw_SEL(p, dst, arg0, arg1);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-}
-
-
-static void emit_math1( struct brw_vs_compile *c,
-			unsigned function,
-			struct brw_reg dst,
-			struct brw_reg arg0,
-			unsigned precision)
-{
-   /* There are various odd behaviours with SEND on the simulator.  In
-    * addition there are documented issues with the fact that the GEN4
-    * processor doesn't do dependency control properly on SEND
-    * results.  So, on balance, this kludge to get around failures
-    * with writemasked math results looks like it might be necessary
-    * whether that turns out to be a simulator bug or not:
-    */
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = dst;
-   boolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
-			 dst.file != BRW_GENERAL_REGISTER_FILE);
-
-   if (need_tmp)
-      tmp = get_tmp(c);
-
-   brw_math(p,
-	    tmp,
-	    function,
-	    BRW_MATH_SATURATE_NONE,
-	    2,
-	    arg0,
-	    BRW_MATH_DATA_SCALAR,
-	    precision);
-
-   if (need_tmp) {
-      brw_MOV(p, dst, tmp);
-      release_tmp(c, tmp);
-   }
-}
-
-static void emit_math2( struct brw_vs_compile *c,
-			unsigned function,
-			struct brw_reg dst,
-			struct brw_reg arg0,
-			struct brw_reg arg1,
-			unsigned precision)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = dst;
-   boolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
-			 dst.file != BRW_GENERAL_REGISTER_FILE);
-
-   if (need_tmp)
-      tmp = get_tmp(c);
-
-   brw_MOV(p, brw_message_reg(3), arg1);
-
-   brw_math(p,
-	    tmp,
-	    function,
-	    BRW_MATH_SATURATE_NONE,
-	    2,
- 	    arg0,
-	    BRW_MATH_DATA_SCALAR,
-	    precision);
-
-   if (need_tmp) {
-      brw_MOV(p, dst, tmp);
-      release_tmp(c, tmp);
-   }
-}
-
-
-
-static void emit_exp_noalias( struct brw_vs_compile *c,
-			      struct brw_reg dst,
-			      struct brw_reg arg0 )
-{
-   struct brw_compile *p = &c->func;
-
-
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_X) {
-      struct brw_reg tmp = get_tmp(c);
-      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
-
-      /* tmp_d = floor(arg0.x) */
-      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
-
-      /* result[0] = 2.0 ^ tmp */
-
-      /* Adjust exponent for floating point:
-       * exp += 127
-       */
-      brw_ADD(p, brw_writemask(tmp_d, TGSI_WRITEMASK_X), tmp_d, brw_imm_d(127));
-
-      /* Install exponent and sign.
-       * Excess drops off the edge:
-       */
-      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), TGSI_WRITEMASK_X),
-	      tmp_d, brw_imm_d(23));
-
-      release_tmp(c, tmp);
-   }
-
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Y) {
-      /* result[1] = arg0.x - floor(arg0.x) */
-      brw_FRC(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0, 0));
-   }
-
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) {
-      /* As with the LOG instruction, we might be better off just
-       * doing a taylor expansion here, seeing as we have to do all
-       * the prep work.
-       *
-       * If mathbox partial precision is too low, consider also:
-       * result[3] = result[0] * EXP(result[1])
-       */
-      emit_math1(c,
-		 BRW_MATH_FUNCTION_EXP,
-		 brw_writemask(dst, TGSI_WRITEMASK_Z),
-		 brw_swizzle1(arg0, 0),
-		 BRW_MATH_PRECISION_PARTIAL);
-   }
-
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) {
-      /* result[3] = 1.0; */
-      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), brw_imm_f(1));
-   }
-}
-
-
-static void emit_log_noalias( struct brw_vs_compile *c,
-			      struct brw_reg dst,
-			      struct brw_reg arg0 )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = dst;
-   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
-   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
-   boolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
-			 dst.file != BRW_GENERAL_REGISTER_FILE);
-
-   if (need_tmp) {
-      tmp = get_tmp(c);
-      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
-   }
-
-   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
-    * according to spec:
-    *
-    * These almost look likey they could be joined up, but not really
-    * practical:
-    *
-    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
-    * result[1].i = (x.i & ((1<<23)-1)        + (127<<23)
-    */
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_XZ) {
-      brw_AND(p,
-	      brw_writemask(tmp_ud, TGSI_WRITEMASK_X),
-	      brw_swizzle1(arg0_ud, 0),
-	      brw_imm_ud((1U<<31)-1));
-
-      brw_SHR(p,
-	      brw_writemask(tmp_ud, TGSI_WRITEMASK_X),
-	      tmp_ud,
-	      brw_imm_ud(23));
-
-      brw_ADD(p,
-	      brw_writemask(tmp, TGSI_WRITEMASK_X),
-	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
-	      brw_imm_d(-127));
-   }
-
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_YZ) {
-      brw_AND(p,
-	      brw_writemask(tmp_ud, TGSI_WRITEMASK_Y),
-	      brw_swizzle1(arg0_ud, 0),
-	      brw_imm_ud((1<<23)-1));
-
-      brw_OR(p,
-	     brw_writemask(tmp_ud, TGSI_WRITEMASK_Y),
-	     tmp_ud,
-	     brw_imm_ud(127<<23));
-   }
-
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) {
-      /* result[2] = result[0] + LOG2(result[1]); */
-
-      /* Why bother?  The above is just a hint how to do this with a
-       * taylor series.  Maybe we *should* use a taylor series as by
-       * the time all the above has been done it's almost certainly
-       * quicker than calling the mathbox, even with low precision.
-       *
-       * Options are:
-       *    - result[0] + mathbox.LOG2(result[1])
-       *    - mathbox.LOG2(arg0.x)
-       *    - result[0] + inline_taylor_approx(result[1])
-       */
-      emit_math1(c,
-		 BRW_MATH_FUNCTION_LOG,
-		 brw_writemask(tmp, TGSI_WRITEMASK_Z),
-		 brw_swizzle1(tmp, 1),
-		 BRW_MATH_PRECISION_FULL);
-
-      brw_ADD(p,
-	      brw_writemask(tmp, TGSI_WRITEMASK_Z),
-	      brw_swizzle1(tmp, 2),
-	      brw_swizzle1(tmp, 0));
-   }
-
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) {
-      /* result[3] = 1.0; */
-      brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_W), brw_imm_f(1));
-   }
-
-   if (need_tmp) {
-      brw_MOV(p, dst, tmp);
-      release_tmp(c, tmp);
-   }
-}
-
-
-
-
-/* Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
- */
-static void emit_dst_noalias( struct brw_vs_compile *c,
-			      struct brw_reg dst,
-			      struct brw_reg arg0,
-			      struct brw_reg arg1)
-{
-   struct brw_compile *p = &c->func;
-
-   /* There must be a better way to do this:
-    */
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_X)
-      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_X), brw_imm_f(1.0));
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Y)
-      brw_MUL(p, brw_writemask(dst, TGSI_WRITEMASK_Y), arg0, arg1);
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z)
-      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Z), arg0);
-   if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W)
-      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), arg1);
-}
-
-static void emit_xpd( struct brw_compile *p,
-		      struct brw_reg dst,
-		      struct brw_reg t,
-		      struct brw_reg u)
-{
-   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
-   brw_MAC(p, dst,     negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
-}
-
-
-
-static void emit_lit_noalias( struct brw_vs_compile *c,
-			      struct brw_reg dst,
-			      struct brw_reg arg0 )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *if_insn;
-   struct brw_reg tmp = dst;
-   boolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
-
-   if (need_tmp)
-      tmp = get_tmp(c);
-
-   brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_YZ), brw_imm_f(0));
-   brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_XW), brw_imm_f(1));
-
-   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
-    * to get all channels active inside the IF.  In the clipping code
-    * we run with NoMask, so it's not an option and we can use
-    * BRW_EXECUTE_1 for all comparisions.
-    */
-   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
-   if_insn = brw_IF(p, BRW_EXECUTE_8);
-   {
-      brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0,0));
-
-      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
-      brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_Z),  brw_swizzle1(arg0,1));
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-      emit_math2(c,
-		 BRW_MATH_FUNCTION_POW,
-		 brw_writemask(dst, TGSI_WRITEMASK_Z),
-		 brw_swizzle1(tmp, 2),
-		 brw_swizzle1(arg0, 3),
-		 BRW_MATH_PRECISION_PARTIAL);
-   }
-
-   brw_ENDIF(p, if_insn);
-}
-
-
-
-
-
-/* TODO: relative addressing!
- */
-static struct brw_reg get_reg( struct brw_vs_compile *c,
-			       unsigned file,
-			       unsigned index )
-{
-   switch (file) {
-   case TGSI_FILE_TEMPORARY:
-   case TGSI_FILE_INPUT:
-   case TGSI_FILE_OUTPUT:
-      assert(c->regs[file][index].nr != 0);
-      return c->regs[file][index];
-   case TGSI_FILE_CONSTANT:
-      assert(c->regs[TGSI_FILE_CONSTANT][index + c->prog_data.num_imm].nr != 0);
-      return c->regs[TGSI_FILE_CONSTANT][index + c->prog_data.num_imm];
-   case TGSI_FILE_IMMEDIATE:
-      assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
-      return c->regs[TGSI_FILE_CONSTANT][index];
-   case TGSI_FILE_ADDRESS:
-      assert(index == 0);
-      return c->regs[file][index];
-
-   case TGSI_FILE_NULL:			/* undef values */
-      return brw_null_reg();
-
-   default:
-      assert(0);
-      return brw_null_reg();
-   }
-}
-
-
-
-static struct brw_reg deref( struct brw_vs_compile *c,
-			     struct brw_reg arg,
-			     int offset)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = vec4(get_tmp(c));
-   struct brw_reg vp_address = retype(vec1(get_reg(c, TGSI_FILE_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
-   unsigned byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
-   struct brw_reg indirect = brw_vec4_indirect(0,0);
-
-   {
-      brw_push_insn_state(p);
-      brw_set_access_mode(p, BRW_ALIGN_1);
-
-      /* This is pretty clunky - load the address register twice and
-       * fetch each 4-dword value in turn.  There must be a way to do
-       * this in a single pass, but I couldn't get it to work.
-       */
-      brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
-      brw_MOV(p, tmp, indirect);
-
-      brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
-      brw_MOV(p, suboffset(tmp, 4), indirect);
-
-      brw_pop_insn_state(p);
-   }
-
-   return vec8(tmp);
-}
-
-
-static void emit_arl( struct brw_vs_compile *c,
-		      struct brw_reg dst,
-		      struct brw_reg arg0 )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg tmp = dst;
-   boolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
-
-   if (need_tmp)
-      tmp = get_tmp(c);
-
-   brw_RNDD(p, tmp, arg0);
-   brw_MUL(p, dst, tmp, brw_imm_d(16));
-
-   if (need_tmp)
-      release_tmp(c, tmp);
-}
-
-
-/* Will return mangled results for SWZ op.  The emit_swz() function
- * ignores this result and recalculates taking extended swizzles into
- * account.
- */
-static struct brw_reg get_arg( struct brw_vs_compile *c,
-			       struct tgsi_src_register *src )
-{
-   struct brw_reg reg;
-
-   if (src->File == TGSI_FILE_NULL)
-      return brw_null_reg();
-
-#if 0
-   if (src->RelAddr)
-      reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
-   else
-#endif
-      reg = get_reg(c, src->File, src->Index);
-
-   /* Convert 3-bit swizzle to 2-bit.
-    */
-   reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->SwizzleX,
-				       src->SwizzleY,
-				       src->SwizzleZ,
-				       src->SwizzleW);
-
-   /* Note this is ok for non-swizzle instructions:
-    */
-   reg.negate = src->Negate ? 1 : 0;
-
-   return reg;
-}
-
-
-static struct brw_reg get_dst( struct brw_vs_compile *c,
-			       const struct tgsi_dst_register *dst )
-{
-   struct brw_reg reg = get_reg(c, dst->File, dst->Index);
-
-   reg.dw1.bits.writemask = dst->WriteMask;
-
-   return reg;
-}
-
-
-
-
-static void emit_swz( struct brw_vs_compile *c,
-		      struct brw_reg dst,
-		      struct tgsi_src_register src )
-{
-   struct brw_compile *p = &c->func;
-   unsigned zeros_mask = 0;
-   unsigned ones_mask = 0;
-   unsigned src_mask = 0;
-   ubyte src_swz[4];
-   boolean need_tmp = (src.Negate &&
-			 dst.file != BRW_GENERAL_REGISTER_FILE);
-   struct brw_reg tmp = dst;
-   unsigned i;
-
-   if (need_tmp)
-      tmp = get_tmp(c);
-
-   for (i = 0; i < 4; i++) {
-      if (dst.dw1.bits.writemask & (1<<i)) {
-	 ubyte s = 0;
-         switch(i) {
-         case 0:
-            s = src.SwizzleX;
-            break;
-            s = src.SwizzleY;
-         case 1:
-            break;
-            s = src.SwizzleZ;
-         case 2:
-            break;
-            s = src.SwizzleW;
-         case 3:
-            break;
-         }
-	 switch (s) {
-	 case TGSI_SWIZZLE_X:
-	 case TGSI_SWIZZLE_Y:
-	 case TGSI_SWIZZLE_Z:
-	 case TGSI_SWIZZLE_W:
-	    src_mask |= 1<<i;
-	    src_swz[i] = s;
-	    break;
-	 case TGSI_EXTSWIZZLE_ZERO:
-	    zeros_mask |= 1<<i;
-	    break;
-	 case TGSI_EXTSWIZZLE_ONE:
-	    ones_mask |= 1<<i;
-	    break;
-	 }
-      }
-   }
-
-   /* Do src first, in case dst aliases src:
-    */
-   if (src_mask) {
-      struct brw_reg arg0;
-
-#if 0
-      if (src.RelAddr)
-	 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
-      else
-#endif
-	 arg0 = get_reg(c, src.File, src.Index);
-
-      arg0 = brw_swizzle(arg0,
-			 src_swz[0], src_swz[1],
-			 src_swz[2], src_swz[3]);
-
-      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
-   }
-
-   if (zeros_mask)
-      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
-
-   if (ones_mask)
-      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
-
-   if (src.Negate)
-      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
-
-   if (need_tmp) {
-      brw_MOV(p, dst, tmp);
-      release_tmp(c, tmp);
-   }
-}
-
-
-
-/* Post-vertex-program processing.  Send the results to the URB.
- */
-static void emit_vertex_write( struct brw_vs_compile *c, struct brw_prog_info *info)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg m0 = brw_message_reg(0);
-   struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][info->pos_idx];
-   struct brw_reg ndc;
-
-   if (c->key.copy_edgeflag) {
-      brw_MOV(p,
-	      get_reg(c, TGSI_FILE_OUTPUT, info->result_edge_idx),
-	      get_reg(c, TGSI_FILE_INPUT, info->edge_flag_idx));
-   }
-
-
-   /* Build ndc coords?   TODO: Shortcircuit when w is known to be one.
-    */
-   if (!c->key.know_w_is_one) {
-      ndc = get_tmp(c);
-      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
-      brw_MUL(p, brw_writemask(ndc, TGSI_WRITEMASK_XYZ), pos, ndc);
-   }
-   else {
-      ndc = pos;
-   }
-
-   /* This includes the workaround for -ve rhw, so is no longer an
-    * optional step:
-    */
-   if (info->writes_psize ||
-       c->key.nr_userclip ||
-       !c->key.know_w_is_one)
-   {
-      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
-      unsigned i;
-
-      brw_MOV(p, header1, brw_imm_ud(0));
-
-      brw_set_access_mode(p, BRW_ALIGN_16);
-
-      if (info->writes_psize) {
-	 struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][info->psize_idx];
-	 brw_MUL(p, brw_writemask(header1, TGSI_WRITEMASK_W),
-                 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
-	 brw_AND(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1,
-                 brw_imm_ud(0x7ff<<8));
-      }
-
-
-      for (i = 0; i < c->key.nr_userclip; i++) {
-	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
-	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
-	 brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<i));
-	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-      }
-
-
-      /* i965 clipping workaround:
-       * 1) Test for -ve rhw
-       * 2) If set,
-       *      set ndc = (0,0,0,0)
-       *      set ucp[6] = 1
-       *
-       * Later, clipping will detect ucp[6] and ensure the primitive is
-       * clipped against all fixed planes.
-       */
-      if (!c->key.know_w_is_one) {
-	 brw_CMP(p,
-		 vec8(brw_null_reg()),
-		 BRW_CONDITIONAL_L,
-		 brw_swizzle1(ndc, 3),
-		 brw_imm_f(0));
-
-	 brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<6));
-	 brw_MOV(p, ndc, brw_imm_f(0));
-	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-      }
-
-      brw_set_access_mode(p, BRW_ALIGN_1);	/* why? */
-      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
-      brw_set_access_mode(p, BRW_ALIGN_16);
-
-      release_tmp(c, header1);
-   }
-   else {
-      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
-   }
-
-
-   /* Emit the (interleaved) headers for the two vertices - an 8-reg
-    * of zeros followed by two sets of NDC coordinates:
-    */
-   brw_set_access_mode(p, BRW_ALIGN_1);
-   brw_MOV(p, offset(m0, 2), ndc);
-   brw_MOV(p, offset(m0, 3), pos);
-
-
-   brw_urb_WRITE(p,
-		 brw_null_reg(), /* dest */
-		 0,		/* starting mrf reg nr */
-		 c->r0,		/* src */
-		 0,		/* allocate */
-		 1,		/* used */
-		 c->nr_outputs + 3, /* msg len */
-		 0,		/* response len */
-		 1, 		/* eot */
-		 1, 		/* writes complete */
-		 0, 		/* urb destination offset */
-		 BRW_URB_SWIZZLE_INTERLEAVE);
-
-}
-
-static void
-post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
-{
-   struct tgsi_parse_context parse;
-   const struct tgsi_token *tokens = c->vp->program.tokens;
-   tgsi_parse_init(&parse, tokens);
-   while (!tgsi_parse_end_of_tokens(&parse)) {
-      tgsi_parse_token(&parse);
-      if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
-#if 0
-         struct brw_instruction *brw_inst1, *brw_inst2;
-         const struct tgsi_full_instruction *inst1, *inst2;
-         int offset;
-         inst1 = &parse.FullToken.FullInstruction;
-         brw_inst1 = inst1->Data;
-         switch (inst1->Opcode) {
-	 case TGSI_OPCODE_CAL:
-	 case TGSI_OPCODE_BRA:
-	    target_insn = inst1->BranchTarget;
-	    inst2 = &c->vp->program.Base.Instructions[target_insn];
-	    brw_inst2 = inst2->Data;
-	    offset = brw_inst2 - brw_inst1;
-	    brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-	    break;
-	 case TGSI_OPCODE_END:
-	    offset = end_inst - brw_inst1;
-	    brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-	    break;
-	 default:
-	    break;
-         }
-#endif
-      }
-   }
-   tgsi_parse_free(&parse);
-}
-
-static void process_declaration(const struct tgsi_full_declaration *decl,
-                                struct brw_prog_info *info)
-{
-   int first = decl->DeclarationRange.First;
-   int last = decl->DeclarationRange.Last;
-   
-   switch(decl->Declaration.File) {
-   case TGSI_FILE_CONSTANT: 
-      info->num_consts += last - first + 1;
-      break;
-   case TGSI_FILE_INPUT: {
-   }
-      break;
-   case TGSI_FILE_OUTPUT: {
-      assert(last == first);	/* for now */
-      if (decl->Declaration.Semantic) {
-         switch (decl->Semantic.SemanticName) {
-         case TGSI_SEMANTIC_POSITION: {
-            info->pos_idx = first;
-         }
-            break;
-         case TGSI_SEMANTIC_COLOR:
-            break;
-         case TGSI_SEMANTIC_BCOLOR:
-            break;
-         case TGSI_SEMANTIC_FOG:
-            break;
-         case TGSI_SEMANTIC_PSIZE: {
-            info->writes_psize = TRUE;
-            info->psize_idx = first;
-         }
-            break;
-         case TGSI_SEMANTIC_GENERIC:
-            break;
-         }
-      }
-   }
-      break;
-   case TGSI_FILE_TEMPORARY: {
-      info->num_temps += (last - first) + 1;
-   }
-      break;
-   case TGSI_FILE_SAMPLER: {
-   }
-      break;
-   case TGSI_FILE_ADDRESS: {
-      info->num_addrs += (last - first) + 1;
-   }
-      break;
-   case TGSI_FILE_IMMEDIATE: {
-   }
-      break;
-   case TGSI_FILE_NULL: {
-   }
-      break;
-   }
-}
-
-static void process_instruction(struct brw_vs_compile *c,
-                                struct tgsi_full_instruction *inst,
-                                struct brw_prog_info *info)
-{
-   struct brw_reg args[3], dst;
-   struct brw_compile *p = &c->func;
-   /*struct brw_indirect stack_index = brw_indirect(0, 0);*/
-   unsigned i;
-   unsigned index;
-   unsigned file;
-   /*FIXME: might not be the only one*/
-   const struct tgsi_dst_register *dst_reg = &inst->FullDstRegisters[0].DstRegister;
-   /*
-   struct brw_instruction *if_inst[MAX_IFSN];
-   unsigned insn, if_insn = 0;
-   */
-
-   for (i = 0; i < 3; i++) {
-      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
-      index = src->SrcRegister.Index;
-      file = src->SrcRegister.File;
-      if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src)
-         args[i] = c->output_regs[index].reg;
-      else
-         args[i] = get_arg(c, &src->SrcRegister);
-   }
-
-   /* Get dest regs.  Note that it is possible for a reg to be both
-    * dst and arg, given the static allocation of registers.  So
-    * care needs to be taken emitting multi-operation instructions.
-    */
-   index = dst_reg->Index;
-   file = dst_reg->File;
-   if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src)
-      dst = c->output_regs[index].reg;
-   else
-      dst = get_dst(c, dst_reg);
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ABS:
-      brw_MOV(p, dst, brw_abs(args[0]));
-      break;
-   case TGSI_OPCODE_ADD:
-      brw_ADD(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_DP3:
-      brw_DP3(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_DP4:
-      brw_DP4(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_DPH:
-      brw_DPH(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_DST:
-      unalias2(c, dst, args[0], args[1], emit_dst_noalias);
-      break;
-   case TGSI_OPCODE_EXP:
-      unalias1(c, dst, args[0], emit_exp_noalias);
-      break;
-   case TGSI_OPCODE_EX2:
-      emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
-      break;
-   case TGSI_OPCODE_ARL:
-      emit_arl(c, dst, args[0]);
-      break;
-   case TGSI_OPCODE_FLR:
-      brw_RNDD(p, dst, args[0]);
-      break;
-   case TGSI_OPCODE_FRC:
-      brw_FRC(p, dst, args[0]);
-      break;
-   case TGSI_OPCODE_LOG:
-      unalias1(c, dst, args[0], emit_log_noalias);
-      break;
-   case TGSI_OPCODE_LG2:
-      emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
-      break;
-   case TGSI_OPCODE_LIT:
-      unalias1(c, dst, args[0], emit_lit_noalias);
-      break;
-   case TGSI_OPCODE_MAD:
-      brw_MOV(p, brw_acc_reg(), args[2]);
-      brw_MAC(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_MAX:
-      emit_max(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_MIN:
-      emit_min(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
-#if 0
-      /* The args[0] value can't be used here as it won't have
-       * correctly encoded the full swizzle:
-       */
-      emit_swz(c, dst, inst->SrcReg[0] );
-#endif
-      brw_MOV(p, dst, args[0]);
-      break;
-   case TGSI_OPCODE_MUL:
-      brw_MUL(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_POW:
-      emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
-      break;
-   case TGSI_OPCODE_RCP:
-      emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
-      break;
-   case TGSI_OPCODE_RSQ:
-      emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
-      break;
-
-   case TGSI_OPCODE_SEQ:
-      emit_seq(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_SNE:
-      emit_sne(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_SGE:
-      emit_sge(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_SGT:
-      emit_sgt(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_SLT:
-      emit_slt(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_SLE:
-      emit_sle(p, dst, args[0], args[1]);
-      break;
-   case TGSI_OPCODE_SUB:
-      brw_ADD(p, dst, args[0], negate(args[1]));
-      break;
-   case TGSI_OPCODE_XPD:
-      emit_xpd(p, dst, args[0], args[1]);
-      break;
-#if 0
-   case TGSI_OPCODE_IF:
-      assert(if_insn < MAX_IFSN);
-      if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
-      break;
-   case TGSI_OPCODE_ELSE:
-      if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
-      break;
-   case TGSI_OPCODE_ENDIF:
-      assert(if_insn > 0);
-      brw_ENDIF(p, if_inst[--if_insn]);
-      break;
-   case TGSI_OPCODE_BRA:
-      brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-      brw_set_predicate_control_flag_value(p, 0xff);
-      break;
-   case TGSI_OPCODE_CAL:
-      brw_set_access_mode(p, BRW_ALIGN_1);
-      brw_ADD(p, deref_1uw(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
-      brw_set_access_mode(p, BRW_ALIGN_16);
-      brw_ADD(p, get_addr_reg(stack_index),
-              get_addr_reg(stack_index), brw_imm_d(4));
-      inst->Data = &p->store[p->nr_insn];
-      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-      break;
-#endif
-   case TGSI_OPCODE_RET:
-#if 0
-      brw_ADD(p, get_addr_reg(stack_index),
-              get_addr_reg(stack_index), brw_imm_d(-4));
-      brw_set_access_mode(p, BRW_ALIGN_1);
-      brw_MOV(p, brw_ip_reg(), deref_1uw(stack_index, 0));
-      brw_set_access_mode(p, BRW_ALIGN_16);
-#else
-      /*brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));*/
-#endif
-      break;
-   case TGSI_OPCODE_END:
-      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-      break;
-   case TGSI_OPCODE_BGNSUB:
-   case TGSI_OPCODE_ENDSUB:
-      break;
-   default:
-      debug_printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode);
-      break;
-   }
-
-   if (dst_reg->File == TGSI_FILE_OUTPUT
-       && dst_reg->Index != info->pos_idx
-       && c->output_regs[dst_reg->Index].used_in_src)
-      brw_MOV(p, get_dst(c, dst_reg), dst);
-
-   release_tmps(c);
-}
-
-/* Emit the fragment program instructions here.
- */
-void brw_vs_emit(struct brw_vs_compile *c)
-{
-#define MAX_IFSN 32
-   struct brw_compile *p = &c->func;
-   struct brw_instruction *end_inst;
-   struct tgsi_parse_context parse;
-   struct brw_indirect stack_index = brw_indirect(0, 0);
-   const struct tgsi_token *tokens = c->vp->program.tokens;
-   struct brw_prog_info prog_info;
-   unsigned allocated_registers = 0;
-   memset(&prog_info, 0, sizeof(struct brw_prog_info));
-
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_set_access_mode(p, BRW_ALIGN_16);
-
-   tgsi_parse_init(&parse, tokens);
-   /* Message registers can't be read, so copy the output into GRF register
-      if they are used in source registers */
-   while (!tgsi_parse_end_of_tokens(&parse)) {
-      tgsi_parse_token(&parse);
-      unsigned i;
-      switch (parse.FullToken.Token.Type) {
-      case TGSI_TOKEN_TYPE_INSTRUCTION: {
-         const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
-         for (i = 0; i < 3; ++i) {
-            const struct tgsi_src_register *src = &inst->FullSrcRegisters[i].SrcRegister;
-            unsigned index = src->Index;
-            unsigned file = src->File;
-            if (file == TGSI_FILE_OUTPUT)
-               c->output_regs[index].used_in_src = TRUE;
-         }
-      }
-         break;
-      default:
-         /* nothing */
-         break;
-      }
-   }
-   tgsi_parse_free(&parse);
-
-   tgsi_parse_init(&parse, tokens);
-
-   while (!tgsi_parse_end_of_tokens(&parse)) {
-      tgsi_parse_token(&parse);
-
-      switch (parse.FullToken.Token.Type) {
-      case TGSI_TOKEN_TYPE_DECLARATION: {
-         struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
-         process_declaration(decl, &prog_info);
-      }
-         break;
-      case TGSI_TOKEN_TYPE_IMMEDIATE: {
-         struct tgsi_full_immediate *imm = &parse.FullToken.FullImmediate;
-         assert(imm->Immediate.NrTokens == 4 + 1);
-         c->prog_data.imm_buf[c->prog_data.num_imm][0] = imm->u[0].Float;
-         c->prog_data.imm_buf[c->prog_data.num_imm][1] = imm->u[1].Float;
-         c->prog_data.imm_buf[c->prog_data.num_imm][2] = imm->u[2].Float;
-         c->prog_data.imm_buf[c->prog_data.num_imm][3] = imm->u[3].Float;
-         c->prog_data.num_imm++;
-      }
-         break;
-      case TGSI_TOKEN_TYPE_INSTRUCTION: {
-         struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
-         if (!allocated_registers) {
-            /* first instruction (declerations finished).
-             * now that we know what vars are being used allocate
-             * registers for them.*/
-            c->prog_data.num_consts = prog_info.num_consts;
-            c->prog_data.max_const = prog_info.num_consts + c->prog_data.num_imm;
-            brw_vs_alloc_regs(c, &prog_info);
-
-	    brw_set_access_mode(p, BRW_ALIGN_1);
-            brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
-	    brw_set_access_mode(p, BRW_ALIGN_16);
-            allocated_registers = 1;
-         }
-         process_instruction(c, inst, &prog_info);
-      }
-         break;
-      }
-   }
-
-   end_inst = &p->store[p->nr_insn];
-   emit_vertex_write(c, &prog_info);
-   post_vs_emit(c, end_inst);
-   tgsi_parse_free(&parse);
-
-}
diff --git a/src/gallium/drivers/i965simple/brw_vs_state.c b/src/gallium/drivers/i965simple/brw_vs_state.c
deleted file mode 100644
index 1eaff878928..00000000000
--- a/src/gallium/drivers/i965simple/brw_vs_state.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-static void upload_vs_unit( struct brw_context *brw )
-{
-   struct brw_vs_unit_state vs;
-
-   memset(&vs, 0, sizeof(vs));
-
-   /* CACHE_NEW_VS_PROG */
-   vs.thread0.kernel_start_pointer = brw->vs.prog_gs_offset >> 6;
-   vs.thread0.grf_reg_count = align(brw->vs.prog_data->total_grf, 16) / 16 - 1;
-   vs.thread3.urb_entry_read_length = brw->vs.prog_data->urb_read_length;
-   vs.thread3.const_urb_entry_read_length = brw->vs.prog_data->curb_read_length;
-   vs.thread3.dispatch_grf_start_reg = 1;
-
-
-   /* BRW_NEW_URB_FENCE  */
-   vs.thread4.nr_urb_entries = brw->urb.nr_vs_entries;
-   vs.thread4.urb_entry_allocation_size = brw->urb.vsize - 1;
-   vs.thread4.max_threads = MIN2(
-      MAX2(0, (brw->urb.nr_vs_entries - 6) / 2 - 1),
-      15);
-
-
-
-   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
-      vs.thread4.max_threads = 0;
-
-   /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
-   if (0 /*brw->attribs.Clip->ClipPlanesEnabled*/) {
-      /* Note that we read in the userclip planes as well, hence
-       * clip_start:
-       */
-      vs.thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2;
-   }
-   else {
-      vs.thread3.const_urb_entry_read_offset = brw->curbe.vs_start * 2;
-   }
-
-   vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   vs.thread3.urb_entry_read_offset = 0;
-
-   /* No samplers for ARB_vp programs:
-    */
-   vs.vs5.sampler_count = 0;
-
-   if (BRW_DEBUG & DEBUG_STATS)
-      vs.thread4.stats_enable = 1;
-
-   /* Vertex program always enabled:
-    */
-   vs.vs6.vs_enable = 1;
-
-   brw->vs.state_gs_offset = brw_cache_data( &brw->cache[BRW_VS_UNIT], &vs );
-}
-
-
-const struct brw_tracked_state brw_vs_unit = {
-   .dirty = {
-      .brw   = (BRW_NEW_CLIP |
-		BRW_NEW_CURBE_OFFSETS |
-		BRW_NEW_URB_FENCE),
-      .cache = CACHE_NEW_VS_PROG
-   },
-   .update = upload_vs_unit
-};
diff --git a/src/gallium/drivers/i965simple/brw_winsys.h b/src/gallium/drivers/i965simple/brw_winsys.h
deleted file mode 100644
index ec1e400418f..00000000000
--- a/src/gallium/drivers/i965simple/brw_winsys.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * \file
- * This is the interface that i965simple requires any window system
- * hosting it to implement.  This is the only include file in i965simple
- * which is public.
- *
- */
-
-#ifndef BRW_WINSYS_H
-#define BRW_WINSYS_H
-
-
-#include "pipe/p_defines.h"
-
-
-/* Pipe drivers are (meant to be!) independent of both GL and the
- * window system.  The window system provides a buffer manager and a
- * set of additional hooks for things like command buffer submission,
- * etc.
- *
- * There clearly has to be some agreement between the window system
- * driver and the hardware driver about the format of command buffers,
- * etc.
- */
-
-struct pipe_buffer;
-struct pipe_fence_handle;
-struct pipe_winsys;
-struct pipe_screen;
-
-
-/* The pipe driver currently understands the following chipsets:
- */
-#define PCI_CHIP_I965_G			0x29A2
-#define PCI_CHIP_I965_Q			0x2992
-#define PCI_CHIP_I965_G_1		0x2982
-#define PCI_CHIP_I965_GM                0x2A02
-#define PCI_CHIP_I965_GME               0x2A12
-
-
-/* These are the names of all the state caches managed by the driver.
- * 
- * When data is uploaded to a buffer with buffer_subdata, we use the
- * special version of that function below so that information about
- * what type of data this is can be passed to the winsys backend.
- * That in turn allows the correct flags to be set in the aub file
- * dump to allow human-readable file dumps later on.
- */
-
-enum brw_cache_id {
-   BRW_CC_VP,
-   BRW_CC_UNIT,
-   BRW_WM_PROG,
-   BRW_SAMPLER_DEFAULT_COLOR,
-   BRW_SAMPLER,
-   BRW_WM_UNIT,
-   BRW_SF_PROG,
-   BRW_SF_VP,
-   BRW_SF_UNIT,
-   BRW_VS_UNIT,
-   BRW_VS_PROG,
-   BRW_GS_UNIT,
-   BRW_GS_PROG,
-   BRW_CLIP_VP,
-   BRW_CLIP_UNIT,
-   BRW_CLIP_PROG,
-   BRW_SS_SURFACE,
-   BRW_SS_SURF_BIND,
-
-   BRW_MAX_CACHE
-};
-
-#define BRW_CONSTANT_BUFFER BRW_MAX_CACHE
-
-/**
- * Additional winsys interface for i965simple.
- *
- * It is an over-simple batchbuffer mechanism.  Will want to improve the
- * performance of this, perhaps based on the cmdstream stuff.  It
- * would be pretty impossible to implement swz on top of this
- * interface.
- *
- * Will also need additions/changes to implement static/dynamic
- * indirect state.
- */
-struct brw_winsys {
-
-   void (*destroy)(struct brw_winsys *);
-   
-   /**
-    * Reserve space on batch buffer.
-    *
-    * Returns a null pointer if there is insufficient space in the batch buffer
-    * to hold the requested number of dwords and relocations.
-    *
-    * The number of dwords should also include the number of relocations.
-    */
-   unsigned *(*batch_start)(struct brw_winsys *sws,
-                            unsigned dwords,
-                            unsigned relocs);
-
-   void (*batch_dword)(struct brw_winsys *sws,
-                       unsigned dword);
-
-   /**
-    * Emit a relocation to a buffer.
-    *
-    * Used not only when the buffer addresses are not pinned, but also to
-    * ensure refered buffers will not be destroyed until the current batch
-    * buffer execution is finished.
-    *
-    * The access flags is a combination of I915_BUFFER_ACCESS_WRITE and
-    * I915_BUFFER_ACCESS_READ macros.
-    */
-   void (*batch_reloc)(struct brw_winsys *sws,
-                       struct pipe_buffer *buf,
-                       unsigned access_flags,
-                       unsigned delta);
-
-
-   /* Not used yet, but really want this:
-    */
-   void (*batch_end)( struct brw_winsys *sws );
-
-   /**
-    * Flush the batch buffer.
-    *
-    * Fence argument must point to NULL or to a previous fence, and the caller
-    * must call fence_reference when done with the fence.
-    */
-   void (*batch_flush)(struct brw_winsys *sws,
-                       struct pipe_fence_handle **fence);
-
-
-   /* A version of buffer_subdata that includes information for the
-    * simulator:
-    */
-   void (*buffer_subdata_typed)(struct brw_winsys *sws, 
-				struct pipe_buffer *buf,
-				unsigned long offset, 
-				unsigned long size, 
-				const void *data,
-				unsigned data_type);
-   
-
-   /* A cheat so we don't have to think about relocations in a couple
-    * of places yet:
-    */
-   unsigned (*get_buffer_offset)( struct brw_winsys *sws,
-				  struct pipe_buffer *buf,
-				  unsigned flags );
-
-};
-
-#define BRW_BUFFER_ACCESS_WRITE   0x1
-#define BRW_BUFFER_ACCESS_READ    0x2
-
-#define BRW_BUFFER_USAGE_LIT_VERTEX  (PIPE_BUFFER_USAGE_CUSTOM << 0)
-
-
-struct pipe_context *brw_create(struct pipe_screen *,
-                                struct brw_winsys *,
-                                unsigned pci_id);
-
-static inline boolean brw_batchbuffer_data(struct brw_winsys *winsys,
-                                           const void *data,
-                                           unsigned bytes)
-{
-   static const unsigned incr = sizeof(unsigned);
-   uint i;
-   const unsigned *udata = (const unsigned*)(data);
-   unsigned size = bytes/incr;
-
-   winsys->batch_start(winsys, size, 0);
-   for (i = 0; i < size; ++i) {
-      winsys->batch_dword(winsys, udata[i]);
-   }
-   winsys->batch_end(winsys);
-
-   return (i == size);
-}
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_wm.c b/src/gallium/drivers/i965simple/brw_wm.c
deleted file mode 100644
index 10161f2d2f6..00000000000
--- a/src/gallium/drivers/i965simple/brw_wm.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_util.h"
-#include "brw_wm.h"
-#include "brw_eu.h"
-#include "brw_state.h"
-#include "util/u_memory.h"
-
-
-
-static void do_wm_prog( struct brw_context *brw,
-			struct brw_fragment_program *fp,
-			struct brw_wm_prog_key *key)
-{
-   struct brw_wm_compile *c = CALLOC_STRUCT(brw_wm_compile);
-   const unsigned *program;
-   unsigned program_size;
-
-   c->key = *key;
-   c->fp = fp;
-   
-   c->delta_xy[0] = brw_null_reg();
-   c->delta_xy[1] = brw_null_reg();
-   c->pixel_xy[0] = brw_null_reg();
-   c->pixel_xy[1] = brw_null_reg();
-   c->pixel_w = brw_null_reg();
-
-
-   debug_printf("XXXXXXXX FP\n");
-   
-   brw_wm_glsl_emit(c);
-
-   /* get the program
-    */
-   program = brw_get_program(&c->func, &program_size);
-
-   /*
-    */
-   brw->wm.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_WM_PROG],
-					      &c->key,
-					      sizeof(c->key),
-					      program,
-					      program_size,
-					      &c->prog_data,
-					      &brw->wm.prog_data );
-
-   FREE(c);
-}
-
-
-
-static void brw_wm_populate_key( struct brw_context *brw,
-				 struct brw_wm_prog_key *key )
-{
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   struct brw_fragment_program *fp =
-      (struct brw_fragment_program *)brw->attribs.FragmentProgram;
-   unsigned lookup = 0;
-   unsigned line_aa;
-   
-   memset(key, 0, sizeof(*key));
-
-   /* Build the index for table lookup
-    */
-   /* BRW_NEW_DEPTH_STENCIL */
-   if (fp->info.uses_kill ||
-       brw->attribs.DepthStencil->alpha.enabled)
-      lookup |= IZ_PS_KILL_ALPHATEST_BIT;
-
-   if (fp->info.writes_z)
-      lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
-
-   if (brw->attribs.DepthStencil->depth.enabled)
-      lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
-
-   if (brw->attribs.DepthStencil->depth.enabled &&
-       brw->attribs.DepthStencil->depth.writemask) /* ?? */
-      lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
-
-   if (brw->attribs.DepthStencil->stencil[0].enabled) {
-      lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
-
-      if (brw->attribs.DepthStencil->stencil[0].writemask ||
-	  brw->attribs.DepthStencil->stencil[1].writemask)
-	 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
-   }
-
-   /* XXX: when should this be disabled?
-    */
-   if (1)
-      lookup |= IZ_EARLY_DEPTH_TEST_BIT;
-
-
-   line_aa = AA_NEVER;
-
-   /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */
-   if (brw->attribs.Raster->line_smooth) {
-      if (brw->reduced_primitive == PIPE_PRIM_LINES) {
-	 line_aa = AA_ALWAYS;
-      }
-      else if (brw->reduced_primitive == PIPE_PRIM_TRIANGLES) {
-	 if (brw->attribs.Raster->fill_ccw == PIPE_POLYGON_MODE_LINE) {
-	    line_aa = AA_SOMETIMES;
-
-	    if (brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_LINE ||
-		(brw->attribs.Raster->cull_mode == PIPE_WINDING_CW))
-	       line_aa = AA_ALWAYS;
-	 }
-	 else if (brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_LINE) {
-	    line_aa = AA_SOMETIMES;
-
-	    if (brw->attribs.Raster->cull_mode == PIPE_WINDING_CCW)
-	       line_aa = AA_ALWAYS;
-	 }
-      }
-   }
-
-   brw_wm_lookup_iz(line_aa,
-		    lookup,
-		    key);
-
-
-#if 0
-   /* BRW_NEW_SAMPLER 
-    *
-    * Not doing any of this at the moment:
-    */
-   for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      const struct pipe_sampler_state *unit = brw->attribs.Samplers[i];
-
-      if (unit) {
-
-	 if (unit->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-	    key->shadowtex_mask |= 1<<i;
-	 }
-	 if (t->Image[0][t->BaseLevel]->InternalFormat == GL_YCBCR_MESA)
-	    key->yuvtex_mask |= 1<<i;
-      }
-   }
-#endif
-
-
-   /* Extra info:
-    */
-   key->program_string_id = fp->id;
-
-}
-
-
-static void brw_upload_wm_prog( struct brw_context *brw )
-{
-   struct brw_wm_prog_key key;
-   struct brw_fragment_program *fp = (struct brw_fragment_program *)
-      brw->attribs.FragmentProgram;
-
-   brw_wm_populate_key(brw, &key);
-
-   /* Make an early check for the key.
-    */
-   if (brw_search_cache(&brw->cache[BRW_WM_PROG],
-			&key, sizeof(key),
-			&brw->wm.prog_data,
-			&brw->wm.prog_gs_offset))
-      return;
-
-   do_wm_prog(brw, fp, &key);
-}
-
-
-const struct brw_tracked_state brw_wm_prog = {
-   .dirty = {
-      .brw   = (BRW_NEW_FS |
-		BRW_NEW_REDUCED_PRIMITIVE),
-      .cache = 0
-   },
-   .update = brw_upload_wm_prog
-};
-
diff --git a/src/gallium/drivers/i965simple/brw_wm.h b/src/gallium/drivers/i965simple/brw_wm.h
deleted file mode 100644
index b29c4393f01..00000000000
--- a/src/gallium/drivers/i965simple/brw_wm.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-              
-
-#ifndef BRW_WM_H
-#define BRW_WM_H
-
-
-#include "brw_context.h"
-#include "brw_eu.h"
-
-/* A big lookup table is used to figure out which and how many
- * additional regs will inserted before the main payload in the WM
- * program execution.  These mainly relate to depth and stencil
- * processing and the early-depth-test optimization.
- */
-#define IZ_PS_KILL_ALPHATEST_BIT    0x1
-#define IZ_PS_COMPUTES_DEPTH_BIT    0x2
-#define IZ_DEPTH_WRITE_ENABLE_BIT   0x4
-#define IZ_DEPTH_TEST_ENABLE_BIT    0x8
-#define IZ_STENCIL_WRITE_ENABLE_BIT 0x10
-#define IZ_STENCIL_TEST_ENABLE_BIT  0x20
-#define IZ_EARLY_DEPTH_TEST_BIT     0x40
-#define IZ_BIT_MAX                  0x80
-
-#define AA_NEVER     0
-#define AA_SOMETIMES 1
-#define AA_ALWAYS    2
-
-struct brw_wm_prog_key {
-   unsigned source_depth_reg:3;
-   unsigned aa_dest_stencil_reg:3;
-   unsigned dest_depth_reg:3;
-   unsigned nr_depth_regs:3;
-   unsigned shadowtex_mask:8;
-   unsigned computes_depth:1;	/* could be derived from program string */
-   unsigned source_depth_to_render_target:1;
-   unsigned runtime_check_aads_emit:1;
-
-   unsigned yuvtex_mask:8;
-
-   unsigned program_string_id;
-};
-
-
-
-
-
-#define PROGRAM_INTERNAL_PARAM
-#define MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS 1024 /* 72 for GL_ARB_f_p */
-#define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + PIPE_MAX_ATTRIBS + 3)
-#define BRW_WM_MAX_GRF   128		/* hardware limit */
-#define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4)
-#define BRW_WM_MAX_REF   (BRW_WM_MAX_INSN * 12)
-#define BRW_WM_MAX_PARAM 256
-#define BRW_WM_MAX_CONST 256
-#define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS
-
-#define PAYLOAD_DEPTH     (PIPE_MAX_ATTRIBS)
-
-#define MAX_IFSN 32
-#define MAX_LOOP_DEPTH 32
-
-struct brw_wm_compile {
-   struct brw_compile func;
-   struct brw_wm_prog_key key;
-   struct brw_wm_prog_data prog_data; /* result */
-
-   struct brw_fragment_program *fp;
-
-   unsigned grf_limit;
-   unsigned max_wm_grf;
-
-
-   struct brw_reg pixel_xy[2];
-   struct brw_reg delta_xy[2];
-   struct brw_reg pixel_w;
-
-
-   struct brw_reg wm_regs[8][32][4];
-
-   struct brw_reg payload_depth[4];
-   struct brw_reg payload_coef[16];
-
-   struct brw_reg emit_mask_reg;
-
-   struct brw_instruction *if_inst[MAX_IFSN];
-   int if_insn;
-
-   struct brw_instruction *loop_inst[MAX_LOOP_DEPTH];
-   int loop_insn;
-
-   struct brw_instruction *inst0;
-   struct brw_instruction *inst1;
-
-   struct brw_reg stack;
-   struct brw_indirect stack_index;
-
-   unsigned reg_index;
-
-   unsigned tmp_start;
-   unsigned tmp_index;
-};
-
-
-
-void brw_wm_lookup_iz( unsigned line_aa,
-		       unsigned lookup,
-		       struct brw_wm_prog_key *key );
-
-void brw_wm_glsl_emit(struct brw_wm_compile *c);
-void brw_wm_emit_decls(struct brw_wm_compile *c);
-
-#endif
diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c
deleted file mode 100644
index d50e66f613f..00000000000
--- a/src/gallium/drivers/i965simple/brw_wm_decl.c
+++ /dev/null
@@ -1,392 +0,0 @@
-
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_wm.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/tgsi_parse.h"
-
-static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
-{
-   c->tmp_index++;
-   c->reg_index = MAX2(c->reg_index, c->tmp_start + c->tmp_index);
-   return brw_vec8_grf(c->tmp_start + c->tmp_index, 0);
-}
-
-static void release_tmps(struct brw_wm_compile *c)
-{
-   c->tmp_index = 0;
-}
-
-
-
-static int is_null( struct brw_reg reg )
-{
-   return (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
-	   reg.nr == BRW_ARF_NULL);
-}
-
-static void emit_pixel_xy( struct brw_wm_compile *c )
-{
-   if (is_null(c->pixel_xy[0])) {
-
-      struct brw_compile *p = &c->func;
-      struct brw_reg r1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
-
-      c->pixel_xy[0] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
-      c->pixel_xy[1] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW));
-
-      /* Calculate pixel centers by adding 1 or 0 to each of the
-       * micro-tile coordinates passed in r1.
-       */
-      brw_ADD(p,
-	      c->pixel_xy[0],
-	      stride(suboffset(r1_uw, 4), 2, 4, 0),
-	      brw_imm_v(0x10101010));
-
-      brw_ADD(p,
-	      c->pixel_xy[1],
-	      stride(suboffset(r1_uw, 5), 2, 4, 0),
-	      brw_imm_v(0x11001100));
-   }
-}
-
-
-
-
-
-
-static void emit_delta_xy( struct brw_wm_compile *c )
-{
-   if (is_null(c->delta_xy[0])) {
-      struct brw_compile *p = &c->func;
-      struct brw_reg r1 = brw_vec1_grf(1, 0);
-
-      emit_pixel_xy(c);
-
-      c->delta_xy[0] = alloc_tmp(c);
-      c->delta_xy[1] = alloc_tmp(c);
-
-      /* Calc delta X,Y by subtracting origin in r1 from the pixel
-       * centers.
-       */
-      brw_ADD(p,
-	      c->delta_xy[0],
-	      retype(c->pixel_xy[0], BRW_REGISTER_TYPE_UW),
-	      negate(r1));
-
-      brw_ADD(p,
-	      c->delta_xy[1],
-	      retype(c->pixel_xy[1], BRW_REGISTER_TYPE_UW),
-	      negate(suboffset(r1,1)));
-   }
-}
-
-
-
-#if 0
-static void emit_pixel_w( struct brw_wm_compile *c )
-{
-   if (is_null(c->pixel_w)) {
-      struct brw_compile *p = &c->func;
-
-      struct brw_reg interp_wpos = c->coef_wpos;
-      
-      c->pixel_w = alloc_tmp(c);
-
-      emit_delta_xy(c);
-
-      /* Calc 1/w - just linterp wpos[3] optimized by putting the
-       * result straight into a message reg.
-       */
-      struct brw_reg interp3 = brw_vec1_grf(interp_wpos.nr+1, 4);
-      brw_LINE(p, brw_null_reg(), interp3, c->delta_xy[0]);
-      brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), c->delta_xy[1]);
-
-      /* Calc w */
-      brw_math_16( p, 
-		   c->pixel_w,
-		   BRW_MATH_FUNCTION_INV,
-		   BRW_MATH_SATURATE_NONE,
-		   2, 
-		   brw_null_reg(),
-		   BRW_MATH_PRECISION_FULL);
-   }
-}
-#endif
-
-
-static void emit_cinterp(struct brw_wm_compile *c,
-			 int idx,
-			 int mask )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg interp[4];
-   struct brw_reg coef = c->payload_coef[idx];
-   int i;
-
-   interp[0] = brw_vec1_grf(coef.nr, 0);
-   interp[1] = brw_vec1_grf(coef.nr, 4);
-   interp[2] = brw_vec1_grf(coef.nr+1, 0);
-   interp[3] = brw_vec1_grf(coef.nr+1, 4);
-
-   for(i = 0; i < 4; i++ ) {
-      if (mask & (1<<i)) {
-	 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
-	 brw_MOV(p, dst, suboffset(interp[i],3));
-      }
-   }
-}
-
-static void emit_linterp(struct brw_wm_compile *c,
-			 int idx,
-			 int mask )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg interp[4];
-   struct brw_reg coef = c->payload_coef[idx];
-   int i;
-
-   emit_delta_xy(c);
-
-   interp[0] = brw_vec1_grf(coef.nr, 0);
-   interp[1] = brw_vec1_grf(coef.nr, 4);
-   interp[2] = brw_vec1_grf(coef.nr+1, 0);
-   interp[3] = brw_vec1_grf(coef.nr+1, 4);
-
-   for(i = 0; i < 4; i++ ) {
-      if (mask & (1<<i)) {
-	 struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i];
-	 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
-	 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
-      }
-   }
-}
-
-#if 0
-static void emit_pinterp(struct brw_wm_compile *c,
-			 int idx,
-			 int mask )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg interp[4];
-   struct brw_reg coef = c->payload_coef[idx];
-   int i;
-
-   get_delta_xy(c);
-   get_pixel_w(c);
-
-   interp[0] = brw_vec1_grf(coef.nr, 0);
-   interp[1] = brw_vec1_grf(coef.nr, 4);
-   interp[2] = brw_vec1_grf(coef.nr+1, 0);
-   interp[3] = brw_vec1_grf(coef.nr+1, 4);
-
-   for(i = 0; i < 4; i++ ) {
-      if (mask & (1<<i)) {
-	 struct brw_reg dst = allocate_reg(c, TGSI_FILE_INPUT, idx, i);
-	 brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]);
-	 brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]);
-	 brw_MUL(p, dst, dst, c->pixel_w);
-      }
-   }
-}
-#endif
-
-
-
-#if 0
-static void emit_wpos( )
-{ 
-   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
-   struct tgsi_full_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
-   struct tgsi_full_src_register deltas = get_delta_xy(c);
-   struct tgsi_full_src_register arg2;
-   unsigned opcode;
-
-   opcode = WM_LINTERP;
-   arg2 = src_undef();
-
-   /* Have to treat wpos.xy specially:
-    */
-   emit_op(c,
-	   WM_WPOSXY,
-	   dst_mask(dst, WRITEMASK_XY),
-	   0, 0, 0,
-	   get_pixel_xy(c),
-	   src_undef(),
-	   src_undef());
-      
-   dst = dst_mask(dst, WRITEMASK_ZW);
-
-   /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
-    */
-   emit_op(c,
-	   WM_LINTERP,
-	   dst,
-	   0, 0, 0,
-	   interp,
-	   deltas,
-	   arg2);
-}
-#endif
-
-
-
-
-/* Perform register allocation:
- * 
- *  -- r0???
- *  -- passthrough depth regs (and stencil/aa??)
- *  -- curbe ??
- *  -- inputs (coefficients)
- *
- * Use a totally static register allocation.  This will perform poorly
- * but is an easy way to get started (again).
- */
-static void prealloc_reg(struct brw_wm_compile *c)
-{
-   int i, j;
-   int nr_curbe_regs = 0;
-
-   /* R0, then some depth related regs:
-    */
-   for (i = 0; i < c->key.nr_depth_regs; i++) {
-      c->payload_depth[i] =  brw_vec8_grf(i*2, 0);
-      c->reg_index += 2;
-   }
-
-
-   /* Then a copy of our part of the CURBE entry:
-    */
-   {
-      int nr_constants = c->fp->info.file_max[TGSI_FILE_CONSTANT] + 1;
-      int index = 0;
-
-      /* XXX number of constants, or highest numbered constant? */
-      assert(nr_constants == c->fp->info.file_count[TGSI_FILE_CONSTANT]);
-
-      c->prog_data.max_const = 4*nr_constants;
-      for (i = 0; i < nr_constants; i++) {
-	 for (j = 0; j < 4; j++, index++) 
-	    c->wm_regs[TGSI_FILE_CONSTANT][i][j] = brw_vec1_grf(c->reg_index + index/8,
-								index%8);
-      }
-
-      nr_curbe_regs = 2*((4*nr_constants+15)/16);
-      c->reg_index += nr_curbe_regs;
-   }
-
-   /* Adjust for parameter coefficients for position, which are
-    * currently always provided.
-    */
-//   c->position_coef[i] = brw_vec8_grf(c->reg_index, 0);
-   c->reg_index += 2;
-
-   /* Next we receive the plane coefficients for parameter
-    * interpolation:
-    */
-   assert(c->fp->info.file_max[TGSI_FILE_INPUT] == c->fp->info.num_inputs);
-   for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) {
-      c->payload_coef[i] = brw_vec8_grf(c->reg_index, 0);
-      c->reg_index += 2;
-   }
-
-   c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
-   c->prog_data.urb_read_length = (c->fp->info.num_inputs + 1) * 2;
-   c->prog_data.curb_read_length = nr_curbe_regs;
-
-   /* That's the end of the payload, now we can start allocating registers.
-    */
-   c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-   c->reg_index++;
-
-   c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-   c->reg_index += 2;
-
-   /* Now allocate room for the interpolated inputs and staging
-    * registers for the outputs:
-    */
-   /* XXX do we want to loop over the _number_ of inputs/outputs or loop
-    * to the highest input/output index that's used?
-    *  Probably the same, actually.
-    */
-   assert(c->fp->info.file_max[TGSI_FILE_INPUT] + 1 == c->fp->info.num_inputs);
-   assert(c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1 == c->fp->info.num_outputs);
-   for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) 
-      for (j = 0; j < 4; j++)
-	 c->wm_regs[TGSI_FILE_INPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
-
-   for (i = 0; i < c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1; i++) 
-      for (j = 0; j < 4; j++)
-	 c->wm_regs[TGSI_FILE_OUTPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 );
-
-   /* Beyond this we should only need registers for internal temporaries:
-    */
-   c->tmp_start = c->reg_index;
-}
-
-
-
-
-
-/* Need to interpolate fragment program inputs in as a preamble to the
- * shader.  A more sophisticated compiler would do this on demand, but
- * we'll do it up front:
- */
-void brw_wm_emit_decls(struct brw_wm_compile *c)
-{
-   struct tgsi_parse_context parse;
-   int done = 0;
-
-   prealloc_reg(c);
-
-   tgsi_parse_init( &parse, c->fp->program.tokens );
-
-   while( !done &&
-	  !tgsi_parse_end_of_tokens( &parse ) ) 
-   {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-      {
-	 const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
-	 unsigned first = decl->DeclarationRange.First;
-	 unsigned last = decl->DeclarationRange.Last;
-	 unsigned mask = decl->Declaration.UsageMask; /* ? */
-	 unsigned i;
-
-	 if (decl->Declaration.File != TGSI_FILE_INPUT)
-	    break;
-
-	 for( i = first; i <= last; i++ ) {
-	    switch (decl->Declaration.Interpolate) {
-	    case TGSI_INTERPOLATE_CONSTANT:
-	       emit_cinterp(c, i, mask);
-	       break;
-
-	    case TGSI_INTERPOLATE_LINEAR:
-	       emit_linterp(c, i, mask);
-	       break;
-
-	    case TGSI_INTERPOLATE_PERSPECTIVE:
-	       //emit_pinterp(c, i, mask);
-	       emit_linterp(c, i, mask);
-	       break;
-	    }
-	 }
-	 break;
-      }
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-      default:
-         done = 1;
-	 break;
-      }
-   }
-
-   tgsi_parse_free (&parse);
-   
-   release_tmps(c);
-}
diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c
deleted file mode 100644
index db759639328..00000000000
--- a/src/gallium/drivers/i965simple/brw_wm_glsl.c
+++ /dev/null
@@ -1,1076 +0,0 @@
-
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_wm.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/tgsi_parse.h"
-
-
-
-static int get_scalar_dst_index(struct tgsi_full_instruction *inst)
-{
-   struct tgsi_dst_register dst = inst->FullDstRegisters[0].DstRegister;
-   int i;
-   for (i = 0; i < 4; i++)
-      if (dst.WriteMask & (1<<i))
-	 break;
-   return i;
-}
-
-static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
-{
-   c->tmp_index++;
-   c->reg_index = MAX2(c->reg_index, c->tmp_index);
-   return brw_vec8_grf(c->tmp_start + c->tmp_index, 0);
-}
-
-static void release_tmps(struct brw_wm_compile *c)
-{
-   c->tmp_index = 0;
-}
-
-
-static struct brw_reg
-get_reg(struct brw_wm_compile *c, int file, int index, int component )
-{
-   switch (file) {
-   case TGSI_FILE_NULL:
-      return brw_null_reg();
-
-   case TGSI_FILE_SAMPLER:
-      /* Should never get here:
-       */
-      assert (0);	       
-      return brw_null_reg();
-
-   case TGSI_FILE_IMMEDIATE:
-      /* These need a different path:
-       */
-      assert(0);
-      return brw_null_reg();
-
-       
-   case TGSI_FILE_CONSTANT:
-   case TGSI_FILE_INPUT:
-   case TGSI_FILE_OUTPUT:
-   case TGSI_FILE_TEMPORARY:
-   case TGSI_FILE_ADDRESS:
-      return c->wm_regs[file][index][component];
-
-   default:
-      assert(0);
-      return brw_null_reg();
-   }
-}
-
-
-static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
-				  struct tgsi_full_instruction *inst, 
-				  int component)
-{
-   return get_reg(c, 
-		  inst->FullDstRegisters[0].DstRegister.File, 
-		  inst->FullDstRegisters[0].DstRegister.Index,
-		  component);
-}
-
-static int get_swz( struct tgsi_src_register src, int index )
-{
-   switch (index & 3) {
-   case 0: return src.SwizzleX;
-   case 1: return src.SwizzleY;
-   case 2: return src.SwizzleZ;
-   case 3: return src.SwizzleW;
-   default: return 0;
-   }
-}
-
-static int get_ext_swz( struct tgsi_src_register_ext_swz src, int index )
-{
-   switch (index & 3) {
-   case 0: return src.ExtSwizzleX;
-   case 1: return src.ExtSwizzleY;
-   case 2: return src.ExtSwizzleZ;
-   case 3: return src.ExtSwizzleW;
-   default: return 0;
-   }
-}
-
-static struct brw_reg get_src_reg(struct brw_wm_compile *c,
-				  struct tgsi_full_src_register *src, 
-				  int index)
-{
-   struct brw_reg reg;
-   int component = index;
-   int neg = 0;
-   int abs = 0;
-
-   if (src->SrcRegister.Negate)
-      neg = 1;
-
-   component = get_swz(src->SrcRegister, component);
-
-   /* Yes, there are multiple negates:
-    */
-   switch (component & 3) {
-   case 0: neg ^= src->SrcRegisterExtSwz.NegateX; break;
-   case 1: neg ^= src->SrcRegisterExtSwz.NegateY; break;
-   case 2: neg ^= src->SrcRegisterExtSwz.NegateZ; break;
-   case 3: neg ^= src->SrcRegisterExtSwz.NegateW; break;
-   }
-
-   /* And multiple swizzles, fun isn't it:
-    */
-   component = get_ext_swz(src->SrcRegisterExtSwz, component);
-
-   /* Not handling indirect lookups yet:
-    */
-   assert(src->SrcRegister.Indirect == 0);
-
-   /* Don't know what dimension means:
-    */
-   assert(src->SrcRegister.Dimension == 0);
-
-   /* Will never handle any of this stuff: 
-    */
-   assert(src->SrcRegisterExtMod.Complement == 0);
-   assert(src->SrcRegisterExtMod.Bias == 0);
-   assert(src->SrcRegisterExtMod.Scale2X == 0);
-
-   if (src->SrcRegisterExtMod.Absolute)
-      abs = 1;
-
-   /* Another negate!  This is a post-absolute negate, which we
-    * can't do.  Need to clean the crap out of tgsi somehow.
-    */
-   assert(src->SrcRegisterExtMod.Negate == 0);
-
-   switch( component ) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
-      reg = get_reg(c, 
-		    src->SrcRegister.File, 
-		    src->SrcRegister.Index, 
-		    component );
-
-      if (neg) 
-	 reg = negate(reg);
-   
-      if (abs)
-	 reg = brw_abs(reg);
-
-      break;
-
-      /* XXX: this won't really work in the general case, but we know
-       * that the extended swizzle is only allowed in the SWZ
-       * instruction (right??), in which case using an immediate
-       * directly will work.
-       */
-   case TGSI_EXTSWIZZLE_ZERO:
-      reg = brw_imm_f(0);
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      if (neg && !abs)
-	 reg = brw_imm_f(-1.0);
-      else
-	 reg = brw_imm_f(1.0);
-      break;
-
-   default:
-      assert(0);
-      break;
-   }
-
-    
-   return reg;
-}
-
-static void emit_abs( struct brw_wm_compile *c,
-		      struct tgsi_full_instruction *inst)
-{
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-
-   int i;
-   struct brw_compile *p = &c->func;
-   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 struct brw_reg src, dst;
-	 dst = get_dst_reg(c, inst, i);
-	 src = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-	 brw_MOV(p, dst, brw_abs(src)); /* NOTE */
-      }
-   }
-   brw_set_saturate(p, 0);
-}
-
-
-static void emit_xpd(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   int i;
-   struct brw_compile *p = &c->func;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   for (i = 0; i < 4; i++) {
-      unsigned i2 = (i+2)%3;
-      unsigned i1 = (i+1)%3;
-      if (mask & (1<<i)) {
-	 struct brw_reg src0, src1, dst;
-	 dst = get_dst_reg(c, inst, i);
-	 src0 = negate(get_src_reg(c, &inst->FullSrcRegisters[0], i2));
-	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i1);
-	 brw_MUL(p, brw_null_reg(), src0, src1);
-	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i1);
-	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i2);
-	 brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
-	 brw_MAC(p, dst, src0, src1);
-	 brw_set_saturate(p, 0);
-      }
-   }
-   brw_set_saturate(p, 0);
-}
-
-static void emit_dp3(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_reg src0[3], src1[3], dst;
-   int i;
-   struct brw_compile *p = &c->func;
-   for (i = 0; i < 3; i++) {
-      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i);
-   }
-
-   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
-   brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-   brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-   brw_MAC(p, dst, src0[2], src1[2]);
-   brw_set_saturate(p, 0);
-}
-
-static void emit_dp4(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_reg src0[4], src1[4], dst;
-   int i;
-   struct brw_compile *p = &c->func;
-   for (i = 0; i < 4; i++) {
-      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i);
-   }
-   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
-   brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-   brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-   brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
-   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-   brw_MAC(p, dst, src0[3], src1[3]);
-   brw_set_saturate(p, 0);
-}
-
-static void emit_dph(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_reg src0[4], src1[4], dst;
-   int i;
-   struct brw_compile *p = &c->func;
-   for (i = 0; i < 4; i++) {
-      src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-      src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i);
-   }
-   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
-   brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-   brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-   brw_MAC(p, dst, src0[2], src1[2]);
-   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-   brw_ADD(p, dst, src0[3], src1[3]);
-   brw_set_saturate(p, 0);
-}
-
-static void emit_math1(struct brw_wm_compile *c,
-		       struct tgsi_full_instruction *inst, unsigned func)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg src0, dst;
-
-   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
-   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
-   brw_MOV(p, brw_message_reg(2), src0);
-   brw_math(p,
-	    dst,
-	    func,
-	    ((inst->Instruction.Saturate != TGSI_SAT_NONE) 
-	     ? BRW_MATH_SATURATE_SATURATE 
-	     : BRW_MATH_SATURATE_NONE),
-	    2,
-	    brw_null_reg(),
-	    BRW_MATH_DATA_VECTOR,
-	    BRW_MATH_PRECISION_FULL);
-}
-
-
-static void emit_alu2(struct brw_wm_compile *c,		      
-		      struct tgsi_full_instruction *inst,
-		      unsigned opcode)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg src0, src1, dst;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   int i;
-   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-   for (i = 0 ; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 dst = get_dst_reg(c, inst, i);
-	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
-	 brw_alu2(p, opcode, dst, src0, src1);
-      }
-   }
-   brw_set_saturate(p, 0);
-}
-
-
-static void emit_alu1(struct brw_wm_compile *c,
-		      struct tgsi_full_instruction *inst,
-		      unsigned opcode)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg src0, dst;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   int i;
-   brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-   for (i = 0 ; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 dst = get_dst_reg(c, inst, i);
-	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-	 brw_alu1(p, opcode, dst, src0);
-      }
-   }
-   if (inst->Instruction.Saturate != TGSI_SAT_NONE)
-      brw_set_saturate(p, 0);
-}
-
-
-static void emit_max(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   struct brw_reg src0, src1, dst;
-   int i;
-   brw_push_insn_state(p);
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 dst = get_dst_reg(c, inst, i);
-	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
-	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-	 brw_MOV(p, dst, src0);
-	 brw_set_saturate(p, 0);
-
-	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
-	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	 brw_MOV(p, dst, src1);
-	 brw_set_saturate(p, 0);
-	 brw_set_predicate_control_flag_value(p, 0xff);
-      }
-   }
-   brw_pop_insn_state(p);
-}
-
-static void emit_min(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   struct brw_reg src0, src1, dst;
-   int i;
-   brw_push_insn_state(p);
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 dst = get_dst_reg(c, inst, i);
-	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
-	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-	 brw_MOV(p, dst, src0);
-	 brw_set_saturate(p, 0);
-
-	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
-	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	 brw_MOV(p, dst, src1);
-	 brw_set_saturate(p, 0);
-	 brw_set_predicate_control_flag_value(p, 0xff);
-      }
-   }
-   brw_pop_insn_state(p);
-}
-
-static void emit_pow(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg dst, src0, src1;
-   dst = get_dst_reg(c, inst, get_scalar_dst_index(inst));
-   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
-   src1 = get_src_reg(c, &inst->FullSrcRegisters[1], 0);
-
-   brw_MOV(p, brw_message_reg(2), src0);
-   brw_MOV(p, brw_message_reg(3), src1);
-
-   brw_math(p,
-	    dst,
-	    BRW_MATH_FUNCTION_POW,
-	    (inst->Instruction.Saturate != TGSI_SAT_NONE 
-	     ? BRW_MATH_SATURATE_SATURATE 
-	     : BRW_MATH_SATURATE_NONE),
-	    2,
-	    brw_null_reg(),
-	    BRW_MATH_DATA_VECTOR,
-	    BRW_MATH_PRECISION_FULL);
-}
-
-static void emit_lrp(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
-   int i;
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 dst = get_dst_reg(c, inst, i);
-	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-
-	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
-
-	 if (src1.nr == dst.nr) {
-	    tmp1 = alloc_tmp(c);
-	    brw_MOV(p, tmp1, src1);
-	 } else
-	    tmp1 = src1;
-
-	 src2 = get_src_reg(c, &inst->FullSrcRegisters[2], i);
-	 if (src2.nr == dst.nr) {
-	    tmp2 = alloc_tmp(c);
-	    brw_MOV(p, tmp2, src2);
-	 } else
-	    tmp2 = src2;
-
-	 brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
-	 brw_MUL(p, brw_null_reg(), dst, tmp2);
-	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-	 brw_MAC(p, dst, src0, tmp1);
-	 brw_set_saturate(p, 0);
-      }
-      release_tmps(c);
-   }
-}
-
-static void emit_kil(struct brw_wm_compile *c)
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-   brw_push_insn_state(p);
-   brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
-   brw_AND(p, depth, c->emit_mask_reg, depth);
-   brw_pop_insn_state(p);
-}
-
-static void emit_mad(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   struct brw_reg dst, src0, src1, src2;
-   int i;
-
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 dst = get_dst_reg(c, inst, i);
-	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
-	 src2 = get_src_reg(c, &inst->FullSrcRegisters[2], i);
-	 brw_MUL(p, dst, src0, src1);
-
-	 brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0);
-	 brw_ADD(p, dst, dst, src2);
-	 brw_set_saturate(p, 0);
-      }
-   }
-}
-
-static void emit_sop(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst, unsigned cond)
-{
-   struct brw_compile *p = &c->func;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   struct brw_reg dst, src0, src1;
-   int i;
-
-   brw_push_insn_state(p);
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 dst = get_dst_reg(c, inst, i);
-	 src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-	 src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i);
-	 brw_CMP(p, brw_null_reg(), cond, src0, src1);
-	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	 brw_MOV(p, dst, brw_imm_f(0.0));
-	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	 brw_MOV(p, dst, brw_imm_f(1.0));
-      }
-   }
-   brw_pop_insn_state(p);
-}
-
-
-static void emit_ddx(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   struct brw_reg interp[4];
-   struct brw_reg dst;
-   struct brw_reg src0, w;
-   unsigned nr, i;
-   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
-   w = get_src_reg(c, &inst->FullSrcRegisters[1], 3);
-   nr = src0.nr;
-   interp[0] = brw_vec1_grf(nr, 0);
-   interp[1] = brw_vec1_grf(nr, 4);
-   interp[2] = brw_vec1_grf(nr+1, 0);
-   interp[3] = brw_vec1_grf(nr+1, 4);
-   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
-   for(i = 0; i < 4; i++ ) {
-      if (mask & (1<<i)) {
-	 dst = get_dst_reg(c, inst, i);
-	 brw_MOV(p, dst, interp[i]);
-	 brw_MUL(p, dst, dst, w);
-      }
-   }
-   brw_set_saturate(p, 0);
-}
-
-static void emit_ddy(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
-   struct brw_reg interp[4];
-   struct brw_reg dst;
-   struct brw_reg src0, w;
-   unsigned nr, i;
-
-   src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0);
-   nr = src0.nr;
-   w = get_src_reg(c, &inst->FullSrcRegisters[1], 3);
-   interp[0] = brw_vec1_grf(nr, 0);
-   interp[1] = brw_vec1_grf(nr, 4);
-   interp[2] = brw_vec1_grf(nr+1, 0);
-   interp[3] = brw_vec1_grf(nr+1, 4);
-   brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE);
-   for(i = 0; i < 4; i++ ) {
-      if (mask & (1<<i)) {
-	 dst = get_dst_reg(c, inst, i);
-	 brw_MOV(p, dst, suboffset(interp[i], 1));
-	 brw_MUL(p, dst, dst, w);
-      }
-   }
-   brw_set_saturate(p, 0);
-}
-
-/* TODO
-   BIAS on SIMD8 not workind yet...
-*/
-static void emit_txb(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-#if 0
-   struct brw_compile *p = &c->func;
-   struct brw_reg payload_reg = c->payload_depth[0];
-   struct brw_reg dst[4], src[4];
-   unsigned i;
-   for (i = 0; i < 4; i++)
-      dst[i] = get_dst_reg(c, inst, i);
-   for (i = 0; i < 4; i++)
-      src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-
-#if 0
-   switch (inst->TexSrcTarget) {
-   case TEXTURE_1D_INDEX:
-      brw_MOV(p, brw_message_reg(2), src[0]);
-      brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
-      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-      break;
-   case TEXTURE_2D_INDEX:
-   case TEXTURE_RECT_INDEX:
-      brw_MOV(p, brw_message_reg(2), src[0]);
-      brw_MOV(p, brw_message_reg(3), src[1]);
-      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-      break;
-   default:
-      brw_MOV(p, brw_message_reg(2), src[0]);
-      brw_MOV(p, brw_message_reg(3), src[1]);
-      brw_MOV(p, brw_message_reg(4), src[2]);
-      break;
-   }
-#else
-   brw_MOV(p, brw_message_reg(2), src[0]);
-   brw_MOV(p, brw_message_reg(3), src[1]);
-   brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-#endif
-
-   brw_MOV(p, brw_message_reg(5), src[3]);
-   brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
-   brw_SAMPLE(p,
-	      retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
-	      1,
-	      retype(payload_reg, BRW_REGISTER_TYPE_UW),
-	      inst->TexSrcUnit + 1, /* surface */
-	      inst->TexSrcUnit,     /* sampler */
-	      inst->FullDstRegisters[0].DstRegister.WriteMask,
-	      BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
-	      4,
-	      4,
-	      0);
-#endif
-}
-
-static void emit_tex(struct brw_wm_compile *c,
-		     struct tgsi_full_instruction *inst)
-{
-#if 0
-   struct brw_compile *p = &c->func;
-   struct brw_reg payload_reg = c->payload_depth[0];
-   struct brw_reg dst[4], src[4];
-   unsigned msg_len;
-   unsigned i, nr;
-   unsigned emit;
-   boolean shadow = (c->key.shadowtex_mask & (1<<inst->TexSrcUnit)) ? 1 : 0;
-
-   for (i = 0; i < 4; i++)
-      dst[i] = get_dst_reg(c, inst, i);
-   for (i = 0; i < 4; i++)
-      src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i);
-
-#if 0
-   switch (inst->TexSrcTarget) {
-   case TEXTURE_1D_INDEX:
-      emit = WRITEMASK_X;
-      nr = 1;
-      break;
-   case TEXTURE_2D_INDEX:
-   case TEXTURE_RECT_INDEX:
-      emit = WRITEMASK_XY;
-      nr = 2;
-      break;
-   default:
-      emit = WRITEMASK_XYZ;
-      nr = 3;
-      break;
-   }
-#else
-   emit = WRITEMASK_XY;
-   nr = 2;
-#endif
-
-   msg_len = 1;
-
-   for (i = 0; i < nr; i++) {
-      static const unsigned swz[4] = {0,1,2,2};
-      if (emit & (1<<i))
-	 brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
-      else
-	 brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
-      msg_len += 1;
-   }
-
-   if (shadow) {
-      brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
-      brw_MOV(p, brw_message_reg(6), src[2]);
-   }
-
-   brw_SAMPLE(p,
-	      retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
-	      1,
-	      retype(payload_reg, BRW_REGISTER_TYPE_UW),
-	      inst->TexSrcUnit + 1, /* surface */
-	      inst->TexSrcUnit,     /* sampler */
-	      inst->FullDstRegisters[0].DstRegister.WriteMask,
-	      BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
-	      4,
-	      shadow ? 6 : 4,
-	      0);
-
-   if (shadow)
-      brw_MOV(p, dst[3], brw_imm_f(1.0));
-#endif
-}
-
-
-
-
-
-
-
-
-static void emit_fb_write(struct brw_wm_compile *c,
-			  struct tgsi_full_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   int nr = 2;
-   int channel;
-   int base_reg = 0;
-
-   // src0 = output color
-   // src1 = payload_depth[0]
-   // src2 = output depth
-   // dst = ???
-
-
-
-   /* Reserve a space for AA - may not be needed:
-    */
-   if (c->key.aa_dest_stencil_reg)
-      nr += 1;
-
-   {
-      brw_push_insn_state(p);
-      for (channel = 0; channel < 4; channel++) {
-	 struct brw_reg src0 = c->wm_regs[TGSI_FILE_OUTPUT][0][channel];
-
-	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
-	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-	 brw_MOV(p, brw_message_reg(nr + channel), src0);
-      }
-      /* skip over the regs populated above: */
-      nr += 8;
-      brw_pop_insn_state(p);
-   }
-    
-
-   /* Pass through control information:
-    */
-   /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
-   {
-      brw_push_insn_state(p);
-      brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
-      brw_MOV(p,
-	      brw_message_reg(base_reg + 1),
-	      brw_vec8_grf(1, 0));
-      brw_pop_insn_state(p);
-   }
-
-   /* Send framebuffer write message: */
-   brw_fb_WRITE(p,
-		retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
-		base_reg,
-		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
-		0,              /* render surface always 0 */
-		nr,
-		0,
-		1);
-
-}
-
-
-static void brw_wm_emit_instruction( struct brw_wm_compile *c,
-				     struct tgsi_full_instruction *inst )
-{
-   struct brw_compile *p = &c->func;
-
-#if 0   
-   if (inst->CondUpdate)
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-   else
-      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
-#else
-   brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
-#endif
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ABS:
-      emit_abs(c, inst);
-      break;
-   case TGSI_OPCODE_ADD:
-      emit_alu2(c, inst, BRW_OPCODE_ADD);
-      break;
-   case TGSI_OPCODE_SUB:
-      assert(0);
-//      emit_alu2(c, inst, BRW_OPCODE_SUB);
-      break;
-   case TGSI_OPCODE_FRC:
-      emit_alu1(c, inst, BRW_OPCODE_FRC);
-      break;
-   case TGSI_OPCODE_FLR:
-      assert(0);
-//      emit_alu1(c, inst, BRW_OPCODE_FLR);
-      break;
-   case TGSI_OPCODE_LRP:
-      emit_lrp(c, inst);
-      break;
-   case TGSI_OPCODE_INT:
-      emit_alu1(c, inst, BRW_OPCODE_RNDD);
-      break;
-   case TGSI_OPCODE_MOV:
-      emit_alu1(c, inst, BRW_OPCODE_MOV);
-      break;
-   case TGSI_OPCODE_DP3:
-      emit_dp3(c, inst);
-      break;
-   case TGSI_OPCODE_DP4:
-      emit_dp4(c, inst);
-      break;
-   case TGSI_OPCODE_XPD:
-      emit_xpd(c, inst);
-      break;
-   case TGSI_OPCODE_DPH:
-      emit_dph(c, inst);
-      break;
-   case TGSI_OPCODE_RCP:
-      emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
-      break;
-   case TGSI_OPCODE_RSQ:
-      emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
-      break;
-   case TGSI_OPCODE_SIN:
-      emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
-      break;
-   case TGSI_OPCODE_COS:
-      emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
-      break;
-   case TGSI_OPCODE_EX2:
-      emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
-      break;
-   case TGSI_OPCODE_LG2:
-      emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
-      break;
-   case TGSI_OPCODE_MAX:
-      emit_max(c, inst);
-      break;
-   case TGSI_OPCODE_MIN:
-      emit_min(c, inst);
-      break;
-   case TGSI_OPCODE_DDX:
-      emit_ddx(c, inst);
-      break;
-   case TGSI_OPCODE_DDY:
-      emit_ddy(c, inst);
-      break;
-   case TGSI_OPCODE_SLT:
-      emit_sop(c, inst, BRW_CONDITIONAL_L);
-      break;
-   case TGSI_OPCODE_SLE:
-      emit_sop(c, inst, BRW_CONDITIONAL_LE);
-      break;
-   case TGSI_OPCODE_SGT:
-      emit_sop(c, inst, BRW_CONDITIONAL_G);
-      break;
-   case TGSI_OPCODE_SGE:
-      emit_sop(c, inst, BRW_CONDITIONAL_GE);
-      break;
-   case TGSI_OPCODE_SEQ:
-      emit_sop(c, inst, BRW_CONDITIONAL_EQ);
-      break;
-   case TGSI_OPCODE_SNE:
-      emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
-      break;
-   case TGSI_OPCODE_MUL:
-      emit_alu2(c, inst, BRW_OPCODE_MUL);
-      break;
-   case TGSI_OPCODE_POW:
-      emit_pow(c, inst);
-      break;
-   case TGSI_OPCODE_MAD:
-      emit_mad(c, inst);
-      break;
-   case TGSI_OPCODE_TEX:
-      emit_tex(c, inst);
-      break;
-   case TGSI_OPCODE_TXB:
-      emit_txb(c, inst);
-      break;
-   case TGSI_OPCODE_TEXKILL:
-      emit_kil(c);
-      break;
-   case TGSI_OPCODE_IF:
-      assert(c->if_insn < MAX_IFSN);
-      c->if_inst[c->if_insn++] = brw_IF(p, BRW_EXECUTE_8);
-      break;
-   case TGSI_OPCODE_ELSE:
-      c->if_inst[c->if_insn-1]  = brw_ELSE(p, c->if_inst[c->if_insn-1]);
-      break;
-   case TGSI_OPCODE_ENDIF:
-      assert(c->if_insn > 0);
-      brw_ENDIF(p, c->if_inst[--c->if_insn]);
-      break;
-   case TGSI_OPCODE_BGNSUB:
-   case TGSI_OPCODE_ENDSUB:
-      break;
-   case TGSI_OPCODE_CAL:
-      brw_push_insn_state(p);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-      brw_set_access_mode(p, BRW_ALIGN_1);
-      brw_ADD(p, deref_1ud(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
-      brw_set_access_mode(p, BRW_ALIGN_16);
-      brw_ADD(p, 
-	      get_addr_reg(c->stack_index),
-	      get_addr_reg(c->stack_index), brw_imm_d(4));
-//      orig_inst = inst->Data;
-//      orig_inst->Data = &p->store[p->nr_insn];
-      assert(0);
-      brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-      brw_pop_insn_state(p);
-      break;
-
-   case TGSI_OPCODE_RET:
-#if 0
-      brw_push_insn_state(p);
-      brw_set_mask_control(p, BRW_MASK_DISABLE);
-      brw_ADD(p, 
-	      get_addr_reg(c->stack_index),
-	      get_addr_reg(c->stack_index), brw_imm_d(-4));
-      brw_set_access_mode(p, BRW_ALIGN_1);
-      brw_MOV(p, brw_ip_reg(), deref_1ud(c->stack_index, 0));
-      brw_set_access_mode(p, BRW_ALIGN_16);
-      brw_pop_insn_state(p);
-#else
-      emit_fb_write(c, inst);
-#endif
-
-      break;
-   case TGSI_OPCODE_BGNFOR:
-      c->loop_inst[c->loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
-      break;
-   case TGSI_OPCODE_BRK:
-      brw_BREAK(p);
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-      break;
-   case TGSI_OPCODE_CONT:
-      brw_CONT(p);
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-      break;
-   case TGSI_OPCODE_ENDFOR:
-      c->loop_insn--;
-      c->inst0 = c->inst1 = brw_WHILE(p, c->loop_inst[c->loop_insn]);
-      /* patch all the BREAK instructions from
-         last BGNFOR */
-      while (c->inst0 > c->loop_inst[c->loop_insn]) {
-	 c->inst0--;
-	 if (c->inst0->header.opcode == BRW_OPCODE_BREAK) {
-	    c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0 + 1;
-	    c->inst0->bits3.if_else.pop_count = 0;
-	 } else if (c->inst0->header.opcode == BRW_OPCODE_CONTINUE) {
-	    c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0;
-	    c->inst0->bits3.if_else.pop_count = 0;
-	 }
-      }
-      break;
-   case TGSI_OPCODE_END:
-      emit_fb_write(c, inst);
-      break;
-
-   default:
-      debug_printf("unsupported IR in fragment shader %d\n",
-		   inst->Instruction.Opcode);
-   }
-#if 0
-   if (inst->CondUpdate)
-      brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-   else
-      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-#endif
-}
-
-
-
-
-
-
-void brw_wm_glsl_emit(struct brw_wm_compile *c)
-{
-   struct tgsi_parse_context parse;
-   struct brw_compile *p = &c->func;
-
-   brw_init_compile(&c->func);
-   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-
-   c->reg_index = 0;
-   c->if_insn = 0;
-   c->loop_insn = 0;
-   c->stack_index = brw_indirect(0,0);
-
-   /* Do static register allocation and parameter interpolation:
-    */
-   brw_wm_emit_decls( c );
-
-   /* Emit the actual program.  All done with very direct translation,
-    * hopefully we can improve on this shortly...
-    */
-   brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
-
-   tgsi_parse_init( &parse, c->fp->program.tokens );
-
-   while( !tgsi_parse_end_of_tokens( &parse ) ) 
-   {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-	 /* already done */
-	 break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* not handled yet */
-	 assert(0);
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         brw_wm_emit_instruction(c, &parse.FullToken.FullInstruction);
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-
-   tgsi_parse_free (&parse);
-   
-   /* Fix up call targets:
-    */
-#if 0
-   {
-      unsigned nr_insns = c->fp->program.Base.NumInstructions;
-      unsigned insn, target_insn;
-      struct tgsi_full_instruction *inst1, *inst2;
-      struct brw_instruction *brw_inst1, *brw_inst2;
-      int offset;
-      for (insn = 0; insn < nr_insns; insn++) {
-	 inst1 = &c->fp->program.Base.Instructions[insn];
-	 brw_inst1 = inst1->Data;
-	 switch (inst1->Opcode) {
-	 case TGSI_OPCODE_CAL:
-	    target_insn = inst1->BranchTarget;
-	    inst2 = &c->fp->program.Base.Instructions[target_insn];
-	    brw_inst2 = inst2->Data;
-	    offset = brw_inst2 - brw_inst1;
-	    brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-	    break;
-	 default:
-	    break;
-	 }
-      }
-   }
-#endif
-
-   c->prog_data.total_grf = c->reg_index;
-   c->prog_data.total_scratch = 0;
-}
diff --git a/src/gallium/drivers/i965simple/brw_wm_iz.c b/src/gallium/drivers/i965simple/brw_wm_iz.c
deleted file mode 100644
index 6c5f25bf39e..00000000000
--- a/src/gallium/drivers/i965simple/brw_wm_iz.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_wm.h"
-
-
-#undef P			/* prompted depth */
-#undef C			/* computed */
-#undef N			/* non-promoted? */
-
-#define P 0
-#define C 1
-#define N 2
-
-const struct {
-   unsigned mode:2;
-   unsigned sd_present:1;
-   unsigned sd_to_rt:1;
-   unsigned dd_present:1;
-   unsigned ds_present:1;
-} wm_iz_table[IZ_BIT_MAX] =
-{
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 0, 1, 0, 0 }, 
- { C, 0, 1, 0, 0 }, 
- { C, 1, 1, 0, 0 }, 
- { C, 1, 1, 0, 0 }, 
- { C, 0, 1, 0, 0 }, 
- { C, 0, 1, 0, 0 }, 
- { C, 1, 1, 1, 0 }, 
- { C, 1, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 1, 1, 1, 0 }, 
- { C, 1, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 0, 1, 0, 0 }, 
- { C, 0, 1, 0, 0 }, 
- { C, 1, 1, 0, 0 }, 
- { C, 1, 1, 0, 0 }, 
- { C, 0, 1, 0, 0 }, 
- { C, 0, 1, 0, 0 }, 
- { C, 1, 1, 1, 0 }, 
- { C, 1, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 1, 1, 1, 0 }, 
- { C, 1, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 0, 0, 0, 1 }, 
- { C, 0, 0, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { C, 1, 1, 0, 1 }, 
- { C, 1, 1, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 0, 0, 0, 1 }, 
- { C, 0, 0, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { C, 1, 1, 0, 1 }, 
- { C, 1, 1, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { N, 1, 1, 0, 0 }, 
- { N, 0, 1, 0, 0 }, 
- { N, 0, 1, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { N, 1, 1, 0, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { N, 1, 1, 0, 0 }, 
- { N, 0, 1, 0, 0 }, 
- { N, 0, 1, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { N, 1, 1, 0, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { C, 0, 1, 1, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { N, 1, 1, 0, 1 }, 
- { N, 0, 1, 0, 1 }, 
- { N, 0, 1, 0, 1 }, 
- { P, 0, 0, 0, 0 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { P, 0, 0, 0, 0 }, 
- { N, 1, 1, 0, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 0, 0, 0, 1 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 0, 1, 0, 1 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 1, 1, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { C, 0, 1, 0, 1 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { P, 0, 0, 0, 0 }, 
- { C, 1, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 }, 
- { C, 0, 1, 1, 1 } 
-};
-
-void brw_wm_lookup_iz( unsigned line_aa,
-		       unsigned lookup,
-		       struct brw_wm_prog_key *key )
-{
-   unsigned reg = 2;
-
-   assert (lookup < IZ_BIT_MAX);
-      
-   if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
-      key->computes_depth = 1;
-
-   if (wm_iz_table[lookup].sd_present) {
-      key->source_depth_reg = reg;
-      reg += 2;
-   }
-
-   if (wm_iz_table[lookup].sd_to_rt)
-      key->source_depth_to_render_target = 1;
-
-   if (wm_iz_table[lookup].ds_present || line_aa != AA_NEVER) {
-      key->aa_dest_stencil_reg = reg;
-      key->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present &&
-				      line_aa == AA_SOMETIMES);
-      reg++;
-   }
-
-   if (wm_iz_table[lookup].dd_present) {
-      key->dest_depth_reg = reg;
-      reg+=2;
-   }
-
-   key->nr_depth_regs = (reg+1)/2;
-}
-
diff --git a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
deleted file mode 100644
index 52b2909a651..00000000000
--- a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-
-#define COMPAREFUNC_ALWAYS		0
-#define COMPAREFUNC_NEVER		0x1
-#define COMPAREFUNC_LESS		0x2
-#define COMPAREFUNC_EQUAL		0x3
-#define COMPAREFUNC_LEQUAL		0x4
-#define COMPAREFUNC_GREATER		0x5
-#define COMPAREFUNC_NOTEQUAL		0x6
-#define COMPAREFUNC_GEQUAL		0x7
-
-/* Samplers aren't strictly wm state from the hardware's perspective,
- * but that is the only situation in which we use them in this driver.
- */
-
-static int intel_translate_shadow_compare_func(unsigned func)
-{
-   switch(func) {
-   case PIPE_FUNC_NEVER:
-       return COMPAREFUNC_ALWAYS;
-   case PIPE_FUNC_LESS:
-       return COMPAREFUNC_LEQUAL;
-   case PIPE_FUNC_LEQUAL:
-       return COMPAREFUNC_LESS;
-   case PIPE_FUNC_GREATER:
-       return COMPAREFUNC_GEQUAL;
-   case PIPE_FUNC_GEQUAL:
-      return COMPAREFUNC_GREATER;
-   case PIPE_FUNC_NOTEQUAL:
-      return COMPAREFUNC_EQUAL;
-   case PIPE_FUNC_EQUAL:
-      return COMPAREFUNC_NOTEQUAL;
-   case PIPE_FUNC_ALWAYS:
-       return COMPAREFUNC_NEVER;
-   }
-
-   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
-   return COMPAREFUNC_NEVER;
-}
-
-/* The brw (and related graphics cores) do not support GL_CLAMP.  The
- * Intel drivers for "other operating systems" implement GL_CLAMP as
- * GL_CLAMP_TO_EDGE, so the same is done here.
- */
-static unsigned translate_wrap_mode( int wrap )
-{
-   switch( wrap ) {
-   case PIPE_TEX_WRAP_REPEAT:
-      return BRW_TEXCOORDMODE_WRAP;
-   case PIPE_TEX_WRAP_CLAMP:
-      return BRW_TEXCOORDMODE_CLAMP;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      return BRW_TEXCOORDMODE_CLAMP; /* conform likes it this way */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      return BRW_TEXCOORDMODE_CLAMP_BORDER;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      return BRW_TEXCOORDMODE_MIRROR;
-   default:
-      return BRW_TEXCOORDMODE_WRAP;
-   }
-}
-
-
-static unsigned U_FIXED(float value, unsigned frac_bits)
-{
-   value *= (1<<frac_bits);
-   return value < 0 ? 0 : value;
-}
-
-static int S_FIXED(float value, unsigned frac_bits)
-{
-   return value * (1<<frac_bits);
-}
-
-
-static unsigned upload_default_color( struct brw_context *brw,
-                                      const float *color )
-{
-   struct brw_sampler_default_color sdc;
-
-   COPY_4V(sdc.color, color);
-
-   return brw_cache_data( &brw->cache[BRW_SAMPLER_DEFAULT_COLOR], &sdc );
-}
-
-
-/*
- */
-static void brw_update_sampler_state( const struct pipe_sampler_state *pipe_sampler,
-				      unsigned sdc_gs_offset,
-				      struct brw_sampler_state *sampler)
-{
-   memset(sampler, 0, sizeof(*sampler));
-
-   switch (pipe_sampler->min_mip_filter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST;
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-      sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-      break;
-   case PIPE_TEX_FILTER_ANISO:
-      sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC;
-      break;
-   default:
-      break;
-   }
-
-   switch (pipe_sampler->min_mip_filter) {
-   case PIPE_TEX_MIPFILTER_NEAREST:
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST;
-      break;
-   case PIPE_TEX_MIPFILTER_LINEAR:
-      sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR;
-      break;
-   case PIPE_TEX_MIPFILTER_NONE:
-      sampler->ss0.mip_filter = BRW_MIPFILTER_NONE;
-      break;
-   default:
-      break;
-   }
-   /* Set Anisotropy:
-    */
-   switch (pipe_sampler->mag_img_filter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-      sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
-      break;
-   case PIPE_TEX_FILTER_ANISO:
-      sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
-      break;
-   default:
-      break;
-   }
-
-   if (pipe_sampler->max_anisotropy > 2.0) {
-      sampler->ss3.max_aniso = MAX2((pipe_sampler->max_anisotropy - 2) / 2,
-                                    BRW_ANISORATIO_16);
-   }
-
-   sampler->ss1.s_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_s);
-   sampler->ss1.r_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_r);
-   sampler->ss1.t_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_t);
-
-   /* Fulsim complains if I don't do this.  Hardware doesn't mind:
-    */
-#if 0
-   if (texObj->Target == GL_TEXTURE_CUBE_MAP_ARB) {
-      sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
-      sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
-      sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
-   }
-#endif
-
-   /* Set shadow function:
-    */
-   if (pipe_sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-      /* Shadowing is "enabled" by emitting a particular sampler
-       * message (sample_c).  So need to recompile WM program when
-       * shadow comparison is enabled on each/any texture unit.
-       */
-      sampler->ss0.shadow_function = intel_translate_shadow_compare_func(pipe_sampler->compare_func);
-   }
-
-   /* Set LOD bias:
-    */
-   sampler->ss0.lod_bias = S_FIXED(CLAMP(pipe_sampler->lod_bias, -16, 15), 6);
-
-   sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
-   sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
-
-   /* Set BaseMipLevel, MaxLOD, MinLOD:
-    *
-    * XXX: I don't think that using firstLevel, lastLevel works,
-    * because we always setup the surface state as if firstLevel ==
-    * level zero.  Probably have to subtract firstLevel from each of
-    * these:
-    */
-   sampler->ss0.base_level = U_FIXED(0, 1);
-
-   sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(pipe_sampler->max_lod, 0), 13), 6);
-   sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(pipe_sampler->min_lod, 0), 13), 6);
-
-   sampler->ss2.default_color_pointer = sdc_gs_offset >> 5;
-}
-
-
-
-/* All samplers must be uploaded in a single contiguous array, which
- * complicates various things.  However, this is still too confusing -
- * FIXME: simplify all the different new texture state flags.
- */
-static void upload_wm_samplers(struct brw_context *brw)
-{
-   unsigned unit;
-   unsigned sampler_count = 0;
-
-   /* BRW_NEW_SAMPLER */
-   for (unit = 0; unit < brw->num_textures && unit < brw->num_samplers;
-        unit++) {
-      /* determine unit enable/disable by looking for a bound texture */
-      if (brw->attribs.Texture[unit]) {
-         const struct pipe_sampler_state *sampler = brw->attribs.Samplers[unit];
-	 unsigned sdc_gs_offset = upload_default_color(brw, sampler->border_color);
-
-	 brw_update_sampler_state(sampler,
-				  sdc_gs_offset,
-				  &brw->wm.sampler[unit]);
-
-	 sampler_count = unit + 1;
-      }
-   }
-
-   if (brw->wm.sampler_count != sampler_count) {
-      brw->wm.sampler_count = sampler_count;
-      brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
-   }
-
-   brw->wm.sampler_gs_offset = 0;
-
-   if (brw->wm.sampler_count)
-      brw->wm.sampler_gs_offset =
-	 brw_cache_data_sz(&brw->cache[BRW_SAMPLER],
-			   brw->wm.sampler,
-			   sizeof(struct brw_sampler_state) * brw->wm.sampler_count);
-}
-
-const struct brw_tracked_state brw_wm_samplers = {
-   .dirty = {
-      .brw = BRW_NEW_SAMPLER,
-      .cache = 0
-   },
-   .update = upload_wm_samplers
-};
-
diff --git a/src/gallium/drivers/i965simple/brw_wm_state.c b/src/gallium/drivers/i965simple/brw_wm_state.c
deleted file mode 100644
index 37a9bf919cd..00000000000
--- a/src/gallium/drivers/i965simple/brw_wm_state.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "brw_wm.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-/***********************************************************************
- * WM unit - fragment programs and rasterization
- */
-static void upload_wm_unit(struct brw_context *brw )
-{
-   struct brw_wm_unit_state wm;
-   unsigned max_threads;
-   unsigned per_thread;
-
-   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
-      max_threads = 0;
-   else
-      max_threads = 31;
-
-
-   memset(&wm, 0, sizeof(wm));
-
-   /* CACHE_NEW_WM_PROG */
-   wm.thread0.grf_reg_count = align(brw->wm.prog_data->total_grf, 16) / 16 - 1;
-   wm.thread0.kernel_start_pointer = brw->wm.prog_gs_offset >> 6;
-   wm.thread3.dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
-   wm.thread3.urb_entry_read_length = brw->wm.prog_data->urb_read_length;
-   wm.thread3.const_urb_entry_read_length = brw->wm.prog_data->curb_read_length;
-
-   wm.wm5.max_threads = max_threads;
-
-   per_thread = align(brw->wm.prog_data->total_scratch, 1024);
-   assert(per_thread <= 12 * 1024);
-
-#if 0
-   if (brw->wm.prog_data->total_scratch) {
-      unsigned total = per_thread * (max_threads + 1);
-
-      /* Scratch space -- just have to make sure there is sufficient
-       * allocated for the active program and current number of threads.
-       */
-      brw->wm.scratch_buffer_size = total;
-      if (brw->wm.scratch_buffer &&
-	  brw->wm.scratch_buffer_size > brw->wm.scratch_buffer->size) {
-	 dri_bo_unreference(brw->wm.scratch_buffer);
-	 brw->wm.scratch_buffer = NULL;
-      }
-      if (!brw->wm.scratch_buffer) {
-	 brw->wm.scratch_buffer = dri_bo_alloc(intel->intelScreen->bufmgr,
-					       "wm scratch",
-					       brw->wm.scratch_buffer_size,
-					       4096, DRM_BO_FLAG_MEM_TT);
-      }
-   }
-   /* XXX: Scratch buffers are not implemented correectly.
-    *
-    * The scratch offset to be programmed into wm is relative to the general
-    * state base address.  However, using dri_bo_alloc/dri_bo_emit_reloc (or
-    * the previous bmGenBuffers scheme), we get an offset relative to the
-    * start of framebuffer.  Even before then, it was broken in other ways,
-    * so just fail for now if we hit that path.
-    */
-   assert(brw->wm.prog_data->total_scratch == 0);
-#endif
-
-   /* CACHE_NEW_SURFACE */
-   wm.thread1.binding_table_entry_count = brw->wm.nr_surfaces;
-
-   /* BRW_NEW_CURBE_OFFSETS */
-   wm.thread3.const_urb_entry_read_offset = brw->curbe.wm_start * 2;
-
-   wm.thread3.urb_entry_read_offset = 0;
-   wm.thread1.depth_coef_urb_read_offset = 1;
-   wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-
-   /* CACHE_NEW_SAMPLER */
-   wm.wm4.sampler_count = (brw->wm.sampler_count + 1) / 4;
-   wm.wm4.sampler_state_pointer = brw->wm.sampler_gs_offset >> 5;
-
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   {
-      const struct brw_fragment_program *fp = brw->attribs.FragmentProgram;
-
-      if (fp->UsesDepth)
-	 wm.wm5.program_uses_depth = 1; /* as far as we can tell */
-
-      if (fp->info.writes_z)
-	 wm.wm5.program_computes_depth = 1;
-
-      /* BRW_NEW_ALPHA_TEST */
-      if (fp->info.uses_kill ||
-	  brw->attribs.DepthStencil->alpha.enabled)
-	 wm.wm5.program_uses_killpixel = 1;
-
-      wm.wm5.enable_8_pix = 1;
-   }
-
-   wm.wm5.thread_dispatch_enable = 1;	/* AKA: color_write */
-   wm.wm5.legacy_line_rast = 0;
-   wm.wm5.legacy_global_depth_bias = 0;
-   wm.wm5.early_depth_test = 1;	        /* never need to disable */
-   wm.wm5.line_aa_region_width = 0;
-   wm.wm5.line_endcap_aa_region_width = 1;
-
-   /* BRW_NEW_RASTERIZER */
-   if (brw->attribs.Raster->poly_stipple_enable)
-      wm.wm5.polygon_stipple = 1;
-
-#if 0
-   if (brw->attribs.Polygon->OffsetFill) {
-      wm.wm5.depth_offset = 1;
-      /* Something wierd going on with legacy_global_depth_bias,
-       * offset_constant, scaling and MRD.  This value passes glean
-       * but gives some odd results elsewere (eg. the
-       * quad-offset-units test).
-       */
-      wm.global_depth_offset_constant = brw->attribs.Polygon->OffsetUnits * 2;
-
-      /* This is the only value that passes glean:
-       */
-      wm.global_depth_offset_scale = brw->attribs.Polygon->OffsetFactor;
-   }
-#endif
-
-   if (brw->attribs.Raster->line_stipple_enable) {
-      wm.wm5.line_stipple = 1;
-   }
-
-   if (BRW_DEBUG & DEBUG_STATS)
-      wm.wm4.stats_enable = 1;
-
-   brw->wm.state_gs_offset = brw_cache_data( &brw->cache[BRW_WM_UNIT], &wm );
-
-   if (brw->wm.prog_data->total_scratch) {
-      /*
-      dri_emit_reloc(brw->cache[BRW_WM_UNIT].pool->buffer,
-		     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE,
-		     (per_thread / 1024) - 1,
-		     brw->wm.state_gs_offset +
-		     ((char *)&wm.thread2 - (char *)&wm),
-		     brw->wm.scratch_buffer);
-      */
-   } else {
-      wm.thread2.scratch_space_base_pointer = 0;
-   }
-}
-
-const struct brw_tracked_state brw_wm_unit = {
-   .dirty = {
-      .brw = (BRW_NEW_RASTERIZER |
-	      BRW_NEW_ALPHA_TEST |
-	      BRW_NEW_FS |
-	      BRW_NEW_CURBE_OFFSETS),
-
-      .cache = (CACHE_NEW_SURFACE |
-		CACHE_NEW_WM_PROG |
-		CACHE_NEW_SAMPLER)
-   },
-   .update = upload_wm_unit
-};
-
diff --git a/src/gallium/drivers/i965simple/brw_wm_surface_state.c b/src/gallium/drivers/i965simple/brw_wm_surface_state.c
deleted file mode 100644
index b5b9e0e7026..00000000000
--- a/src/gallium/drivers/i965simple/brw_wm_surface_state.c
+++ /dev/null
@@ -1,305 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-
-static unsigned translate_tex_target( enum pipe_texture_target target )
-{
-   switch (target) {
-   case PIPE_TEXTURE_1D:
-      return BRW_SURFACE_1D;
-
-   case PIPE_TEXTURE_2D:
-      return BRW_SURFACE_2D;
-
-   case PIPE_TEXTURE_3D:
-      return BRW_SURFACE_3D;
-
-   case PIPE_TEXTURE_CUBE:
-      return BRW_SURFACE_CUBE;
-
-   default:
-      assert(0);
-      return 0;
-   }
-}
-
-static unsigned translate_tex_format( enum pipe_format pipe_format )
-{
-   switch( pipe_format ) {
-   case PIPE_FORMAT_L8_UNORM:
-      return BRW_SURFACEFORMAT_L8_UNORM;
-
-   case PIPE_FORMAT_I8_UNORM:
-      return BRW_SURFACEFORMAT_I8_UNORM;
-
-   case PIPE_FORMAT_A8_UNORM:
-      return BRW_SURFACEFORMAT_A8_UNORM;
-
-   case PIPE_FORMAT_A8L8_UNORM:
-      return BRW_SURFACEFORMAT_L8A8_UNORM;
-
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      assert(0);		/* not supported for sampling */
-      return BRW_SURFACEFORMAT_R8G8B8_UNORM;
-
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
-
-   case PIPE_FORMAT_R5G6B5_UNORM:
-      return BRW_SURFACEFORMAT_B5G6R5_UNORM;
-
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
-      return BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
-
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
-      return BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
-
-   case PIPE_FORMAT_YCBCR_REV:
-      return BRW_SURFACEFORMAT_YCRCB_NORMAL;
-
-   case PIPE_FORMAT_YCBCR:
-      return BRW_SURFACEFORMAT_YCRCB_SWAPUVY;
-#if 0
-   case PIPE_FORMAT_RGB_FXT1:
-   case PIPE_FORMAT_RGBA_FXT1:
-      return BRW_SURFACEFORMAT_FXT1;
-#endif
-
-   case PIPE_FORMAT_Z16_UNORM:
-      return BRW_SURFACEFORMAT_I16_UNORM;
-#if 0
-   case PIPE_FORMAT_RGB_DXT1:
-       return BRW_SURFACEFORMAT_DXT1_RGB;
-
-   case PIPE_FORMAT_RGBA_DXT1:
-       return BRW_SURFACEFORMAT_BC1_UNORM;
-
-   case PIPE_FORMAT_RGBA_DXT3:
-       return BRW_SURFACEFORMAT_BC2_UNORM;
-
-   case PIPE_FORMAT_RGBA_DXT5:
-       return BRW_SURFACEFORMAT_BC3_UNORM;
-
-   case PIPE_FORMAT_SRGBA8:
-      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB;
-   case PIPE_FORMAT_SRGB_DXT1:
-      return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
-#endif
-
-   default:
-      assert(0);
-      return 0;
-   }
-}
-
-static unsigned brw_buffer_offset(struct brw_context *brw,
-                                  struct pipe_buffer *buffer)
-{
-   return brw->winsys->get_buffer_offset(brw->winsys,
-                                         buffer,
-                                         0);
-}
-
-static
-void brw_update_texture_surface( struct brw_context *brw,
-				 unsigned unit )
-{
-   const struct brw_texture *tObj = brw->attribs.Texture[unit];
-   struct brw_surface_state surf;
-
-   memset(&surf, 0, sizeof(surf));
-
-   surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
-   surf.ss0.surface_type = translate_tex_target(tObj->base.target);
-   surf.ss0.surface_format = translate_tex_format(tObj->base.format);
-
-   /* This is ok for all textures with channel width 8bit or less:
-    */
-/*    surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
-
-   /* Updated in emit_reloc */
-   surf.ss1.base_addr = brw_buffer_offset( brw, tObj->buffer );
-
-   surf.ss2.mip_count = tObj->base.last_level;
-   surf.ss2.width = tObj->base.width[0] - 1;
-   surf.ss2.height = tObj->base.height[0] - 1;
-
-   surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
-   surf.ss3.tiled_surface = 0; /* always zero */
-   surf.ss3.pitch = tObj->stride - 1;
-   surf.ss3.depth = tObj->base.depth[0] - 1;
-
-   surf.ss4.min_lod = 0;
-
-   if (tObj->base.target == PIPE_TEXTURE_CUBE) {
-      surf.ss0.cube_pos_x = 1;
-      surf.ss0.cube_pos_y = 1;
-      surf.ss0.cube_pos_z = 1;
-      surf.ss0.cube_neg_x = 1;
-      surf.ss0.cube_neg_y = 1;
-      surf.ss0.cube_neg_z = 1;
-   }
-
-   brw->wm.bind.surf_ss_offset[unit + 1] =
-      brw_cache_data( &brw->cache[BRW_SS_SURFACE], &surf );
-}
-
-
-
-#define OFFSET(TYPE, FIELD) ( (unsigned)&(((TYPE *)0)->FIELD) )
-
-
-static void upload_wm_surfaces(struct brw_context *brw )
-{
-   unsigned i;
-
-   {
-      struct brw_surface_state surf;
-
-      /* BRW_NEW_FRAMEBUFFER
-       */
-      struct pipe_surface *pipe_surface = brw->attribs.FrameBuffer.cbufs[0];/*fixme*/
-      struct brw_texture *tex = (struct brw_texture *)pipe_surface->texture;
-
-      memset(&surf, 0, sizeof(surf));
-
-      if (pipe_surface != NULL) {
-	 if (pipe_surface->block.size == 4)
-	    surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-	 else
-	    surf.ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
-
-	 surf.ss0.surface_type = BRW_SURFACE_2D;
-
-	 surf.ss1.base_addr = brw_buffer_offset( brw, tex->buffer );
-
-	 surf.ss2.width = pipe_surface->width - 1;
-	 surf.ss2.height = pipe_surface->height - 1;
-	 surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
-	 surf.ss3.tiled_surface = 0;
-	 surf.ss3.pitch = pipe_surface->stride - 1;
-      } else {
-	 surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-	 surf.ss0.surface_type = BRW_SURFACE_NULL;
-      }
-
-      /* BRW_NEW_BLEND */
-      surf.ss0.color_blend = (!brw->attribs.Blend->logicop_enable &&
-			      brw->attribs.Blend->blend_enable);
-
-
-      surf.ss0.writedisable_red =   !(brw->attribs.Blend->colormask & PIPE_MASK_R);
-      surf.ss0.writedisable_green = !(brw->attribs.Blend->colormask & PIPE_MASK_G);
-      surf.ss0.writedisable_blue =  !(brw->attribs.Blend->colormask & PIPE_MASK_B);
-      surf.ss0.writedisable_alpha = !(brw->attribs.Blend->colormask & PIPE_MASK_A);
-
-
-
-
-      brw->wm.bind.surf_ss_offset[0] = brw_cache_data( &brw->cache[BRW_SS_SURFACE], &surf );
-
-      brw->wm.nr_surfaces = 1;
-   }
-
-
-   /* BRW_NEW_TEXTURE
-    */
-   for (i = 0; i < brw->num_textures && i < brw->num_samplers; i++) {
-      const struct brw_texture *texUnit = brw->attribs.Texture[i];
-
-      if (texUnit &&
-	  texUnit->base.reference.count/*(texUnit->reference.count > 0) == really used */) {
-
-	 brw_update_texture_surface(brw, i);
-
-	 brw->wm.nr_surfaces = i+2;
-      }
-      else {
-	 brw->wm.bind.surf_ss_offset[i+1] = 0;
-      }
-   }
-
-   brw->wm.bind_ss_offset = brw_cache_data( &brw->cache[BRW_SS_SURF_BIND],
-					    &brw->wm.bind );
-}
-
-
-/* KW: Will find a different way to acheive this, see for example the
- * state caches with relocs in the i915 swz driver.
- */
-#if 0
-static void emit_reloc_wm_surfaces(struct brw_context *brw)
-{
-   int unit;
-
-   if (brw->state.draw_region != NULL) {
-      /* Emit framebuffer relocation */
-      dri_emit_reloc(brw_cache_buffer(brw, BRW_SS_SURFACE),
-		     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE,
-		     0,
-		     brw->wm.bind.surf_ss_offset[0] +
-		     offsetof(struct brw_surface_state, ss1),
-		     brw->state.draw_region->buffer);
-   }
-
-   /* Emit relocations for texture buffers */
-   for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) {
-      struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[unit];
-      struct gl_texture_object *tObj = texUnit->_Current;
-      struct intel_texture_object *intelObj = intel_texture_object(tObj);
-
-      if (texUnit->_ReallyEnabled && intelObj->mt != NULL) {
-	 dri_emit_reloc(brw_cache_buffer(brw, BRW_SS_SURFACE),
-			DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-			0,
-			brw->wm.bind.surf_ss_offset[unit + 1] +
-			offsetof(struct brw_surface_state, ss1),
-			intelObj->mt->region->buffer);
-      }
-   }
-}
-#endif
-
-const struct brw_tracked_state brw_wm_surfaces = {
-   .dirty = {
-      .brw = (BRW_NEW_FRAMEBUFFER |
-	      BRW_NEW_BLEND |
-	      BRW_NEW_TEXTURE),
-      .cache = 0
-   },
-   .update = upload_wm_surfaces,
-};
diff --git a/src/gallium/drivers/llvmpipe/.gitignore b/src/gallium/drivers/llvmpipe/.gitignore
new file mode 100644
index 00000000000..257b72d7b2b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/.gitignore
@@ -0,0 +1 @@
+lp_tile_soa.c
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 5ac09de79ea..e038a5229e5 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -17,9 +17,14 @@ C_SOURCES = \
 	lp_bld_depth.c \
 	lp_bld_flow.c \
 	lp_bld_format_aos.c \
+	lp_bld_format_query.c \
+	lp_bld_format_soa.c \
 	lp_bld_interp.c \
 	lp_bld_intr.c \
 	lp_bld_logic.c \
+	lp_bld_pack.c \
+	lp_bld_sample.c \
+	lp_bld_sample_soa.c \
 	lp_bld_swizzle.c \
 	lp_bld_struct.c \
 	lp_bld_tgsi_soa.c \
@@ -30,7 +35,6 @@ C_SOURCES = \
 	lp_draw_arrays.c \
 	lp_flush.c \
 	lp_jit.c \
-	lp_prim_setup.c \
 	lp_prim_vbuf.c \
 	lp_setup.c \
 	lp_query.c \
@@ -46,9 +50,16 @@ C_SOURCES = \
 	lp_state_vs.c \
 	lp_surface.c \
 	lp_tex_cache.c \
-	lp_tex_sample.c \
+	lp_tex_sample_c.c \
+	lp_tex_sample_llvm.c \
 	lp_texture.c \
 	lp_tile_cache.c \
 	lp_tile_soa.c
 
+CPP_SOURCES = \
+	lp_bld_misc.cpp
+
 include ../../Makefile.template
+
+lp_tile_soa.c: lp_tile_soa.py ../../auxiliary/util/u_format_parse.py ../../auxiliary/util/u_format_access.py ../../auxiliary/util/u_format.csv
+	python lp_tile_soa.py ../../auxiliary/util/u_format.csv > $@
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 498d21dea6c..89d08834a3c 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -8,13 +8,16 @@ Done so far is:
 
  - the whole fragment pipeline is code generated in a single function
  
+   - input interpolation
+   
    - depth testing
  
+   - texture sampling (not all state/formats are supported) 
+   
    - fragment shader TGSI translation
      - same level of support as the TGSI SSE2 exec machine, with the exception
        we don't fallback to TGSI interpretation when an unsupported opcode is
        found, but just ignore it
-     - texture sampling via an intrinsic call
      - done in SoA layout
      - input interpolation also code generated
  
@@ -28,16 +31,17 @@ Done so far is:
      any width and length
    - not all operations are implemented for these types yet though
 
-Most mesa/progs/demos/* work. Speed is on par with Keith's softpipe-opt branch,
-which includes hand written fast implementations for common cases.
+Most mesa/progs/demos/* work. 
 
 To do (probably by this order):
 
  - code generate stipple and stencil testing
 
- - code generate texture sampling
+ - translate the remaining bits of texture sampling state
 
  - translate TGSI control flow instructions, and all other remaining opcodes
+ 
+ - integrate with the draw module for VS code generation
 
  - code generate the triangle setup and rasterization
 
@@ -93,7 +97,7 @@ Alternatively, you can build it with GNU make, if you prefer, by invoking it as
 
   make linux-llvm
 
-but the rest of these instructions assume scons is used.
+but the rest of these instructions assume that scons is used.
 
 
 Using
@@ -108,6 +112,9 @@ or
 
   export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/lib:$LD_LIBRARY_PATH
 
+For performance evaluation pass debug=no to scons, and use the corresponding
+lib directory without the "-debug" suffix.
+
 
 Unit testing
 ============
@@ -119,7 +126,7 @@ build/linux-???-debug/gallium/drivers/llvmpipe:
  - lp_test_conv: SIMD vector conversion
  - lp_test_format: pixel unpacking/packing
 
-Some of this tests can output results and benchmarks to a tab-seperated-file
+Some of these tests can output results and benchmarks to a tab-separated file
 for posterior analysis, e.g.:
 
   build/linux-x86_64-debug/gallium/drivers/llvmpipe/lp_test_blend -o blend.tsv
@@ -133,10 +140,10 @@ Development Notes
   at the top of the lp_bld_*.c functions.  
 
 - All lp_bld_*.[ch] are isolated from the rest of the driver, and could/may be 
-  put in a standalone Gallium state -> LLVM IR translation module.
+  put in a stand-alone Gallium state -> LLVM IR translation module.
 
 - We use LLVM-C bindings for now. They are not documented, but follow the C++
   interfaces very closely, and appear to be complete enough for code
   generation. See 
   http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
-  for a standalone example.
+  for a stand-alone example.
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 5c29bdac56e..3bd2e700138 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -3,12 +3,19 @@ Import('*')
 env = env.Clone()
 
 env.Tool('llvm')
-if 'LLVM_VERSION' not in env:
+if not env.has_key('LLVM_VERSION'):
     print 'warning: LLVM not found: not building llvmpipe'
     Return()
 
 env.Tool('udis86')
 
+env.CodeGenerate(
+	target = 'lp_tile_soa.c',
+	script = 'lp_tile_soa.py',
+	source = ['#src/gallium/auxiliary/util/u_format.csv'],
+	command = 'python $SCRIPT $SOURCE > $TARGET'
+)
+
 llvmpipe = env.ConvenienceLibrary(
 	target = 'llvmpipe',
 	source = [
@@ -23,8 +30,14 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_bld_depth.c',
 		'lp_bld_flow.c',
 		'lp_bld_format_aos.c',
+        'lp_bld_format_query.c',
+		'lp_bld_format_soa.c',
 		'lp_bld_interp.c',
 		'lp_bld_intr.c',
+		'lp_bld_misc.cpp',
+        'lp_bld_pack.c',
+        'lp_bld_sample.c',
+		'lp_bld_sample_soa.c',
 		'lp_bld_struct.c',
 		'lp_bld_logic.c',
 		'lp_bld_swizzle.c',
@@ -36,7 +49,6 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_draw_arrays.c',
 		'lp_flush.c',
 		'lp_jit.c',
-		'lp_prim_setup.c',
 		'lp_prim_vbuf.c',
 		'lp_setup.c',
 		'lp_query.c',
@@ -52,7 +64,8 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_state_vs.c',
 		'lp_surface.c',
 		'lp_tex_cache.c',
-		'lp_tex_sample.c',
+		'lp_tex_sample_c.c',
+		'lp_tex_sample_llvm.c',
 		'lp_texture.c',
 		'lp_tile_cache.c',
 		'lp_tile_soa.c',
@@ -65,7 +78,7 @@ env.Prepend(LIBS = [llvmpipe] + auxiliaries)
 
 env.Program(
     target = 'lp_test_format',
-    source = ['lp_test_format.c'],
+    source = ['lp_test_format.c', 'lp_test_main.c'],
 )
 
 env.Program(
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
index 49c2f911af7..2b4bc5c819d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
@@ -45,7 +45,7 @@
 void
 lp_build_alpha_test(LLVMBuilderRef builder,
                     const struct pipe_alpha_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
                     LLVMValueRef ref)
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
index 9dbcdb4daab..634575670db 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -38,14 +38,14 @@
 #include <llvm-c/Core.h>  
 
 struct pipe_alpha_state;
-union lp_type;
+struct lp_type;
 struct lp_build_mask_context;
 
 
 void
 lp_build_alpha_test(LLVMBuilderRef builder,
                     const struct pipe_alpha_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
                     LLVMValueRef ref);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
index 09a57ff33d5..9c59677a741 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -47,12 +47,16 @@
 
 #include "util/u_memory.h"
 #include "util/u_debug.h"
+#include "util/u_math.h"
 #include "util/u_string.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
 #include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_debug.h"
 #include "lp_bld_arit.h"
 
 
@@ -65,36 +69,34 @@ lp_build_min_simple(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
    /* TODO: optimize the constant case */
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating) {
-         if(type.width == 32)
+         if(type.width == 32 && util_cpu_caps.has_sse)
             intrinsic = "llvm.x86.sse.min.ps";
-         if(type.width == 64)
+         if(type.width == 64 && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.min.pd";
       }
       else {
-         if(type.width == 8 && !type.sign)
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pminu.b";
-         if(type.width == 8 && type.sign)
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminsb";
-         if(type.width == 16 && !type.sign)
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminuw";
-         if(type.width == 16 && type.sign)
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmins.w";
-         if(type.width == 32 && !type.sign)
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminud";
-         if(type.width == 32 && type.sign)
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminsd";
       }
    }
-#endif
 
    if(intrinsic)
       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -113,36 +115,34 @@ lp_build_max_simple(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
    /* TODO: optimize the constant case */
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating) {
-         if(type.width == 32)
+         if(type.width == 32 && util_cpu_caps.has_sse)
             intrinsic = "llvm.x86.sse.max.ps";
-         if(type.width == 64)
+         if(type.width == 64 && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.max.pd";
       }
       else {
-         if(type.width == 8 && !type.sign)
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxu.b";
-         if(type.width == 8 && type.sign)
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsb";
-         if(type.width == 16 && !type.sign)
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxuw";
-         if(type.width == 16 && type.sign)
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxs.w";
-         if(type.width == 32 && !type.sign)
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxud";
-         if(type.width == 32 && type.sign)
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsd";
       }
    }
-#endif
 
    if(intrinsic)
       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -159,7 +159,7 @@ LLVMValueRef
 lp_build_comp(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->one)
       return bld->zero;
@@ -188,7 +188,7 @@ lp_build_add(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res;
 
    if(a == bld->zero)
@@ -204,15 +204,14 @@ lp_build_add(struct lp_build_context *bld,
       if(a == bld->one || b == bld->one)
         return bld->one;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -241,7 +240,7 @@ lp_build_sub(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res;
 
    if(b == bld->zero)
@@ -257,15 +256,14 @@ lp_build_sub(struct lp_build_context *bld,
       if(b == bld->one)
         return bld->zero;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -284,45 +282,6 @@ lp_build_sub(struct lp_build_context *bld,
 
 
 /**
- * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
- */
-static LLVMValueRef 
-lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i, j;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-   assert(lo_hi < 2);
-
-   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
-      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
-      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
-   }
-
-   return LLVMConstVector(elems, n);
-}
-
-
-/**
- * Build constant int vector of width 'n' and value 'c'.
- */
-static LLVMValueRef 
-lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-
-   for(i = 0; i < n; ++i)
-      elems[i] = LLVMConstInt(type, c, 0);
-
-   return LLVMConstVector(elems, n);
-}
-
-
-/**
  * Normalized 8bit multiplication.
  *
  * - alpha plus one
@@ -365,33 +324,30 @@ lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
  */
 static LLVMValueRef
 lp_build_mul_u8n(LLVMBuilderRef builder,
+                 struct lp_type i16_type,
                  LLVMValueRef a, LLVMValueRef b)
 {
-   static LLVMValueRef c01 = NULL;
-   static LLVMValueRef c08 = NULL;
-   static LLVMValueRef c80 = NULL;
+   LLVMValueRef c8;
    LLVMValueRef ab;
 
-   if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
-   if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
-   if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
+   c8 = lp_build_int_const_scalar(i16_type, 8);
    
 #if 0
    
    /* a*b/255 ~= (a*(b + 1)) >> 256 */
-   b = LLVMBuildAdd(builder, b, c01, "");
+   b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
    ab = LLVMBuildMul(builder, a, b, "");
 
 #else
    
-   /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
+   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
    ab = LLVMBuildMul(builder, a, b, "");
-   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
-   ab = LLVMBuildAdd(builder, ab, c80, "");
+   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
+   ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
 
 #endif
    
-   ab = LLVMBuildLShr(builder, ab, c08, "");
+   ab = LLVMBuildLShr(builder, ab, c8, "");
 
    return ab;
 }
@@ -405,7 +361,9 @@ lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
+   LLVMValueRef shift;
+   LLVMValueRef res;
 
    if(a == bld->zero)
       return bld->zero;
@@ -419,53 +377,104 @@ lp_build_mul(struct lp_build_context *bld,
       return bld->undef;
 
    if(!type.floating && !type.fixed && type.norm) {
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width == 8 && type.length == 16) {
-         LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
-         LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
-         static LLVMValueRef ml = NULL;
-         static LLVMValueRef mh = NULL;
-         LLVMValueRef al, ah, bl, bh;
-         LLVMValueRef abl, abh;
-         LLVMValueRef ab;
-         
-         if(!ml) ml = lp_build_unpack_shuffle(16, 0);
-         if(!mh) mh = lp_build_unpack_shuffle(16, 1);
-
-         /*  PUNPCKLBW, PUNPCKHBW */
-         al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
-         bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
-         ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
-         bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");
+      if(type.width == 8) {
+         struct lp_type i16_type = lp_wider_type(type);
+         LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 
-         /* NOP */
-         al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
-         bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
-         ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
-         bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");
+         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
+         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
 
          /* PMULLW, PSRLW, PADDW */
-         abl = lp_build_mul_u8n(bld->builder, al, bl);
-         abh = lp_build_mul_u8n(bld->builder, ah, bh);
+         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
+         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
 
-         /* PACKUSWB */
-         ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , i16x8, abl, abh);
-
-         /* NOP */
-         ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");
+         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
          
          return ab;
       }
-#endif
 
       /* FIXME */
       assert(0);
    }
 
-   if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      return LLVMConstMul(a, b);
+   if(type.fixed)
+      shift = lp_build_int_const_scalar(type, type.width/2);
+   else
+      shift = NULL;
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
+      res =  LLVMConstMul(a, b);
+      if(shift) {
+         if(type.sign)
+            res = LLVMConstAShr(res, shift);
+         else
+            res = LLVMConstLShr(res, shift);
+      }
+   }
+   else {
+      res = LLVMBuildMul(bld->builder, a, b, "");
+      if(shift) {
+         if(type.sign)
+            res = LLVMBuildAShr(bld->builder, res, shift, "");
+         else
+            res = LLVMBuildLShr(bld->builder, res, shift, "");
+      }
+   }
+
+   return res;
+}
+
+
+/**
+ * Small vector x scale multiplication optimization.
+ */
+LLVMValueRef
+lp_build_mul_imm(struct lp_build_context *bld,
+                 LLVMValueRef a,
+                 int b)
+{
+   LLVMValueRef factor;
+
+   if(b == 0)
+      return bld->zero;
+
+   if(b == 1)
+      return a;
+
+   if(b == -1)
+      return LLVMBuildNeg(bld->builder, a, "");
 
-   return LLVMBuildMul(bld->builder, a, b, "");
+   if(b == 2 && bld->type.floating)
+      return lp_build_add(bld, a, a);
+
+   if(util_is_pot(b)) {
+      unsigned shift = ffs(b) - 1;
+
+      if(bld->type.floating) {
+#if 0
+         /*
+          * Power of two multiplication by directly manipulating the mantissa.
+          *
+          * XXX: This might not be always faster, it will introduce a small error
+          * for multiplication by zero, and it will produce wrong results
+          * for Inf and NaN.
+          */
+         unsigned mantissa = lp_mantissa(bld->type);
+         factor = lp_build_int_const_scalar(bld->type, (unsigned long long)shift << mantissa);
+         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
+         a = LLVMBuildAdd(bld->builder, a, factor, "");
+         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
+         return a;
+#endif
+      }
+      else {
+         factor = lp_build_const_scalar(bld->type, shift);
+         return LLVMBuildShl(bld->builder, a, factor, "");
+      }
+   }
+
+   factor = lp_build_const_scalar(bld->type, (double)b);
+   return lp_build_mul(bld, a, factor);
 }
 
 
@@ -477,7 +486,7 @@ lp_build_div(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->zero)
       return bld->zero;
@@ -493,16 +502,62 @@ lp_build_div(struct lp_build_context *bld,
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
       return LLVMConstFDiv(a, b);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
-#endif
 
    return LLVMBuildFDiv(bld->builder, a, b, "");
 }
 
 
 /**
+ * Linear interpolation.
+ *
+ * This also works for integer values with a few caveats.
+ *
+ * @sa http://www.stereopsis.com/doubleblend.html
+ */
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1)
+{
+   LLVMValueRef delta;
+   LLVMValueRef res;
+
+   delta = lp_build_sub(bld, v1, v0);
+
+   res = lp_build_mul(bld, x, delta);
+
+   res = lp_build_add(bld, v0, res);
+
+   if(bld->type.fixed)
+      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
+       * but it will be wrong for other uses. Basically we need a more
+       * powerful lp_type, capable of further distinguishing the values
+       * interpretation from the value storage. */
+      res = LLVMBuildAnd(bld->builder, res, lp_build_int_const_scalar(bld->type, (1 << bld->type.width/2) - 1), "");
+
+   return res;
+}
+
+
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11)
+{
+   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
+   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
+   return lp_build_lerp(bld, y, v0, v1);
+}
+
+
+/**
  * Generate min(a, b)
  * Do checks for special cases.
  */
@@ -565,33 +620,333 @@ LLVMValueRef
 lp_build_abs(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
 
    if(!type.sign)
       return a;
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(!type.floating && type.width*type.length == 128) {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      if(type.width == 8)
+   if(type.floating) {
+      /* Clear the sign bit (the MSB): AND with a mask of all the bits below it */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << (type.width - 1)) - 1);
+      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      a = LLVMBuildAnd(bld->builder, a, mask, "");
+      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+      return a;
+   }
+
+   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
+      switch(type.width) {
+      case 8:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
-      if(type.width == 16)
+      case 16:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
-      if(type.width == 32)
+      case 32:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
+      }
    }
-#endif
 
    return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
 }
 
 
 LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMValueRef cond;
+   LLVMValueRef res;
+
+   /* First compute the result for non-zero inputs (+1 or -1) into res */
+   if(!type.sign) {
+      /* if not zero then sign must be positive */
+      res = bld->one;
+   }
+   else if(type.floating) {
+      /* OR the sign bit of a into the bit pattern of the 1 constant */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef one;
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      one = LLVMConstBitCast(bld->one, int_vec_type);
+      res = LLVMBuildOr(bld->builder, sign, one, "");
+      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+   }
+   else
+   {
+      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
+      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
+      res = lp_build_select(bld, cond, bld->one, minus_one);
+   }
+
+   /* Handle zero: zero maps to zero, anything else keeps the res computed above */
+   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
+   res = lp_build_select(bld, cond, bld->zero, res);
+
+   return res;
+}
+
+
+enum lp_build_round_sse41_mode
+{
+   LP_BUILD_ROUND_SSE41_NEAREST = 0,
+   LP_BUILD_ROUND_SSE41_FLOOR = 1,
+   LP_BUILD_ROUND_SSE41_CEIL = 2,
+   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
+};
+
+
+static INLINE LLVMValueRef
+lp_build_round_sse41(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     enum lp_build_round_sse41_mode mode)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   const char *intrinsic;
+
+   assert(type.floating);
+   assert(type.width*type.length == 128);
+   assert(lp_check_value(type, a));
+   assert(util_cpu_caps.has_sse4_1);
+
+   switch(type.width) {
+   case 32:
+      intrinsic = "llvm.x86.sse41.round.ps";
+      break;
+   case 64:
+      intrinsic = "llvm.x86.sse41.round.pd";
+      break;
+   default:
+      assert(0);
+      return bld->undef;
+   }
+
+   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
+                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
+}
+
+
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef res;
+      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
+
+
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_iround(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
+
+
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_ifloor(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
+
+
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_iceil(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
+
+
+/**
+ * Convert to integer, through whichever rounding method that's fastest,
+ * typically truncating to zero.
+ */
+LLVMValueRef
+lp_build_itrunc(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+}
+
+
+LLVMValueRef
+lp_build_iround(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   }
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef half;
+
+      /* get sign bit */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+
+      /* sign * 0.5 */
+      half = lp_build_const_scalar(type, 0.5);
+      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
+      half = LLVMBuildOr(bld->builder, sign, half, "");
+      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
+
+      res = LLVMBuildAdd(bld->builder, a, half, "");
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+
+   return res;
+}
+
+
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   }
+   else {
+      /* Add an offset of almost -1 to negative inputs before the float->int conversion below */
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      unsigned mantissa = lp_mantissa(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef offset;
+
+      /* sign = a < 0 ? ~0 : 0 */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
+
+      /* offset = -0.99999(9)f */
+      offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = LLVMConstBitCast(offset, int_vec_type);
+
+      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
+      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
+
+      res = LLVMBuildAdd(bld->builder, a, offset, "");
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+
+   return res;
+}
+
+
+LLVMValueRef
+lp_build_iceil(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   }
+   else {
+      assert(0);
+      res = bld->undef;
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+
+   return res;
+}
+
+
+LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
@@ -609,7 +964,7 @@ LLVMValueRef
 lp_build_rcp(struct lp_build_context *bld,
              LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    if(a == bld->zero)
       return bld->undef;
@@ -623,11 +978,9 @@ lp_build_rcp(struct lp_build_context *bld,
    if(LLVMIsConstant(a))
       return LLVMConstFDiv(bld->one, a);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
+      /* FIXME: improve precision */
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
-#endif
 
    return LLVMBuildFDiv(bld->builder, bld->one, a, "");
 }
@@ -640,15 +993,12 @@ LLVMValueRef
 lp_build_rsqrt(struct lp_build_context *bld,
                LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
 
    assert(type.floating);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
-#endif
 
    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }
@@ -661,7 +1011,7 @@ LLVMValueRef
 lp_build_cos(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
@@ -681,7 +1031,7 @@ LLVMValueRef
 lp_build_sin(struct lp_build_context *bld,
               LLVMValueRef a)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
@@ -704,7 +1054,8 @@ lp_build_pow(struct lp_build_context *bld,
 {
    /* TODO: optimize the constant case */
    if(LLVMIsConstant(x) && LLVMIsConstant(y))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
 
    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
 }
@@ -752,13 +1103,14 @@ lp_build_polynomial(struct lp_build_context *bld,
                     const double *coeffs,
                     unsigned num_coeffs)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res = NULL;
    unsigned i;
 
    /* TODO: optimize the constant case */
    if(LLVMIsConstant(x))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
 
    for (i = num_coeffs; i--; ) {
       LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
@@ -800,7 +1152,7 @@ lp_build_exp2_approx(struct lp_build_context *bld,
                      LLVMValueRef *p_frac_part,
                      LLVMValueRef *p_exp2)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMValueRef ipart = NULL;
@@ -812,7 +1164,8 @@ lp_build_exp2_approx(struct lp_build_context *bld,
    if(p_exp2_int_part || p_frac_part || p_exp2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
 
       assert(type.floating && type.width == 32);
 
@@ -893,7 +1246,7 @@ lp_build_log2_approx(struct lp_build_context *bld,
                      LLVMValueRef *p_floor_log2,
                      LLVMValueRef *p_log2)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 
@@ -911,7 +1264,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
    if(p_exp || p_floor_log2 || p_log2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
 
       assert(type.floating && type.width == 32);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.h b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
index fc8cb25966e..62be4b9aee1 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
@@ -40,7 +40,7 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type type;
+struct lp_type;
 struct lp_build_context;
 
 
@@ -67,11 +67,36 @@ lp_build_mul(struct lp_build_context *bld,
              LLVMValueRef b);
 
 LLVMValueRef
+lp_build_mul_imm(struct lp_build_context *bld,
+                 LLVMValueRef a,
+                 int b);
+
+LLVMValueRef
 lp_build_div(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
 
 LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1);
+
+/**
+ * Bilinear interpolation.
+ *
+ * Value indices are in v_{yx} order, e.g. v01 is the value at y=0, x=1.
+ */
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11);
+
+LLVMValueRef
 lp_build_min(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
@@ -86,6 +111,41 @@ lp_build_abs(struct lp_build_context *bld,
              LLVMValueRef a);
 
 LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a);
+LLVMValueRef
+lp_build_iceil(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_iround(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
+lp_build_itrunc(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
               LLVMValueRef a);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
index d19e18846c2..da272e549f3 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -46,7 +46,7 @@
 
 
 struct pipe_blend_state;
-union lp_type;
+struct lp_type;
 struct lp_build_context;
 
 
@@ -74,7 +74,7 @@ lp_build_blend_func(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_blend_aos(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src,
                    LLVMValueRef dst,
                    LLVMValueRef const_,
@@ -84,7 +84,7 @@ lp_build_blend_aos(LLVMBuilderRef builder,
 void
 lp_build_blend_soa(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src[4],
                    LLVMValueRef dst[4],
                    LLVMValueRef const_[4],
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index c11a9398f87..d14f468ba93 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -303,7 +303,7 @@ lp_build_blend_func(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_blend_aos(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src,
                    LLVMValueRef dst,
                    LLVMValueRef const_,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
index b92254a7d6f..9511299d558 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
@@ -199,7 +199,7 @@ lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
 void
 lp_build_blend_soa(LLVMBuilderRef builder,
                    const struct pipe_blend_state *blend,
-                   union lp_type type,
+                   struct lp_type type,
                    LLVMValueRef src[4],
                    LLVMValueRef dst[4],
                    LLVMValueRef con[4],
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.c b/src/gallium/drivers/llvmpipe/lp_bld_const.c
index 21487365eae..c8eaa8c3940 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_const.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_const.c
@@ -42,7 +42,7 @@
 
 
 unsigned
-lp_mantissa(union lp_type type)
+lp_mantissa(struct lp_type type)
 {
    assert(type.floating);
 
@@ -72,7 +72,7 @@ lp_mantissa(union lp_type type)
  * Same as lp_const_scale(), but in terms of shifts.
  */
 unsigned
-lp_const_shift(union lp_type type)
+lp_const_shift(struct lp_type type)
 {
    if(type.floating)
       return 0;
@@ -86,7 +86,7 @@ lp_const_shift(union lp_type type)
 
 
 unsigned
-lp_const_offset(union lp_type type)
+lp_const_offset(struct lp_type type)
 {
    if(type.floating || type.fixed)
       return 0;
@@ -104,7 +104,7 @@ lp_const_offset(union lp_type type)
  * else for the fixed points types and normalized integers.
  */
 double
-lp_const_scale(union lp_type type)
+lp_const_scale(struct lp_type type)
 {
    unsigned long long llscale;
    double dscale;
@@ -122,7 +122,7 @@ lp_const_scale(union lp_type type)
  * Minimum value representable by the type.
  */
 double
-lp_const_min(union lp_type type)
+lp_const_min(struct lp_type type)
 {
    unsigned bits;
 
@@ -158,7 +158,7 @@ lp_const_min(union lp_type type)
  * Maximum value representable by the type.
  */
 double
-lp_const_max(union lp_type type)
+lp_const_max(struct lp_type type)
 {
    unsigned bits;
 
@@ -190,7 +190,7 @@ lp_const_max(union lp_type type)
 
 
 double
-lp_const_eps(union lp_type type)
+lp_const_eps(struct lp_type type)
 {
    if (type.floating) {
       switch(type.width) {
@@ -211,7 +211,7 @@ lp_const_eps(union lp_type type)
 
 
 LLVMValueRef
-lp_build_undef(union lp_type type)
+lp_build_undef(struct lp_type type)
 {
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    return LLVMGetUndef(vec_type);
@@ -219,7 +219,7 @@ lp_build_undef(union lp_type type)
                
 
 LLVMValueRef
-lp_build_zero(union lp_type type)
+lp_build_zero(struct lp_type type)
 {
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    return LLVMConstNull(vec_type);
@@ -227,7 +227,7 @@ lp_build_zero(union lp_type type)
                
 
 LLVMValueRef
-lp_build_one(union lp_type type)
+lp_build_one(struct lp_type type)
 {
    LLVMTypeRef elem_type;
    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
@@ -269,7 +269,7 @@ lp_build_one(union lp_type type)
                
 
 LLVMValueRef
-lp_build_const_scalar(union lp_type type,
+lp_build_const_scalar(struct lp_type type,
                       double val)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
@@ -295,7 +295,7 @@ lp_build_const_scalar(union lp_type type,
 
 
 LLVMValueRef
-lp_build_int_const_scalar(union lp_type type,
+lp_build_int_const_scalar(struct lp_type type,
                           long long val)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
@@ -312,7 +312,7 @@ lp_build_int_const_scalar(union lp_type type,
 
 
 LLVMValueRef
-lp_build_const_aos(union lp_type type, 
+lp_build_const_aos(struct lp_type type, 
                    double r, double g, double b, double a, 
                    const unsigned char *swizzle)
 {
@@ -352,8 +352,8 @@ lp_build_const_aos(union lp_type type,
 
 
 LLVMValueRef
-lp_build_const_mask_aos(union lp_type type,
-                        boolean cond[4])
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4])
 {
    LLVMTypeRef elem_type = LLVMIntType(type.width);
    LLVMValueRef masks[LP_MAX_VECTOR_LENGTH];
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_const.h b/src/gallium/drivers/llvmpipe/lp_bld_const.h
index 1934530ea3c..cb8e1c7b006 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_const.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_const.h
@@ -42,67 +42,67 @@
 #include <pipe/p_compiler.h>
 
 
-union lp_type type;
+struct lp_type;
 
 
 unsigned
-lp_mantissa(union lp_type type);
+lp_mantissa(struct lp_type type);
 
 
 unsigned
-lp_const_shift(union lp_type type);
+lp_const_shift(struct lp_type type);
 
 
 unsigned
-lp_const_offset(union lp_type type);
+lp_const_offset(struct lp_type type);
 
 
 double
-lp_const_scale(union lp_type type);
+lp_const_scale(struct lp_type type);
 
 double
-lp_const_min(union lp_type type);
+lp_const_min(struct lp_type type);
 
 
 double
-lp_const_max(union lp_type type);
+lp_const_max(struct lp_type type);
 
 
 double
-lp_const_eps(union lp_type type);
+lp_const_eps(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_undef(union lp_type type);
+lp_build_undef(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_zero(union lp_type type);
+lp_build_zero(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_one(union lp_type type);
+lp_build_one(struct lp_type type);
 
 
 LLVMValueRef
-lp_build_const_scalar(union lp_type type,
+lp_build_const_scalar(struct lp_type type,
                       double val);
 
 
 LLVMValueRef
-lp_build_int_const_scalar(union lp_type type,
+lp_build_int_const_scalar(struct lp_type type,
                           long long val);
 
 
 LLVMValueRef
-lp_build_const_aos(union lp_type type, 
+lp_build_const_aos(struct lp_type type, 
                    double r, double g, double b, double a, 
                    const unsigned char *swizzle);
 
 
 LLVMValueRef
-lp_build_const_mask_aos(union lp_type type,
-                        boolean cond[4]);
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4]);
 
 
 #endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.c b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
index c8954c8a34f..99352094379 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
@@ -63,11 +63,13 @@
 
 #include "util/u_debug.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
 #include "lp_bld_conv.h"
 
 
@@ -86,7 +88,7 @@
  */
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
-                                        union lp_type src_type,
+                                        struct lp_type src_type,
                                         unsigned dst_width,
                                         LLVMValueRef src)
 {
@@ -122,7 +124,7 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
       int shift = dst_width - n;
       res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");
 
-      /* Fill in the empty lower bits for added precision? */
+      /* TODO: Fill in the empty lower bits for additional precision? */
 #if 0
       {
          LLVMValueRef msb;
@@ -152,7 +154,7 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
                                 unsigned src_width,
-                                union lp_type dst_type,
+                                struct lp_type dst_type,
                                 LLVMValueRef src)
 {
    LLVMTypeRef vec_type = lp_build_vec_type(dst_type);
@@ -198,243 +200,6 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
 
 /**
- * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
- */
-static LLVMValueRef
-lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i, j;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-   assert(lo_hi < 2);
-
-   /* TODO: cache results in a static table */
-
-   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
-      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
-      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
-   }
-
-   return LLVMConstVector(elems, n);
-}
-
-
-/**
- * Build shuffle vectors that match PACKxx instructions.
- */
-static LLVMValueRef
-lp_build_const_pack_shuffle(unsigned n)
-{
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(n <= LP_MAX_VECTOR_LENGTH);
-
-   /* TODO: cache results in a static table */
-
-   for(i = 0; i < n; ++i)
-      elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0);
-
-   return LLVMConstVector(elems, n);
-}
-
-
-/**
- * Expand the bit width.
- *
- * This will only change the number of bits the values are represented, not the
- * values themselved.
- */
-static void
-lp_build_expand(LLVMBuilderRef builder,
-               union lp_type src_type,
-               union lp_type dst_type,
-               LLVMValueRef src,
-               LLVMValueRef *dst, unsigned num_dsts)
-{
-   unsigned num_tmps;
-   unsigned i;
-
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not loose or gain channels. Only precision */
-   assert(src_type.length == dst_type.length * num_dsts);
-
-   num_tmps = 1;
-   dst[0] = src;
-
-   while(src_type.width < dst_type.width) {
-      union lp_type new_type = src_type;
-      LLVMTypeRef new_vec_type;
-
-      new_type.width *= 2;
-      new_type.length /= 2;
-      new_vec_type = lp_build_vec_type(new_type);
-
-      for(i = num_tmps; i--; ) {
-         LLVMValueRef zero;
-         LLVMValueRef shuffle_lo;
-         LLVMValueRef shuffle_hi;
-         LLVMValueRef lo;
-         LLVMValueRef hi;
-
-         zero = lp_build_zero(src_type);
-         shuffle_lo = lp_build_const_unpack_shuffle(src_type.length, 0);
-         shuffle_hi = lp_build_const_unpack_shuffle(src_type.length, 1);
-
-         /*  PUNPCKLBW, PUNPCKHBW */
-         lo = LLVMBuildShuffleVector(builder, dst[i], zero, shuffle_lo, "");
-         hi = LLVMBuildShuffleVector(builder, dst[i], zero, shuffle_hi, "");
-
-         dst[2*i + 0] = LLVMBuildBitCast(builder, lo, new_vec_type, "");
-         dst[2*i + 1] = LLVMBuildBitCast(builder, hi, new_vec_type, "");
-      }
-
-      src_type = new_type;
-
-      num_tmps *= 2;
-   }
-
-   assert(num_tmps == num_dsts);
-}
-
-
-/**
- * Non-interleaved pack.
- *
- * This will move values as
- *
- *   lo =   __ l0 __ l1 __ l2 __..  __ ln
- *   hi =   __ h0 __ h1 __ h2 __..  __ hn
- *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
- *
- * TODO: handle saturation consistently.
- */
-static LLVMValueRef
-lp_build_pack2(LLVMBuilderRef builder,
-               union lp_type src_type,
-               union lp_type dst_type,
-               boolean clamped,
-               LLVMValueRef lo,
-               LLVMValueRef hi)
-{
-   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
-   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
-   LLVMValueRef shuffle;
-   LLVMValueRef res;
-
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not loose or gain channels. Only precision */
-   assert(src_type.length * 2 == dst_type.length);
-
-   assert(!src_type.floating);
-   assert(!dst_type.floating);
-
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(src_type.width * src_type.length == 128) {
-      /* All X86 non-interleaved pack instructions all take signed inputs and
-       * saturate them, so saturate beforehand. */
-      if(!src_type.sign && !clamped) {
-         struct lp_build_context bld;
-         unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
-         LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
-         lp_build_context_init(&bld, builder, src_type);
-         lo = lp_build_min(&bld, lo, dst_max);
-         hi = lp_build_min(&bld, hi, dst_max);
-      }
-
-      switch(src_type.width) {
-      case 32:
-         if(dst_type.sign)
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
-         else
-            /* PACKUSDW is the only instrinsic with a consistent signature */
-            return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
-         break;
-
-      case 16:
-         if(dst_type.sign)
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
-         else
-            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
-         break;
-
-      default:
-         assert(0);
-         return LLVMGetUndef(dst_vec_type);
-         break;
-      }
-
-      res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
-      return res;
-   }
-#endif
-
-   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
-   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
-
-   shuffle = lp_build_const_pack_shuffle(dst_type.length);
-
-   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
-
-   return res;
-}
-
-
-/**
- * Truncate the bit width.
- *
- * TODO: Handle saturation consistently.
- */
-static LLVMValueRef
-lp_build_trunc(LLVMBuilderRef builder,
-               union lp_type src_type,
-               union lp_type dst_type,
-               boolean clamped,
-               const LLVMValueRef *src, unsigned num_srcs)
-{
-   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
-   /* We must not loose or gain channels. Only precision */
-   assert(src_type.length * num_srcs == dst_type.length);
-
-   for(i = 0; i < num_srcs; ++i)
-      tmp[i] = src[i];
-
-   while(src_type.width > dst_type.width) {
-      union lp_type new_type = src_type;
-
-      new_type.width /= 2;
-      new_type.length *= 2;
-
-      /* Take in consideration the sign changes only in the last step */
-      if(new_type.width == dst_type.width)
-         new_type.sign = dst_type.sign;
-
-      num_srcs /= 2;
-
-      for(i = 0; i < num_srcs; ++i)
-         tmp[i] = lp_build_pack2(builder, src_type, new_type, clamped,
-                                 tmp[2*i + 0], tmp[2*i + 1]);
-
-      src_type = new_type;
-   }
-
-   assert(num_srcs == 1);
-
-   return tmp[0];
-}
-
-
-/**
  * Generic type conversion.
  *
  * TODO: Take a precision argument, or even better, add a new precision member
@@ -442,12 +207,12 @@ lp_build_trunc(LLVMBuilderRef builder,
  */
 void
 lp_build_conv(LLVMBuilderRef builder,
-              union lp_type src_type,
-              union lp_type dst_type,
+              struct lp_type src_type,
+              struct lp_type dst_type,
               const LLVMValueRef *src, unsigned num_srcs,
               LLVMValueRef *dst, unsigned num_dsts)
 {
-   union lp_type tmp_type;
+   struct lp_type tmp_type;
    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
    unsigned num_tmps;
    unsigned i;
@@ -470,7 +235,7 @@ lp_build_conv(LLVMBuilderRef builder,
     * Clamp if necessary
     */
 
-   if(src_type.value != dst_type.value) {
+   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
       struct lp_build_context bld;
       double src_min = lp_const_min(src_type);
       double dst_min = lp_const_min(dst_type);
@@ -565,7 +330,7 @@ lp_build_conv(LLVMBuilderRef builder,
 
    if(tmp_type.width > dst_type.width) {
       assert(num_dsts == 1);
-      tmp[0] = lp_build_trunc(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
+      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
       tmp_type.width = dst_type.width;
       tmp_type.length = dst_type.length;
       num_tmps = 1;
@@ -573,7 +338,7 @@ lp_build_conv(LLVMBuilderRef builder,
 
    if(tmp_type.width < dst_type.width) {
       assert(num_tmps == 1);
-      lp_build_expand(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
+      lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
       tmp_type.width = dst_type.width;
       tmp_type.length = dst_type.length;
       num_tmps = num_dsts;
@@ -656,8 +421,8 @@ lp_build_conv(LLVMBuilderRef builder,
  */
 void
 lp_build_conv_mask(LLVMBuilderRef builder,
-                   union lp_type src_type,
-                   union lp_type dst_type,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
                    const LLVMValueRef *src, unsigned num_srcs,
                    LLVMValueRef *dst, unsigned num_dsts)
 {
@@ -689,11 +454,11 @@ lp_build_conv_mask(LLVMBuilderRef builder,
 
    if(src_type.width > dst_type.width) {
       assert(num_dsts == 1);
-      dst[0] = lp_build_trunc(builder, src_type, dst_type, TRUE, src, num_srcs);
+      dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
    }
    else if(src_type.width < dst_type.width) {
       assert(num_srcs == 1);
-      lp_build_expand(builder, src_type, dst_type, src[0], dst, num_dsts);
+      lp_build_unpack(builder, src_type, dst_type, src[0], dst, num_dsts);
    }
    else {
       assert(num_srcs == num_dsts);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.h b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
index 05c1ef2a100..948e68fae4f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.h
@@ -40,33 +40,33 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type type;
+struct lp_type;
 
 
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
-                                        union lp_type src_type,
+                                        struct lp_type src_type,
                                         unsigned dst_width,
                                         LLVMValueRef src);
 
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
                                 unsigned src_width,
-                                union lp_type dst_type,
+                                struct lp_type dst_type,
                                 LLVMValueRef src);
 
 
 void
 lp_build_conv(LLVMBuilderRef builder,
-              union lp_type src_type,
-              union lp_type dst_type,
+              struct lp_type src_type,
+              struct lp_type dst_type,
               const LLVMValueRef *srcs, unsigned num_srcs,
               LLVMValueRef *dsts, unsigned num_dsts);
 
 void
 lp_build_conv_mask(LLVMBuilderRef builder,
-                   union lp_type src_type,
-                   union lp_type dst_type,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
                    const LLVMValueRef *src, unsigned num_srcs,
                    LLVMValueRef *dst, unsigned num_dsts);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.c b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
index 30925b5f415..59d8f492e60 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
@@ -30,10 +30,27 @@
 #include <udis86.h>
 #endif
 
+#include "util/u_math.h"
 #include "util/u_debug.h"
 #include "lp_bld_debug.h"
 
 
+/**
+ * Return whether ptr is aligned to alignment, which must be a power of two.
+ *
+ * It is important that this check is not implemented as a macro or inline
+ * function: the compiler's assumptions about the alignment of global and
+ * stack variables would often turn the check into a no-op, defeating the
+ * whole purpose of the exercise.
+ */
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment)
+{
+   assert(util_is_pot(alignment)); /* the mask test below needs a power of two */
+   return ((uintptr_t)ptr & (alignment - 1)) == 0; /* low address bits must be clear */
+}
+
+
 void
 lp_disassemble(const void* func)
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.h b/src/gallium/drivers/llvmpipe/lp_bld_debug.h
index ecdafef76d0..583e6132b4b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.h
@@ -53,6 +53,10 @@ lp_build_name(LLVMValueRef val, const char *format, ...)
 }
 
 
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment);
+
+
 void
 lp_disassemble(const void* func);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 2cd6e6b9217..98ec1cb1b9d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -71,11 +71,11 @@
 /**
  * Return a type appropriate for depth/stencil testing.
  */
-union lp_type
+struct lp_type
 lp_depth_type(const struct util_format_description *format_desc,
               unsigned length)
 {
-   union lp_type type;
+   struct lp_type type;
    unsigned swizzle;
 
    assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
@@ -85,7 +85,7 @@ lp_depth_type(const struct util_format_description *format_desc,
    swizzle = format_desc->swizzle[0];
    assert(swizzle < 4);
 
-   type.value = 0;
+   memset(&type, 0, sizeof type);
    type.width = format_desc->block.bits;
 
    if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
@@ -114,7 +114,7 @@ lp_depth_type(const struct util_format_description *format_desc,
 void
 lp_build_depth_test(LLVMBuilderRef builder,
                     const struct pipe_depth_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     const struct util_format_description *format_desc,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef src,
@@ -179,12 +179,13 @@ lp_build_depth_test(LLVMBuilderRef builder,
       padding_right = 0;
       for(chan = 0; chan < z_swizzle; ++chan)
          padding_right += format_desc->channel[chan].size;
-      padding_left = format_desc->block.bits - format_desc->channel[z_swizzle].size;
+      padding_left = format_desc->block.bits -
+                     (padding_right + format_desc->channel[z_swizzle].size);
 
       if(padding_left || padding_right) {
-         const long long mask_left = ((long long)1 << (format_desc->block.bits - padding_left)) - 1;
-         const long long mask_right = ((long long)1 << (padding_right)) - 1;
-         z_bitmask = lp_build_int_const_scalar(type, mask_left & mask_right);
+         const unsigned long long mask_left = ((unsigned long long)1 << (format_desc->block.bits - padding_left)) - 1;
+         const unsigned long long mask_right = ((unsigned long long)1 << (padding_right)) - 1;
+         z_bitmask = lp_build_int_const_scalar(type, mask_left ^ mask_right);
       }
 
       if(padding_left)
@@ -209,6 +210,4 @@ lp_build_depth_test(LLVMBuilderRef builder,
       dst = lp_build_select(&bld, z_bitmask, src, dst);
       LLVMBuildStore(builder, dst, dst_ptr);
    }
-
-   assert(!state->occlusion_count);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index 5d2e042fcc5..79d6981bb51 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -41,11 +41,11 @@
  
 struct pipe_depth_state;
 struct util_format_description;
-union lp_type;
+struct lp_type;
 struct lp_build_mask_context;
 
 
-union lp_type
+struct lp_type
 lp_depth_type(const struct util_format_description *format_desc,
               unsigned length);
 
@@ -53,7 +53,7 @@ lp_depth_type(const struct util_format_description *format_desc,
 void
 lp_build_depth_test(LLVMBuilderRef builder,
                     const struct pipe_depth_state *state,
-                    union lp_type type,
+                    struct lp_type type,
                     const struct util_format_description *format_desc,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef src,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.c b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
index 9d99e1a9d9f..dcc25fbff86 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
@@ -32,59 +32,261 @@
  */
 
 #include "util/u_debug.h"
+#include "util/u_memory.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_flow.h"
 
 
+#define LP_BUILD_FLOW_MAX_VARIABLES 32
+#define LP_BUILD_FLOW_MAX_DEPTH 32
+
+
+/**
+ * Enumeration of all possible flow constructs.
+ */
+enum lp_build_flow_construct_kind {
+   lP_BUILD_FLOW_SCOPE,
+   LP_BUILD_FLOW_SKIP,
+};
+
+
+/**
+ * Variable declaration scope.
+ */
+struct lp_build_flow_scope
+{
+   /** Number of variables declared in this scope */
+   unsigned num_variables;
+};
+
+
+/**
+ * Early exit. Useful to skip to the end of a function or block when
+ * the execution mask becomes zero or when there is an error condition.
+ */
+struct lp_build_flow_skip
+{
+   /** Block to skip to */
+   LLVMBasicBlockRef block;
+
+   /** Number of variables declared at the beginning */
+   unsigned num_variables;
+
+   LLVMValueRef *phi;
+};
+
+
+/**
+ * Union of all possible flow constructs' data
+ */
+union lp_build_flow_construct_data
+{
+   struct lp_build_flow_scope scope;
+   struct lp_build_flow_skip skip;
+};
+
+
+/**
+ * Element of the flow construct stack.
+ */
+struct lp_build_flow_construct
+{
+   enum lp_build_flow_construct_kind kind;
+   union lp_build_flow_construct_data data;
+};
+
+
+/**
+ * All necessary data to generate LLVM control flow constructs.
+ *
+ * Besides keeping track of the control flow constructs themselves we also
+ * need to keep track of variables in order to generate SSA Phi values.
+ */
+struct lp_build_flow_context
+{
+   LLVMBuilderRef builder;
+
+   /**
+    * Control flow stack.
+    */
+   struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
+   unsigned num_constructs;
+
+   /**
+    * Variable stack
+    */
+   LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES];
+   unsigned num_variables;
+};
+
+
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder)
+{
+   struct lp_build_flow_context *flow;
+
+   flow = CALLOC_STRUCT(lp_build_flow_context);
+   if(!flow)
+      return NULL;
+
+   flow->builder = builder;
+
+   return flow;
+}
+
+
 void
-lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    LLVMBuilderRef builder,
-                    union lp_type type,
-                    LLVMValueRef value)
+lp_build_flow_destroy(struct lp_build_flow_context *flow)
 {
-   memset(mask, 0, sizeof *mask);
+   assert(flow->num_constructs == 0);
+   assert(flow->num_variables == 0);
+   FREE(flow);
+}
 
-   mask->builder = builder;
-   mask->reg_type = LLVMIntType(type.width * type.length);
-   mask->value = value;
+
+static union lp_build_flow_construct_data *
+lp_build_flow_push(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH);
+   if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH)
+      return NULL;
+
+   flow->constructs[flow->num_constructs].kind = kind;
+   return &flow->constructs[flow->num_constructs++].data;
+}
+
+
+static union lp_build_flow_construct_data *
+lp_build_flow_peek(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs);
+   if(!flow->num_constructs)
+      return NULL;
+
+   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
+   if(flow->constructs[flow->num_constructs - 1].kind != kind)
+      return NULL;
+
+   return &flow->constructs[flow->num_constructs - 1].data;
 }
 
 
+static union lp_build_flow_construct_data *
+lp_build_flow_pop(struct lp_build_flow_context *flow,
+                  enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs);
+   if(!flow->num_constructs)
+      return NULL;
+
+   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
+   if(flow->constructs[flow->num_constructs - 1].kind != kind)
+      return NULL;
+
+   return &flow->constructs[--flow->num_constructs].data;
+}
+
+
+/**
+ * Begin a variable scope.
+ *
+ *
+ */
 void
-lp_build_mask_update(struct lp_build_mask_context *mask,
-                     LLVMValueRef value)
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow)
 {
+   struct lp_build_flow_scope *scope;
 
-   LLVMValueRef cond;
-   LLVMBasicBlockRef current_block;
-   LLVMBasicBlockRef next_block;
-   LLVMBasicBlockRef new_block;
+   scope = &lp_build_flow_push(flow, lP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
 
-   if(mask->value)
-      mask->value = LLVMBuildAnd(mask->builder, mask->value, value, "");
-   else
-      mask->value = value;
+   scope->num_variables = 0;
+}
 
-   /* FIXME: disabled until we have proper control flow helpers */
-#if 0
-   cond = LLVMBuildICmp(mask->builder,
-                        LLVMIntEQ,
-                        LLVMBuildBitCast(mask->builder, mask->value, mask->reg_type, ""),
-                        LLVMConstNull(mask->reg_type),
-                        "");
 
-   current_block = LLVMGetInsertBlock(mask->builder);
+/**
+ * Declare a variable.
+ *
+ * A variable is a named entity which can have different LLVMValueRef's at
+ * different points of the program. This is relevant for control flow because
+ * when there are multiple branches to the same location we need to replace
+ * the variable's value with a Phi function as explained in
+ * http://en.wikipedia.org/wiki/Static_single_assignment_form .
+ *
+ * We keep track of variables by keeping around a pointer to where their
+ * current value is stored.
+ *
+ * There are a few cautions to observe:
+ *
+ * - Variable's value must not be NULL. If there is no initial value then
+ *   LLVMGetUndef() should be used.
+ *
+ * - Variable's value must be kept up-to-date. If the variable is going to be
+ *   modified by a function then a pointer should be passed so that its value
+ *   is accurate. Failure to do this will cause some of the variables'
+ *   transient values to be lost, leading to wrong results.
+ *
+ * - A program should be written from top to bottom, by always appending
+ *   instructions to the bottom with a single LLVMBuilderRef. Inserting and/or
+ *   modifying existing statements will most likely lead to wrong results.
+ *
+ */
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable)
+{
+   struct lp_build_flow_scope *scope;
+
+   scope = &lp_build_flow_peek(flow, lP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
 
-   if(!mask->skip_block) {
-      LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
-      mask->skip_block = LLVMAppendBasicBlock(function, "skip");
+   assert(*variable);
+   if(!*variable)
+      return;
+
+   assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES);
+   if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES)
+      return;
+
+   flow->variables[flow->num_variables++] = variable;
+   ++scope->num_variables;
+}
+
+
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_scope *scope;
+
+   scope = &lp_build_flow_pop(flow, lP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
 
-      mask->phi = LLVMBuildPhi(mask->builder, LLVMTypeOf(mask->value), "");
+   assert(flow->num_variables >= scope->num_variables);
+   if(flow->num_variables < scope->num_variables) {
+      flow->num_variables = 0;
+      return;
    }
 
+   flow->num_variables -= scope->num_variables;
+}
+
+
+static LLVMBasicBlockRef
+lp_build_flow_insert_block(struct lp_build_flow_context *flow)
+{
+   LLVMBasicBlockRef current_block;
+   LLVMBasicBlockRef next_block;
+   LLVMBasicBlockRef new_block;
+
+   current_block = LLVMGetInsertBlock(flow->builder);
+
    next_block = LLVMGetNextBasicBlock(current_block);
-   assert(next_block);
    if(next_block) {
       new_block = LLVMInsertBasicBlock(next_block, "");
    }
@@ -93,30 +295,148 @@ lp_build_mask_update(struct lp_build_mask_context *mask,
       new_block = LLVMAppendBasicBlock(function, "");
    }
 
-   LLVMAddIncoming(mask->phi, &mask->value, &current_block, 1);
-   LLVMBuildCondBr(mask->builder, cond, mask->skip_block, new_block);
+   return new_block;
+}
+
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_skip *skip;
+   LLVMBuilderRef builder;
+   unsigned i;
+
+   skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
+
+   skip->block = lp_build_flow_insert_block(flow);
+   skip->num_variables = flow->num_variables;
+   if(!skip->num_variables) {
+      skip->phi = NULL;
+      return;
+   }
 
-   LLVMPositionBuilderAtEnd(mask->builder, new_block);
-#endif
+   skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi);
+   if(!skip->phi) {
+      skip->num_variables = 0;
+      return;
+   }
+
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, skip->block);
+
+   for(i = 0; i < skip->num_variables; ++i)
+      skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
+
+   LLVMDisposeBuilder(builder);
 }
 
 
-LLVMValueRef
-lp_build_mask_end(struct lp_build_mask_context *mask)
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond)
+{
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   LLVMBasicBlockRef new_block;
+   unsigned i;
+
+   skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
+
+   current_block = LLVMGetInsertBlock(flow->builder);
+
+   new_block = lp_build_flow_insert_block(flow);
+
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+   }
+
+   LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
+
+   LLVMPositionBuilderAtEnd(flow->builder, new_block);
+ }
+
+
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow)
 {
-   if(mask->skip_block) {
-      LLVMBasicBlockRef current_block = LLVMGetInsertBlock(mask->builder);
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   unsigned i;
 
-      LLVMAddIncoming(mask->phi, &mask->value, &current_block, 1);
-      LLVMBuildBr(mask->builder, mask->skip_block);
+   skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
 
-      LLVMPositionBuilderAtEnd(mask->builder, mask->skip_block);
+   current_block = LLVMGetInsertBlock(flow->builder);
 
-      mask->value = mask->phi;
-      mask->phi = NULL;
-      mask->skip_block = NULL;
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+      *flow->variables[i] = skip->phi[i];
    }
 
+   LLVMBuildBr(flow->builder, skip->block);
+   LLVMPositionBuilderAtEnd(flow->builder, skip->block);
+
+   FREE(skip->phi);
+}
+
+
+static void
+lp_build_mask_check(struct lp_build_mask_context *mask)
+{
+   LLVMBuilderRef builder = mask->flow->builder;
+   LLVMValueRef cond;
+
+   cond = LLVMBuildICmp(builder,
+                        LLVMIntEQ,
+                        LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""),
+                        LLVMConstNull(mask->reg_type),
+                        "");
+
+   lp_build_flow_skip_cond_break(mask->flow, cond);
+}
+
+
+void
+lp_build_mask_begin(struct lp_build_mask_context *mask,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
+                    LLVMValueRef value)
+{
+   memset(mask, 0, sizeof *mask);
+
+   mask->flow = flow;
+   mask->reg_type = LLVMIntType(type.width * type.length);
+   mask->value = value;
+
+   lp_build_flow_scope_begin(flow);
+   lp_build_flow_scope_declare(flow, &mask->value);
+   lp_build_flow_skip_begin(flow);
+
+   lp_build_mask_check(mask);
+}
+
+
+void
+lp_build_mask_update(struct lp_build_mask_context *mask,
+                     LLVMValueRef value)
+{
+   mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, "");
+
+   lp_build_mask_check(mask);
+}
+
+
+LLVMValueRef
+lp_build_mask_end(struct lp_build_mask_context *mask)
+{
+   lp_build_flow_skip_end(mask->flow);
+   lp_build_flow_scope_end(mask->flow);
    return mask->value;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.h b/src/gallium/drivers/llvmpipe/lp_bld_flow.h
index 1b634ff038d..e61999ff06b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.h
@@ -38,27 +38,53 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type;
+struct lp_type;
+
+
+struct lp_build_flow_context;
+
+
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder);
+
+void
+lp_build_flow_destroy(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable);
+
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond);
+
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow);
 
 
 struct lp_build_mask_context
 {
-   LLVMBuilderRef builder;
+   struct lp_build_flow_context *flow;
 
    LLVMTypeRef reg_type;
 
    LLVMValueRef value;
-
-   LLVMValueRef phi;
-
-   LLVMBasicBlockRef skip_block;
 };
 
 
 void
 lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    LLVMBuilderRef builder,
-                    union lp_type type,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
                     LLVMValueRef value);
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format.h b/src/gallium/drivers/llvmpipe/lp_bld_format.h
index 01c8a752d18..970bee379f5 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format.h
@@ -25,77 +25,59 @@
  *
  **************************************************************************/
 
-#ifndef LP_BLD_H
-#define LP_BLD_H
+#ifndef LP_BLD_FORMAT_H
+#define LP_BLD_FORMAT_H
 
 
 /**
  * @file
- * LLVM IR building helpers interfaces.
- *
- * We use LLVM-C bindings for now. They are not documented, but follow the C++
- * interfaces very closely, and appear to be complete enough for code
- * genration. See
- * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
- * for a standalone example.
+ * Pixel format helpers.
  */
 
 #include <llvm-c/Core.h>  
- 
+
 #include "pipe/p_format.h"
 
+struct util_format_description;
+struct lp_type;
 
-union lp_type;
+
+boolean
+lp_format_is_rgba8(const struct util_format_description *desc);
+
+
+void
+lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
+                            struct lp_type type,
+                            const LLVMValueRef *unswizzled,
+                            LLVMValueRef *swizzled);
 
 
-/**
- * Unpack a pixel into its RGBA components.
- *
- * @param packed integer.
- *
- * @return RGBA in a 4 floats vector.
- */
 LLVMValueRef
-lp_build_unpack_rgba(LLVMBuilderRef builder,
-                     enum pipe_format format, 
-                     LLVMValueRef packed);
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         const struct util_format_description *desc,
+                         LLVMValueRef packed);
 
 
-/**
- * Pack a pixel.
- *
- * @param rgba 4 float vector with the unpacked components.
- */
 LLVMValueRef
-lp_build_pack_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format,
-                   LLVMValueRef rgba);
+lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
+                          const struct util_format_description *desc,
+                          struct lp_type type,
+                          LLVMValueRef packed);
 
 
-/**
- * Load a pixel into its RGBA components.
- *
- * @param ptr value with the pointer to the packed pixel. Pointer type is
- * irrelevant.
- *
- * @return RGBA in a 4 floats vector.
- */
 LLVMValueRef
-lp_build_load_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format, 
-                   LLVMValueRef ptr);
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       const struct util_format_description *desc,
+                       LLVMValueRef rgba);
 
 
-/**
- * Store a pixel.
- *
- * @param rgba 4 float vector with the unpacked components.
- */
-void 
-lp_build_store_rgba(LLVMBuilderRef builder,
-                    enum pipe_format format,
-                    LLVMValueRef ptr,
-                    LLVMValueRef rgba);
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba);
 
 
-#endif /* !LP_BLD_H */
+#endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
index dcbc0076c7d..5836e0173f9 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
@@ -25,18 +25,39 @@
  *
  **************************************************************************/
 
+/**
+ * @file
+ * AoS pixel format manipulation.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
 
+#include "util/u_cpu_detect.h"
 #include "util/u_format.h"
 
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
 #include "lp_bld_format.h"
 
 
+/**
+ * Unpack a single pixel into its RGBA components.
+ *
+ * @param packed integer.
+ *
+ * @return RGBA in a 4 floats vector.
+ *
+ * XXX: This is mostly for reference and testing -- operating a single pixel at
+ * a time is rarely if ever needed.
+ */
 LLVMValueRef
-lp_build_unpack_rgba(LLVMBuilderRef builder,
-                     enum pipe_format format,
-                     LLVMValueRef packed)
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         const struct util_format_description *desc,
+                         LLVMValueRef packed)
 {
-   const struct util_format_description *desc;
    LLVMTypeRef type;
    LLVMValueRef shifted, casted, scaled, masked;
    LLVMValueRef shifts[4];
@@ -49,8 +70,6 @@ lp_build_unpack_rgba(LLVMBuilderRef builder,
    unsigned shift;
    unsigned i;
 
-   desc = util_format_description(format);
-
    /* FIXME: Support more formats */
    assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
    assert(desc->block.width == 1);
@@ -151,12 +170,130 @@ lp_build_unpack_rgba(LLVMBuilderRef builder,
 }
 
 
+/**
+ * Take a vector with packed pixels and unpack into a rgba8 vector.
+ *
+ * Formats narrower than 32 bits are accepted if padded to 32 bits. Only
+ * formats with four 8-bit channels are implemented; others assert/undef.
+ */
 LLVMValueRef
-lp_build_pack_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format,
-                   LLVMValueRef rgba)
+lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
+                          const struct util_format_description *desc,
+                          struct lp_type type,
+                          LLVMValueRef packed)
+{
+   struct lp_build_context bld;
+   bool rgba8;
+   LLVMValueRef res;
+   unsigned i;
+
+   lp_build_context_init(&bld, builder, type);
+
+   /* FIXME: Support more formats */
+   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
+   assert(desc->block.width == 1);
+   assert(desc->block.height == 1);
+   assert(desc->block.bits <= 32);
+
+   assert(!type.floating);
+   assert(!type.fixed);
+   assert(type.norm);
+   assert(type.width == 8);
+   assert(type.length % 4 == 0);
+   /* The swizzle-only path below needs every channel to be 8 bits wide */
+   rgba8 = TRUE;
+   for(i = 0; i < 4; ++i) {
+      assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
+             desc->channel[i].type == UTIL_FORMAT_TYPE_VOID);
+      if(desc->channel[i].size != 8)
+         rgba8 = FALSE;
+   }
+
+   if(rgba8) {
+      /*
+       * The pixel is already in a rgba8 format variant. All that is
+       * necessary is to swizzle the channels.
+       */
+
+      unsigned char swizzles[4];
+      boolean zeros[4]; /* bitwise AND mask */
+      boolean ones[4]; /* bitwise OR mask */
+      boolean swizzles_needed = FALSE;
+      boolean zeros_needed = FALSE;
+      boolean ones_needed = FALSE;
+
+      for(i = 0; i < 4; ++i) {
+         enum util_format_swizzle swizzle = desc->swizzle[i];
+
+         /* Initialize with the no-op case */
+         swizzles[i] = util_cpu_caps.little_endian ? 3 - i : i;
+         zeros[i] = TRUE;
+         ones[i] = FALSE;
+
+         switch (swizzle) {
+         case UTIL_FORMAT_SWIZZLE_X:
+         case UTIL_FORMAT_SWIZZLE_Y:
+         case UTIL_FORMAT_SWIZZLE_Z:
+         case UTIL_FORMAT_SWIZZLE_W:
+            if(swizzle != swizzles[i]) {
+               swizzles[i] = swizzle;
+               swizzles_needed = TRUE;
+            }
+            break;
+         case UTIL_FORMAT_SWIZZLE_0:
+            zeros[i] = FALSE;
+            zeros_needed = TRUE;
+            break;
+         case UTIL_FORMAT_SWIZZLE_1:
+            ones[i] = TRUE;
+            ones_needed = TRUE;
+            break;
+         case UTIL_FORMAT_SWIZZLE_NONE:
+            assert(0);
+            break;
+         }
+      }
+
+      res = packed;
+
+      if(swizzles_needed)
+         res = lp_build_swizzle1_aos(&bld, res, swizzles);
+
+      if(zeros_needed) {
+         /* Mask out zero channels */
+         LLVMValueRef mask = lp_build_const_mask_aos(type, zeros);
+         res = LLVMBuildAnd(builder, res, mask, "");
+      }
+
+      if(ones_needed) {
+         /* Or one channels */
+         LLVMValueRef mask = lp_build_const_mask_aos(type, ones);
+         res = LLVMBuildOr(builder, res, mask, "");
+      }
+   }
+   else {
+      /* FIXME */
+      assert(0);
+      res = lp_build_undef(type);
+   }
+
+   return res;
+}
+
+
+/**
+ * Pack a single pixel.
+ *
+ * @param rgba 4 float vector with the unpacked components.
+ *
+ * XXX: This is mostly for reference and testing -- operating a single pixel at
+ * a time is rarely if ever needed.
+ */
+LLVMValueRef
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       const struct util_format_description *desc,
+                       LLVMValueRef rgba)
 {
-   const struct util_format_description *desc;
    LLVMTypeRef type;
    LLVMValueRef packed = NULL;
    LLVMValueRef swizzles[4];
@@ -167,8 +304,6 @@ lp_build_pack_rgba(LLVMBuilderRef builder,
    unsigned shift;
    unsigned i, j;
 
-   desc = util_format_description(format);
-
    assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
    assert(desc->block.width == 1);
    assert(desc->block.height == 1);
@@ -247,57 +382,3 @@ lp_build_pack_rgba(LLVMBuilderRef builder,
 
    return packed;
 }
-
-
-LLVMValueRef
-lp_build_load_rgba(LLVMBuilderRef builder,
-                   enum pipe_format format,
-                   LLVMValueRef ptr)
-{
-   const struct util_format_description *desc;
-   LLVMTypeRef type;
-   LLVMValueRef packed;
-
-   desc = util_format_description(format);
-
-   /* FIXME: Support more formats */
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-   assert(desc->block.bits <= 32);
-
-   type = LLVMIntType(desc->block.bits);
-
-   ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, 0), "");
-
-   packed = LLVMBuildLoad(builder, ptr, "");
-
-   return lp_build_unpack_rgba(builder, format, packed);
-}
-
-
-void
-lp_build_store_rgba(LLVMBuilderRef builder,
-                    enum pipe_format format,
-                    LLVMValueRef ptr,
-                    LLVMValueRef rgba)
-{
-   const struct util_format_description *desc;
-   LLVMTypeRef type;
-   LLVMValueRef packed;
-
-   desc = util_format_description(format);
-
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_ARITH);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-
-   type = LLVMIntType(desc->block.bits);
-
-   packed = lp_build_pack_rgba(builder, format, rgba);
-
-   ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(type, 0), "");
-
-   LLVMBuildStore(builder, packed, ptr);
-}
-
diff --git a/src/gallium/state_trackers/wgl/shared/stw_public.h b/src/gallium/drivers/llvmpipe/lp_bld_format_query.c
index 7fe9cfb3561..f3832d07ff9 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_public.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_query.c
@@ -18,56 +18,55 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  **************************************************************************/
 
-#ifndef STW_PUBLIC_H
-#define STW_PUBLIC_H
-
-#include <windows.h>
-
-BOOL stw_copy_context( UINT_PTR hglrcSrc,
-                       UINT_PTR hglrcDst,
-                       UINT mask );
-
-UINT_PTR stw_create_layer_context( HDC hdc, 
-                                   int iLayerPlane );
-
-BOOL stw_share_lists( UINT_PTR hglrc1, UINT_PTR hglrc2 );
-
-BOOL stw_delete_context( UINT_PTR hglrc );
-
-BOOL
-stw_release_context( UINT_PTR dhglrc );
-
-UINT_PTR stw_get_current_context( void );
-
-HDC stw_get_current_dc( void );
-
-BOOL stw_make_current( HDC hdc, UINT_PTR hglrc );
+/**
+ * @file
+ * Utility functions to make assertions about formats.
+ *
+ * This module centralizes most of logic used when determining what algorithm
+ * is most suitable (i.e., most efficient yet correct) for a given format.
+ *
+ * It might be possible to move some of these functions to u_format module,
+ * but since tiny differences in the format may render it more/less
+ * appropriate to a given algorithm it is impossible to make any long term
+ * guarantee about the semantics of these functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
 
-BOOL stw_swap_buffers( HDC hdc );
 
-BOOL
-stw_swap_layer_buffers( HDC hdc, UINT fuPlanes );
+#include "util/u_format.h"
 
-PROC stw_get_proc_address( LPCSTR lpszProc );
+#include "lp_bld_format.h"
 
-int stw_pixelformat_describe( HDC hdc,
-                              int iPixelFormat,
-                              UINT nBytes,
-                              LPPIXELFORMATDESCRIPTOR ppfd );
 
-int stw_pixelformat_get( HDC hdc );
+/**
+ * Whether this format is a 32-bit variant with four 8-bit channels (rgba8)
+ */
+boolean
+lp_format_is_rgba8(const struct util_format_description *desc)
+{
+   unsigned chan;
 
-BOOL stw_pixelformat_set( HDC hdc,
-                          int iPixelFormat );
+   if(desc->block.width != 1 ||
+      desc->block.height != 1 ||
+      desc->block.bits != 32)
+      return FALSE;
 
-int stw_pixelformat_choose( HDC hdc,
-                            CONST PIXELFORMATDESCRIPTOR *ppfd );
+   for(chan = 0; chan < 4; ++chan) {
+      if(desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED &&
+         desc->channel[chan].type != UTIL_FORMAT_TYPE_SIGNED &&
+         desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID)
+         return FALSE;
+      if(desc->channel[chan].size != 8)
+         return FALSE;
+   }
 
-#endif
+   return TRUE;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
new file mode 100644
index 00000000000..64151d169da
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_soa.c
@@ -0,0 +1,149 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_format.h"
+
+
+static LLVMValueRef
+lp_build_format_swizzle_chan_soa(struct lp_type type,
+                                 const LLVMValueRef *unswizzled,
+                                 enum util_format_swizzle swizzle)
+{
+   switch (swizzle) {
+   case UTIL_FORMAT_SWIZZLE_X:
+   case UTIL_FORMAT_SWIZZLE_Y:
+   case UTIL_FORMAT_SWIZZLE_Z:
+   case UTIL_FORMAT_SWIZZLE_W:
+      return unswizzled[swizzle];
+   case UTIL_FORMAT_SWIZZLE_0:
+      return lp_build_zero(type);
+   case UTIL_FORMAT_SWIZZLE_1:
+      return lp_build_one(type);
+   case UTIL_FORMAT_SWIZZLE_NONE:
+      return lp_build_undef(type);
+   default:
+      assert(0);
+      return lp_build_undef(type);
+   }
+}
+
+
+void
+lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
+                            struct lp_type type,
+                            const LLVMValueRef *unswizzled,
+                            LLVMValueRef *swizzled)
+{
+   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      enum util_format_swizzle swizzle = format_desc->swizzle[0];
+      LLVMValueRef depth = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle);
+      swizzled[2] = swizzled[1] = swizzled[0] = depth;
+      swizzled[3] = lp_build_one(type);
+   }
+   else {
+      unsigned chan;
+      for (chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         swizzled[chan] = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle);
+      }
+   }
+}
+
+
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba)
+{
+   LLVMValueRef inputs[4];
+   unsigned start;
+   unsigned chan;
+
+   /* FIXME: Support more formats */
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_ARITH ||
+          (format_desc->layout == UTIL_FORMAT_LAYOUT_ARRAY &&
+           format_desc->block.bits == format_desc->channel[0].size));
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+   assert(format_desc->block.bits <= 32);
+
+   /* Decode the input vector components, lowest bits first */
+   start = 0;
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned width = format_desc->channel[chan].size;
+      unsigned stop = start + width;
+      LLVMValueRef input;
+
+      input = packed;
+
+      switch(format_desc->channel[chan].type) {
+      case UTIL_FORMAT_TYPE_VOID:
+         input = NULL;
+         break;
+
+      case UTIL_FORMAT_TYPE_UNSIGNED:
+         if(type.floating) {
+            if(start)
+               input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(type, start), "");
+            if(stop < format_desc->block.bits) {
+               unsigned mask = ((unsigned long long)1 << width) - 1;
+               input = LLVMBuildAnd(builder, input, lp_build_int_const_scalar(type, mask), "");
+            }
+            /* input is still the raw unsigned integer; convert it to float */
+            if(format_desc->channel[chan].normalized)
+               input = lp_build_unsigned_norm_to_float(builder, width, type, input);
+            else
+               input = LLVMBuildUIToFP(builder, input, lp_build_vec_type(type), "");
+         }
+         else {
+            /* FIXME */
+            assert(0);
+            input = lp_build_undef(type);
+         }
+         break;
+
+      default:
+         /* FIXME: other channel types are not decoded yet */
+         input = lp_build_undef(type);
+         break;
+      }
+
+      inputs[chan] = input;
+
+      start = stop;
+   }
+
+   lp_build_format_swizzle_soa(format_desc, type, inputs, rgba);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index cfe20a0d75b..818c0e943e3 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -109,32 +109,6 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
 
 
 /**
- * Small vector x scale multiplication optimization.
- *
- * TODO: Should be elsewhere.
- */
-static LLVMValueRef
-coeff_multiply(struct lp_build_interp_soa_context *bld,
-               LLVMValueRef coeff,
-               int step)
-{
-   LLVMValueRef factor;
-
-   switch(step) {
-   case 0:
-      return bld->base.zero;
-   case 1:
-      return coeff;
-   case 2:
-      return lp_build_add(&bld->base, coeff, coeff);
-   default:
-      factor = lp_build_const_scalar(bld->base.type, (double)step);
-      return lp_build_mul(&bld->base, coeff, factor);
-   }
-}
-
-
-/**
  * Multiply the dadx and dady with the xstep and ystep respectively.
  */
 static void
@@ -149,8 +123,8 @@ coeffs_update(struct lp_build_interp_soa_context *bld)
       if (mode != TGSI_INTERPOLATE_CONSTANT) {
          for(chan = 0; chan < NUM_CHANNELS; ++chan) {
             if(mask & (1 << chan)) {
-               bld->dadx[attrib][chan] = coeff_multiply(bld, bld->dadx[attrib][chan], bld->xstep);
-               bld->dady[attrib][chan] = coeff_multiply(bld, bld->dady[attrib][chan], bld->ystep);
+               bld->dadx[attrib][chan] = lp_build_mul_imm(&bld->base, bld->dadx[attrib][chan], bld->xstep);
+               bld->dady[attrib][chan] = lp_build_mul_imm(&bld->base, bld->dady[attrib][chan], bld->ystep);
             }
          }
       }
@@ -292,7 +266,7 @@ void
 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          const struct tgsi_token *tokens,
                          LLVMBuilderRef builder,
-                         union lp_type type,
+                         struct lp_type type,
                          LLVMValueRef a0_ptr,
                          LLVMValueRef dadx_ptr,
                          LLVMValueRef dady_ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 9194f6233a7..9c57a10879b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -83,7 +83,7 @@ void
 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          const struct tgsi_token *tokens,
                          LLVMBuilderRef builder,
-                         union lp_type type,
+                         struct lp_type type,
                          LLVMValueRef a0_ptr,
                          LLVMValueRef dadx_ptr,
                          LLVMValueRef dady_ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.c b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
index 8631efd6c3e..db22a8028a6 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
@@ -33,6 +33,8 @@
  */
 
 
+#include "util/u_cpu_detect.h"
+
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
@@ -45,7 +47,7 @@ lp_build_cmp(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
    LLVMValueRef zeros = LLVMConstNull(int_vec_type);
@@ -65,7 +67,7 @@ lp_build_cmp(struct lp_build_context *bld,
 
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
-      if(type.floating) {
+      if(type.floating && util_cpu_caps.has_sse) {
          LLVMValueRef args[3];
          unsigned cc;
          boolean swap;
@@ -114,7 +116,7 @@ lp_build_cmp(struct lp_build_context *bld,
          res = LLVMBuildBitCast(bld->builder, res, int_vec_type, "");
          return res;
       }
-      else {
+      else if(util_cpu_caps.has_sse2) {
          static const struct {
             unsigned swap:1;
             unsigned eq:1;
@@ -301,7 +303,7 @@ lp_build_select(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b)
 {
-   union lp_type type = bld->type;
+   struct lp_type type = bld->type;
    LLVMValueRef res;
 
    if(a == b)
@@ -313,8 +315,6 @@ lp_build_select(struct lp_build_context *bld,
       b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
    }
 
-   /* TODO: On SSE4 we could do this with a single instruction -- PBLENDVB */
-
    a = LLVMBuildAnd(bld->builder, a, mask, "");
 
    /* This often gets translated to PANDN, but sometimes the NOT is
@@ -339,9 +339,9 @@ LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
-                    boolean cond[4])
+                    const boolean cond[4])
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const unsigned n = type.length;
    unsigned i, j;
 
@@ -376,9 +376,9 @@ lp_build_select_aos(struct lp_build_context *bld,
 
       return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
    }
+   else {
 #if 0
-   else if(0) {
-      /* FIXME: Unfortunately select of vectors do not work */
+      /* XXX: Unfortunately select of vectors do not work */
       /* Use a select */
       LLVMTypeRef elem_type = LLVMInt1Type();
       LLVMValueRef cond[LP_MAX_VECTOR_LENGTH];
@@ -388,10 +388,9 @@ lp_build_select_aos(struct lp_build_context *bld,
             cond[j + i] = LLVMConstInt(elem_type, cond[i] ? 1 : 0, 0);
 
       return LLVMBuildSelect(bld->builder, LLVMConstVector(cond, n), a, b, "");
-   }
-#endif
-   else {
+#else
       LLVMValueRef mask = lp_build_const_mask_aos(type, cond);
       return lp_build_select(bld, mask, a, b);
+#endif
    }
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.h b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
index 29b9e1c45b8..d67500ef707 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.h
@@ -42,7 +42,7 @@
 #include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
 
 
-union lp_type type;
+struct lp_type;
 struct lp_build_context;
 
 
@@ -66,7 +66,7 @@ LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
                     LLVMValueRef a,
                     LLVMValueRef b,
-                    boolean cond[4]);
+                    const boolean cond[4]);
 
 
 #endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
new file mode 100644
index 00000000000..d3f78c06d92
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
@@ -0,0 +1,61 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_config.h"
+
+#include "lp_bld_misc.h"
+
+
+#ifndef LLVM_NATIVE_ARCH
+
+namespace llvm {
+   extern void LinkInJIT();
+}
+
+/** C-callable wrapper around llvm::LinkInJIT(), built only when LLVM_NATIVE_ARCH is undefined. */
+void
+LLVMLinkInJIT(void)
+{
+   llvm::LinkInJIT();
+}
+
+
+extern "C" int X86TargetMachineModule;
+
+/** On x86/x86-64 stores 1 to X86TargetMachineModule (presumably to force the X86 target to be linked in -- confirm); always returns 0. */
+int
+LLVMInitializeNativeTarget(void)
+{
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   X86TargetMachineModule = 1;
+#endif
+   return 0;
+}
+
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_misc.h b/src/gallium/drivers/llvmpipe/lp_bld_misc.h
new file mode 100644
index 00000000000..0e787e0b9cb
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_misc.h
@@ -0,0 +1,56 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_MISC_H
+#define LP_BLD_MISC_H
+
+
+#include "llvm/Config/config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifndef LLVM_NATIVE_ARCH
+
+void
+LLVMLinkInJIT(void);
+
+int
+LLVMInitializeNativeTarget(void);
+
+#endif /* !LLVM_NATIVE_ARCH */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* !LP_BLD_MISC_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_pack.c b/src/gallium/drivers/llvmpipe/lp_bld_pack.c
new file mode 100644
index 00000000000..bc360ad77ad
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_pack.c
@@ -0,0 +1,418 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper functions for packing/unpacking.
+ *
+ * Pack/unpacking is necessary for conversion between types of different
+ * bit width.
+ *
+ * They are also commonly used when a computation needs higher
+ * precision for the intermediate values. For example, if one needs the
+ * function:
+ *
+ *   c = compute(a, b);
+ *
+ * to use more precision for intermediate results then one should implement it
+ * as:
+ *
+ *   LLVMValueRef
+ *   compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
+ *   {
+ *      struct lp_type wide_type = lp_wider_type(type);
+ *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
+ *
+ *      lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
+ *      lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
+ *
+ *      cl = compute_half(al, bl);
+ *      ch = compute_half(ah, bh);
+ *
+ *      c = lp_build_pack2(builder, wide_type, type, cl, ch);
+ *
+ *      return c;
+ *   }
+ *
+ * where compute_half() would do the computation for half the elements with
+ * twice the precision.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
+
+
+/**
+ * Build the shuffle mask matching PUNPCKLxx (lo_hi = 0) or PUNPCKHxx (lo_hi = 1).
+ */
+static LLVMValueRef
+lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   LLVMTypeRef i32_type = LLVMInt32Type();
+   unsigned first = lo_hi*n/2;
+   unsigned pair;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+   assert(lo_hi < 2);
+
+   /* TODO: cache results in a static table */
+   for(pair = 0; 2*pair < n; ++pair) {
+      elems[2*pair + 0] = LLVMConstInt(i32_type, first + pair, 0);
+      elems[2*pair + 1] = LLVMConstInt(i32_type, n + first + pair, 0);
+   }
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Build the shuffle mask matching the PACKxx instructions: result element i
+ * selects element 2*i of the two concatenated source vectors.
+ */
+static LLVMValueRef
+lp_build_const_pack_shuffle(unsigned n)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   LLVMTypeRef i32_type = LLVMInt32Type();
+   unsigned dst, src;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+
+   /* TODO: cache results in a static table */
+   for(dst = 0, src = 0; dst < n; ++dst, src += 2)
+      elems[dst] = LLVMConstInt(i32_type, src, 0);
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Interleave the elements of two vectors.
+ *
+ * Matches the PUNPCKLxx (lo_hi = 0) and PUNPCKHxx (lo_hi = 1) SSE
+ * instructions; the shuffle mask is sized from type.length.
+ */
+LLVMValueRef
+lp_build_interleave2(LLVMBuilderRef builder,
+                     struct lp_type type,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     unsigned lo_hi)
+{
+   const unsigned n = type.length;
+   LLVMValueRef mask = lp_build_const_unpack_shuffle(n, lo_hi);
+
+   return LLVMBuildShuffleVector(builder, a, b, mask, "");
+}
+
+
+/**
+ * Double the bit width of integer elements.
+ *
+ * Only the number of bits used to represent each value changes, not the values
+ * themselves: signed-to-signed is sign extended, anything else zero extended.
+ */
+void
+lp_build_unpack2(LLVMBuilderRef builder,
+                 struct lp_type src_type,
+                 struct lp_type dst_type,
+                 LLVMValueRef src,
+                 LLVMValueRef *dst_lo,
+                 LLVMValueRef *dst_hi)
+{
+   LLVMTypeRef wide_vec_type;
+   LLVMValueRef ext, first, second;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(dst_type.width == src_type.width * 2);
+   assert(dst_type.length * 2 == src_type.length);
+
+   /* Extension bits: copies of the sign bit when both types are signed,
+    * zero otherwise. */
+   if(dst_type.sign && src_type.sign)
+      ext = LLVMBuildAShr(builder, src, lp_build_int_const_scalar(src_type, src_type.width - 1), "");
+   else
+      ext = lp_build_zero(src_type);
+
+   /* The extension bits form the most significant half of every widened
+    * element, so the operand order follows util_cpu_caps.little_endian. */
+   if(util_cpu_caps.little_endian) {
+      first = src;
+      second = ext;
+   }
+   else {
+      first = ext;
+      second = src;
+   }
+   *dst_lo = lp_build_interleave2(builder, src_type, first, second, 0);
+   *dst_hi = lp_build_interleave2(builder, src_type, first, second, 1);
+
+   /* Reinterpret each interleaved pair as one element of the wider type */
+   wide_vec_type = lp_build_vec_type(dst_type);
+   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, wide_vec_type, "");
+   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, wide_vec_type, "");
+}
+
+
+/**
+ * Expand the bit width, changing only how many bits represent each value.
+ *
+ * The element width is doubled with lp_build_unpack2() until it reaches
+ * dst_type.width; the num_dsts resulting vectors are written to dst[].
+ */
+void
+lp_build_unpack(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef src,
+                LLVMValueRef *dst, unsigned num_dsts)
+{
+   unsigned num_tmps;
+   unsigned i;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length == dst_type.length * num_dsts);
+
+   num_tmps = 1;
+   dst[0] = src;
+
+   while(src_type.width < dst_type.width) {
+      struct lp_type tmp_type = src_type;
+
+      tmp_type.width *= 2;
+      tmp_type.length /= 2;
+      /* Walk backwards so writing slots 2*i and 2*i+1 never clobbers a dst[] entry not yet expanded */
+      for(i = num_tmps; i--; ) {
+         lp_build_unpack2(builder, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
+      }
+
+      src_type = tmp_type;
+
+      num_tmps *= 2;
+   }
+
+   assert(num_tmps == num_dsts);
+}
+
+
+/**
+ * Non-interleaved pack.
+ *
+ * This will move values as
+ *
+ *   lo =   __ l0 __ l1 __ l2 __..  __ ln
+ *   hi =   __ h0 __ h1 __ h2 __..  __ hn
+ *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
+ *
+ * This will only change the number of bits the values are represented, not the
+ * values themselves.
+ *
+ * It is assumed the values are already clamped into the destination type range.
+ * Values outside that range will produce undefined results. Use
+ * lp_build_packs2 instead.
+ *
+ * Unsigned 32 -> 16 bit packing of 128 bit vectors on SSE2 hosts without SSE4.1
+ * falls back to the same generic shuffle used when the PACKxx path is skipped.
+ */
+LLVMValueRef
+lp_build_pack2(LLVMBuilderRef builder,
+               struct lp_type src_type,
+               struct lp_type dst_type,
+               LLVMValueRef lo,
+               LLVMValueRef hi)
+{
+   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
+   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
+   LLVMValueRef shuffle;
+   LLVMValueRef res = NULL;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(src_type.width == dst_type.width * 2);
+   assert(src_type.length * 2 == dst_type.length);
+
+   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
+      switch(src_type.width) {
+      case 32:
+         if(dst_type.sign) {
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
+         }
+         else if(util_cpu_caps.has_sse4_1) {
+            /* PACKUSDW is the only intrinsic with a consistent signature */
+            return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
+         }
+         /* Otherwise there is no unsigned 32 -> 16 bit pack instruction: res
+          * stays NULL and the generic shuffle below is used. */
+         break;
+
+      case 16:
+         if(dst_type.sign)
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
+         else
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
+         break;
+
+      default:
+         assert(0);
+         return LLVMGetUndef(dst_vec_type);
+      }
+
+      if(res) {
+         res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+         return res;
+      }
+   }
+
+   /* Generic path: reinterpret the sources as vectors of the narrow type and
+    * keep only their even numbered elements. */
+   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
+   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
+
+   shuffle = lp_build_const_pack_shuffle(dst_type.length);
+
+   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
+
+   return res;
+}
+
+
+
+/**
+ * Non-interleaved pack with saturation.
+ *
+ * Same as lp_build_pack2, but values are first clamped to the destination type's
+ * maximum unless the SSE pack instructions used already saturate signed inputs.
+ */
+LLVMValueRef
+lp_build_packs2(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef lo,
+                LLVMValueRef hi)
+{
+   struct lp_build_context bld;
+   LLVMValueRef dst_max;
+   unsigned dst_bits;
+   boolean hw_saturates;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(src_type.sign == dst_type.sign);
+   assert(src_type.width == dst_type.width * 2);
+   assert(src_type.length * 2 == dst_type.length);
+
+   /* All X86 SSE non-interleaved pack instructions take signed inputs and
+    * saturate them, so no need to clamp for those cases. */
+   hw_saturates = util_cpu_caps.has_sse2 &&
+                  src_type.width * src_type.length == 128 &&
+                  src_type.sign;
+   if(hw_saturates)
+      return lp_build_pack2(builder, src_type, dst_type, lo, hi);
+
+   /* A signed destination spends one bit on the sign */
+   dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
+   dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
+   lp_build_context_init(&bld, builder, src_type);
+   lo = lp_build_min(&bld, lo, dst_max);
+   hi = lp_build_min(&bld, hi, dst_max);
+   /* FIXME: What about lower bound? */
+
+   return lp_build_pack2(builder, src_type, dst_type, lo, hi);
+}
+
+
+/**
+ * Truncate the bit width, halving it one pack2 step at a time.
+ *
+ * @param clamped   TRUE selects lp_build_pack2 (sources assumed already within
+ *                  the destination range), FALSE selects lp_build_packs2
+ * @param src       the num_srcs source vectors, at most LP_MAX_VECTOR_LENGTH
+ *
+ * TODO: Handle saturation consistently.
+ */
+LLVMValueRef
+lp_build_pack(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              boolean clamped,
+              const LLVMValueRef *src, unsigned num_srcs)
+{
+   LLVMValueRef (*pack2)(LLVMBuilderRef builder,
+                         struct lp_type src_type,
+                         struct lp_type dst_type,
+                         LLVMValueRef lo,
+                         LLVMValueRef hi);
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length);
+
+   /* The sources are copied into tmp[], so they must fit in it */
+   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
+
+   pack2 = clamped ? &lp_build_pack2 : &lp_build_packs2;
+
+   for(i = 0; i < num_srcs; ++i)
+      tmp[i] = src[i];
+
+   while(src_type.width > dst_type.width) {
+      struct lp_type tmp_type = src_type;
+      tmp_type.width /= 2;
+      tmp_type.length *= 2;
+
+      /* Take in consideration the sign changes only in the last step */
+      if(tmp_type.width == dst_type.width)
+         tmp_type.sign = dst_type.sign;
+
+      num_srcs /= 2;
+      for(i = 0; i < num_srcs; ++i)
+         tmp[i] = pack2(builder, src_type, tmp_type, tmp[2*i + 0], tmp[2*i + 1]);
+
+      src_type = tmp_type;
+   }
+
+   assert(num_srcs == 1);
+   return tmp[0];
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_pack.h b/src/gallium/drivers/llvmpipe/lp_bld_pack.h
new file mode 100644
index 00000000000..fb2a34984a4
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_pack.h
@@ -0,0 +1,95 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for packing/unpacking conversions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_PACK_H
+#define LP_BLD_PACK_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;
+
+
+LLVMValueRef
+lp_build_interleave2(LLVMBuilderRef builder,
+                     struct lp_type type,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     unsigned lo_hi);
+
+
+void
+lp_build_unpack2(LLVMBuilderRef builder,
+                 struct lp_type src_type,
+                 struct lp_type dst_type,
+                 LLVMValueRef src,
+                 LLVMValueRef *dst_lo,
+                 LLVMValueRef *dst_hi);
+
+
+void
+lp_build_unpack(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef src,
+                LLVMValueRef *dst, unsigned num_dsts);
+
+
+LLVMValueRef
+lp_build_packs2(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef lo,
+                LLVMValueRef hi);
+
+
+LLVMValueRef
+lp_build_pack2(LLVMBuilderRef builder,
+               struct lp_type src_type,
+               struct lp_type dst_type,
+               LLVMValueRef lo,
+               LLVMValueRef hi);
+
+
+LLVMValueRef
+lp_build_pack(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              boolean clamped,
+              const LLVMValueRef *src, unsigned num_srcs);
+
+
+#endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.c b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
new file mode 100644
index 00000000000..4d272bea87e
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
@@ -0,0 +1,190 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling -- common code.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_type.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+
+
+/**
+ * Fill in the static state; it stays zeroed if texture or sampler is NULL.
+ */
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler)
+{
+   memset(state, 0, sizeof *state);
+
+   if(!texture || !sampler)
+      return;
+
+   state->wrap_s            = sampler->wrap_s;
+   state->wrap_t            = sampler->wrap_t;
+   state->wrap_r            = sampler->wrap_r;
+   state->min_img_filter    = sampler->min_img_filter;
+   state->min_mip_filter    = sampler->min_mip_filter;
+   state->mag_img_filter    = sampler->mag_img_filter;
+   state->normalized_coords = sampler->normalized_coords;
+   state->prefilter         = sampler->prefilter;
+   if(sampler->compare_mode) {
+      state->compare_mode      = sampler->compare_mode;
+      state->compare_func      = sampler->compare_func;
+   }
+
+   state->format            = texture->format;
+   state->target            = texture->target;
+   state->pot_width         = util_is_pot(texture->width[0]);
+   state->pot_height        = util_is_pot(texture->height[0]);
+   state->pot_depth         = util_is_pot(texture->depth[0]);
+}
+
+
+/**
+ * Gather elements from scattered positions in memory into a single vector.
+ *
+ * @param src_width width in bits of each element loaded from memory
+ * @param dst_width result element width (source will be zero extended to fit)
+ * @param length number of elements in the offsets vector and in the result
+ * @param base_ptr base pointer, should be an i8 pointer type.
+ * @param offsets vector with one offset per element, applied to base_ptr
+ */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets)
+{
+   LLVMTypeRef src_type = LLVMIntType(src_width);
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+   LLVMValueRef res;
+   unsigned i;
+
+   res = LLVMGetUndef(dst_vec_type);
+   for(i = 0; i < length; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef elem_offset;
+      LLVMValueRef elem_ptr;
+      LLVMValueRef elem;
+      /* Index base_ptr by offsets[i], then load one src_width-bit integer from there */
+      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
+      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
+      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
+      elem = LLVMBuildLoad(builder, elem_ptr, "");
+      /* Convert the loaded element to the result element width */
+      assert(src_width <= dst_width);
+      if(src_width > dst_width)
+         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
+      if(src_width < dst_width)
+         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
+
+      res = LLVMBuildInsertElement(builder, res, elem, index, "");
+   }
+
+   return res;
+}
+
+
+/**
+ * Compute the offset of a pixel; data_ptr is not used by this function.
+ *
+ * x, y, y_stride are vectors
+ */
+LLVMValueRef
+lp_build_sample_offset(struct lp_build_context *bld,
+                       const struct util_format_description *format_desc,
+                       LLVMValueRef x,
+                       LLVMValueRef y,
+                       LLVMValueRef y_stride,
+                       LLVMValueRef data_ptr)
+{
+   LLVMValueRef x_stride;
+   LLVMValueRef offset;
+   /* Size of one format block in bytes */
+   x_stride = lp_build_const_scalar(bld->type, format_desc->block.bits/8);
+
+   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      LLVMValueRef x_lo, x_hi;
+      LLVMValueRef y_lo, y_hi;
+      LLVMValueRef x_stride_lo, x_stride_hi;
+      LLVMValueRef y_stride_lo, y_stride_hi;
+      LLVMValueRef x_offset_lo, x_offset_hi;
+      LLVMValueRef y_offset_lo, y_offset_hi;
+      LLVMValueRef offset_lo, offset_hi;
+      /* Bit 0 of each coordinate: position within a 2x2 group of pixels */
+      x_lo = LLVMBuildAnd(bld->builder, x, bld->one, "");
+      y_lo = LLVMBuildAnd(bld->builder, y, bld->one, "");
+      /* Remaining bits: which 2x2 group */
+      x_hi = LLVMBuildLShr(bld->builder, x, bld->one, "");
+      y_hi = LLVMBuildLShr(bld->builder, y, bld->one, "");
+      /* Within a group, a step in x is one block and a step in y is two blocks */
+      x_stride_lo = x_stride;
+      y_stride_lo = lp_build_const_scalar(bld->type, 2*format_desc->block.bits/8);
+      /* Between groups, a step in x is four blocks and a step in y is 2*y_stride */
+      x_stride_hi = lp_build_const_scalar(bld->type, 4*format_desc->block.bits/8);
+      y_stride_hi = LLVMBuildShl(bld->builder, y_stride, bld->one, "");
+
+      x_offset_lo = lp_build_mul(bld, x_lo, x_stride_lo);
+      y_offset_lo = lp_build_mul(bld, y_lo, y_stride_lo);
+      offset_lo = lp_build_add(bld, x_offset_lo, y_offset_lo);
+
+      x_offset_hi = lp_build_mul(bld, x_hi, x_stride_hi);
+      y_offset_hi = lp_build_mul(bld, y_hi, y_stride_hi);
+      offset_hi = lp_build_add(bld, x_offset_hi, y_offset_hi);
+
+      offset = lp_build_add(bld, offset_hi, offset_lo);
+   }
+   else {
+      LLVMValueRef x_offset;
+      LLVMValueRef y_offset;
+      /* Linear layout: offset = x*x_stride + y*y_stride */
+      x_offset = lp_build_mul(bld, x, x_stride);
+      y_offset = lp_build_mul(bld, y, y_stride);
+
+      offset = lp_build_add(bld, x_offset, y_offset);
+   }
+
+   return offset;
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.h b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
new file mode 100644
index 00000000000..8cb8210ca76
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.h
@@ -0,0 +1,155 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef LP_BLD_SAMPLE_H
+#define LP_BLD_SAMPLE_H
+
+
+#include <llvm-c/Core.h>
+
+struct pipe_texture;
+struct pipe_sampler_state;
+struct util_format_description;
+struct lp_type;
+struct lp_build_context;
+
+
+/**
+ * Sampler static state.
+ *
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are embedded in the generated code.
+ */
+struct lp_sampler_static_state
+{
+   /* pipe_texture's state */
+   enum pipe_format format;       /* looked up with util_format_description() */
+   unsigned target:2;             /* compared against PIPE_TEXTURE_1D by lp_build_sample_soa() */
+   unsigned pot_width:1;          /* lets REPEAT wrapping AND x with (width - 1) */
+   unsigned pot_height:1;         /* lets REPEAT wrapping AND y with (height - 1) */
+   unsigned pot_depth:1;
+
+   /* pipe_sampler_state's state */
+   unsigned wrap_s:3;             /* PIPE_TEX_WRAP_x, applied to the x coordinate */
+   unsigned wrap_t:3;             /* PIPE_TEX_WRAP_x, applied to the y coordinate */
+   unsigned wrap_r:3;
+   unsigned min_img_filter:2;     /* PIPE_TEX_FILTER_x; selects the sampling routine */
+   unsigned min_mip_filter:2;     /* not yet respected by lp_build_sample_soa() */
+   unsigned mag_img_filter:2;     /* not yet respected by lp_build_sample_soa() */
+   unsigned compare_mode:1;       /* non-zero enables lp_build_sample_compare() */
+   unsigned compare_func:3;       /* passed to lp_build_cmp() */
+   unsigned normalized_coords:1;  /* when set, s and t are scaled by width and height */
+   unsigned prefilter:4;          /* not yet respected by lp_build_sample_soa() */
+};
+
+
+/**
+ * Sampler dynamic state.
+ *
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are computed at runtime.
+ *
+ * These are obtained through callbacks, as we don't want to tie the texture
+ * sampling code generation logic to any particular texture layout or pipe
+ * driver.
+ */
+struct lp_sampler_dynamic_state
+{
+   /** Obtain the base texture width. */
+   LLVMValueRef
+   (*width)( struct lp_sampler_dynamic_state *state,
+             LLVMBuilderRef builder,
+             unsigned unit);
+
+   /** Obtain the base texture height. */
+   LLVMValueRef
+   (*height)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+
+   /** Obtain the texture stride; it is multiplied by y when computing texel offsets. */
+   LLVMValueRef
+   (*stride)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+
+   /** Obtain the base pointer that texels are gathered from. */
+   LLVMValueRef
+   (*data_ptr)( struct lp_sampler_dynamic_state *state,
+                LLVMBuilderRef builder,
+                unsigned unit);
+};
+
+
+/**
+ * Derive the sampler static state.
+ */
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler);
+
+
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets);
+
+/** Compute the offset of texel (x, y): x * (block.bits/8) + y * y_stride, except for ZS formats. */
+LLVMValueRef
+lp_build_sample_offset(struct lp_build_context *bld,
+                       const struct util_format_description *format_desc,
+                       LLVMValueRef x,
+                       LLVMValueRef y,
+                       LLVMValueRef y_stride,
+                       LLVMValueRef data_ptr);
+
+/** Emit code that samples texture unit `unit` at `coords` and writes the four channels to `texel`. */
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type fp_type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel);
+
+
+
+#endif /* LP_BLD_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
new file mode 100644
index 00000000000..47b68b71e25
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
@@ -0,0 +1,595 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling -- SoA.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "util/u_debug_dump.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+
+
+/**
+ * Keep all information for sampling code generation in a single place.
+ */
+struct lp_build_sample_context
+{
+   LLVMBuilderRef builder;
+   /** Sampler state baked into the generated code (wrap modes, filters, ...) */
+   const struct lp_sampler_static_state *static_state;
+   /** Callbacks supplying width, height, stride and data pointer */
+   struct lp_sampler_dynamic_state *dynamic_state;
+   /** Set from util_format_description(static_state->format) */
+   const struct util_format_description *format_desc;
+
+   /** Incoming coordinates type and build context */
+   struct lp_type coord_type;
+   struct lp_build_context coord_bld;
+
+   /** Integer coordinates */
+   struct lp_type int_coord_type;
+   struct lp_build_context int_coord_bld;
+
+   /** Output texels type and build context */
+   struct lp_type texel_type;
+   struct lp_build_context texel_bld;
+};
+
+/** Fetch the texels at integer coordinates (x, y) and unpack them into texel[]. */
+static void
+lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
+                          LLVMValueRef x,
+                          LLVMValueRef y,
+                          LLVMValueRef y_stride,
+                          LLVMValueRef data_ptr,
+                          LLVMValueRef *texel)
+{
+   LLVMValueRef offset;
+   LLVMValueRef packed;
+
+   offset = lp_build_sample_offset(&bld->int_coord_bld,
+                                   bld->format_desc,
+                                   x, y, y_stride,
+                                   data_ptr);
+   /* Only formats with 1x1 blocks no wider than a texel_type element are handled. */
+   assert(bld->format_desc->block.width == 1);
+   assert(bld->format_desc->block.height == 1);
+   assert(bld->format_desc->block.bits <= bld->texel_type.width);
+
+   packed = lp_build_gather(bld->builder,
+                            bld->texel_type.length,
+                            bld->format_desc->block.bits,
+                            bld->texel_type.width,
+                            data_ptr, offset);
+   /* The unpack helper fills texel[]; callers in this file read four channels from it. */
+   lp_build_unpack_rgba_soa(bld->builder,
+                            bld->format_desc,
+                            bld->texel_type,
+                            packed, texel);
+}
+
+/** Same fetch as lp_build_sample_texel_soa(), but return the gathered value without unpacking it. */
+static LLVMValueRef
+lp_build_sample_packed(struct lp_build_sample_context *bld,
+                       LLVMValueRef x,
+                       LLVMValueRef y,
+                       LLVMValueRef y_stride,
+                       LLVMValueRef data_ptr)
+{
+   LLVMValueRef offset;
+
+   offset = lp_build_sample_offset(&bld->int_coord_bld,
+                                   bld->format_desc,
+                                   x, y, y_stride,
+                                   data_ptr);
+   /* Only formats with 1x1 blocks no wider than a texel_type element are handled. */
+   assert(bld->format_desc->block.width == 1);
+   assert(bld->format_desc->block.height == 1);
+   assert(bld->format_desc->block.bits <= bld->texel_type.width);
+
+   return lp_build_gather(bld->builder,
+                          bld->texel_type.length,
+                          bld->format_desc->block.bits,
+                          bld->texel_type.width,
+                          data_ptr, offset);
+}
+
+/** Apply wrap_mode to an integer texel coordinate; is_pot selects the AND-based REPEAT path. */
+static LLVMValueRef
+lp_build_sample_wrap(struct lp_build_sample_context *bld,
+                     LLVMValueRef coord,
+                     LLVMValueRef length,
+                     boolean is_pot,
+                     unsigned wrap_mode)
+{
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef length_minus_one;
+
+   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if(is_pot)
+         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
+      else
+         /* Signed remainder won't give the right results for negative dividends.
+          * NOTE(review): URem of a negative coord looks exact only when length divides 2^bits -- confirm. */
+         coord = LLVMBuildURem(bld->builder, coord, length, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
+      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      /* FIXME: not implemented; warn while generating code and emit the same clamp as PIPE_TEX_WRAP_CLAMP */
+      _debug_printf("warning: failed to translate texture wrap mode %s\n",
+                    debug_dump_tex_wrap(wrap_mode, TRUE));
+      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
+      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   return coord;
+}
+
+/** Nearest filtering: convert s and t with lp_build_ifloor(), wrap them, and fetch one texel. */
+static void
+lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
+                               LLVMValueRef s,
+                               LLVMValueRef t,
+                               LLVMValueRef width,
+                               LLVMValueRef height,
+                               LLVMValueRef stride,
+                               LLVMValueRef data_ptr,
+                               LLVMValueRef *texel)
+{
+   LLVMValueRef x;
+   LLVMValueRef y;
+
+   x = lp_build_ifloor(&bld->coord_bld, s);
+   y = lp_build_ifloor(&bld->coord_bld, t);
+   /* x is wrapped against width with wrap_s, y against height with wrap_t. */
+   x = lp_build_sample_wrap(bld, x, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y = lp_build_sample_wrap(bld, y, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   lp_build_sample_texel_soa(bld, x, y, stride, data_ptr, texel);
+}
+
+/** Bilinear filtering in SoA form: fetch a 2x2 neighbourhood and lerp each of the four channels. */
+static void
+lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef width,
+                              LLVMValueRef height,
+                              LLVMValueRef stride,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef *texel)
+{
+   LLVMValueRef half;
+   LLVMValueRef s_ipart;
+   LLVMValueRef t_ipart;
+   LLVMValueRef s_fpart;
+   LLVMValueRef t_fpart;
+   LLVMValueRef x0, x1;
+   LLVMValueRef y0, y1;
+   LLVMValueRef neighbors[2][2][4];
+   unsigned chan;
+   /* Shift both coordinates by half a texel before splitting them. */
+   half = lp_build_const_scalar(bld->coord_type, 0.5);
+   s = lp_build_sub(&bld->coord_bld, s, half);
+   t = lp_build_sub(&bld->coord_bld, t, half);
+
+   s_ipart = lp_build_floor(&bld->coord_bld, s);
+   t_ipart = lp_build_floor(&bld->coord_bld, t);
+   /* The remainder after flooring becomes the interpolation weight. */
+   s_fpart = lp_build_sub(&bld->coord_bld, s, s_ipart);
+   t_fpart = lp_build_sub(&bld->coord_bld, t, t_ipart);
+
+   x0 = lp_build_itrunc(&bld->coord_bld, s_ipart);
+   y0 = lp_build_itrunc(&bld->coord_bld, t_ipart);
+
+   x0 = lp_build_sample_wrap(bld, x0, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y0 = lp_build_sample_wrap(bld, y0, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+   /* The second column/row is the wrapped first one plus one, wrapped again. */
+   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
+   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
+
+   x1 = lp_build_sample_wrap(bld, x1, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y1 = lp_build_sample_wrap(bld, y1, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+   /* neighbors is indexed [row (y0/y1)][column (x0/x1)][channel]. */
+   lp_build_sample_texel_soa(bld, x0, y0, stride, data_ptr, neighbors[0][0]);
+   lp_build_sample_texel_soa(bld, x1, y0, stride, data_ptr, neighbors[0][1]);
+   lp_build_sample_texel_soa(bld, x0, y1, stride, data_ptr, neighbors[1][0]);
+   lp_build_sample_texel_soa(bld, x1, y1, stride, data_ptr, neighbors[1][1]);
+
+   /* TODO: Don't interpolate missing channels */
+   for(chan = 0; chan < 4; ++chan) {
+      texel[chan] = lp_build_lerp_2d(&bld->texel_bld,
+                                     s_fpart, t_fpart,
+                                     neighbors[0][0][chan],
+                                     neighbors[0][1][chan],
+                                     neighbors[1][0][chan],
+                                     neighbors[1][1][chan]);
+   }
+}
+
+/** Split each packed element into four 8-bit fields (lowest byte first) and convert each to float. */
+static void
+lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
+                          struct lp_type dst_type,
+                          LLVMValueRef packed,
+                          LLVMValueRef *rgba)
+{
+   LLVMValueRef mask = lp_build_int_const_scalar(dst_type, 0xff);
+   unsigned chan;
+
+   /* Decode the input vector components */
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned start = chan*8;
+      unsigned stop = start + 8;
+      LLVMValueRef input;
+
+      input = packed;
+      /* Channel 0 is already at bit 0, so it needs no shift. */
+      if(start)
+         input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(dst_type, start), "");
+      /* The field ending at bit 32 is not masked -- presumably elements are 32 bits wide. */
+      if(stop < 32)
+         input = LLVMBuildAnd(builder, input, mask, "");
+
+      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
+
+      rgba[chan] = input;
+   }
+}
+
+/** Bilinear filtering on packed 8-bit channels using 16-bit fixed point; used when lp_format_is_rgba8() holds. */
+static void
+lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef width,
+                              LLVMValueRef height,
+                              LLVMValueRef stride,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef *texel)
+{
+   LLVMBuilderRef builder = bld->builder;
+   struct lp_build_context i32, h16, u8n;
+   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+   LLVMValueRef i32_c8, i32_c128, i32_c255;
+   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
+   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
+   LLVMValueRef x0, x1;
+   LLVMValueRef y0, y1;
+   LLVMValueRef neighbors[2][2];
+   LLVMValueRef neighbors_lo[2][2];
+   LLVMValueRef neighbors_hi[2][2];
+   LLVMValueRef packed, packed_lo, packed_hi;
+   LLVMValueRef unswizzled[4];
+
+   lp_build_context_init(&i32, builder, lp_type_int(32));
+   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
+   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
+
+   i32_vec_type = lp_build_vec_type(i32.type);
+   h16_vec_type = lp_build_vec_type(h16.type);
+   u8n_vec_type = lp_build_vec_type(u8n.type);
+   /* Scale by 256 so the integer conversion below keeps 8 fractional bits. */
+   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+   t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+
+   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
+   t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
+   /* Subtract half a texel: 128/256 == 0.5 (i32_c128 holds -128). */
+   i32_c128 = lp_build_int_const_scalar(i32.type, -128);
+   s = LLVMBuildAdd(builder, s, i32_c128, "");
+   t = LLVMBuildAdd(builder, t, i32_c128, "");
+   /* Integer part via arithmetic shift right by 8; fraction is the low 8 bits. */
+   i32_c8 = lp_build_int_const_scalar(i32.type, 8);
+   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
+   t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
+
+   i32_c255 = lp_build_int_const_scalar(i32.type, 255);
+   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
+   t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
+
+   x0 = s_ipart;
+   y0 = t_ipart;
+
+   x0 = lp_build_sample_wrap(bld, x0, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y0 = lp_build_sample_wrap(bld, y0, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
+   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
+
+   x1 = lp_build_sample_wrap(bld, x1, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
+   y1 = lp_build_sample_wrap(bld, y1, height, bld->static_state->pot_height, bld->static_state->wrap_t);
+
+   /*
+    * Transform 4 x i32 in
+    *
+    *   s_fpart = {s0, s1, s2, s3}
+    *
+    * into 8 x i16
+    *
+    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+    *
+    * into two 8 x i16
+    *
+    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
+    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+    *
+    * and likewise for t_fpart. There is no risk of losing precision here
+    * since the fractional parts only use the lower 8bits.
+    */
+
+   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+   t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+
+   {
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffle_lo;
+      LLVMValueRef shuffle_hi;
+      unsigned i, j;
+      /* Each group of four output lanes repeats one 16-bit source lane; subindex depends on endianness. */
+      for(j = 0; j < h16.type.length; j += 4) {
+         unsigned subindex = util_cpu_caps.little_endian ? 0 : 1;
+         LLVMValueRef index;
+
+         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
+         for(i = 0; i < 4; ++i)
+            shuffles_lo[j + i] = index;
+
+         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+         for(i = 0; i < 4; ++i)
+            shuffles_hi[j + i] = index;
+      }
+
+      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
+      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+
+      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_lo, "");
+      t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_lo, "");
+      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_hi, "");
+      t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
+   }
+
+   /*
+    * Fetch the pixels as 4 x 32bit (rgba order might differ):
+    *
+    *   rgba0 rgba1 rgba2 rgba3
+    *
+    * bit cast them into 16 x u8
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * unpack them into two 8 x i16:
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1
+    *   r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * The higher 8 bits of the resulting elements will be zero.
+    */
+
+   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_ptr);
+   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_ptr);
+   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_ptr);
+   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_ptr);
+
+   neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
+   neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
+   neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, "");
+   neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, "");
+
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]);
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]);
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]);
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]);
+
+   /*
+    * Linear interpolate with 8.8 fixed point.
+    */
+
+   packed_lo = lp_build_lerp_2d(&h16,
+                                s_fpart_lo, t_fpart_lo,
+                                neighbors_lo[0][0],
+                                neighbors_lo[0][1],
+                                neighbors_lo[1][0],
+                                neighbors_lo[1][1]);
+
+   packed_hi = lp_build_lerp_2d(&h16,
+                                s_fpart_hi, t_fpart_hi,
+                                neighbors_hi[0][0],
+                                neighbors_hi[0][1],
+                                neighbors_hi[1][0],
+                                neighbors_hi[1][1]);
+   /* Recombine the two 16-bit halves into one vector of 8-bit channels. */
+   packed = lp_build_pack2(builder, h16.type, u8n.type, packed_lo, packed_hi);
+
+   /*
+    * Convert to SoA and swizzle.
+    */
+
+   packed = LLVMBuildBitCast(builder, packed, i32_vec_type, "");
+
+   lp_build_rgba8_to_f32_soa(bld->builder,
+                             bld->texel_type,
+                             packed, unswizzled);
+
+   lp_build_format_swizzle_soa(bld->format_desc,
+                               bld->texel_type, unswizzled,
+                               texel);
+}
+
+/** When compare_mode is set, overwrite texel[] with the averaged result of comparing p to each channel. */
+static void
+lp_build_sample_compare(struct lp_build_sample_context *bld,
+                        LLVMValueRef p,
+                        LLVMValueRef *texel)
+{
+   struct lp_build_context *texel_bld = &bld->texel_bld;
+   LLVMValueRef res;
+   unsigned chan;
+
+   if(!bld->static_state->compare_mode)
+      return;
+
+   /* TODO: Compare before swizzling, to avoid redundant computations */
+   res = NULL;
+   for(chan = 0; chan < 4; ++chan) {
+      LLVMValueRef cmp;
+      cmp = lp_build_cmp(texel_bld, bld->static_state->compare_func, p, texel[chan]);
+      cmp = lp_build_select(texel_bld, cmp, texel_bld->one, texel_bld->zero);
+      /* Sum the per-channel one/zero results; the first iteration seeds res. */
+      if(res)
+         res = lp_build_add(texel_bld, res, cmp);
+      else
+         res = cmp;
+   }
+   /* Scale the sum of four results by 0.25 to average them. */
+   assert(res);
+   res = lp_build_mul(texel_bld, res, lp_build_const_scalar(texel_bld->type, 0.25));
+
+   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
+   for(chan = 0; chan < 3; ++chan)
+      texel[chan] = res;
+   texel[3] = texel_bld->one;
+}
+
+/** Entry point: emit code sampling texture `unit` at `coords` into texel[]; only min_img_filter picks the path. */
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel)
+{
+   struct lp_build_sample_context bld;
+   LLVMValueRef width;
+   LLVMValueRef height;
+   LLVMValueRef stride;
+   LLVMValueRef data_ptr;
+   LLVMValueRef s;
+   LLVMValueRef t;
+   LLVMValueRef p;
+
+   /* Setup our build context */
+   memset(&bld, 0, sizeof bld);
+   bld.builder = builder;
+   bld.static_state = static_state;
+   bld.dynamic_state = dynamic_state;
+   bld.format_desc = util_format_description(static_state->format);
+   bld.coord_type = type;
+   bld.int_coord_type = lp_int_type(type);
+   bld.texel_type = type;
+   lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
+   lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
+   lp_build_context_init(&bld.texel_bld, builder, bld.texel_type);
+
+   /* Get the dynamic state */
+   width = dynamic_state->width(dynamic_state, builder, unit);
+   height = dynamic_state->height(dynamic_state, builder, unit);
+   stride = dynamic_state->stride(dynamic_state, builder, unit);
+   data_ptr = dynamic_state->data_ptr(dynamic_state, builder, unit);
+   /* NOTE(review): coords[1] and coords[2] are read whatever num_coords is; lodbias is unused -- confirm callers. */
+   s = coords[0];
+   t = coords[1];
+   p = coords[2];
+   /* The callbacks return scalars; replicate them across the integer coordinate vector. */
+   width = lp_build_broadcast_scalar(&bld.int_coord_bld, width);
+   height = lp_build_broadcast_scalar(&bld.int_coord_bld, height);
+   stride = lp_build_broadcast_scalar(&bld.int_coord_bld, stride);
+   /* 1D textures ignore the incoming t and sample with t = 0. */
+   if(static_state->target == PIPE_TEXTURE_1D)
+      t = bld.coord_bld.zero;
+   /* Normalized coordinates are scaled by the texture size to get texel units. */
+   if(static_state->normalized_coords) {
+      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld.coord_type);
+      LLVMValueRef fp_width = LLVMBuildSIToFP(builder, width, coord_vec_type, "");
+      LLVMValueRef fp_height = LLVMBuildSIToFP(builder, height, coord_vec_type, "");
+      s = lp_build_mul(&bld.coord_bld, s, fp_width);
+      t = lp_build_mul(&bld.coord_bld, t, fp_height);
+   }
+
+   switch (static_state->min_img_filter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      lp_build_sample_2d_nearest_soa(&bld, s, t, width, height, stride, data_ptr, texel);
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+   case PIPE_TEX_FILTER_ANISO:
+      if(lp_format_is_rgba8(bld.format_desc))
+         lp_build_sample_2d_linear_aos(&bld, s, t, width, height, stride, data_ptr, texel);
+      else
+         lp_build_sample_2d_linear_soa(&bld, s, t, width, height, stride, data_ptr, texel);
+      break;
+   default:
+      assert(0);
+   }
+
+   /* FIXME: respect static_state->min_mip_filter */;
+   /* FIXME: respect static_state->mag_img_filter */;
+   /* FIXME: respect static_state->prefilter */;
+   /* A no-op unless static_state->compare_mode is set. */
+   lp_build_sample_compare(&bld, p, texel);
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.c b/src/gallium/drivers/llvmpipe/lp_bld_struct.c
index 14d2b10df9c..3998ac374fe 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_struct.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_struct.c
@@ -42,17 +42,30 @@
 
 
 LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder,
+                        LLVMValueRef ptr,
+                        unsigned member,
+                        const char *name)
+{
+   LLVMValueRef indices[2];
+   LLVMValueRef member_ptr;
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0);
+   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   lp_build_name(member_ptr, "%s.%s_ptr", LLVMGetValueName(ptr), name);
+   return member_ptr;
+}
+
+
+LLVMValueRef
 lp_build_struct_get(LLVMBuilderRef builder,
                     LLVMValueRef ptr,
                     unsigned member,
                     const char *name)
 {
-   LLVMValueRef indices[2];
    LLVMValueRef member_ptr;
    LLVMValueRef res;
-   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0);
-   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   member_ptr = lp_build_struct_get_ptr(builder, ptr, member, name);
    res = LLVMBuildLoad(builder, member_ptr, "");
    lp_build_name(res, "%s.%s", LLVMGetValueName(ptr), name);
    return res;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_struct.h b/src/gallium/drivers/llvmpipe/lp_bld_struct.h
index cbefdc9f815..740392f5611 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_struct.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_struct.h
@@ -53,6 +53,18 @@
              offsetof(_ctype, _cmember))
 
 
+/**
+ * Get value pointer to a structure member.
+ */
+LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder,
+                        LLVMValueRef ptr,
+                        unsigned member,
+                        const char *name);
+
+/**
+ * Get the value of a structure member.
+ */
 LLVMValueRef
 lp_build_struct_get(LLVMBuilderRef builder,
                     LLVMValueRef ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
index ac7eed9379a..64e81f7b1fe 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.c
@@ -64,7 +64,7 @@ LLVMValueRef
 lp_build_broadcast_scalar(struct lp_build_context *bld,
                           LLVMValueRef scalar)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    LLVMValueRef res;
    unsigned i;
 
@@ -83,7 +83,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
                        LLVMValueRef a,
                        unsigned channel)
 {
-   const union lp_type type = bld->type;
+   const struct lp_type type = bld->type;
    const unsigned n = type.length;
    unsigned i, j;
 
@@ -115,7 +115,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
        *   YY00 YY00 .... YY00
        *   YYYY YYYY .... YYYY  <= output
        */
-      union lp_type type4 = type;
+      struct lp_type type4 = type;
       const char shifts[4][2] = {
          { 1,  2},
          {-1,  2},
@@ -161,7 +161,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_swizzle1_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
-                      unsigned char swizzle[4])
+                      const unsigned char swizzle[4])
 {
    const unsigned n = bld->type.length;
    unsigned i, j;
@@ -192,7 +192,7 @@ LLVMValueRef
 lp_build_swizzle2_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
                       LLVMValueRef b,
-                      unsigned char swizzle[4])
+                      const unsigned char swizzle[4])
 {
    const unsigned n = bld->type.length;
    unsigned i, j;
@@ -201,11 +201,12 @@ lp_build_swizzle2_aos(struct lp_build_context *bld,
       return lp_build_swizzle1_aos(bld, a, swizzle);
 
    if(a == b) {
-      swizzle[0] %= 4;
-      swizzle[1] %= 4;
-      swizzle[2] %= 4;
-      swizzle[3] %= 4;
-      return lp_build_swizzle1_aos(bld, a, swizzle);
+      unsigned char swizzle1[4];
+      swizzle1[0] = swizzle[0] % 4;
+      swizzle1[1] = swizzle[1] % 4;
+      swizzle1[2] = swizzle[2] % 4;
+      swizzle1[3] = swizzle[3] % 4;
+      return lp_build_swizzle1_aos(bld, a, swizzle1);
    }
 
    if(swizzle[0] % 4 == 0 &&
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
index d7dd6a8a604..b9472127a63 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_swizzle.h
@@ -40,7 +40,7 @@
 #include <llvm-c/Core.h>  
 
 
-union lp_type type;
+struct lp_type;
 struct lp_build_context;
 
 
@@ -73,7 +73,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_swizzle1_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
-                      unsigned char swizzle[4]);
+                      const unsigned char swizzle[4]);
 
 
 /**
@@ -85,7 +85,7 @@ LLVMValueRef
 lp_build_swizzle2_aos(struct lp_build_context *bld,
                       LLVMValueRef a,
                       LLVMValueRef b,
-                      unsigned char swizzle[4]);
+                      const unsigned char swizzle[4]);
 
 
 #endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
index 912db24aecb..eddb7a83fa2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi.h
@@ -39,31 +39,46 @@
 
 
 struct tgsi_token;
-union lp_type;
+struct lp_type;
 struct lp_build_context;
 struct lp_build_mask_context;
 
 
-typedef void
-(*lp_emit_fetch_texel_soa_callback)( LLVMBuilderRef builder,
-                                void *context,
-                                unsigned unit,
-                                unsigned num_coords,
-                                const LLVMValueRef *coords,
-                                LLVMValueRef lodbias,
-                                LLVMValueRef *texel);
+/**
+ * Sampler code generation interface.
+ *
+ * Although texture sampling is a requirement for TGSI translation, it is
+ * a very different problem with several different approaches to it. This
+ * structure establishes an interface for texture sampling code generation, so
+ * that we can easily use different texture sampling strategies.
+ */
+struct lp_build_sampler_soa
+{
+   void
+   (*destroy)( struct lp_build_sampler_soa *sampler );
+
+   void
+   (*emit_fetch_texel)( struct lp_build_sampler_soa *sampler,
+                        LLVMBuilderRef builder,
+                        struct lp_type type,
+                        unsigned unit,
+                        unsigned num_coords,
+                        const LLVMValueRef *coords,
+                        LLVMValueRef lodbias,
+                        LLVMValueRef *texel);
+};
+
 
 void
 lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const struct tgsi_token *tokens,
-                  union lp_type type,
+                  struct lp_type type,
                   struct lp_build_mask_context *mask,
                   LLVMValueRef consts_ptr,
                   const LLVMValueRef *pos,
                   const LLVMValueRef (*inputs)[4],
                   LLVMValueRef (*outputs)[4],
-                  lp_emit_fetch_texel_soa_callback emit_fetch_texel,
-                  void *emit_fetch_texel_context);
+                  struct lp_build_sampler_soa *sampler);
 
 
 #endif /* LP_BLD_TGSI_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
index d4d18febec7..64027de6aa9 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -78,6 +78,11 @@
 #define CHAN_Z 2
 #define CHAN_W 3
 
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+
 
 struct lp_build_tgsi_soa_context
 {
@@ -88,8 +93,7 @@ struct lp_build_tgsi_soa_context
    const LLVMValueRef (*inputs)[NUM_CHANNELS];
    LLVMValueRef (*outputs)[NUM_CHANNELS];
 
-   lp_emit_fetch_texel_soa_callback emit_fetch_texel;
-   void *emit_fetch_texel_context;
+   struct lp_build_sampler_soa *sampler;
 
    LLVMValueRef immediates[LP_MAX_IMMEDIATES][NUM_CHANNELS];
    LLVMValueRef temps[LP_MAX_TEMPS][NUM_CHANNELS];
@@ -98,6 +102,51 @@ struct lp_build_tgsi_soa_context
 };
 
 
+static const unsigned char
+swizzle_left[4] = {
+   QUAD_TOP_LEFT,     QUAD_TOP_LEFT,
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_LEFT
+};
+
+static const unsigned char
+swizzle_right[4] = {
+   QUAD_TOP_RIGHT,    QUAD_TOP_RIGHT,
+   QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT
+};
+
+static const unsigned char
+swizzle_top[4] = {
+   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT,
+   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT
+};
+
+static const unsigned char
+swizzle_bottom[4] = {
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT,
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT
+};
+
+
+static LLVMValueRef
+emit_ddx(struct lp_build_tgsi_soa_context *bld, LLVMValueRef src)
+{
+   /* Horizontal difference: right-swizzled value minus left-swizzled value. */
+   struct lp_build_context *base = &bld->base;
+   LLVMValueRef left = lp_build_swizzle1_aos(base, src, swizzle_left);
+   return lp_build_sub(base, lp_build_swizzle1_aos(base, src, swizzle_right), left);
+}
+
+
+static LLVMValueRef
+emit_ddy(struct lp_build_tgsi_soa_context *bld, LLVMValueRef src)
+{
+   /* Vertical difference: top-swizzled value minus bottom-swizzled value. */
+   struct lp_build_context *base = &bld->base;
+   LLVMValueRef top = lp_build_swizzle1_aos(base, src, swizzle_top);
+   return lp_build_sub(base, top, lp_build_swizzle1_aos(base, src, swizzle_bottom));
+}
+
+
 /**
  * Register fetch.
  */
@@ -109,14 +158,14 @@ emit_fetch(
    const unsigned chan_index )
 {
    const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[index];
-   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
    LLVMValueRef res;
 
    switch (swizzle) {
-   case TGSI_EXTSWIZZLE_X:
-   case TGSI_EXTSWIZZLE_Y:
-   case TGSI_EXTSWIZZLE_Z:
-   case TGSI_EXTSWIZZLE_W:
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
 
       switch (reg->SrcRegister.File) {
       case TGSI_FILE_CONSTANT: {
@@ -149,14 +198,6 @@ emit_fetch(
       }
       break;
 
-   case TGSI_EXTSWIZZLE_ZERO:
-      res = bld->base.zero;
-      break;
-
-   case TGSI_EXTSWIZZLE_ONE:
-      res = bld->base.one;
-      break;
-
    default:
       assert( 0 );
       return bld->base.undef;
@@ -168,6 +209,7 @@ emit_fetch(
       break;
 
    case TGSI_UTIL_SIGN_SET:
+      /* TODO: Use bitwise OR for floating point */
       res = lp_build_abs( &bld->base, res );
       res = LLVMBuildNeg( bld->base.builder, res, "" );
       break;
@@ -185,6 +227,36 @@ emit_fetch(
 
 
 /**
+ * Register fetch with derivatives.
+ */
+static void
+emit_fetch_deriv(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   const unsigned chan_index,
+   LLVMValueRef *res,
+   LLVMValueRef *ddx,
+   LLVMValueRef *ddy)
+{
+   /* Fetch once; every requested output is derived from this value. */
+   const LLVMValueRef value = emit_fetch(bld, inst, index, chan_index);
+
+   /* Each output pointer is optional: NULL means "not wanted". */
+   if (res != NULL)
+      *res = value;
+
+   /* TODO: use interpolation coeffs for inputs */
+
+   if (ddx != NULL)
+      *ddx = emit_ddx(bld, value);
+
+   if (ddy != NULL)
+      *ddy = emit_ddy(bld, value);
+}
+
+
+/**
  * Register store.
  */
 static void
@@ -239,17 +311,18 @@ emit_store(
  * High-level instruction translators.
  */
 
+
 static void
 emit_tex( struct lp_build_tgsi_soa_context *bld,
           const struct tgsi_full_instruction *inst,
           boolean apply_lodbias,
-          boolean projected)
+          boolean projected,
+          LLVMValueRef *texel)
 {
    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
    LLVMValueRef lodbias;
    LLVMValueRef oow;
    LLVMValueRef coords[3];
-   LLVMValueRef texel[4];
    unsigned num_coords;
    unsigned i;
 
@@ -289,12 +362,11 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
          coords[i] = lp_build_mul(&bld->base, coords[i], oow);
    }
 
-   bld->emit_fetch_texel(bld->base.builder, bld->emit_fetch_texel_context,
-                         unit, num_coords, coords, lodbias, texel);
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( inst, i ) {
-      emit_store( bld, inst, 0, i, texel[i] );
-   }
+   bld->sampler->emit_fetch_texel(bld->sampler,
+                                  bld->base.builder,
+                                  bld->base.type,
+                                  unit, num_coords, coords, lodbias,
+                                  texel);
 }
 
 
@@ -314,12 +386,7 @@ emit_kil(
       unsigned swizzle;
 
       /* Unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
-
-      /* Note that we test if the value is less than zero, so 1.0 and 0.0 need
-       * not to be tested. */
-      if(swizzle == TGSI_EXTSWIZZLE_ZERO || swizzle == TGSI_EXTSWIZZLE_ONE)
-         continue;
+      swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
 
       /* Check if the component has not been already tested. */
       assert(swizzle < NUM_CHANNELS);
@@ -347,14 +414,6 @@ emit_kil(
 }
 
 
-static void
-emit_kilp(
-   struct lp_build_tgsi_soa_context *bld )
-{
-   /* XXX todo / fix me */
-}
-
-
 /**
  * Check if inst src/dest regs use indirect addressing into temporary
  * register file.
@@ -382,45 +441,52 @@ indirect_temp_reference(const struct tgsi_full_instruction *inst)
 static int
 emit_instruction(
    struct lp_build_tgsi_soa_context *bld,
-   struct tgsi_full_instruction *inst )
+   const struct tgsi_full_instruction *inst,
+   const struct tgsi_opcode_info *info)
 {
    unsigned chan_index;
    LLVMValueRef src0, src1, src2;
    LLVMValueRef tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-   LLVMValueRef dst0;
+   LLVMValueRef res;
+   LLVMValueRef dst0[NUM_CHANNELS];
 
    /* we can't handle indirect addressing into temp register file yet */
    if (indirect_temp_reference(inst))
       return FALSE;
 
+   assert(info->num_dst <= 1);
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.undef;
+      }
+   }
+
    switch (inst->Instruction.Opcode) {
 #if 0
    case TGSI_OPCODE_ARL:
+      /* FIXME */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
          emit_flr(bld, 0, 0);
          emit_f2it( bld, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 #endif
 
    case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_SWZ:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
       }
       break;
 
    case TGSI_OPCODE_LIT:
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ) {
-         emit_store( bld, inst, 0, CHAN_X, bld->base.one);
+         dst0[CHAN_X] = bld->base.one;
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
          src0 = emit_fetch( bld, inst, 0, CHAN_X );
-         dst0 = lp_build_max( &bld->base, src0, bld->base.zero);
-         emit_store( bld, inst, 0, CHAN_Y, dst0);
+         dst0[CHAN_Y] = lp_build_max( &bld->base, src0, bld->base.zero);
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
          /* XMM[1] = SrcReg[0].yyyy */
@@ -432,20 +498,19 @@ emit_instruction(
          tmp1 = lp_build_pow( &bld->base, tmp1, tmp2);
          tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
          tmp2 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, tmp0, bld->base.zero);
-         dst0 = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
-         emit_store( bld, inst, 0, CHAN_Z, dst0);
+         dst0[CHAN_Z] = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) ) {
-         emit_store( bld, inst, 0, CHAN_W, bld->base.one);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
    case TGSI_OPCODE_RCP:
    /* TGSI_OPCODE_RECIP */
       src0 = emit_fetch( bld, inst, 0, CHAN_X );
-      dst0 = lp_build_rcp(&bld->base, src0);
+      res = lp_build_rcp(&bld->base, src0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = res;
       }
       break;
 
@@ -453,9 +518,9 @@ emit_instruction(
    /* TGSI_OPCODE_RECIPSQRT */
       src0 = emit_fetch( bld, inst, 0, CHAN_X );
       src0 = lp_build_abs(&bld->base, src0);
-      dst0 = lp_build_rsqrt(&bld->base, src0);
+      res = lp_build_rsqrt(&bld->base, src0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = res;
       }
       break;
 
@@ -479,16 +544,15 @@ emit_instruction(
          lp_build_exp2_approx(&bld->base, src0, p_exp2_int_part, p_frac_part, p_exp2);
 
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            emit_store( bld, inst, 0, CHAN_X, tmp0);
+            dst0[CHAN_X] = tmp0;
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
-            emit_store( bld, inst, 0, CHAN_Y, tmp1);
+            dst0[CHAN_Y] = tmp1;
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            emit_store( bld, inst, 0, CHAN_Z, tmp2);
+            dst0[CHAN_Z] = tmp2;
       }
       /* dst.w = 1.0 */
       if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
@@ -514,20 +578,18 @@ emit_instruction(
 
          /* dst.x = floor(lg2(abs(src.x))) */
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
-            emit_store( bld, inst, 0, CHAN_X, tmp0);
+            dst0[CHAN_X] = tmp0;
          /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y )) {
-            tmp1 = lp_build_div( &bld->base, src0, tmp1);
-            emit_store( bld, inst, 0, CHAN_Y, tmp1);
+            dst0[CHAN_Y] = lp_build_div( &bld->base, src0, tmp1);
          }
          /* dst.z = lg2(abs(src.x)) */
          if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
-            emit_store( bld, inst, 0, CHAN_Z, tmp2);
+            dst0[CHAN_Z] = tmp2;
       }
       /* dst.w = 1.0 */
       if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
@@ -535,8 +597,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_mul(&bld->base, src0, src1);
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_mul(&bld->base, src0, src1);
       }
       break;
 
@@ -544,8 +605,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_add(&bld->base, src0, src1);
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_add(&bld->base, src0, src1);
       }
       break;
 
@@ -563,7 +623,7 @@ emit_instruction(
       tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -585,28 +645,24 @@ emit_instruction(
       tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_DST:
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_X, tmp0);
+         dst0[CHAN_X] = bld->base.one;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
          tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
          tmp1 = emit_fetch( bld, inst, 1, CHAN_Y );
-         tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
-         emit_store( bld, inst, 0, CHAN_Y, tmp0);
+         dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp0, tmp1);
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
-         tmp0 = emit_fetch( bld, inst, 0, CHAN_Z );
-         emit_store( bld, inst, 0, CHAN_Z, tmp0);
+         dst0[CHAN_Z] = emit_fetch( bld, inst, 0, CHAN_Z );
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         tmp0 = emit_fetch( bld, inst, 1, CHAN_W );
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = emit_fetch( bld, inst, 1, CHAN_W );
       }
       break;
 
@@ -614,8 +670,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_min( &bld->base, src0, src1 );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_min( &bld->base, src0, src1 );
       }
       break;
 
@@ -623,8 +678,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
-         dst0 = lp_build_max( &bld->base, src0, src1 );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_max( &bld->base, src0, src1 );
       }
       break;
 
@@ -634,8 +688,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -645,8 +698,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GEQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -658,7 +710,7 @@ emit_instruction(
          tmp2 = emit_fetch( bld, inst, 2, chan_index );
          tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
          tmp0 = lp_build_add( &bld->base, tmp0, tmp2);
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -666,8 +718,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
          tmp1 = emit_fetch( bld, inst, 1, chan_index );
-         tmp0 = lp_build_sub( &bld->base, tmp0, tmp1);
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_sub( &bld->base, tmp0, tmp1);
       }
       break;
 
@@ -678,13 +729,19 @@ emit_instruction(
          src2 = emit_fetch( bld, inst, 2, chan_index );
          tmp0 = lp_build_sub( &bld->base, src1, src2 );
          tmp0 = lp_build_mul( &bld->base, src0, tmp0 );
-         dst0 = lp_build_add( &bld->base, tmp0, src2 );
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = lp_build_add( &bld->base, tmp0, src2 );
       }
       break;
 
    case TGSI_OPCODE_CND:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp1 = lp_build_const_scalar(bld->base.type, 0.5);
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
+      }
       break;
 
    case TGSI_OPCODE_DP2A:
@@ -698,45 +755,49 @@ emit_instruction(
       tmp1 = emit_fetch( bld, inst, 2, CHAN_X );  /* xmm1 = src[2].x */
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);  /* dest[ch] = xmm0 */
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
       }
       break;
 
-#if 0
    case TGSI_OPCODE_FRC:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_frc( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         tmp0 = lp_build_floor(&bld->base, src0);
+         tmp0 = lp_build_sub(&bld->base, src0, tmp0);  /* frac(x) = x - floor(x), not floor(x) - x */
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_CLAMP:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_max(&bld->base, tmp0, src1);
+         tmp0 = lp_build_min(&bld->base, tmp0, src2);
+         dst0[chan_index] = tmp0;
+      }
       break;
 
    case TGSI_OPCODE_FLR:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_flr( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_floor(&bld->base, tmp0);
       }
       break;
 
    case TGSI_OPCODE_ROUND:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_rnd( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_round(&bld->base, tmp0);
       }
       break;
-#endif
 
    case TGSI_OPCODE_EX2: {
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_exp2( &bld->base, tmp0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
    }
@@ -745,16 +806,16 @@ emit_instruction(
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_log2( &bld->base, tmp0);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_POW:
       src0 = emit_fetch( bld, inst, 0, CHAN_X );
       src1 = emit_fetch( bld, inst, 1, CHAN_X );
-      dst0 = lp_build_pow( &bld->base, src0, src1 );
+      res = lp_build_pow( &bld->base, src0, src1 );
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0 );
+         dst0[chan_index] = res;
       }
       break;
 
@@ -775,7 +836,7 @@ emit_instruction(
          tmp5 = tmp3;
          tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
          tmp2 = lp_build_sub( &bld->base, tmp2, tmp5);
-         emit_store( bld, inst, 0, CHAN_X, tmp2);
+         dst0[CHAN_X] = tmp2;
       }
       if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
           IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
@@ -786,31 +847,30 @@ emit_instruction(
          tmp3 = lp_build_mul( &bld->base, tmp3, tmp2);
          tmp1 = lp_build_mul( &bld->base, tmp1, tmp5);
          tmp3 = lp_build_sub( &bld->base, tmp3, tmp1);
-         emit_store( bld, inst, 0, CHAN_Y, tmp3);
+         dst0[CHAN_Y] = tmp3;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
          tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
          tmp0 = lp_build_mul( &bld->base, tmp0, tmp2);
          tmp5 = lp_build_sub( &bld->base, tmp5, tmp0);
-         emit_store( bld, inst, 0, CHAN_Z, tmp5);
+         dst0[CHAN_Z] = tmp5;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
    case TGSI_OPCODE_ABS:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         tmp0 = lp_build_abs( &bld->base, tmp0 ) ;
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_abs( &bld->base, tmp0 );
       }
       break;
 
    case TGSI_OPCODE_RCC:
+      /* deprecated? */
+      assert(0);
       return 0;
-      break;
 
    case TGSI_OPCODE_DPH:
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
@@ -827,7 +887,7 @@ emit_instruction(
       tmp1 = emit_fetch( bld, inst, 1, CHAN_W );
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -835,25 +895,27 @@ emit_instruction(
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_cos( &bld->base, tmp0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
    case TGSI_OPCODE_DDX:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, &dst0[chan_index], NULL);
+      }
       break;
 
    case TGSI_OPCODE_DDY:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, NULL, &dst0[chan_index]);
+      }
       break;
 
-#if 0
    case TGSI_OPCODE_KILP:
       /* predicated kill */
-      emit_kilp( bld );
-      return 0; /* XXX fix me */
+      /* FIXME */
+      return 0;
       break;
-#endif
 
    case TGSI_OPCODE_KIL:
       /* conditional kill */
@@ -885,13 +947,14 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_EQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
    case TGSI_OPCODE_SFL:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.zero;
+      }
       break;
 
    case TGSI_OPCODE_SGT:
@@ -899,8 +962,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -908,7 +970,7 @@ emit_instruction(
       tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
       tmp0 = lp_build_sin( &bld->base, tmp0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 
@@ -917,8 +979,7 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LEQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
@@ -927,85 +988,99 @@ emit_instruction(
          src0 = emit_fetch( bld, inst, 0, chan_index );
          src1 = emit_fetch( bld, inst, 1, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_NOTEQUAL, src0, src1 );
-         dst0 = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
       }
       break;
 
    case TGSI_OPCODE_STR:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.one;
+      }
       break;
 
    case TGSI_OPCODE_TEX:
-      emit_tex( bld, inst, FALSE, FALSE );
+      emit_tex( bld, inst, FALSE, FALSE, dst0 );
       break;
 
    case TGSI_OPCODE_TXD:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_UP2H:
+      /* deprecated */
+      assert (0);
       return 0;
       break;
 
    case TGSI_OPCODE_UP2US:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_UP4B:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_UP4UB:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_X2D:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_ARA:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
 #if 0
    case TGSI_OPCODE_ARR:
+      /* FIXME */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
          emit_rnd( bld, 0, 0 );
          emit_f2it( bld, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = tmp0;
       }
       break;
 #endif
 
    case TGSI_OPCODE_BRA:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_CAL:
+      /* FIXME */
       return 0;
       break;
 
-#if 0
    case TGSI_OPCODE_RET:
-      emit_ret( bld );
+      /* FIXME */
+      return 0;
       break;
-#endif
 
    case TGSI_OPCODE_END:
       break;
 
-#if 0
    case TGSI_OPCODE_SSG:
    /* TGSI_OPCODE_SGN */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_sgn( bld, 0, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_sgn( &bld->base, tmp0 );
       }
       break;
-#endif
 
    case TGSI_OPCODE_CMP:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
@@ -1013,34 +1088,29 @@ emit_instruction(
          src1 = emit_fetch( bld, inst, 1, chan_index );
          src2 = emit_fetch( bld, inst, 2, chan_index );
          tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, bld->base.zero );
-         dst0 = lp_build_select( &bld->base, tmp0, src1, src2);
-         emit_store( bld, inst, 0, chan_index, dst0);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src1, src2);
       }
       break;
 
    case TGSI_OPCODE_SCS:
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
          tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-         tmp0 = lp_build_cos( &bld->base, tmp0 );
-         emit_store( bld, inst, 0, CHAN_X, tmp0);
+         dst0[CHAN_X] = lp_build_cos( &bld->base, tmp0 );
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
          tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
-         tmp0 = lp_build_sin( &bld->base, tmp0 );
-         emit_store( bld, inst, 0, CHAN_Y, tmp0);
+         dst0[CHAN_Y] = lp_build_sin( &bld->base, tmp0 );
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
-         tmp0 = bld->base.zero;
-         emit_store( bld, inst, 0, CHAN_Z, tmp0);
+         dst0[CHAN_Z] = bld->base.zero;
       }
       IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
-         tmp0 = bld->base.one;
-         emit_store( bld, inst, 0, CHAN_W, tmp0);
+         dst0[CHAN_W] = bld->base.one;
       }
       break;
 
    case TGSI_OPCODE_TXB:
-      emit_tex( bld, inst, TRUE, FALSE );
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
       break;
 
    case TGSI_OPCODE_NRM:
@@ -1099,38 +1169,35 @@ emit_instruction(
 
             /* dst.x = xmm1 * src.x */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
-               tmp4 = lp_build_mul( &bld->base, tmp4, tmp1);
-               emit_store(bld, inst, 0, CHAN_X, tmp4);
+               dst0[CHAN_X] = lp_build_mul( &bld->base, tmp4, tmp1);
             }
 
             /* dst.y = xmm1 * src.y */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
-               tmp5 = lp_build_mul( &bld->base, tmp5, tmp1);
-               emit_store(bld, inst, 0, CHAN_Y, tmp5);
+               dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp5, tmp1);
             }
 
             /* dst.z = xmm1 * src.z */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
-               tmp6 = lp_build_mul( &bld->base, tmp6, tmp1);
-               emit_store(bld, inst, 0, CHAN_Z, tmp6);
+               dst0[CHAN_Z] = lp_build_mul( &bld->base, tmp6, tmp1);
             }
 
             /* dst.w = xmm1 * src.w */
             if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) && dims == 4) {
-               tmp7 = lp_build_mul( &bld->base, tmp7, tmp1);
-               emit_store(bld, inst, 0, CHAN_W, tmp7);
+               dst0[CHAN_W] = lp_build_mul( &bld->base, tmp7, tmp1);
             }
          }
 
-         /* dst0.w = 1.0 */
+         /* dst.w = 1.0 */
          if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 3) {
-            tmp0 = bld->base.one;
-            emit_store(bld, inst, 0, CHAN_W, tmp0);
+            dst0[CHAN_W] = bld->base.one;
          }
       }
       break;
 
    case TGSI_OPCODE_DIV:
+      /* deprecated */
+      assert( 0 );
       return 0;
       break;
 
@@ -1143,118 +1210,157 @@ emit_instruction(
       tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
       tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, tmp0);  /* dest[ch] = xmm0 */
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
       }
       break;
 
    case TGSI_OPCODE_TXL:
-      emit_tex( bld, inst, TRUE, FALSE );
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
       break;
 
    case TGSI_OPCODE_TXP:
-      emit_tex( bld, inst, FALSE, TRUE );
+      emit_tex( bld, inst, FALSE, TRUE, dst0 );
       break;
       
    case TGSI_OPCODE_BRK:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_IF:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_BGNFOR:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_REP:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_ELSE:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_ENDIF:
+      /* FIXME */
       return 0;
       break;
 
    case TGSI_OPCODE_ENDFOR:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_ENDREP:
+      /* deprecated */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_PUSHA:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_POPA:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_CEIL:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_ceil(&bld->base, tmp0);
+      }
       break;
 
    case TGSI_OPCODE_I2F:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_NOT:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
-#if 0
    case TGSI_OPCODE_TRUNC:
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          tmp0 = emit_fetch( bld, inst, 0, chan_index );
-         emit_f2it( bld, 0 );
-         emit_i2f( bld, 0 );
-         emit_store( bld, inst, 0, chan_index, tmp0);
+         dst0[chan_index] = lp_build_trunc(&bld->base, tmp0);
       }
       break;
-#endif
 
    case TGSI_OPCODE_SHL:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_SHR:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_AND:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_OR:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_MOD:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_XOR:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_SAD:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_TXF:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_TXQ:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
    case TGSI_OPCODE_CONT:
+      /* deprecated? */
+      assert(0);
       return 0;
       break;
 
@@ -1266,10 +1372,19 @@ emit_instruction(
       return 0;
       break;
 
+   case TGSI_OPCODE_NOP:
+      break;
+
    default:
       return 0;
    }
    
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_store( bld, inst, 0, chan_index, dst0[chan_index]);
+      }
+   }
+
    return 1;
 }
 
@@ -1277,14 +1392,13 @@ emit_instruction(
 void
 lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const struct tgsi_token *tokens,
-                  union lp_type type,
+                  struct lp_type type,
                   struct lp_build_mask_context *mask,
                   LLVMValueRef consts_ptr,
                   const LLVMValueRef *pos,
                   const LLVMValueRef (*inputs)[NUM_CHANNELS],
                   LLVMValueRef (*outputs)[NUM_CHANNELS],
-                  lp_emit_fetch_texel_soa_callback emit_fetch_texel,
-                  void *emit_fetch_texel_context)
+                  struct lp_build_sampler_soa *sampler)
 {
    struct lp_build_tgsi_soa_context bld;
    struct tgsi_parse_context parse;
@@ -1299,8 +1413,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
    bld.inputs = inputs;
    bld.outputs = outputs;
    bld.consts_ptr = consts_ptr;
-   bld.emit_fetch_texel = emit_fetch_texel;
-   bld.emit_fetch_texel_context = emit_fetch_texel_context;
+   bld.sampler = sampler;
 
    tgsi_parse_init( &parse, tokens );
 
@@ -1309,16 +1422,18 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
 
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
-         /* Input already interpolated */
+         /* Inputs already interpolated */
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         if (!emit_instruction( &bld, &parse.FullToken.FullInstruction )) {
+         {
             unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
             const struct tgsi_opcode_info *info = tgsi_get_opcode_info(opcode);
-	    _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
-	                  info ? info->mnemonic : "<invalid>");
-	 }
+            if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, info ))
+               _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
+                             info ? info->mnemonic : "<invalid>");
+         }
+
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.c b/src/gallium/drivers/llvmpipe/lp_bld_type.c
index 8e0026fd973..1320a267214 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.c
@@ -33,7 +33,7 @@
 
 
 LLVMTypeRef
-lp_build_elem_type(union lp_type type)
+lp_build_elem_type(struct lp_type type)
 {
    if (type.floating) {
       switch(type.width) {
@@ -55,7 +55,7 @@ lp_build_elem_type(union lp_type type)
 
 
 LLVMTypeRef
-lp_build_vec_type(union lp_type type)
+lp_build_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
    return LLVMVectorType(elem_type, type.length);
@@ -69,7 +69,7 @@ lp_build_vec_type(union lp_type type)
  * type and check for identity.
  */
 boolean
-lp_check_elem_type(union lp_type type, LLVMTypeRef elem_type) 
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type) 
 {
    LLVMTypeKind elem_kind;
 
@@ -107,7 +107,7 @@ lp_check_elem_type(union lp_type type, LLVMTypeRef elem_type)
 
 
 boolean
-lp_check_vec_type(union lp_type type, LLVMTypeRef vec_type) 
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type) 
 {
    LLVMTypeRef elem_type;
 
@@ -128,7 +128,7 @@ lp_check_vec_type(union lp_type type, LLVMTypeRef vec_type)
 
 
 boolean
-lp_check_value(union lp_type type, LLVMValueRef val) 
+lp_check_value(struct lp_type type, LLVMValueRef val) 
 {
    LLVMTypeRef vec_type;
 
@@ -143,24 +143,55 @@ lp_check_value(union lp_type type, LLVMValueRef val)
 
 
 LLVMTypeRef
-lp_build_int_elem_type(union lp_type type)
+lp_build_int_elem_type(struct lp_type type)
 {
    return LLVMIntType(type.width);
 }
 
 
 LLVMTypeRef
-lp_build_int_vec_type(union lp_type type)
+lp_build_int_vec_type(struct lp_type type)
 {
    LLVMTypeRef elem_type = lp_build_int_elem_type(type);
    return LLVMVectorType(elem_type, type.length);
 }
 
 
+struct lp_type
+lp_int_type(struct lp_type type)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = type.width;
+   res_type.length = type.length;
+
+   return res_type;
+}
+
+
+/**
+ * Return the type with twice the bit width (hence half the number of elements).
+ */
+struct lp_type
+lp_wider_type(struct lp_type type)
+{
+   struct lp_type res_type;
+
+   memcpy(&res_type, &type, sizeof res_type);
+   res_type.width *= 2;
+   res_type.length /= 2;
+
+   assert(res_type.length);
+
+   return res_type;
+}
+
+
 void
 lp_build_context_init(struct lp_build_context *bld,
                       LLVMBuilderRef builder,
-                      union lp_type type)
+                      struct lp_type type)
 {
    bld->builder = builder;
    bld->type = type;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_type.h b/src/gallium/drivers/llvmpipe/lp_bld_type.h
index 3ce566be641..2fb233d335f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_type.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_type.h
@@ -43,71 +43,73 @@
 
 
 /**
+ * Native SIMD register width.
+ *
+ * 128 for all architectures we care about.
+ */
+#define LP_NATIVE_VECTOR_WIDTH 128
+
+/**
  * Several functions can only cope with vectors of length up to this value.
  * You may need to increase that value if you want to represent bigger vectors.
  */
 #define LP_MAX_VECTOR_LENGTH 16
 
-#define LP_MAX_TYPE_WIDTH 64
-
 
 /**
  * The LLVM type system can't conveniently express all the things we care about
  * on the types used for intermediate computations, such as signed vs unsigned,
  * normalized values, or fixed point.
  */
-union lp_type {
-   struct {
-      /** 
-       * Floating-point. Cannot be used with fixed. Integer numbers are
-       * represented by this zero.
-       */
-      unsigned floating:1;
-
-      /** 
-       * Fixed-point. Cannot be used with floating. Integer numbers are
-       * represented by this zero.
-       */
-      unsigned fixed:1;
-      
-      /** 
-       * Whether it can represent negative values or not.
-       *
-       * If this is not set for floating point, it means that all values are
-       * assumed to be positive.
-       */
-      unsigned sign:1;
-
-      /**
-       * Whether values are normalized to fit [0, 1] interval, or [-1, 1]
-       * interval for signed types.
-       *
-       * For integer types it means the representable integer range should be
-       * interpreted as the interval above.
-       *
-       * For floating and fixed point formats it means the values should be
-       * clamped to the interval above.
-       */
-      unsigned norm:1;
-
-      /**
-       * Element width.
-       *
-       * For fixed point values, the fixed point is assumed to be at half the
-       * width.
-       */
-      unsigned width:14;
-
-      /** 
-       * Vector length.
-       *
-       * width*length should be a power of two greater or equal to eight.
-       *
-       * @sa LP_MAX_VECTOR_LENGTH
-       */
-      unsigned length:14;
-   };
-   uint32_t value;
+struct lp_type {
+   /**
+    * Floating-point. Cannot be used with fixed. Integer numbers are
+    * represented by this being zero.
+    */
+   unsigned floating:1;
+
+   /**
+    * Fixed-point. Cannot be used with floating. Integer numbers are
+    * represented by this being zero.
+    */
+   unsigned fixed:1;
+
+   /**
+    * Whether it can represent negative values or not.
+    *
+    * If this is not set for floating point, it means that all values are
+    * assumed to be positive.
+    */
+   unsigned sign:1;
+
+   /**
+    * Whether values are normalized to fit [0, 1] interval, or [-1, 1]
+    * interval for signed types.
+    *
+    * For integer types it means the representable integer range should be
+    * interpreted as the interval above.
+    *
+    * For floating and fixed point formats it means the values should be
+    * clamped to the interval above.
+    */
+   unsigned norm:1;
+
+   /**
+    * Element width.
+    *
+    * For fixed point values, the fixed point is assumed to be at half the
+    * width.
+    */
+   unsigned width:14;
+
+   /**
+    * Vector length.
+    *
+    * width*length should be a power of two greater than or equal to eight.
+    *
+    * @sa LP_MAX_VECTOR_LENGTH
+    */
+   unsigned length:14;
 };
 
 
@@ -124,7 +126,7 @@ struct lp_build_context
     * This not only describes the input/output LLVM types, but also whether
     * to normalize/clamp the results.
     */
-   union lp_type type;
+   struct lp_type type;
 
    /** Same as lp_build_undef(type) */
    LLVMValueRef undef;
@@ -137,38 +139,131 @@ struct lp_build_context
 };
 
 
+static INLINE struct lp_type
+lp_type_float(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.floating = TRUE;
+   res_type.sign = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_int(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.sign = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_uint(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_unorm(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.norm = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_fixed(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.sign = TRUE;
+   res_type.fixed = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_ufixed(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.fixed = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
 LLVMTypeRef
-lp_build_elem_type(union lp_type type);
+lp_build_elem_type(struct lp_type type);
 
 
 LLVMTypeRef
-lp_build_vec_type(union lp_type type);
+lp_build_vec_type(struct lp_type type);
 
 
 boolean
-lp_check_elem_type(union lp_type type, LLVMTypeRef elem_type);
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type);
 
 
 boolean
-lp_check_vec_type(union lp_type type, LLVMTypeRef vec_type);
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type);
 
 
 boolean
-lp_check_value(union lp_type type, LLVMValueRef val);
+lp_check_value(struct lp_type type, LLVMValueRef val);
 
 
 LLVMTypeRef
-lp_build_int_elem_type(union lp_type type);
+lp_build_int_elem_type(struct lp_type type);
 
 
 LLVMTypeRef
-lp_build_int_vec_type(union lp_type type);
+lp_build_int_vec_type(struct lp_type type);
+
+
+struct lp_type
+lp_int_type(struct lp_type type);
+
+
+struct lp_type
+lp_wider_type(struct lp_type type);
 
 
 void
 lp_build_context_init(struct lp_build_context *bld,
                       LLVMBuilderRef builder,
-                      union lp_type type);
+                      struct lp_type type);
 
 
 #endif /* !LP_BLD_TYPE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.c b/src/gallium/drivers/llvmpipe/lp_clear.c
index 580cca5b463..bdcff94b9bf 100644
--- a/src/gallium/drivers/llvmpipe/lp_clear.c
+++ b/src/gallium/drivers/llvmpipe/lp_clear.c
@@ -67,6 +67,7 @@ llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
          util_pack_color(rgba, ps->format, &cv);
          lp_tile_cache_clear(llvmpipe->cbuf_cache[i], rgba, cv);
       }
+      llvmpipe->dirty_render_cache = TRUE;
    }
 
    if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 233d1df0e10..57e71f3e986 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -31,13 +31,13 @@
  */
 
 #include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "lp_clear.h"
 #include "lp_context.h"
 #include "lp_flush.h"
-#include "lp_prim_setup.h"
 #include "lp_prim_vbuf.h"
 #include "lp_state.h"
 #include "lp_surface.h"
@@ -107,11 +107,16 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
    if (llvmpipe->draw)
       draw_destroy( llvmpipe->draw );
 
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       lp_destroy_tile_cache(llvmpipe->cbuf_cache[i]);
+      pipe_surface_reference(&llvmpipe->framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&llvmpipe->framebuffer.zsbuf, NULL);
 
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
       lp_destroy_tex_tile_cache(llvmpipe->tex_cache[i]);
+      pipe_texture_reference(&llvmpipe->texture[i], NULL);
+   }
 
    for (i = 0; i < Elements(llvmpipe->constants); i++) {
       if (llvmpipe->constants[i].buffer) {
@@ -141,8 +146,6 @@ llvmpipe_is_texture_referenced( struct pipe_context *pipe,
          return PIPE_REFERENCED_FOR_WRITE;
    }
    
-   /* FIXME: we also need to do the same for the texture cache */
-   
    return PIPE_UNREFERENCED;
 }
 
@@ -261,21 +264,21 @@ llvmpipe_create( struct pipe_screen *screen )
                          (struct tgsi_sampler **)
                             llvmpipe->tgsi.vert_samplers_list);
 
-   llvmpipe->setup = lp_draw_render_stage(llvmpipe);
-   if (!llvmpipe->setup)
-      goto fail;
-
    if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
       llvmpipe->no_rast = TRUE;
 
-   if (debug_get_bool_option( "LP_NO_VBUF", FALSE )) {
-      /* Deprecated path -- vbuf is the intended interface to the draw module:
-       */
-      draw_set_rasterize_stage(llvmpipe->draw, llvmpipe->setup);
-   }
-   else {
-      lp_init_vbuf(llvmpipe);
-   }
+   llvmpipe->vbuf_backend = lp_create_vbuf_backend(llvmpipe);
+   if (!llvmpipe->vbuf_backend)
+      goto fail;
+
+   llvmpipe->vbuf = draw_vbuf_stage(llvmpipe->draw, llvmpipe->vbuf_backend);
+   if (!llvmpipe->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(llvmpipe->draw, llvmpipe->vbuf);
+   draw_set_render(llvmpipe->draw, llvmpipe->vbuf_backend);
+
+
 
    /* plug in AA line/point stages */
    draw_install_aaline_stage(llvmpipe->draw, &llvmpipe->pipe);
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 8d5a0d4f1fc..3ad95d0bfc2 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -88,9 +88,6 @@ struct llvmpipe_context {
    /** Mapped vertex buffers */
    ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS];
    
-   /** Mapped constant buffers */
-   void *mapped_constants[PIPE_SHADER_TYPES];
-
    /** Vertex format */
    struct vertex_info vertex_info;
    struct vertex_info vertex_info_vbuf;
@@ -124,9 +121,10 @@ struct llvmpipe_context {
 
    /** The primitive drawing context */
    struct draw_context *draw;
-   struct draw_stage *setup;
+
+   /** Draw module backend */
+   struct vbuf_render *vbuf_backend;
    struct draw_stage *vbuf;
-   struct llvmpipe_vbuf_render *vbuf_render;
 
    boolean dirty_render_cache;
    
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index 89772e62d31..0aa13a1fc67 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -45,54 +45,6 @@
 
 
 
-static void
-llvmpipe_map_constant_buffers(struct llvmpipe_context *lp)
-{
-   struct pipe_screen *screen = lp->pipe.screen;
-   uint i, size;
-
-   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
-      if (lp->constants[i].buffer && lp->constants[i].buffer->size)
-         lp->mapped_constants[i] = screen->buffer_map(screen, lp->constants[i].buffer,
-                                                      PIPE_BUFFER_USAGE_CPU_READ);
-   }
-
-   if (lp->constants[PIPE_SHADER_VERTEX].buffer)
-      size = lp->constants[PIPE_SHADER_VERTEX].buffer->size;
-   else
-      size = 0;
-
-   lp->jit_context.constants = lp->mapped_constants[PIPE_SHADER_FRAGMENT];
-
-   draw_set_mapped_constant_buffer(lp->draw,
-                                   lp->mapped_constants[PIPE_SHADER_VERTEX],
-                                   size);
-}
-
-
-static void
-llvmpipe_unmap_constant_buffers(struct llvmpipe_context *lp)
-{
-   struct pipe_screen *screen = lp->pipe.screen;
-   uint i;
-
-   /* really need to flush all prims since the vert/frag shaders const buffers
-    * are going away now.
-    */
-   draw_flush(lp->draw);
-
-   draw_set_mapped_constant_buffer(lp->draw, NULL, 0);
-
-   lp->jit_context.constants = NULL;
-
-   for (i = 0; i < 2; i++) {
-      if (lp->constants[i].buffer && lp->constants[i].buffer->size)
-         screen->buffer_unmap(screen, lp->constants[i].buffer);
-      lp->mapped_constants[i] = NULL;
-   }
-}
-
-
 boolean
 llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
                      unsigned start, unsigned count)
@@ -124,7 +76,6 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
       llvmpipe_update_derived( lp );
 
    llvmpipe_map_transfers(lp);
-   llvmpipe_map_constant_buffers(lp);
 
    /*
     * Map vertex buffers
@@ -163,7 +114,6 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
 
 
    /* Note: leave drawing surfaces mapped */
-   llvmpipe_unmap_constant_buffers(lp);
 
    lp->dirty_render_cache = TRUE;
    
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
index b5c1c95bb73..cd8381fe308 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.c
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -58,8 +58,10 @@ llvmpipe_flush( struct pipe_context *pipe,
        * in the hope that a later clear will wipe them out.
        */
       for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++)
-         if (llvmpipe->cbuf_cache[i])
+         if (llvmpipe->cbuf_cache[i]) {
+            lp_tile_cache_map_transfers(llvmpipe->cbuf_cache[i]);
             lp_flush_tile_cache(llvmpipe->cbuf_cache[i]);
+         }
 
       /* Need this call for hardware buffers before swapbuffers.
        *
@@ -71,8 +73,10 @@ llvmpipe_flush( struct pipe_context *pipe,
    }
    else if (flags & PIPE_FLUSH_RENDER_CACHE) {
       for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++)
-         if (llvmpipe->cbuf_cache[i])
+         if (llvmpipe->cbuf_cache[i]) {
+            lp_tile_cache_map_transfers(llvmpipe->cbuf_cache[i]);
             lp_flush_tile_cache(llvmpipe->cbuf_cache[i]);
+         }
 
       /* FIXME: untile zsbuf! */
      
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index d288460a1b8..13535dd638e 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,23 +36,57 @@
 #include <llvm-c/Transforms/Scalar.h>
 
 #include "util/u_memory.h"
+#include "util/u_cpu_detect.h"
 #include "lp_screen.h"
 #include "lp_bld_intr.h"
+#include "lp_bld_misc.h"
 #include "lp_jit.h"
 
 
 static void
 lp_jit_init_globals(struct llvmpipe_screen *screen)
 {
-   /* struct lp_jit_context */
+   LLVMTypeRef texture_type;
+
+   /* struct lp_jit_texture */
    {
       LLVMTypeRef elem_types[4];
+
+      elem_types[LP_JIT_TEXTURE_WIDTH]  = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_HEIGHT] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_STRIDE] = LLVMInt32Type();
+      elem_types[LP_JIT_TEXTURE_DATA]   = LLVMPointerType(LLVMInt8Type(), 0);
+
+      texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
+
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, width,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_WIDTH);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, height,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_HEIGHT);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, stride,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_STRIDE);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, data,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_DATA);
+      LP_CHECK_STRUCT_SIZE(struct lp_jit_texture,
+                           screen->target, texture_type);
+
+      LLVMAddTypeName(screen->module, "texture", texture_type);
+   }
+
+   /* struct lp_jit_context */
+   {
+      LLVMTypeRef elem_types[5];
       LLVMTypeRef context_type;
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* constants */
       elem_types[1] = LLVMPointerType(LLVMInt8Type(), 0);  /* samplers */
       elem_types[2] = LLVMFloatType();                     /* alpha_ref_value */
       elem_types[3] = LLVMPointerType(LLVMInt8Type(), 0);  /* blend_color */
+      elem_types[4] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
@@ -64,6 +98,9 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
                              screen->target, context_type, 2);
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
                              screen->target, context_type, 3);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
+                             screen->target, context_type,
+                             LP_JIT_CONTEXT_TEXTURES_INDEX);
       LP_CHECK_STRUCT_SIZE(struct lp_jit_context,
                            screen->target, context_type);
 
@@ -112,12 +149,23 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
 {
    char *error = NULL;
 
+   util_cpu_detect();
+
+#if 0
+   /* For simulating less capable machines */
+   util_cpu_caps.has_sse3 = 0;
+   util_cpu_caps.has_sse4_1 = 0;
+#endif
+
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+
    screen->module = LLVMModuleCreateWithName("llvmpipe");
 
    screen->provider = LLVMCreateModuleProviderForExistingModule(screen->module);
 
    if (LLVMCreateJITCompiler(&screen->engine, screen->provider, 1, &error)) {
-      fprintf(stderr, "%s\n", error);
+      _debug_printf("%s\n", error);
       LLVMDisposeMessage(error);
       abort();
    }
@@ -128,8 +176,15 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
    LLVMAddTargetData(screen->target, screen->pass);
    /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
     * but there are more on SVN. */
+   /* TODO: Add more passes */
    LLVMAddConstantPropagationPass(screen->pass);
-   LLVMAddInstructionCombiningPass(screen->pass);
+   if(util_cpu_caps.has_sse4_1) {
+      /* FIXME: There is a bug in this pass, whereby the combination of fptosi
+       * and sitofp (necessary for trunc/floor/ceil/round implementation)
+       * somehow becomes invalid code, so only run it when SSE4.1 is present.
+       */
+      LLVMAddInstructionCombiningPass(screen->pass);
+   }
    LLVMAddPromoteMemoryToRegisterPass(screen->pass);
    LLVMAddGVNPass(screen->pass);
    LLVMAddCFGSimplificationPass(screen->pass);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index a7fb60f9f5c..58f716ede29 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -38,11 +38,31 @@
 
 #include "lp_bld_struct.h"
 
+#include "pipe/p_state.h"
+
 
 struct tgsi_sampler;
 struct llvmpipe_screen;
 
 
+struct lp_jit_texture
+{
+   uint32_t width;
+   uint32_t height;
+   uint32_t stride;
+   const void *data;
+};
+
+
+enum {
+   LP_JIT_TEXTURE_WIDTH = 0,
+   LP_JIT_TEXTURE_HEIGHT,
+   LP_JIT_TEXTURE_STRIDE,
+   LP_JIT_TEXTURE_DATA
+};
+
+
+
 /**
  * This structure is passed directly to the generated fragment shader.
  *
@@ -60,11 +80,12 @@ struct lp_jit_context
 
    struct tgsi_sampler **samplers;
 
-   /* TODO: alpha reference value */
    float alpha_ref_value;
 
-   /* TODO: blend constant color */
+   /* FIXME: store (also?) in floats */
    uint8_t *blend_color;
+
+   struct lp_jit_texture textures[PIPE_MAX_SAMPLERS];
 };
 
 
@@ -80,6 +101,11 @@ struct lp_jit_context
 #define lp_jit_context_blend_color(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 3, "blend_color")
 
+#define LP_JIT_CONTEXT_TEXTURES_INDEX 4
+
+#define lp_jit_context_textures(_builder, _ptr) \
+   lp_build_struct_get_ptr(_builder, _ptr, LP_JIT_CONTEXT_TEXTURES_INDEX, "textures")
+
 
 typedef void
 (*lp_jit_frag_func)(struct lp_jit_context *context,
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_setup.c b/src/gallium/drivers/llvmpipe/lp_prim_setup.c
deleted file mode 100644
index b14f8fb99d9..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_prim_setup.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief A draw stage that drives our triangle setup routines from
- * within the draw pipeline.  One of two ways to drive setup, the
- * other being in lp_prim_vbuf.c.
- *
- * \author  Keith Whitwell <keith@tungstengraphics.com>
- * \author  Brian Paul
- */
-
-
-#include "lp_context.h"
-#include "lp_setup.h"
-#include "lp_state.h"
-#include "lp_prim_setup.h"
-#include "draw/draw_pipe.h"
-#include "draw/draw_vertex.h"
-#include "util/u_memory.h"
-
-/**
- * Triangle setup info (derived from draw_stage).
- * Also used for line drawing (taking some liberties).
- */
-struct setup_stage {
-   struct draw_stage stage; /**< This must be first (base class) */
-
-   struct setup_context *setup;
-};
-
-
-
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-
-
-typedef const float (*cptrf4)[4];
-
-static void
-do_tri(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-   
-   llvmpipe_setup_tri( setup->setup,
-              (cptrf4)prim->v[0]->data,
-              (cptrf4)prim->v[1]->data,
-              (cptrf4)prim->v[2]->data );
-}
-
-static void
-do_line(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   llvmpipe_setup_line( setup->setup,
-               (cptrf4)prim->v[0]->data,
-               (cptrf4)prim->v[1]->data );
-}
-
-static void
-do_point(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   llvmpipe_setup_point( setup->setup,
-                (cptrf4)prim->v[0]->data );
-}
-
-
-
-
-static void setup_begin( struct draw_stage *stage )
-{
-   struct setup_stage *setup = setup_stage(stage);
-
-   llvmpipe_setup_prepare( setup->setup );
-
-   stage->point = do_point;
-   stage->line = do_line;
-   stage->tri = do_tri;
-}
-
-
-static void setup_first_point( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->point( stage, header );
-}
-
-static void setup_first_line( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->line( stage, header );
-}
-
-
-static void setup_first_tri( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->tri( stage, header );
-}
-
-
-
-static void setup_flush( struct draw_stage *stage,
-			 unsigned flags )
-{
-   stage->point = setup_first_point;
-   stage->line = setup_first_line;
-   stage->tri = setup_first_tri;
-}
-
-
-static void reset_stipple_counter( struct draw_stage *stage )
-{
-}
-
-
-static void render_destroy( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   llvmpipe_setup_destroy_context(ssetup->setup);
-   FREE( stage );
-}
-
-
-/**
- * Create a new primitive setup/render stage.
- */
-struct draw_stage *lp_draw_render_stage( struct llvmpipe_context *llvmpipe )
-{
-   struct setup_stage *sstage = CALLOC_STRUCT(setup_stage);
-
-   sstage->setup = llvmpipe_setup_create_context(llvmpipe);
-   sstage->stage.draw = llvmpipe->draw;
-   sstage->stage.point = setup_first_point;
-   sstage->stage.line = setup_first_line;
-   sstage->stage.tri = setup_first_tri;
-   sstage->stage.flush = setup_flush;
-   sstage->stage.reset_stipple_counter = reset_stipple_counter;
-   sstage->stage.destroy = render_destroy;
-
-   return (struct draw_stage *)sstage;
-}
-
-struct setup_context *
-lp_draw_setup_context( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   return ssetup->setup;
-}
-
-void
-lp_draw_flush( struct draw_stage *stage )
-{
-   stage->flush( stage, 0 );
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_setup.h b/src/gallium/drivers/llvmpipe/lp_prim_setup.h
deleted file mode 100644
index da6cae63751..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_prim_setup.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef LP_PRIM_SETUP_H
-#define LP_PRIM_SETUP_H
-
-
-/**
- * vbuf is a special stage to gather the stream of triangles, lines, points
- * together and reconstruct vertex buffers for hardware upload.
- *
- * First attempt, work in progress.
- * 
- * TODO:
- *    - separate out vertex buffer building and primitive emit, ie >1 draw per vb.
- *    - tell vbuf stage how to build hw vertices directly
- *    - pass vbuf stage a buffer pointer for direct emit to agp/vram.
- *
- *
- *
- * Vertices are just an array of floats, with all the attributes
- * packed.  We currently assume a layout like:
- *
- * attr[0][0..3] - window position
- * attr[1..n][0..3] - remaining attributes.
- *
- * Attributes are assumed to be 4 floats wide but are packed so that
- * all the enabled attributes run contiguously.
- */
-
-
-struct draw_stage;
-struct llvmpipe_context;
-
-
-typedef void (*vbuf_draw_func)( struct pipe_context *pipe,
-                                unsigned prim,
-                                const ushort *elements,
-                                unsigned nr_elements,
-                                const void *vertex_buffer,
-                                unsigned nr_vertices );
-
-
-extern struct draw_stage *
-lp_draw_render_stage( struct llvmpipe_context *llvmpipe );
-
-extern struct setup_context *
-lp_draw_setup_context( struct draw_stage * );
-
-extern void
-lp_draw_flush( struct draw_stage * );
-
-
-extern struct draw_stage *
-lp_draw_vbuf_stage( struct draw_context *draw_context,
-                    struct pipe_context *pipe,
-                    vbuf_draw_func draw );
-
-
-#endif /* LP_PRIM_SETUP_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
index c394dcb61d0..4abff4ecccc 100644
--- a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
@@ -37,10 +37,9 @@
 
 
 #include "lp_context.h"
+#include "lp_setup.h"
 #include "lp_state.h"
 #include "lp_prim_vbuf.h"
-#include "lp_prim_setup.h"
-#include "lp_setup.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "util/u_memory.h"
@@ -59,6 +58,8 @@ struct llvmpipe_vbuf_render
 {
    struct vbuf_render base;
    struct llvmpipe_context *llvmpipe;
+   struct setup_context *setup;
+
    uint prim;
    uint vertex_size;
    uint nr_vertices;
@@ -75,6 +76,11 @@ llvmpipe_vbuf_render(struct vbuf_render *vbr)
 }
 
 
+
+
+
+
+
 static const struct vertex_info *
 lp_vbuf_get_vertex_info(struct vbuf_render *vbr)
 {
@@ -105,36 +111,6 @@ lp_vbuf_allocate_vertices(struct vbuf_render *vbr,
 static void
 lp_vbuf_release_vertices(struct vbuf_render *vbr)
 {
-#if 0
-   {
-      struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-      const struct vertex_info *info = 
-         llvmpipe_get_vbuf_vertex_info(cvbr->llvmpipe);
-      const float *vtx = (const float *) cvbr->vertex_buffer;
-      uint i, j;
-      debug_printf("%s (vtx_size = %u,  vtx_used = %u)\n",
-             __FUNCTION__, cvbr->vertex_size, cvbr->nr_vertices);
-      for (i = 0; i < cvbr->nr_vertices; i++) {
-         for (j = 0; j < info->num_attribs; j++) {
-            uint k;
-            switch (info->attrib[j].emit) {
-            case EMIT_4F:  k = 4;   break;
-            case EMIT_3F:  k = 3;   break;
-            case EMIT_2F:  k = 2;   break;
-            case EMIT_1F:  k = 1;   break;
-            default: assert(0);
-            }
-            debug_printf("Vert %u attr %u: ", i, j);
-            while (k-- > 0) {
-               debug_printf("%g ", vtx[0]);
-               vtx++;
-            }
-            debug_printf("\n");
-         }
-      }
-   }
-#endif
-
    /* keep the old allocation for next time */
 }
 
@@ -160,11 +136,7 @@ static boolean
 lp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-
-   /* XXX: break this dependency - make setup_context live under
-    * llvmpipe, rename the old "setup" draw stage to something else.
-    */
-   struct setup_context *setup_ctx = lp_draw_setup_context(cvbr->llvmpipe->setup);
+   struct setup_context *setup_ctx = cvbr->setup;
    
    llvmpipe_setup_prepare( setup_ctx );
 
@@ -193,14 +165,9 @@ lp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
    const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
+   struct setup_context *setup_ctx = cvbr->setup;
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * llvmpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = llvmpipe->setup;
-   struct setup_context *setup_ctx = lp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -367,11 +334,6 @@ lp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    default:
       assert(0);
    }
-
-   /* XXX: why are we calling this???  If we had to call something, it
-    * would be a function in lp_setup.c:
-    */
-   lp_draw_flush( setup );
 }
 
 
@@ -384,17 +346,12 @@ lp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
    struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
+   struct setup_context *setup_ctx = cvbr->setup;
    const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * llvmpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = llvmpipe->setup;
-   struct setup_context *setup_ctx = lp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -568,40 +525,38 @@ static void
 lp_vbuf_destroy(struct vbuf_render *vbr)
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   cvbr->llvmpipe->vbuf_render = NULL;
+   llvmpipe_setup_destroy_context(cvbr->setup);
    FREE(cvbr);
 }
 
 
 /**
- * Initialize the post-transform vertex buffer information for the given
- * context.
+ * Create the post-transform vertex handler for the given context.
  */
-void
-lp_init_vbuf(struct llvmpipe_context *lp)
+struct vbuf_render *
+lp_create_vbuf_backend(struct llvmpipe_context *lp)
 {
-   assert(lp->draw);
+   struct llvmpipe_vbuf_render *cvbr = CALLOC_STRUCT(llvmpipe_vbuf_render);
 
-   lp->vbuf_render = CALLOC_STRUCT(llvmpipe_vbuf_render);
+   assert(lp->draw);
 
-   lp->vbuf_render->base.max_indices = LP_MAX_VBUF_INDEXES;
-   lp->vbuf_render->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
 
-   lp->vbuf_render->base.get_vertex_info = lp_vbuf_get_vertex_info;
-   lp->vbuf_render->base.allocate_vertices = lp_vbuf_allocate_vertices;
-   lp->vbuf_render->base.map_vertices = lp_vbuf_map_vertices;
-   lp->vbuf_render->base.unmap_vertices = lp_vbuf_unmap_vertices;
-   lp->vbuf_render->base.set_primitive = lp_vbuf_set_primitive;
-   lp->vbuf_render->base.draw = lp_vbuf_draw;
-   lp->vbuf_render->base.draw_arrays = lp_vbuf_draw_arrays;
-   lp->vbuf_render->base.release_vertices = lp_vbuf_release_vertices;
-   lp->vbuf_render->base.destroy = lp_vbuf_destroy;
+   cvbr->base.max_indices = LP_MAX_VBUF_INDEXES;
+   cvbr->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
 
-   lp->vbuf_render->llvmpipe = lp;
+   cvbr->base.get_vertex_info = lp_vbuf_get_vertex_info;
+   cvbr->base.allocate_vertices = lp_vbuf_allocate_vertices;
+   cvbr->base.map_vertices = lp_vbuf_map_vertices;
+   cvbr->base.unmap_vertices = lp_vbuf_unmap_vertices;
+   cvbr->base.set_primitive = lp_vbuf_set_primitive;
+   cvbr->base.draw = lp_vbuf_draw;
+   cvbr->base.draw_arrays = lp_vbuf_draw_arrays;
+   cvbr->base.release_vertices = lp_vbuf_release_vertices;
+   cvbr->base.destroy = lp_vbuf_destroy;
 
-   lp->vbuf = draw_vbuf_stage(lp->draw, &lp->vbuf_render->base);
+   cvbr->llvmpipe = lp;
 
-   draw_set_rasterize_stage(lp->draw, lp->vbuf);
+   cvbr->setup = llvmpipe_setup_create_context(cvbr->llvmpipe);
 
-   draw_set_render(lp->draw, &lp->vbuf_render->base);
+   return &cvbr->base;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
index 6c4e6063e6d..0676e2f42ac 100644
--- a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
+++ b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
@@ -31,8 +31,8 @@
 
 struct llvmpipe_context;
 
-extern void
-lp_init_vbuf(struct llvmpipe_context *llvmpipe);
+extern struct vbuf_render *
+lp_create_vbuf_backend(struct llvmpipe_context *llvmpipe);
 
 
 #endif /* LP_VBUF_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 125035771e5..05189274589 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -27,8 +27,6 @@
 
 
 #include "util/u_memory.h"
-#include "util/u_simple_screen.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 
@@ -67,8 +65,6 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
@@ -86,7 +82,7 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return 8;  /* max 128x128x128 */
+      return 9;  /* max 256x256x256 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_TGSI_CONT_SUPPORTED:
@@ -196,8 +192,7 @@ static void
 llvmpipe_destroy_screen( struct pipe_screen *_screen )
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
-
-   struct pipe_winsys *winsys = _screen->winsys;
+   struct llvmpipe_winsys *winsys = screen->winsys;
 
    lp_jit_screen_cleanup(screen);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index d145f6d6bbc..ffcbc9a379f 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -33,7 +33,6 @@
  */
 
 #include "lp_context.h"
-#include "lp_prim_setup.h"
 #include "lp_quad.h"
 #include "lp_setup.h"
 #include "lp_state.h"
@@ -44,6 +43,7 @@
 #include "pipe/p_thread.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "lp_bld_debug.h"
 #include "lp_tile_cache.h"
 #include "lp_tile_soa.h"
 
@@ -89,6 +89,8 @@ struct setup_context {
    float oneoverarea;
    int facing;
 
+   float pixel_offset;
+
    struct quad_header quad[MAX_QUADS];
    struct quad_header *quad_ptrs[MAX_QUADS];
    unsigned count;
@@ -114,6 +116,7 @@ struct setup_context {
 /**
  * Execute fragment shader for the four fragments in the quad.
  */
+ALIGN_STACK
 static void
 shade_quads(struct llvmpipe_context *llvmpipe,
             struct quad_header *quads[],
@@ -123,7 +126,7 @@ shade_quads(struct llvmpipe_context *llvmpipe,
    struct quad_header *quad = quads[0];
    const unsigned x = quad->input.x0;
    const unsigned y = quad->input.y0;
-   uint8_t *tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
+   uint8_t *tile;
    uint8_t *color;
    void *depth;
    uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
@@ -149,7 +152,13 @@ shade_quads(struct llvmpipe_context *llvmpipe,
          mask[q][chan_index] = quads[q]->inout.mask & (1 << chan_index) ? ~0 : 0;
 
    /* color buffer */
-   color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
+   if(llvmpipe->framebuffer.nr_cbufs >= 1 &&
+      llvmpipe->framebuffer.cbufs[0]) {
+      tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
+      color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
+   }
+   else
+      color = NULL;
 
    /* depth buffer */
    if(llvmpipe->zsbuf_map) {
@@ -162,12 +171,12 @@ shade_quads(struct llvmpipe_context *llvmpipe,
    else
       depth = NULL;
 
-   /* TODO: blend color */
+   /* XXX: this assert will most likely fail on 32-bit x86 without -mstackrealign */
+   assert(lp_check_alignment(mask, 16));
 
-   assert((((uintptr_t)mask) & 0xf) == 0);
-   assert((((uintptr_t)depth) & 0xf) == 0);
-   assert((((uintptr_t)color) & 0xf) == 0);
-   assert((((uintptr_t)llvmpipe->jit_context.blend_color) & 0xf) == 0);
+   assert(lp_check_alignment(depth, 16));
+   assert(lp_check_alignment(color, 16));
+   assert(lp_check_alignment(llvmpipe->jit_context.blend_color, 16));
 
    /* run shader */
    fs->current->jit_function( &llvmpipe->jit_context,
@@ -270,11 +279,13 @@ clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
        * until we codegenerate single-quad variants of the fragment pipeline
        * we need this hack. */
       const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
-      struct quad_header quads[nr_quads];
-      struct quad_header *quad_ptrs[nr_quads];
+      struct quad_header quads[4];
+      struct quad_header *quad_ptrs[4];
       int x0 = block_x(quad->input.x0);
       unsigned i;
 
+      assert(nr_quads == 4);
+
       for(i = 0; i < nr_quads; ++i) {
          int x = x0 + 2*i;
          if(x == quad->input.x0)
@@ -473,6 +484,16 @@ static boolean setup_sort_vertices( struct setup_context *setup,
       ((det > 0.0) ^ 
        (setup->llvmpipe->rasterizer->front_winding == PIPE_WINDING_CW));
 
+   /* Prepare pixel offset for rasterisation:
+    *  - pixel center (0.5, 0.5) for GL, or
+    *  - assume (0.0, 0.0) for other APIs.
+    */
+   if (setup->llvmpipe->rasterizer->gl_rasterization_rules) {
+      setup->pixel_offset = 0.5f;
+   } else {
+      setup->pixel_offset = 0.0f;
+   }
+
    return TRUE;
 }
 
@@ -498,7 +519,7 @@ static void tri_pos_coeff( struct setup_context *setup,
 
    /* calculate a0 as the value which would be sampled for the
     * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
+    * pixel centers, in other words (pixel_offset, pixel_offset).
     *
     * this is neat but unfortunately not a good way to do things for
     * triangles with very large values of dadx or dady as it will
@@ -509,8 +530,8 @@ static void tri_pos_coeff( struct setup_context *setup,
     * instead - i'll switch to this later.
     */
    setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                           (dadx * (setup->vmin[0][0] - 0.5f) +
-                            dady * (setup->vmin[0][1] - 0.5f)));
+                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
 
    /*
    debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
@@ -599,8 +620,8 @@ static void tri_linear_coeff( struct setup_context *setup,
        * instead - i'll switch to this later.
        */
       setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
 
       /*
       debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
@@ -651,8 +672,8 @@ static void tri_persp_coeff( struct setup_context *setup,
       setup->coef.dadx[1 + attrib][i] = dadx;
       setup->coef.dady[1 + attrib][i] = dady;
       setup->coef.a0[1 + attrib][i] = (mina -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
    }
 }
 
@@ -736,12 +757,12 @@ static void setup_tri_coefficients( struct setup_context *setup )
 
 static void setup_tri_edges( struct setup_context *setup )
 {
-   float vmin_x = setup->vmin[0][0] + 0.5f;
-   float vmid_x = setup->vmid[0][0] + 0.5f;
+   float vmin_x = setup->vmin[0][0] + setup->pixel_offset;
+   float vmid_x = setup->vmid[0][0] + setup->pixel_offset;
 
-   float vmin_y = setup->vmin[0][1] - 0.5f;
-   float vmid_y = setup->vmid[0][1] - 0.5f;
-   float vmax_y = setup->vmax[0][1] - 0.5f;
+   float vmin_y = setup->vmin[0][1] - setup->pixel_offset;
+   float vmid_y = setup->vmid[0][1] - setup->pixel_offset;
+   float vmax_y = setup->vmax[0][1] - setup->pixel_offset;
 
    setup->emaj.sy = ceilf(vmin_y);
    setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
@@ -940,8 +961,8 @@ linear_pos_coeff(struct setup_context *setup,
    setup->coef.dadx[0][i] = dadx;
    setup->coef.dady[0][i] = dady;
    setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                           (dadx * (setup->vmin[0][0] - 0.5f) +
-                            dady * (setup->vmin[0][1] - 0.5f)));
+                           (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                            dady * (setup->vmin[0][1] - setup->pixel_offset)));
 }
 
 
@@ -962,8 +983,8 @@ line_linear_coeff(struct setup_context *setup,
       setup->coef.dadx[1 + attrib][i] = dadx;
       setup->coef.dady[1 + attrib][i] = dady;
       setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
    }
 }
 
@@ -988,8 +1009,8 @@ line_persp_coeff(struct setup_context *setup,
       setup->coef.dadx[1 + attrib][i] = dadx;
       setup->coef.dady[1 + attrib][i] = dady;
       setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
+                     (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                      dady * (setup->vmin[0][1] - setup->pixel_offset)));
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index fb10329887d..7b26ce61a38 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -36,6 +36,7 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h"
 #include "lp_jit.h"
+#include "lp_bld_sample.h" /* for struct lp_sampler_static_state */
 
 
 #define LP_NEW_VIEWPORT      0x1
@@ -57,16 +58,20 @@
 
 struct tgsi_sampler;
 struct vertex_info;
-
+struct pipe_context;
+struct llvmpipe_context;
 
 struct lp_fragment_shader;
 
 
 struct lp_fragment_shader_variant_key
 {
+   enum pipe_format zsbuf_format;
    struct pipe_depth_state depth;
    struct pipe_alpha_state alpha;
    struct pipe_blend_state blend;
+
+   struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS];
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_blend.c b/src/gallium/drivers/llvmpipe/lp_state_blend.c
index 3f03bd00571..b2e75d3b14e 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_blend.c
@@ -76,7 +76,7 @@ void llvmpipe_set_blend_color( struct pipe_context *pipe,
    for (i = 0; i < 4; ++i) {
       uint8_t c = float_to_ubyte(blend_color->color[i]);
       for (j = 0; j < 16; ++j)
-         llvmpipe->jit_context.blend_color[i*4 + j] = c;
+         llvmpipe->jit_context.blend_color[i*16 + j] = c;
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 6fbb057937e..c753b183c0c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -65,26 +65,19 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
    if (vinfo->num_attribs == 0) {
       /* compute vertex layout now */
       const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-      const enum interp_mode colorInterp
-         = llvmpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+      struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
+      const uint num = draw_num_vs_outputs(llvmpipe->draw);
       uint i;
 
-      if (llvmpipe->vbuf) {
-         /* if using the post-transform vertex buffer, tell draw_vbuf to
-          * simply emit the whole post-xform vertex as-is:
-          */
-         struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
-         const uint num = draw_num_vs_outputs(llvmpipe->draw);
-         uint i;
-
-         /* No longer any need to try and emit draw vertex_header info.
-          */
-         vinfo_vbuf->num_attribs = 0;
-         for (i = 0; i < num; i++) {
-            draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-         }
-         draw_compute_vertex_size(vinfo_vbuf);
+      /* Tell draw_vbuf to simply emit the whole post-xform vertex
+       * as-is.  No longer any need to try and emit draw vertex_header
+       * info.
+       */
+      vinfo_vbuf->num_attribs = 0;
+      for (i = 0; i < num; i++) {
+	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
       }
+      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
        * Loop over fragment shader inputs, searching for the matching output
@@ -93,35 +86,40 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
       vinfo->num_attribs = 0;
       for (i = 0; i < lpfs->info.num_inputs; i++) {
          int src;
-         switch (lpfs->info.input_semantic_name[i]) {
-         case TGSI_SEMANTIC_POSITION:
-            src = draw_find_vs_output(llvmpipe->draw,
-                                      TGSI_SEMANTIC_POSITION, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src);
-            break;
+         enum interp_mode interp;
 
-         case TGSI_SEMANTIC_COLOR:
-            src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_COLOR, 
-                                 lpfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         switch (lpfs->info.input_interpolate[i]) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = INTERP_CONSTANT;
             break;
-
-         case TGSI_SEMANTIC_FOG:
-            src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_FOG, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = INTERP_LINEAR;
             break;
-
-         case TGSI_SEMANTIC_GENERIC:
-         case TGSI_SEMANTIC_FACE:
-            /* this includes texcoords and varying vars */
-            src = draw_find_vs_output(llvmpipe->draw, TGSI_SEMANTIC_GENERIC,
-                                      lpfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = INTERP_PERSPECTIVE;
             break;
-
          default:
             assert(0);
+            interp = INTERP_LINEAR;
          }
+
+         switch (lpfs->info.input_semantic_name[i]) {
+         case TGSI_SEMANTIC_POSITION:
+            interp = INTERP_POS;
+            break;
+
+         case TGSI_SEMANTIC_COLOR:
+            if (llvmpipe->rasterizer->flatshade) {
+               interp = INTERP_CONSTANT;
+            }
+            break;
+         }
+
+         /* match by semantic name/index; this includes texcoords and varying vars */
+         src = draw_find_vs_output(llvmpipe->draw,
+                                   lpfs->info.input_semantic_name[i],
+                                   lpfs->info.input_semantic_index[i]);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
       }
 
       llvmpipe->psize_slot = draw_find_vs_output(llvmpipe->draw,
@@ -250,7 +248,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
 
    if (llvmpipe->dirty & (LP_NEW_FS |
                           LP_NEW_BLEND |
-                          LP_NEW_DEPTH_STENCIL_ALPHA))
+                          LP_NEW_DEPTH_STENCIL_ALPHA |
+                          LP_NEW_SAMPLER |
+                          LP_NEW_TEXTURE))
       llvmpipe_update_fs( llvmpipe );
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 94170bd7161..2e9aa9fffe3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -83,8 +83,10 @@
 #include "lp_bld_debug.h"
 #include "lp_screen.h"
 #include "lp_context.h"
+#include "lp_buffer.h"
 #include "lp_state.h"
 #include "lp_quad.h"
+#include "lp_tex_sample.h"
 
 
 static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
@@ -130,21 +132,20 @@ generate_pos0(LLVMBuilderRef builder,
  * Generate the depth test.
  */
 static void
-generate_depth(struct llvmpipe_context *lp,
-               LLVMBuilderRef builder,
-               const struct pipe_depth_state *state,
-               union lp_type src_type,
+generate_depth(LLVMBuilderRef builder,
+               const struct lp_fragment_shader_variant_key *key,
+               struct lp_type src_type,
                struct lp_build_mask_context *mask,
                LLVMValueRef src,
                LLVMValueRef dst_ptr)
 {
    const struct util_format_description *format_desc;
-   union lp_type dst_type;
+   struct lp_type dst_type;
 
-   if(!lp->framebuffer.zsbuf)
+   if(!key->depth.enabled)
       return;
 
-   format_desc = util_format_description(lp->framebuffer.zsbuf->format);
+   format_desc = util_format_description(key->zsbuf_format);
    assert(format_desc);
 
    /* Pick the depth type. */
@@ -164,7 +165,7 @@ generate_depth(struct llvmpipe_context *lp,
 #endif
 
    lp_build_depth_test(builder,
-                       state,
+                       &key->depth,
                        dst_type,
                        format_desc,
                        mask,
@@ -173,107 +174,6 @@ generate_depth(struct llvmpipe_context *lp,
 }
 
 
-struct build_fetch_texel_context
-{
-   LLVMValueRef context_ptr;
-
-   LLVMValueRef samplers_ptr;
-
-   /** Coords/texels store */
-   LLVMValueRef store_ptr;
-};
-
-
-void PIPE_CDECL
-lp_fetch_texel_soa( struct tgsi_sampler **samplers,
-                    uint32_t unit,
-                    float *store )
-{
-   struct tgsi_sampler *sampler = samplers[unit];
-
-#if 0
-   uint j;
-
-   debug_printf("%s sampler: %p (%p) store: %p\n",
-                __FUNCTION__,
-                sampler, *sampler,
-                store );
-
-   debug_printf("lodbias %f\n", store[12]);
-
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d texcoord %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j]);
-#endif
-
-   {
-      float rgba[NUM_CHANNELS][QUAD_SIZE];
-      sampler->get_samples(sampler,
-                           &store[0],
-                           &store[4],
-                           &store[8],
-                           0.0f, /*store[12],  lodbias */
-                           rgba);
-      memcpy(store, rgba, sizeof rgba);
-   }
-
-#if 0
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d result %f %f %f %f\n",
-                   j,
-                   store[0+j],
-                   store[4+j],
-                   store[8+j],
-                   store[12+j]);
-#endif
-}
-
-
-static void
-emit_fetch_texel( LLVMBuilderRef builder,
-                  void *context,
-                  unsigned unit,
-                  unsigned num_coords,
-                  const LLVMValueRef *coords,
-                  LLVMValueRef lodbias,
-                  LLVMValueRef *texel)
-{
-   struct build_fetch_texel_context *bld = context;
-   LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
-   LLVMValueRef args[3];
-   unsigned i;
-
-   if(!bld->samplers_ptr)
-      bld->samplers_ptr = lp_jit_context_samplers(builder, bld->context_ptr);
-
-   if(!bld->store_ptr)
-      bld->store_ptr = LLVMBuildArrayAlloca(builder,
-                                            vec_type,
-                                            LLVMConstInt(LLVMInt32Type(), 4, 0),
-                                            "texel_store");
-
-   for (i = 0; i < num_coords; i++) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef coord_ptr = LLVMBuildGEP(builder, bld->store_ptr, &index, 1, "");
-      LLVMBuildStore(builder, coords[i], coord_ptr);
-   }
-
-   args[0] = bld->samplers_ptr;
-   args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
-   args[2] = bld->store_ptr;
-
-   lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
-
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef texel_ptr = LLVMBuildGEP(builder, bld->store_ptr, &index, 1, "");
-      texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
-   }
-}
-
-
 /**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
  */
@@ -282,11 +182,11 @@ generate_fs(struct llvmpipe_context *lp,
             struct lp_fragment_shader *shader,
             const struct lp_fragment_shader_variant_key *key,
             LLVMBuilderRef builder,
-            union lp_type type,
+            struct lp_type type,
             LLVMValueRef context_ptr,
             unsigned i,
             const struct lp_build_interp_soa_context *interp,
-            struct build_fetch_texel_context *sampler,
+            struct lp_build_sampler_soa *sampler,
             LLVMValueRef *pmask,
             LLVMValueRef *color,
             LLVMValueRef depth_ptr)
@@ -298,6 +198,7 @@ generate_fs(struct llvmpipe_context *lp,
    LLVMValueRef consts_ptr;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    LLVMValueRef z = interp->pos[2];
+   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
    boolean early_depth_test;
    unsigned attrib;
@@ -309,25 +210,35 @@ generate_fs(struct llvmpipe_context *lp,
 
    consts_ptr = lp_jit_context_constants(builder, context_ptr);
 
-   lp_build_mask_begin(&mask, builder, type, *pmask);
+   flow = lp_build_flow_create(builder);
+
+   memset(outputs, 0, sizeof outputs);
+
+   lp_build_flow_scope_begin(flow);
+
+   /* Declare the color and z variables */
+   for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+      color[chan] = LLVMGetUndef(vec_type);
+      lp_build_flow_scope_declare(flow, &color[chan]);
+   }
+   lp_build_flow_scope_declare(flow, &z);
+
+   lp_build_mask_begin(&mask, flow, type, *pmask);
 
    early_depth_test =
-      lp->depth_stencil->depth.enabled &&
-      lp->framebuffer.zsbuf &&
-      !lp->depth_stencil->alpha.enabled &&
-      !lp->fs->info.uses_kill &&
-      !lp->fs->info.writes_z;
+      key->depth.enabled &&
+      !key->alpha.enabled &&
+      !shader->info.uses_kill &&
+      !shader->info.writes_z;
 
    if(early_depth_test)
-      generate_depth(lp, builder, &key->depth,
+      generate_depth(builder, key,
                      type, &mask,
                      z, depth_ptr);
 
-   memset(outputs, 0, sizeof outputs);
-
    lp_build_tgsi_soa(builder, tokens, type, &mask,
                      consts_ptr, interp->pos, interp->inputs,
-                     outputs, emit_fetch_texel, sampler);
+                     outputs, sampler);
 
    for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
@@ -368,12 +279,16 @@ generate_fs(struct llvmpipe_context *lp,
    }
 
    if(!early_depth_test)
-      generate_depth(lp, builder, &key->depth,
+      generate_depth(builder, key,
                      type, &mask,
                      z, depth_ptr);
 
    lp_build_mask_end(&mask);
 
+   lp_build_flow_scope_end(flow);
+
+   lp_build_flow_destroy(flow);
+
    *pmask = mask.value;
 
 }
@@ -385,13 +300,15 @@ generate_fs(struct llvmpipe_context *lp,
 static void
 generate_blend(const struct pipe_blend_state *blend,
                LLVMBuilderRef builder,
-               union lp_type type,
+               struct lp_type type,
                LLVMValueRef context_ptr,
                LLVMValueRef mask,
                LLVMValueRef *src,
                LLVMValueRef dst_ptr)
 {
    struct lp_build_context bld;
+   struct lp_build_flow_context *flow;
+   struct lp_build_mask_context mask_ctx;
    LLVMTypeRef vec_type;
    LLVMTypeRef int_vec_type;
    LLVMValueRef const_ptr;
@@ -400,11 +317,14 @@ generate_blend(const struct pipe_blend_state *blend,
    LLVMValueRef res[4];
    unsigned chan;
 
+   lp_build_context_init(&bld, builder, type);
+
+   flow = lp_build_flow_create(builder);
+   lp_build_mask_begin(&mask_ctx, flow, type, mask);
+
    vec_type = lp_build_vec_type(type);
    int_vec_type = lp_build_int_vec_type(type);
 
-   lp_build_context_init(&bld, builder, type);
-
    const_ptr = lp_jit_context_blend_color(builder, context_ptr);
    const_ptr = LLVMBuildBitCast(builder, const_ptr,
                                 LLVMPointerType(vec_type, 0), "");
@@ -422,11 +342,16 @@ generate_blend(const struct pipe_blend_state *blend,
    lp_build_blend_soa(builder, blend, type, src, dst, con, res);
 
    for(chan = 0; chan < 4; ++chan) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
-      lp_build_name(res[chan], "res.%c", "rgba"[chan]);
-      res[chan] = lp_build_select(&bld, mask, res[chan], dst[chan]);
-      LLVMBuildStore(builder, res[chan], LLVMBuildGEP(builder, dst_ptr, &index, 1, ""));
+      if(blend->colormask & (1 << chan)) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), chan, 0);
+         lp_build_name(res[chan], "res.%c", "rgba"[chan]);
+         res[chan] = lp_build_select(&bld, mask, res[chan], dst[chan]);
+         LLVMBuildStore(builder, res[chan], LLVMBuildGEP(builder, dst_ptr, &index, 1, ""));
+      }
    }
+
+   lp_build_mask_end(&mask_ctx);
+   lp_build_flow_destroy(flow);
 }
 
 
@@ -440,8 +365,8 @@ generate_fragment(struct llvmpipe_context *lp,
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
    struct lp_fragment_shader_variant *variant;
-   union lp_type fs_type;
-   union lp_type blend_type;
+   struct lp_type fs_type;
+   struct lp_type blend_type;
    LLVMTypeRef fs_elem_type;
    LLVMTypeRef fs_vec_type;
    LLVMTypeRef fs_int_vec_type;
@@ -462,7 +387,7 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMBuilderRef builder;
    LLVMValueRef x0;
    LLVMValueRef y0;
-   struct build_fetch_texel_context sampler;
+   struct lp_build_sampler_soa *sampler;
    struct lp_build_interp_soa_context interp;
    LLVMValueRef fs_mask[LP_MAX_VECTOR_LENGTH];
    LLVMValueRef fs_out_color[NUM_CHANNELS][LP_MAX_VECTOR_LENGTH];
@@ -475,9 +400,9 @@ generate_fragment(struct llvmpipe_context *lp,
 #ifdef DEBUG
    tgsi_dump(shader->base.tokens, 0);
    if(key->depth.enabled) {
+      debug_printf("depth.format = %s\n", pf_name(key->zsbuf_format));
       debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
       debug_printf("depth.writemask = %u\n", key->depth.writemask);
-      debug_printf("depth.occlusion_count = %u\n", key->depth.occlusion_count);
    }
    if(key->alpha.enabled) {
       debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
@@ -495,6 +420,34 @@ generate_fragment(struct llvmpipe_context *lp,
       debug_printf("alpha_dst_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_dst_factor, TRUE));
    }
    debug_printf("blend.colormask = 0x%x\n", key->blend.colormask);
+   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
+      if(key->sampler[i].format) {
+         debug_printf("sampler[%u] = \n", i);
+         debug_printf("  .format = %s\n",
+                      pf_name(key->sampler[i].format));
+         debug_printf("  .target = %s\n",
+                      debug_dump_tex_target(key->sampler[i].target, TRUE));
+         debug_printf("  .pot = %u %u %u\n",
+                      key->sampler[i].pot_width,
+                      key->sampler[i].pot_height,
+                      key->sampler[i].pot_depth);
+         debug_printf("  .wrap = %s %s %s\n",
+                      debug_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+                      debug_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+                      debug_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+         debug_printf("  .min_img_filter = %s\n",
+                      debug_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+         debug_printf("  .min_mip_filter = %s\n",
+                      debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+         debug_printf("  .mag_img_filter = %s\n",
+                      debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+         if(key->sampler[i].compare_mode)
+            debug_printf("  .compare_mode = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
+         debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+         debug_printf("  .prefilter = %u\n", key->sampler[i].prefilter);
+      }
+   }
+
 #endif
 
    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
@@ -507,7 +460,7 @@ generate_fragment(struct llvmpipe_context *lp,
    /* TODO: actually pick these based on the fs and color buffer
     * characteristics. */
 
-   fs_type.value = 0;
+   memset(&fs_type, 0, sizeof fs_type);
    fs_type.floating = TRUE; /* floating point values */
    fs_type.sign = TRUE;     /* values are signed */
    fs_type.norm = FALSE;    /* values are not limited to [0,1] or [-1,1] */
@@ -515,7 +468,7 @@ generate_fragment(struct llvmpipe_context *lp,
    fs_type.length = 4;      /* 4 element per vector */
    num_fs = 4;
 
-   blend_type.value = 0;
+   memset(&blend_type, 0, sizeof blend_type);
    blend_type.floating = FALSE; /* values are integers */
    blend_type.sign = FALSE;     /* values are unsigned */
    blend_type.norm = TRUE;      /* values are in [0,1] or [-1,1] */
@@ -586,8 +539,13 @@ generate_fragment(struct llvmpipe_context *lp,
                             a0_ptr, dadx_ptr, dady_ptr,
                             x0, y0, 2, 0);
 
-   memset(&sampler, 0, sizeof sampler);
-   sampler.context_ptr = context_ptr;
+#if 0
+   /* C texture sampling */
+   sampler = lp_c_sampler_soa_create(context_ptr);
+#else
+   /* code generated texture sampling */
+   sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
+#endif
 
    for(i = 0; i < num_fs; ++i) {
       LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
@@ -606,7 +564,7 @@ generate_fragment(struct llvmpipe_context *lp,
                   context_ptr,
                   i,
                   &interp,
-                  &sampler,
+                  sampler,
                   &fs_mask[i],
                   out_color,
                   depth_ptr_i);
@@ -615,6 +573,8 @@ generate_fragment(struct llvmpipe_context *lp,
          fs_out_color[chan][i] = out_color[chan];
    }
 
+   sampler->destroy(sampler);
+
    /* 
     * Convert the fs's output color and mask to fit to the blending type. 
     */
@@ -651,6 +611,11 @@ generate_fragment(struct llvmpipe_context *lp,
     * Translate the LLVM IR into machine code.
     */
 
+   if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
+      LLVMDumpValue(variant->function);
+      abort();
+   }
+
    LLVMRunFunctionPassManager(screen->pass, variant->function);
 
 #ifdef DEBUG
@@ -658,11 +623,6 @@ generate_fragment(struct llvmpipe_context *lp,
    debug_printf("\n");
 #endif
 
-   if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
-      LLVMDumpValue(variant->function);
-      abort();
-   }
-
    variant->jit_function = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, variant->function);
 
 #ifdef DEBUG
@@ -741,16 +701,29 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
 void
 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
                              uint shader, uint index,
-                             const struct pipe_constant_buffer *buf)
+                             const struct pipe_constant_buffer *constants)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct pipe_buffer *buffer = constants ? constants->buffer : NULL; /* a NULL constants argument means no buffer */
+   unsigned size = buffer ? buffer->size : 0; /* 0 when there is no buffer */
+   const void *data = buffer ? llvmpipe_buffer(buffer)->data : NULL; /* taken from the llvmpipe buffer's data field */
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(index == 0);
 
+   if(shader == PIPE_SHADER_VERTEX)
+      draw_flush(llvmpipe->draw); /* flush the draw module before its constants are replaced below */
+
    /* note: reference counting */
-   pipe_buffer_reference(&llvmpipe->constants[shader].buffer,
-			 buf ? buf->buffer : NULL);
+   pipe_buffer_reference(&llvmpipe->constants[shader].buffer, buffer);
+
+   if(shader == PIPE_SHADER_FRAGMENT) {
+      llvmpipe->jit_context.constants = data; /* fragment constants are published through the JIT context */
+   }
+
+   if(shader == PIPE_SHADER_VERTEX) {
+      draw_set_mapped_constant_buffer(llvmpipe->draw, data, size); /* vertex constants are handed to the draw module */
+   }
 
    llvmpipe->dirty |= LP_NEW_CONSTANTS;
 }
@@ -765,18 +738,45 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
  */
 static void
 make_variant_key(struct llvmpipe_context *lp,
+                 struct lp_fragment_shader *shader,
                  struct lp_fragment_shader_variant_key *key)
 {
+   unsigned i;
+
    memset(key, 0, sizeof *key);
 
-   memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
+   if(lp->framebuffer.zsbuf &&
+      lp->depth_stencil->depth.enabled) { /* depth state is keyed only with a zsbuf bound and depth test enabled */
+      key->zsbuf_format = lp->framebuffer.zsbuf->format;
+      memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth);
+   }
 
    key->alpha.enabled = lp->depth_stencil->alpha.enabled;
    if(key->alpha.enabled)
       key->alpha.func = lp->depth_stencil->alpha.func;
    /* alpha.ref_value is passed in jit_context */
 
-   memcpy(&key->blend, lp->blend, sizeof key->blend);
+   if(lp->framebuffer.cbufs[0]) { /* blend state is keyed only when color buffer 0 is bound */
+      const struct util_format_description *format_desc;
+      unsigned chan;
+
+      memcpy(&key->blend, lp->blend, sizeof key->blend);
+
+      format_desc = util_format_description(lp->framebuffer.cbufs[0]->format);
+      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+             format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
+
+      /* mask out color channels not present in the color buffer */
+      for(chan = 0; chan < 4; ++chan) {
+         enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+         if(swizzle > 4) /* NOTE(review): bare 4 - confirm which util_format_swizzle values this is meant to exclude */
+            key->blend.colormask &= ~(1 << chan);
+      }
+   }
+   /* record static sampler state for every unit set in the shader's TGSI_FILE_SAMPLER file mask */
+   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i)
+      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
+         lp_sampler_static_state(&key->sampler[i], lp->texture[i], lp->sampler[i]);
 }
 
 
@@ -787,7 +787,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
    struct lp_fragment_shader_variant_key key;
    struct lp_fragment_shader_variant *variant;
 
-   make_variant_key(lp, &key);
+   make_variant_key(lp, shader, &key);
 
    variant = shader->variants;
    while(variant) {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 4fef541b1e3..c69d90c723a 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -98,6 +98,16 @@ llvmpipe_set_sampler_textures(struct pipe_context *pipe,
 
       pipe_texture_reference(&llvmpipe->texture[i], tex);
       lp_tex_tile_cache_set_texture(llvmpipe->tex_cache[i], tex);
+
+      if(tex) {
+         struct llvmpipe_texture *lp_tex = llvmpipe_texture(tex);
+         struct lp_jit_texture *jit_tex = &llvmpipe->jit_context.textures[i];
+         jit_tex->width = tex->width[0];
+         jit_tex->height = tex->height[0];
+         jit_tex->stride = lp_tex->stride[0];
+         if(!lp_tex->dt)
+            jit_tex->data = lp_tex->data;
+      }
    }
 
    llvmpipe->num_textures = num;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
index 177a26b7b1f..c06ce8b75c1 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -53,10 +53,11 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
       /* check if changing cbuf */
       if (lp->framebuffer.cbufs[i] != fb->cbufs[i]) {
          /* flush old */
+         lp_tile_cache_map_transfers(lp->cbuf_cache[i]);
          lp_flush_tile_cache(lp->cbuf_cache[i]);
 
          /* assign new */
-         lp->framebuffer.cbufs[i] = fb->cbufs[i];
+         pipe_surface_reference(&lp->framebuffer.cbufs[i], fb->cbufs[i]);
 
          /* update cache */
          lp_tile_cache_set_surface(lp->cbuf_cache[i], fb->cbufs[i]);
@@ -81,7 +82,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
       }
 
       /* assign new */
-      lp->framebuffer.zsbuf = fb->zsbuf;
+      pipe_surface_reference(&lp->framebuffer.zsbuf, fb->zsbuf);
 
       /* Tell draw module how deep the Z/depth buffer is */
       if (lp->framebuffer.zsbuf) {
diff --git a/src/gallium/drivers/llvmpipe/lp_test.h b/src/gallium/drivers/llvmpipe/lp_test.h
index 69aaae26e0a..39d80726e65 100644
--- a/src/gallium/drivers/llvmpipe/lp_test.h
+++ b/src/gallium/drivers/llvmpipe/lp_test.h
@@ -56,6 +56,9 @@
 #include "lp_bld_type.h"
 
 
+#define LP_TEST_NUM_SAMPLES 32
+
+
 void
 write_tsv_header(FILE *fp);
 
@@ -68,17 +71,28 @@ boolean
 test_all(unsigned verbose, FILE *fp);
 
 
+#if defined(PIPE_CC_MSVC)
+/* MSVC: map rdtsc() onto the compiler's __rdtsc intrinsic. */
+unsigned __int64 __rdtsc();
+#pragma intrinsic(__rdtsc)
+#define rdtsc() __rdtsc()
+
+#elif defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
+/* GCC on x86 / x86-64: execute the rdtsc instruction via inline assembly. */
 static INLINE uint64_t
 rdtsc(void)
 {
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    uint32_t hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)lo) | (((uint64_t)hi) << 32);
+}
+
 #else
-   return 0;
+/* Any other compiler/architecture: rdtsc() expands to the constant 0. */
+#define rdtsc() 0
+
 #endif
-}
+
 
 
 float
@@ -86,43 +100,43 @@ random_float(void);
 
 
 void
-dump_type(FILE *fp, union lp_type type);
+dump_type(FILE *fp, struct lp_type type);
 
 
 double
-read_elem(union lp_type type, const void *src, unsigned index);
+read_elem(struct lp_type type, const void *src, unsigned index);
 
 
 void
-write_elem(union lp_type type, void *dst, unsigned index, double src);
+write_elem(struct lp_type type, void *dst, unsigned index, double src);
 
 
 void
-random_elem(union lp_type type, void *dst, unsigned index);
+random_elem(struct lp_type type, void *dst, unsigned index);
 
 
 void
-read_vec(union lp_type type, const void *src, double *dst);
+read_vec(struct lp_type type, const void *src, double *dst);
 
 
 void
-write_vec(union lp_type type, void *dst, const double *src);
+write_vec(struct lp_type type, void *dst, const double *src);
 
 
 void
-random_vec(union lp_type type, void *dst);
+random_vec(struct lp_type type, void *dst);
 
 
 boolean
-compare_vec_with_eps(union lp_type type, const void *res, const void *ref, double eps);
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps);
 
 
 boolean
-compare_vec(union lp_type type, const void *res, const void *ref);
+compare_vec(struct lp_type type, const void *res, const void *ref);
 
 
 void
-dump_vec(FILE *fp, union lp_type type, const void *src);
+dump_vec(FILE *fp, struct lp_type type, const void *src);
 
 
 #endif /* !LP_TEST_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 8dfad468e3c..29fff91981a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -80,7 +80,7 @@ static void
 write_tsv_row(FILE *fp,
               const struct pipe_blend_state *blend,
               enum vector_mode mode,
-              union lp_type type,
+              struct lp_type type,
               double cycles,
               boolean success)
 {
@@ -125,7 +125,7 @@ static void
 dump_blend_type(FILE *fp,
                 const struct pipe_blend_state *blend,
                 enum vector_mode mode,
-                union lp_type type)
+                struct lp_type type)
 {
    fprintf(fp, "%s", mode ? "soa" : "aos");
 
@@ -153,7 +153,7 @@ static LLVMValueRef
 add_blend_test(LLVMModuleRef module,
                const struct pipe_blend_state *blend,
                enum vector_mode mode,
-               union lp_type type)
+               struct lp_type type)
 {
    LLVMTypeRef ret_type;
    LLVMTypeRef vec_type;
@@ -462,12 +462,13 @@ compute_blend_ref(const struct pipe_blend_state *blend,
 }
 
 
+ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
          const struct pipe_blend_state *blend,
          enum vector_mode mode,
-         union lp_type type)
+         struct lp_type type)
 {
    LLVMModuleRef module = NULL;
    LLVMValueRef func = NULL;
@@ -477,8 +478,8 @@ test_one(unsigned verbose,
    char *error = NULL;
    blend_test_ptr_t blend_test_ptr;
    boolean success;
-   const unsigned n = 32;
-   int64_t cycles[n];
+   const unsigned n = LP_TEST_NUM_SAMPLES;
+   int64_t cycles[LP_TEST_NUM_SAMPLES];
    double cycles_avg = 0.0;
    unsigned i, j;
 
@@ -530,11 +531,11 @@ test_one(unsigned verbose,
    success = TRUE;
    for(i = 0; i < n && success; ++i) {
       if(mode == AoS) {
-         uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t con[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t res[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
 
@@ -595,11 +596,11 @@ test_one(unsigned verbose,
 
       if(mode == SoA) {
          const unsigned stride = type.length*type.width/8;
-         uint8_t src[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t dst[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t con[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t res[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
-         uint8_t ref[4*LP_MAX_VECTOR_LENGTH*LP_MAX_TYPE_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
+         ALIGN16_ATTRIB uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
          int64_t start_counter = 0;
          int64_t end_counter = 0;
          boolean mismatch;
@@ -765,10 +766,10 @@ blend_funcs[] = {
 };
 
 
-const union lp_type blend_types[] = {
+const struct lp_type blend_types[] = { /* vector types the blend tests pick from */
    /* float, fixed,  sign,  norm, width, len */
-   {{  TRUE, FALSE, FALSE,  TRUE,    32,   4 }}, /* f32 x 4 */
-   {{ FALSE, FALSE, FALSE,  TRUE,     8,  16 }}, /* u8n x 16 */
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 }, /* f32 x 4: normalized 32-bit floats, 4 per vector */
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 }, /* u8n x 16: unsigned normalized 8-bit integers, 16 per vector */
 };
 
 
@@ -788,7 +789,7 @@ test_all(unsigned verbose, FILE *fp)
    const unsigned *alpha_dst_factor;
    struct pipe_blend_state blend;
    enum vector_mode mode;
-   const union lp_type *type;
+   const struct lp_type *type;
    bool success = TRUE;
 
    for(rgb_func = blend_funcs; rgb_func < &blend_funcs[num_funcs]; ++rgb_func) {
@@ -841,27 +842,27 @@ test_some(unsigned verbose, FILE *fp, unsigned long n)
    const unsigned *alpha_dst_factor;
    struct pipe_blend_state blend;
    enum vector_mode mode;
-   const union lp_type *type;
+   const struct lp_type *type;
    unsigned long i;
    bool success = TRUE;
 
    for(i = 0; i < n; ++i) {
-      rgb_func = &blend_funcs[random() % num_funcs];
-      alpha_func = &blend_funcs[random() % num_funcs];
-      rgb_src_factor = &blend_factors[random() % num_factors];
-      alpha_src_factor = &blend_factors[random() % num_factors];
+      rgb_func = &blend_funcs[rand() % num_funcs];
+      alpha_func = &blend_funcs[rand() % num_funcs];
+      rgb_src_factor = &blend_factors[rand() % num_factors];
+      alpha_src_factor = &blend_factors[rand() % num_factors];
       
       do {
-         rgb_dst_factor = &blend_factors[random() % num_factors];
+         rgb_dst_factor = &blend_factors[rand() % num_factors];
       } while(*rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
 
       do {
-         alpha_dst_factor = &blend_factors[random() % num_factors];
+         alpha_dst_factor = &blend_factors[rand() % num_factors];
       } while(*alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE);
 
-      mode = random() & 1;
+      mode = rand() & 1;
 
-      type = &blend_types[random() % num_types];
+      type = &blend_types[rand() % num_types];
 
       memset(&blend, 0, sizeof blend);
       blend.blend_enable      = 1;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index e6489834af5..968c7a2d4aa 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -59,8 +59,8 @@ write_tsv_header(FILE *fp)
 
 static void
 write_tsv_row(FILE *fp,
-              union lp_type src_type,
-              union lp_type dst_type,
+              struct lp_type src_type,
+              struct lp_type dst_type,
               double cycles,
               boolean success)
 {
@@ -80,8 +80,8 @@ write_tsv_row(FILE *fp,
 
 static void
 dump_conv_types(FILE *fp,
-               union lp_type src_type,
-               union lp_type dst_type)
+               struct lp_type src_type,
+               struct lp_type dst_type)
 {
    fprintf(fp, "src_type=");
    dump_type(fp, src_type);
@@ -96,8 +96,8 @@ dump_conv_types(FILE *fp,
 
 static LLVMValueRef
 add_conv_test(LLVMModuleRef module,
-              union lp_type src_type, unsigned num_srcs,
-              union lp_type dst_type, unsigned num_dsts)
+              struct lp_type src_type, unsigned num_srcs,
+              struct lp_type dst_type, unsigned num_dsts)
 {
    LLVMTypeRef args[2];
    LLVMValueRef func;
@@ -142,11 +142,12 @@ add_conv_test(LLVMModuleRef module,
 }
 
 
+ALIGN_STACK
 static boolean
 test_one(unsigned verbose,
          FILE *fp,
-         union lp_type src_type,
-         union lp_type dst_type)
+         struct lp_type src_type,
+         struct lp_type dst_type)
 {
    LLVMModuleRef module = NULL;
    LLVMValueRef func = NULL;
@@ -156,8 +157,8 @@ test_one(unsigned verbose,
    char *error = NULL;
    conv_test_ptr_t conv_test_ptr;
    boolean success;
-   const unsigned n = 32;
-   int64_t cycles[n];
+   const unsigned n = LP_TEST_NUM_SAMPLES;
+   int64_t cycles[LP_TEST_NUM_SAMPLES];
    double cycles_avg = 0.0;
    unsigned num_srcs;
    unsigned num_dsts;
@@ -229,8 +230,8 @@ test_one(unsigned verbose,
    for(i = 0; i < n && success; ++i) {
       unsigned src_stride = src_type.length*src_type.width/8;
       unsigned dst_stride = dst_type.length*dst_type.width/8;
-      uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
-      uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      ALIGN16_ATTRIB uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+      ALIGN16_ATTRIB uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
       int64_t start_counter = 0;
@@ -343,35 +344,35 @@ test_one(unsigned verbose,
 }
 
 
-const union lp_type conv_types[] = {
+const struct lp_type conv_types[] = {
    /* float, fixed,  sign,  norm, width, len */
 
-   {{  TRUE, FALSE,  TRUE,  TRUE,    32,   4 }},
-   {{  TRUE, FALSE,  TRUE, FALSE,    32,   4 }},
-   {{  TRUE, FALSE, FALSE,  TRUE,    32,   4 }},
-   {{  TRUE, FALSE, FALSE, FALSE,    32,   4 }},
+   {   TRUE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {   TRUE, FALSE,  TRUE, FALSE,    32,   4 },
+   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 },
+   {   TRUE, FALSE, FALSE, FALSE,    32,   4 },
 
    /* TODO: test fixed formats too */
 
-   {{ FALSE, FALSE,  TRUE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE, FALSE,    16,   8 }},
-
-   {{ FALSE, FALSE,  TRUE,  TRUE,    32,   4 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,    32,   4 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,    32,   4 }},
-   {{ FALSE, FALSE, FALSE, FALSE,    32,   4 }},
-
-   {{ FALSE, FALSE,  TRUE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,    16,   8 }},
-   {{ FALSE, FALSE, FALSE, FALSE,    16,   8 }},
-
-   {{ FALSE, FALSE,  TRUE,  TRUE,     8,  16 }},
-   {{ FALSE, FALSE,  TRUE, FALSE,     8,  16 }},
-   {{ FALSE, FALSE, FALSE,  TRUE,     8,  16 }},
-   {{ FALSE, FALSE, FALSE, FALSE,     8,  16 }},
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,    32,   4 },
+   {  FALSE, FALSE,  TRUE, FALSE,    32,   4 },
+   {  FALSE, FALSE, FALSE,  TRUE,    32,   4 },
+   {  FALSE, FALSE, FALSE, FALSE,    32,   4 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,    16,   8 },
+   {  FALSE, FALSE,  TRUE, FALSE,    16,   8 },
+   {  FALSE, FALSE, FALSE,  TRUE,    16,   8 },
+   {  FALSE, FALSE, FALSE, FALSE,    16,   8 },
+
+   {  FALSE, FALSE,  TRUE,  TRUE,     8,  16 },
+   {  FALSE, FALSE,  TRUE, FALSE,     8,  16 },
+   {  FALSE, FALSE, FALSE,  TRUE,     8,  16 },
+   {  FALSE, FALSE, FALSE, FALSE,     8,  16 },
 };
 
 
@@ -381,8 +382,8 @@ const unsigned num_types = sizeof(conv_types)/sizeof(conv_types[0]);
 boolean
 test_all(unsigned verbose, FILE *fp)
 {
-   const union lp_type *src_type;
-   const union lp_type *dst_type;
+   const struct lp_type *src_type;
+   const struct lp_type *dst_type;
    bool success = TRUE;
 
    for(src_type = conv_types; src_type < &conv_types[num_types]; ++src_type) {
@@ -407,16 +408,16 @@ test_all(unsigned verbose, FILE *fp)
 boolean
 test_some(unsigned verbose, FILE *fp, unsigned long n)
 {
-   const union lp_type *src_type;
-   const union lp_type *dst_type;
+   const struct lp_type *src_type;
+   const struct lp_type *dst_type;
    unsigned long i;
    bool success = TRUE;
 
    for(i = 0; i < n; ++i) {
-      src_type = &conv_types[random() % num_types];
+      src_type = &conv_types[rand() % num_types];
       
       do {
-         dst_type = &conv_types[random() % num_types];
+         dst_type = &conv_types[rand() % num_types];
       } while (src_type == dst_type || src_type->norm != dst_type->norm);
 
       if(!test_one(verbose, fp, *src_type, *dst_type))
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index 1d192355eed..23ea9ebbe7d 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -35,10 +35,11 @@
 #include <llvm-c/Target.h>
 #include <llvm-c/Transforms/Scalar.h>
 
+#include "util/u_cpu_detect.h"
 #include "util/u_format.h"
 
-#include "lp_bld_flow.h"
 #include "lp_bld_format.h"
+#include "lp_test.h"
 
 
 struct pixel_test_case
@@ -89,40 +90,63 @@ struct pixel_test_case test_cases[] =
 };
 
 
-typedef void (*load_ptr_t)(const void *, float *);
+void
+write_tsv_header(FILE *fp)
+{
+   fprintf(fp,
+           "result\t"
+           "format\n");
+
+   fflush(fp);
+}
+
+
+static void
+write_tsv_row(FILE *fp,
+              const struct util_format_description *desc,
+              boolean success)
+{
+   fprintf(fp, "%s\t", success ? "pass" : "fail");
+
+   fprintf(fp, "%s\n", desc->name);
+
+   fflush(fp);
+}
+
+
+typedef void (*load_ptr_t)(const uint32_t packed, float *);
 
 
 static LLVMValueRef
 add_load_rgba_test(LLVMModuleRef module,
-                   enum pipe_format format)
+                   const struct util_format_description *desc)
 {
    LLVMTypeRef args[2];
    LLVMValueRef func;
-   LLVMValueRef ptr;
+   LLVMValueRef packed;
    LLVMValueRef rgba_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef rgba;
-   struct lp_build_loop_state loop;
 
-   args[0] = LLVMPointerType(LLVMInt8Type(), 0);
+   args[0] = LLVMInt32Type();
    args[1] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0);
 
    func = LLVMAddFunction(module, "load", LLVMFunctionType(LLVMVoidType(), args, 2, 0));
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
-   ptr = LLVMGetParam(func, 0);
+   packed = LLVMGetParam(func, 0);
    rgba_ptr = LLVMGetParam(func, 1);
 
    block = LLVMAppendBasicBlock(func, "entry");
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
 
-   lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 1, 0), &loop);
+   if(desc->block.bits < 32)
+      packed = LLVMBuildTrunc(builder, packed, LLVMIntType(desc->block.bits), "");
 
-   rgba = lp_build_load_rgba(builder, format, ptr);
-   LLVMBuildStore(builder, rgba, rgba_ptr);
+   rgba = lp_build_unpack_rgba_aos(builder, desc, packed);
 
-   lp_build_loop_end(builder, LLVMConstInt(LLVMInt32Type(), 4, 0), NULL, &loop);
+   LLVMBuildStore(builder, rgba, rgba_ptr);
 
    LLVMBuildRetVoid(builder);
 
@@ -131,27 +155,28 @@ add_load_rgba_test(LLVMModuleRef module,
 }
 
 
-typedef void (*store_ptr_t)(void *, const float *);
+typedef void (*store_ptr_t)(uint32_t *, const float *);
 
 
 static LLVMValueRef
 add_store_rgba_test(LLVMModuleRef module,
-                    enum pipe_format format)
+                    const struct util_format_description *desc)
 {
    LLVMTypeRef args[2];
    LLVMValueRef func;
-   LLVMValueRef ptr;
+   LLVMValueRef packed_ptr;
    LLVMValueRef rgba_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef rgba;
+   LLVMValueRef packed;
 
-   args[0] = LLVMPointerType(LLVMInt8Type(), 0);
+   args[0] = LLVMPointerType(LLVMInt32Type(), 0);
    args[1] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0);
 
    func = LLVMAddFunction(module, "store", LLVMFunctionType(LLVMVoidType(), args, 2, 0));
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
-   ptr = LLVMGetParam(func, 0);
+   packed_ptr = LLVMGetParam(func, 0);
    rgba_ptr = LLVMGetParam(func, 1);
 
    block = LLVMAppendBasicBlock(func, "entry");
@@ -160,7 +185,12 @@ add_store_rgba_test(LLVMModuleRef module,
 
    rgba = LLVMBuildLoad(builder, rgba_ptr, "");
 
-   lp_build_store_rgba(builder, format, ptr, rgba);
+   packed = lp_build_pack_rgba_aos(builder, desc, rgba);
+
+   if(desc->block.bits < 32)
+      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
+
+   LLVMBuildStore(builder, packed, packed_ptr);
 
    LLVMBuildRetVoid(builder);
 
@@ -169,8 +199,9 @@ add_store_rgba_test(LLVMModuleRef module,
 }
 
 
+ALIGN_STACK
 static boolean
-test_format(const struct pixel_test_case *test)
+test_format(unsigned verbose, FILE *fp, const struct pixel_test_case *test)
 {
    LLVMModuleRef module = NULL;
    LLVMValueRef load = NULL;
@@ -192,8 +223,8 @@ test_format(const struct pixel_test_case *test)
 
    module = LLVMModuleCreateWithName("test");
 
-   load = add_load_rgba_test(module, test->format);
-   store = add_store_rgba_test(module, test->format);
+   load = add_load_rgba_test(module, desc);
+   store = add_store_rgba_test(module, desc);
 
    if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) {
       LLVMDumpModule(module);
@@ -229,7 +260,7 @@ test_format(const struct pixel_test_case *test)
    memset(unpacked, 0, sizeof unpacked);
    packed = 0;
 
-   load_ptr(&test->packed, unpacked);
+   load_ptr(test->packed, unpacked);
    store_ptr(&packed, unpacked);
 
    success = TRUE;
@@ -255,18 +286,29 @@ test_format(const struct pixel_test_case *test)
    if(pass)
       LLVMDisposePassManager(pass);
 
+   if(fp)
+      write_tsv_row(fp, desc, success);
+
    return success;
 }
 
 
-int main(int argc, char **argv)
+boolean
+test_all(unsigned verbose, FILE *fp)
 {
    unsigned i;
-   int ret;
+   bool success = TRUE;
 
    for (i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); ++i)
-      if(!test_format(&test_cases[i]))
-        ret = 1;
+      if(!test_format(verbose, fp, &test_cases[i]))
+        success = FALSE;
 
-   return ret;
+   return success;
+}
+
+
+boolean
+test_some(unsigned verbose, FILE *fp, unsigned long n)
+{
+   return test_all(verbose, fp);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 49213fb4f0b..314544aa9a6 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -34,13 +34,28 @@
  */
 
 
+#include "util/u_cpu_detect.h"
+
 #include "lp_bld_const.h"
+#include "lp_bld_misc.h"
 #include "lp_test.h"
 
 
+#ifdef PIPE_CC_MSVC
+static INLINE double
+round(double x)
+{
+   if (x >= 0.0)
+      return floor(x + 0.5);
+   else
+      return ceil(x - 0.5);
+}
+#endif
+
+
 void
 dump_type(FILE *fp,
-          union lp_type type)
+          struct lp_type type)
 {
    fprintf(fp, "%s%s%u%sx%u",
            type.sign ? (type.floating || type.fixed ? "" : "s") : "u",
@@ -52,7 +67,7 @@ dump_type(FILE *fp,
 
 
 double
-read_elem(union lp_type type, const void *src, unsigned index)
+read_elem(struct lp_type type, const void *src, unsigned index)
 {
    double scale = lp_const_scale(type);
    double value;
@@ -115,7 +130,7 @@ read_elem(union lp_type type, const void *src, unsigned index)
 
 
 void
-write_elem(union lp_type type, void *dst, unsigned index, double value)
+write_elem(struct lp_type type, void *dst, unsigned index, double value)
 {
    assert(index < type.length);
    if(!type.sign && value < 0.0)
@@ -184,11 +199,11 @@ write_elem(union lp_type type, void *dst, unsigned index, double value)
 
 
 void
-random_elem(union lp_type type, void *dst, unsigned index)
+random_elem(struct lp_type type, void *dst, unsigned index)
 {
    double value;
    assert(index < type.length);
-   value = (double)random()/(double)RAND_MAX;
+   value = (double)rand()/(double)RAND_MAX;
    if(!type.norm) {
       unsigned long long mask;
       if (type.floating)
@@ -199,17 +214,17 @@ random_elem(union lp_type type, void *dst, unsigned index)
          mask = ((unsigned long long)1 << (type.width - 1)) - 1;
       else
          mask = ((unsigned long long)1 << type.width) - 1;
-      value += (double)(mask & random());
+      value += (double)(mask & rand());
    }
    if(!type.sign)
-      if(random() & 1)
+      if(rand() & 1)
          value = -value;
    write_elem(type, dst, index, value);
 }
 
 
 void
-read_vec(union lp_type type, const void *src, double *dst)
+read_vec(struct lp_type type, const void *src, double *dst)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i)
@@ -218,7 +233,7 @@ read_vec(union lp_type type, const void *src, double *dst)
 
 
 void
-write_vec(union lp_type type, void *dst, const double *src)
+write_vec(struct lp_type type, void *dst, const double *src)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i)
@@ -229,12 +244,12 @@ write_vec(union lp_type type, void *dst, const double *src)
 float
 random_float(void)
 {
-    return (float)((double)random()/(double)RAND_MAX);
+    return (float)((double)rand()/(double)RAND_MAX);
 }
 
 
 void
-random_vec(union lp_type type, void *dst)
+random_vec(struct lp_type type, void *dst)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i)
@@ -243,7 +258,7 @@ random_vec(union lp_type type, void *dst)
 
 
 boolean
-compare_vec_with_eps(union lp_type type, const void *res, const void *ref, double eps)
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i) {
@@ -259,7 +274,7 @@ compare_vec_with_eps(union lp_type type, const void *res, const void *ref, doubl
 
 
 boolean
-compare_vec(union lp_type type, const void *res, const void *ref)
+compare_vec(struct lp_type type, const void *res, const void *ref)
 {
    double eps = lp_const_eps(type);
    return compare_vec_with_eps(type, res, ref, eps);
@@ -267,7 +282,7 @@ compare_vec(union lp_type type, const void *res, const void *ref)
 
 
 void
-dump_vec(FILE *fp, union lp_type type, const void *src)
+dump_vec(FILE *fp, struct lp_type type, const void *src)
 {
    unsigned i;
    for (i = 0; i < type.length; ++i) {
@@ -365,6 +380,11 @@ int main(int argc, char **argv)
          n = atoi(argv[i]);
    }
 
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+
+   util_cpu_detect();
+
    if(fp) {
       /* Warm up the caches */
       test_some(0, NULL, 100);
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.c b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
index 23a94b5b0d5..773e8482425 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
@@ -154,7 +154,7 @@ lp_tex_tile_cache_validate_texture(struct llvmpipe_tex_tile_cache *tc)
       if (lpt->timestamp != tc->timestamp) {
          /* texture was modified, invalidate all cached tiles */
          uint i;
-         _debug_printf("INV %d %d\n", tc->timestamp, lpt->timestamp);
+         debug_printf("INV %d %d\n", tc->timestamp, lpt->timestamp);
          for (i = 0; i < NUM_ENTRIES; i++) {
             tc->entries[i].addr.bits.invalid = 1;
          }
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index 628ec3f1efd..9ad1bde9565 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -29,10 +29,13 @@
 #define LP_TEX_SAMPLE_H
 
 
+#include <llvm-c/Core.h>
+
 #include "tgsi/tgsi_exec.h"
 
 
 struct llvmpipe_tex_tile_cache;
+struct lp_sampler_static_state;
 
 
 /**
@@ -75,4 +78,24 @@ lp_get_samples(struct tgsi_sampler *tgsi_sampler,
                float rgba[NUM_CHANNELS][QUAD_SIZE]);
 
 
+/**
+ * Texture sampling code generator that just calls lp_get_samples C function
+ * for the actual sampling computation.
+ *
+ * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
+ */
+struct lp_build_sampler_soa *
+lp_c_sampler_soa_create(LLVMValueRef context_ptr);
+
+
+/**
+ * Pure-LLVM texture sampling code generator.
+ *
+ * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
+ */
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key,
+                           LLVMValueRef context_ptr);
+
+
 #endif /* LP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
index 94eb6dad5af..a1365a045f1 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
@@ -1578,3 +1578,136 @@ out:
    tgsi_sampler->get_samples( tgsi_sampler, s, t, p, lodbias, rgba );
 }
 
+
+void PIPE_CDECL
+lp_fetch_texel_soa( struct tgsi_sampler **samplers,
+                    uint32_t unit,
+                    float *store )
+{
+   struct tgsi_sampler *sampler = samplers[unit];
+
+#if 0
+   uint j;
+
+   debug_printf("%s sampler: %p (%p) store: %p\n",
+                __FUNCTION__,
+                sampler, *sampler,
+                store );
+
+   debug_printf("lodbias %f\n", store[12]);
+
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d texcoord %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j]);
+#endif
+
+   {
+      float rgba[NUM_CHANNELS][QUAD_SIZE];
+      sampler->get_samples(sampler,
+                           &store[0],
+                           &store[4],
+                           &store[8],
+                           0.0f, /*store[12],  lodbias */
+                           rgba);
+      memcpy(store, rgba, sizeof rgba);
+   }
+
+#if 0
+   for (j = 0; j < 4; j++)
+      debug_printf("sample %d result %f %f %f %f\n",
+                   j,
+                   store[0+j],
+                   store[4+j],
+                   store[8+j],
+                   store[12+j]);
+#endif
+}
+
+
+#include "lp_bld_type.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_tgsi.h"
+
+
+struct lp_c_sampler_soa
+{
+   struct lp_build_sampler_soa base;
+
+   LLVMValueRef context_ptr;
+
+   LLVMValueRef samplers_ptr;
+
+   /** Coords/texels store */
+   LLVMValueRef store_ptr;
+};
+
+
+static void
+lp_c_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+static void
+lp_c_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *_sampler,
+                                  LLVMBuilderRef builder,
+                                  struct lp_type type,
+                                  unsigned unit,
+                                  unsigned num_coords,
+                                  const LLVMValueRef *coords,
+                                  LLVMValueRef lodbias,
+                                  LLVMValueRef *texel)
+{
+   struct lp_c_sampler_soa *sampler = (struct lp_c_sampler_soa *)_sampler;
+   LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
+   LLVMValueRef args[3];
+   unsigned i;
+
+   if(!sampler->samplers_ptr)
+      sampler->samplers_ptr = lp_jit_context_samplers(builder, sampler->context_ptr);
+
+   if(!sampler->store_ptr)
+      sampler->store_ptr = LLVMBuildArrayAlloca(builder,
+                                            vec_type,
+                                            LLVMConstInt(LLVMInt32Type(), 4, 0),
+                                            "texel_store");
+
+   for (i = 0; i < num_coords; i++) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef coord_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
+      LLVMBuildStore(builder, coords[i], coord_ptr);
+   }
+
+   args[0] = sampler->samplers_ptr;
+   args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   args[2] = sampler->store_ptr;
+
+   lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
+
+   for (i = 0; i < NUM_CHANNELS; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef texel_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
+      texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
+   }
+}
+
+
+struct lp_build_sampler_soa *
+lp_c_sampler_soa_create(LLVMValueRef context_ptr)
+{
+   struct lp_c_sampler_soa *sampler;
+
+   sampler = CALLOC_STRUCT(lp_c_sampler_soa);
+   if(!sampler)
+      return NULL;
+
+   sampler->base.destroy = lp_c_sampler_soa_destroy;
+   sampler->base.emit_fetch_texel = lp_c_sampler_soa_emit_fetch_texel;
+   sampler->context_ptr = context_ptr;
+
+   return &sampler->base;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
new file mode 100644
index 00000000000..d2a6ae21f57
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_llvm.c
@@ -0,0 +1,196 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture sampling code generation
+ *
+ * This file is nothing more than ugly glue between three largely independent
+ * entities:
+ * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa)
+ * - texture sampling code generation (i.e., lp_build_sample_soa)
+ * - LLVM pipe driver
+ *
+ * All interesting code is in the functions mentioned above. There is really
+ * nothing to see here.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_sample.h"
+#include "lp_bld_tgsi.h"
+#include "lp_state.h"
+#include "lp_tex_sample.h"
+
+
+/**
+ * This provides the bridge between the sampler state stored in lp_jit_context
+ * and lp_jit_texture and the sampler code generator. It provides the
+ * texture layout information required by the texture sampler code generator
+ * in terms of the state stored in lp_jit_context and lp_jit_texture at runtime.
+ */
+struct llvmpipe_sampler_dynamic_state
+{
+   struct lp_sampler_dynamic_state base;
+
+   const struct lp_sampler_static_state *static_state;
+
+   LLVMValueRef context_ptr;
+};
+
+
+/**
+ * This is the bridge between our sampler and the TGSI translator.
+ */
+struct lp_llvm_sampler_soa
+{
+   struct lp_build_sampler_soa base;
+
+   struct llvmpipe_sampler_dynamic_state dynamic_state;
+};
+
+
+/**
+ * Fetch the specified member of the lp_jit_texture structure.
+ *
+ * @sa http://llvm.org/docs/GetElementPtr.html
+ */
+static LLVMValueRef
+lp_llvm_texture_member(struct lp_sampler_dynamic_state *base,
+                       LLVMBuilderRef builder,
+                       unsigned unit,
+                       unsigned member_index,
+                       const char *member_name)
+{
+   struct llvmpipe_sampler_dynamic_state *state = (struct llvmpipe_sampler_dynamic_state *)base;
+   LLVMValueRef indices[4];
+   LLVMValueRef ptr;
+   LLVMValueRef res;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+
+   /* context[0] */
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   /* context[0].textures */
+   indices[1] = LLVMConstInt(LLVMInt32Type(), LP_JIT_CONTEXT_TEXTURES_INDEX, 0);
+   /* context[0].textures[unit] */
+   indices[2] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   /* context[0].textures[unit].member */
+   indices[3] = LLVMConstInt(LLVMInt32Type(), member_index, 0);
+
+   ptr = LLVMBuildGEP(builder, state->context_ptr, indices, Elements(indices), "");
+
+   res = LLVMBuildLoad(builder, ptr, "");
+
+   lp_build_name(res, "context.texture%u.%s", unit, member_name);
+
+   return res;
+}
+
+
+/**
+ * Helper macro to instantiate the functions that generate the code to fetch
+ * the members of lp_jit_texture to fulfill the sampler code generator requests.
+ *
+ * This complexity is the price we have to pay to keep the texture sampler code
+ * generator a reusable module without dependencies on llvmpipe internals.
+ */
+#define LP_LLVM_TEXTURE_MEMBER(_name, _index) \
+   static LLVMValueRef \
+   lp_llvm_texture_##_name( struct lp_sampler_dynamic_state *base, \
+                            LLVMBuilderRef builder, \
+                            unsigned unit) \
+   { \
+      return lp_llvm_texture_member(base, builder, unit, _index, #_name ); \
+   }
+
+
+LP_LLVM_TEXTURE_MEMBER(width,    LP_JIT_TEXTURE_WIDTH)
+LP_LLVM_TEXTURE_MEMBER(height,   LP_JIT_TEXTURE_HEIGHT)
+LP_LLVM_TEXTURE_MEMBER(stride,   LP_JIT_TEXTURE_STRIDE)
+LP_LLVM_TEXTURE_MEMBER(data_ptr, LP_JIT_TEXTURE_DATA)
+
+
+static void
+lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+static void
+lp_llvm_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *base,
+                                     LLVMBuilderRef builder,
+                                     struct lp_type type,
+                                     unsigned unit,
+                                     unsigned num_coords,
+                                     const LLVMValueRef *coords,
+                                     LLVMValueRef lodbias,
+                                     LLVMValueRef *texel)
+{
+   struct lp_llvm_sampler_soa *sampler = (struct lp_llvm_sampler_soa *)base;
+
+   assert(unit < PIPE_MAX_SAMPLERS);
+
+   lp_build_sample_soa(builder,
+                       &sampler->dynamic_state.static_state[unit],
+                       &sampler->dynamic_state.base,
+                       type,
+                       unit,
+                       num_coords,
+                       coords,
+                       lodbias,
+                       texel);
+}
+
+
+struct lp_build_sampler_soa *
+lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                           LLVMValueRef context_ptr)
+{
+   struct lp_llvm_sampler_soa *sampler;
+
+   sampler = CALLOC_STRUCT(lp_llvm_sampler_soa);
+   if(!sampler)
+      return NULL;
+
+   sampler->base.destroy = lp_llvm_sampler_soa_destroy;
+   sampler->base.emit_fetch_texel = lp_llvm_sampler_soa_emit_fetch_texel;
+   sampler->dynamic_state.base.width = lp_llvm_texture_width;
+   sampler->dynamic_state.base.height = lp_llvm_texture_height;
+   sampler->dynamic_state.base.stride = lp_llvm_texture_stride;
+   sampler->dynamic_state.base.data_ptr = lp_llvm_texture_data_ptr;
+   sampler->dynamic_state.static_state = static_state;
+   sampler->dynamic_state.context_ptr = context_ptr;
+
+   return &sampler->base;
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 724d4378336..a00f2495dfc 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -66,16 +66,24 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
    pf_get_block(lpt->base.format, &lpt->base.block);
 
    for (level = 0; level <= pt->last_level; level++) {
+      unsigned nblocksx, nblocksy;
+
       pt->width[level] = width;
       pt->height[level] = height;
       pt->depth[level] = depth;
       pt->nblocksx[level] = pf_get_nblocksx(&pt->block, width);  
-      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height);  
-      lpt->stride[level] = align(pt->nblocksx[level]*pt->block.size, 16);
+      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height);
+
+      /* Allocate storage for whole quads. This is particularly important
+       * for depth surfaces, which are currently stored in a swizzled format. */
+      nblocksx = pf_get_nblocksx(&pt->block, align(width, 2));
+      nblocksy = pf_get_nblocksy(&pt->block, align(height, 2));
+
+      lpt->stride[level] = align(nblocksx*pt->block.size, 16);
 
       lpt->level_offset[level] = buffer_size;
 
-      buffer_size += (pt->nblocksy[level] *
+      buffer_size += (nblocksy *
                       ((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) *
                       lpt->stride[level]);
 
@@ -353,17 +361,9 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
 
    if(lpt->dt) {
       struct llvmpipe_winsys *winsys = screen->winsys;
-      unsigned flags = 0;
-
-      if (transfer->usage != PIPE_TRANSFER_READ) {
-         flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
-      }
-
-      if (transfer->usage != PIPE_TRANSFER_WRITE) {
-         flags |= PIPE_BUFFER_USAGE_CPU_READ;
-      }
 
-      map = winsys->displaytarget_map(winsys, lpt->dt, flags);
+      map = winsys->displaytarget_map(winsys, lpt->dt,
+                                      pipe_transfer_buffer_flags(transfer));
       if (map == NULL)
          return NULL;
    }
@@ -373,7 +373,7 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
    /* May want to different things here depending on read/write nature
     * of the map:
     */
-   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) 
+   if (transfer->texture && (transfer->usage & PIPE_TRANSFER_WRITE))
    {
       /* Do something to notify sharing contexts of a texture change.
        * In llvmpipe, that would mean flushing the texture cache.
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.c b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
index 143afec3d35..ec3e002d628 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
@@ -44,10 +44,53 @@
 #include "lp_tile_cache.h"
 
 
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+enum llvmpipe_tile_status
+{
+   LP_TILE_STATUS_UNDEFINED = 0,
+   LP_TILE_STATUS_CLEAR = 1,
+   LP_TILE_STATUS_DEFINED = 2
+};
+
+
+struct llvmpipe_cached_tile
+{
+   enum llvmpipe_tile_status status;
+
+   /** color in SOA format */
+   uint8_t *color;
+};
+
+
+struct llvmpipe_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
+
+   uint8_t clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+
+   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
 struct llvmpipe_tile_cache *
 lp_create_tile_cache( struct pipe_screen *screen )
 {
    struct llvmpipe_tile_cache *tc;
+   int maxLevels, maxTexSize;
+
+   /* sanity checking: make sure MAX_WIDTH/HEIGHT >= largest texture image */
+   maxLevels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+   maxTexSize = 1 << (maxLevels - 1);
+   assert(MAX_WIDTH >= maxTexSize);
 
    tc = CALLOC_STRUCT( llvmpipe_tile_cache );
    if(!tc)
@@ -193,44 +236,41 @@ lp_flush_tile_cache(struct llvmpipe_tile_cache *tc)
    if(!pt)
       return;
 
+   assert(tc->transfer_map);
+
    /* push the tile to all positions marked as clear */
    for (y = 0; y < pt->height; y += TILE_SIZE) {
       for (x = 0; x < pt->width; x += TILE_SIZE) {
          struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
 
-         switch(tile->status) {
-         case LP_TILE_STATUS_UNDEFINED:
-            break;
-
-         case LP_TILE_STATUS_CLEAR: {
-            /**
-             * Actually clear the tiles which were flagged as being in a clear state.
-             */
-
-            struct pipe_screen *screen = pt->texture->screen;
-            unsigned tw = TILE_SIZE;
-            unsigned th = TILE_SIZE;
-            void *dst;
+         if(tile->status != LP_TILE_STATUS_UNDEFINED) {
+            unsigned w = TILE_SIZE;
+            unsigned h = TILE_SIZE;
+
+            if (!pipe_clip_tile(x, y, &w, &h, pt)) {
+               switch(tile->status) {
+               case LP_TILE_STATUS_CLEAR:
+                  /* Actually clear the tiles which were flagged as being in a
+                   * clear state. */
+                  util_fill_rect(tc->transfer_map, &pt->block, pt->stride,
+                                 x, y, w, h,
+                                 tc->clear_val);
+                  break;
+
+               case LP_TILE_STATUS_DEFINED:
+                  lp_tile_write_4ub(pt->format,
+                                    tile->color,
+                                    tc->transfer_map, pt->stride,
+                                    x, y, w, h);
+                  break;
+
+               default:
+                  assert(0);
+                  break;
+               }
+            }
 
-            if (pipe_clip_tile(x, y, &tw, &th, pt))
-               continue;
-
-            dst = screen->transfer_map(screen, pt);
-            assert(dst);
-            if(!dst)
-               continue;
-
-            util_fill_rect(dst, &pt->block, pt->stride,
-                           x, y, tw,  th,
-                           tc->clear_val);
-
-            screen->transfer_unmap(screen, pt);
-            break;
-         }
-
-         case LP_TILE_STATUS_DEFINED:
-            lp_put_tile_rgba_soa(pt, x, y, tile->color);
-            break;
+            tile->status = LP_TILE_STATUS_UNDEFINED;
          }
       }
    }
@@ -248,6 +288,9 @@ lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
    struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
    struct pipe_transfer *pt = tc->transfer;
    
+   assert(tc->surface);
+   assert(tc->transfer);
+
    switch(tile->status) {
    case LP_TILE_STATUS_CLEAR:
       /* don't get tile from framebuffer, just clear it */
@@ -255,11 +298,22 @@ lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
       tile->status = LP_TILE_STATUS_DEFINED;
       break;
 
-   case LP_TILE_STATUS_UNDEFINED:
-      /* get new tile data from transfer */
-      lp_get_tile_rgba_soa(pt, x, y, tile->color);
+   case LP_TILE_STATUS_UNDEFINED: {
+      unsigned w = TILE_SIZE;
+      unsigned h = TILE_SIZE;
+
+      x &= ~(TILE_SIZE - 1);
+      y &= ~(TILE_SIZE - 1);
+
+      if (!pipe_clip_tile(x, y, &w, &h, tc->transfer))
+         lp_tile_read_4ub(pt->format,
+                          tile->color,
+                          tc->transfer_map, tc->transfer->stride,
+                          x, y, w, h);
+
       tile->status = LP_TILE_STATUS_DEFINED;
       break;
+   }
 
    case LP_TILE_STATUS_DEFINED:
       /* nothing to do */
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.h b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
index 6d8ba5ece7a..161bab37991 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.h
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
@@ -33,42 +33,7 @@
 #include "lp_tile_soa.h"
 
 
-enum llvmpipe_tile_status
-{
-   LP_TILE_STATUS_UNDEFINED = 0,
-   LP_TILE_STATUS_CLEAR = 1,
-   LP_TILE_STATUS_DEFINED = 2
-};
-
-
-struct llvmpipe_cached_tile
-{
-   enum llvmpipe_tile_status status;
-
-   /** color in SOA format */
-   uint8_t *color;
-};
-
-
-/** XXX move these */
-#define MAX_WIDTH 2048
-#define MAX_HEIGHT 2048
-
-
-struct llvmpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-
-   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
-
-   uint8_t clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-
-   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
-};
+struct llvmpipe_tile_cache;  /* opaque */
 
 
 extern struct llvmpipe_tile_cache *
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.c b/src/gallium/drivers/llvmpipe/lp_tile_soa.c
deleted file mode 100644
index 4e4ccb31ccd..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.c
+++ /dev/null
@@ -1,931 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * RGBA/float tile get/put functions.
- * Usable both by drivers and state trackers.
- */
-
-
-#include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "util/u_rect.h"
-#include "util/u_tile.h"
-#include "lp_tile_cache.h"
-#include "lp_tile_soa.h"
-
-
-const unsigned char
-tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {
-   {  0,  1,  4,  5,  8,  9, 12, 13},
-   {  2,  3,  6,  7, 10, 11, 14, 15}
-};
-
-
-
-/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
-
-static void
-a8r8g8b8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const unsigned pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = (pixel >> 16) & 0xff;
-         TILE_PIXEL(p, j, i, 1) = (pixel >>  8) & 0xff;
-         TILE_PIXEL(p, j, i, 2) = (pixel >>  0) & 0xff;
-         TILE_PIXEL(p, j, i, 3) = (pixel >> 24) & 0xff;
-      }
-   }
-}
-
-
-static void
-a8r8g8b8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         a = TILE_PIXEL(p, j, i, 3);
-         *dst++ = (a << 24) | (r << 16) | (g << 8) | b;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
-
-static void
-x8r8g8b8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const unsigned pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = (pixel >> 16) & 0xff;
-         TILE_PIXEL(p, j, i, 1) = (pixel >>  8) & 0xff;
-         TILE_PIXEL(p, j, i, 2) = (pixel >>  0) & 0xff;
-         TILE_PIXEL(p, j, i, 3) = 0xff;
-      }
-   }
-}
-
-
-static void
-x8r8g8b8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         *dst++ = (0xff << 24) | (r << 16) | (g << 8) | b;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/
-
-static void
-b8g8r8a8_get_tile_rgba(const unsigned *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const unsigned pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = (pixel >>  8) & 0xff;
-         TILE_PIXEL(p, j, i, 1) = (pixel >> 16) & 0xff;
-         TILE_PIXEL(p, j, i, 2) = (pixel >> 24) & 0xff;
-         TILE_PIXEL(p, j, i, 3) = (pixel >>  0) & 0xff;
-      }
-   }
-}
-
-
-static void
-b8g8r8a8_put_tile_rgba(unsigned *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         a = TILE_PIXEL(p, j, i, 3);
-         *dst++ = (b << 24) | (g << 16) | (r << 8) | a;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_A1R5G5B5_UNORM ***/
-
-static void
-a1r5g5b5_get_tile_rgba(const ushort *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const ushort pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = ((pixel >> 10) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 1) = ((pixel >>  5) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 2) = ((pixel      ) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 3) = ((pixel >> 15)       ) * 255;
-      }
-   }
-}
-
-
-static void
-a1r5g5b5_put_tile_rgba(ushort *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         a = TILE_PIXEL(p, j, i, 3);
-         r = r >> 3;  /* 5 bits */
-         g = g >> 3;  /* 5 bits */
-         b = b >> 3;  /* 5 bits */
-         a = a >> 7;  /* 1 bit */
-         *dst++ = (a << 15) | (r << 10) | (g << 5) | b;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_A4R4G4B4_UNORM ***/
-
-static void
-a4r4g4b4_get_tile_rgba(const ushort *src,
-                       unsigned w, unsigned h,
-                       uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const ushort pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = ((pixel >>  8) & 0xf) * 255 / 15;
-         TILE_PIXEL(p, j, i, 1) = ((pixel >>  4) & 0xf) * 255 / 15;
-         TILE_PIXEL(p, j, i, 2) = ((pixel      ) & 0xf) * 255 / 15;
-         TILE_PIXEL(p, j, i, 3) = ((pixel >> 12)      ) * 255 / 15;
-      }
-   }
-}
-
-
-static void
-a4r4g4b4_put_tile_rgba(ushort *dst,
-                       unsigned w, unsigned h,
-                       const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, g, b, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         g = TILE_PIXEL(p, j, i, 1);
-         b = TILE_PIXEL(p, j, i, 2);
-         a = TILE_PIXEL(p, j, i, 3);
-         r >>= 4;
-         g >>= 4;
-         b >>= 4;
-         a >>= 4;
-         *dst++ = (a << 12) | (r << 16) | (g << 4) | b;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_R5G6B5_UNORM ***/
-
-static void
-r5g6b5_get_tile_rgba(const ushort *src,
-                     unsigned w, unsigned h,
-                     uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         const ushort pixel = *src++;
-         TILE_PIXEL(p, j, i, 0) = ((pixel >> 11) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 1) = ((pixel >>  5) & 0x3f) * 255 / 63;
-         TILE_PIXEL(p, j, i, 2) = ((pixel      ) & 0x1f) * 255 / 31;
-         TILE_PIXEL(p, j, i, 3) = 255;
-      }
-   }
-}
-
-
-static void
-r5g6b5_put_tile_rgba(ushort *dst,
-                     unsigned w, unsigned h,
-                     const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         uint r = (uint) TILE_PIXEL(p, j, i, 0) * 31 / 255;
-         uint g = (uint) TILE_PIXEL(p, j, i, 1) * 63 / 255;
-         uint b = (uint) TILE_PIXEL(p, j, i, 2) * 31 / 255;
-         *dst++ = (r << 11) | (g << 5) | (b);
-      }
-   }
-}
-
-
-
-/*** PIPE_FORMAT_Z16_UNORM ***/
-
-/**
- * Return each Z value as four floats in [0,1].
- */
-static void
-z16_get_tile_rgba(const ushort *src,
-                  unsigned w, unsigned h,
-                  uint8_t *p)
-{
-   const float scale = 1.0f / 65535.0f;
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = *src++ * scale;
-      }
-   }
-}
-
-
-
-
-/*** PIPE_FORMAT_L8_UNORM ***/
-
-static void
-l8_get_tile_rgba(const ubyte *src,
-                 unsigned w, unsigned h,
-                 uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) = *src;
-         TILE_PIXEL(p, j, i, 3) = 255;
-      }
-   }
-}
-
-
-static void
-l8_put_tile_rgba(ubyte *dst,
-                 unsigned w, unsigned h,
-                 const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r;
-         r = TILE_PIXEL(p, j, i, 0);
-         *dst++ = (ubyte) r;
-      }
-   }
-}
-
-
-
-/*** PIPE_FORMAT_A8_UNORM ***/
-
-static void
-a8_get_tile_rgba(const ubyte *src,
-                 unsigned w, unsigned h,
-                 uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) = 0;
-         TILE_PIXEL(p, j, i, 3) = *src;
-      }
-   }
-}
-
-
-static void
-a8_put_tile_rgba(ubyte *dst,
-                 unsigned w, unsigned h,
-                 const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned a;
-         a = TILE_PIXEL(p, j, i, 3);
-         *dst++ = (ubyte) a;
-      }
-   }
-}
-
-
-
-/*** PIPE_FORMAT_R16_SNORM ***/
-
-static void
-r16_get_tile_rgba(const short *src,
-                  unsigned w, unsigned h,
-                  uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src++) {
-         TILE_PIXEL(p, j, i, 0) = MAX2(src[0] >> 7, 0);
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) = 0;
-         TILE_PIXEL(p, j, i, 3) = 255;
-      }
-   }
-}
-
-
-static void
-r16_put_tile_rgba(short *dst,
-                  unsigned w, unsigned h,
-                  const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, dst++) {
-         dst[0] = TILE_PIXEL(p, j, i, 0) << 7;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_R16G16B16A16_SNORM ***/
-
-static void
-r16g16b16a16_get_tile_rgba(const short *src,
-                           unsigned w, unsigned h,
-                           uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src += 4) {
-         TILE_PIXEL(p, j, i, 0) = src[0] >> 8;
-         TILE_PIXEL(p, j, i, 1) = src[1] >> 8;
-         TILE_PIXEL(p, j, i, 2) = src[2] >> 8;
-         TILE_PIXEL(p, j, i, 3) = src[3] >> 8;
-      }
-   }
-}
-
-
-static void
-r16g16b16a16_put_tile_rgba(short *dst,
-                           unsigned w, unsigned h,
-                           const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, dst += 4) {
-         dst[0] = TILE_PIXEL(p, j, i, 0) << 8;
-         dst[1] = TILE_PIXEL(p, j, i, 1) << 8;
-         dst[2] = TILE_PIXEL(p, j, i, 2) << 8;
-         dst[3] = TILE_PIXEL(p, j, i, 3) << 8;
-      }
-   }
-}
-
-
-
-/*** PIPE_FORMAT_I8_UNORM ***/
-
-static void
-i8_get_tile_rgba(const ubyte *src,
-                 unsigned w, unsigned h,
-                 uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++, src++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = *src;
-      }
-   }
-}
-
-
-static void
-i8_put_tile_rgba(ubyte *dst,
-                 unsigned w, unsigned h,
-                 const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r;
-         r = TILE_PIXEL(p, j, i, 0);
-         *dst++ = (ubyte) r;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_A8L8_UNORM ***/
-
-static void
-a8l8_get_tile_rgba(const ushort *src,
-                   unsigned w, unsigned h,
-                   uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         ushort ra = *src++;
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) = ra & 0xff;
-         TILE_PIXEL(p, j, i, 3) = ra >> 8;
-      }
-   }
-}
-
-
-static void
-a8l8_put_tile_rgba(ushort *dst,
-                   unsigned w, unsigned h,
-                   const uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         unsigned r, a;
-         r = TILE_PIXEL(p, j, i, 0);
-         a = TILE_PIXEL(p, j, i, 3);
-         *dst++ = (a << 8) | r;
-      }
-   }
-}
-
-
-
-
-/*** PIPE_FORMAT_Z32_UNORM ***/
-
-/**
- * Return each Z value as four floats in [0,1].
- */
-static void
-z32_get_tile_rgba(const unsigned *src,
-                  unsigned w, unsigned h,
-                  uint8_t *p)
-{
-   const double scale = 1.0 / (double) 0xffffffff;
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = (float) (*src++ * scale);
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_S8Z24_UNORM ***/
-
-/**
- * Return Z component as four float in [0,1].  Stencil part ignored.
- */
-static void
-s8z24_get_tile_rgba(const unsigned *src,
-                    unsigned w, unsigned h,
-                    uint8_t *p)
-{
-   const double scale = 1.0 / ((1 << 24) - 1);
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = (float) (scale * (*src++ & 0xffffff));
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_Z24S8_UNORM ***/
-
-/**
- * Return Z component as four float in [0,1].  Stencil part ignored.
- */
-static void
-z24s8_get_tile_rgba(const unsigned *src,
-                    unsigned w, unsigned h,
-                    uint8_t *p)
-{
-   const double scale = 1.0 / ((1 << 24) - 1);
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = (float) (scale * (*src++ >> 8));
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_Z32_FLOAT ***/
-
-/**
- * Return each Z value as four floats in [0,1].
- */
-static void
-z32f_get_tile_rgba(const float *src,
-                   unsigned w, unsigned h,
-                   uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = *src++;
-      }
-   }
-}
-
-
-/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
-
-/**
- * Convert YCbCr (or YCrCb) to RGBA.
- */
-static void
-ycbcr_get_tile_rgba(const ushort *src,
-                    unsigned w, unsigned h,
-                    uint8_t *p,
-                    boolean rev)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      /* do two texels at a time */
-      for (j = 0; j < (w & ~1); j += 2, src += 2) {
-         const ushort t0 = src[0];
-         const ushort t1 = src[1];
-         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
-         const ubyte y1 = (t1 >> 8) & 0xff;  /* luminance */
-         ubyte cb, cr;
-         float r, g, b;
-
-         if (rev) {
-            cb = t1 & 0xff;         /* chroma U */
-            cr = t0 & 0xff;         /* chroma V */
-         }
-         else {
-            cb = t0 & 0xff;         /* chroma U */
-            cr = t1 & 0xff;         /* chroma V */
-         }
-
-         /* even pixel: y0,cr,cb */
-         r = 1.164f * (y0-16) + 1.596f * (cr-128);
-         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
-         b = 1.164f * (y0-16) + 2.018f * (cb-128);
-         TILE_PIXEL(p, j, i, 0) = r;
-         TILE_PIXEL(p, j, i, 1) = g;
-         TILE_PIXEL(p, j, i, 2) = b;
-         TILE_PIXEL(p, j, i, 3) = 255;
-
-         /* odd pixel: use y1,cr,cb */
-         r = 1.164f * (y1-16) + 1.596f * (cr-128);
-         g = 1.164f * (y1-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
-         b = 1.164f * (y1-16) + 2.018f * (cb-128);
-         TILE_PIXEL(p, j + 1, i, 0) = r;
-         TILE_PIXEL(p, j + 1, i, 1) = g;
-         TILE_PIXEL(p, j + 1, i, 2) = b;
-         TILE_PIXEL(p, j + 1, i, 3) = 255;
-      }
-      /* do the last texel */
-      if (w & 1) {
-         const ushort t0 = src[0];
-         const ushort t1 = src[1];
-         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
-         ubyte cb, cr;
-         float r, g, b;
-
-         if (rev) {
-            cb = t1 & 0xff;         /* chroma U */
-            cr = t0 & 0xff;         /* chroma V */
-         }
-         else {
-            cb = t0 & 0xff;         /* chroma U */
-            cr = t1 & 0xff;         /* chroma V */
-         }
-
-         /* even pixel: y0,cr,cb */
-         r = 1.164f * (y0-16) + 1.596f * (cr-128);
-         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
-         b = 1.164f * (y0-16) + 2.018f * (cb-128);
-         TILE_PIXEL(p, j, i, 0) = r;
-         TILE_PIXEL(p, j, i, 1) = g;
-         TILE_PIXEL(p, j, i, 2) = b;
-         TILE_PIXEL(p, j, i, 3) = 255;
-      }
-   }
-}
-
-
-static void
-fake_get_tile_rgba(const ushort *src,
-                   unsigned w, unsigned h,
-                   uint8_t *p)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      for (j = 0; j < w; j++) {
-         TILE_PIXEL(p, j, i, 0) =
-         TILE_PIXEL(p, j, i, 1) =
-         TILE_PIXEL(p, j, i, 2) =
-         TILE_PIXEL(p, j, i, 3) = (i ^ j) & 1 ? 255 : 0;
-      }
-   }
-}
-
-
-static void
-lp_tile_raw_to_rgba_soa(enum pipe_format format,
-                        void *src,
-                        uint w, uint h,
-                        uint8_t *p)
-{
-   switch (format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      a8r8g8b8_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
-      x8r8g8b8_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      b8g8r8a8_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
-      a1r5g5b5_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
-      a4r4g4b4_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_R5G6B5_UNORM:
-      r5g6b5_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_L8_UNORM:
-      l8_get_tile_rgba((ubyte *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_A8_UNORM:
-      a8_get_tile_rgba((ubyte *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_I8_UNORM:
-      i8_get_tile_rgba((ubyte *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_A8L8_UNORM:
-      a8l8_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_R16_SNORM:
-      r16_get_tile_rgba((short *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      r16g16b16a16_get_tile_rgba((short *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_Z16_UNORM:
-      z16_get_tile_rgba((ushort *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_Z32_UNORM:
-      z32_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
-      s8z24_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
-      z24s8_get_tile_rgba((unsigned *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_Z32_FLOAT:
-      z32f_get_tile_rgba((float *) src, w, h, p);
-      break;
-   case PIPE_FORMAT_YCBCR:
-      ycbcr_get_tile_rgba((ushort *) src, w, h, p, FALSE);
-      break;
-   case PIPE_FORMAT_YCBCR_REV:
-      ycbcr_get_tile_rgba((ushort *) src, w, h, p, TRUE);
-      break;
-   default:
-      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(format));
-      fake_get_tile_rgba(src, w, h, p);
-   }
-}
-
-
-void
-lp_get_tile_rgba_soa(struct pipe_transfer *pt,
-                     uint x, uint y,
-                     uint8_t *p)
-{
-   uint w = TILE_SIZE, h = TILE_SIZE;
-   void *packed;
-
-   if (pipe_clip_tile(x, y, &w, &h, pt))
-      return;
-
-   packed = MALLOC(pf_get_nblocks(&pt->block, w, h) * pt->block.size);
-
-   if (!packed)
-      return;
-
-   if(pt->format == PIPE_FORMAT_YCBCR || pt->format == PIPE_FORMAT_YCBCR_REV)
-      assert((x & 1) == 0);
-
-   pipe_get_tile_raw(pt, x, y, w, h, packed, 0);
-
-   lp_tile_raw_to_rgba_soa(pt->format, packed, w, h, p);
-
-   FREE(packed);
-}
-
-
-void
-lp_put_tile_rgba_soa(struct pipe_transfer *pt,
-                     uint x, uint y,
-                     const uint8_t *p)
-{
-   uint w = TILE_SIZE, h = TILE_SIZE;
-   void *packed;
-
-   if (pipe_clip_tile(x, y, &w, &h, pt))
-      return;
-
-   packed = MALLOC(pf_get_nblocks(&pt->block, w, h) * pt->block.size);
-
-   if (!packed)
-      return;
-
-   switch (pt->format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
-      x8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
-      a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_R5G6B5_UNORM:
-      r5g6b5_put_tile_rgba((ushort *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      assert(0);
-      break;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
-      a4r4g4b4_put_tile_rgba((ushort *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_L8_UNORM:
-      l8_put_tile_rgba((ubyte *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_A8_UNORM:
-      a8_put_tile_rgba((ubyte *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_I8_UNORM:
-      i8_put_tile_rgba((ubyte *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_A8L8_UNORM:
-      a8l8_put_tile_rgba((ushort *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_R16_SNORM:
-      r16_put_tile_rgba((short *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      r16g16b16a16_put_tile_rgba((short *) packed, w, h, p);
-      break;
-   case PIPE_FORMAT_Z16_UNORM:
-      /*z16_put_tile_rgba((ushort *) packed, w, h, p);*/
-      break;
-   case PIPE_FORMAT_Z32_UNORM:
-      /*z32_put_tile_rgba((unsigned *) packed, w, h, p);*/
-      break;
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
-      /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p);*/
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
-      /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p);*/
-      break;
-   default:
-      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(pt->format));
-   }
-
-   pipe_put_tile_raw(pt, x, y, w, h, packed, 0);
-
-   FREE(packed);
-}
-
-
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.h b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
index 3d8c703b73d..040b01865dd 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.h
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
@@ -64,14 +64,18 @@ tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH];
 
 
 void
-lp_get_tile_rgba_soa(struct pipe_transfer *pt,
-                     uint x, uint y,
-                     uint8_t *p);
+lp_tile_read_4ub(enum pipe_format format,
+                 uint8_t *dst,
+                 const void *src, unsigned src_stride,
+                 unsigned x, unsigned y, unsigned w, unsigned h);
+
 
 void
-lp_put_tile_rgba_soa(struct pipe_transfer *pt,
-                     uint x, uint y,
-                     const uint8_t *p);
+lp_tile_write_4ub(enum pipe_format format,
+                  const uint8_t *src,
+                  void *dst, unsigned dst_stride,
+                  unsigned x, unsigned y, unsigned w, unsigned h);
+
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
new file mode 100644
index 00000000000..004c5c979e3
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+
+'''
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Pixel format accessor functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+'''
+
+
+import sys
+import os.path
+
+sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '../../auxiliary/util'))
+
+from u_format_access import *
+
+
+def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
+    '''Print the C function lp_tile_<name>_read_<dst_suffix>, which converts a w x h rectangle of `format` pixels into tile channels of dst_native_type.'''
+
+    name = short_name(format)
+
+    src_native_type = native_type(format)
+
+    print 'static void'
+    print 'lp_tile_%s_read_%s(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, dst_suffix, dst_native_type)
+    print '{'
+    print '   unsigned x, y;'
+    print '   const uint8_t *src_row = src + y0*src_stride;'
+    print '   for (y = 0; y < h; ++y) {'
+    print '      const %s *src_pixel = (const %s *)(src_row + x0*%u);' % (src_native_type, src_native_type, format.stride())
+    print '      for (x = 0; x < w; ++x) {'
+
+    names = ['']*4  # names[k]: letters of the output channels that read source channel k ('' if none)
+    if format.colorspace == 'rgb':
+        for i in range(4):
+            swizzle = format.out_swizzle[i]
+            if swizzle < 4:
+                names[swizzle] += 'rgba'[i]  # letters accumulate, e.g. 'rgb' when three outputs share one source channel
+    elif format.colorspace == 'zs':
+        swizzle = format.out_swizzle[0]
+        if swizzle < 4:
+            names[swizzle] = 'z'
+        else:
+            assert False
+    else:
+        assert False
+
+    if format.layout == ARITH:
+        print '         %s pixel = *src_pixel++;' % src_native_type  # the whole pixel is loaded as one integer
+        shift = 0
+        for i in range(4):
+            src_type = format.in_types[i]
+            width = src_type.size
+            if names[i]:
+                value = 'pixel'
+                mask = (1 << width) - 1
+                if shift:
+                    value = '(%s >> %u)' % (value, shift)
+                if shift + width < format.block_size():  # no mask for the channel that reaches the top of the block (assumes block_size() is in bits)
+                    value = '(%s & 0x%x)' % (value, mask)
+                value = conversion_expr(src_type, dst_type, dst_native_type, value)
+                print '         %s %s = %s;' % (dst_native_type, names[i], value)
+            shift += width  # unused channels still occupy their bits
+    elif format.layout == ARRAY:
+        for i in range(4):
+            src_type = format.in_types[i]
+            if names[i]:  # NOTE(review): an unnamed channel emits no src_pixel++ here - confirm array formats never carry unused channels
+                value = '(*src_pixel++)'
+                value = conversion_expr(src_type, dst_type, dst_native_type, value)
+                print '         %s %s = %s;' % (dst_native_type, names[i], value)
+    else:
+        assert False
+    one = str((1 << dst_type.size) - 1)  # 1.0 for an unsigned normalized dst_type (255 for the 8-bit type main() passes); a literal 1 would be 1/255
+    for i in range(4):
+        if format.colorspace == 'rgb':
+            swizzle = format.out_swizzle[i]
+            if swizzle < 4:
+                value = names[swizzle]
+            elif swizzle == SWIZZLE_0:
+                value = '0'
+            elif swizzle == SWIZZLE_1:
+                value = one
+            else:
+                assert False
+        elif format.colorspace == 'zs':
+            if i < 3:
+                value = 'z'  # depth is replicated into r, g and b
+            else:
+                value = one
+        else:
+            assert False
+        print '         TILE_PIXEL(dst, x, y, %u) = %s; /* %s */' % (i, value, 'rgba'[i])
+
+    print '      }'
+    print '      src_row += src_stride;'
+    print '   }'
+    print '}'
+    print
+    
+
+def generate_format_write(format, src_type, src_native_type, src_suffix):
+    '''Print the C function lp_tile_<name>_write_<src_suffix>, which packs tile channels of src_native_type into a w x h rectangle of `format` pixels.'''
+
+    name = short_name(format)
+
+    dst_native_type = native_type(format)
+
+    print 'static void'
+    print 'lp_tile_%s_write_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, src_suffix, src_native_type)
+    print '{'
+    print '   unsigned x, y;'
+    print '   uint8_t *dst_row = dst + y0*dst_stride;'
+    print '   for (y = 0; y < h; ++y) {'
+    print '      %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride())
+    print '      for (x = 0; x < w; ++x) {'
+
+    inv_swizzle = [None]*4  # inv_swizzle[k]: tile channel index stored into format channel k, or None
+    if format.colorspace == 'rgb':
+        for i in range(4):
+            swizzle = format.out_swizzle[i]
+            if swizzle < 4:
+                inv_swizzle[swizzle] = i  # when several outputs share a channel the last one wins
+    elif format.colorspace == 'zs':
+        swizzle = format.out_swizzle[0]
+        if swizzle < 4:
+            inv_swizzle[swizzle] = 0  # depth is taken from tile channel 0
+    else:
+        assert False
+
+    if format.layout == ARITH:
+        print '         %s pixel = 0;' % dst_native_type  # channels without a source stay zero
+        shift = 0;
+        for i in range(4):
+            dst_type = format.in_types[i]
+            width = dst_type.size
+            if inv_swizzle[i] is not None:
+                value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
+                value = conversion_expr(src_type, dst_type, dst_native_type, value)
+                if shift:
+                    value = '(%s << %u)' % (value, shift)  # move the converted channel to its bit position
+                print '         pixel |= %s;' % value
+            shift += width  # unused channels still occupy their bits
+        print '         *dst_pixel++ = pixel;'
+    elif format.layout == ARRAY:
+        for i in range(4):
+            dst_type = format.in_types[i]
+            if inv_swizzle[i] is not None:  # NOTE(review): a channel with no source emits no dst_pixel++ - confirm array formats never carry unused channels
+                value = 'TILE_PIXEL(src, x, y, %u)' % inv_swizzle[i]
+                value = conversion_expr(src_type, dst_type, dst_native_type, value)
+                print '         *dst_pixel++ = %s;' % value
+    else:
+        assert False
+
+    print '      }'
+    print '      dst_row += dst_stride;'
+    print '   }'
+    print '}'
+    print
+    
+
+def generate_read(formats, dst_type, dst_native_type, dst_suffix):
+    '''Print one reader per supported format, then the public lp_tile_read_<dst_suffix> dispatcher that switches on the pipe_format.'''
+
+    for format in formats:
+        if is_format_supported(format):  # unsupported formats get neither a reader nor a case label
+            generate_format_read(format, dst_type, dst_native_type, dst_suffix)
+
+    print 'void'
+    print 'lp_tile_read_%s(enum pipe_format format, %s *dst, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (dst_suffix, dst_native_type)
+    print '{'
+    print '   void (*func)(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % dst_native_type
+    print '   switch(format) {'
+    for format in formats:
+        if is_format_supported(format):
+            print '   case %s:' % format.name
+            print '      func = &lp_tile_%s_read_%s;' % (short_name(format), dst_suffix)
+            print '      break;'
+    print '   default:'  # any other format: log and return without touching dst
+    print '      debug_printf("unsupported format\\n");'
+    print '      return;'
+    print '   }'
+    print '   func(dst, (const uint8_t *)src, src_stride, x, y, w, h);'
+    print '}'
+    print
+
+
+def generate_write(formats, src_type, src_native_type, src_suffix):
+    '''Print one writer per supported format, then the public lp_tile_write_<src_suffix> dispatcher that switches on the pipe_format.'''
+
+    for format in formats:
+        if is_format_supported(format):  # unsupported formats get neither a writer nor a case label
+            generate_format_write(format, src_type, src_native_type, src_suffix)
+
+    print 'void'
+    print 'lp_tile_write_%s(enum pipe_format format, const %s *src, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (src_suffix, src_native_type)
+    # func is selected by the generated switch and called once at the end
+    print '{'
+    print '   void (*func)(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % src_native_type
+    print '   switch(format) {'
+    for format in formats:
+        if is_format_supported(format):
+            print '   case %s:' % format.name
+            print '      func = &lp_tile_%s_write_%s;' % (short_name(format), src_suffix)
+            print '      break;'
+    print '   default:'  # any other format: log and return without touching dst
+    print '      debug_printf("unsupported format\\n");'
+    print '      return;'
+    print '   }'
+    print '   func(src, (uint8_t *)dst, dst_stride, x, y, w, h);'
+    print '}'
+    print
+
+
+def main():
+    formats = []  # format descriptions gathered from every command-line argument
+    for arg in sys.argv[1:]:
+        formats.extend(parse(arg))  # parse() comes from the u_format_access star import
+
+    print '/* This file is autogenerated by lp_tile_soa.py from u_format.csv. Do not edit directly. */'
+    print
+    # This will print the copyright message on the top of this file
+    print __doc__.strip()
+    print
+    print '#include "pipe/p_compiler.h"'
+    print '#include "util/u_format.h"'
+    print '#include "util/u_math.h"'
+    print '#include "lp_tile_soa.h"'
+    print
+    print 'const unsigned char'
+    print 'tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {'  # definition of the table declared in lp_tile_soa.h
+    print '   {  0,  1,  4,  5,  8,  9, 12, 13},'
+    print '   {  2,  3,  6,  7, 10, 11, 14, 15}'
+    print '};'
+    print
+
+    generate_clamp()  # from u_format_access; presumably emits helpers used by conversion_expr - TODO confirm
+
+    type = Type(UNSIGNED, True, 8)  # tile channel type for the '4ub' accessors; shadows the builtin type() inside main only
+    native_type = 'uint8_t'  # local string; shadows the imported native_type() helper inside main only
+    suffix = '4ub'
+
+    generate_read(formats, type, native_type, suffix)  # emits lp_tile_read_4ub and its per-format readers
+    generate_write(formats, type, native_type, suffix)  # emits lp_tile_write_4ub and its per-format writers
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gallium/drivers/nouveau/Makefile b/src/gallium/drivers/nouveau/Makefile
index dbe8a6e7bf5..0cb66041d50 100644
--- a/src/gallium/drivers/nouveau/Makefile
+++ b/src/gallium/drivers/nouveau/Makefile
@@ -3,6 +3,7 @@ include $(TOP)/configs/current
 
 LIBNAME = nouveau
 
-C_SOURCES = nouveau_screen.c
+C_SOURCES = nouveau_screen.c \
+	    nouveau_context.c
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/nouveau/nouveau_context.c b/src/gallium/drivers/nouveau/nouveau_context.c
new file mode 100644
index 00000000000..23443869e68
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_context.c
@@ -0,0 +1,41 @@
+#include <pipe/p_defines.h>
+#include <pipe/p_context.h>
+
+#include "nouveau/nouveau_screen.h"
+#include "nouveau/nouveau_context.h"
+
+#include "nouveau/nouveau_bo.h"
+
+static unsigned int
+nouveau_reference_flags(struct nouveau_bo *bo)
+{
+	uint32_t bo_flags;
+	unsigned int flags = 0; /* unsigned to match the return type; holds PIPE_REFERENCED_FOR_* bits */
+	/* Map the NOUVEAU_BO_RD/WR bits reported by nouveau_bo_pending() onto PIPE_REFERENCED_FOR_READ/WRITE. */
+	bo_flags = nouveau_bo_pending(bo);
+	if (bo_flags & NOUVEAU_BO_RD)
+		flags |= PIPE_REFERENCED_FOR_READ;
+	if (bo_flags & NOUVEAU_BO_WR)
+		flags |= PIPE_REFERENCED_FOR_WRITE;
+
+	return flags;
+}
+
+unsigned int
+nouveau_is_texture_referenced(struct pipe_context *pipe,
+			      struct pipe_texture *pt,
+			      unsigned face, unsigned level)
+{
+	struct nouveau_miptree *mt = nouveau_miptree(pt);
+	/* pipe, face and level are unused: the result is taken from mt->bo whatever face/level is asked for. */
+	return nouveau_reference_flags(mt->bo);
+}
+
+unsigned int
+nouveau_is_buffer_referenced(struct pipe_context *pipe, struct pipe_buffer *pb)
+{
+	struct nouveau_bo *bo = nouveau_bo(pb);
+	/* pipe is unused: the result depends only on the bo obtained from pb. */
+	return nouveau_reference_flags(bo);
+}
+
diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h
new file mode 100644
index 00000000000..6a28d40da7b
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nouveau_context.h
@@ -0,0 +1,11 @@
+#ifndef __NOUVEAU_CONTEXT_H__
+#define __NOUVEAU_CONTEXT_H__
+/* Returns PIPE_REFERENCED_FOR_READ/WRITE bits for the buffer object behind a texture. */
+unsigned int
+nouveau_is_texture_referenced(struct pipe_context *, struct pipe_texture *,
+			      unsigned face, unsigned level);
+/* Returns PIPE_REFERENCED_FOR_READ/WRITE bits for the buffer object behind a pipe_buffer. */
+unsigned int
+nouveau_is_buffer_referenced(struct pipe_context *, struct pipe_buffer *);
+
+#endif /* __NOUVEAU_CONTEXT_H__ */
diff --git a/src/gallium/drivers/nv04/nv04_context.c b/src/gallium/drivers/nv04/nv04_context.c
index 17166c9f51d..10d984ace9b 100644
--- a/src/gallium/drivers/nv04/nv04_context.c
+++ b/src/gallium/drivers/nv04/nv04_context.c
@@ -64,30 +64,6 @@ nv04_init_hwctx(struct nv04_context *nv04)
 	return TRUE;
 }
 
-static unsigned int
-nv04_is_texture_referenced( struct pipe_context *pipe,
-			    struct pipe_texture *texture,
-			    unsigned face, unsigned level)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-static unsigned int
-nv04_is_buffer_referenced( struct pipe_context *pipe,
-			   struct pipe_buffer *buf)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-
 struct pipe_context *
 nv04_create(struct pipe_screen *pscreen, unsigned pctx_id)
 {
@@ -113,8 +89,8 @@ nv04_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv04->pipe.clear = nv04_clear;
 	nv04->pipe.flush = nv04_flush;
 
-	nv04->pipe.is_texture_referenced = nv04_is_texture_referenced;
-	nv04->pipe.is_buffer_referenced = nv04_is_buffer_referenced;
+	nv04->pipe.is_texture_referenced = nouveau_is_texture_referenced;
+	nv04->pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
 
 	nv04_init_surface_functions(nv04);
 	nv04_init_state_functions(nv04);
diff --git a/src/gallium/drivers/nv04/nv04_context.h b/src/gallium/drivers/nv04/nv04_context.h
index 2842b2c90db..55326c787a8 100644
--- a/src/gallium/drivers/nv04/nv04_context.h
+++ b/src/gallium/drivers/nv04/nv04_context.h
@@ -13,6 +13,7 @@
 
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
+#include "nouveau/nouveau_context.h"
 
 #define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
 	struct nv04_screen *ctx = nv04->screen
diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c
index ff2febb668e..170ce3eb7e5 100644
--- a/src/gallium/drivers/nv04/nv04_screen.c
+++ b/src/gallium/drivers/nv04/nv04_screen.c
@@ -16,8 +16,6 @@ nv04_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 0;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index f88e138c79d..8be134b83dd 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -1,5 +1,6 @@
 #include "pipe/p_context.h"
 #include "pipe/p_format.h"
+#include "util/u_math.h"
 #include "util/u_memory.h"
 
 #include "nouveau/nouveau_winsys.h"
@@ -12,10 +13,13 @@ nv04_surface_format(enum pipe_format format)
 {
 	switch (format) {
 	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
 		return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
 	case PIPE_FORMAT_R16_SNORM:
 	case PIPE_FORMAT_R5G6B5_UNORM:
 	case PIPE_FORMAT_Z16_UNORM:
+	case PIPE_FORMAT_A8L8_UNORM:
 		return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
 	case PIPE_FORMAT_X8R8G8B8_UNORM:
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
@@ -35,8 +39,10 @@ nv04_rect_format(enum pipe_format format)
 	case PIPE_FORMAT_A8_UNORM:
 		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
 	case PIPE_FORMAT_R5G6B5_UNORM:
+	case PIPE_FORMAT_A8L8_UNORM:
 	case PIPE_FORMAT_Z16_UNORM:
 		return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
+	case PIPE_FORMAT_X8R8G8B8_UNORM:
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
 	case PIPE_FORMAT_Z24S8_UNORM:
 	case PIPE_FORMAT_Z24X8_UNORM:
@@ -50,6 +56,10 @@ static INLINE int
 nv04_scaled_image_format(enum pipe_format format)
 {
 	switch (format) {
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+		return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8;
 	case PIPE_FORMAT_A1R5G5B5_UNORM:
 		return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5;
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
@@ -58,6 +68,7 @@ nv04_scaled_image_format(enum pipe_format format)
 		return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8;
 	case PIPE_FORMAT_R5G6B5_UNORM:
 	case PIPE_FORMAT_R16_SNORM:
+	case PIPE_FORMAT_A8L8_UNORM:
 		return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5;
 	default:
 		return -1;
@@ -107,17 +118,20 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	struct nouveau_bo *src_bo = nouveau_bo(ctx->buf(src));
 	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
 	const unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+        /* Max width & height may not be the same on all HW, but must be POT */
 	const unsigned max_w = 1024;
 	const unsigned max_h = 1024;
-	const unsigned sub_w = w > max_w ? max_w : w;
-	const unsigned sub_h = h > max_h ? max_h : h;
-	unsigned cx;
-	unsigned cy;
+	unsigned sub_w = w > max_w ? max_w : w;
+	unsigned sub_h = h > max_h ? max_h : h;
+	unsigned x;
+	unsigned y;
 
-#if 0
-	/* That's the way she likes it */
-	assert(src_pitch == ((struct nv04_surface *)dst)->pitch);
-#endif
+        /* Swizzled surfaces must be POT  */
+	assert(util_is_pot(dst->width) && util_is_pot(dst->height));
+
+        /* If area is too large to copy in one shot we must copy it in POT chunks to meet alignment requirements */
+	assert(sub_w == w || util_is_pot(sub_w));
+	assert(sub_h == h || util_is_pot(sub_h));
 
 	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
 	OUT_RELOCo(chan, dst_bo,
@@ -125,41 +139,46 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 
 	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
 	OUT_RING  (chan, nv04_surface_format(dst->format) |
-	                 log2i(w) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
-	                 log2i(h) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
- 
+	                 log2i(dst->width) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
+	                 log2i(dst->height) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
+
 	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
 	OUT_RELOCo(chan, src_bo,
 	                 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
 	OUT_RING  (chan, swzsurf->handle);
 
-	for (cy = 0; cy < h; cy += sub_h) {
-	  for (cx = 0; cx < w; cx += sub_w) {
+	for (y = 0; y < h; y += sub_h) {
+	  sub_h = MIN2(max_h, h - y);
+	  /* Clamp to max_w/max_h, not to the previous chunk: sub_w must return to full size on each new row. */
+	  for (x = 0; x < w; x += sub_w) {
+	    sub_w = MIN2(max_w, w - x);
+
+	    /* Must be 64-byte aligned */
+	    assert(!((dst->offset + nv04_swizzle_bits(dx+x, dy+y) * dst->texture->block.size) & 63));
+
 	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	    OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(cx+dx, cy+dy) *
-			     dst->texture->block.size, NOUVEAU_BO_GART |
-			     NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	    OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(dx+x, dy+y) * dst->texture->block.size,
+                             NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	    BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
 	    OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
 	    OUT_RING  (chan, nv04_scaled_image_format(src->format));
 	    OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
 	    OUT_RING  (chan, 0);
-	    OUT_RING  (chan, sub_h << 16 | sub_w);
+	    OUT_RING  (chan, sub_h << NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w);
 	    OUT_RING  (chan, 0);
-	    OUT_RING  (chan, sub_h << 16 | sub_w);
+	    OUT_RING  (chan, sub_h << NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w);
 	    OUT_RING  (chan, 1 << 20);
 	    OUT_RING  (chan, 1 << 20);
 
 	    BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
-	    OUT_RING  (chan, sub_h << 16 | sub_w);
+	    OUT_RING  (chan, sub_h << NV04_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | sub_w);
 	    OUT_RING  (chan, src_pitch |
 			     NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
 			     NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
-	    OUT_RELOCl(chan, src_bo, src->offset + (cy+sy) * src_pitch +
-			     (cx+sx) * src->texture->block.size, NOUVEAU_BO_GART |
-			     NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	    OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * src->texture->block.size,
+                             NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 	    OUT_RING  (chan, 0);
 	  }
 	}
@@ -214,43 +233,6 @@ nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
 }
 
 static int
-nv04_surface_copy_m2mf_swizzle(struct nv04_surface_2d *ctx,
-			       struct pipe_surface *dst, int dx, int dy,
-			       struct pipe_surface *src, int sx, int sy)
-{
-	struct nouveau_channel *chan = ctx->m2mf->channel;
-	struct nouveau_grobj *m2mf = ctx->m2mf;
-	struct nouveau_bo *src_bo = nouveau_bo(ctx->buf(src));
-	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
-	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
-	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
-	unsigned dst_offset = dst->offset + nv04_swizzle_bits(dx, dy) *
-	                      dst->texture->block.size;
-	unsigned src_offset = src->offset + sy * src_pitch +
-	                      sx * src->texture->block.size;
-
-	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
-	OUT_RELOCo(chan, src_bo,
-		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
-	OUT_RELOCo(chan, dst_bo,
-		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
-
-	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
-	OUT_RELOCl(chan, src_bo, src_offset,
-		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-	OUT_RELOCl(chan, dst_bo, dst_offset,
-		   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
-	OUT_RING  (chan, src_pitch);
-	OUT_RING  (chan, dst_pitch);
-	OUT_RING  (chan, 1 * src->texture->block.size);
-	OUT_RING  (chan, 1);
-	OUT_RING  (chan, 0x0101);
-	OUT_RING  (chan, 0);
-
-	return 0;
-}
-
-static int
 nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
 		       int dx, int dy, struct pipe_surface *src, int sx, int sy,
 		       int w, int h)
@@ -299,61 +281,10 @@ nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
 	assert(src->format == dst->format);
 
 	/* Setup transfer to swizzle the texture to vram if needed */
-	if (src_linear && !dst_linear) {
-		int x,y;
-
-		if ((w>1) && (h>1)) {
-			int potWidth = 1<<log2i(w);
-			int potHeight = 1<<log2i(h);
-			int remainWidth = w-potWidth;
-			int remainHeight = h-potHeight;
-			int squareDim = (potWidth>potHeight ? potHeight : potWidth);
-
-			/* top left is always POT, but we can only swizzle squares */
-			for (y=0; y<potHeight; y+=squareDim) {
-				for (x=0; x<potWidth; x+= squareDim) {
-					nv04_surface_copy_swizzle(ctx, dst, dx+x, dy+y,
-					                          src, sx+x, sy+y,
-					                          squareDim, squareDim);
-				}
-			}
-
-			/* top right */
-			if (remainWidth>0) {
-			nv04_surface_copy(ctx, dst, dx+potWidth, dy,
-				                  src, sx+potWidth, sy,
-				                  remainWidth, potHeight);
-			}
-
-			/* bottom left */
-			if (remainHeight>0) {
-				nv04_surface_copy(ctx, dst, dx, dy+potHeight,
-			                  src, sx, sy+potHeight,
-				                  potWidth, remainHeight);
-			}
-
-			/* bottom right */
-			if ((remainWidth>0) && (remainHeight>0)) {
-				nv04_surface_copy(ctx, dst, dx+potWidth, dy+potHeight,
-				                  src, sx+potWidth, sy+potHeight,
-				                  remainWidth, remainHeight);
-			}
-		} else if (w==1) {
-			/* We have a column to copy to a swizzled texture */
-			for (y=0; y<h; y++) {
-				nv04_surface_copy_m2mf_swizzle(ctx, dst, dx, dy+y,
-				                               src, sx, sy+y);
-			}
-		} else if (h==1) {
-			/* We have a row to copy to a swizzled texture */
-			for (x=0; x<w; x++) {
-				nv04_surface_copy_m2mf_swizzle(ctx, dst, dx+x, dy,
-				                               src, sx+x, sy);
-			}
-		}
-
-		return;
-	}
+        if (src_linear && !dst_linear && w > 1 && h > 1) {
+           nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h);
+           return;
+        }
 
 	/* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback
 	 * to NV_MEMORY_TO_MEMORY_FORMAT in this case.
diff --git a/src/gallium/drivers/nv04/nv04_transfer.c b/src/gallium/drivers/nv04/nv04_transfer.c
index 854b855d64a..6618660743d 100644
--- a/src/gallium/drivers/nv04/nv04_transfer.c
+++ b/src/gallium/drivers/nv04/nv04_transfer.c
@@ -13,22 +13,6 @@ struct nv04_transfer {
 	bool direct;
 };
 
-static unsigned nv04_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv04_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               0, 0, 0,
-	                                               nv04_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       face, level, zslice,
-	                                       nv04_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv04_screen *nvscreen = nv04_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv04_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv04_transfer *tx = (struct nv04_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv04_screen *nvscreen = nv04_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv04_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv04_miptree *mt = (struct nv04_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv04_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv10/nv10_context.c b/src/gallium/drivers/nv10/nv10_context.c
index a127b134ecd..65a22b175e1 100644
--- a/src/gallium/drivers/nv10/nv10_context.c
+++ b/src/gallium/drivers/nv10/nv10_context.c
@@ -243,7 +243,7 @@ static void nv10_init_hwctx(struct nv10_context *nv10)
 	OUT_RING  (0.0);
 	OUT_RINGf  (16777216.0);
 
-	BEGIN_RING(celsius, NV10TCL_VIEWPORT_SCALE_X, 4);
+	BEGIN_RING(celsius, NV10TCL_VIEWPORT_TRANSLATE_X, 4);
 	OUT_RINGf  (-2048.0);
 	OUT_RINGf  (-2048.0);
 	OUT_RINGf  (16777215.0 * 0.5);
@@ -257,29 +257,6 @@ nv10_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
 {
 }
 
-static unsigned int
-nv10_is_texture_referenced( struct pipe_context *pipe,
-			    struct pipe_texture *texture,
-			    unsigned face, unsigned level)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-static unsigned int
-nv10_is_buffer_referenced( struct pipe_context *pipe,
-			   struct pipe_buffer *buf)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
 struct pipe_context *
 nv10_create(struct pipe_screen *pscreen, unsigned pctx_id)
 {
@@ -305,8 +282,8 @@ nv10_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv10->pipe.clear = nv10_clear;
 	nv10->pipe.flush = nv10_flush;
 
-	nv10->pipe.is_texture_referenced = nv10_is_texture_referenced;
-	nv10->pipe.is_buffer_referenced = nv10_is_buffer_referenced;
+	nv10->pipe.is_texture_referenced = nouveau_is_texture_referenced;
+	nv10->pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
 
 	nv10_init_surface_functions(nv10);
 	nv10_init_state_functions(nv10);
diff --git a/src/gallium/drivers/nv10/nv10_context.h b/src/gallium/drivers/nv10/nv10_context.h
index f1e003c9537..36a6aa7a74e 100644
--- a/src/gallium/drivers/nv10/nv10_context.h
+++ b/src/gallium/drivers/nv10/nv10_context.h
@@ -13,6 +13,7 @@
 
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
+#include "nouveau/nouveau_context.h"
 
 #define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
 	struct nv10_screen *ctx = nv10->screen
diff --git a/src/gallium/drivers/nv10/nv10_prim_vbuf.c b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
index 1806d5f8ccc..7ba9777a222 100644
--- a/src/gallium/drivers/nv10/nv10_prim_vbuf.c
+++ b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
@@ -69,9 +69,9 @@ void nv10_vtxbuf_bind( struct nv10_context* nv10 )
 {
 	int i;
 	for(i = 0; i < 8; i++) {
-		BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET(i), 1);
+		BEGIN_RING(celsius, NV10TCL_VTXBUF_ADDRESS(i), 1);
 		OUT_RING(0/*nv10->vtxbuf*/);
-		BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT(i) ,1);
+		BEGIN_RING(celsius, NV10TCL_VTXFMT(i), 1);
 		OUT_RING(0/*XXX*/);
 	}
 }
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
index 4469b22d91a..ee5901e743e 100644
--- a/src/gallium/drivers/nv10/nv10_screen.c
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -15,8 +15,6 @@ nv10_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv10/nv10_state_emit.c b/src/gallium/drivers/nv10/nv10_state_emit.c
index d8691ef9c67..2577ab73b56 100644
--- a/src/gallium/drivers/nv10/nv10_state_emit.c
+++ b/src/gallium/drivers/nv10/nv10_state_emit.c
@@ -129,6 +129,9 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
 	rt_format = NV10TCL_RT_FORMAT_TYPE_LINEAR;
 
 	switch (colour_format) {
+	case PIPE_FORMAT_X8R8G8B8_UNORM:
+		rt_format |= NV10TCL_RT_FORMAT_COLOR_X8R8G8B8;
+		break;
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
 	case 0:
 		rt_format |= NV10TCL_RT_FORMAT_COLOR_A8R8G8B8;
diff --git a/src/gallium/drivers/nv10/nv10_transfer.c b/src/gallium/drivers/nv10/nv10_transfer.c
index c06b8d34c72..8feb85e4bda 100644
--- a/src/gallium/drivers/nv10/nv10_transfer.c
+++ b/src/gallium/drivers/nv10/nv10_transfer.c
@@ -13,22 +13,6 @@ struct nv10_transfer {
 	bool direct;
 };
 
-static unsigned nv10_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv10_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               0, 0, 0,
-	                                               nv10_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       face, level, zslice,
-	                                       nv10_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv10_screen *nvscreen = nv10_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv10_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv10_transfer *tx = (struct nv10_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv10_screen *nvscreen = nv10_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv10_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv10_miptree *mt = (struct nv10_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv10_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv20/nv20_context.c b/src/gallium/drivers/nv20/nv20_context.c
index b32d0d83ba0..276db8b57b6 100644
--- a/src/gallium/drivers/nv20/nv20_context.c
+++ b/src/gallium/drivers/nv20/nv20_context.c
@@ -360,13 +360,13 @@ static void nv20_init_hwctx(struct nv20_context *nv20)
 	OUT_RINGf (0.0);
 	OUT_RINGf (16777216.0); /* [0, 1] scaled approx to [0, 2^24] */
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE0_X, 4);
+	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
 	OUT_RINGf (0.0); /* x-offset, w/2 + 1.031250 */
 	OUT_RINGf (0.0); /* y-offset, h/2 + 0.030762 */
 	OUT_RINGf (0.0);
 	OUT_RINGf (16777215.0);
 
-	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE1_X, 4);
+	BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE_X, 4);
 	OUT_RINGf (0.0); /* no effect?, w/2 */
 	OUT_RINGf (0.0); /* no effect?, h/2 */
 	OUT_RINGf (16777215.0 * 0.5);
@@ -380,30 +380,6 @@ nv20_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
 {
 }
 
-
-static unsigned int
-nv20_is_texture_referenced( struct pipe_context *pipe,
-			    struct pipe_texture *texture,
-			    unsigned face, unsigned level)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-static unsigned int
-nv20_is_buffer_referenced( struct pipe_context *pipe,
-			   struct pipe_buffer *buf)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
 struct pipe_context *
 nv20_create(struct pipe_screen *pscreen, unsigned pctx_id)
 {
@@ -429,8 +405,8 @@ nv20_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv20->pipe.clear = nv20_clear;
 	nv20->pipe.flush = nv20_flush;
 
-	nv20->pipe.is_texture_referenced = nv20_is_texture_referenced;
-	nv20->pipe.is_buffer_referenced = nv20_is_buffer_referenced;
+	nv20->pipe.is_texture_referenced = nouveau_is_texture_referenced;
+	nv20->pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
 
 	nv20_init_surface_functions(nv20);
 	nv20_init_state_functions(nv20);
diff --git a/src/gallium/drivers/nv20/nv20_context.h b/src/gallium/drivers/nv20/nv20_context.h
index fc932f1f90e..a4eaa956608 100644
--- a/src/gallium/drivers/nv20/nv20_context.h
+++ b/src/gallium/drivers/nv20/nv20_context.h
@@ -13,6 +13,7 @@
 
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
+#include "nouveau/nouveau_context.h"
 
 #define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
 	struct nv20_screen *ctx = nv20->screen
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
index e6924ad71eb..4eeacd1afd5 100644
--- a/src/gallium/drivers/nv20/nv20_screen.c
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -15,8 +15,6 @@ nv20_screen_get_param(struct pipe_screen *screen, int param)
 		return 0;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c b/src/gallium/drivers/nv20/nv20_state_emit.c
index 4042f46d053..0122b1c2cdb 100644
--- a/src/gallium/drivers/nv20/nv20_state_emit.c
+++ b/src/gallium/drivers/nv20/nv20_state_emit.c
@@ -135,6 +135,9 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
 	rt_format = NV20TCL_RT_FORMAT_TYPE_LINEAR | 0x20;
 
 	switch (colour_format) {
+	case PIPE_FORMAT_X8R8G8B8_UNORM:
+		rt_format |= NV20TCL_RT_FORMAT_COLOR_X8R8G8B8;
+		break;
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
 	case 0:
 		rt_format |= NV20TCL_RT_FORMAT_COLOR_A8R8G8B8;
diff --git a/src/gallium/drivers/nv20/nv20_transfer.c b/src/gallium/drivers/nv20/nv20_transfer.c
index 5018995596c..81b4f1a9177 100644
--- a/src/gallium/drivers/nv20/nv20_transfer.c
+++ b/src/gallium/drivers/nv20/nv20_transfer.c
@@ -13,22 +13,6 @@ struct nv20_transfer {
 	bool direct;
 };
 
-static unsigned nv20_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv20_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               0, 0, 0,
-	                                               nv20_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       face, level, zslice,
-	                                       nv20_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv20_screen *nvscreen = nv20_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv20_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv20_transfer *tx = (struct nv20_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv20_screen *nvscreen = nv20_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv20_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv20_miptree *mt = (struct nv20_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv20_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv30/nv30_context.c b/src/gallium/drivers/nv30/nv30_context.c
index f827bdc78b1..d8300fd69f6 100644
--- a/src/gallium/drivers/nv30/nv30_context.c
+++ b/src/gallium/drivers/nv30/nv30_context.c
@@ -10,7 +10,7 @@ nv30_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
-	
+
 	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
 		BEGIN_RING(rankine, 0x1fd8, 1);
 		OUT_RING  (2);
@@ -31,29 +31,6 @@ nv30_destroy(struct pipe_context *pipe)
 	FREE(nv30);
 }
 
-static unsigned int
-nv30_is_texture_referenced( struct pipe_context *pipe,
-			    struct pipe_texture *texture,
-			    unsigned face, unsigned level)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-static unsigned int
-nv30_is_buffer_referenced( struct pipe_context *pipe,
-			   struct pipe_buffer *buf)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
 struct pipe_context *
 nv30_create(struct pipe_screen *pscreen, unsigned pctx_id)
 {
@@ -78,8 +55,8 @@ nv30_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv30->pipe.clear = nv30_clear;
 	nv30->pipe.flush = nv30_flush;
 
-	nv30->pipe.is_texture_referenced = nv30_is_texture_referenced;
-	nv30->pipe.is_buffer_referenced = nv30_is_buffer_referenced;
+	nv30->pipe.is_texture_referenced = nouveau_is_texture_referenced;
+	nv30->pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
 
 	nv30_init_query_functions(nv30);
 	nv30_init_surface_functions(nv30);
@@ -95,4 +72,3 @@ nv30_create(struct pipe_screen *pscreen, unsigned pctx_id)
 
 	return &nv30->pipe;
 }
-	
diff --git a/src/gallium/drivers/nv30/nv30_context.h b/src/gallium/drivers/nv30/nv30_context.h
index 4229c0a0e14..8d49366dfcb 100644
--- a/src/gallium/drivers/nv30/nv30_context.h
+++ b/src/gallium/drivers/nv30/nv30_context.h
@@ -13,6 +13,7 @@
 
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
+#include "nouveau/nouveau_context.h"
 
 #define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
 	struct nv30_screen *ctx = nv30->screen
diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c
index a48ba9782b3..0ce702d6f84 100644
--- a/src/gallium/drivers/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nv30/nv30_fragprog.c
@@ -4,6 +4,7 @@
 #include "pipe/p_inlines.h"
 
 #include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 
@@ -131,7 +132,7 @@ emit_src(struct nv30_fpc *fpc, int pos, struct nv30_sreg src)
 				sizeof(uint32_t) * 4);
 		}
 
-		sr |= (NV30_FP_REG_TYPE_CONST << NV30_FP_REG_TYPE_SHIFT);	
+		sr |= (NV30_FP_REG_TYPE_CONST << NV30_FP_REG_TYPE_SHIFT);
 		break;
 	case NV30SR_NONE:
 		sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT);
@@ -318,38 +319,23 @@ src_native_swz(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc,
 {
 	const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0);
 	struct nv30_sreg tgsi = tgsi_src(fpc, fsrc);
-	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
-	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
-			fsrc->SrcRegisterExtSwz.NegateY,
-			fsrc->SrcRegisterExtSwz.NegateZ,
-			fsrc->SrcRegisterExtSwz.NegateW };
+	uint mask = 0;
 	uint c;
 
 	for (c = 0; c < 4; c++) {
-		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
-		case TGSI_EXTSWIZZLE_X:
-		case TGSI_EXTSWIZZLE_Y:
-		case TGSI_EXTSWIZZLE_Z:
-		case TGSI_EXTSWIZZLE_W:
+		switch (tgsi_util_get_full_src_register_swizzle(fsrc, c)) {
+		case TGSI_SWIZZLE_X:
+		case TGSI_SWIZZLE_Y:
+		case TGSI_SWIZZLE_Z:
+		case TGSI_SWIZZLE_W:
 			mask |= (1 << c);
 			break;
-		case TGSI_EXTSWIZZLE_ZERO:
-			zero_mask |= (1 << c);
-			tgsi.swz[c] = SWZ_X;
-			break;
-		case TGSI_EXTSWIZZLE_ONE:
-			one_mask |= (1 << c);
-			tgsi.swz[c] = SWZ_X;
-			break;
 		default:
 			assert(0);
 		}
-
-		if (!tgsi.negate && neg[c])
-			neg_mask |= (1 << c);
 	}
 
-	if (mask == MASK_ALL && !neg_mask)
+	if (mask == MASK_ALL)
 		return TRUE;
 
 	*src = temp(fpc);
@@ -357,18 +343,6 @@ src_native_swz(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc,
 	if (mask)
 		arith(fpc, 0, MOV, *src, mask, tgsi, none, none);
 
-	if (zero_mask)
-		arith(fpc, 0, SFL, *src, zero_mask, *src, none, none);
-
-	if (one_mask)
-		arith(fpc, 0, STR, *src, one_mask, *src, none, none);
-
-	if (neg_mask) {
-		struct nv30_sreg one = temp(fpc);
-		arith(fpc, 0, STR, one, neg_mask, one, none, none);
-		arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none);
-	}
-
 	return FALSE;
 }
 
@@ -527,12 +501,6 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
 	case TGSI_OPCODE_MUL:
 		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
 		break;
-	case TGSI_OPCODE_NOISE1:
-	case TGSI_OPCODE_NOISE2:
-	case TGSI_OPCODE_NOISE3:
-	case TGSI_OPCODE_NOISE4:
-		arith(fpc, sat, SFL, dst, mask, none, none, none);
-		break;
 	case TGSI_OPCODE_POW:
 		arith(fpc, sat, POW, dst, mask, src[0], src[1], none);
 		break;
@@ -699,7 +667,7 @@ nv30_fragprog_prepare(struct nv30_fpc *fpc)
 		{
 			struct tgsi_full_immediate *imm;
 			float vals[4];
-			
+
 			imm = &p.FullToken.FullImmediate;
 			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
 			assert(fpc->nr_imm < MAX_IMM);
@@ -787,7 +755,7 @@ nv30_fragprog_translate(struct nv30_context *nv30,
 	fp->insn[fpc->inst_offset + 1] = 0x00000000;
 	fp->insn[fpc->inst_offset + 2] = 0x00000000;
 	fp->insn[fpc->inst_offset + 3] = 0x00000000;
-	
+
 	fp->translated = TRUE;
 	fp->on_hw = FALSE;
 out_err:
@@ -871,7 +839,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
 update_constants:
 	if (fp->nr_consts) {
 		float *map;
-		
+
 		map = pipe_buffer_map(pscreen, constbuf,
 				      PIPE_BUFFER_USAGE_CPU_READ);
 		for (i = 0; i < fp->nr_consts; i++) {
diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c
index 822e1d8defe..dca760cae62 100644
--- a/src/gallium/drivers/nv30/nv30_fragtex.c
+++ b/src/gallium/drivers/nv30/nv30_fragtex.c
@@ -21,6 +21,7 @@ struct nv30_texture_format {
 
 static struct nv30_texture_format
 nv30_texture_formats[] = {
+	_(X8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W),
 	_(A8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W),
 	_(A1R5G5B5_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W),
 	_(A4R4G4B4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W),
@@ -29,8 +30,8 @@ nv30_texture_formats[] = {
 	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X),
 	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X),
 	_(A8L8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y),
-//	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X),
-//	_(Z24S8_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(Z16_UNORM     , R5G6B5  ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(Z24S8_UNORM   , A8R8G8B8,   S1,   S1,   S1,  ONE, X, X, X, X),
 	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
 	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
 	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
@@ -69,13 +70,13 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
 
 	tf = nv30_fragtex_format(pt->format);
 	if (!tf)
-		assert(0);
+		return NULL;
 
 	txf  = tf->format;
 	txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0);
-	txf |= log2i(pt->width[0]) << 20;
-	txf |= log2i(pt->height[0]) << 24;
-	txf |= log2i(pt->depth[0]) << 28;
+	txf |= log2i(pt->width[0]) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT;
+	txf |= log2i(pt->height[0]) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
+	txf |= log2i(pt->depth[0]) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
 	txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000;
 
 	switch (pt->target) {
diff --git a/src/gallium/drivers/nv30/nv30_miptree.c b/src/gallium/drivers/nv30/nv30_miptree.c
index 7f8054de733..280696d4503 100644
--- a/src/gallium/drivers/nv30/nv30_miptree.c
+++ b/src/gallium/drivers/nv30/nv30_miptree.c
@@ -96,6 +96,11 @@ nv30_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
 		case PIPE_FORMAT_A8R8G8B8_UNORM:
 		case PIPE_FORMAT_X8R8G8B8_UNORM:
 		case PIPE_FORMAT_R16_SNORM:
+		case PIPE_FORMAT_R5G6B5_UNORM:
+		case PIPE_FORMAT_A8L8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
 		{
 			if (debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE))
 				mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
@@ -142,6 +147,9 @@ nv30_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 	mt->level[0].pitch = stride[0];
 	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
 
+	/* Assume whoever created this buffer expects it to be linear for now */
+	mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
 	pipe_buffer_reference(&mt->buffer, pb);
 	return &mt->base;
 }
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
index f8285e4455f..7cd36902eb4 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -10,6 +10,22 @@
 #define NV34TCL_CHIPSET_3X_MASK 0x00000010
 #define NV35TCL_CHIPSET_3X_MASK 0x000001e0
 
+/* FIXME: It seems I should not include directly ../../winsys/drm/nouveau/drm/nouveau_drm_api.h
+ * to get the pointer to the context front buffer, so I copied nouveau_winsys here.
+ * nv30_screen_surface_format_supported() can then use it to enforce creating fbo
+ * with same number of bits everywhere.
+ */
+struct nouveau_winsys {
+	struct pipe_winsys base;
+
+	struct pipe_screen *pscreen;
+
+	unsigned nr_pctx;
+	struct pipe_context **pctx;
+
+	struct pipe_surface *front;
+};
+
 static int
 nv30_screen_get_param(struct pipe_screen *pscreen, int param)
 {
@@ -22,8 +38,6 @@ nv30_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
@@ -85,6 +99,8 @@ nv30_screen_surface_format_supported(struct pipe_screen *pscreen,
 				     enum pipe_texture_target target,
 				     unsigned tex_usage, unsigned geom_flags)
 {
+	struct pipe_surface *front = ((struct nouveau_winsys *) pscreen->winsys)->front;
+
 	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
 		switch (format) {
 		case PIPE_FORMAT_A8R8G8B8_UNORM:
@@ -98,7 +114,11 @@ nv30_screen_surface_format_supported(struct pipe_screen *pscreen,
 		switch (format) {
 		case PIPE_FORMAT_Z24S8_UNORM:
 		case PIPE_FORMAT_Z24X8_UNORM:
+			return TRUE;
 		case PIPE_FORMAT_Z16_UNORM:
+			if (front) {
+				return (front->format == PIPE_FORMAT_R5G6B5_UNORM);
+			}
 			return TRUE;
 		default:
 			break;
diff --git a/src/gallium/drivers/nv30/nv30_state_fb.c b/src/gallium/drivers/nv30/nv30_state_fb.c
index 44b6a74715a..6f6d1740d6e 100644
--- a/src/gallium/drivers/nv30/nv30_state_fb.c
+++ b/src/gallium/drivers/nv30/nv30_state_fb.c
@@ -8,15 +8,15 @@ nv30_state_framebuffer_validate(struct nv30_context *nv30)
 	struct nouveau_channel *chan = nv30->screen->base.channel;
 	struct nouveau_grobj *rankine = nv30->screen->rankine;
 	struct nv04_surface *rt[2], *zeta = NULL;
-	uint32_t rt_enable, rt_format;
-	int i, colour_format = 0, zeta_format = 0;
+	uint32_t rt_enable = 0, rt_format = 0;
+	int i, colour_format = 0, zeta_format = 0, depth_only = 0;
 	struct nouveau_stateobj *so = so_new(64, 10);
 	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
 	unsigned w = fb->width;
 	unsigned h = fb->height;
 	struct nv30_miptree *nv30mt;
+	int colour_bits = 32, zeta_bits = 32;
 
-	rt_enable = 0;
 	for (i = 0; i < fb->nr_cbufs; i++) {
 		if (colour_format) {
 			assert(colour_format == fb->cbufs[i]->format);
@@ -35,26 +35,47 @@ nv30_state_framebuffer_validate(struct nv30_context *nv30)
 		zeta = (struct nv04_surface *)fb->zsbuf;
 	}
 
-	if (!(rt[0]->base.texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
-		assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
-		for (i = 1; i < fb->nr_cbufs; i++)
-			assert(!(rt[i]->base.texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR));
+	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR0|NV34TCL_RT_ENABLE_COLOR1)) {
+		/* Render to at least a colour buffer */
+		if (!(rt[0]->base.texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
+			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+			for (i = 1; i < fb->nr_cbufs; i++)
+				assert(!(rt[i]->base.texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR));
 
-		/* FIXME: NV34TCL_RT_FORMAT_LOG2_[WIDTH/HEIGHT] */
-		rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
-		log2i(fb->width) << 16 /*NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT*/ |
-		log2i(fb->height) << 24 /*NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT*/;
+			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+				(log2i(rt[0]->base.width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+				(log2i(rt[0]->base.height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+		}
+		else
+			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+	} else if (fb->zsbuf) {
+		depth_only = 1;
+
+		/* Render to depth buffer only */
+		if (!(zeta->base.texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
+			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+
+			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+				(log2i(zeta->base.width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+				(log2i(zeta->base.height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+		}
+		else
+			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+	} else {
+		return FALSE;
 	}
-	else
-		rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
 
 	switch (colour_format) {
+	case PIPE_FORMAT_X8R8G8B8_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8;
+		break;
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
 	case 0:
 		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
 		break;
 	case PIPE_FORMAT_R5G6B5_UNORM:
 		rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
+		colour_bits = 16;
 		break;
 	default:
 		assert(0);
@@ -63,6 +84,7 @@ nv30_state_framebuffer_validate(struct nv30_context *nv30)
 	switch (zeta_format) {
 	case PIPE_FORMAT_Z16_UNORM:
 		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
+		zeta_bits = 16;
 		break;
 	case PIPE_FORMAT_Z24S8_UNORM:
 	case PIPE_FORMAT_Z24X8_UNORM:
@@ -73,21 +95,27 @@ nv30_state_framebuffer_validate(struct nv30_context *nv30)
 		assert(0);
 	}
 
-	if (rt_enable & NV34TCL_RT_ENABLE_COLOR0) {
-		uint32_t pitch = rt[0]->pitch;
+	if (colour_bits > zeta_bits) {
+		return FALSE;
+	}
+
+	if (depth_only || (rt_enable & NV34TCL_RT_ENABLE_COLOR0)) {
+		struct nv04_surface *rt0 = (depth_only ? zeta : rt[0]);
+		uint32_t pitch = rt0->pitch;
+
 		if (zeta) {
 			pitch |= (zeta->pitch << 16);
 		} else {
 			pitch |= (pitch << 16);
 		}
 
-		nv30mt = (struct nv30_miptree *)rt[0]->base.texture;
+		nv30mt = (struct nv30_miptree *) rt0->base.texture;
 		so_method(so, rankine, NV34TCL_DMA_COLOR0, 1);
 		so_reloc (so, nouveau_bo(nv30mt->buffer), 0, rt_flags | NOUVEAU_BO_OR,
 			      chan->vram->handle, chan->gart->handle);
 		so_method(so, rankine, NV34TCL_COLOR0_PITCH, 2);
 		so_data  (so, pitch);
-		so_reloc (so, nouveau_bo(nv30mt->buffer), rt[0]->base.offset,
+		so_reloc (so, nouveau_bo(nv30mt->buffer), rt0->base.offset,
 			      rt_flags | NOUVEAU_BO_LOW, 0, 0);
 	}
 
diff --git a/src/gallium/drivers/nv30/nv30_transfer.c b/src/gallium/drivers/nv30/nv30_transfer.c
index 23675718781..98011decf7c 100644
--- a/src/gallium/drivers/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nv30/nv30_transfer.c
@@ -13,22 +13,6 @@ struct nv30_transfer {
 	bool direct;
 };
 
-static unsigned nv30_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv30_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               face, level, zslice,
-	                                               nv30_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       0, 0, 0,
-	                                       nv30_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv30_screen *nvscreen = nv30_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv30_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv30_transfer *tx = (struct nv30_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv30_screen *nvscreen = nv30_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv30_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv30_miptree *mt = (struct nv30_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv30_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c
index 8eba6a43ef9..7f008274a4e 100644
--- a/src/gallium/drivers/nv40/nv40_context.c
+++ b/src/gallium/drivers/nv40/nv40_context.c
@@ -10,7 +10,7 @@ nv40_flush(struct pipe_context *pipe, unsigned flags,
 	   struct pipe_fence_handle **fence)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
-	
+
 	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
 		BEGIN_RING(curie, 0x1fd8, 1);
 		OUT_RING  (2);
@@ -31,29 +31,6 @@ nv40_destroy(struct pipe_context *pipe)
 	FREE(nv40);
 }
 
-static unsigned int
-nv40_is_texture_referenced( struct pipe_context *pipe,
-			    struct pipe_texture *texture,
-			    unsigned face, unsigned level)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-static unsigned int
-nv40_is_buffer_referenced( struct pipe_context *pipe,
-			   struct pipe_buffer *buf)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
 struct pipe_context *
 nv40_create(struct pipe_screen *pscreen, unsigned pctx_id)
 {
@@ -78,8 +55,8 @@ nv40_create(struct pipe_screen *pscreen, unsigned pctx_id)
 	nv40->pipe.clear = nv40_clear;
 	nv40->pipe.flush = nv40_flush;
 
-	nv40->pipe.is_texture_referenced = nv40_is_texture_referenced;
-	nv40->pipe.is_buffer_referenced = nv40_is_buffer_referenced;
+	nv40->pipe.is_texture_referenced = nouveau_is_texture_referenced;
+	nv40->pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
 
 	nv40_init_query_functions(nv40);
 	nv40_init_surface_functions(nv40);
@@ -95,4 +72,3 @@ nv40_create(struct pipe_screen *pscreen, unsigned pctx_id)
 
 	return &nv40->pipe;
 }
-	
diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h
index 97bc83292d4..a3d594167aa 100644
--- a/src/gallium/drivers/nv40/nv40_context.h
+++ b/src/gallium/drivers/nv40/nv40_context.h
@@ -13,6 +13,7 @@
 
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
+#include "nouveau/nouveau_context.h"
 
 #define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
 	struct nv40_screen *ctx = nv40->screen
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c
index 32d9ed1a7f8..99277506fc2 100644
--- a/src/gallium/drivers/nv40/nv40_fragprog.c
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -321,38 +321,23 @@ src_native_swz(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc,
 {
 	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
 	struct nv40_sreg tgsi = tgsi_src(fpc, fsrc);
-	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
-	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
-			fsrc->SrcRegisterExtSwz.NegateY,
-			fsrc->SrcRegisterExtSwz.NegateZ,
-			fsrc->SrcRegisterExtSwz.NegateW };
+	uint mask = 0;
 	uint c;
 
 	for (c = 0; c < 4; c++) {
-		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
-		case TGSI_EXTSWIZZLE_X:
-		case TGSI_EXTSWIZZLE_Y:
-		case TGSI_EXTSWIZZLE_Z:
-		case TGSI_EXTSWIZZLE_W:
+		switch (tgsi_util_get_full_src_register_swizzle(fsrc, c)) {
+		case TGSI_SWIZZLE_X:
+		case TGSI_SWIZZLE_Y:
+		case TGSI_SWIZZLE_Z:
+		case TGSI_SWIZZLE_W:
 			mask |= (1 << c);
 			break;
-		case TGSI_EXTSWIZZLE_ZERO:
-			zero_mask |= (1 << c);
-			tgsi.swz[c] = SWZ_X;
-			break;
-		case TGSI_EXTSWIZZLE_ONE:
-			one_mask |= (1 << c);
-			tgsi.swz[c] = SWZ_X;
-			break;
 		default:
 			assert(0);
 		}
-
-		if (!tgsi.negate && neg[c])
-			neg_mask |= (1 << c);
 	}
 
-	if (mask == MASK_ALL && !neg_mask)
+	if (mask == MASK_ALL)
 		return TRUE;
 
 	*src = temp(fpc);
@@ -360,18 +345,6 @@ src_native_swz(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc,
 	if (mask)
 		arith(fpc, 0, MOV, *src, mask, tgsi, none, none);
 
-	if (zero_mask)
-		arith(fpc, 0, SFL, *src, zero_mask, *src, none, none);
-
-	if (one_mask)
-		arith(fpc, 0, STR, *src, one_mask, *src, none, none);
-
-	if (neg_mask) {
-		struct nv40_sreg one = temp(fpc);
-		arith(fpc, 0, STR, one, neg_mask, one, none, none);
-		arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none);
-	}
-
 	return FALSE;
 }
 
@@ -568,12 +541,6 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
 	case TGSI_OPCODE_MUL:
 		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
 		break;
-	case TGSI_OPCODE_NOISE1:
-	case TGSI_OPCODE_NOISE2:
-	case TGSI_OPCODE_NOISE3:
-	case TGSI_OPCODE_NOISE4:
-		arith(fpc, sat, SFL, dst, mask, none, none, none);
-		break;
 	case TGSI_OPCODE_POW:
 		tmp = temp(fpc);
 		arith(fpc, 0, LG2, tmp, MASK_X,
diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c
index f6cdf31dfee..e2ec57564d1 100644
--- a/src/gallium/drivers/nv40/nv40_fragtex.c
+++ b/src/gallium/drivers/nv40/nv40_fragtex.c
@@ -23,6 +23,7 @@ struct nv40_texture_format {
 
 static struct nv40_texture_format
 nv40_texture_formats[] = {
+	_(X8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
 	_(A8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
 	_(A1R5G5B5_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
 	_(A4R4G4B4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
diff --git a/src/gallium/drivers/nv40/nv40_miptree.c b/src/gallium/drivers/nv40/nv40_miptree.c
index 5a201ccf458..465dd3b0693 100644
--- a/src/gallium/drivers/nv40/nv40_miptree.c
+++ b/src/gallium/drivers/nv40/nv40_miptree.c
@@ -141,6 +141,9 @@ nv40_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 	mt->level[0].pitch = stride[0];
 	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
 
+	/* Assume whoever created this buffer expects it to be linear for now */
+	mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
 	pipe_buffer_reference(&mt->buffer, pb);
 	return &mt->base;
 }
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
index 5d2a4216c5a..bd13dfddd1c 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -21,8 +21,6 @@ nv40_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c
index c2f739157ad..1c7a7cd64f0 100644
--- a/src/gallium/drivers/nv40/nv40_state_fb.c
+++ b/src/gallium/drivers/nv40/nv40_state_fb.c
@@ -57,6 +57,9 @@ nv40_state_framebuffer_validate(struct nv40_context *nv40)
 		rt_format = NV40TCL_RT_FORMAT_TYPE_LINEAR;
 
 	switch (colour_format) {
+	case PIPE_FORMAT_X8R8G8B8_UNORM:
+		rt_format |= NV40TCL_RT_FORMAT_COLOR_X8R8G8B8;
+		break;
 	case PIPE_FORMAT_A8R8G8B8_UNORM:
 	case 0:
 		rt_format |= NV40TCL_RT_FORMAT_COLOR_A8R8G8B8;
diff --git a/src/gallium/drivers/nv40/nv40_transfer.c b/src/gallium/drivers/nv40/nv40_transfer.c
index 6d92ac3db9c..92caee6f382 100644
--- a/src/gallium/drivers/nv40/nv40_transfer.c
+++ b/src/gallium/drivers/nv40/nv40_transfer.c
@@ -13,22 +13,6 @@ struct nv40_transfer {
 	bool direct;
 };
 
-static unsigned nv40_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               face, level, zslice,
-	                                               nv40_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       0, 0, 0,
-	                                       nv40_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv40_screen *nvscreen = nv40_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv40_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv40_transfer *tx = (struct nv40_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv40_screen *nvscreen = nv40_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv40_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv40_miptree *mt = (struct nv40_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv40_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c
index 0382dbba8f6..31dae2457fd 100644
--- a/src/gallium/drivers/nv40/nv40_vertprog.c
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -362,38 +362,23 @@ src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc,
 {
 	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
 	struct nv40_sreg tgsi = tgsi_src(vpc, fsrc);
-	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
-	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,
-			fsrc->SrcRegisterExtSwz.NegateY,
-			fsrc->SrcRegisterExtSwz.NegateZ,
-			fsrc->SrcRegisterExtSwz.NegateW };
+	uint mask = 0;
 	uint c;
 
 	for (c = 0; c < 4; c++) {
-		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
-		case TGSI_EXTSWIZZLE_X:
-		case TGSI_EXTSWIZZLE_Y:
-		case TGSI_EXTSWIZZLE_Z:
-		case TGSI_EXTSWIZZLE_W:
+		switch (tgsi_util_get_full_src_register_swizzle(fsrc, c)) {
+		case TGSI_SWIZZLE_X:
+		case TGSI_SWIZZLE_Y:
+		case TGSI_SWIZZLE_Z:
+		case TGSI_SWIZZLE_W:
 			mask |= tgsi_mask(1 << c);
 			break;
-		case TGSI_EXTSWIZZLE_ZERO:
-			zero_mask |= tgsi_mask(1 << c);
-			tgsi.swz[c] = SWZ_X;
-			break;
-		case TGSI_EXTSWIZZLE_ONE:
-			one_mask |= tgsi_mask(1 << c);
-			tgsi.swz[c] = SWZ_X;
-			break;
 		default:
 			assert(0);
 		}
-
-		if (!tgsi.negate && neg[c])
-			neg_mask |= tgsi_mask(1 << c);
 	}
 
-	if (mask == MASK_ALL && !neg_mask)
+	if (mask == MASK_ALL)
 		return TRUE;
 
 	*src = temp(vpc);
@@ -401,18 +386,6 @@ src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc,
 	if (mask)
 		arith(vpc, 0, OP_MOV, *src, mask, tgsi, none, none);
 
-	if (zero_mask)
-		arith(vpc, 0, OP_SFL, *src, zero_mask, *src, none, none);
-
-	if (one_mask)
-		arith(vpc, 0, OP_STR, *src, one_mask, *src, none, none);
-
-	if (neg_mask) {
-		struct nv40_sreg one = temp(vpc);
-		arith(vpc, 0, OP_STR, one, neg_mask, one, none, none);
-		arith(vpc, 0, OP_MUL, *src, neg_mask, *src, neg(one), none);
-	}
-
 	return FALSE;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
index 6e8f4f9750d..219e7a78623 100644
--- a/src/gallium/drivers/nv50/nv50_context.c
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -33,15 +33,9 @@ nv50_flush(struct pipe_context *pipe, unsigned flags,
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
 	struct nouveau_channel *chan = nv50->screen->base.channel;
-	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
 
-	/* We need this in the ddx for reliable composite, not sure what we're
-	 * actually flushing. We generate all our own flushes with flags = 0. */
-	WAIT_RING(chan, 3);
-	BEGIN_RING(chan, eng2d, 0x0110, 1);
-	OUT_RING  (chan, 0);
-
-	FIRE_RING(chan);
+	if (flags & PIPE_FLUSH_FRAME)
+		FIRE_RING(chan);
 }
 
 static void
@@ -59,29 +53,6 @@ nv50_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
 {
 }
 
-static unsigned int
-nv50_is_texture_referenced( struct pipe_context *pipe,
-			    struct pipe_texture *texture,
-			    unsigned face, unsigned level)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-static unsigned int
-nv50_is_buffer_referenced( struct pipe_context *pipe,
-			   struct pipe_buffer *buf)
-{
-   /**
-    * FIXME: Optimize.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
 struct pipe_context *
 nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
 {
@@ -107,8 +78,11 @@ nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
 
 	nv50->pipe.flush = nv50_flush;
 
-	nv50->pipe.is_texture_referenced = nv50_is_texture_referenced;
-	nv50->pipe.is_buffer_referenced = nv50_is_buffer_referenced;
+	nv50->pipe.is_texture_referenced = nouveau_is_texture_referenced;
+	nv50->pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
+
+	screen->base.channel->user_private = nv50;
+	screen->base.channel->flush_notify = nv50_state_flush_notify;
 
 	nv50_init_surface_functions(nv50);
 	nv50_init_state_functions(nv50);
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 1e9e8e49bfb..4b0f0622953 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -14,6 +14,7 @@
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_gldefs.h"
 #include "nouveau/nouveau_stateobj.h"
+#include "nouveau/nouveau_context.h"
 
 #include "nv50_screen.h"
 #include "nv50_program.h"
@@ -68,6 +69,18 @@ struct nv50_sampler_stateobj {
 	unsigned tsc[8];
 };
 
+static INLINE unsigned
+get_tile_height(uint32_t tile_mode)
+{
+        return 1 << ((tile_mode & 0xf) + 2);
+}
+
+static INLINE unsigned
+get_tile_depth(uint32_t tile_mode)
+{
+        return 1 << (tile_mode >> 4);
+}
+
 struct nv50_miptree_level {
 	int *image_offset;
 	unsigned pitch;
@@ -116,9 +129,11 @@ struct nv50_state {
 	unsigned miptree_nr;
 	struct nouveau_stateobj *vertprog;
 	struct nouveau_stateobj *fragprog;
+	struct nouveau_stateobj *programs;
 	struct nouveau_stateobj *vtxfmt;
 	struct nouveau_stateobj *vtxbuf;
 	struct nouveau_stateobj *vtxattr;
+	unsigned vtxelt_nr;
 };
 
 struct nv50_context {
@@ -151,6 +166,8 @@ struct nv50_context {
 	unsigned sampler_nr;
 	struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS];
 	unsigned miptree_nr;
+
+	uint16_t vbo_fifo;
 };
 
 static INLINE struct nv50_context *
@@ -190,12 +207,28 @@ extern void nv50_clear(struct pipe_context *pipe, unsigned buffers,
 /* nv50_program.c */
 extern void nv50_vertprog_validate(struct nv50_context *nv50);
 extern void nv50_fragprog_validate(struct nv50_context *nv50);
-extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p);
+extern void nv50_linkage_validate(struct nv50_context *nv50);
+extern void nv50_program_destroy(struct nv50_context *nv50,
+				 struct nv50_program *p);
 
 /* nv50_state_validate.c */
 extern boolean nv50_state_validate(struct nv50_context *nv50);
+extern void nv50_state_flush_notify(struct nouveau_channel *chan);
+
+extern void nv50_so_init_sifc(struct nv50_context *nv50,
+			      struct nouveau_stateobj *so,
+			      struct nouveau_bo *bo, unsigned reloc,
+			      unsigned size);
 
 /* nv50_tex.c */
 extern void nv50_tex_validate(struct nv50_context *);
 
+/* nv50_transfer.c */
+extern void
+nv50_upload_sifc(struct nv50_context *nv50,
+		 struct nouveau_bo *bo, unsigned dst_offset, unsigned reloc,
+		 unsigned dst_format, int dst_w, int dst_h, int dst_pitch,
+		 void *src, unsigned src_format, int src_pitch,
+		 int x, int y, int w, int h, int cpp);
+
 #endif
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index 03b9243b828..9c20c5cc282 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -26,6 +26,35 @@
 
 #include "nv50_context.h"
 
+/* The restrictions in tile mode selection probably aren't necessary. */
+static INLINE uint32_t
+get_tile_mode(unsigned ny, unsigned d)
+{
+	uint32_t tile_mode = 0x00;
+
+	if (ny > 32) tile_mode = 0x04; /* height 64 tiles */
+	else
+	if (ny > 16) tile_mode = 0x03; /* height 32 tiles */
+	else
+	if (ny >  8) tile_mode = 0x02; /* height 16 tiles */
+	else
+	if (ny >  4) tile_mode = 0x01; /* height 8 tiles */
+
+	if (d == 1)
+		return tile_mode;
+	else
+	if (tile_mode > 0x02)
+		tile_mode = 0x02;
+
+	if (d > 16 && tile_mode < 0x02)
+		return tile_mode | 0x50; /* depth 32 tiles */
+	if (d >  8) return tile_mode | 0x40; /* depth 16 tiles */
+	if (d >  4) return tile_mode | 0x30; /* depth 8 tiles */
+	if (d >  2) return tile_mode | 0x20; /* depth 4 tiles */
+
+	return tile_mode | 0x10;
+}
+
 static struct pipe_texture *
 nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 {
@@ -33,8 +62,8 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 	struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
 	struct pipe_texture *pt = &mt->base.base;
 	unsigned width = tmp->width[0], height = tmp->height[0];
-	unsigned depth = tmp->depth[0];
-	uint32_t tile_mode, tile_flags, tile_h;
+	unsigned depth = tmp->depth[0], image_alignment;
+	uint32_t tile_flags;
 	int ret, i, l;
 
 	*pt = *tmp;
@@ -57,24 +86,8 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 		break;
 	}
 
-	if      (pt->height[0] > 32) tile_mode = 4;
-	else if (pt->height[0] > 16) tile_mode = 3;
-	else if (pt->height[0] >  8) tile_mode = 2;
-	else if (pt->height[0] >  4) tile_mode = 1;
-	else                         tile_mode = 0;
-	tile_h = 1 << (tile_mode + 2);
-
-	switch (pt->target) {
-	case PIPE_TEXTURE_3D:
-		mt->image_nr = pt->depth[0];
-		break;
-	case PIPE_TEXTURE_CUBE:
-		mt->image_nr = 6;
-		break;
-	default:
-		mt->image_nr = 1;
-		break;
-	}
+	/* XXX: texture arrays */
+	mt->image_nr = (pt->target == PIPE_TEXTURE_CUBE) ? 6 : 1;
 
 	for (l = 0; l <= pt->last_level; l++) {
 		struct nv50_miptree_level *lvl = &mt->level[l];
@@ -86,33 +99,36 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
 
 		lvl->image_offset = CALLOC(mt->image_nr, sizeof(int));
-		lvl->pitch = align(pt->width[l] * pt->block.size, 64);
-		lvl->tile_mode = tile_mode;
+		lvl->pitch = align(pt->nblocksx[l] * pt->block.size, 64);
+		lvl->tile_mode = get_tile_mode(pt->nblocksy[l], depth);
 
 		width = MAX2(1, width >> 1);
 		height = MAX2(1, height >> 1);
 		depth = MAX2(1, depth >> 1);
-
-		if (tile_mode && height <= (tile_h >> 1)) {
-			tile_mode--;
-			tile_h >>= 1;
-		}
 	}
 
+	image_alignment  = get_tile_height(mt->level[0].tile_mode) * 64;
+	image_alignment *= get_tile_depth(mt->level[0].tile_mode);
+
+	/* NOTE the distinction between arrays of mip-mapped 2D textures and
+	 * mip-mapped 3D textures. We can't use image_nr == depth for 3D mip.
+	 */
 	for (i = 0; i < mt->image_nr; i++) {
 		for (l = 0; l <= pt->last_level; l++) {
 			struct nv50_miptree_level *lvl = &mt->level[l];
 			int size;
-			tile_h = 1 << (lvl->tile_mode + 2);
+			unsigned tile_h = get_tile_height(lvl->tile_mode);
+			unsigned tile_d = get_tile_depth(lvl->tile_mode);
 
-			size  = align(pt->width[l], 8) * pt->block.size;
-			size  = align(size, 64);
-			size *= align(pt->height[l], tile_h);
+			size  = lvl->pitch;
+			size *= align(pt->nblocksy[l], tile_h);
+			size *= align(pt->depth[l], tile_d);
 
 			lvl->image_offset[i] = mt->total_size;
 
 			mt->total_size += size;
 		}
+		mt->total_size = align(mt->total_size, image_alignment);
 	}
 
 	ret = nouveau_bo_new_tile(dev, NOUVEAU_BO_VRAM, 256, mt->total_size,
@@ -148,6 +164,7 @@ nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
 	mt->image_nr = 1;
 	mt->level[0].pitch = *stride;
 	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
+	mt->level[0].tile_mode = bo->tile_mode;
 
 	nouveau_bo_ref(bo, &mt->base.bo);
 	return &mt->base.base;
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 4a838529de7..bf50982dd16 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -31,9 +31,12 @@
 
 #include "nv50_context.h"
 
-#define NV50_SU_MAX_TEMP 64
+#define NV50_SU_MAX_TEMP 127
+#define NV50_SU_MAX_ADDR 4
 //#define NV50_PROGRAM_DUMP
 
+/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
+
 /* ARL - gallium craps itself on progs/vp/arl.txt
  *
  * MSB - Like MAD, but MUL+SUB
@@ -79,22 +82,32 @@ struct nv50_reg {
 		P_ATTR,
 		P_RESULT,
 		P_CONST,
-		P_IMMD
+		P_IMMD,
+		P_ADDR
 	} type;
 	int index;
 
 	int hw;
-	int neg;
+	int mod;
 
 	int rhw; /* result hw for FP outputs, or interpolant index */
 	int acc; /* instruction where this reg is last read (first insn == 1) */
 };
 
+#define NV50_MOD_NEG 1
+#define NV50_MOD_ABS 2
+#define NV50_MOD_SAT 4
+
+/* arbitrary limits */
+#define MAX_IF_DEPTH 4
+#define MAX_LOOP_DEPTH 4
+
 struct nv50_pc {
 	struct nv50_program *p;
 
 	/* hw resources */
 	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
+	struct nv50_reg r_addr[NV50_SU_MAX_ADDR];
 
 	/* tgsi resources */
 	struct nv50_reg *temp;
@@ -108,15 +121,28 @@ struct nv50_pc {
 	struct nv50_reg *immd;
 	float *immd_buf;
 	int immd_nr;
+	struct nv50_reg **addr;
+	int addr_nr;
 
 	struct nv50_reg *temp_temp[16];
 	unsigned temp_temp_nr;
 
+	/* broadcast and destination replacement regs */
+	struct nv50_reg *r_brdc;
+	struct nv50_reg *r_dst[4];
+
 	unsigned interp_mode[32];
 	/* perspective interpolation registers */
 	struct nv50_reg *iv_p;
 	struct nv50_reg *iv_c;
 
+	struct nv50_program_exec *if_cond;
+	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
+	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
+	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
+	int if_lvl, loop_lvl;
+	unsigned loop_pos[MAX_LOOP_DEPTH];
+
 	/* current instruction and total number of insns */
 	unsigned insn_cur;
 	unsigned insn_nr;
@@ -124,6 +150,36 @@ struct nv50_pc {
 	boolean allow32;
 };
 
+static INLINE void
+ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
+{
+	reg->type = type;
+	reg->index = index;
+	reg->hw = hw;
+	reg->mod = 0;
+	reg->rhw = -1;
+	reg->acc = 0;
+}
+
+static INLINE unsigned
+popcnt4(uint32_t val)
+{
+	static const unsigned cnt[16]
+	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+	return cnt[val & 0xf];
+}
+
+static void
+terminate_mbb(struct nv50_pc *pc)
+{
+	int i;
+
+	/* remove records of temporary address register values */
+	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
+		if (pc->r_addr[i].index < 0)
+			pc->r_addr[i].rhw = -1;
+}
+
 static void
 alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 {
@@ -173,6 +229,10 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 	assert(0);
 }
 
+/* XXX: For shaders that aren't executed linearly (e.g. shaders that
+ * contain loops), we need to assign all hw regs to TGSI TEMPs early,
+ * lest we risk temp_temps overwriting regs alloc'd "later".
+ */
 static struct nv50_reg *
 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 {
@@ -184,11 +244,8 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 
 	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
 		if (!pc->r_temp[i]) {
-			r = CALLOC_STRUCT(nv50_reg);
-			r->type = P_TEMP;
-			r->index = -1;
-			r->hw = i;
-			r->rhw = -1;
+			r = MALLOC_STRUCT(nv50_reg);
+			ctor_reg(r, P_TEMP, -1, i);
 			pc->r_temp[i] = r;
 			return r;
 		}
@@ -254,10 +311,8 @@ alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
 		return alloc_temp4(pc, dst, idx + 4);
 
 	for (i = 0; i < 4; i++) {
-		dst[i] = CALLOC_STRUCT(nv50_reg);
-		dst[i]->type = P_TEMP;
-		dst[i]->index = -1;
-		dst[i]->hw = idx + i;
+		dst[i] = MALLOC_STRUCT(nv50_reg);
+		ctor_reg(dst[i], P_TEMP, -1, idx + i);
 		pc->r_temp[idx + i] = dst[i];
 	}
 
@@ -309,7 +364,7 @@ ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
 static struct nv50_reg *
 alloc_immd(struct nv50_pc *pc, float f)
 {
-	struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);
+	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
 	unsigned hw;
 
 	for (hw = 0; hw < pc->immd_nr * 4; hw++)
@@ -319,9 +374,7 @@ alloc_immd(struct nv50_pc *pc, float f)
 	if (hw == pc->immd_nr * 4)
 		hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
 
-	r->type = P_IMMD;
-	r->hw = hw;
-	r->index = -1;
+	ctor_reg(r, P_IMMD, -1, hw);
 	return r;
 }
 
@@ -403,14 +456,20 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
 	}
 
 	alloc_reg(pc, dst);
+	if (dst->hw > 63)
+		set_long(pc, e);
 	e->inst[0] |= (dst->hw << 2);
 }
 
 static INLINE void
 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 {
+	unsigned val;
 	float f = pc->immd_buf[imm->hw];
-	unsigned val = fui(imm->neg ? -f : f);
+
+	if (imm->mod & NV50_MOD_ABS)
+		f = fabsf(f);
+	val = fui((imm->mod & NV50_MOD_NEG) ? -f : f);
 
 	set_long(pc, e);
 	/*XXX: can't be predicated - bits overlap.. catch cases where both
@@ -423,9 +482,96 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 	e->inst[1] |= (val >> 6) << 2;
 }
 
+static INLINE void
+set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
+{
+	assert(!(e->inst[0] & 0x0c000000));
+	assert(!(e->inst[1] & 0x00000004));
+
+	e->inst[0] |= (a->hw & 3) << 26;
+	e->inst[1] |= (a->hw >> 2) << 2;
+}
+
+static void
+emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
+		  struct nv50_reg *src0, uint16_t src1_val)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xd0000000 | (src1_val << 9);
+	e->inst[1] = 0x20000000;
+	set_long(pc, e);
+	e->inst[0] |= dst->hw << 2;
+	if (src0) /* otherwise will add to $a0, which is always 0 */
+		set_addr(e, src0);
+
+	emit(pc, e);
+}
+
+static struct nv50_reg *
+alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
+{
+	int i;
+	struct nv50_reg *a_tgsi = NULL, *a = NULL;
+
+	if (!ref) {
+		/* allocate for TGSI address reg */
+		for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
+			if (pc->r_addr[i].index >= 0)
+				continue;
+			if (pc->r_addr[i].rhw >= 0 &&
+			    pc->r_addr[i].acc == pc->insn_cur)
+				continue;
+
+			pc->r_addr[i].rhw = -1;
+			pc->r_addr[i].index = i;
+			return &pc->r_addr[i];
+		}
+		assert(0);
+		return NULL;
+	}
+
+	/* Allocate and set an address reg so we can access 'ref'.
+	 *
+	 * If an r_addr has index < 0, it is not reserved for TGSI,
+	 * and index will be the negative of the TGSI addr index the
+	 * value in rhw is relative to, or -256 if rhw is an offset
+	 * from 0. If rhw < 0, the reg has not been initialized.
+	 */
+	for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) {
+		if (pc->r_addr[i].index >= 0) /* occupied for TGSI */
+			continue;
+		if (pc->r_addr[i].rhw < 0) { /* unused */
+			a = &pc->r_addr[i];
+			continue;
+		}
+		if (!a && pc->r_addr[i].acc != pc->insn_cur)
+			a = &pc->r_addr[i];
+
+		if (ref->hw - pc->r_addr[i].rhw >= 128)
+			continue;
+
+		if ((ref->acc >= 0 && pc->r_addr[i].index == -256) ||
+		    (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) {
+			pc->r_addr[i].acc = pc->insn_cur;
+			return &pc->r_addr[i];
+		}
+	}
+	assert(a);
+
+	if (ref->acc < 0)
+		a_tgsi = pc->addr[ref->index];
+
+	emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);
+
+	a->rhw = ref->hw & ~0x7f;
+	a->acc = pc->insn_cur;
+	a->index = a_tgsi ? -ref->index : -256;
+	return a;
+}
 
 #define INTERP_LINEAR		0
-#define INTERP_FLAT			1
+#define INTERP_FLAT		1
 #define INTERP_PERSPECTIVE	2
 #define INTERP_CENTROID		4
 
@@ -463,10 +609,18 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
 {
 	set_long(pc, e);
 
-	e->param.index = src->hw;
+	e->param.index = src->hw & 127;
 	e->param.shift = s;
 	e->param.mask = m << (s % 32);
 
+	if (src->hw > 127)
+		set_addr(e, alloc_addr(pc, src));
+	else
+	if (src->acc < 0) {
+		assert(src->type == P_CONST);
+		set_addr(e, pc->addr[src->index]);
+	}
+
 	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
 }
 
@@ -475,11 +629,13 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
 	struct nv50_program_exec *e = exec(pc);
 
-	e->inst[0] |= 0x10000000;
+	e->inst[0] = 0x10000000;
+	if (!pc->allow32)
+		set_long(pc, e);
 
 	set_dst(pc, dst, e);
 
-	if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {
+	if (!is_long(e) && src->type == P_IMMD) {
 		set_immd(pc, src, e);
 		/*XXX: 32-bit, but steals part of "half" reg space - need to
 		 *     catch and handle this case if/when we do half-regs
@@ -496,6 +652,8 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 		}
 
 		alloc_reg(pc, src);
+		if (src->hw > 63)
+			set_long(pc, e);
 		e->inst[0] |= (src->hw << 9);
 	}
 
@@ -543,6 +701,24 @@ check_swap_src_0_1(struct nv50_pc *pc,
 }
 
 static void
+set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
+		     struct nv50_program_exec *e)
+{
+	struct nv50_reg *temp;
+
+	if (src->type != P_TEMP) {
+		temp = temp_temp(pc);
+		emit_mov(pc, temp, src);
+		src = temp;
+	}
+
+	alloc_reg(pc, src);
+	if (src->hw > 63)
+		set_long(pc, e);
+	e->inst[0] |= (src->hw << 9);
+}
+
+static void
 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 {
 	if (src->type == P_ATTR) {
@@ -557,6 +733,8 @@ set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	}
 
 	alloc_reg(pc, src);
+	if (src->hw > 63)
+		set_long(pc, e);
 	e->inst[0] |= (src->hw << 9);
 }
 
@@ -583,7 +761,9 @@ set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	}
 
 	alloc_reg(pc, src);
-	e->inst[0] |= (src->hw << 16);
+	if (src->hw > 63)
+		set_long(pc, e);
+	e->inst[0] |= ((src->hw & 127) << 16);
 }
 
 static void
@@ -611,7 +791,7 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 	}
 
 	alloc_reg(pc, src);
-	e->inst[1] |= (src->hw << 14);
+	e->inst[1] |= ((src->hw & 127) << 14);
 }
 
 static void
@@ -629,12 +809,12 @@ emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	set_dst(pc, dst, e);
 	set_src_0(pc, src0, e);
 	if (src1->type == P_IMMD && !is_long(e)) {
-		if (src0->neg)
+		if (src0->mod & NV50_MOD_NEG)
 			e->inst[0] |= 0x00008000;
 		set_immd(pc, src1, e);
 	} else {
 		set_src_1(pc, src1, e);
-		if (src0->neg ^ src1->neg) {
+		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) {
 			if (is_long(e))
 				e->inst[1] |= 0x08000000;
 			else
@@ -651,13 +831,15 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
 {
 	struct nv50_program_exec *e = exec(pc);
 
-	e->inst[0] |= 0xb0000000;
+	e->inst[0] = 0xb0000000;
 
+	alloc_reg(pc, src1);
 	check_swap_src_0_1(pc, &src0, &src1);
 
-	if (!pc->allow32 || src0->neg || src1->neg) {
+	if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) {
 		set_long(pc, e);
-		e->inst[1] |= (src0->neg << 26) | (src1->neg << 27);
+		e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) |
+			      ((src1->mod & NV50_MOD_NEG) << 27);
 	}
 
 	set_dst(pc, dst, e);
@@ -674,6 +856,22 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
 }
 
 static void
+emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
+	 uint8_t s)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	set_long(pc, e);
+	e->inst[1] |= 0xc0000000;
+
+	e->inst[0] |= dst->hw << 2;
+	e->inst[0] |= s << 16; /* shift left */
+	set_src_0_restricted(pc, src, e);
+
+	emit(pc, e);
+}
+
+static void
 emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
 	    struct nv50_reg *src0, struct nv50_reg *src1)
 {
@@ -688,6 +886,11 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
 	set_src_0(pc, src0, e);
 	set_src_1(pc, src1, e);
 
+	if (src0->mod & NV50_MOD_ABS)
+		e->inst[1] |= 0x00100000;
+	if (src1->mod & NV50_MOD_ABS)
+		e->inst[1] |= 0x00080000;
+
 	emit(pc, e);
 }
 
@@ -695,9 +898,47 @@ static INLINE void
 emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1)
 {
-	src1->neg ^= 1;
+	assert(src0 != src1);
+	src1->mod ^= NV50_MOD_NEG;
 	emit_add(pc, dst, src0, src1);
-	src1->neg ^= 1;
+	src1->mod ^= NV50_MOD_NEG;
+}
+
+static void
+emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
+	    struct nv50_reg *src1, unsigned op)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xd0000000;
+	set_long(pc, e);
+
+	check_swap_src_0_1(pc, &src0, &src1);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src0, e);
+
+	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
+	    op != TGSI_OPCODE_XOR)
+		assert(!"invalid bit op");
+
+	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
+		set_immd(pc, src1, e);
+		if (op == TGSI_OPCODE_OR)
+			e->inst[0] |= 0x0100;
+		else
+		if (op == TGSI_OPCODE_XOR)
+			e->inst[0] |= 0x8000;
+	} else {
+		set_src_1(pc, src1, e);
+		e->inst[1] |= 0x04000000; /* 32 bit */
+		if (op == TGSI_OPCODE_OR)
+			e->inst[1] |= 0x4000;
+		else
+		if (op == TGSI_OPCODE_XOR)
+			e->inst[1] |= 0x8000;
+	}
+
+	emit(pc, e);
 }
 
 static void
@@ -714,9 +955,9 @@ emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	set_src_1(pc, src1, e);
 	set_src_2(pc, src2, e);
 
-	if (src0->neg ^ src1->neg)
+	if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
 		e->inst[1] |= 0x04000000;
-	if (src2->neg)
+	if (src2->mod & NV50_MOD_NEG)
 		e->inst[1] |= 0x08000000;
 
 	emit(pc, e);
@@ -726,9 +967,10 @@ static INLINE void
 emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	 struct nv50_reg *src1, struct nv50_reg *src2)
 {
-	src2->neg ^= 1;
+	assert(src2 != src0 && src2 != src1);
+	src2->mod ^= NV50_MOD_NEG;
 	emit_mad(pc, dst, src0, src1, src2);
-	src2->neg ^= 1;
+	src2->mod ^= NV50_MOD_NEG;
 }
 
 static void
@@ -744,7 +986,11 @@ emit_flop(struct nv50_pc *pc, unsigned sub,
 	}
 
 	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
+
+	if (sub == 0 || sub == 2)
+		set_src_0_restricted(pc, src, e);
+	else
+		set_src_0(pc, src, e);
 
 	emit(pc, e);
 }
@@ -786,16 +1032,19 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 #define CVTOP_SAT	0x08
 #define CVTOP_ABS	0x10
 
+/* 0x04 == 32 bit dst */
+/* 0x40 == dst is float */
+/* 0x80 == src is float */
 #define CVT_F32_F32 0xc4
 #define CVT_F32_S32 0x44
-#define CVT_F32_U32 0x64
 #define CVT_S32_F32 0x8c
 #define CVT_S32_S32 0x0c
-#define CVT_F32_F32_ROP 0xcc
+#define CVT_NEG     0x20
+#define CVT_RI      0x08
 
 static void
 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
-	 int wp, unsigned cop, unsigned fmt)
+	 int wp, unsigned cvn, unsigned fmt)
 {
 	struct nv50_program_exec *e;
 
@@ -803,8 +1052,8 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 	set_long(pc, e);
 
 	e->inst[0] |= 0xa0000000;
-	e->inst[1] |= 0x00004000;
-	e->inst[1] |= (cop << 16);
+	e->inst[1] |= 0x00004000; /* 32 bit src */
+	e->inst[1] |= (cvn << 16);
 	e->inst[1] |= (fmt << 24);
 	set_src_0(pc, src, e);
 
@@ -821,53 +1070,85 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
 	emit(pc, e);
 }
 
+/* nv50 Condition codes:
+ *  0x1 = LT
+ *  0x2 = EQ
+ *  0x3 = LE
+ *  0x4 = GT
+ *  0x5 = NE
+ *  0x6 = GE
+ *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
+ *  0x8 = unordered bit (allows NaN)
+ */
 static void
-emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst,
+emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 	 struct nv50_reg *src0, struct nv50_reg *src1)
 {
+	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+
 	struct nv50_program_exec *e = exec(pc);
-	unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
 	struct nv50_reg *rdst;
 
-	assert(c_op <= 7);
+	assert(ccode < 16);
 	if (check_swap_src_0_1(pc, &src0, &src1))
-		c_op = inv_cop[c_op];
+		ccode = cc_swapped[ccode & 7] | (ccode & 8);
 
 	rdst = dst;
-	if (dst->type != P_TEMP)
+	if (dst && dst->type != P_TEMP)
 		dst = alloc_temp(pc, NULL);
 
 	/* set.u32 */
 	set_long(pc, e);
 	e->inst[0] |= 0xb0000000;
-	e->inst[1] |= (3 << 29);
-	e->inst[1] |= (c_op << 14);
-	/*XXX: breaks things, .u32 by default?
-	 *     decuda will disasm as .u16 and use .lo/.hi regs, but this
-	 *     doesn't seem to match what the hw actually does.
-	inst[1] |= 0x04000000; << breaks things.. .u32 by default?
+	e->inst[1] |= 0x60000000 | (ccode << 14);
+
+	/* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
+	 * that doesn't seem to match what the hw actually does
+	e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
 	 */
-	set_dst(pc, dst, e);
+
+	if (wp >= 0)
+		set_pred_wr(pc, 1, wp, e);
+	if (dst)
+		set_dst(pc, dst, e);
+	else {
+		e->inst[0] |= 0x000001fc;
+		e->inst[1] |= 0x00000008;
+	}
+
 	set_src_0(pc, src0, e);
 	set_src_1(pc, src1, e);
-	emit(pc, e);
 
-	/* cvt.f32.u32 */
-	e = exec(pc);
-	e->inst[0] = 0xa0000001;
-	e->inst[1] = 0x64014780;
-	set_dst(pc, rdst, e);
-	set_src_0(pc, dst, e);
 	emit(pc, e);
+	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
 
-	if (dst != rdst)
+	/* cvt.f32.u32/s32 (?) unless we only wrote the predicate */
+	if (rdst)
+		emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
+	if (rdst && rdst != dst)
 		free_temp(pc, dst);
 }
 
+static INLINE unsigned
+map_tgsi_setop_cc(unsigned op)
+{
+	switch (op) {
+	case TGSI_OPCODE_SLT: return 0x1;
+	case TGSI_OPCODE_SGE: return 0x6;
+	case TGSI_OPCODE_SEQ: return 0x2;
+	case TGSI_OPCODE_SGT: return 0x4;
+	case TGSI_OPCODE_SLE: return 0x3;
+	case TGSI_OPCODE_SNE: return 0xd;
+	default:
+		assert(0);
+		return 0;
+	}
+}
+
 static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
+	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
 }
 
 static void
@@ -890,6 +1171,12 @@ emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
 }
 
+static INLINE void
+emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{ /* Emit a CVTOP_SAT f32->f32 conversion of src into dst. */
+	emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32); /* -1: presumably "no predicate write" (emit_kil passes a predicate reg here) - confirm */
+}
+
 static void
 emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	 struct nv50_reg **src)
@@ -944,20 +1231,10 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	FREE(one);
 }
 
-static void
+static INLINE void
 emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	set_long(pc, e);
-	e->inst[0] |= 0xa0000000; /* delta */
-	e->inst[1] |= (7 << 29); /* delta */
-	e->inst[1] |= 0x04000000; /* negate arg0? probably not */
-	e->inst[1] |= (1 << 14); /* src .f32 */
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-
-	emit(pc, e);
+	emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG); /* negation is now an f32->f32 cvt carrying the CVT_NEG flag */
 }
 
 static void
@@ -965,30 +1242,52 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 {
 	struct nv50_program_exec *e;
 	const int r_pred = 1;
+	unsigned cvn = CVT_F32_F32;
 
-	/* Sets predicate reg ? */
-	e = exec(pc);
-	e->inst[0] = 0xa00001fd;
-	e->inst[1] = 0xc4014788;
-	set_src_0(pc, src, e);
-	set_pred_wr(pc, 1, r_pred, e);
-	if (src->neg)
-		e->inst[1] |= 0x20000000;
-	emit(pc, e);
+	if (src->mod & NV50_MOD_NEG)
+		cvn |= CVT_NEG;
+	/* write predicate reg */
+	emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
 
-	/* This is probably KILP */
+	/* conditional discard */
 	e = exec(pc);
-	e->inst[0] = 0x000001fe;
+	e->inst[0] = 0x00000002;
 	set_long(pc, e);
-	set_pred(pc, 1 /* LT? */, r_pred, e);
+	set_pred(pc, 0x1 /* LT */, r_pred, e);
 	emit(pc, e);
 }
 
 static void
+load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
+		     struct nv50_reg **src, boolean proj)
+{ /* Fill t[0..2] with the cube texture coordinates: src[0..2] each multiplied by a common scale factor. */
+	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod }; /* saved so the ABS bits set below can be undone */
+
+	src[0]->mod |= NV50_MOD_ABS;
+	src[1]->mod |= NV50_MOD_ABS;
+	src[2]->mod |= NV50_MOD_ABS;
+
+	emit_minmax(pc, 4, t[2], src[0], src[1]); /* sub-op 4 on the ABS-modified sources (presumably max - confirm) */
+	emit_minmax(pc, 4, t[2], src[2], t[2]); /* fold the third coordinate into t[2] */
+
+	src[0]->mod = mod[0]; /* restore the modifiers the caller passed in */
+	src[1]->mod = mod[1];
+	src[2]->mod = mod[2];
+
+	if (proj && 0 /* looks more correct without this */)
+		emit_mul(pc, t[2], t[2], src[3]);
+	emit_flop(pc, 0, t[2], t[2]); /* flop 0 is the op the TGSI RCP case emits */
+
+	emit_mul(pc, t[0], src[0], t[2]); /* t[c] = src[c] * t[2] */
+	emit_mul(pc, t[1], src[1], t[2]);
+	emit_mul(pc, t[2], src[2], t[2]); /* t[2] is overwritten last since it holds the scale factor */
+}
+
+static void
 emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
 {
-	struct nv50_reg *temp, *t[4];
+	struct nv50_reg *t[4];
 	struct nv50_program_exec *e;
 
 	unsigned c, mode, dim;
@@ -1017,6 +1316,9 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	/* some cards need t[0]'s hw index to be a multiple of 4 */
 	alloc_temp4(pc, t, 0);
 
+	if (type == TGSI_TEXTURE_CUBE) {
+		load_cube_tex_coords(pc, t, src, proj);
+	} else
 	if (proj) {
 		if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
 			mode = pc->interp_mode[src[0]->index];
@@ -1041,17 +1343,8 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 			 */
 		}
 	} else {
-		if (type == TGSI_TEXTURE_CUBE) {
-			temp = temp_temp(pc);
-			emit_minmax(pc, 4, temp, src[0], src[1]);
-			emit_minmax(pc, 4, temp, temp, src[2]);
-			emit_flop(pc, 0, temp, temp);
-			for (c = 0; c < 3; c++)
-				emit_mul(pc, t[c], src[c], temp);
-		} else {
-			for (c = 0; c < dim; c++)
-				emit_mov(pc, t[c], src[c]);
-		}
+		for (c = 0; c < dim; c++)
+			emit_mov(pc, t[c], src[c]);
 	}
 
 	e = exec(pc);
@@ -1064,19 +1357,22 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	if (dim == 2)
 		e->inst[0] |= 0x00400000;
 	else
-	if (dim == 3)
+	if (dim == 3) {
 		e->inst[0] |= 0x00800000;
+		if (type == TGSI_TEXTURE_CUBE)
+			e->inst[0] |= 0x08000000;
+	}
 
 	e->inst[0] |= (mask & 0x3) << 25;
 	e->inst[1] |= (mask & 0xc) << 12;
 
 	emit(pc, e);
-
 #if 1
-	if (mask & 1) emit_mov(pc, dst[0], t[0]);
-	if (mask & 2) emit_mov(pc, dst[1], t[1]);
-	if (mask & 4) emit_mov(pc, dst[2], t[2]);
-	if (mask & 8) emit_mov(pc, dst[3], t[3]);
+	c = 0;
+	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
+	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
+	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
+	if (mask & 8) emit_mov(pc, dst[3], t[c]);
 
 	free_temp4(pc, t);
 #else
@@ -1093,6 +1389,75 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 }
 
 static void
+emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
+	    struct nv50_program_exec **join)
+{ /* Emit a branch, optionally preceded by a join_at; no target is set here (callers fill param.index later). */
+	struct nv50_program_exec *e = exec(pc);
+
+	if (join) { /* caller asked for the extra instruction that nv50_kill_branch calls JOIN_AT */
+		set_long(pc, e);
+		e->inst[0] |= 0xa0000002;
+		emit(pc, e);
+		*join = e; /* hand the emitted instruction back to the caller */
+		e = exec(pc); /* fresh exec for the branch itself */
+	}
+
+	set_long(pc, e);
+	e->inst[0] |= 0x10000002;
+	if (pred >= 0) /* negative pred: branch is left unpredicated */
+		set_pred(pc, cc, pred, e);
+	emit(pc, e);
+}
+
+static void
+emit_nop(struct nv50_pc *pc)
+{ /* Emit one long instruction made of the two fixed words below; the ENDIF case uses it as its join point. */
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xf0000000;
+	set_long(pc, e);
+	e->inst[1] = 0xe0000000; /* plain assignment: inst[1] is exactly this value when emitted */
+	emit(pc, e);
+}
+
+static void
+emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{ /* Emit the single long instruction used for TGSI_OPCODE_DDX: dst from src. */
+	struct nv50_program_exec *e = exec(pc);
+
+	assert(src->type == P_TEMP); /* only temporaries are accepted as source */
+
+	e->inst[0] = 0xc0140000;
+	e->inst[1] = 0x89800000;
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e); /* the same register is bound as source 0 ... */
+	set_src_2(pc, src, e); /* ... and as source 2 */
+
+	emit(pc, e);
+}
+
+static void
+emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{ /* Emit the instruction used for TGSI_OPCODE_DDY; may first negate src in place. */
+	struct nv50_program_exec *e = exec(pc);
+
+	assert(src->type == P_TEMP); /* only temporaries are accepted as source */
+
+	if (!(src->mod & NV50_MOD_NEG)) /* ! double negation */
+		emit_neg(pc, src, src); /* writes the negated value back into src itself */
+
+	e->inst[0] = 0xc0150000;
+	e->inst[1] = 0x8a400000;
+	set_long(pc, e);
+	set_dst(pc, dst, e);
+	set_src_0(pc, src, e); /* the same register is bound as source 0 ... */
+	set_src_2(pc, src, e); /* ... and as source 2 */
+
+	emit(pc, e);
+}
+
+static void
 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 {
 	unsigned q = 0, m = ~0;
@@ -1140,10 +1505,14 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 	e->inst[1] |= q;
 }
 
+/* Some operations support an optional negation flag. */
 static boolean
 negate_supported(const struct tgsi_full_instruction *insn, int i)
 {
+	int s;
+
 	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_DDY:
 	case TGSI_OPCODE_DP3:
 	case TGSI_OPCODE_DP4:
 	case TGSI_OPCODE_MUL:
@@ -1151,12 +1520,93 @@ negate_supported(const struct tgsi_full_instruction *insn, int i)
 	case TGSI_OPCODE_ADD:
 	case TGSI_OPCODE_SUB:
 	case TGSI_OPCODE_MAD:
-		return TRUE;
+		break;
 	case TGSI_OPCODE_POW:
-		return (i == 1) ? TRUE : FALSE;
+		if (i == 1)
+			break;
+		return FALSE;
 	default:
 		return FALSE;
 	}
+
+	/* Watch out for possible multiple uses of an nv50_reg, we
+	 * can't set NV50_MOD_NEG in nv50_reg::mod in these cases.
+	 */
+	for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) {
+		if (s == i)
+			continue;
+		if ((insn->FullSrcRegisters[s].SrcRegister.Index ==
+		     insn->FullSrcRegisters[i].SrcRegister.Index) &&
+		    (insn->FullSrcRegisters[s].SrcRegister.File ==
+		     insn->FullSrcRegisters[i].SrcRegister.File))
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+/* Return a read mask for source registers deduced from opcode & write mask. */
+static unsigned
+nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
+{ /* NOTE: 'c' is the source operand index (the caller passes its source loop index); mask bit n = component n. */
+	unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
+
+	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_SIN:
+		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); /* dst.w reads src.w; any of dst.xyz reads src.x */
+	case TGSI_OPCODE_DP3:
+		return 0x7; /* x, y, z */
+	case TGSI_OPCODE_DP4:
+	case TGSI_OPCODE_DPH:
+	case TGSI_OPCODE_KIL: /* WriteMask ignored */
+		return 0xf; /* all four components */
+	case TGSI_OPCODE_DST:
+		return mask & (c ? 0xa : 0x6); /* source 0 feeds dst y,z; any other source feeds dst y,w */
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_LG2:
+	case TGSI_OPCODE_POW:
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ:
+	case TGSI_OPCODE_SCS:
+		return 0x1; /* only src.x is read */
+	case TGSI_OPCODE_LIT:
+		return 0xb; /* x, y and w */
+	case TGSI_OPCODE_TEX:
+	case TGSI_OPCODE_TXP:
+	{
+		const struct tgsi_instruction_ext_texture *tex;
+
+		assert(insn->Instruction.Extended);
+		tex = &insn->InstructionExtTexture;
+
+		mask = 0x7; /* start from x, y, z */
+		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
+			mask |= 0x8; /* TXP additionally reads w */
+
+		switch (tex->Texture) {
+		case TGSI_TEXTURE_1D:
+			mask &= 0x9; /* drop y and z */
+			break;
+		case TGSI_TEXTURE_2D:
+			mask &= 0xb; /* drop z */
+			break;
+		default:
+			break; /* every other texture type keeps the mask built above */
+		}
+	}
+		return mask;
+	case TGSI_OPCODE_XPD:
+		x = 0;
+		if (mask & 1) x |= 0x6; /* dst.x needs src y,z */
+		if (mask & 2) x |= 0x5; /* dst.y needs src x,z */
+		if (mask & 4) x |= 0x3; /* dst.z needs src x,y */
+		return x;
+	default:
+		break;
+	}
+
+	return mask; /* default: read exactly the components that are written */
 }
 
 static struct nv50_reg *
@@ -1167,6 +1617,16 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 		return &pc->temp[dst->DstRegister.Index * 4 + c];
 	case TGSI_FILE_OUTPUT:
 		return &pc->result[dst->DstRegister.Index * 4 + c];
+	case TGSI_FILE_ADDRESS:
+	{
+		struct nv50_reg *r = pc->addr[dst->DstRegister.Index * 4 + c];
+		if (!r) {
+			r = alloc_addr(pc, NULL);
+			pc->addr[dst->DstRegister.Index * 4 + c] = r;
+		}
+		assert(r);
+		return r;
+	}
 	case TGSI_FILE_NULL:
 		return NULL;
 	default:
@@ -1182,16 +1642,19 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 {
 	struct nv50_reg *r = NULL;
 	struct nv50_reg *temp;
-	unsigned sgn, c;
+	unsigned sgn, c, swz;
+
+	if (src->SrcRegister.File != TGSI_FILE_CONSTANT)
+		assert(!src->SrcRegister.Indirect);
 
 	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
 
-	c = tgsi_util_get_full_src_register_extswizzle(src, chan);
+	c = tgsi_util_get_full_src_register_swizzle(src, chan);
 	switch (c) {
-	case TGSI_EXTSWIZZLE_X:
-	case TGSI_EXTSWIZZLE_Y:
-	case TGSI_EXTSWIZZLE_Z:
-	case TGSI_EXTSWIZZLE_W:
+	case TGSI_SWIZZLE_X:
+	case TGSI_SWIZZLE_Y:
+	case TGSI_SWIZZLE_Z:
+	case TGSI_SWIZZLE_W:
 		switch (src->SrcRegister.File) {
 		case TGSI_FILE_INPUT:
 			r = &pc->attr[src->SrcRegister.Index * 4 + c];
@@ -1200,25 +1663,35 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 			r = &pc->temp[src->SrcRegister.Index * 4 + c];
 			break;
 		case TGSI_FILE_CONSTANT:
-			r = &pc->param[src->SrcRegister.Index * 4 + c];
+			if (!src->SrcRegister.Indirect) {
+				r = &pc->param[src->SrcRegister.Index * 4 + c];
+				break;
+			}
+			/* Indicate indirection by setting r->acc < 0 and
+			 * use the index field to select the address reg.
+			 */
+			r = MALLOC_STRUCT(nv50_reg);
+			swz = tgsi_util_get_src_register_swizzle(
+						 &src->SrcRegisterInd, 0);
+			ctor_reg(r, P_CONST,
+				 src->SrcRegisterInd.Index * 4 + swz,
+				 src->SrcRegister.Index * 4 + c);
+			r->acc = -1;
 			break;
 		case TGSI_FILE_IMMEDIATE:
 			r = &pc->immd[src->SrcRegister.Index * 4 + c];
 			break;
 		case TGSI_FILE_SAMPLER:
 			break;
+		case TGSI_FILE_ADDRESS:
+			r = pc->addr[src->SrcRegister.Index * 4 + c];
+			assert(r);
+			break;
 		default:
 			assert(0);
 			break;
 		}
 		break;
-	case TGSI_EXTSWIZZLE_ZERO:
-		r = alloc_immd(pc, 0.0);
-		return r;
-	case TGSI_EXTSWIZZLE_ONE:
-		if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)
-			return alloc_immd(pc, -1.0);
-		return alloc_immd(pc, 1.0);
 	default:
 		assert(0);
 		break;
@@ -1234,7 +1707,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 		break;
 	case TGSI_UTIL_SIGN_TOGGLE:
 		if (neg)
-			r->neg = 1;
+			r->mod = NV50_MOD_NEG;
 		else {
 			temp = temp_temp(pc);
 			emit_neg(pc, temp, r);
@@ -1243,11 +1716,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 		break;
 	case TGSI_UTIL_SIGN_SET:
 		temp = temp_temp(pc);
-		emit_abs(pc, temp, r);
-		if (neg)
-			temp->neg = 1;
-		else
-			emit_neg(pc, temp, temp);
+		emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
 		r = temp;
 		break;
 	default:
@@ -1258,93 +1727,175 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
 	return r;
 }
 
-/* returns TRUE if instruction can overwrite sources before they're read */
+/* return TRUE for ops that produce only a single result (nv50_tgsi_dst_revdep returns 0x1 for these) */
 static boolean
-direct2dest_op(const struct tgsi_full_instruction *insn)
+is_scalar_op(unsigned op)
 {
-	if (insn->Instruction.Saturate)
-		return FALSE;
-
-	switch (insn->Instruction.Opcode) {
+	switch (op) {
 	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_DP2:
 	case TGSI_OPCODE_DP3:
 	case TGSI_OPCODE_DP4:
 	case TGSI_OPCODE_DPH:
-	case TGSI_OPCODE_KIL:
-	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_LG2:
 	case TGSI_OPCODE_POW:
 	case TGSI_OPCODE_RCP:
 	case TGSI_OPCODE_RSQ:
-	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_SIN:
+		/* commented out, i.e. not treated as scalar ops:
+	case TGSI_OPCODE_KIL:
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_SCS:
+		*/
+		return TRUE;
+	default:
+		return FALSE;
+	}
+}
+
+/* Returns a bitmask indicating which dst components depend
+ * on source s, component c (reverse of nv50_tgsi_src_mask).
+ */
+static unsigned
+nv50_tgsi_dst_revdep(unsigned op, int s, int c)
+{ /* bit n of the result stands for dst component n */
+	if (is_scalar_op(op))
+		return 0x1; /* returned regardless of s and c */
+
+	switch (op) {
+	case TGSI_OPCODE_DST:
+		return (1 << c) & (s ? 0xa : 0x6); /* source 0 feeds only y,z; other sources only y,w */
+	case TGSI_OPCODE_XPD:
+		switch (c) {
+		case 0: return 0x6; /* src x -> dst y,z */
+		case 1: return 0x5; /* src y -> dst x,z */
+		case 2: return 0x3; /* src z -> dst x,y */
+		case 3: return 0x0; /* src w -> nothing */
+		default:
+			assert(0); /* c is expected to be 0..3 */
+			return 0x0;
+		}
+	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_TEX:
 	case TGSI_OPCODE_TXP:
-		return FALSE;
+		/* these take care of dangerous swizzles themselves */
+		return 0x0;
+	case TGSI_OPCODE_IF:
+	case TGSI_OPCODE_KIL:
+		/* don't call this function for these ops */
+		assert(0);
+		return 0;
 	default:
-		return TRUE;
+		/* linear vector instruction */
+		return (1 << c); /* dst component c depends only on src component c */
 	}
 }
 
+static INLINE boolean
+has_pred(struct nv50_program_exec *e, unsigned cc)
+{ /* TRUE if e is a long, non-immediate instruction whose inst[1] bits 7..10 equal cc. */
+	if (!is_long(e) || is_immd(e))
+		return FALSE; /* the field tested below is only looked at for long, non-immediate forms */
+	return ((e->inst[1] & 0x780) == (cc << 7)); /* presumably the condition-code field written by set_pred() - confirm */
+}
+
+/* On ENDIF, see if we can do "@p0.neu single_op" instead of:
+ *        join_at ENDIF
+ *        @p0.eq bra ENDIF
+ *        single_op
+ * ENDIF: nop.join
+ * Returns TRUE if JOIN_AT and BRA were removed, FALSE if nothing changed. */
 static boolean
-nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
+nv50_kill_branch(struct nv50_pc *pc)
 {
-	const struct tgsi_full_instruction *inst = &tok->FullInstruction;
-	struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;
+	int lvl = pc->if_lvl;
+
+	assert(pc->if_insn[lvl]); /* must hold before the dereferences below */
+	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
+		return FALSE; /* more than one insn follows the branch */
+	/* if ccode == 'true', the BRA is from an ELSE and the predicate
+	 * reg may no longer be valid, since we currently always use $p0
+	 */
+	if (has_pred(pc->if_insn[lvl], 0xf))
+		return FALSE;
+	assert(pc->br_join[lvl]); /* the JOIN_AT exec is reused below */
+
+	/* We'll use the exec allocated for JOIN_AT (as we can't easily
+	 * update prev's next); if exec_tail is BRK, update the pointer.
+	 */
+	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
+		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
+
+	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
+
+	*pc->br_join[lvl] = *pc->p->exec_tail; /* copy single_op into the JOIN_AT exec */
+
+	FREE(pc->if_insn[lvl]); /* the BRA */
+	FREE(pc->p->exec_tail); /* the original single_op, now duplicated above */
+
+	pc->p->exec_tail = pc->br_join[lvl];
+	pc->p->exec_tail->next = NULL; /* the copy brought along the old next pointer */
+	set_pred(pc, 0xd, 0, pc->p->exec_tail); /* the "@p0.neu" predicate from the header comment */
+
+	return TRUE;
+}
+
+static boolean
+nv50_program_tx_insn(struct nv50_pc *pc,
+		     const struct tgsi_full_instruction *inst)
+{
+	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
 	unsigned mask, sat, unit;
-	boolean assimilate = FALSE;
 	int i, c;
 
 	mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
 	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
 
+	memset(src, 0, sizeof(src));
+
 	for (c = 0; c < 4; c++) {
-		if (mask & (1 << c))
+		if ((mask & (1 << c)) && !pc->r_dst[c])
 			dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
 		else
-			dst[c] = NULL;
-		rdst[c] = NULL;
-		src[0][c] = NULL;
-		src[1][c] = NULL;
-		src[2][c] = NULL;
+			dst[c] = pc->r_dst[c];
+		rdst[c] = dst[c];
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
+		unsigned src_mask;
+		boolean neg_supp;
+
+		src_mask = nv50_tgsi_src_mask(inst, i);
+		neg_supp = negate_supported(inst, i);
 
 		if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
 			unit = fs->SrcRegister.Index;
 
 		for (c = 0; c < 4; c++)
-			src[i][c] = tgsi_src(pc, c, fs,
-					     negate_supported(inst, i));
+			if (src_mask & (1 << c))
+				src[i][c] = tgsi_src(pc, c, fs, neg_supp);
 	}
 
-	if (sat) {
-		for (c = 0; c < 4; c++) {
-			rdst[c] = dst[c];
-			dst[c] = temp_temp(pc);
-		}
+	brdc = temp = pc->r_brdc;
+	if (brdc && brdc->type != P_TEMP) {
+		temp = temp_temp(pc);
+		if (sat)
+			brdc = temp;
 	} else
-	if (direct2dest_op(inst)) {
+	if (sat) {
 		for (c = 0; c < 4; c++) {
-			if (!dst[c] || dst[c]->type != P_TEMP)
+			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
 				continue;
-
-			for (i = c + 1; i < 4; i++) {
-				if (dst[c] == src[0][i] ||
-				    dst[c] == src[1][i] ||
-				    dst[c] == src[2][i])
-					break;
-			}
-			if (i == 4)
-				continue;
-
-			assimilate = TRUE;
-			rdst[c] = dst[c];
-			dst[c] = alloc_temp(pc, NULL);
+			/* rdst[c] = dst[c]; */ /* done above */
+			dst[c] = temp_temp(pc);
 		}
 	}
 
+	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
+
 	switch (inst->Instruction.Opcode) {
 	case TGSI_OPCODE_ABS:
 		for (c = 0; c < 4; c++) {
@@ -1360,74 +1911,137 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			emit_add(pc, dst[c], src[0][c], src[1][c]);
 		}
 		break;
-	case TGSI_OPCODE_COS:
+	case TGSI_OPCODE_AND:
+	case TGSI_OPCODE_XOR:
+	case TGSI_OPCODE_OR:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_bitop2(pc, dst[c], src[0][c], src[1][c],
+				    inst->Instruction.Opcode);
+		}
+		break;
+	case TGSI_OPCODE_ARL:
+		assert(src[0][0]);
 		temp = temp_temp(pc);
+		emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
+		emit_arl(pc, dst[0], temp, 4);
+		break;
+	case TGSI_OPCODE_BGNLOOP:
+		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
+		terminate_mbb(pc);
+		break;
+	case TGSI_OPCODE_BRK:
+		emit_branch(pc, -1, 0, NULL);
+		assert(pc->loop_lvl > 0);
+		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
+		break;
+	case TGSI_OPCODE_CEIL:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
+		}
+		break;
+	case TGSI_OPCODE_CMP:
+		pc->allow32 = FALSE;
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
+			emit_mov(pc, dst[c], src[1][c]);
+			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
+			emit_mov(pc, dst[c], src[2][c]);
+			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
+		}
+		break;
+	case TGSI_OPCODE_COS:
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, 5, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc);
+		}
 		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 5, temp, temp);
+		emit_flop(pc, 5, brdc, temp);
+		break;
+	case TGSI_OPCODE_DDX:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_mov(pc, dst[c], temp);
+			emit_ddx(pc, dst[c], src[0][c]);
 		}
 		break;
-	case TGSI_OPCODE_DP3:
-		temp = temp_temp(pc);
-		emit_mul(pc, temp, src[0][0], src[1][0]);
-		emit_mad(pc, temp, src[0][1], src[1][1], temp);
-		emit_mad(pc, temp, src[0][2], src[1][2], temp);
+	case TGSI_OPCODE_DDY:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_mov(pc, dst[c], temp);
+			emit_ddy(pc, dst[c], src[0][c]);
 		}
 		break;
+	case TGSI_OPCODE_DP3:
+		emit_mul(pc, temp, src[0][0], src[1][0]);
+		emit_mad(pc, temp, src[0][1], src[1][1], temp);
+		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
+		break;
 	case TGSI_OPCODE_DP4:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_mad(pc, temp, src[0][3], src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DPH:
-		temp = temp_temp(pc);
 		emit_mul(pc, temp, src[0][0], src[1][0]);
 		emit_mad(pc, temp, src[0][1], src[1][1], temp);
 		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_add(pc, temp, src[1][3], temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_add(pc, brdc, src[1][3], temp);
 		break;
 	case TGSI_OPCODE_DST:
-	{
-		struct nv50_reg *one = alloc_immd(pc, 1.0);
-		if (mask & (1 << 0))
-			emit_mov(pc, dst[0], one);
 		if (mask & (1 << 1))
 			emit_mul(pc, dst[1], src[0][1], src[1][1]);
 		if (mask & (1 << 2))
 			emit_mov(pc, dst[2], src[0][2]);
 		if (mask & (1 << 3))
 			emit_mov(pc, dst[3], src[1][3]);
-		FREE(one);
-	}
+		if (mask & (1 << 0))
+			emit_mov_immdval(pc, dst[0], 1.0f);
+		break;
+	case TGSI_OPCODE_ELSE:
+		emit_branch(pc, -1, 0, NULL);
+		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
+		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
+		terminate_mbb(pc);
+		break;
+	case TGSI_OPCODE_ENDIF:
+		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
+
+		/* try to replace branch over 1 insn with a predicated insn */
+		if (nv50_kill_branch(pc) == TRUE)
+			break;
+
+		if (pc->br_join[pc->if_lvl]) {
+			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
+			pc->br_join[pc->if_lvl] = NULL;
+		}
+		terminate_mbb(pc);
+		/* emit a NOP as join point, we could set it on the next
+		 * one, but would have to make sure it is long and !immd
+		 */
+		emit_nop(pc);
+		pc->p->exec_tail->inst[1] |= 2;
+		break;
+	case TGSI_OPCODE_ENDLOOP:
+		emit_branch(pc, -1, 0, NULL);
+		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
+		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
+		terminate_mbb(pc);
 		break;
 	case TGSI_OPCODE_EX2:
-		temp = temp_temp(pc);
 		emit_preex2(pc, temp, src[0][0]);
-		emit_flop(pc, 6, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 6, brdc, temp);
 		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
@@ -1445,24 +2059,27 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 			emit_sub(pc, dst[c], src[0][c], temp);
 		}
 		break;
+	case TGSI_OPCODE_IF:
+		/* emitting a join_at may not be necessary */
+		assert(pc->if_lvl < MAX_IF_DEPTH);
+		/* set_pred_wr(pc, 1, 0, pc->if_cond); */
+		emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
+			 CVT_F32_F32);
+		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
+		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
+		terminate_mbb(pc);
+		break;
 	case TGSI_OPCODE_KIL:
 		emit_kil(pc, src[0][0]);
 		emit_kil(pc, src[0][1]);
 		emit_kil(pc, src[0][2]);
 		emit_kil(pc, src[0][3]);
-		pc->p->cfg.fp.regs[2] |= 0x00100000;
 		break;
 	case TGSI_OPCODE_LIT:
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
 	case TGSI_OPCODE_LG2:
-		temp = temp_temp(pc);
-		emit_flop(pc, 3, temp, src[0][0]);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_flop(pc, 3, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_LRP:
 		temp = temp_temp(pc);
@@ -1495,7 +2112,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_MOV:
-	case TGSI_OPCODE_SWZ:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
@@ -1510,31 +2126,18 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		break;
 	case TGSI_OPCODE_POW:
-		temp = temp_temp(pc);
-		emit_pow(pc, temp, src[0][0], src[1][0]);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
-		}
+		emit_pow(pc, brdc, src[0][0], src[1][0]);
 		break;
 	case TGSI_OPCODE_RCP:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 0, dst[c], src[0][0]);
-		}
+		emit_flop(pc, 0, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_RSQ:
-		for (c = 3; c >= 0; c--) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flop(pc, 2, dst[c], src[0][0]);
-		}
+		emit_flop(pc, 2, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_SCS:
 		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
+		if (mask & 3)
+			emit_precossin(pc, temp, src[0][0]);
 		if (mask & (1 << 0))
 			emit_flop(pc, 5, dst[0], temp);
 		if (mask & (1 << 1))
@@ -1544,28 +2147,29 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		if (mask & (1 << 3))
 			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
-	case TGSI_OPCODE_SGE:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_set(pc, 6, dst[c], src[0][c], src[1][c]);
-		}
-		break;
 	case TGSI_OPCODE_SIN:
-		temp = temp_temp(pc);
-		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 4, temp, temp);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], temp);
+		if (mask & 8) {
+			emit_precossin(pc, temp, src[0][3]);
+			emit_flop(pc, 4, dst[3], temp);
+			if (!(mask &= 7))
+				break;
+			if (temp == dst[3])
+				temp = brdc = temp_temp(pc);
 		}
+		emit_precossin(pc, temp, src[0][0]);
+		emit_flop(pc, 4, brdc, temp);
 		break;
 	case TGSI_OPCODE_SLT:
+	case TGSI_OPCODE_SGE:
+	case TGSI_OPCODE_SEQ:
+	case TGSI_OPCODE_SGT:
+	case TGSI_OPCODE_SLE:
+	case TGSI_OPCODE_SNE:
+		i = map_tgsi_setop_cc(inst->Instruction.Opcode);
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_set(pc, 1, dst[c], src[0][c], src[1][c]);
+			emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
 		}
 		break;
 	case TGSI_OPCODE_SUB:
@@ -1583,6 +2187,14 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		emit_tex(pc, dst, mask, src[0], unit,
 			 inst->InstructionExtTexture.Texture, TRUE);
 		break;
+	case TGSI_OPCODE_TRUNC:
+		for (c = 0; c < 4; c++) {
+			if (!(mask & (1 << c)))
+				continue;
+			emit_cvt(pc, dst[c], src[0][c], -1,
+				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
+		}
+		break;
 	case TGSI_OPCODE_XPD:
 		temp = temp_temp(pc);
 		if (mask & (1 << 0)) {
@@ -1607,28 +2219,36 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		return FALSE;
 	}
 
+	if (brdc) {
+		if (sat)
+			emit_sat(pc, brdc, brdc);
+		for (c = 0; c < 4; c++)
+			if ((mask & (1 << c)) && dst[c] != brdc)
+				emit_mov(pc, dst[c], brdc);
+	} else
 	if (sat) {
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT,
-				 CVT_F32_F32);
+			/* In this case we saturate later, and dst[c] won't
+			 * be another temp_temp (and thus lost), since rdst
+			 * already is TEMP (see above). */
+			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
+				continue;
+			emit_sat(pc, rdst[c], dst[c]);
 		}
-	} else if (assimilate) {
-		for (c = 0; c < 4; c++)
-			if (rdst[c])
-				assimilate_temp(pc, rdst[c], dst[c]);
 	}
 
 	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
 		for (c = 0; c < 4; c++) {
 			if (!src[i][c])
 				continue;
+			src[i][c]->mod = 0;
 			if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
 				FREE(src[i][c]);
 			else
-			if (src[i][c]->acc == pc->insn_cur)
-				release_hw(pc, src[i][c]);
+			if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST)
+				FREE(src[i][c]); /* indirect constant */
 		}
 	}
 
@@ -1636,180 +2256,284 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	return TRUE;
 }
 
-/* Adjust a bitmask that indicates what components of a source are used,
- * we use this in tx_prep so we only load interpolants that are needed.
- */
-static void
-insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)
-{
-	const struct tgsi_instruction_ext_texture *tex;
-
-	switch (insn->Instruction.Opcode) {
-	case TGSI_OPCODE_DP3:
-		*mask = 0x7;
-		break;
-	case TGSI_OPCODE_DP4:
-	case TGSI_OPCODE_DPH:
-		*mask = 0xF;
-		break;
-	case TGSI_OPCODE_LIT:
-		*mask = 0xB;
-		break;
-	case TGSI_OPCODE_RCP:
-	case TGSI_OPCODE_RSQ:
-		*mask = 0x1;
-		break;
-	case TGSI_OPCODE_TEX:
-	case TGSI_OPCODE_TXP:
-		assert(insn->Instruction.Extended);
-		tex = &insn->InstructionExtTexture;
-
-		*mask = 0x7;
-		if (tex->Texture == TGSI_TEXTURE_1D)
-			*mask = 0x1;
-		else
-		if (tex->Texture == TGSI_TEXTURE_2D)
-			*mask = 0x3;
-
-		if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
-			*mask |= 0x8;
-		break;
-	default:
-		break;
-	}
-}
-
 static void
-prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok,
-		  unsigned *r_usage[2])
+prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
 {
-	const struct tgsi_full_instruction *insn;
+	struct nv50_reg *reg = NULL;
 	const struct tgsi_full_src_register *src;
 	const struct tgsi_dst_register *dst;
+	unsigned i, c, k, mask;
 
-	unsigned i, c, k, n, mask, *acc_p;
-
-	insn = &tok->FullInstruction;
 	dst = &insn->FullDstRegisters[0].DstRegister;
 	mask = dst->WriteMask;
 
-	if (!r_usage[0])
-		r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned));
-	if (!r_usage[1])
-		r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned));
+        if (dst->File == TGSI_FILE_TEMPORARY)
+                reg = pc->temp;
+        else
+        if (dst->File == TGSI_FILE_OUTPUT)
+                reg = pc->result;
 
-	if (dst->File == TGSI_FILE_TEMPORARY) {
+	if (reg) {
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
-			r_usage[0][dst->Index * 4 + c] = pc->insn_nr;
+			reg[dst->Index * 4 + c].acc = pc->insn_nr;
 		}
 	}
 
 	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
 		src = &insn->FullSrcRegisters[i];
 
-		switch (src->SrcRegister.File) {
-		case TGSI_FILE_TEMPORARY:
-			acc_p = r_usage[0];
-			break;
-		case TGSI_FILE_INPUT:
-			acc_p = r_usage[1];
-			break;
-		default:
+		if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
+			reg = pc->temp;
+		else
+		if (src->SrcRegister.File == TGSI_FILE_INPUT)
+			reg = pc->attr;
+		else
 			continue;
-		}
 
-		insn_adjust_mask(insn, &mask);
+		mask = nv50_tgsi_src_mask(insn, i);
 
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
+			k = tgsi_util_get_full_src_register_swizzle(src, c);
 
-			k = tgsi_util_get_full_src_register_extswizzle(src, c);
-			switch (k) {
-			case TGSI_EXTSWIZZLE_X:
-			case TGSI_EXTSWIZZLE_Y:
-			case TGSI_EXTSWIZZLE_Z:
-			case TGSI_EXTSWIZZLE_W:
-				n = src->SrcRegister.Index * 4 + k;
-				acc_p[n] = pc->insn_nr;
-				break;
-			default:
-				break;
-			}
+			reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
 		}
 	}
 }
 
+/* Returns a bitmask indicating which dst writes need to go to
+ * temporaries first to avoid 'corrupting' sources that are still unread.
+ *
+ * m[i]   (out) indicate component to write in the i-th position
+ * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
+ */
+static unsigned
+nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
+{
+	unsigned i, c, x, unsafe = 0; /* result mask, only OR-ed into below */
+
+	for (c = 0; c < 4; c++)
+		m[c] = c;
+
+	/* Swap as long as a dst component written earlier is depended on
+	 * by one written later, but the next one isn't depended on by it.
+	 */
+	for (c = 0; c < 3; c++) {
+		if (rdep[m[c + 1]] & (1 << m[c]))
+			continue; /* if next one is depended on by us */
+		for (i = c + 1; i < 4; i++)
+			/* if we are depended on by a later one */
+			if (rdep[m[c]] & (1 << m[i]))
+				break;
+		if (i == 4)
+			continue;
+		/* now, swap */
+		x = m[c];
+		m[c] = m[c + 1];
+		m[c + 1] = x;
+
+		/* restart (the loop's c++ makes the scan resume at c == 1) */
+		c = 0;
+	}
+
+	/* mark dependencies that could not be resolved by reordering */
+	for (i = 0; i < 3; ++i)
+		for (c = i + 1; c < 4; ++c)
+			if (rdep[m[i]] & (1 << m[c]))
+				unsafe |= (1 << i);
+
+	/* NOTE: $unsafe is with respect to order, not component */
+	return unsafe;
+}
+
+/* Select a suitable dst register for broadcasting scalar results, or
+ * return NULL if we have to allocate an extra TEMP. 'mask' marks dst
+ * components that must not be picked when the dst is a TEMPORARY.
+ * If e.g. only 1 component is written, we may also emit the final
+ * result to a write-only register.
+ */
+static struct nv50_reg *
+tgsi_broadcast_dst(struct nv50_pc *pc,
+		   const struct tgsi_full_dst_register *fd, unsigned mask)
+{
+	if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
+		int c = ffs(~mask & fd->DstRegister.WriteMask); /* 1-based, 0 = none */
+		if (c)
+			return tgsi_dst(pc, c - 1, fd);
+	} else {
+		int c = ffs(fd->DstRegister.WriteMask) - 1; /* lowest written comp */
+		if ((1 << c) == fd->DstRegister.WriteMask) /* exactly one bit set */
+			return tgsi_dst(pc, c, fd);
+	}
+	/* NOTE(review): WriteMask == 0 would give c == -1 in the else branch */
+	return NULL;
+}
+
+/* Scan source swizzles and return a bitmask indicating dst regs that
+ * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
+ */
 static unsigned
-load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid,
-	       int *aid, int *p_oid)
+nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
+		       unsigned rdep[4])
 {
-	struct nv50_reg *iv;
-	int oid, c, n;
-	unsigned mask = 0;
+	const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
+	const struct tgsi_full_src_register *fs;
+	unsigned i, deqs = 0;
 
-	iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p;
+	for (i = 0; i < 4; ++i)
+		rdep[i] = 0;
 
-	for (c = 0, n = i * 4; c < 4; c++, n++) {
-		oid = (*p_oid)++;
-		pc->attr[n].type = P_TEMP;
-		pc->attr[n].index = i;
+	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
+		unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
+		boolean neg_supp = negate_supported(insn, i);
 
-		if (pc->attr[n].acc == acc[n])
+		fs = &insn->FullSrcRegisters[i];
+		if (fs->SrcRegister.File != fd->DstRegister.File ||
+		    fs->SrcRegister.Index != fd->DstRegister.Index)
 			continue;
-		mask |= (1 << c);
 
-		pc->attr[n].acc = acc[n];
-		pc->attr[n].rhw = pc->attr[n].hw = -1;
-		alloc_reg(pc, &pc->attr[n]);
+		for (chn = 0; chn < 4; ++chn) {
+			unsigned s, c;
+
+			if (!(mask & (1 << chn))) /* src is not read */
+				continue;
+			c = tgsi_util_get_full_src_register_swizzle(fs, chn);
+			s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
+
+			if (!(fd->DstRegister.WriteMask & (1 << c)))
+				continue;
 
-		pc->attr[n].rhw = (*aid)++;
-		emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]);
+			/* no danger if src is copied to TEMP first */
+			if ((s != TGSI_UTIL_SIGN_KEEP) &&
+			    (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
+				continue;
 
-		pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4));
-		(*mid)++;
-		pc->p->cfg.fp.regs[1] += 0x00010001;
+			rdep[c] |= nv50_tgsi_dst_revdep(
+				insn->Instruction.Opcode, i, chn);
+			deqs |= (1 << c);
+		}
 	}
 
-	return mask;
+	return deqs;
 }
 
 static boolean
-nv50_program_tx_prep(struct nv50_pc *pc)
+nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 {
-	struct tgsi_parse_context p;
-	boolean ret = FALSE;
-	unsigned i, c;
-	unsigned fcol, bcol, fcrd, depr;
+	struct tgsi_full_instruction insn = tok->FullInstruction;
+	const struct tgsi_full_dst_register *fd;
+	unsigned i, deqs, rdep[4], m[4];
+
+	fd = &tok->FullInstruction.FullDstRegisters[0];
+	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
 
-	/* count (centroid) perspective interpolations */
-	unsigned centroid_loads = 0;
-	unsigned perspect_loads = 0;
+	if (is_scalar_op(insn.Instruction.Opcode)) {
+		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
+		if (!pc->r_brdc)
+			pc->r_brdc = temp_temp(pc);
+		return nv50_program_tx_insn(pc, &insn);
+	}
+	pc->r_brdc = NULL;
 
-	/* track register access for temps and attrs */
-	unsigned *r_usage[2];
-	r_usage[0] = NULL;
-	r_usage[1] = NULL;
+	if (!deqs)
+		return nv50_program_tx_insn(pc, &insn);
 
-	depr = fcol = bcol = fcrd = 0xffff;
+	deqs = nv50_revdep_reorder(m, rdep);
 
-	if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-		pc->p->cfg.fp.regs[0] = 0x01000404;
-		pc->p->cfg.fp.regs[1] = 0x00000400;
+	for (i = 0; i < 4; ++i) {
+		assert(pc->r_dst[m[i]] == NULL);
+
+		insn.FullDstRegisters[0].DstRegister.WriteMask =
+			fd->DstRegister.WriteMask & (1 << m[i]);
+
+		if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
+			continue;
+
+		if (deqs & (1 << i))
+			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
+
+		if (!nv50_program_tx_insn(pc, &insn))
+			return FALSE;
 	}
 
-	tgsi_parse_init(&p, pc->p->pipe.tokens);
-	while (!tgsi_parse_end_of_tokens(&p)) {
-		const union tgsi_full_token *tok = &p.FullToken;
+	for (i = 0; i < 4; i++) {
+		struct nv50_reg *reg = pc->r_dst[i];
+		if (!reg)
+			continue;
+		pc->r_dst[i] = NULL;
+
+		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
+			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
+		else
+			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
+		free_temp(pc, reg);
+	}
+
+	return TRUE;
+}
+
+static void
+load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	struct nv50_reg *iv, **ppiv;
+	unsigned mode = pc->interp_mode[reg->index];
+	/* emit the interpolation of input 'reg' using the mode recorded for it */
+	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p;
+	iv = *ppiv; /* helper reg cached in pc for this sampling mode, or NULL */
+	/* first perspective input of this kind: create the helper only once */
+	if ((mode & INTERP_PERSPECTIVE) && !iv) {
+		iv = *ppiv = alloc_temp(pc, NULL);
+		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
+		/* NOTE(review): flop 0 is presumably a reciprocal - confirm */
+		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
+		emit_flop(pc, 0, iv, iv);
+
+		/* XXX: when loading interpolants dynamically, move these
+		 * to the program head, or make sure it can't be skipped.
+		 */
+	}
+
+	emit_interp(pc, reg, iv, mode);
+}
+
+/* The face input is always at v[255] (varying space), with a value of 0
+ * for back-facing, and 0xffffffff for front-facing. It is loaded into 'a'
+ * and then combined with an immediate 1.0f through a bitwise AND. */
+static void
+load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a)
+{
+	struct nv50_reg *one = alloc_immd(pc, 1.0f);
+
+	assert(a->rhw == -1);
+	alloc_reg(pc, a); /* do this before rhw is set */
+	a->rhw = 255; /* the v[255] slot mentioned above */
+	load_interpolant(pc, a);
+	emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND); /* presumably a = a & 1.0f */
+
+	FREE(one); /* the immediate descriptor is only needed for the AND */
+}
+
+static boolean
+nv50_program_tx_prep(struct nv50_pc *pc)
+{
+	struct tgsi_parse_context tp;
+	struct nv50_program *p = pc->p;
+	boolean ret = FALSE;
+	unsigned i, c, flat_nr = 0;
 
-		tgsi_parse_token(&p);
+	tgsi_parse_init(&tp, pc->p->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&tp)) {
+		const union tgsi_full_token *tok = &tp.FullToken;
+
+		tgsi_parse_token(&tp);
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_IMMEDIATE:
 		{
 			const struct tgsi_full_immediate *imm =
-				&p.FullToken.FullImmediate;
+				&tp.FullToken.FullImmediate;
 
 			ctor_immd(pc, imm->u[0].Float,
 				      imm->u[1].Float,
@@ -1820,88 +2544,69 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 		case TGSI_TOKEN_TYPE_DECLARATION:
 		{
 			const struct tgsi_full_declaration *d;
-			unsigned last, first, mode;
+			unsigned si, last, first, mode;
 
-			d = &p.FullToken.FullDeclaration;
+			d = &tp.FullToken.FullDeclaration;
 			first = d->DeclarationRange.First;
 			last = d->DeclarationRange.Last;
 
 			switch (d->Declaration.File) {
 			case TGSI_FILE_TEMPORARY:
-				if (pc->temp_nr < (last + 1))
-					pc->temp_nr = last + 1;
 				break;
 			case TGSI_FILE_OUTPUT:
-				if (pc->result_nr < (last + 1))
-					pc->result_nr = last + 1;
-
-				if (!d->Declaration.Semantic)
+				if (!d->Declaration.Semantic ||
+				    p->type == PIPE_SHADER_FRAGMENT)
 					break;
 
+				si = d->Semantic.SemanticIndex;
 				switch (d->Semantic.SemanticName) {
-				case TGSI_SEMANTIC_POSITION:
-					depr = first;
-					pc->p->cfg.fp.regs[2] |= 0x00000100;
-					pc->p->cfg.fp.regs[3] |= 0x00000011;
+				case TGSI_SEMANTIC_BCOLOR:
+					p->cfg.two_side[si].hw = first;
+					if (p->cfg.io_nr > first)
+						p->cfg.io_nr = first;
+					break;
+				case TGSI_SEMANTIC_PSIZE:
+					p->cfg.psiz = first;
+					if (p->cfg.io_nr > first)
+						p->cfg.io_nr = first;
 					break;
+					/*
+				case TGSI_SEMANTIC_CLIP_DISTANCE:
+					p->cfg.clpd = MIN2(p->cfg.clpd, first);
+					break;
+					*/
 				default:
 					break;
 				}
-
 				break;
 			case TGSI_FILE_INPUT:
 			{
-				if (pc->attr_nr < (last + 1))
-					pc->attr_nr = last + 1;
-
-				if (pc->p->type != PIPE_SHADER_FRAGMENT)
+				if (p->type != PIPE_SHADER_FRAGMENT)
 					break;
 
 				switch (d->Declaration.Interpolate) {
 				case TGSI_INTERPOLATE_CONSTANT:
 					mode = INTERP_FLAT;
+					flat_nr++;
 					break;
 				case TGSI_INTERPOLATE_PERSPECTIVE:
 					mode = INTERP_PERSPECTIVE;
+					p->cfg.regs[1] |= 0x08 << 24;
 					break;
 				default:
 					mode = INTERP_LINEAR;
 					break;
 				}
-
-				if (d->Declaration.Semantic) {
-					switch (d->Semantic.SemanticName) {
-					case TGSI_SEMANTIC_POSITION:
-						fcrd = first;
-						break;
-					case TGSI_SEMANTIC_COLOR:
-						fcol = first;
-						mode = INTERP_PERSPECTIVE;
-						break;
-					case TGSI_SEMANTIC_BCOLOR:
-						bcol = first;
-						mode = INTERP_PERSPECTIVE;
-						break;
-					}
-				}
-
-				if (d->Declaration.Centroid) {
+				if (d->Declaration.Centroid)
 					mode |= INTERP_CENTROID;
-					if (mode & INTERP_PERSPECTIVE)
-						centroid_loads++;
-				} else
-				if (mode & INTERP_PERSPECTIVE)
-					perspect_loads++;
 
 				assert(last < 32);
 				for (i = first; i <= last; i++)
 					pc->interp_mode[i] = mode;
 			}
 				break;
+			case TGSI_FILE_ADDRESS:
 			case TGSI_FILE_CONSTANT:
-				if (pc->param_nr < (last + 1))
-					pc->param_nr = last + 1;
-				break;
 			case TGSI_FILE_SAMPLER:
 				break;
 			default:
@@ -1913,182 +2618,169 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			break;
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			pc->insn_nr++;
-			prep_inspect_insn(pc, tok, r_usage);
+			prep_inspect_insn(pc, &tok->FullInstruction);
 			break;
 		default:
 			break;
 		}
 	}
 
-	if (pc->temp_nr) {
-		pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->temp)
-			goto out_err;
+	if (p->type == PIPE_SHADER_VERTEX) {
+		int rid = 0;
 
-		for (i = 0; i < pc->temp_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->temp[i*4+c].type = P_TEMP;
-				pc->temp[i*4+c].hw = -1;
-				pc->temp[i*4+c].rhw = -1;
-				pc->temp[i*4+c].index = i;
-				pc->temp[i*4+c].acc = r_usage[0][i*4+c];
+		for (i = 0; i < pc->attr_nr * 4; ++i) {
+			if (pc->attr[i].acc) {
+				pc->attr[i].hw = rid++;
+				p->cfg.attr[i / 32] |= 1 << (i % 32);
 			}
 		}
-	}
 
-	if (pc->attr_nr) {
-		int oid = 4, mid = 4, aid = 0;
-		/* oid = VP output id
-		 * aid = FP attribute/interpolant id
-		 * mid = VP output mapping field ID
-		 */
+		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
+			p->cfg.io[i].hw = rid;
+			p->cfg.io[i].id_vp = i;
 
-		pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->attr)
-			goto out_err;
-
-		if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-			/* position should be loaded first */
-			if (fcrd != 0xffff) {
-				unsigned mask;
-				mid = 0;
-				mask = load_fp_attrib(pc, fcrd, r_usage[1],
-						      &mid, &aid, &oid);
-				oid = 0;
-				pc->p->cfg.fp.regs[1] |= (mask << 24);
-				pc->p->cfg.fp.map[0] = 0x04040404 * fcrd;
-			}
-			pc->p->cfg.fp.map[0] += 0x03020100;
-
-			/* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */
-
-			if (perspect_loads) {
-				pc->iv_p = alloc_temp(pc, NULL);
-
-				if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) {
-					pc->p->cfg.fp.regs[1] |= 0x08000000;
-					pc->iv_p->rhw = aid++;
-					emit_interp(pc, pc->iv_p, NULL,
-						    INTERP_LINEAR);
-					emit_flop(pc, 0, pc->iv_p, pc->iv_p);
-				} else {
-					pc->iv_p->rhw = aid - 1;
-					emit_flop(pc, 0, pc->iv_p,
-						  &pc->attr[fcrd * 4 + 3]);
-				}
+			for (c = 0; c < 4; ++c) {
+				int n = i * 4 + c;
+				if (!pc->result[n].acc)
+					continue;
+				pc->result[n].hw = rid++;
+				p->cfg.io[i].mask |= 1 << c;
 			}
+		}
 
-			if (centroid_loads) {
-				pc->iv_c = alloc_temp(pc, NULL);
-				pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++;
-				emit_interp(pc, pc->iv_c, NULL,
-					    INTERP_CENTROID);
-				emit_flop(pc, 0, pc->iv_c, pc->iv_c);
-				pc->p->cfg.fp.regs[1] |= 0x08000000;
-			}
+		for (c = 0; c < 2; ++c)
+			if (p->cfg.two_side[c].hw < 0x40)
+				p->cfg.two_side[c] = p->cfg.io[
+					p->cfg.two_side[c].hw];
 
-			for (c = 0; c < 4; c++) {
-				/* I don't know what these values do, but
-				 * let's set them like the blob does:
-				 */
-				if (fcol != 0xffff && r_usage[1][fcol * 4 + c])
-					pc->p->cfg.fp.regs[0] += 0x00010000;
-				if (bcol != 0xffff && r_usage[1][bcol * 4 + c])
-					pc->p->cfg.fp.regs[0] += 0x00010000;
-			}
+		if (p->cfg.psiz < 0x40)
+			p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
+	} else
+	if (p->type == PIPE_SHADER_FRAGMENT) {
+		int rid, aid;
+		unsigned n = 0, m = pc->attr_nr - flat_nr;
 
-			for (i = 0; i < pc->attr_nr; i++)
-				load_fp_attrib(pc, i, r_usage[1],
-					       &mid, &aid, &oid);
+		pc->allow32 = TRUE;
 
-			if (pc->iv_p)
-				free_temp(pc, pc->iv_p);
-			if (pc->iv_c)
-				free_temp(pc, pc->iv_c);
+		int base = (TGSI_SEMANTIC_POSITION ==
+			    p->info.input_semantic_name[0]) ? 0 : 1;
 
-			pc->p->cfg.fp.high_map = (mid / 4);
-			pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0);
-		} else {
-			/* vertex program */
-			for (i = 0; i < pc->attr_nr * 4; i++) {
-				pc->p->cfg.vp.attr[aid / 32] |=
-					(1 << (aid % 32));
-				pc->attr[i].type = P_ATTR;
-				pc->attr[i].hw = aid++;
-				pc->attr[i].index = i / 4;
+		/* non-flat interpolants have to be mapped to
+		 * the lower hardware IDs, so sort them:
+		 */
+		for (i = 0; i < pc->attr_nr; i++) {
+			if (pc->interp_mode[i] == INTERP_FLAT) {
+				p->cfg.io[m].id_vp = i + base;
+				p->cfg.io[m++].id_fp = i;
+			} else {
+				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
+					p->cfg.io[n].linear = TRUE;
+				p->cfg.io[n].id_vp = i + base;
+				p->cfg.io[n++].id_fp = i;
 			}
 		}
-	}
 
-	if (pc->result_nr) {
-		int rid = 0;
+		if (!base) /* set w-coordinate mask from perspective interp */
+			p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
 
-		pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->result)
-			goto out_err;
+		aid = popcnt4( /* if fcrd isn't contained in cfg.io */
+			base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
 
-		for (i = 0; i < pc->result_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				if (pc->p->type == PIPE_SHADER_FRAGMENT) {
-					pc->result[i*4+c].type = P_TEMP;
-					pc->result[i*4+c].hw = -1;
-					pc->result[i*4+c].rhw = (i == depr) ?
-						-1 : rid++;
-				} else {
-					pc->result[i*4+c].type = P_RESULT;
-					pc->result[i*4+c].hw = rid++;
-				}
-				pc->result[i*4+c].index = i;
+		for (n = 0; n < pc->attr_nr; ++n) {
+			p->cfg.io[n].hw = rid = aid;
+			i = p->cfg.io[n].id_fp;
+
+			if (p->info.input_semantic_name[n] ==
+			    TGSI_SEMANTIC_FACE) {
+				load_frontfacing(pc, &pc->attr[i * 4]);
+				continue;
 			}
 
-			if (pc->p->type == PIPE_SHADER_FRAGMENT &&
-			    depr != 0xffff) {
-				pc->result[depr * 4 + 2].rhw =
-					(pc->result_nr - 1) * 4;
+			for (c = 0; c < 4; ++c) {
+				if (!pc->attr[i * 4 + c].acc)
+					continue;
+				pc->attr[i * 4 + c].rhw = rid++;
+				p->cfg.io[n].mask |= 1 << c;
+
+				load_interpolant(pc, &pc->attr[i * 4 + c]);
 			}
+			aid += popcnt4(p->cfg.io[n].mask);
 		}
-	}
 
-	if (pc->param_nr) {
-		int rid = 0;
+		if (!base)
+			p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
 
-		pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg));
-		if (!pc->param)
-			goto out_err;
+		m = popcnt4(p->cfg.regs[1] >> 24);
 
-		for (i = 0; i < pc->param_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->param[i*4+c].type = P_CONST;
-				pc->param[i*4+c].hw = rid++;
-				pc->param[i*4+c].index = i;
+		/* set count of non-position inputs and of non-flat
+		 * non-position inputs for FP_INTERPOLANT_CTRL
+		 */
+		p->cfg.regs[1] |= aid - m;
+
+		if (flat_nr) {
+			i = p->cfg.io[pc->attr_nr - flat_nr].hw;
+			p->cfg.regs[1] |= (i - m) << 16;
+		} else
+			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
+
+		/* mark color semantic for light-twoside */
+		n = 0x40;
+		for (i = 0; i < pc->attr_nr; i++) {
+			ubyte si, sn;
+
+			sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
+			si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
+
+			if (sn == TGSI_SEMANTIC_COLOR) {
+				p->cfg.two_side[si] = p->cfg.io[i];
+
+				/* increase colour count */
+				p->cfg.regs[0] += popcnt4(
+					p->cfg.two_side[si].mask) << 16;
+
+				n = MIN2(n, p->cfg.io[i].hw - m);
 			}
 		}
+		if (n < 0x40)
+			p->cfg.regs[0] += n;
+
+		/* Initialize FP results:
+		 * FragDepth is always first TGSI and last hw output
+		 */
+		i = p->info.writes_z ? 4 : 0;
+		for (rid = 0; i < pc->result_nr * 4; i++)
+			pc->result[i].rhw = rid++;
+		if (p->info.writes_z)
+			pc->result[2].rhw = rid;
+
+		p->cfg.high_result = rid;
+
+		/* separate/different colour results for MRTs ? */
+		if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
+			p->cfg.regs[2] |= 1;
 	}
 
 	if (pc->immd_nr) {
 		int rid = 0;
 
-		pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));
+		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
 		if (!pc->immd)
 			goto out_err;
 
 		for (i = 0; i < pc->immd_nr; i++) {
-			for (c = 0; c < 4; c++) {
-				pc->immd[i*4+c].type = P_IMMD;
-				pc->immd[i*4+c].hw = rid++;
-				pc->immd[i*4+c].index = i;
-			}
+			for (c = 0; c < 4; c++, rid++)
+				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
 		}
 	}
 
 	ret = TRUE;
 out_err:
-	if (r_usage[0])
-		FREE(r_usage[0]);
-	if (r_usage[1])
-		FREE(r_usage[1]);
+	if (pc->iv_p)
+		free_temp(pc, pc->iv_p);
+	if (pc->iv_c)
+		free_temp(pc, pc->iv_c);
 
-	tgsi_parse_free(&p);
+	tgsi_parse_free(&tp);
 	return ret;
 }
 
@@ -2110,18 +2802,175 @@ free_nv50_pc(struct nv50_pc *pc)
 }
 
 static boolean
+ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
+{
+	int i, c;
+	unsigned rtype[2] = { P_ATTR, P_RESULT }; /* reg type: inputs, outputs */
+	/* bind the program and size each register file as file_max + 1 */
+	pc->p = p;
+	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
+	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
+	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
+	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
+	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
+	assert(pc->addr_nr <= 2);
+
+	p->cfg.high_temp = 4;
+	/* 0x40 serves as "not set": later code only uses values < 0x40 */
+	p->cfg.two_side[0].hw = 0x40;
+	p->cfg.two_side[1].hw = 0x40;
+
+	switch (p->type) {
+	case PIPE_SHADER_VERTEX:
+		p->cfg.psiz = 0x40;
+		p->cfg.clpd = 0x40;
+		p->cfg.io_nr = pc->result_nr;
+		break;
+	case PIPE_SHADER_FRAGMENT:
+		rtype[0] = rtype[1] = P_TEMP; /* FP inputs/outputs are built as TEMPs */
+
+		p->cfg.regs[0] = 0x01000004;
+		p->cfg.io_nr = pc->attr_nr;
+
+		if (p->info.writes_z) {
+			p->cfg.regs[2] |= 0x00000100;
+			p->cfg.regs[3] |= 0x00000011;
+		}
+		if (p->info.uses_kill)
+			p->cfg.regs[2] |= 0x00100000;
+		break;
+	}
+	/* Below: one nv50_reg per component; ctor_reg(reg, type, index, hw). */
+	if (pc->temp_nr) {
+		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->temp)
+			return FALSE;
+
+		for (i = 0; i < pc->temp_nr * 4; ++i)
+			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
+	}
+	/* On failure FALSE is returned and earlier arrays stay attached to pc. */
+	if (pc->attr_nr) {
+		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->attr)
+			return FALSE;
+
+		for (i = 0; i < pc->attr_nr * 4; ++i)
+			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
+	}
+
+	if (pc->result_nr) {
+		unsigned nr = pc->result_nr * 4;
+
+		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
+		if (!pc->result)
+			return FALSE;
+
+		for (i = 0; i < nr; ++i)
+			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
+	}
+
+	if (pc->param_nr) {
+		int rid = 0; /* running component id, passed as hw for constants */
+
+		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
+		if (!pc->param)
+			return FALSE;
+
+		for (i = 0; i < pc->param_nr; ++i)
+			for (c = 0; c < 4; ++c, ++rid)
+				ctor_reg(&pc->param[rid], P_CONST, i, rid);
+	}
+
+	if (pc->addr_nr) {
+		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); /* NULLs */
+		if (!pc->addr)
+			return FALSE;
+	}
+	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
+		ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);
+
+	return TRUE;
+}
+
+static void
+nv50_fp_move_results(struct nv50_pc *pc)
+{
+	struct nv50_reg reg;
+	unsigned i;
+	/* scratch TEMP descriptor; its hw is retargeted for every move below */
+	ctor_reg(&reg, P_TEMP, -1, -1);
+	/* copy each result whose current hw differs from its required rhw */
+	for (i = 0; i < pc->result_nr * 4; ++i) {
+		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
+			continue; /* no required slot, or no register assigned */
+		if (pc->result[i].rhw != pc->result[i].hw) {
+			reg.hw = pc->result[i].rhw;
+			emit_mov(pc, &reg, &pc->result[i]);
+		}
+	}
+}
+
+static void
+nv50_program_fixup_insns(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e, *prev = NULL, **bra_list;
+	unsigned i, n, pos;
+	/* NOTE(review): the CALLOC result below is not checked for NULL */
+	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
+
+	/* Collect branch instructions, we need to adjust their offsets
+	 * when converting 32 bit instructions to 64 bit ones
+	 */
+	for (n = 0, e = pc->p->exec_head; e; e = e->next)
+		if (e->param.index >= 0 && !e->param.mask)
+			bra_list[n++] = e;
+
+	/* Make sure we don't have any single 32 bit instructions. */
+	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
+		pos += is_long(e) ? 2 : 1; /* position counted in 32 bit words */
+
+		if ((pos & 1) && (!e->next || is_long(e->next))) {
+			for (i = 0; i < n; ++i)
+				if (bra_list[i]->param.index >= pos)
+					bra_list[i]->param.index += 1;
+			convert_to_long(pc, e);
+			++pos;
+		}
+		if (e->next)
+			prev = e; /* ends up as the instruction before exec_tail */
+	}
+
+	assert(!is_immd(pc->p->exec_head));
+	assert(!is_immd(pc->p->exec_tail));
+
+	/* last instruction must be long so it can have the end bit set */
+	if (!is_long(pc->p->exec_tail)) {
+		convert_to_long(pc, pc->p->exec_tail);
+		if (prev)
+			convert_to_long(pc, prev);
+	}
+	assert(!(pc->p->exec_tail->inst[1] & 2));
+	/* set the end-bit */
+	pc->p->exec_tail->inst[1] |= 1;
+
+	FREE(bra_list);
+}
+
+static boolean
 nv50_program_tx(struct nv50_program *p)
 {
 	struct tgsi_parse_context parse;
 	struct nv50_pc *pc;
-	unsigned k;
 	boolean ret;
 
 	pc = CALLOC_STRUCT(nv50_pc);
 	if (!pc)
 		return FALSE;
-	pc->p = p;
-	pc->p->cfg.high_temp = 4;
+
+	ret = ctor_nv50_pc(pc, p);
+	if (ret == FALSE)
+		goto out_cleanup;
 
 	ret = nv50_program_tx_prep(pc);
 	if (ret == FALSE)
@@ -2141,7 +2990,7 @@ nv50_program_tx(struct nv50_program *p)
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
 			++pc->insn_cur;
-			ret = nv50_program_tx_insn(pc, tok);
+			ret = nv50_tgsi_insn(pc, tok);
 			if (ret == FALSE)
 				goto out_err;
 			break;
@@ -2150,48 +2999,10 @@ nv50_program_tx(struct nv50_program *p)
 		}
 	}
 
-	if (p->type == PIPE_SHADER_FRAGMENT) {
-		struct nv50_reg out;
+	if (pc->p->type == PIPE_SHADER_FRAGMENT)
+		nv50_fp_move_results(pc);
 
-		out.type = P_TEMP;
-		for (k = 0; k < pc->result_nr * 4; k++) {
-			if (pc->result[k].rhw == -1)
-				continue;
-			if (pc->result[k].hw != pc->result[k].rhw) {
-				out.hw = pc->result[k].rhw;
-				emit_mov(pc, &out, &pc->result[k]);
-			}
-			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
-				pc->p->cfg.high_result = pc->result[k].rhw + 1;
-		}
-	}
-
-	/* look for single half instructions and make them long */
-	struct nv50_program_exec *e, *e_prev;
-
-	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
-		if (!is_long(e))
-			k++;
-
-		if (!e->next || is_long(e->next)) {
-			if (k & 1)
-				convert_to_long(pc, e);
-			k = 0;
-		}
-
-		if (e->next)
-			e_prev = e;
-	}
-
-	if (!is_long(pc->p->exec_tail)) {
-		/* this may occur if moving FP results */
-		assert(e_prev && !is_long(e_prev));
-		convert_to_long(pc, e_prev);
-		convert_to_long(pc, pc->p->exec_tail);
-	}
-
-	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
-	pc->p->exec_tail->inst[1] |= 0x00000001;
+	nv50_program_fixup_insns(pc);
 
 	p->param_nr = pc->param_nr * 4;
 	p->immd_nr = pc->immd_nr * 4;
@@ -2258,30 +3069,19 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
 					 p->immd_nr, NV50_CB_PMISC);
 	}
 
-	if (!p->data[1] && p->param_nr) {
-		struct nouveau_resource *heap =
-			nv50->screen->parm_heap[p->type];
-
-		if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) {
-			while (heap->next && heap->size < p->param_nr) {
-				struct nv50_program *evict = heap->next->priv;
-				nouveau_resource_free(&evict->data[1]);
-			}
-
-			if (nouveau_resource_alloc(heap, p->param_nr, p,
-						   &p->data[1]))
-				assert(0);
-		}
-	}
+	assert(p->param_nr <= 512);
 
 	if (p->param_nr) {
-		unsigned cbuf = NV50_CB_PVP;
+		unsigned cb;
 		float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
 					     PIPE_BUFFER_USAGE_CPU_READ);
-		if (p->type == PIPE_SHADER_FRAGMENT)
-			cbuf = NV50_CB_PFP;
-		nv50_program_upload_data(nv50, map, p->data[1]->start,
-					 p->param_nr, cbuf);
+
+		if (p->type == PIPE_SHADER_VERTEX)
+			cb = NV50_CB_PVP;
+		else
+			cb = NV50_CB_PFP;
+
+		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
 		pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
 	}
 }
@@ -2290,11 +3090,8 @@ static void
 nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 {
 	struct nouveau_channel *chan = nv50->screen->base.channel;
-	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_program_exec *e;
-	struct nouveau_stateobj *so;
-	const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
-	unsigned start, count, *up, *ptr;
+	uint32_t *up, i;
 	boolean upload = FALSE;
 
 	if (!p->bo) {
@@ -2303,32 +3100,46 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 		upload = TRUE;
 	}
 
-	if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||
-		(p->data[1] && p->data[1]->start != p->data_start[1])) {
-		for (e = p->exec_head; e; e = e->next) {
-			unsigned ei, ci, bs;
+	if (p->data[0] && p->data[0]->start != p->data_start[0])
+		upload = TRUE;
 
-			if (e->param.index < 0)
-				continue;
+	if (!upload)
+		return;
+
+	up = MALLOC(p->exec_size * 4);
+
+	for (i = 0, e = p->exec_head; e; e = e->next) {
+		unsigned ei, ci, bs;
+
+		if (e->param.index >= 0 && e->param.mask) {
 			bs = (e->inst[1] >> 22) & 0x07;
 			assert(bs < 2);
 			ei = e->param.shift >> 5;
-			ci = e->param.index + p->data[bs]->start;
+			ci = e->param.index;
+			if (bs == 0)
+				ci += p->data[bs]->start;
 
 			e->inst[ei] &= ~e->param.mask;
 			e->inst[ei] |= (ci << e->param.shift);
+		} else
+		if (e->param.index >= 0) {
+			/* zero mask means param is a jump/branch offset */
+			assert(!(e->param.index & 1));
+			/* seem to be 8 byte steps */
+			ei = (e->param.index >> 1) + 0 /* START_ID */;
+
+			e->inst[0] &= 0xf0000fff;
+			e->inst[0] |= ei << 12;
 		}
 
-		if (p->data[0])
-			p->data_start[0] = p->data[0]->start;
-		if (p->data[1])
-			p->data_start[1] = p->data[1]->start;
-
-		upload = TRUE;
+		up[i++] = e->inst[0];
+		if (is_long(e))
+			up[i++] = e->inst[1];
 	}
+	assert(i == p->exec_size);
 
-	if (!upload)
-		return;
+	if (p->data[0])
+		p->data_start[0] = p->data[0]->start;
 
 #ifdef NV50_PROGRAM_DUMP
 	NOUVEAU_ERR("-------\n");
@@ -2338,45 +3149,12 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
 	}
 #endif
-
-	up = ptr = MALLOC(p->exec_size * 4);
-	for (e = p->exec_head; e; e = e->next) {
-		*(ptr++) = e->inst[0];
-		if (is_long(e))
-			*(ptr++) = e->inst[1];
-	}
-
-	so = so_new(4,2);
-	so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
-	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0);
-	so_data  (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4));
-
-	start = 0; count = p->exec_size;
-	while (count) {
-		struct nouveau_channel *chan = nv50->screen->base.channel;
-		unsigned nr;
-
-		so_emit(chan, so);
-
-		nr = MIN2(count, 2047);
-		nr = MIN2(chan->pushbuf->remaining, nr);
-		if (chan->pushbuf->remaining < (nr + 3)) {
-			FIRE_RING(chan);
-			continue;
-		}
-
-		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
-		OUT_RING  (chan, (start << 8) | NV50_CB_PUPLOAD);
-		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr);
-		OUT_RINGp (chan, up + start, nr);
-
-		start += nr;
-		count -= nr;
-	}
+	nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM,
+			 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
+			 up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0,
+			 0, 0, p->exec_size * 4, 1, 1);
 
 	FREE(up);
-	so_ref(NULL, &so);
 }
 
 void
@@ -2402,8 +3180,8 @@ nv50_vertprog_validate(struct nv50_context *nv50)
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_LOW, 0, 0);
 	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
-	so_data  (so, p->cfg.vp.attr[0]);
-	so_data  (so, p->cfg.vp.attr[1]);
+	so_data  (so, p->cfg.attr[0]);
+	so_data  (so, p->cfg.attr[1]);
 	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
 	so_data  (so, p->cfg.high_result);
 	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
@@ -2421,7 +3199,6 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_program *p = nv50->fragprog;
 	struct nouveau_stateobj *so;
-	unsigned i;
 
 	if (!p->translated) {
 		nv50_program_validate(nv50, p);
@@ -2438,29 +3215,186 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 		      NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		      NOUVEAU_BO_LOW, 0, 0);
-	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
-	so_data  (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */
-	so_data  (so, 0x00000004);
-	so_data  (so, 0x00000000);
-	so_data  (so, 0x00000000);
-	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map);
-	for (i = 0; i < p->cfg.fp.high_map; i++)
-		so_data(so, p->cfg.fp.map[i]);
-	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2);
-	so_data  (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */
+	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
 	so_data  (so, p->cfg.high_temp);
 	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
 	so_data  (so, p->cfg.high_result);
 	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
-	so_data  (so, p->cfg.fp.regs[2]);
+	so_data  (so, p->cfg.regs[2]);
 	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
-	so_data  (so, p->cfg.fp.regs[3]);
+	so_data  (so, p->cfg.regs[3]);
 	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
 	so_data  (so, 0); /* program start offset */
 	so_ref(so, &nv50->state.fragprog);
 	so_ref(NULL, &so);
 }
 
+static void
+nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
+{
+	struct nv50_program *fp = nv50->fragprog;
+	struct nv50_program *vp = nv50->vertprog;
+	unsigned i, c, m = base;
+
+	/* Fill pntc[] with one nibble per FP input component, starting at
+	 * slot 'base'; nibble c + 1 marks component c for replacement.
+	 * XXX: can't work in all cases yet, needs TGSI_SEMANTIC_PNTC or a
+	 * sprite_coord_mode per FP input instead of per VP output. */
+	memset(pntc, 0, 8 * sizeof(uint32_t));
+
+	for (i = 0; i < fp->cfg.io_nr; i++) {
+		uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
+		unsigned n = popcnt4(fp->cfg.io[i].mask);
+
+		if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
+			m += n; /* not replaceable, skip its slots */
+			continue;
+		}
+
+		/* id_vp indexes the VP *outputs*, bounded by the VP's io_nr;
+		 * an FP input the VP does not write is taken as PointCoord */
+		if (j < vp->cfg.io_nr &&
+		    vp->info.output_semantic_name[j] == TGSI_SEMANTIC_GENERIC) {
+			uint8_t si = vp->info.output_semantic_index[j];
+			ubyte mode =
+				nv50->rasterizer->pipe.sprite_coord_mode[si];
+
+			if (mode == PIPE_SPRITE_COORD_NONE) {
+				m += n;
+				continue;
+			}
+		}
+
+		/* this is either PointCoord or replaced by sprite coords */
+		for (c = 0; c < 4; c++) {
+			if (!(fp->cfg.io[i].mask & (1 << c)))
+				continue;
+			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
+			++m;
+		}
+	}
+}
+/* Append p_map bytes for the FP-read components of one vec4; returns next id. */
+static int
+nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
+	       struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
+{
+	int c;
+	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
+	uint8_t *map = (uint8_t *)p_map; /* one map entry per byte */
+
+	for (c = 0; c < 4; ++c) {
+		if (mf & 1) { /* FP reads component c */
+			if (fpi->linear == TRUE)
+				lin[mid / 32] |= 1 << (mid % 32);
+			map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
+		}
+		/* NOTE(review): 0x40/0x41 presumably mean constant 0/1 - confirm */
+		oid += mv & 1; /* ids only advance over VP-written components */
+		mf >>= 1;
+		mv >>= 1;
+	}
+
+	return mid;
+}
+/* Link VP outputs to FP inputs: result map, semantic regs, 'lin' mask. */
+void
+nv50_linkage_validate(struct nv50_context *nv50)
+{
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_program *vp = nv50->vertprog;
+	struct nv50_program *fp = nv50->fragprog;
+	struct nouveau_stateobj *so;
+	struct nv50_sreg4 dummy, *vpo;
+	int i, n, c, m = 0;
+	uint32_t map[16], lin[4], reg[5], pcrd[8];
+
+	memset(map, 0, sizeof(map));
+	memset(lin, 0, sizeof(lin));
+
+	reg[1] = 0x00000004; /* low and high clip distance map ids */
+	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
+	reg[3] = 0x00000000; /* point size map id & enable */
+	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
+	reg[4] = fp->cfg.regs[1]; /* interpolant info */
+
+	dummy.linear = FALSE;
+	dummy.mask = 0xf; /* map all components of HPOS */
+	m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
+
+	dummy.mask = 0x0; /* from here on: stands in for missing VP outputs */
+	/* map[] holds one byte per slot: slot m lives in byte m % 4 of map[m / 4] */
+	if (vp->cfg.clpd < 0x40) {
+		for (c = 0; c < vp->cfg.clpd_nr; ++c, ++m)
+			map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8);
+		reg[1] = (m << 8);
+	}
+
+	reg[0] |= m << 8; /* adjust BFC0 id */
+
+	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
+	if (nv50->rasterizer->pipe.light_twoside) {
+		vpo = &vp->cfg.two_side[0];
+
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
+	}
+
+	reg[0] += m - 4; /* adjust FFC0 id */
+	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
+
+	i = 0;
+	if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
+		i = 1;
+	for (; i < fp->cfg.io_nr; i++) {
+		ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
+		ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
+
+		n = fp->cfg.io[i].id_vp;
+		if (n >= vp->cfg.io_nr ||
+		    vp->info.output_semantic_name[n] != sn ||
+		    vp->info.output_semantic_index[n] != si)
+			vpo = &dummy;
+		else
+			vpo = &vp->cfg.io[n];
+
+		m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
+	}
+
+	if (nv50->rasterizer->pipe.point_size_per_vertex) {
+		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
+		reg[3] = (m++ << 4) | 1;
+	}
+
+	/* now fill the stateobj */
+	so = so_new(64, 0);
+
+	n = (m + 3) / 4;
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+	so_data  (so, m);
+	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
+	so_datap (so, map, n);
+
+	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
+	so_datap (so, reg, 4);
+
+	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
+	so_data  (so, reg[4]);
+
+	so_method(so, tesla, 0x1540, 4);
+	so_datap (so, lin, 4);
+
+	if (nv50->rasterizer->pipe.point_sprite) {
+		nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
+
+		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
+		so_datap (so, pcrd, 8);
+	}
+
+	so_ref(so, &nv50->state.programs);
+	so_ref(NULL, &so);
+}
+
 void
 nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 {
@@ -2476,7 +3410,6 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 	nouveau_bo_ref(NULL, &p->bo);
 
 	nouveau_resource_free(&p->data[0]);
-	nouveau_resource_free(&p->data[1]);
 
 	p->translated = 0;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 096e0476aab..d78dee083f1 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -15,6 +15,15 @@ struct nv50_program_exec {
 	} param;
 };
 
+struct nv50_sreg4 { /* one vec4 shader input or output */
+	uint8_t hw; /* hw id of its first used component */
+	uint8_t id_vp; /* slot in the VP's output semantic arrays */
+	uint8_t id_fp; /* slot in the FP's input semantic arrays */
+
+	uint8_t mask; /* bit c set: component c is used */
+	boolean linear; /* linkage sets the 'lin' bit of each component */
+};
+
 struct nv50_program {
 	struct pipe_shader_state pipe;
 	struct tgsi_shader_info info;
@@ -24,8 +33,8 @@ struct nv50_program {
 	struct nv50_program_exec *exec_head;
 	struct nv50_program_exec *exec_tail;
 	unsigned exec_size;
-	struct nouveau_resource *data[2];
-	unsigned data_start[2];
+	struct nouveau_resource *data[1];
+	unsigned data_start[1];
 
 	struct nouveau_bo *bo;
 
@@ -36,14 +45,20 @@ struct nv50_program {
 	struct {
 		unsigned high_temp;
 		unsigned high_result;
-		struct {
-			unsigned attr[2];
-		} vp;
-		struct {
-			unsigned regs[4];
-			unsigned map[5];
-			unsigned high_map;
-		} fp;
+
+		uint32_t attr[2];
+		uint32_t regs[4];
+
+		/* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */
+		unsigned io_nr;
+		struct nv50_sreg4 io[PIPE_MAX_SHADER_OUTPUTS];
+
+		/* FP colour inputs, VP/GP back colour outputs */
+		struct nv50_sreg4 two_side[2];
+
+		/* VP only */
+		uint8_t clpd, clpd_nr;
+		uint8_t psiz;
 	} cfg;
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index c7f80a22037..e1b2f11239a 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -35,8 +35,14 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 {
 	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
 		switch (format) {
+		case PIPE_FORMAT_X8R8G8B8_UNORM:
 		case PIPE_FORMAT_A8R8G8B8_UNORM:
 		case PIPE_FORMAT_R5G6B5_UNORM:
+		case PIPE_FORMAT_R16G16B16A16_SNORM:
+		case PIPE_FORMAT_R16G16B16A16_UNORM:
+		case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		case PIPE_FORMAT_R16G16_SNORM:
+		case PIPE_FORMAT_R16G16_UNORM:
 			return TRUE;
 		default:
 			break;
@@ -55,6 +61,9 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 	} else {
 		switch (format) {
 		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_X8R8G8B8_UNORM:
+		case PIPE_FORMAT_A8R8G8B8_SRGB:
+		case PIPE_FORMAT_X8R8G8B8_SRGB:
 		case PIPE_FORMAT_A1R5G5B5_UNORM:
 		case PIPE_FORMAT_A4R4G4B4_UNORM:
 		case PIPE_FORMAT_R5G6B5_UNORM:
@@ -66,6 +75,13 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 		case PIPE_FORMAT_DXT1_RGBA:
 		case PIPE_FORMAT_DXT3_RGBA:
 		case PIPE_FORMAT_DXT5_RGBA:
+		case PIPE_FORMAT_Z24S8_UNORM:
+		case PIPE_FORMAT_Z32_FLOAT:
+		case PIPE_FORMAT_R16G16B16A16_SNORM:
+		case PIPE_FORMAT_R16G16B16A16_UNORM:
+		case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		case PIPE_FORMAT_R16G16_SNORM:
+		case PIPE_FORMAT_R16G16_UNORM:
 			return TRUE;
 		default:
 			break;
@@ -87,12 +103,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
-	case PIPE_CAP_S3TC:
-		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
-		return 0;
+		return 1;
 	case PIPE_CAP_MAX_RENDER_TARGETS:
 		return 8;
 	case PIPE_CAP_OCCLUSION_QUERY:
@@ -218,7 +232,16 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		tesla_class = NV54TCL;
 		break;
 	case 0xa0:
-		tesla_class = NVA0TCL;
+		switch (chipset) {
+		case 0xa0:
+		case 0xaa:
+		case 0xac:
+			tesla_class = NVA0TCL;
+			break;
+		default:
+			tesla_class = 0x8597;
+			break;
+		}
 		break;
 	default:
 		NOUVEAU_ERR("Not a known NV50 chipset: NV%02x\n", chipset);
@@ -226,12 +249,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		return NULL;
 	}
 
-	if (tesla_class == 0) {
-		NOUVEAU_ERR("Unknown G8x chipset: NV%02x\n", chipset);
-		nv50_screen_destroy(pscreen);
-		return NULL;
-	}
-
 	ret = nouveau_grobj_alloc(chan, 0xbeef5097, tesla_class,
 		&screen->tesla);
 	if (ret) {
@@ -292,6 +309,12 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_method(so, screen->tesla, 0x121c, 1);
 	so_data  (so, 1);
 
+	/* activate all 32 lanes (threads) in a warp */
+	so_method(so, screen->tesla, 0x19a0, 1);
+	so_data  (so, 0x2);
+	so_method(so, screen->tesla, 0x1400, 1);
+	so_data  (so, 0xf);
+
 	so_method(so, screen->tesla, 0x13bc, 1);
 	so_data  (so, 0x54);
 	/* origin is top left (set to 1 for bottom left) */
@@ -301,7 +324,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_data  (so, 8);
 
 	/* constant buffers for immediates and VP/FP parameters */
-	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4,
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (32 * 4) * 4,
 			     &screen->constbuf_misc[0]);
 	if (ret) {
 		nv50_screen_destroy(pscreen);
@@ -309,7 +332,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	for (i = 0; i < 2; i++) {
-		ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4,
+		ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (128 * 4) * 4,
 				     &screen->constbuf_parm[i]);
 		if (ret) {
 			nv50_screen_destroy(pscreen);
@@ -318,8 +341,8 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	}
 
 	if (nouveau_resource_init(&screen->immd_heap[0], 0, 128) ||
-		nouveau_resource_init(&screen->parm_heap[0], 0, 128) ||
-		nouveau_resource_init(&screen->parm_heap[1], 0, 128))
+	    nouveau_resource_init(&screen->parm_heap[0], 0, 512) ||
+	    nouveau_resource_init(&screen->parm_heap[1], 0, 512))
 	{
 		NOUVEAU_ERR("Error initialising constant buffers.\n");
 		nv50_screen_destroy(pscreen);
@@ -340,7 +363,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
-	so_data  (so, (NV50_CB_PMISC << 16) | 0x00000800);
+	so_data  (so, (NV50_CB_PMISC << 16) | 0x00000200);
 	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000001 | (NV50_CB_PMISC << 12));
 	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
@@ -364,48 +387,31 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x00000131 | (NV50_CB_PFP << 12));
 
-	/* Texture sampler/image unit setup - we abuse the constant buffer
-	 * upload mechanism for the moment to upload data to the tex config
-	 * blocks.  At some point we *may* want to go the NVIDIA way of doing
-	 * things?
-	 */
-	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tic);
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 64*8*4, &screen->tic);
 	if (ret) {
 		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
-	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
-	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
-	so_data  (so, (NV50_CB_TIC << 16) | 0x0800);
 	so_method(so, screen->tesla, NV50TCL_TIC_ADDRESS_HIGH, 3);
 	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
-	so_data  (so, 0x00000800);
+	so_data  (so, 0x000007ff);
 
-	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tsc);
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 64*8*4, &screen->tsc);
 	if (ret) {
 		nv50_screen_destroy(pscreen);
 		return NULL;
 	}
 
-	so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
-	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
-		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
-	so_data  (so, (NV50_CB_TSC << 16) | 0x0800);
 	so_method(so, screen->tesla, NV50TCL_TSC_ADDRESS_HIGH, 3);
 	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
 		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
-	so_data  (so, 0x00000800);
+	so_data  (so, 0x00000000);
 
 
 	/* Vertex array limits - max them out */
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 4283808ed93..ffaa5e29d1c 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -146,6 +146,7 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 		  (wrap_mode(cso->wrap_r) << 6));
 
 	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_ANISO:
 	case PIPE_TEX_FILTER_LINEAR:
 		tsc[1] |= NV50TSC_1_1_MAGF_LINEAR;
 		break;
@@ -156,6 +157,7 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 	}
 
 	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_ANISO:
 	case PIPE_TEX_FILTER_LINEAR:
 		tsc[1] |= NV50TSC_1_1_MINF_LINEAR;
 		break;
@@ -183,21 +185,15 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 	else
 	if (cso->max_anisotropy >= 12.0)
 		tsc[0] |= (6 << 20);
-	else
-	if (cso->max_anisotropy >= 10.0)
-		tsc[0] |= (5 << 20);
-	else
-	if (cso->max_anisotropy >= 8.0)
-		tsc[0] |= (4 << 20);
-	else
-	if (cso->max_anisotropy >= 6.0)
-		tsc[0] |= (3 << 20);
-	else
-	if (cso->max_anisotropy >= 4.0)
-		tsc[0] |= (2 << 20);
-	else
-	if (cso->max_anisotropy >= 2.0)
-		tsc[0] |= (1 << 20);
+	else {
+		tsc[0] |= (int)(cso->max_anisotropy * 0.5f) << 20;
+
+		if (cso->max_anisotropy >= 4.0)
+			tsc[1] |= NV50TSC_1_1_UNKN_ANISO_35;
+		else
+		if (cso->max_anisotropy >= 2.0)
+			tsc[1] |= NV50TSC_1_1_UNKN_ANISO_15;
+	}
 
 	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
 		tsc[0] |= (1 << 8);
@@ -276,6 +272,9 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 	so_method(so, tesla, 0x1684, 1);
 	so_data  (so, cso->flatshade_first ? 0 : 1);
 
+	so_method(so, tesla, NV50TCL_VERTEX_TWO_SIDE_ENABLE, 1);
+	so_data  (so, cso->light_twoside);
+
 	so_method(so, tesla, NV50TCL_LINE_WIDTH, 1);
 	so_data  (so, fui(cso->line_width));
 	so_method(so, tesla, NV50TCL_LINE_SMOOTH_ENABLE, 1);
@@ -294,6 +293,9 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 	so_method(so, tesla, NV50TCL_POINT_SIZE, 1);
 	so_data  (so, fui(cso->point_size));
 
+	so_method(so, tesla, NV50TCL_POINT_SPRITE_ENABLE, 1);
+	so_data  (so, cso->point_sprite);
+
 	so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3);
 	if (cso->front_winding == PIPE_WINDING_CCW) {
 		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index 344c2cf6dde..799d2758fee 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -23,6 +23,12 @@
 #include "nv50_context.h"
 #include "nouveau/nouveau_stateobj.h"
 
+#define NV50_CBUF_FORMAT_CASE(n) \
+	case PIPE_FORMAT_##n: so_data(so, NV50TCL_RT_FORMAT_##n); break
+/* Both expand to a switch case pushing the hw format; need 'so' in scope. */
+#define NV50_ZETA_FORMAT_CASE(n) \
+	case PIPE_FORMAT_##n: so_data(so, NV50TCL_ZETA_FORMAT_##n); break
+
 static void
 nv50_state_validate_fb(struct nv50_context *nv50)
 {
@@ -31,6 +37,15 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 	struct pipe_framebuffer_state *fb = &nv50->framebuffer;
 	unsigned i, w, h, gw = 0;
 
+	/* Set nr of active RTs and select RT for each colour output.
+	 * FP result 0 always goes to RT[0], bits 4 - 6 are ignored.
+	 * Ambiguous assignment results in no rendering (no DATA_ERROR).
+	 */
+	so_method(so, tesla, 0x121c, 1);
+	so_data  (so, fb->nr_cbufs |
+		  (0 <<  4) | (1 <<  7) | (2 << 10) | (3 << 13) |
+		  (4 << 16) | (5 << 19) | (6 << 22) | (7 << 25));
+
 	for (i = 0; i < fb->nr_cbufs; i++) {
 		struct pipe_texture *pt = fb->cbufs[i]->texture;
 		struct nouveau_bo *bo = nv50_miptree(pt)->base.bo;
@@ -54,19 +69,22 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 		so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM |
 			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
 		switch (fb->cbufs[i]->format) {
-		case PIPE_FORMAT_A8R8G8B8_UNORM:
-			so_data(so, NV50TCL_RT_FORMAT_A8R8G8B8_UNORM);
-			break;
-		case PIPE_FORMAT_R5G6B5_UNORM:
-			so_data(so, NV50TCL_RT_FORMAT_R5G6B5_UNORM);
-			break;
+		NV50_CBUF_FORMAT_CASE(A8R8G8B8_UNORM);
+		NV50_CBUF_FORMAT_CASE(X8R8G8B8_UNORM);
+		NV50_CBUF_FORMAT_CASE(R5G6B5_UNORM);
+		NV50_CBUF_FORMAT_CASE(R16G16B16A16_SNORM);
+		NV50_CBUF_FORMAT_CASE(R16G16B16A16_UNORM);
+		NV50_CBUF_FORMAT_CASE(R32G32B32A32_FLOAT);
+		NV50_CBUF_FORMAT_CASE(R16G16_SNORM);
+		NV50_CBUF_FORMAT_CASE(R16G16_UNORM);
 		default:
 			NOUVEAU_ERR("AIIII unknown format %s\n",
 				    pf_name(fb->cbufs[i]->format));
 			so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM);
 			break;
 		}
-		so_data(so, bo->tile_mode << 4);
+		so_data(so, nv50_miptree(pt)->
+				level[fb->cbufs[i]->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1224, 1);
@@ -92,25 +110,18 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
 			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
 		switch (fb->zsbuf->format) {
-		case PIPE_FORMAT_Z32_FLOAT:
-			so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT);
-			break;
-		case PIPE_FORMAT_Z24S8_UNORM:
-			so_data(so, NV50TCL_ZETA_FORMAT_Z24S8_UNORM);
-			break;
-		case PIPE_FORMAT_X8Z24_UNORM:
-			so_data(so, NV50TCL_ZETA_FORMAT_X8Z24_UNORM);
-			break;
-		case PIPE_FORMAT_S8Z24_UNORM:
-			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
-			break;
+		NV50_ZETA_FORMAT_CASE(S8Z24_UNORM);
+		NV50_ZETA_FORMAT_CASE(X8Z24_UNORM);
+		NV50_ZETA_FORMAT_CASE(Z24S8_UNORM);
+		NV50_ZETA_FORMAT_CASE(Z32_FLOAT);
 		default:
 			NOUVEAU_ERR("AIIII unknown format %s\n",
 				    pf_name(fb->zsbuf->format));
 			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
 			break;
 		}
-		so_data(so, bo->tile_mode << 4);
+		so_data(so, nv50_miptree(pt)->
+				level[fb->zsbuf->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, 0x1538, 1);
@@ -119,6 +130,9 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 		so_data  (so, fb->zsbuf->width);
 		so_data  (so, fb->zsbuf->height);
 		so_data  (so, 0x00010001);
+	} else {
+		so_method(so, tesla, 0x1538, 1);
+		so_data  (so, 0);
 	}
 
 	so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2);
@@ -187,6 +201,8 @@ nv50_state_emit(struct nv50_context *nv50)
 		so_emit(chan, nv50->state.vertprog);
 	if (nv50->state.dirty & NV50_NEW_FRAGPROG)
 		so_emit(chan, nv50->state.fragprog);
+	if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
+		so_emit(chan, nv50->state.programs);
 	if (nv50->state.dirty & NV50_NEW_RASTERIZER)
 		so_emit(chan, nv50->state.rast);
 	if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR)
@@ -208,6 +224,15 @@ nv50_state_emit(struct nv50_context *nv50)
 			so_emit(chan, nv50->state.vtxattr);
 	}
 	nv50->state.dirty = 0;
+}
+
+void
+nv50_state_flush_notify(struct nouveau_channel *chan)
+{
+	struct nv50_context *nv50 = chan->user_private;
+
+	if (nv50->state.tic_upload && !(nv50->dirty & NV50_NEW_TEXTURE))
+		so_emit(chan, nv50->state.tic_upload);
 
 	so_emit_reloc_markers(chan, nv50->state.fb);
 	so_emit_reloc_markers(chan, nv50->state.vertprog);
@@ -220,6 +245,7 @@ boolean
 nv50_state_validate(struct nv50_context *nv50)
 {
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
 	struct nouveau_stateobj *so;
 	unsigned i;
 
@@ -238,6 +264,9 @@ nv50_state_validate(struct nv50_context *nv50)
 	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB))
 		nv50_fragprog_validate(nv50);
 
+	if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
+		nv50_linkage_validate(nv50);
+
 	if (nv50->dirty & NV50_NEW_RASTERIZER)
 		so_ref(nv50->rasterizer->so, &nv50->state.rast);
 
@@ -299,7 +328,7 @@ scissor_uptodate:
 			goto viewport_uptodate;
 		nv50->state.viewport_bypass = bypass;
 
-		so = so_new(12, 0);
+		so = so_new(14, 0);
 		if (!bypass) {
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE(0), 3);
 			so_data  (so, fui(nv50->viewport.translate[0]));
@@ -312,12 +341,21 @@ scissor_uptodate:
 
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1);
 			so_data  (so, 1);
+			/* 0x0000 = remove whole primitive only (xyz)
+			 * 0x1018 = remove whole primitive only (xy), clamp z
+			 * 0x1080 = clip primitive (xyz)
+			 * 0x1098 = clip primitive (xy), clamp z
+			 */
+			so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1);
+			so_data  (so, 0x1080);
 			/* no idea what 0f90 does */
 			so_method(so, tesla, 0x0f90, 1);
 			so_data  (so, 0);
 		} else {
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1);
 			so_data  (so, 0);
+			so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1);
+			so_data  (so, 0x0000);
 			so_method(so, tesla, 0x0f90, 1);
 			so_data  (so, 1);
 		}
@@ -329,15 +367,25 @@ scissor_uptodate:
 viewport_uptodate:
 
 	if (nv50->dirty & NV50_NEW_SAMPLER) {
-		int i;
-
-		so = so_new(nv50->sampler_nr * 8 + 3, 0);
-		so_method(so, tesla, NV50TCL_CB_ADDR, 1);
-		so_data  (so, NV50_CB_TSC);
-		so_method(so, tesla, NV50TCL_CB_DATA(0) | 0x40000000,
-			nv50->sampler_nr * 8);
-		for (i = 0; i < nv50->sampler_nr; i++)
+		unsigned i;
+
+		so = so_new(nv50->sampler_nr * 9 + 23 + 4, 2);
+
+		nv50_so_init_sifc(nv50, so, nv50->screen->tsc, NOUVEAU_BO_VRAM,
+				  nv50->sampler_nr * 8 * 4);
+
+		for (i = 0; i < nv50->sampler_nr; i++) {
+			if (!nv50->sampler[i])
+				continue;
+			so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), 8);
 			so_datap (so, nv50->sampler[i]->tsc, 8);
+		}
+
+		so_method(so, tesla, 0x1440, 1); /* sync SIFC */
+		so_data  (so, 0);
+		so_method(so, tesla, 0x1334, 1); /* flush TSC */
+		so_data  (so, 0);
+
 		so_ref(so, &nv50->state.tsc_upload);
 		so_ref(NULL, &so);
 	}
@@ -355,3 +403,33 @@ viewport_uptodate:
 	return TRUE;
 }
 
+void nv50_so_init_sifc(struct nv50_context *nv50,
+		       struct nouveau_stateobj *so,
+		       struct nouveau_bo *bo, unsigned reloc, unsigned size)
+{
+	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+	/* Adds 23 words + 2 relocs to 'so'; callers append the SIFC_DATA. */
+	so_method(so, eng2d, NV50_2D_DST_FORMAT, 2);
+	so_data  (so, NV50_2D_DST_FORMAT_R8_UNORM);
+	so_data  (so, 1); /* NOTE(review): presumably "linear layout" - confirm */
+	so_method(so, eng2d, NV50_2D_DST_PITCH, 5);
+	so_data  (so, 262144); /* pitch */
+	so_data  (so, 65536); /* presumably width - TODO confirm */
+	so_data  (so, 1); /* presumably height - TODO confirm */
+	so_reloc (so, bo, 0, reloc | NOUVEAU_BO_WR | NOUVEAU_BO_HIGH, 0, 0);
+	so_reloc (so, bo, 0, reloc | NOUVEAU_BO_WR | NOUVEAU_BO_LOW, 0, 0);
+	so_method(so, eng2d, NV50_2D_SIFC_UNK0800, 2);
+	so_data  (so, 0);
+	so_data  (so, NV50_2D_SIFC_FORMAT_R8_UNORM);
+	so_method(so, eng2d, NV50_2D_SIFC_WIDTH, 10);
+	so_data  (so, size); /* source width: one byte per R8 texel */
+	so_data  (so, 1); /* NOTE(review): meaning of the next 9 words unverified */
+	so_data  (so, 0);
+	so_data  (so, 1);
+	so_data  (so, 0);
+	so_data  (so, 1);
+	so_data  (so, 0);
+	so_data  (so, 0);
+	so_data  (so, 0);
+	so_data  (so, 0);
+}
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index b266324f58d..6bf6f773b0c 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -60,13 +60,13 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  	format = nv50_format(ps->format);
  	if (format < 0)
  		return 1;
-  
+
  	if (!bo->tile_flags) {
  		BEGIN_RING(chan, eng2d, mthd, 2);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 1);
  		BEGIN_RING(chan, eng2d, mthd + 0x14, 5);
- 		OUT_RING  (chan, mt->level[0].pitch);
+		OUT_RING  (chan, mt->level[ps->level].pitch);
  		OUT_RING  (chan, ps->width);
  		OUT_RING  (chan, ps->height);
  		OUT_RELOCh(chan, bo, ps->offset, flags);
@@ -75,7 +75,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  		BEGIN_RING(chan, eng2d, mthd, 5);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 0);
- 		OUT_RING  (chan, bo->tile_mode << 4);
+		OUT_RING  (chan, mt->level[ps->level].tile_mode << 4);
  		OUT_RING  (chan, 1);
  		OUT_RING  (chan, 0);
  		BEGIN_RING(chan, eng2d, mthd + 0x18, 4);
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index 033cb50c115..2813f544770 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -25,109 +25,115 @@
 
 #include "nouveau/nouveau_stateobj.h"
 
+#define _MIXED(pf, t0, t1, t2, t3, cr, cg, cb, ca, f)		\
+{                                                       	\
+	PIPE_FORMAT_##pf,					\
+	NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 |	\
+	NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 |	\
+	NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 |	\
+	NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 |	\
+	NV50TIC_0_0_FMT_##f					\
+}
+/* _MIXED: per-channel types t0..t3; _: one type t for all four channels. */
+#define _(pf, t, cr, cg, cb, ca, f) _MIXED(pf, t, t, t, t, cr, cg, cb, ca, f)
+
+struct nv50_texture_format {
+	enum pipe_format pf; /* gallium format this entry translates */
+	uint32_t hw; /* first word nv50_tex_construct() emits for it */
+};
+/* Number of entries in nv50_tex_format_list[]. */
+#define NV50_TEX_FORMAT_LIST_SIZE \
+	(sizeof(nv50_tex_format_list) / sizeof(struct nv50_texture_format))
+/* pipe format -> TIC word 0 (swizzle, channel types, hw format); linear search */
+static const struct nv50_texture_format nv50_tex_format_list[] =
+{
+	_(A8R8G8B8_UNORM, UNORM, C2, C1, C0, C3,  8_8_8_8),
+	_(A8R8G8B8_SRGB,  UNORM, C2, C1, C0, C3,  8_8_8_8),
+	_(X8R8G8B8_UNORM, UNORM, C2, C1, C0, ONE, 8_8_8_8),
+	_(X8R8G8B8_SRGB,  UNORM, C2, C1, C0, ONE, 8_8_8_8),
+	_(A1R5G5B5_UNORM, UNORM, C2, C1, C0, C3,  1_5_5_5),
+	_(A4R4G4B4_UNORM, UNORM, C2, C1, C0, C3,  4_4_4_4),
+
+	_(R5G6B5_UNORM, UNORM, C2, C1, C0, ONE, 5_6_5),
+
+	_(L8_UNORM, UNORM, C0, C0, C0, ONE, 8),
+	_(A8_UNORM, UNORM, ZERO, ZERO, ZERO, C0, 8),
+	_(I8_UNORM, UNORM, C0, C0, C0, C0, 8),
+
+	_(A8L8_UNORM, UNORM, C0, C0, C0, C1, 8_8),
+
+	_(DXT1_RGB, UNORM, C0, C1, C2, ONE, DXT1),
+	_(DXT1_RGBA, UNORM, C0, C1, C2, C3, DXT1),
+	_(DXT3_RGBA, UNORM, C0, C1, C2, C3, DXT3),
+	_(DXT5_RGBA, UNORM, C0, C1, C2, C3, DXT5),
+
+	_MIXED(Z24S8_UNORM, UINT, UNORM, UINT, UINT, C1, C1, C1, ONE, 24_8),
+	/* channel type has to match the signedness of the pipe format */
+	_(R16G16B16A16_SNORM, SNORM, C0, C1, C2, C3, 16_16_16_16),
+	_(R16G16B16A16_UNORM, UNORM, C0, C1, C2, C3, 16_16_16_16),
+	_(R32G32B32A32_FLOAT, FLOAT, C0, C1, C2, C3, 32_32_32_32),
+
+	_(R16G16_SNORM, SNORM, C0, C1, ZERO, ONE, 16_16),
+	_(R16G16_UNORM, UNORM, C0, C1, ZERO, ONE, 16_16),
+
+	_MIXED(Z32_FLOAT, FLOAT, UINT, UINT, UINT, C0, C0, C0, ONE, 32_DEPTH)
+
+};
+
+#undef _
+#undef _MIXED
+
 static int
 nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
 		   struct nv50_miptree *mt, int unit)
 {
-	switch (mt->base.base.format) {
-	case PIPE_FORMAT_A8R8G8B8_UNORM:
-		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_8_8_8_8);
-		break;
-	case PIPE_FORMAT_A1R5G5B5_UNORM:
-		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_1_5_5_5);
-		break;
-	case PIPE_FORMAT_A4R4G4B4_UNORM:
-		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_4_4_4_4);
-		break;
-	case PIPE_FORMAT_R5G6B5_UNORM:
-		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_5_6_5);
-		break;
-	case PIPE_FORMAT_L8_UNORM:
-		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_8);
-		break;
-	case PIPE_FORMAT_A8_UNORM:
-		so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_ZERO | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_ZERO | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_ZERO | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_8);
-		break;
-	case PIPE_FORMAT_I8_UNORM:
-		so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_8);
-		break;
-	case PIPE_FORMAT_A8L8_UNORM:
-		so_data(so, NV50TIC_0_0_MAPA_C1 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_8_8);
-		break;
-	case PIPE_FORMAT_DXT1_RGB:
-		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_DXT1);
+	unsigned i;
+	uint32_t mode;
+
+	for (i = 0; i < NV50_TEX_FORMAT_LIST_SIZE; i++)
+		if (nv50_tex_format_list[i].pf == mt->base.base.format)
+			break;
+	if (i == NV50_TEX_FORMAT_LIST_SIZE)
+                return 1;
+
+	if (nv50->sampler[unit]->normalized)
+		mode = 0x50001000 | (1 << 31);
+	else {
+		mode = 0x50001000 | (7 << 14);
+		assert(mt->base.base.target == PIPE_TEXTURE_2D);
+	}
+
+	mode |= ((mt->base.bo->tile_mode & 0x0f) << 22) |
+		((mt->base.bo->tile_mode & 0xf0) << 21);
+
+	if (pf_type(mt->base.base.format) == PIPE_FORMAT_TYPE_SRGB)
+		mode |= 0x0400;
+
+	switch (mt->base.base.target) {
+	case PIPE_TEXTURE_1D:
 		break;
-	case PIPE_FORMAT_DXT1_RGBA:
-		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_DXT1);
+	case PIPE_TEXTURE_2D:
+		mode |= (1 << 14);
 		break;
-	case PIPE_FORMAT_DXT3_RGBA:
-		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_DXT3);
+	case PIPE_TEXTURE_3D:
+		mode |= (2 << 14);
 		break;
-	case PIPE_FORMAT_DXT5_RGBA:
-		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
-			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
-			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
-			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
-			    NV50TIC_0_0_FMT_DXT5);
+	case PIPE_TEXTURE_CUBE:
+		mode |= (3 << 14);
 		break;
 	default:
-		return 1;
+		assert(!"unsupported texture target");
+		break;
 	}
 
+	so_data (so, nv50_tex_format_list[i].hw);
 	so_reloc(so, mt->base.bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW |
-		     NOUVEAU_BO_RD, 0, 0);
-	if (nv50->sampler[unit]->normalized)
-		so_data (so, 0xd0005000 | mt->base.bo->tile_mode << 22);
-	else
-		so_data (so, 0x5001d000 | mt->base.bo->tile_mode << 22);
+		 NOUVEAU_BO_RD, 0, 0);
+	so_data (so, mode);
 	so_data (so, 0x00300000);
-	so_data (so, mt->base.base.width[0]);
+	so_data (so, mt->base.base.width[0] | (1 << 31));
 	so_data (so, (mt->base.base.last_level << 28) |
-		     (mt->base.base.depth[0] << 16) | mt->base.base.height[0]);
+		 (mt->base.base.depth[0] << 16) | mt->base.base.height[0]);
 	so_data (so, 0x03000000);
 	so_data (so, mt->base.base.last_level << 4);
 
@@ -137,20 +143,24 @@ nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
 void
 nv50_tex_validate(struct nv50_context *nv50)
 {
+	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nouveau_stateobj *so;
-	int unit, push;
+	unsigned i, unit, push;
+
+	push = MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2 + 23 + 6;
+	so = so_new(nv50->miptree_nr * 9 + push, nv50->miptree_nr * 2 + 2);
 
-	push  = nv50->miptree_nr * 9 + 2;
-	push += MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2;
+	nv50_so_init_sifc(nv50, so, nv50->screen->tic, NOUVEAU_BO_VRAM,
+			  nv50->miptree_nr * 8 * 4);
 
-	so = so_new(push, nv50->miptree_nr * 2);
-	so_method(so, tesla, NV50TCL_CB_ADDR, 1);
-	so_data  (so, NV50_CB_TIC);
-	for (unit = 0; unit < nv50->miptree_nr; unit++) {
+	for (i = 0, unit = 0; unit < nv50->miptree_nr; ++unit) {
 		struct nv50_miptree *mt = nv50->miptree[unit];
 
-		so_method(so, tesla, NV50TCL_CB_DATA(0) | 0x40000000, 8);
+		if (!mt)
+			continue;
+
+		so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), 8);
 		if (nv50_tex_construct(nv50, so, mt, unit)) {
 			NOUVEAU_ERR("failed tex validate\n");
 			so_ref(NULL, &so);
@@ -158,17 +168,25 @@ nv50_tex_validate(struct nv50_context *nv50)
 		}
 
 		so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1);
-		so_data  (so, (unit << NV50TCL_SET_SAMPLER_TEX_TIC_SHIFT) |
-			(unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) |
-			NV50TCL_SET_SAMPLER_TEX_VALID);
+		so_data  (so, (i++ << NV50TCL_SET_SAMPLER_TEX_TIC_SHIFT) |
+			  (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) |
+			  NV50TCL_SET_SAMPLER_TEX_VALID);
 	}
 
 	for (; unit < nv50->state.miptree_nr; unit++) {
 		so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1);
 		so_data  (so,
-			(unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | 0);
+			  (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | 0);
 	}
 
+	/* not sure if the following really do what I think: */
+	so_method(so, tesla, 0x1440, 1); /* sync SIFC */
+	so_data  (so, 0);
+	so_method(so, tesla, 0x1330, 1); /* flush TIC */
+	so_data  (so, 0);
+	so_method(so, tesla, 0x1338, 1); /* flush texture caches */
+	so_data  (so, 0x20);
+
 	so_ref(so, &nv50->state.tic_upload);
 	so_ref(NULL, &so);
 	nv50->state.miptree_nr = nv50->miptree_nr;
diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h
index 207fb039f70..d531e611327 100644
--- a/src/gallium/drivers/nv50/nv50_texture.h
+++ b/src/gallium/drivers/nv50/nv50_texture.h
@@ -38,18 +38,26 @@
 #define NV50TIC_0_0_TYPEA_MASK                                    0x00038000
 #define NV50TIC_0_0_TYPEA_UNORM                                   0x00010000
 #define NV50TIC_0_0_TYPEA_SNORM                                   0x00008000
+#define NV50TIC_0_0_TYPEA_SINT                                    0x00018000
+#define NV50TIC_0_0_TYPEA_UINT                                    0x00020000
 #define NV50TIC_0_0_TYPEA_FLOAT                                   0x00038000
 #define NV50TIC_0_0_TYPEB_MASK                                    0x00007000
 #define NV50TIC_0_0_TYPEB_UNORM                                   0x00002000
 #define NV50TIC_0_0_TYPEB_SNORM                                   0x00001000
+#define NV50TIC_0_0_TYPEB_SINT                                    0x00003000
+#define NV50TIC_0_0_TYPEB_UINT                                    0x00004000
 #define NV50TIC_0_0_TYPEB_FLOAT                                   0x00007000
 #define NV50TIC_0_0_TYPEG_MASK                                    0x00000e00
 #define NV50TIC_0_0_TYPEG_UNORM                                   0x00000400
 #define NV50TIC_0_0_TYPEG_SNORM                                   0x00000200
+#define NV50TIC_0_0_TYPEG_SINT                                    0x00000600
+#define NV50TIC_0_0_TYPEG_UINT                                    0x00000800
 #define NV50TIC_0_0_TYPEG_FLOAT                                   0x00000e00
 #define NV50TIC_0_0_TYPER_MASK                                    0x000001c0
 #define NV50TIC_0_0_TYPER_UNORM                                   0x00000080
 #define NV50TIC_0_0_TYPER_SNORM                                   0x00000040
+#define NV50TIC_0_0_TYPER_SINT                                    0x000000c0
+#define NV50TIC_0_0_TYPER_UINT                                    0x00000100
 #define NV50TIC_0_0_TYPER_FLOAT                                   0x000001c0
 #define NV50TIC_0_0_FMT_MASK                                      0x0000003f
 #define NV50TIC_0_0_FMT_32_32_32_32                               0x00000001
@@ -57,6 +65,7 @@
 #define NV50TIC_0_0_FMT_32_32                                     0x00000004
 #define NV50TIC_0_0_FMT_8_8_8_8                                   0x00000008
 #define NV50TIC_0_0_FMT_2_10_10_10                                0x00000009
+#define NV50TIC_0_0_FMT_16_16                                     0x0000000c
 #define NV50TIC_0_0_FMT_32                                        0x0000000f
 #define NV50TIC_0_0_FMT_4_4_4_4                                   0x00000012
 /* #define NV50TIC_0_0_FMT_1_5_5_5                                0x00000013 */
@@ -65,12 +74,16 @@
 #define NV50TIC_0_0_FMT_8_8                                       0x00000018
 #define NV50TIC_0_0_FMT_16                                        0x0000001b
 #define NV50TIC_0_0_FMT_8                                         0x0000001d
+#define NV50TIC_0_0_FMT_5_9_9_9                                   0x00000020
 #define NV50TIC_0_0_FMT_10_11_11                                  0x00000021
 #define NV50TIC_0_0_FMT_DXT1                                      0x00000024
 #define NV50TIC_0_0_FMT_DXT3                                      0x00000025
 #define NV50TIC_0_0_FMT_DXT5                                      0x00000026
 #define NV50TIC_0_0_FMT_RGTC1                                     0x00000027
 #define NV50TIC_0_0_FMT_RGTC2                                     0x00000028
+#define NV50TIC_0_0_FMT_24_8                                      0x00000029
+#define NV50TIC_0_0_FMT_32_DEPTH                                  0x0000002f
+#define NV50TIC_0_0_FMT_32_8                                      0x00000030
 
 #define NV50TIC_0_1_OFFSET_LOW_MASK                               0xffffffff
 #define NV50TIC_0_1_OFFSET_LOW_SHIFT                                       0
@@ -133,6 +146,8 @@
 #define NV50TSC_1_1_MIPF_NEAREST                                 0x00000080
 #define NV50TSC_1_1_MIPF_LINEAR                                  0x000000c0
 #define NV50TSC_1_1_LOD_BIAS_MASK                                0x01fff000
+#define NV50TSC_1_1_UNKN_ANISO_15                                0x10000000
+#define NV50TSC_1_1_UNKN_ANISO_35                                0x18000000
 
 #define NV50TSC_1_2_MIN_LOD_MASK                                 0x00000f00
 #define NV50TSC_1_2_MAX_LOD_MASK                                 0x00f00000
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index e9c3562194b..ea61357aaa6 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -12,6 +12,7 @@ struct nv50_transfer {
 	int level_pitch;
 	int level_width;
 	int level_height;
+	int level_depth;
 	int level_x;
 	int level_y;
 };
@@ -20,10 +21,10 @@ static void
 nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 			struct nouveau_bo *src_bo, unsigned src_offset,
 			int src_pitch, unsigned src_tile_mode,
-			int sx, int sy, int sw, int sh,
+			int sx, int sy, int sw, int sh, int sd,
 			struct nouveau_bo *dst_bo, unsigned dst_offset,
 			int dst_pitch, unsigned dst_tile_mode,
-			int dx, int dy, int dw, int dh,
+			int dx, int dy, int dw, int dh, int dd,
 			int cpp, int width, int height,
 			unsigned src_reloc, unsigned dst_reloc)
 {
@@ -51,7 +52,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 		OUT_RING  (chan, src_tile_mode << 4);
 		OUT_RING  (chan, sw * cpp);
 		OUT_RING  (chan, sh);
-		OUT_RING  (chan, 1);
+		OUT_RING  (chan, sd);
 		OUT_RING  (chan, 0);
 	}
 
@@ -70,7 +71,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 		OUT_RING  (chan, dst_tile_mode << 4);
 		OUT_RING  (chan, dw * cpp);
 		OUT_RING  (chan, dh);
-		OUT_RING  (chan, 1);
+		OUT_RING  (chan, dd);
 		OUT_RING  (chan, 0);
 	}
 
@@ -89,14 +90,14 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 		if (src_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf,
 				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN, 1);
-			OUT_RING  (chan, (sy << 16) | sx);
+			OUT_RING  (chan, (sy << 16) | (sx * cpp));
 		} else {
 			src_offset += (line_count * src_pitch);
 		}
 		if (dst_bo->tile_flags) {
 			BEGIN_RING(chan, m2mf,
 				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT, 1);
-			OUT_RING  (chan, (dy << 16) | dx);
+			OUT_RING  (chan, (dy << 16) | (dx * cpp));
 		} else {
 			dst_offset += (line_count * dst_pitch);
 		}
@@ -114,6 +115,20 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 	}
 }
 
+static INLINE unsigned
+get_zslice_offset(unsigned tile_mode, unsigned z, unsigned pitch, unsigned ny)
+{
+	/* split z into a slice within a volume-tile and a layer of such tiles */
+	const unsigned tile_h = get_tile_height(tile_mode);
+	const unsigned tile_d = get_tile_depth(tile_mode);
+	const unsigned slice_in_tile = z % tile_d;
+	const unsigned tile_layer = z / tile_d;
+
+	/* slices inside a tile are tile_h * 64 apart; layers a full 2D array */
+	return slice_in_tile * (tile_h * 64) +
+	       tile_layer * (tile_d * align(ny, tile_h) * pitch);
+}
+
 static struct pipe_transfer *
 nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		  unsigned face, unsigned level, unsigned zslice,
@@ -124,14 +139,11 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 	struct nv50_miptree *mt = nv50_miptree(pt);
 	struct nv50_miptree_level *lvl = &mt->level[level];
 	struct nv50_transfer *tx;
-	unsigned image = 0;
+	unsigned nx, ny, image = 0;
 	int ret;
 
 	if (pt->target == PIPE_TEXTURE_CUBE)
 		image = face;
-	else
-	if (pt->target == PIPE_TEXTURE_3D)
-		image = zslice;
 
 	tx = CALLOC_STRUCT(nv50_transfer);
 	if (!tx)
@@ -142,34 +154,52 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 	tx->base.width = w;
 	tx->base.height = h;
 	tx->base.block = pt->block;
-	tx->base.nblocksx = pt->nblocksx[level];
-	tx->base.nblocksy = pt->nblocksy[level];
-	tx->base.stride = (w * pt->block.size);
+	if (!pt->nblocksx[level]) {
+		tx->base.nblocksx = pf_get_nblocksx(&pt->block,
+						    pt->width[level]);
+		tx->base.nblocksy = pf_get_nblocksy(&pt->block,
+						    pt->height[level]);
+	} else {
+		tx->base.nblocksx = pt->nblocksx[level];
+		tx->base.nblocksy = pt->nblocksy[level];
+	}
+	tx->base.stride = tx->base.nblocksx * pt->block.size;
 	tx->base.usage = usage;
 
 	tx->level_pitch = lvl->pitch;
 	tx->level_width = mt->base.base.width[level];
 	tx->level_height = mt->base.base.height[level];
+	tx->level_depth = mt->base.base.depth[level];
 	tx->level_offset = lvl->image_offset[image];
 	tx->level_tiling = lvl->tile_mode;
-	tx->level_x = x;
-	tx->level_y = y;
+	tx->level_x = pf_get_nblocksx(&tx->base.block, x);
+	tx->level_y = pf_get_nblocksy(&tx->base.block, y);
 	ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
-			     w * pt->block.size * h, &tx->bo);
+			     tx->base.nblocksy * tx->base.stride, &tx->bo);
 	if (ret) {
 		FREE(tx);
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (pt->target == PIPE_TEXTURE_3D)
+		tx->level_offset += get_zslice_offset(lvl->tile_mode, zslice,
+						      lvl->pitch,
+						      tx->base.nblocksy);
+
+	if (usage & PIPE_TRANSFER_READ) {
+		nx = pf_get_nblocksx(&tx->base.block, tx->base.width);
+		ny = pf_get_nblocksy(&tx->base.block, tx->base.height);
+
 		nv50_transfer_rect_m2mf(pscreen, mt->base.bo, tx->level_offset,
 					tx->level_pitch, tx->level_tiling,
 					x, y,
-					tx->level_width, tx->level_height,
-					tx->bo, 0, tx->base.stride,
-					tx->bo->tile_mode, 0, 0,
-					tx->base.width, tx->base.height,
-					tx->base.block.size, w, h,
+					tx->base.nblocksx, tx->base.nblocksy,
+					tx->level_depth,
+					tx->bo, 0,
+					tx->base.stride, tx->bo->tile_mode,
+					0, 0,
+					tx->base.nblocksx, tx->base.nblocksy, 1,
+					tx->base.block.size, nx, ny,
 					NOUVEAU_BO_VRAM | NOUVEAU_BO_GART,
 					NOUVEAU_BO_GART);
 	}
@@ -183,17 +213,22 @@ nv50_transfer_del(struct pipe_transfer *ptx)
 	struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
 	struct nv50_miptree *mt = nv50_miptree(ptx->texture);
 
-	if (ptx->usage != PIPE_TRANSFER_READ) {
+	unsigned nx = pf_get_nblocksx(&tx->base.block, tx->base.width);
+	unsigned ny = pf_get_nblocksy(&tx->base.block, tx->base.height);
+
+	if (ptx->usage & PIPE_TRANSFER_WRITE) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
-		nv50_transfer_rect_m2mf(pscreen, tx->bo, 0, tx->base.stride,
-					tx->bo->tile_mode, 0, 0,
-					tx->base.width, tx->base.height,
+
+		nv50_transfer_rect_m2mf(pscreen, tx->bo, 0,
+					tx->base.stride, tx->bo->tile_mode,
+					0, 0,
+					tx->base.nblocksx, tx->base.nblocksy, 1,
 					mt->base.bo, tx->level_offset,
 					tx->level_pitch, tx->level_tiling,
 					tx->level_x, tx->level_y,
-					tx->level_width, tx->level_height,
-					tx->base.block.size, tx->base.width,
-					tx->base.height,
+					tx->base.nblocksx, tx->base.nblocksy,
+					tx->level_depth,
+					tx->base.block.size, nx, ny,
 					NOUVEAU_BO_GART, NOUVEAU_BO_VRAM |
 					NOUVEAU_BO_GART);
 	}
@@ -237,3 +272,89 @@ nv50_transfer_init_screen_functions(struct pipe_screen *pscreen)
 	pscreen->transfer_map = nv50_transfer_map;
 	pscreen->transfer_unmap = nv50_transfer_unmap;
 }
+
+/* Upload a w x h rectangle of CPU data from src into bo through the 2D
+ * engine's SIFC data method. Each source line is pushed as
+ * (w * cpp + 3) / 4 dwords; src then advances by src_pitch bytes.
+ */
+void
+nv50_upload_sifc(struct nv50_context *nv50,
+		 struct nouveau_bo *bo, unsigned dst_offset, unsigned reloc,
+		 unsigned dst_format, int dst_w, int dst_h, int dst_pitch,
+		 void *src, unsigned src_format, int src_pitch,
+		 int x, int y, int w, int h, int cpp)
+{
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	unsigned line_dwords = (w * cpp + 3) / 4;
+
+	reloc |= NOUVEAU_BO_WR;
+
+	WAIT_RING (chan, 32);
+
+	if (bo->tile_flags) {
+		BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 5);
+		OUT_RING  (chan, dst_format);
+		OUT_RING  (chan, 0);
+		OUT_RING  (chan, bo->tile_mode << 4);
+		OUT_RING  (chan, 1);
+		OUT_RING  (chan, 0);
+	} else {
+		BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2);
+		OUT_RING  (chan, dst_format);
+		OUT_RING  (chan, 1);
+		BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1);
+		OUT_RING  (chan, dst_pitch);
+	}
+
+	BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 4);
+	OUT_RING  (chan, dst_w);
+	OUT_RING  (chan, dst_h);
+	OUT_RELOCh(chan, bo, dst_offset, reloc);
+	OUT_RELOCl(chan, bo, dst_offset, reloc);
+
+	/* NV50_2D_OPERATION_SRCCOPY assumed already set */
+	BEGIN_RING(chan, eng2d, NV50_2D_SIFC_UNK0800, 2);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, src_format);
+	BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10);
+	OUT_RING  (chan, w);
+	OUT_RING  (chan, h);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, 1);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, x);
+	OUT_RING  (chan, 0);
+	OUT_RING  (chan, y);
+
+	while (h--) {
+		const uint32_t *p = src;
+		unsigned count = line_dwords;
+
+		while (count) {
+			unsigned nr = MIN2(count, 1792);
+
+			if (chan->pushbuf->remaining <= nr) {
+				FIRE_RING (chan);
+				BEGIN_RING(chan, eng2d,
+					   NV50_2D_DST_ADDRESS_HIGH, 2);
+				OUT_RELOCh(chan, bo, dst_offset, reloc);
+				OUT_RELOCl(chan, bo, dst_offset, reloc);
+			}
+			assert(chan->pushbuf->remaining > nr);
+
+			BEGIN_RING(chan, eng2d,
+				   NV50_2D_SIFC_DATA | (2 << 29), nr);
+			OUT_RINGp (chan, p, nr);
+			p += nr;
+			count -= nr;
+		}
+		src = (uint8_t *)src + src_pitch; /* void * has no element size */
+	}
+
+	BEGIN_RING(chan, tesla, 0x1440, 1);
+	OUT_RING  (chan, 0);
+}
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index eeed148c7b9..db54380241f 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -26,6 +26,18 @@
 
 #include "nv50_context.h"
 
+static boolean
+nv50_push_elements_u08(struct nv50_context *, uint8_t *, unsigned);
+
+static boolean
+nv50_push_elements_u16(struct nv50_context *, uint16_t *, unsigned);
+
+static boolean
+nv50_push_elements_u32(struct nv50_context *, uint32_t *, unsigned);
+
+static boolean
+nv50_push_arrays(struct nv50_context *, unsigned, unsigned);
+
 static INLINE unsigned
 nv50_prim(unsigned mode)
 {
@@ -132,6 +144,7 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 	struct nv50_context *nv50 = nv50_context(pipe);
 	struct nouveau_channel *chan = nv50->screen->tesla->channel;
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	boolean ret;
 
 	nv50_state_validate(nv50);
 
@@ -139,24 +152,25 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 	OUT_RING  (chan, 0);
 	BEGIN_RING(chan, tesla, 0x142c, 1);
 	OUT_RING  (chan, 0);
-	BEGIN_RING(chan, tesla, 0x1440, 1);
-	OUT_RING  (chan, 0);
-	BEGIN_RING(chan, tesla, 0x1334, 1);
-	OUT_RING  (chan, 0);
 
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
 	OUT_RING  (chan, nv50_prim(mode));
-	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
-	OUT_RING  (chan, start);
-	OUT_RING  (chan, count);
+
+	if (nv50->vbo_fifo)
+		ret = nv50_push_arrays(nv50, start, count);
+	else {
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
+		OUT_RING  (chan, start);
+		OUT_RING  (chan, count);
+		ret = TRUE;
+	}
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
 	OUT_RING  (chan, 0);
 
-	pipe->flush(pipe, 0, NULL);
-	return TRUE;
+	return ret;
 }
 
-static INLINE void
+static INLINE boolean
 nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
 			      unsigned start, unsigned count)
 {
@@ -165,6 +179,9 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
 
 	map += start;
 
+	if (nv50->vbo_fifo)
+		return nv50_push_elements_u08(nv50, map, count);
+
 	if (count & 1) {
 		BEGIN_RING(chan, tesla, 0x15e8, 1);
 		OUT_RING  (chan, map[0]);
@@ -183,9 +200,10 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
 		count -= nr;
 		map += nr;
 	}
+	return TRUE;
 }
 
-static INLINE void
+static INLINE boolean
 nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 			      unsigned start, unsigned count)
 {
@@ -194,6 +212,9 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 
 	map += start;
 
+	if (nv50->vbo_fifo)
+		return nv50_push_elements_u16(nv50, map, count);
+
 	if (count & 1) {
 		BEGIN_RING(chan, tesla, 0x15e8, 1);
 		OUT_RING  (chan, map[0]);
@@ -212,9 +233,10 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 		count -= nr;
 		map += nr;
 	}
+	return TRUE;
 }
 
-static INLINE void
+static INLINE boolean
 nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
 			      unsigned start, unsigned count)
 {
@@ -223,6 +245,9 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
 
 	map += start;
 
+	if (nv50->vbo_fifo)
+		return nv50_push_elements_u32(nv50, map, count);
+
 	while (count) {
 		unsigned nr = count > 2047 ? 2047 : count;
 
@@ -232,6 +257,7 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
 		count -= nr;
 		map += nr;
 	}
+	return TRUE;
 }
 
 boolean
@@ -244,6 +270,7 @@ nv50_draw_elements(struct pipe_context *pipe,
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct pipe_screen *pscreen = pipe->screen;
 	void *map;
+	boolean ret;
 	
 	map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
 
@@ -258,23 +285,25 @@ nv50_draw_elements(struct pipe_context *pipe,
 	OUT_RING  (chan, nv50_prim(mode));
 	switch (indexSize) {
 	case 1:
-		nv50_draw_elements_inline_u08(nv50, map, start, count);
+		ret = nv50_draw_elements_inline_u08(nv50, map, start, count);
 		break;
 	case 2:
-		nv50_draw_elements_inline_u16(nv50, map, start, count);
+		ret = nv50_draw_elements_inline_u16(nv50, map, start, count);
 		break;
 	case 4:
-		nv50_draw_elements_inline_u32(nv50, map, start, count);
+		ret = nv50_draw_elements_inline_u32(nv50, map, start, count);
 		break;
 	default:
 		assert(0);
+		ret = FALSE;
+		break;
 	}
 	BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
 	OUT_RING  (chan, 0);
 
 	pipe_buffer_unmap(pscreen, indexBuffer);
-	pipe->flush(pipe, 0, NULL);
-	return TRUE;
+
+	return ret;
 }
 
 static INLINE boolean
@@ -341,17 +370,24 @@ nv50_vbo_validate(struct nv50_context *nv50)
 {
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nouveau_stateobj *vtxbuf, *vtxfmt, *vtxattr;
-	unsigned i;
+	unsigned i, n_ve;
 
 	/* don't validate if Gallium took away our buffers */
 	if (nv50->vtxbuf_nr == 0)
 		return;
+	nv50->vbo_fifo = 0;
+
+	for (i = 0; i < nv50->vtxbuf_nr; ++i)
+		if (nv50->vtxbuf[i].stride &&
+		    !(nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX))
+			nv50->vbo_fifo = 0xffff;
+
+	n_ve = MAX2(nv50->vtxelt_nr, nv50->state.vtxelt_nr);
 
 	vtxattr = NULL;
-	vtxbuf = so_new(nv50->vtxelt_nr * 7, nv50->vtxelt_nr * 4);
-	vtxfmt = so_new(nv50->vtxelt_nr + 1, 0);
-	so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0),
-		nv50->vtxelt_nr);
+	vtxbuf = so_new(n_ve * 7, nv50->vtxelt_nr * 4);
+	vtxfmt = so_new(n_ve + 1, 0);
+	so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), n_ve);
 
 	for (i = 0; i < nv50->vtxelt_nr; i++) {
 		struct pipe_vertex_element *ve = &nv50->vtxelt[i];
@@ -367,10 +403,19 @@ nv50_vbo_validate(struct nv50_context *nv50)
 			so_method(vtxbuf, tesla,
 				  NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
 			so_data  (vtxbuf, 0);
+
+			nv50->vbo_fifo &= ~(1 << i);
 			continue;
 		}
 		so_data(vtxfmt, hw | i);
 
+		if (nv50->vbo_fifo) {
+			so_method(vtxbuf, tesla,
+				  NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
+			so_data  (vtxbuf, 0);
+			continue;
+		}
+
 		so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 3);
 		so_data  (vtxbuf, 0x20000000 | vb->stride);
 		so_reloc (vtxbuf, bo, vb->buffer_offset +
@@ -389,6 +434,13 @@ nv50_vbo_validate(struct nv50_context *nv50)
 			  NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
 			  NOUVEAU_BO_LOW, 0, 0);
 	}
+	for (; i < n_ve; ++i) {
+		so_data  (vtxfmt, 0x7e080010);
+
+		so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
+		so_data  (vtxbuf, 0);
+	}
+	nv50->state.vtxelt_nr = nv50->vtxelt_nr;
 
 	so_ref (vtxfmt, &nv50->state.vtxfmt);
 	so_ref (vtxbuf, &nv50->state.vtxbuf);
@@ -398,3 +450,320 @@ nv50_vbo_validate(struct nv50_context *nv50)
 	so_ref (NULL, &vtxattr);
 }
 
+typedef void (*pfn_push)(struct nouveau_channel *, void *);
+/* State for pushing vertex data through the channel; set by emit_prepare(). */
+struct nv50_vbo_emitctx
+{
+	pfn_push push[16];	/* per-element emit routine */
+	void *map[16];		/* per-element source pointer */
+	unsigned stride[16];	/* per-element step between vertices */
+	unsigned nr_ve;		/* number of entries used in the arrays */
+	unsigned vtx_dwords;	/* dwords pushed per vertex */
+	unsigned vtx_max;	/* vertices per packet (512 / vtx_dwords) */
+};
+
+static INLINE void
+emit_vtx_next(struct nouveau_channel *chan, struct nv50_vbo_emitctx *emit)
+{
+	unsigned i;
+	/* push each element of the current vertex, then step to the next one */
+	for (i = 0; i < emit->nr_ve; ++i) {
+		emit->push[i](chan, emit->map[i]);
+		emit->map[i] = (uint8_t *)emit->map[i] + emit->stride[i];
+	}
+}
+
+static INLINE void
+emit_vtx(struct nouveau_channel *chan, struct nv50_vbo_emitctx *emit,
+	 uint32_t vi)
+{
+	unsigned i;
+	/* element i of vertex vi lives at map[i] + stride[i] * vi (bytes) */
+	for (i = 0; i < emit->nr_ve; ++i)
+		emit->push[i](chan, (uint8_t *)emit->map[i] + emit->stride[i] * vi);
+}
+
+/* Map all bound vertex buffers that are not mapped yet (for reading).
+ * If one fails, the buffers before it are unmapped and FALSE is returned. */
+static INLINE boolean
+nv50_map_vbufs(struct nv50_context *nv50)
+{
+	int i;
+
+	for (i = 0; i < nv50->vtxbuf_nr; ++i) {
+		struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i];
+		unsigned size, delta;
+
+		if (nouveau_bo(vb->buffer)->map)
+			continue;
+		size = vb->stride * (vb->max_index + 1);
+		delta = vb->buffer_offset;
+		if (!size) /* fall back to everything past the offset */
+			size = vb->buffer->size - vb->buffer_offset;
+
+		if (nouveau_bo_map_range(nouveau_bo(vb->buffer),
+					 delta, size, NOUVEAU_BO_RD))
+			break;
+	}
+	if (i == nv50->vtxbuf_nr)
+		return TRUE;
+	/* vtxbuf[i] never got mapped, so start undoing at the one before it */
+	while (--i >= 0)
+		nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer));
+	return FALSE;
+}
+
+/* Unmap every bound vertex buffer whose bo currently has a map pointer. */
+static INLINE void
+nv50_unmap_vbufs(struct nv50_context *nv50)
+{
+	unsigned i = 0;
+	for (; i < nv50->vtxbuf_nr; ++i)
+		if (nouveau_bo(nv50->vtxbuf[i].buffer)->map)
+			nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer));
+}
+
+static void
+emit_b32_1(struct nouveau_channel *chan, void *data)
+{
+	const uint32_t *src = data;
+
+	OUT_RING(chan, src[0]); /* a single 32-bit word */
+}
+
+static void
+emit_b32_2(struct nouveau_channel *chan, void *data)
+{
+	const uint32_t *src = data;
+	unsigned c;
+	for (c = 0; c < 2; ++c) /* two consecutive 32-bit words */
+		OUT_RING(chan, src[c]);
+}
+
+static void
+emit_b32_3(struct nouveau_channel *chan, void *data)
+{
+	const uint32_t *src = data;
+	unsigned c;
+
+	for (c = 0; c < 3; ++c) /* three consecutive 32-bit words */
+		OUT_RING(chan, src[c]);
+}
+
+static void
+emit_b32_4(struct nouveau_channel *chan, void *data)
+{
+	const uint32_t *src = data;
+	unsigned c;
+
+	/* four consecutive 32-bit words, in memory order */
+	for (c = 0; c < 4; ++c)
+		OUT_RING(chan, src[c]);
+}
+
+static void
+emit_b16_1(struct nouveau_channel *chan, void *data)
+{
+	const uint16_t *half = data;
+
+	OUT_RING(chan, half[0]); /* one 16-bit value in its own ring word */
+}
+
+static void
+emit_b16_3(struct nouveau_channel *chan, void *data)
+{
+	uint16_t *v = data;
+	/* 3 x 16 bit in two words; widen v[1] so << 16 cannot overflow int */
+	OUT_RING(chan, ((uint32_t)v[1] << 16) | v[0]);
+	OUT_RING(chan, v[2]);
+}
+
+static void
+emit_b08_1(struct nouveau_channel *chan, void *data)
+{
+	const uint8_t *byte = data;
+
+	OUT_RING(chan, byte[0]); /* one 8-bit value in its own ring word */
+}
+
+static void
+emit_b08_3(struct nouveau_channel *chan, void *data)
+{
+	const uint8_t *b = data;
+	/* byte 0 in bits 0-7, byte 1 in bits 8-15, byte 2 in bits 16-23 */
+	OUT_RING(chan, b[0] | (b[1] << 8) | (b[2] << 16));
+}
+
+/* Map the vertex buffers and set up a push routine per vbo_fifo element. */
+static boolean
+emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
+	     unsigned start)
+{
+	unsigned i;
+
+	if (nv50_map_vbufs(nv50) == FALSE)
+		return FALSE;
+
+	emit->nr_ve = 0;
+	emit->vtx_dwords = 0;
+
+	for (i = 0; i < nv50->vtxelt_nr; ++i) {
+		struct pipe_vertex_element *ve = &nv50->vtxelt[i];
+		struct pipe_vertex_buffer *vb;
+		unsigned n, size;
+
+		vb = &nv50->vtxbuf[ve->vertex_buffer_index];
+		if (!(nv50->vbo_fifo & (1 << i)))
+			continue;
+		n = emit->nr_ve++;
+		emit->stride[n] = vb->stride;
+		emit->map[n] = (uint8_t *)nouveau_bo(vb->buffer)->map +
+			(start * vb->stride + ve->src_offset);
+		size = pf_size_x(ve->src_format) << pf_exp2(ve->src_format);
+		assert(ve->nr_components > 0 && ve->nr_components <= 4);
+
+		/* It shouldn't be necessary to push the implicit 1s
+		 * for case 3 and size 8 cases 1, 2, 3.
+		 */
+		switch (size) {
+		default:
+			NOUVEAU_ERR("unsupported vtxelt size: %u\n", size);
+			nv50_unmap_vbufs(nv50); /* failure leaves nothing mapped */
+			return FALSE;
+		case 32:
+			switch (ve->nr_components) {
+			case 1: emit->push[n] = emit_b32_1; break;
+			case 2: emit->push[n] = emit_b32_2; break;
+			case 3: emit->push[n] = emit_b32_3; break;
+			case 4: emit->push[n] = emit_b32_4; break;
+			}
+			emit->vtx_dwords += ve->nr_components;
+			break;
+		case 16:
+			switch (ve->nr_components) {
+			case 1: emit->push[n] = emit_b16_1; break;
+			case 2: emit->push[n] = emit_b32_1; break;
+			case 3: emit->push[n] = emit_b16_3; break;
+			case 4: emit->push[n] = emit_b32_2; break;
+			}
+			emit->vtx_dwords += (ve->nr_components + 1) >> 1;
+			break;
+		case 8:
+			switch (ve->nr_components) {
+			case 1: emit->push[n] = emit_b08_1; break;
+			case 2: emit->push[n] = emit_b16_1; break;
+			case 3: emit->push[n] = emit_b08_3; break;
+			case 4: emit->push[n] = emit_b32_1; break;
+			}
+			emit->vtx_dwords += 1;
+			break;
+		}
+	}
+
+	if (!emit->vtx_dwords) { /* no element selected: would divide by 0 */
+		nv50_unmap_vbufs(nv50);
+		return FALSE;
+	}
+	emit->vtx_max = 512 / emit->vtx_dwords;
+	return TRUE;
+}
+
+/* Push count consecutive vertices, beginning at start, as VERTEX_DATA. */
+static boolean
+nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_vbo_emitctx emit;
+
+	if (!emit_prepare(nv50, &emit, start))
+		return FALSE;
+
+	/* one packet per batch of at most emit.vtx_max vertices */
+	while (count) {
+		const unsigned batch = MIN2(count, emit.vtx_max);
+		unsigned v;
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000,
+			   batch * emit.vtx_dwords);
+		for (v = batch; v; --v)
+			emit_vtx_next(chan, &emit);
+		count -= batch;
+	}
+	nv50_unmap_vbufs(nv50);
+	return TRUE;
+}
+
+/* Push count vertices selected by the 32-bit indices in map. */
+static boolean
+nv50_push_elements_u32(struct nv50_context *nv50, uint32_t *map, unsigned count)
+{
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_vbo_emitctx emit;
+
+	if (!emit_prepare(nv50, &emit, 0))
+		return FALSE;
+	while (count) {
+		const unsigned batch = MIN2(count, emit.vtx_max);
+		unsigned k;
+
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000,
+			   batch * emit.vtx_dwords);
+		for (k = 0; k < batch; ++k)
+			emit_vtx(chan, &emit, map[k]);
+		map += batch;
+		count -= batch;
+	}
+	nv50_unmap_vbufs(nv50);
+	return TRUE;
+}
+
+/* Push count vertices selected by the 16-bit indices in map. */
+static boolean
+nv50_push_elements_u16(struct nv50_context *nv50, uint16_t *map, unsigned count)
+{
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_vbo_emitctx emit;
+
+	if (!emit_prepare(nv50, &emit, 0))
+		return FALSE;
+	while (count) {
+		const unsigned batch = MIN2(count, emit.vtx_max);
+		unsigned k;
+
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000,
+			   batch * emit.vtx_dwords);
+		for (k = 0; k < batch; ++k)
+			emit_vtx(chan, &emit, map[k]);
+		map += batch;
+		count -= batch;
+	}
+	nv50_unmap_vbufs(nv50);
+	return TRUE;
+}
+
+/* Push count vertices selected by the 8-bit indices in map. */
+static boolean
+nv50_push_elements_u08(struct nv50_context *nv50, uint8_t *map, unsigned count)
+{
+	struct nouveau_channel *chan = nv50->screen->base.channel;
+	struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nv50_vbo_emitctx emit;
+
+	if (!emit_prepare(nv50, &emit, 0))
+		return FALSE;
+	while (count) {
+		const unsigned batch = MIN2(count, emit.vtx_max);
+		unsigned k;
+
+		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000,
+			   batch * emit.vtx_dwords);
+		for (k = 0; k < batch; ++k)
+			emit_vtx(chan, &emit, map[k]);
+		map += batch;
+		count -= batch;
+	}
+	nv50_unmap_vbufs(nv50);
+	return TRUE;
+}
diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile
index 8d0c6e33bb1..121b65063f3 100644
--- a/src/gallium/drivers/r300/Makefile
+++ b/src/gallium/drivers/r300/Makefile
@@ -4,11 +4,10 @@ include $(TOP)/configs/current
 LIBNAME = r300
 
 C_SOURCES = \
-	r3xx_fs.c \
-	r5xx_fs.c \
 	r300_chipset.c \
 	r300_clear.c \
 	r300_context.c \
+	r300_debug.c \
 	r300_emit.c \
 	r300_flush.c \
 	r300_fs.c \
@@ -18,15 +17,13 @@ C_SOURCES = \
 	r300_state.c \
 	r300_state_derived.c \
 	r300_state_invariant.c \
+	r300_vbo.c \
 	r300_vs.c \
-	r300_surface.c \
 	r300_texture.c \
 	r300_tgsi_to_rc.c
 
 LIBRARY_INCLUDES = \
-	-I$(TOP)/src/mesa/drivers/dri/r300/compiler \
-	-I$(TOP)/src/mesa \
-	-I$(TOP)/include
+	-I$(TOP)/src/mesa/drivers/dri/r300/compiler
 
 COMPILER_ARCHIVE = $(TOP)/src/mesa/drivers/dri/r300/compiler/libr300compiler.a
 
diff --git a/src/gallium/drivers/r300/SConscript b/src/gallium/drivers/r300/SConscript
index 493d7b28bc3..97989040d2e 100644
--- a/src/gallium/drivers/r300/SConscript
+++ b/src/gallium/drivers/r300/SConscript
@@ -1,12 +1,14 @@
 Import('*')
 
+r300compiler = SConscript('#/src/mesa/drivers/dri/r300/compiler/SConscript')
+
 env = env.Clone()
+# add the paths for r300compiler
+env.Append(CPPPATH = ['#/src/mesa/drivers/dri/r300/compiler', '#/include', '#/src/mesa'])
 
 r300 = env.ConvenienceLibrary(
     target = 'r300',
     source = [
-        'r3xx_fs.c',
-        'r5xx_fs.c',
         'r300_chipset.c',
         'r300_clear.c',
         'r300_context.c',
@@ -21,9 +23,9 @@ r300 = env.ConvenienceLibrary(
         'r300_state_derived.c',
         'r300_state_invariant.c',
         'r300_vs.c',
-        'r300_surface.c',
         'r300_texture.c',
-    ])
+        'r300_tgsi_to_rc.c',
+    ] + r300compiler) + r300compiler
 
 Export('r300')
 
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
index d138866d33c..51fdb82ff34 100644
--- a/src/gallium/drivers/r300/r300_chipset.c
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -21,6 +21,7 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
 #include "r300_chipset.h"
+
 #include "util/u_debug.h"
 
 /* r300_chipset: A file all to itself for deducing the various properties of
@@ -31,7 +32,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
 {
     /* Reasonable defaults */
     caps->num_vert_fpus = 4;
-    caps->has_tcl = getenv("RADEON_NO_TCL") ? FALSE : TRUE;
+    caps->has_tcl = debug_get_bool_option("RADEON_NO_TCL", FALSE) ? FALSE : TRUE;
     caps->is_r500 = FALSE;
     caps->high_second_pipe = FALSE;
 
diff --git a/src/gallium/drivers/r300/r300_chipset.h b/src/gallium/drivers/r300/r300_chipset.h
index 322d4a57e41..0633a8b8a72 100644
--- a/src/gallium/drivers/r300/r300_chipset.h
+++ b/src/gallium/drivers/r300/r300_chipset.h
@@ -33,9 +33,11 @@ struct r300_capabilities {
     /* Chipset family */
     int family;
     /* The number of vertex floating-point units */
-    int num_vert_fpus;
+    unsigned num_vert_fpus;
     /* The number of fragment pipes */
-    int num_frag_pipes;
+    unsigned num_frag_pipes;
+    /* The number of z pipes */
+    unsigned num_z_pipes;
     /* Whether or not TCL is physically present */
     boolean has_tcl;
     /* Whether or not this is an RV515 or newer; R500s have many differences
diff --git a/src/gallium/drivers/r300/r300_clear.c b/src/gallium/drivers/r300/r300_clear.c
index 8b9cb819ae6..02d6d504fc0 100644
--- a/src/gallium/drivers/r300/r300_clear.c
+++ b/src/gallium/drivers/r300/r300_clear.c
@@ -21,6 +21,9 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
 #include "r300_clear.h"
+#include "r300_context.h"
+
+#include "util/u_clear.h"
 
 /* Clears currently bound buffers. */
 void r300_clear(struct pipe_context* pipe,
diff --git a/src/gallium/drivers/r300/r300_clear.h b/src/gallium/drivers/r300/r300_clear.h
index cd5900565e8..b8fcdf273c7 100644
--- a/src/gallium/drivers/r300/r300_clear.h
+++ b/src/gallium/drivers/r300/r300_clear.h
@@ -23,9 +23,7 @@
 #ifndef R300_CLEAR_H
 #define R300_CLEAR_H
 
-#include "util/u_clear.h"
-
-#include "r300_context.h"
+struct pipe_context;
 
 void r300_clear(struct pipe_context* pipe,
                 unsigned buffers,
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index da67bc29b89..ae23329b83f 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -20,87 +20,50 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
-#include "r300_context.h"
+#include "draw/draw_context.h"
 
-static boolean r300_draw_range_elements(struct pipe_context* pipe,
-                                        struct pipe_buffer* indexBuffer,
-                                        unsigned indexSize,
-                                        unsigned minIndex,
-                                        unsigned maxIndex,
-                                        unsigned mode,
-                                        unsigned start,
-                                        unsigned count)
-{
-    struct r300_context* r300 = r300_context(pipe);
-    int i;
-
-    for (i = 0; i < r300->vertex_buffer_count; i++) {
-        void* buf = pipe_buffer_map(pipe->screen,
-                                    r300->vertex_buffers[i].buffer,
-                                    PIPE_BUFFER_USAGE_CPU_READ);
-        draw_set_mapped_vertex_buffer(r300->draw, i, buf);
-    }
+#include "tgsi/tgsi_scan.h"
 
-    if (indexBuffer) {
-        void* indices = pipe_buffer_map(pipe->screen, indexBuffer,
-                                        PIPE_BUFFER_USAGE_CPU_READ);
-        draw_set_mapped_element_buffer_range(r300->draw, indexSize,
-                                             minIndex, maxIndex, indices);
-    } else {
-        draw_set_mapped_element_buffer(r300->draw, 0, NULL);
-    }
+#include "util/u_hash_table.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
 
-    draw_set_mapped_constant_buffer(r300->draw,
-            r300->shader_constants[PIPE_SHADER_VERTEX].constants,
-            r300->shader_constants[PIPE_SHADER_VERTEX].count *
-                (sizeof(float) * 4));
-
-    draw_arrays(r300->draw, mode, start, count);
-
-    for (i = 0; i < r300->vertex_buffer_count; i++) {
-        pipe_buffer_unmap(pipe->screen, r300->vertex_buffers[i].buffer);
-        draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
-    }
-
-    if (indexBuffer) {
-        pipe_buffer_unmap(pipe->screen, indexBuffer);
-        draw_set_mapped_element_buffer_range(r300->draw, 0, start,
-                                             start + count - 1, NULL);
-    }
-
-    return TRUE;
-}
-
-static boolean r300_draw_elements(struct pipe_context* pipe,
-                                  struct pipe_buffer* indexBuffer,
-                                  unsigned indexSize, unsigned mode,
-                                  unsigned start, unsigned count)
+#include "r300_clear.h"
+#include "r300_context.h"
+#include "r300_flush.h"
+#include "r300_query.h"
+#include "r300_render.h"
+#include "r300_screen.h"
+#include "r300_state_derived.h"
+#include "r300_state_invariant.h"
+#include "r300_winsys.h"
+
+static enum pipe_error r300_clear_hash_table(void* key, void* value,
+                                             void* data)
 {
-    return r300_draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0,
-                                    mode, start, count);
+    FREE(key);
+    FREE(value);
+    return PIPE_OK;
 }
 
-static boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
-                                unsigned start, unsigned count)
+static void r300_destroy_context(struct pipe_context* context)
 {
-    return r300_draw_elements(pipe, NULL, 0, mode, start, count);
-}
-
-static void r300_destroy_context(struct pipe_context* context) {
     struct r300_context* r300 = r300_context(context);
     struct r300_query* query, * temp;
 
+    util_hash_table_foreach(r300->shader_hash_table, r300_clear_hash_table,
+        NULL);
+    util_hash_table_destroy(r300->shader_hash_table);
+
     draw_destroy(r300->draw);
 
     /* Free the OQ BO. */
     context->screen->buffer_destroy(r300->oqbo);
 
     /* If there are any queries pending or not destroyed, remove them now. */
-    if (r300->query_list) {
-        foreach_s(query, temp, r300->query_list) {
-            remove_from_list(query);
-            FREE(query);
-        }
+    foreach_s(query, temp, &r300->query_list) {
+        remove_from_list(query);
+        FREE(query);
     }
 
     FREE(r300->blend_color_state);
@@ -111,32 +74,40 @@ static void r300_destroy_context(struct pipe_context* context) {
 }
 
 static unsigned int
-r300_is_texture_referenced( struct pipe_context *pipe,
-			    struct pipe_texture *texture,
-			    unsigned face, unsigned level)
+r300_is_texture_referenced(struct pipe_context *pipe,
+                           struct pipe_texture *texture,
+                           unsigned face, unsigned level)
 {
-   /**
-    * FIXME: Optimize.
-    */
+    struct pipe_buffer* buf = 0;
+
+    r300_get_texture_buffer(texture, &buf, NULL);
 
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+    return pipe->is_buffer_referenced(pipe, buf);
 }
 
 static unsigned int
-r300_is_buffer_referenced( struct pipe_context *pipe,
-			   struct pipe_buffer *buf)
+r300_is_buffer_referenced(struct pipe_context *pipe,
+                          struct pipe_buffer *buf)
+{
+    /* This only checks to see whether actual hardware buffers are
+     * referenced. Since we use managed BOs and transfers, it's actually not
+     * possible for pipe_buffers to ever reference the actual hardware, so
+     * buffers are never referenced. */
+    return 0;
+}
+
+static void r300_flush_cb(void *data)
 {
-   /**
-    * FIXME: Optimize.
-    */
+    struct r300_context* const cs_context_copy = data;
 
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+    cs_context_copy->context.flush(&cs_context_copy->context, 0, NULL);
 }
 
 struct pipe_context* r300_create_context(struct pipe_screen* screen,
                                          struct r300_winsys* r300_winsys)
 {
     struct r300_context* r300 = CALLOC_STRUCT(r300_context);
+    struct r300_screen* r300screen = r300_screen(screen);
 
     if (!r300)
         return NULL;
@@ -144,19 +115,31 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300->winsys = r300_winsys;
 
     r300->context.winsys = (struct pipe_winsys*)r300_winsys;
-    r300->context.screen = r300_screen(screen);
+    r300->context.screen = screen;
+
+    r300_init_debug(r300);
 
     r300->context.destroy = r300_destroy_context;
 
     r300->context.clear = r300_clear;
 
-    r300->context.draw_arrays = r300_draw_arrays;
-    r300->context.draw_elements = r300_draw_elements;
-    r300->context.draw_range_elements = r300_draw_range_elements;
+    if (r300screen->caps->has_tcl)
+    {
+        r300->context.draw_arrays = r300_draw_arrays;
+        r300->context.draw_elements = r300_draw_elements;
+        r300->context.draw_range_elements = r300_draw_range_elements;
+    }
+    else
+    {
+        assert(0);
+    }
 
     r300->context.is_texture_referenced = r300_is_texture_referenced;
     r300->context.is_buffer_referenced = r300_is_buffer_referenced;
 
+    r300->shader_hash_table = util_hash_table_create(r300_shader_key_hash,
+        r300_shader_key_compare);
+
     r300->blend_color_state = CALLOC_STRUCT(r300_blend_color_state);
     r300->rs_block = CALLOC_STRUCT(r300_rs_block);
     r300->scissor_state = CALLOC_STRUCT(r300_scissor_state);
@@ -175,18 +158,20 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     /* Open up the OQ BO. */
     r300->oqbo = screen->buffer_create(screen, 4096,
             PIPE_BUFFER_USAGE_VERTEX, 4096);
+    make_empty_list(&r300->query_list);
 
     r300_init_flush_functions(r300);
 
     r300_init_query_functions(r300);
 
-    r300_init_surface_functions(r300);
+    /* r300_init_surface_functions(r300); */
 
     r300_init_state_functions(r300);
 
     r300_emit_invariant_state(r300);
+
+    r300->winsys->set_flush_cb(r300->winsys, r300_flush_cb, r300);
     r300->dirty_state = R300_NEW_KITCHEN_SINK;
     r300->dirty_hw++;
-
     return &r300->context;
 }
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index f78492d4aa9..f954ba7f9aa 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -23,20 +23,10 @@
 #ifndef R300_CONTEXT_H
 #define R300_CONTEXT_H
 
-#include "draw/draw_context.h"
 #include "draw/draw_vertex.h"
 
 #include "pipe/p_context.h"
-
-#include "tgsi/tgsi_scan.h"
-
-#include "util/u_memory.h"
-#include "util/u_simple_list.h"
-
-#include "r300_clear.h"
-#include "r300_query.h"
-#include "r300_screen.h"
-#include "r300_winsys.h"
+#include "pipe/p_inlines.h"
 
 struct r300_fragment_shader;
 struct r300_vertex_shader;
@@ -44,6 +34,7 @@ struct r300_vertex_shader;
 struct r300_blend_state {
     uint32_t blend_control;       /* R300_RB3D_CBLEND: 0x4e04 */
     uint32_t alpha_blend_control; /* R300_RB3D_ABLEND: 0x4e08 */
+    uint32_t color_channel_mask;  /* R300_RB3D_COLOR_CHANNEL_MASK: 0x4e0c */
     uint32_t rop;                 /* R300_RB3D_ROPCNTL: 0x4e18 */
     uint32_t dither;              /* R300_RB3D_DITHER_CTL: 0x4e50 */
 };
@@ -62,7 +53,6 @@ struct r300_dsa_state {
     uint32_t z_buffer_control;  /* R300_ZB_CNTL: 0x4f00 */
     uint32_t z_stencil_control; /* R300_ZB_ZSTENCILCNTL: 0x4f04 */
     uint32_t stencil_ref_mask;  /* R300_ZB_STENCILREFMASK: 0x4f08 */
-    uint32_t z_buffer_top;      /* R300_ZB_ZTOP: 0x4f14 */
     uint32_t stencil_ref_bf;    /* R500_ZB_STENCILREFMASK_BF: 0x4fd4 */
 };
 
@@ -88,6 +78,7 @@ struct r300_rs_state {
     uint32_t line_stipple_config;   /* R300_GA_LINE_STIPPLE_CONFIG: 0x4328 */
     uint32_t line_stipple_value;    /* R300_GA_LINE_STIPPLE_VALUE: 0x4260 */
     uint32_t color_control;         /* R300_GA_COLOR_CONTROL: 0x4278 */
+    uint32_t polygon_mode;          /* R300_GA_POLY_MODE: 0x4288 */
 };
 
 struct r300_rs_block {
@@ -124,13 +115,17 @@ struct r300_viewport_state {
     uint32_t vte_control; /* R300_VAP_VTE_CNTL:      0x20b0 */
 };
 
+struct r300_ztop_state {
+    uint32_t z_buffer_top;      /* R300_ZB_ZTOP: 0x4f14 */
+};
+
 #define R300_NEW_BLEND           0x00000001
 #define R300_NEW_BLEND_COLOR     0x00000002
 #define R300_NEW_CLIP            0x00000004
-#define R300_NEW_CONSTANTS       0x00000008
-#define R300_NEW_DSA             0x00000010
-#define R300_NEW_FRAMEBUFFERS    0x00000020
-#define R300_NEW_FRAGMENT_SHADER 0x00000040
+#define R300_NEW_DSA             0x00000008
+#define R300_NEW_FRAMEBUFFERS    0x00000010
+#define R300_NEW_FRAGMENT_SHADER 0x00000020
+#define R300_NEW_FRAGMENT_SHADER_CONSTANTS    0x00000040
 #define R300_NEW_RASTERIZER      0x00000080
 #define R300_NEW_RS_BLOCK        0x00000100
 #define R300_NEW_SAMPLER         0x00000200
@@ -140,8 +135,10 @@ struct r300_viewport_state {
 #define R300_ANY_NEW_TEXTURES    0x03fc0000
 #define R300_NEW_VERTEX_FORMAT   0x04000000
 #define R300_NEW_VERTEX_SHADER   0x08000000
-#define R300_NEW_VIEWPORT        0x10000000
-#define R300_NEW_KITCHEN_SINK    0x1fffffff
+#define R300_NEW_VERTEX_SHADER_CONSTANTS    0x10000000
+#define R300_NEW_VIEWPORT        0x20000000
+#define R300_NEW_QUERY           0x40000000
+#define R300_NEW_KITCHEN_SINK    0x7fffffff
 
 /* The next several objects are not pure Radeon state; they inherit from
  * various Gallium classes. */
@@ -172,6 +169,10 @@ struct r300_query {
     unsigned int count;
     /* The offset of this query into the query buffer, in bytes. */
     unsigned offset;
+    /* if we've flushed the query */
+    boolean flushed;
+    /* if begin has been emitted */
+    boolean begin_emitted;
     /* Linked list members. */
     struct r300_query* prev;
     struct r300_query* next;
@@ -184,12 +185,30 @@ struct r300_texture {
     /* Offsets into the buffer. */
     unsigned offset[PIPE_MAX_TEXTURE_LEVELS];
 
-    /* Stride (pitch?) of this texture in bytes */
-    unsigned stride;
+    /* A pitch for each mip-level */
+    unsigned pitch[PIPE_MAX_TEXTURE_LEVELS];
+
+    /* Size of one zslice or face based on the texture target */
+    unsigned layer_size[PIPE_MAX_TEXTURE_LEVELS];
+
+    /**
+     * If non-zero, override the natural texture layout with
+     * a custom stride (in bytes).
+     *
+     * \note Mipmapping fails for textures with a non-natural layout!
+     *
+     * \sa r300_texture_get_stride
+     */
+    unsigned stride_override;
 
     /* Total size of this texture, in bytes. */
     unsigned size;
 
+    /* Whether this texture has non-power-of-two dimensions.
+     * It can be either a regular texture or a rectangle one.
+     */
+    boolean is_npot;
+
     /* Pipe buffer backing this texture. */
     struct pipe_buffer* buffer;
 
@@ -197,25 +216,23 @@ struct r300_texture {
     struct r300_texture_state state;
 };
 
-struct r300_vertex_format {
+struct r300_vertex_info {
     /* Parent class */
     struct vertex_info vinfo;
-    /* R300_VAP_PROG_STREAK_CNTL_[0-7] */
-    uint32_t vap_prog_stream_cntl[8];
-    /* R300_VAP_PROG_STREAK_CNTL_EXT_[0-7] */
-    uint32_t vap_prog_stream_cntl_ext[8];
     /* Map of vertex attributes into PVS memory for HW TCL,
      * or GA memory for SW TCL. */
     int vs_tab[16];
     /* Map of rasterizer attributes from GB through RS to US. */
     int fs_tab[16];
-};
 
-static struct pipe_viewport_state r300_viewport_identity = {
-    .scale = {1.0, 1.0, 1.0, 1.0},
-    .translate = {0.0, 0.0, 0.0, 0.0},
+    /* R300_VAP_PROG_STREAM_CNTL_[0-7] */
+    uint32_t vap_prog_stream_cntl[8];
+    /* R300_VAP_PROG_STREAM_CNTL_EXT_[0-7] */
+    uint32_t vap_prog_stream_cntl_ext[8];
 };
 
+extern struct pipe_viewport_state r300_viewport_identity;
+
 struct r300_context {
     /* Parent class */
     struct pipe_context context;
@@ -233,7 +250,14 @@ struct r300_context {
     /* Occlusion query buffer. */
     struct pipe_buffer* oqbo;
     /* Query list. */
-    struct r300_query* query_list;
+    struct r300_query *query_current;
+    struct r300_query query_list;
+
+    /* Shader hash table. Used to store vertex formatting information, which
+     * depends on the combination of both currently loaded shaders. */
+    struct util_hash_table* shader_hash_table;
+    /* Vertex formatting information. */
+    struct r300_vertex_info* vertex_info;
 
     /* Various CSO state objects. */
     /* Blend state. */
@@ -262,19 +286,27 @@ struct r300_context {
     /* Texture states. */
     struct r300_texture* textures[8];
     int texture_count;
-    /* Vertex buffers for Gallium. */
-    struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
-    int vertex_buffer_count;
-    /* Vertex information. */
-    struct r300_vertex_format vertex_info;
     /* Vertex shader. */
     struct r300_vertex_shader* vs;
     /* Viewport state. */
     struct r300_viewport_state* viewport_state;
+    /* ZTOP state. */
+    struct r300_ztop_state ztop_state;
+
+    /* Vertex buffers for Gallium. */
+    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+    int vertex_buffer_count;
+    /* Vertex elements for Gallium. */
+    struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+    int vertex_element_count;
+
     /* Bitmask of dirty state objects. */
     uint32_t dirty_state;
     /* Flag indicating whether or not the HW is dirty. */
     uint32_t dirty_hw;
+
+    /** Combination of DBG_xxx flags */
+    unsigned debug;
 };
 
 /* Convenience cast wrapper. */
@@ -288,4 +320,42 @@ struct draw_stage* r300_draw_stage(struct r300_context* r300);
 void r300_init_state_functions(struct r300_context* r300);
 void r300_init_surface_functions(struct r300_context* r300);
 
+/* Debug functionality. */
+
+/**
+ * Debug flags to disable/enable certain groups of debugging outputs.
+ *
+ * \note These may be rather coarse, and the grouping may be impractical.
+ * If you find, while debugging the driver, that a different grouping
+ * of these flags would be beneficial, just feel free to change them
+ * but make sure to update the documentation in r300_debug.c to reflect
+ * those changes.
+ */
+/*@{*/
+#define DBG_HELP    0x0000001
+#define DBG_FP      0x0000002
+#define DBG_VP      0x0000004
+#define DBG_CS      0x0000008
+#define DBG_DRAW    0x0000010
+#define DBG_TEX     0x0000020
+#define DBG_FALL    0x0000040
+/*@}*/
+
+static INLINE boolean DBG_ON(struct r300_context * ctx, unsigned flags)
+{
+    return (ctx->debug & flags) != 0; /* TRUE when any requested DBG_xxx bit is set */
+}
+
+static INLINE void DBG(struct r300_context * ctx, unsigned flags, const char * fmt, ...)
+{
+    va_list args;
+    if (!DBG_ON(ctx, flags))
+        return; /* none of the requested DBG_xxx flags is enabled */
+    va_start(args, fmt);
+    debug_vprintf(fmt, args);
+    va_end(args);
+}
+
+void r300_init_debug(struct r300_context * ctx);
+
 #endif /* R300_CONTEXT_H */
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 71b142c0dbf..86ba91db52e 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -34,8 +34,8 @@
 
 #define MAX_CS_SIZE 64 * 1024 / 4
 
-#define VERY_VERBOSE_CS 0
-#define VERY_VERBOSE_REGISTERS 0
+#define VERY_VERBOSE_CS 1
+#define VERY_VERBOSE_REGISTERS 1
 
 /* XXX stolen from radeon_drm.h */
 #define RADEON_GEM_DOMAIN_CPU  0x1
@@ -49,7 +49,8 @@
     (RADEON_CP_PACKET0 | ((count) << 16) | ((register) >> 2))
 
 #define CS_LOCALS(context) \
-    struct r300_winsys* cs_winsys = context->winsys; \
+    struct r300_context* const cs_context_copy = (context); \
+    struct r300_winsys* cs_winsys = cs_context_copy->winsys; \
     int cs_count = 0;
 
 #define CHECK_CS(size) \
@@ -58,7 +59,7 @@
 #define BEGIN_CS(size) do { \
     CHECK_CS(size); \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: BEGIN_CS, count %d, in %s (%s:%d)\n", \
                 size, __FUNCTION__, __FILE__, __LINE__); \
     } \
     cs_winsys->begin_cs(cs_winsys, (size), \
@@ -67,47 +68,55 @@
 } while (0)
 
 #define OUT_CS(value) do { \
+    if (VERY_VERBOSE_CS || VERY_VERBOSE_REGISTERS) { \
+        DBG(cs_context_copy, DBG_CS, "r300: writing %08x\n", value); \
+    } \
     cs_winsys->write_cs_dword(cs_winsys, (value)); \
     cs_count--; \
 } while (0)
 
 #define OUT_CS_32F(value) do { \
+    if (VERY_VERBOSE_CS || VERY_VERBOSE_REGISTERS) { \
+        DBG(cs_context_copy, DBG_CS, "r300: writing %f\n", value); \
+    } \
     cs_winsys->write_cs_dword(cs_winsys, fui(value)); \
     cs_count--; \
 } while (0)
 
 #define OUT_CS_REG(register, value) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing 0x%08X to register 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing 0x%08X to register 0x%04X\n", \
             value, register); \
     assert(register); \
-    OUT_CS(CP_PACKET0(register, 0)); \
-    OUT_CS(value); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0(register, 0)); \
+    cs_winsys->write_cs_dword(cs_winsys, value); \
+    cs_count -= 2; \
 } while (0)
 
 /* Note: This expects count to be the number of registers,
  * not the actual packet0 count! */
 #define OUT_CS_REG_SEQ(register, count) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing register sequence of %d to 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing register sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
-    OUT_CS(CP_PACKET0(register, ((count) - 1))); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1))); \
+    cs_count--; \
 } while (0)
 
 #define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \
-    debug_printf("r300: writing relocation for buffer %p, offset %d, " \
+    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for buffer %p, offset %d, " \
             "domains (%d, %d, %d)\n", \
         bo, offset, rd, wd, flags); \
     assert(bo); \
-    OUT_CS(offset); \
+    cs_winsys->write_cs_dword(cs_winsys, offset); \
     cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
-    cs_count -= 2; \
+    cs_count -= 3; \
 } while (0)
 
 #define END_CS do { \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
+        DBG(cs_context_copy, DBG_CS, "r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
                 __FILE__, __LINE__); \
     } \
     if (cs_count != 0) \
@@ -117,7 +126,7 @@
 
 #define FLUSH_CS do { \
     if (VERY_VERBOSE_CS) { \
-        debug_printf("r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
+        DBG(cs_context_copy, DBG_CS, "r300: FLUSH_CS in %s (%s:%d)\n\n", __FUNCTION__, \
                 __FILE__, __LINE__); \
     } \
     cs_winsys->flush_cs(cs_winsys); \
@@ -127,27 +136,29 @@
 
 #define OUT_CS_ONE_REG(register, count) do { \
     if (VERY_VERBOSE_REGISTERS) \
-        debug_printf("r300: writing data sequence of %d to 0x%04X\n", \
+        DBG(cs_context_copy, DBG_CS, "r300: writing data sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
-    OUT_CS(CP_PACKET0(register, ((count) - 1)) | RADEON_ONE_REG_WR); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1)) | RADEON_ONE_REG_WR); \
+    cs_count--; \
 } while (0)
 
 #define CP_PACKET3(op, count) \
     (RADEON_CP_PACKET3 | (op) | ((count) << 16))
 
 #define OUT_CS_PKT3(op, count) do { \
-    OUT_CS(CP_PACKET3(op, count)); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET3(op, count)); \
+    cs_count--; \
 } while (0)
 
 #define OUT_CS_INDEX_RELOC(bo, offset, count, rd, wd, flags) do { \
-    debug_printf("r300: writing relocation for index buffer %p," \
+    DBG(cs_context_copy, DBG_CS, "r300: writing relocation for index buffer %p," \
             "offset %d\n", bo, offset); \
     assert(bo); \
-    OUT_CS(offset); \
-    OUT_CS(count); \
+    cs_winsys->write_cs_dword(cs_winsys, offset); \
+    cs_winsys->write_cs_dword(cs_winsys, count); \
     cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
-    cs_count -= 2; \
+    cs_count -= 4; \
 } while (0)
 
 #endif /* R300_CS_H */
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
new file mode 100644
index 00000000000..2a6ed54ac9b
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2009 Nicolai Haehnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_context.h"
+
+#include <ctype.h>
+
+
+struct debug_option {
+    const char * name;        /* token compared against entries of RADEON_DEBUG */
+    unsigned flag;            /* DBG_xxx bit(s) ORed into the context's debug mask */
+    const char * description; /* text printed in the option listing */
+};
+
+static struct debug_option debug_options[] = { /* scanned in order; first match wins */
+    { "help", DBG_HELP, "Helpful meta-information about the driver" },
+    { "fp", DBG_FP, "Fragment program handling" },
+    { "vp", DBG_VP, "Vertex program handling" },
+    { "cs", DBG_CS, "Command submissions" },
+    { "draw", DBG_DRAW, "Draw and emit" },
+    { "tex", DBG_TEX, "Textures" },
+    { "fall", DBG_FALL, "Fallbacks" },
+
+    { "all", ~0, "Convenience option that enables all debug flags" },
+
+    /* must be last */
+    { 0, 0, 0 }
+};
+
+/* Parse RADEON_DEBUG (space- or comma-separated option names) into ctx->debug. */
+void r300_init_debug(struct r300_context * ctx)
+{
+    const char * options = debug_get_option("RADEON_DEBUG", 0);
+    boolean printhint = FALSE;
+    size_t length;
+    struct debug_option * opt;
+
+    if (options) {
+        while(*options) {
+            if (*options == ' ' || *options == ',') {
+                options++;
+                continue;
+            }
+
+            length = strcspn(options, " ,");
+            /* Exact match only: strncmp alone would let "f" select "fp". */
+            for(opt = debug_options; opt->name; ++opt) {
+                if (strlen(opt->name) == length && !strncmp(options, opt->name, length)) {
+                    ctx->debug |= opt->flag;
+                    break;
+                }
+            }
+            if (!opt->name) {
+                /* Report just this token, not the rest of the string. */
+                debug_printf("Unknown debug option: %.*s\n", (int)length, options);
+                printhint = TRUE;
+            }
+            options += length;
+        }
+        /* The variable was set but no flag got enabled: show the hint. */
+        if (!ctx->debug)
+            printhint = TRUE;
+    }
+
+    if (printhint || ctx->debug & DBG_HELP) {
+        debug_printf("You can enable debug output by setting the RADEON_DEBUG environment variable\n"
+                     "to a comma-separated list of debug options. Available options are:\n");
+        for(opt = debug_options; opt->name; ++opt) {
+            debug_printf("    %s: %s\n", opt->name, opt->description);
+        }
+    }
+}
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index bd4d59e6f1a..eeb97a2d370 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -22,19 +22,27 @@
 
 /* r300_emit: Functions for emitting state. */
 
-#include "r300_emit.h"
+#include "util/u_math.h"
 
+#include "r300_context.h"
+#include "r300_cs.h"
+#include "r300_emit.h"
 #include "r300_fs.h"
+#include "r300_screen.h"
+#include "r300_state_derived.h"
+#include "r300_state_inlines.h"
+#include "r300_texture.h"
 #include "r300_vs.h"
 
 void r300_emit_blend_state(struct r300_context* r300,
                            struct r300_blend_state* blend)
 {
     CS_LOCALS(r300);
-    BEGIN_CS(7);
-    OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 2);
+    BEGIN_CS(8);
+    OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 3);
     OUT_CS(blend->blend_control);
     OUT_CS(blend->alpha_blend_control);
+    OUT_CS(blend->color_channel_mask);
     OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop);
     OUT_CS_REG(R300_RB3D_DITHER_CTL, blend->dither);
     END_CS;
@@ -95,19 +103,23 @@ void r300_emit_dsa_state(struct r300_context* r300,
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     CS_LOCALS(r300);
 
-    BEGIN_CS(r300screen->caps->is_r500 ? 8 : 8);
+    BEGIN_CS(r300screen->caps->is_r500 ? 10 : 8);
     OUT_CS_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
-    /* XXX figure out the r300 counterpart for this */
-    if (r300screen->caps->is_r500) {
-        /* OUT_CS_REG(R500_FG_ALPHA_VALUE, dsa->alpha_reference); */
-    }
+
+    /* not needed since we use the 8bit alpha ref */
+    /*if (r300screen->caps->is_r500) {
+        OUT_CS_REG(R500_FG_ALPHA_VALUE, dsa->alpha_reference);
+    }*/
+
     OUT_CS_REG_SEQ(R300_ZB_CNTL, 3);
     OUT_CS(dsa->z_buffer_control);
     OUT_CS(dsa->z_stencil_control);
     OUT_CS(dsa->stencil_ref_mask);
-    OUT_CS_REG(R300_ZB_ZTOP, dsa->z_buffer_top);
+    OUT_CS_REG(R300_ZB_ZTOP, r300->ztop_state.z_buffer_top);
+
+    /* XXX it seems r3xx doesn't support STENCILREFMASK_BF */
     if (r300screen->caps->is_r500) {
-        /* OUT_CS_REG(R500_ZB_STENCILREFMASK_BF, dsa->stencil_ref_bf); */
+        OUT_CS_REG(R500_ZB_STENCILREFMASK_BF, dsa->stencil_ref_bf);
     }
     END_CS;
 }
@@ -167,18 +179,15 @@ static uint32_t pack_float24(float f)
 }
 
 void r300_emit_fragment_program_code(struct r300_context* r300,
-                                     struct rX00_fragment_program_code* generic_code,
-                                     struct r300_constant_buffer* externals)
+                                     struct rX00_fragment_program_code* generic_code)
 {
     struct r300_fragment_program_code * code = &generic_code->code.r300;
-    struct rc_constant_list * constants = &generic_code->constants;
     int i;
     CS_LOCALS(r300);
 
     BEGIN_CS(15 +
              code->alu.length * 4 +
-             (code->tex.length ? (1 + code->tex.length) : 0) +
-             (constants->Count ? (1 + constants->Count * 4) : 0));
+             (code->tex.length ? (1 + code->tex.length) : 0));
 
     OUT_CS_REG(R300_US_CONFIG, code->config);
     OUT_CS_REG(R300_US_PIXSIZE, code->pixsize);
@@ -210,32 +219,41 @@ void r300_emit_fragment_program_code(struct r300_context* r300,
             OUT_CS(code->tex.inst[i]);
     }
 
-    if (constants->Count) {
-        OUT_CS_ONE_REG(R300_PFS_PARAM_0_X, constants->Count * 4);
-        for(i = 0; i < constants->Count; ++i) {
-            const float * data = get_shader_constant(r300, &constants->Constants[i], externals);
-            OUT_CS(pack_float24(data[0]));
-            OUT_CS(pack_float24(data[1]));
-            OUT_CS(pack_float24(data[2]));
-            OUT_CS(pack_float24(data[3]));
-        }
-    }
+    END_CS;
+}
+
+void r300_emit_fs_constant_buffer(struct r300_context* r300,
+                                  struct rc_constant_list* constants)
+{
+    int i;
+    CS_LOCALS(r300);
 
+    if (constants->Count == 0)
+        return;
+
+    BEGIN_CS(constants->Count * 4 + 1);
+    OUT_CS_REG_SEQ(R300_PFS_PARAM_0_X, constants->Count * 4);
+    for(i = 0; i < constants->Count; ++i) {
+        const float * data = get_shader_constant(r300,
+                                                 &constants->Constants[i],
+                                                 &r300->shader_constants[PIPE_SHADER_FRAGMENT]);
+        OUT_CS(pack_float24(data[0]));
+        OUT_CS(pack_float24(data[1]));
+        OUT_CS(pack_float24(data[2]));
+        OUT_CS(pack_float24(data[3]));
+    }
     END_CS;
 }
 
 void r500_emit_fragment_program_code(struct r300_context* r300,
-                                     struct rX00_fragment_program_code* generic_code,
-                                     struct r300_constant_buffer* externals)
+                                     struct rX00_fragment_program_code* generic_code)
 {
     struct r500_fragment_program_code * code = &generic_code->code.r500;
-    struct rc_constant_list * constants = &generic_code->constants;
     int i;
     CS_LOCALS(r300);
 
     BEGIN_CS(13 +
-             ((code->inst_end + 1) * 6) +
-             (constants->Count ? (3 + (constants->Count * 4)) : 0));
+             ((code->inst_end + 1) * 6));
     OUT_CS_REG(R500_US_CONFIG, 0);
     OUT_CS_REG(R500_US_PIXSIZE, code->max_temp_idx);
     OUT_CS_REG(R500_US_CODE_RANGE,
@@ -255,18 +273,30 @@ void r500_emit_fragment_program_code(struct r300_context* r300,
         OUT_CS(code->inst[i].inst5);
     }
 
-    if (constants->Count) {
-        OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_CONST);
-        OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, constants->Count * 4);
-        for (i = 0; i < constants->Count; i++) {
-            const float * data = get_shader_constant(r300, &constants->Constants[i], externals);
-            OUT_CS_32F(data[0]);
-            OUT_CS_32F(data[1]);
-            OUT_CS_32F(data[2]);
-            OUT_CS_32F(data[3]);
-        }
-    }
+    END_CS;
+}
+
+void r500_emit_fs_constant_buffer(struct r300_context* r300,
+                                  struct rc_constant_list* constants)
+{
+    int i;
+    CS_LOCALS(r300);
+    /* Upload Count vec4s as OUT_CS_32F values via R500_GA_US_VECTOR_DATA. */
+    if (constants->Count == 0)
+        return; /* nothing to upload, so no packet is emitted at all */
 
+    BEGIN_CS(constants->Count * 4 + 3);
+    OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_CONST);
+    OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, constants->Count * 4);
+    for (i = 0; i < constants->Count; i++) {
+        const float * data = get_shader_constant(r300,
+                                                 &constants->Constants[i],
+                                                 &r300->shader_constants[PIPE_SHADER_FRAGMENT]);
+        OUT_CS_32F(data[0]);
+        OUT_CS_32F(data[1]);
+        OUT_CS_32F(data[2]);
+        OUT_CS_32F(data[3]);
+    }
     END_CS;
 }
 
@@ -274,76 +304,84 @@ void r300_emit_fb_state(struct r300_context* r300,
                         struct pipe_framebuffer_state* fb)
 {
     struct r300_texture* tex;
-    unsigned pixpitch;
+    struct pipe_surface* surf;
     int i;
     CS_LOCALS(r300);
 
     BEGIN_CS((10 * fb->nr_cbufs) + (fb->zsbuf ? 10 : 0) + 4);
+    OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT,
+        R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
+        R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+    OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT,
+        R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
+        R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+
     for (i = 0; i < fb->nr_cbufs; i++) {
-        tex = (struct r300_texture*)fb->cbufs[i]->texture;
+        surf = fb->cbufs[i];
+        tex = (struct r300_texture*)surf->texture;
         assert(tex && tex->buffer && "cbuf is marked, but NULL!");
-        pixpitch = tex->stride / tex->tex.block.size;
 
         OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0 + (4 * i), 1);
-        OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+        OUT_CS_RELOC(tex->buffer, surf->offset, 0, RADEON_GEM_DOMAIN_VRAM, 0);
 
         OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0 + (4 * i), 1);
-        OUT_CS_RELOC(tex->buffer, pixpitch |
+        OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level] |
                      r300_translate_colorformat(tex->tex.format), 0,
                      RADEON_GEM_DOMAIN_VRAM, 0);
 
         OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i),
-            r300_translate_out_fmt(fb->cbufs[i]->format));
+            r300_translate_out_fmt(surf->format));
     }
 
     if (fb->zsbuf) {
-        tex = (struct r300_texture*)fb->zsbuf->texture;
+        surf = fb->zsbuf;
+        tex = (struct r300_texture*)surf->texture;
         assert(tex && tex->buffer && "zsbuf is marked, but NULL!");
-        pixpitch = tex->stride / tex->tex.block.size;
 
         OUT_CS_REG_SEQ(R300_ZB_DEPTHOFFSET, 1);
-        OUT_CS_RELOC(tex->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+        OUT_CS_RELOC(tex->buffer, surf->offset, 0, RADEON_GEM_DOMAIN_VRAM, 0);
 
         OUT_CS_REG(R300_ZB_FORMAT, r300_translate_zsformat(tex->tex.format));
 
         OUT_CS_REG_SEQ(R300_ZB_DEPTHPITCH, 1);
-        OUT_CS_RELOC(tex->buffer, pixpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+        OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level], 0,
+                     RADEON_GEM_DOMAIN_VRAM, 0);
     }
 
-    OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT,
-        R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
-        R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
-    OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT,
-        R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
-        R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
     END_CS;
 }
 
-void r300_emit_query_begin(struct r300_context* r300,
-                           struct r300_query* query)
+static void r300_emit_query_start(struct r300_context *r300)
 {
+    struct r300_capabilities *caps = r300_screen(r300->context.screen)->caps;
+    struct r300_query *query = r300->query_current;
     CS_LOCALS(r300);
 
+    if (!query)
+	return;
+    /* Start r300->query_current: select all pipes, then write 0 to ZPASS_DATA. */
     /* XXX This will almost certainly not return good results
      * for overlapping queries. */
-    BEGIN_CS(2);
+    BEGIN_CS(4);
+    if (caps->family == CHIP_FAMILY_RV530) {
+        OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
+    } else {
+        OUT_CS_REG(R300_SU_REG_DEST, R300_RASTER_PIPE_SELECT_ALL);
+    }
     OUT_CS_REG(R300_ZB_ZPASS_DATA, 0);
     END_CS;
+    query->begin_emitted = TRUE; /* r300_emit_query_end() only acts once this is set */
 }
 
-void r300_emit_query_end(struct r300_context* r300,
-                         struct r300_query* query)
+
+static void r300_emit_query_finish(struct r300_context *r300,
+                                   struct r300_query *query)
 {
     struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
     CS_LOCALS(r300);
 
-    if (!r300->winsys->add_buffer(r300->winsys, r300->oqbo,
-                0, RADEON_GEM_DOMAIN_GTT)) {
-        debug_printf("r300: There wasn't room for the OQ buffer!?"
-                " Oh noes!\n");
-    }
-
     assert(caps->num_frag_pipes);
+
     BEGIN_CS(6 * caps->num_frag_pipes + 2);
     /* I'm not so sure I like this switch, but it's hard to be elegant
      * when there's so many special cases...
@@ -380,6 +418,7 @@ void r300_emit_query_end(struct r300_context* r300,
             OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
             OUT_CS_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 0),
                     0, RADEON_GEM_DOMAIN_GTT, 0);
+            break;
         default:
             debug_printf("r300: Implementation error: Chipset reports %d"
                     " pixel pipes!\n", caps->num_frag_pipes);
@@ -389,14 +428,62 @@ void r300_emit_query_end(struct r300_context* r300,
     /* And, finally, reset it to normal... */
     OUT_CS_REG(R300_SU_REG_DEST, 0xF);
     END_CS;
+}
+
+static void rv530_emit_query_single(struct r300_context *r300,
+                                    struct r300_query *query)
+{
+    CS_LOCALS(r300);
+    /* Select pipe 0, point ZPASS_ADDR at query->offset in oqbo, reselect all pipes. */
+    BEGIN_CS(8);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_0);
+    OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+    OUT_CS_RELOC(r300->oqbo, query->offset, 0, RADEON_GEM_DOMAIN_GTT, 0);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
+    END_CS;
+}
+
+static void rv530_emit_query_double(struct r300_context *r300,
+                                    struct r300_query *query)
+{
+    CS_LOCALS(r300);
+    /* Pipe 0 gets query->offset in oqbo, pipe 1 the next uint32_t slot; then reselect all. */
+    BEGIN_CS(14);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_0);
+    OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+    OUT_CS_RELOC(r300->oqbo, query->offset, 0, RADEON_GEM_DOMAIN_GTT, 0);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_1);
+    OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
+    OUT_CS_RELOC(r300->oqbo, query->offset + sizeof(uint32_t), 0, RADEON_GEM_DOMAIN_GTT, 0);
+    OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
+    END_CS;
+}
+
+/* Emit the end-of-query packets for r300->query_current, if it was started. */
+void r300_emit_query_end(struct r300_context* r300)
+{
+    struct r300_query *query = r300->query_current;
+    struct r300_capabilities *caps = r300_screen(r300->context.screen)->caps;
+
+    /* Nothing to close unless a query is current and its start was emitted. */
+    if (!query || query->begin_emitted == FALSE)
+        return;
 
+    /* RV530 has its own helpers, picked by Z-pipe count; others share one. */
+    if (caps->family != CHIP_FAMILY_RV530) {
+        r300_emit_query_finish(r300, query);
+    } else if (caps->num_z_pipes == 2) {
+        rv530_emit_query_double(r300, query);
+    } else {
+        rv530_emit_query_single(r300, query);
+    }
 }
 
 void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs)
 {
     CS_LOCALS(r300);
 
-    BEGIN_CS(20);
+    BEGIN_CS(22);
     OUT_CS_REG(R300_VAP_CNTL_STATUS, rs->vap_control_status);
     OUT_CS_REG(R300_GA_POINT_SIZE, rs->point_size);
     OUT_CS_REG_SEQ(R300_GA_POINT_MINMAX, 2);
@@ -412,6 +499,7 @@ void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs)
     OUT_CS_REG(R300_GA_LINE_STIPPLE_CONFIG, rs->line_stipple_config);
     OUT_CS_REG(R300_GA_LINE_STIPPLE_VALUE, rs->line_stipple_value);
     OUT_CS_REG(R300_GA_COLOR_CONTROL, rs->color_control);
+    OUT_CS_REG(R300_GA_POLY_MODE, rs->polygon_mode);
     END_CS;
 }
 
@@ -422,6 +510,8 @@ void r300_emit_rs_block_state(struct r300_context* r300,
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     CS_LOCALS(r300);
 
+    DBG(r300, DBG_DRAW, "r300: RS emit:\n");
+
     BEGIN_CS(21);
     if (r300screen->caps->is_r500) {
         OUT_CS_REG_SEQ(R500_RS_IP_0, 8);
@@ -430,7 +520,7 @@ void r300_emit_rs_block_state(struct r300_context* r300,
     }
     for (i = 0; i < 8; i++) {
         OUT_CS(rs->ip[i]);
-        /* debug_printf("ip %d: 0x%08x\n", i, rs->ip[i]); */
+        DBG(r300, DBG_DRAW, "    : ip %d: 0x%08x\n", i, rs->ip[i]);
     }
 
     OUT_CS_REG_SEQ(R300_RS_COUNT, 2);
@@ -444,11 +534,11 @@ void r300_emit_rs_block_state(struct r300_context* r300,
     }
     for (i = 0; i < 8; i++) {
         OUT_CS(rs->inst[i]);
-        /* debug_printf("inst %d: 0x%08x\n", i, rs->inst[i]); */
+        DBG(r300, DBG_DRAW, "    : inst %d: 0x%08x\n", i, rs->inst[i]);
     }
 
-    /* debug_printf("count: 0x%08x inst_count: 0x%08x\n", rs->count,
-     *        rs->inst_count); */
+    DBG(r300, DBG_DRAW, "    : count: 0x%08x inst_count: 0x%08x\n",
+        rs->count, rs->inst_count);
 
     END_CS;
 }
@@ -470,10 +560,18 @@ void r300_emit_texture(struct r300_context* r300,
                        struct r300_texture* tex,
                        unsigned offset)
 {
+    uint32_t filter0 = sampler->filter0;
     CS_LOCALS(r300);
 
+    /* to emulate 1D textures through 2D ones correctly */
+    if (tex->tex.target == PIPE_TEXTURE_1D) {
+        filter0 &= ~R300_TX_WRAP_T_MASK;
+        filter0 |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE);
+    }
+
     BEGIN_CS(16);
-    OUT_CS_REG(R300_TX_FILTER0_0 + (offset * 4), sampler->filter0);
+    OUT_CS_REG(R300_TX_FILTER0_0 + (offset * 4), filter0 |
+        (offset << 28));
     OUT_CS_REG(R300_TX_FILTER1_0 + (offset * 4), sampler->filter1);
     OUT_CS_REG(R300_TX_BORDER_COLOR_0 + (offset * 4), sampler->border_color);
 
@@ -486,13 +584,58 @@ void r300_emit_texture(struct r300_context* r300,
     END_CS;
 }
 
-void r300_emit_vertex_buffer(struct r300_context* r300)
+/* Emit 3D_LOAD_VBPNTR for all vertex elements, starting at vertex `offset`. */
+void r300_emit_aos(struct r300_context* r300, unsigned offset)
+{
+    struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
+    struct pipe_vertex_element *velem = r300->vertex_element;
+    CS_LOCALS(r300);
+    unsigned i; /* unsigned, to match aos_count in the loop comparisons */
+    unsigned aos_count = r300->vertex_element_count;
+    /* Elements are packed in pairs: 3 dwords per pair, 2 for a trailing odd one. */
+    unsigned packet_size = (aos_count * 3 + 1) / 2;
+    BEGIN_CS(2 + packet_size + aos_count * 2);
+    OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, packet_size);
+    OUT_CS(aos_count);
+    for (i = 0; i + 1 < aos_count; i += 2) { /* not aos_count - 1: wraps at 0 */
+        int buf_num1 = velem[i].vertex_buffer_index;
+        int buf_num2 = velem[i+1].vertex_buffer_index;
+        assert(vbuf[buf_num1].stride % 4 == 0 && pf_get_size(velem[i].src_format) % 4 == 0);
+        assert(vbuf[buf_num2].stride % 4 == 0 && pf_get_size(velem[i+1].src_format) % 4 == 0);
+        OUT_CS((pf_get_size(velem[i].src_format) >> 2) | (vbuf[buf_num1].stride << 6) |
+               (pf_get_size(velem[i+1].src_format) << 14) | (vbuf[buf_num2].stride << 22));
+        OUT_CS(vbuf[buf_num1].buffer_offset + velem[i].src_offset +
+               offset * vbuf[buf_num1].stride);
+        OUT_CS(vbuf[buf_num2].buffer_offset + velem[i+1].src_offset +
+               offset * vbuf[buf_num2].stride);
+    }
+    if (aos_count & 1) {
+        int buf_num = velem[i].vertex_buffer_index;
+        assert(vbuf[buf_num].stride % 4 == 0 && pf_get_size(velem[i].src_format) % 4 == 0);
+        OUT_CS((pf_get_size(velem[i].src_format) >> 2) | (vbuf[buf_num].stride << 6));
+        OUT_CS(vbuf[buf_num].buffer_offset + velem[i].src_offset +
+               offset * vbuf[buf_num].stride);
+    }
+
+    /* XXX bare CS reloc: one per element, written around the OUT_CS macros. */
+    for (i = 0; i < aos_count; i++) {
+        cs_winsys->write_cs_reloc(cs_winsys,
+                                  vbuf[velem[i].vertex_buffer_index].buffer,
+                                  RADEON_GEM_DOMAIN_GTT,
+                                  0,
+                                  0);
+        cs_count -= 2; /* NOTE(review): presumably the 2 dwords per reloc reserved above — confirm */
+    }
+    END_CS;
+}
+#if 0
+void r300_emit_draw_packet(struct r300_context* r300)
 {
     CS_LOCALS(r300);
 
-    debug_printf("r300: Preparing vertex buffer %p for render, "
+    DBG(r300, DBG_DRAW, "r300: Preparing vertex buffer %p for render, "
             "vertex size %d\n", r300->vbo,
-            r300->vertex_info.vinfo.size);
+            r300->vertex_info->vinfo.size);
     /* Set the pointer to our vertex buffer. The emitted values are this:
      * PACKET3 [3D_LOAD_VBPNTR]
      * COUNT   [1]
@@ -503,50 +646,52 @@ void r300_emit_vertex_buffer(struct r300_context* r300)
     BEGIN_CS(7);
     OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, 3);
     OUT_CS(1);
-    OUT_CS(r300->vertex_info.vinfo.size |
-            (r300->vertex_info.vinfo.size << 8));
+    OUT_CS(r300->vertex_info->vinfo.size |
+            (r300->vertex_info->vinfo.size << 8));
     OUT_CS(r300->vbo_offset);
     OUT_CS_RELOC(r300->vbo, 0, RADEON_GEM_DOMAIN_GTT, 0, 0);
     END_CS;
 }
+#endif
 
 void r300_emit_vertex_format_state(struct r300_context* r300)
 {
     int i;
     CS_LOCALS(r300);
 
+    DBG(r300, DBG_DRAW, "r300: VAP/PSC emit:\n");
+    /* NOTE(review): 26 looks like 2 + (1+2) + (1+2) + (1+8) + (1+8) dwords — confirm. */
     BEGIN_CS(26);
-    OUT_CS_REG(R300_VAP_VTX_SIZE, r300->vertex_info.vinfo.size);
+    OUT_CS_REG(R300_VAP_VTX_SIZE, r300->vertex_info->vinfo.size);
 
     OUT_CS_REG_SEQ(R300_VAP_VTX_STATE_CNTL, 2);
-    OUT_CS(r300->vertex_info.vinfo.hwfmt[0]);
-    OUT_CS(r300->vertex_info.vinfo.hwfmt[1]);
+    OUT_CS(r300->vertex_info->vinfo.hwfmt[0]);
+    OUT_CS(r300->vertex_info->vinfo.hwfmt[1]);
     OUT_CS_REG_SEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
-    OUT_CS(r300->vertex_info.vinfo.hwfmt[2]);
-    OUT_CS(r300->vertex_info.vinfo.hwfmt[3]);
-    /* for (i = 0; i < 4; i++) {
-     *    debug_printf("hwfmt%d: 0x%08x\n", i,
-     *            r300->vertex_info.vinfo.hwfmt[i]);
-     * } */
+    OUT_CS(r300->vertex_info->vinfo.hwfmt[2]);
+    OUT_CS(r300->vertex_info->vinfo.hwfmt[3]);
+    for (i = 0; i < 4; i++) {
+       DBG(r300, DBG_DRAW, "    : hwfmt%d: 0x%08x\n", i,
+               r300->vertex_info->vinfo.hwfmt[i]);
+    }
 
     OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_0, 8);
     for (i = 0; i < 8; i++) {
-        OUT_CS(r300->vertex_info.vap_prog_stream_cntl[i]);
-        /* debug_printf("prog_stream_cntl%d: 0x%08x\n", i,
-         *        r300->vertex_info.vap_prog_stream_cntl[i]); */
+        OUT_CS(r300->vertex_info->vap_prog_stream_cntl[i]);
+        DBG(r300, DBG_DRAW, "    : prog_stream_cntl%d: 0x%08x\n", i,
+               r300->vertex_info->vap_prog_stream_cntl[i]);
     }
     OUT_CS_REG_SEQ(R300_VAP_PROG_STREAM_CNTL_EXT_0, 8);
     for (i = 0; i < 8; i++) {
-        OUT_CS(r300->vertex_info.vap_prog_stream_cntl_ext[i]);
-        /* debug_printf("prog_stream_cntl_ext%d: 0x%08x\n", i,
-         *        r300->vertex_info.vap_prog_stream_cntl_ext[i]); */
+        OUT_CS(r300->vertex_info->vap_prog_stream_cntl_ext[i]);
+        DBG(r300, DBG_DRAW, "    : prog_stream_cntl_ext%d: 0x%08x\n", i,
+               r300->vertex_info->vap_prog_stream_cntl_ext[i]);
     }
     END_CS;
 }
 
 void r300_emit_vertex_program_code(struct r300_context* r300,
-                                   struct r300_vertex_program_code* code,
-                                   struct r300_constant_buffer* constants)
+                                   struct r300_vertex_program_code* code)
 {
     int i;
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
@@ -559,12 +704,7 @@ void r300_emit_vertex_program_code(struct r300_context* r300,
         return;
     }
 
-    if (code->constants.Count) {
-        BEGIN_CS(14 + code->length + (code->constants.Count * 4));
-    } else {
-        BEGIN_CS(11 + code->length);
-    }
-
+    BEGIN_CS(9 + code->length);
     /* R300_VAP_PVS_CODE_CNTL_0
      * R300_VAP_PVS_CONST_CNTL
      * R300_VAP_PVS_CODE_CNTL_1
@@ -582,32 +722,50 @@ void r300_emit_vertex_program_code(struct r300_context* r300,
     for (i = 0; i < code->length; i++)
         OUT_CS(code->body.d[i]);
 
-    if (code->constants.Count) {
-        OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG,
-                (r300screen->caps->is_r500 ?
-                 R500_PVS_CONST_START : R300_PVS_CONST_START));
-        OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, code->constants.Count * 4);
-        for (i = 0; i < code->constants.Count; i++) {
-            const float * data = get_shader_constant(r300, &code->constants.Constants[i], constants);
-            OUT_CS_32F(data[0]);
-            OUT_CS_32F(data[1]);
-            OUT_CS_32F(data[2]);
-            OUT_CS_32F(data[3]);
-        }
-    }
-
     OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(10) |
             R300_PVS_NUM_CNTLRS(5) |
             R300_PVS_NUM_FPUS(r300screen->caps->num_vert_fpus) |
             R300_PVS_VF_MAX_VTX_NUM(12));
-    OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
     END_CS;
 }
 
 void r300_emit_vertex_shader(struct r300_context* r300,
                              struct r300_vertex_shader* vs)
 {
-    r300_emit_vertex_program_code(r300, &vs->code, &r300->shader_constants[PIPE_SHADER_VERTEX]);
+    r300_emit_vertex_program_code(r300, &vs->code); /* code only; constants go via r300_emit_vs_constant_buffer() */
+}
+
+void r300_emit_vs_constant_buffer(struct r300_context* r300,
+                                  struct rc_constant_list* constants)
+{
+    int i;
+    struct r300_screen* r300screen = r300_screen(r300->context.screen);
+    CS_LOCALS(r300);
+    /* Upload the vertex shader constants; bails out when has_tcl is unset. */
+    if (!r300screen->caps->has_tcl) {
+        debug_printf("r300: Implementation error: emit_vs_constant_buffer called,"
+        " but has_tcl is FALSE!\n");
+        return;
+    }
+
+    if (constants->Count == 0)
+        return; /* nothing to upload, so no packet is emitted at all */
+    /* Set the PVS upload index to the chip's constant start, then stream Count vec4s. */
+    BEGIN_CS(constants->Count * 4 + 3);
+    OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG,
+               (r300screen->caps->is_r500 ?
+               R500_PVS_CONST_START : R300_PVS_CONST_START));
+    OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, constants->Count * 4);
+    for (i = 0; i < constants->Count; i++) {
+        const float * data = get_shader_constant(r300,
+                                                 &constants->Constants[i],
+                                                 &r300->shader_constants[PIPE_SHADER_VERTEX]);
+        OUT_CS_32F(data[0]);
+        OUT_CS_32F(data[1]);
+        OUT_CS_32F(data[2]);
+        OUT_CS_32F(data[3]);
+    }
+    END_CS;
 }
 
 void r300_emit_viewport_state(struct r300_context* r300,
@@ -642,6 +800,15 @@ void r300_flush_textures(struct r300_context* r300)
     END_CS;
 }
 
+static void r300_flush_pvs(struct r300_context* r300)
+{
+    CS_LOCALS(r300);
+    /* Write 0 to VAP_PVS_STATE_FLUSH_REG; emitted before new VS code or constants. */
+    BEGIN_CS(2);
+    OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
+    END_CS;
+}
+
 /* Emit all dirty state. */
 void r300_emit_dirty_state(struct r300_context* r300)
 {
@@ -654,7 +821,8 @@ void r300_emit_dirty_state(struct r300_context* r300)
         return;
     }
 
-    r300_update_derived_state(r300);
+    /* Clean out BOs. */
+    r300->winsys->reset_bos(r300->winsys);
 
     /* XXX check size */
 validate:
@@ -681,7 +849,8 @@ validate:
     /* ...textures... */
     for (i = 0; i < r300->texture_count; i++) {
         tex = r300->textures[i];
-        assert(tex && tex->buffer && "texture is marked, but NULL!");
+        if (!tex)
+            continue;
         if (!r300->winsys->add_buffer(r300->winsys, tex->buffer,
                     RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0)) {
             r300->context.flush(&r300->context, 0, NULL);
@@ -702,7 +871,7 @@ validate:
             goto validate;
         }
     } else {
-        debug_printf("No VBO while emitting dirty state!\n");
+        // debug_printf("No VBO while emitting dirty state!\n");
     }
     if (!r300->winsys->validate(r300->winsys)) {
         r300->context.flush(&r300->context, 0, NULL);
@@ -715,6 +884,11 @@ validate:
         goto validate;
     }
 
+    if (r300->dirty_state & R300_NEW_QUERY) {
+        r300_emit_query_start(r300);
+        r300->dirty_state &= ~R300_NEW_QUERY;
+    }
+
     if (r300->dirty_state & R300_NEW_BLEND) {
         r300_emit_blend_state(r300, r300->blend_state);
         r300->dirty_state &= ~R300_NEW_BLEND;
@@ -737,13 +911,22 @@ validate:
 
     if (r300->dirty_state & R300_NEW_FRAGMENT_SHADER) {
         if (r300screen->caps->is_r500) {
-            r500_emit_fragment_program_code(r300, &r300->fs->code, &r300->shader_constants[PIPE_SHADER_FRAGMENT]);
+            r500_emit_fragment_program_code(r300, &r300->fs->code);
         } else {
-            r300_emit_fragment_program_code(r300, &r300->fs->code, &r300->shader_constants[PIPE_SHADER_FRAGMENT]);
+            r300_emit_fragment_program_code(r300, &r300->fs->code);
         }
         r300->dirty_state &= ~R300_NEW_FRAGMENT_SHADER;
     }
 
+    if (r300->dirty_state & R300_NEW_FRAGMENT_SHADER_CONSTANTS) {
+        if (r300screen->caps->is_r500) {
+            r500_emit_fs_constant_buffer(r300, &r300->fs->code.constants);
+        } else {
+            r300_emit_fs_constant_buffer(r300, &r300->fs->code.constants);
+        }
+        r300->dirty_state &= ~R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+    }
+
     if (r300->dirty_state & R300_NEW_FRAMEBUFFERS) {
         r300_emit_fb_state(r300, &r300->framebuffer_state);
         r300->dirty_state &= ~R300_NEW_FRAMEBUFFERS;
@@ -768,12 +951,13 @@ validate:
     if (r300->dirty_state &
             (R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES)) {
         for (i = 0; i < MIN2(r300->sampler_count, r300->texture_count); i++) {
-            if (r300->dirty_state &
-                    ((R300_NEW_SAMPLER << i) | (R300_NEW_TEXTURE << i))) {
-                r300_emit_texture(r300,
-                        r300->sampler_states[i],
-                        r300->textures[i],
-                        i);
+  	    if (r300->dirty_state &
+		((R300_NEW_SAMPLER << i) | (R300_NEW_TEXTURE << i))) {
+		if (r300->textures[i]) 
+		    r300_emit_texture(r300,
+				      r300->sampler_states[i],
+				      r300->textures[i],
+				      i);
                 r300->dirty_state &=
                     ~((R300_NEW_SAMPLER << i) | (R300_NEW_TEXTURE << i));
                 dirty_tex++;
@@ -796,17 +980,26 @@ validate:
         r300->dirty_state &= ~R300_NEW_VERTEX_FORMAT;
     }
 
+    if (r300->dirty_state & (R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS)) {
+        r300_flush_pvs(r300);
+    }
+
     if (r300->dirty_state & R300_NEW_VERTEX_SHADER) {
         r300_emit_vertex_shader(r300, r300->vs);
         r300->dirty_state &= ~R300_NEW_VERTEX_SHADER;
     }
 
+    if (r300->dirty_state & R300_NEW_VERTEX_SHADER_CONSTANTS) {
+        r300_emit_vs_constant_buffer(r300, &r300->vs->code.constants);
+        r300->dirty_state &= ~R300_NEW_VERTEX_SHADER_CONSTANTS;
+    }
+
     /* XXX
     assert(r300->dirty_state == 0);
     */
 
     /* Finally, emit the VBO. */
-    r300_emit_vertex_buffer(r300);
+    //r300_emit_vertex_buffer(r300);
 
     r300->dirty_hw++;
 }
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 350691d592d..7c83c5166de 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -23,16 +23,14 @@
 #ifndef R300_EMIT_H
 #define R300_EMIT_H
 
-#include "util/u_math.h"
-
 #include "r300_context.h"
-#include "r300_cs.h"
-#include "r300_screen.h"
-#include "r300_state_inlines.h"
+#include "radeon_code.h"
 
 struct rX00_fragment_program_code;
 struct r300_vertex_program_code;
 
+void r300_emit_aos(struct r300_context* r300, unsigned offset);
+
 void r300_emit_blend_state(struct r300_context* r300,
                            struct r300_blend_state* blend);
 
@@ -46,16 +44,25 @@ void r300_emit_dsa_state(struct r300_context* r300,
                          struct r300_dsa_state* dsa);
 
 void r300_emit_fragment_program_code(struct r300_context* r300,
-                                     struct rX00_fragment_program_code* generic_code,
-                                     struct r300_constant_buffer* externals);
+                                     struct rX00_fragment_program_code* generic_code);
+
+void r300_emit_fs_constant_buffer(struct r300_context* r300,
+                                  struct rc_constant_list* constants);
 
 void r500_emit_fragment_program_code(struct r300_context* r300,
-                                     struct rX00_fragment_program_code* generic_code,
-                                     struct r300_constant_buffer* externals);
+                                     struct rX00_fragment_program_code* generic_code);
+
+void r500_emit_fs_constant_buffer(struct r300_context* r300,
+                                  struct rc_constant_list* constants);
 
 void r300_emit_fb_state(struct r300_context* r300,
                         struct pipe_framebuffer_state* fb);
 
+void r300_emit_query_begin(struct r300_context* r300,
+                           struct r300_query* query);
+
+void r300_emit_query_end(struct r300_context* r300);
+
 void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs);
 
 void r300_emit_rs_block_state(struct r300_context* r300,
@@ -74,8 +81,10 @@ void r300_emit_vertex_buffer(struct r300_context* r300);
 void r300_emit_vertex_format_state(struct r300_context* r300);
 
 void r300_emit_vertex_program_code(struct r300_context* r300,
-                                   struct r300_vertex_program_code* code,
-                                   struct r300_constant_buffer* constants);
+                                   struct r300_vertex_program_code* code);
+
+void r300_emit_vs_constant_buffer(struct r300_context* r300,
+                                  struct rc_constant_list* constants);
 
 void r300_emit_vertex_shader(struct r300_context* r300,
                              struct r300_vertex_shader* vs);
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
index 0dff1c6f4fb..14a08241fc4 100644
--- a/src/gallium/drivers/r300/r300_flush.c
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -20,29 +20,48 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+
+#include "util/u_simple_list.h"
+
+#include "r300_context.h"
+#include "r300_cs.h"
+#include "r300_emit.h"
 #include "r300_flush.h"
+#include "r300_state_invariant.h"
 
 static void r300_flush(struct pipe_context* pipe,
                        unsigned flags,
                        struct pipe_fence_handle** fence)
 {
-    struct r300_context* r300 = r300_context(pipe);
-    CS_LOCALS(r300);
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_query *query;
 
+    CS_LOCALS(r300);
     /* We probably need to flush Draw, but we may have been called from
-     * within Draw. This feels kludgy, but it might be the best thing. */
-    if (!r300->draw->flushing) {
+     * within Draw. This feels kludgy, but it might be the best thing.
+     *
+     * Of course, the best thing is to kill Draw with fire. :3 */
+    if (r300->draw && !r300->draw->flushing) {
         draw_flush(r300->draw);
     }
 
+    r300_emit_query_end(r300); /* no-op unless a query is current and was started */
+
     if (r300->dirty_hw) {
         FLUSH_CS;
         r300_emit_invariant_state(r300);
         r300->dirty_state = R300_NEW_KITCHEN_SINK;
         r300->dirty_hw = 0;
     }
+    /* Mark every query on the list as flushed. */
+    foreach(query, &r300->query_list) {
+        query->flushed = TRUE;
+    }
 }
 
+
 void r300_init_flush_functions(struct r300_context* r300)
 {
     r300->context.flush = r300_flush;
diff --git a/src/gallium/drivers/r300/r300_flush.h b/src/gallium/drivers/r300/r300_flush.h
index 9a83d89daab..0e9e6106bb7 100644
--- a/src/gallium/drivers/r300/r300_flush.h
+++ b/src/gallium/drivers/r300/r300_flush.h
@@ -23,13 +23,6 @@
 #ifndef R300_FLUSH_H
 #define R300_FLUSH_H
 
-#include "draw/draw_private.h"
-
-#include "pipe/p_context.h"
-
-#include "r300_context.h"
-#include "r300_cs.h"
-
 void r300_init_flush_functions(struct r300_context* r300);
 
 #endif /* R300_FLUSH_H */
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 36463b9a2eb..29ddc84c411 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -21,10 +21,14 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
-#include "r300_fs.h"
+#include "tgsi/tgsi_dump.h"
 
+#include "r300_context.h"
+#include "r300_screen.h"
+#include "r300_fs.h"
 #include "r300_tgsi_to_rc.h"
 
+#include "radeon_code.h"
 #include "radeon_compiler.h"
 
 static void find_output_registers(struct r300_fragment_program_compiler * compiler,
@@ -96,7 +100,7 @@ void r300_translate_fragment_shader(struct r300_context* r300,
 
     memset(&compiler, 0, sizeof(compiler));
     rc_init(&compiler.Base);
-    compiler.Base.Debug = 1;
+    compiler.Base.Debug = DBG_ON(r300, DBG_FP);
 
     compiler.code = &fs->code;
     compiler.is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
@@ -126,9 +130,9 @@ void r300_translate_fragment_shader(struct r300_context* r300,
     /* Invoke the compiler */
     r3xx_compile_fragment_program(&compiler);
     if (compiler.Base.Error) {
-        /* Todo: Fail gracefully */
-        fprintf(stderr, "r300 FP: Compiler error\n");
-        abort();
+        /* XXX failover maybe? */
+        DBG(r300, DBG_FP, "r300: Error compiling fragment program: %s\n",
+            compiler.Base.ErrorMsg);
     }
 
     /* And, finally... */
diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h
index 9fab7894024..e831c30301b 100644
--- a/src/gallium/drivers/r300/r300_fs.h
+++ b/src/gallium/drivers/r300/r300_fs.h
@@ -24,11 +24,9 @@
 #ifndef R300_FS_H
 #define R300_FS_H
 
-#include "tgsi/tgsi_dump.h"
+#include "pipe/p_state.h"
 
-#include "r300_context.h"
-#include "r3xx_fs.h"
-#include "r5xx_fs.h"
+#include "tgsi/tgsi_scan.h"
 
 #include "radeon_code.h"
 
@@ -48,4 +46,10 @@ struct r300_fragment_shader {
 void r300_translate_fragment_shader(struct r300_context* r300,
                                     struct r300_fragment_shader* fs);
 
-    #endif /* R300_FS_H */
+/* TRUE iff `fs` is non-NULL and its compiled code is flagged as writing depth. */
+static inline boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs)
+{
+    boolean writes_depth = (fs && fs->code.writes_depth) ? TRUE : FALSE;
+    return writes_depth;
+}
+#endif /* R300_FS_H */
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 1d5185b417e..ca00b043c51 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -20,15 +20,23 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+
+#include "r300_context.h"
+#include "r300_screen.h"
+#include "r300_cs.h"
+#include "r300_emit.h"
 #include "r300_query.h"
+#include "r300_reg.h"
 
-static struct pipe_query* r300_create_query(struct pipe_context* pipe,
+static struct pipe_query *r300_create_query(struct pipe_context *pipe,
                                             unsigned query_type)
 {
-    struct r300_context* r300 = r300_context(pipe);
-    struct r300_screen* r300screen = r300_screen(r300->context.screen);
-    unsigned query_size = r300screen->caps->num_frag_pipes * 4;
-    struct r300_query* q, * qptr;
+    struct r300_context *r300 = r300_context(pipe);
+    struct r300_screen *r300screen = r300_screen(r300->context.screen);
+    unsigned query_size;
+    struct r300_query *q, *qptr;
 
     q = CALLOC_STRUCT(r300_query);
 
@@ -37,13 +45,16 @@ static struct pipe_query* r300_create_query(struct pipe_context* pipe,
 
     q->active = FALSE;
 
-    if (!r300->query_list) {
-        r300->query_list = q;
-    } else if (!is_empty_list(r300->query_list)) {
-        qptr = last_elem(r300->query_list);
+    if (r300screen->caps->family == CHIP_FAMILY_RV530)
+	query_size = r300screen->caps->num_z_pipes * sizeof(uint32_t);
+    else
+	query_size = r300screen->caps->num_frag_pipes * sizeof(uint32_t);
+
+    if (!is_empty_list(&r300->query_list)) {
+        qptr = last_elem(&r300->query_list);
         q->offset = qptr->offset + query_size;
-        insert_at_tail(r300->query_list, q);
     }
+    insert_at_tail(&r300->query_list, q);
 
     /* XXX */
     if (q->offset >= 4096) {
@@ -69,24 +80,26 @@ static void r300_begin_query(struct pipe_context* pipe,
     struct r300_context* r300 = r300_context(pipe);
     struct r300_query* q = (struct r300_query*)query;
 
+    assert(r300->query_current == NULL);
+
     map = pipe->screen->buffer_map(pipe->screen, r300->oqbo,
             PIPE_BUFFER_USAGE_CPU_WRITE);
     map += q->offset / 4;
-    *map = ~0;
+    *map = ~0U;
     pipe->screen->buffer_unmap(pipe->screen, r300->oqbo);
 
-    r300_emit_dirty_state(r300);
-    r300_emit_query_begin(r300, q);
+    q->flushed = FALSE;
+    r300->query_current = q;
+    r300->dirty_state |= R300_NEW_QUERY;
 }
 
 static void r300_end_query(struct pipe_context* pipe,
-                           struct pipe_query* query)
+	                   struct pipe_query* query)
 {
     struct r300_context* r300 = r300_context(pipe);
-    struct r300_query* q = (struct r300_query*)query;
 
-    r300_emit_dirty_state(r300);
-    r300_emit_query_end(r300, q);
+    r300_emit_query_end(r300);
+    r300->query_current = NULL;
 }
 
 static boolean r300_get_query_result(struct pipe_context* pipe,
@@ -96,28 +109,36 @@ static boolean r300_get_query_result(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
-    struct r300_query* q = (struct r300_query*)query;
+    struct r300_query *q = (struct r300_query*)query;
     unsigned flags = PIPE_BUFFER_USAGE_CPU_READ;
     uint32_t* map;
-    uint32_t temp;
-    unsigned i;
+    uint32_t temp = 0;
+    unsigned i, num_results;
 
-    if (wait) {
+    if (q->flushed == FALSE)
         pipe->flush(pipe, 0, NULL);
-    } else {
+    if (!wait) {
         flags |= PIPE_BUFFER_USAGE_DONTBLOCK;
     }
 
     map = pipe->screen->buffer_map(pipe->screen, r300->oqbo, flags);
+    if (!map)
+        return FALSE;
     map += q->offset / 4;
-    for (i = 0; i < r300screen->caps->num_frag_pipes; i++) {
-        if (*map == ~0) {
+
+    if (r300screen->caps->family == CHIP_FAMILY_RV530)
+        num_results = r300screen->caps->num_z_pipes;
+    else
+        num_results = r300screen->caps->num_frag_pipes;
+
+    for (i = 0; i < num_results; i++) {
+        if (*map == ~0U) {
             /* Looks like our results aren't ready yet. */
             if (wait) {
                 debug_printf("r300: Despite waiting, OQ results haven't"
                         " come in yet.\n");
             }
-            temp = ~0;
+            temp = ~0U;
             break;
         }
         temp += *map;
@@ -125,7 +146,7 @@ static boolean r300_get_query_result(struct pipe_context* pipe,
     }
     pipe->screen->buffer_unmap(pipe->screen, r300->oqbo);
 
-    if (temp == ~0) {
+    if (temp == ~0U) {
         /* Our results haven't been written yet... */
         return FALSE;
     }
diff --git a/src/gallium/drivers/r300/r300_query.h b/src/gallium/drivers/r300/r300_query.h
index 4f50e8f8440..48876da3123 100644
--- a/src/gallium/drivers/r300/r300_query.h
+++ b/src/gallium/drivers/r300/r300_query.h
@@ -23,10 +23,6 @@
 #ifndef R300_QUERY_H
 #define R300_QUERY_H
 
-#include "r300_context.h"
-#include "r300_cs.h"
-#include "r300_reg.h"
-
 struct r300_context;
 
 static INLINE struct r300_query* r300_query(struct pipe_query* q)
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 03cd219cde9..8ca785cb587 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -348,6 +348,27 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_WRITE_ENA_W                         8
 #       define R300_SWIZZLE1_SHIFT                      16
 
+#       define R300_VAP_SWIZZLE_X001 \
+        ((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ZERO << R300_SWIZZLE_SELECT_Y_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ZERO << R300_SWIZZLE_SELECT_Z_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ONE << R300_SWIZZLE_SELECT_W_SHIFT) | \
+         (0xf << R300_WRITE_ENA_SHIFT))
+
+#       define R300_VAP_SWIZZLE_XY01 \
+        ((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) | \
+         (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ZERO << R300_SWIZZLE_SELECT_Z_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ONE << R300_SWIZZLE_SELECT_W_SHIFT) | \
+         (0xf << R300_WRITE_ENA_SHIFT))
+
+#       define R300_VAP_SWIZZLE_XYZ1 \
+        ((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) | \
+         (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) | \
+         (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) | \
+         (R300_SWIZZLE_SELECT_FP_ONE << R300_SWIZZLE_SELECT_W_SHIFT) | \
+         (0xf << R300_WRITE_ENA_SHIFT))
+
 #       define R300_VAP_SWIZZLE_XYZW \
         ((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) | \
          (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) | \
@@ -841,10 +862,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_POINTSIZE_X_MASK          0xffff0000
 #       define R300_POINTSIZE_MAX             (R300_POINTSIZE_Y_MASK / 6)
 
-/* Blue fill color */
+/* Red fill color */
 #define R500_GA_FILL_R                                0x4220
 
-/* Blue fill color */
+/* Green fill color */
 #define R500_GA_FILL_G                                0x4224
 
 /* Blue fill color */
@@ -1172,6 +1193,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* SU Depth Offset value */
 #define R300_SU_DEPTH_OFFSET                0x42c4
 
+#define R300_SU_REG_DEST		    0x42c8
+#	define R300_RASTER_PIPE_SELECT_0	(1 << 0)
+#	define R300_RASTER_PIPE_SELECT_1	(1 << 1)
+#	define R300_RASTER_PIPE_SELECT_2	(1 << 2)
+#	define R300_RASTER_PIPE_SELECT_3	(1 << 3)
+#	define R300_RASTER_PIPE_SELECT_ALL	0xf
+
 
 /* BEGIN: Rasterization / Interpolators - many guesses */
 
@@ -1478,6 +1506,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_TX_PITCH_EN                  (1 << 31)
 #       define R300_TX_WIDTH(x)                  ((x) << 0)
 #       define R300_TX_HEIGHT(x)                 ((x) << 11)
+#       define R300_TX_DEPTH(x)                  ((x) << 22)
 #       define R300_TX_NUM_LEVELS(x)             ((x) << 26)
 
 #define R300_TX_FORMAT1_0                   0x44C0
@@ -1855,6 +1884,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_RGB_ADDR0(x)                ((x) << 0)
 #       define R300_RGB_ADDR1(x)                ((x) << 6)
 #       define R300_RGB_ADDR2(x)                ((x) << 12)
+#       define R300_RGB_TARGET(x)               ((x) << 29)
 
 #define R300_US_ALU_ALPHA_ADDR_0                 0x47C0
 #       define R300_ALU_SRC0A_SHIFT             0
@@ -1872,9 +1902,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_ALU_DSTA_REG                (1 << 23)
 #       define R300_ALU_DSTA_OUTPUT             (1 << 24)
 #		define R300_ALU_DSTA_DEPTH              (1 << 27)
-#       define R300_ALPHA_ADDR0(x)                ((x) << 0)
-#       define R300_ALPHA_ADDR1(x)                ((x) << 6)
-#       define R300_ALPHA_ADDR2(x)                ((x) << 12)
+#       define R300_ALPHA_ADDR0(x)              ((x) << 0)
+#       define R300_ALPHA_ADDR1(x)              ((x) << 6)
+#       define R300_ALPHA_ADDR2(x)              ((x) << 12)
+#       define R300_ALPHA_TARGET(x)             ((x) << 25)
 
 #define R300_US_ALU_RGB_INST_0                   0x48C0
 #       define R300_ALU_ARGC_SRC0C_XYZ          0
@@ -2094,6 +2125,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R500_FG_ALPHA_VALUE                0x4be0
 #	define R500_FG_ALPHA_VALUE_MASK 0x0000ffff
 
+#define RV530_FG_ZBREG_DEST                 0x4be8
+#	define RV530_FG_ZBREG_DEST_PIPE_SELECT_0             (1 << 0)
+#	define RV530_FG_ZBREG_DEST_PIPE_SELECT_1             (1 << 1)
+#	define RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL           (3 << 0)
 /* gap */
 
 /* Fragment program parameters in 7.16 floating point */
@@ -2383,6 +2418,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_Z_WRITE_ENABLE		 (1 << 2)
 #	define R300_Z_SIGNED_COMPARE		 (1 << 3)
 #	define R300_STENCIL_FRONT_BACK		 (1 << 4)
+#   define R500_STENCIL_ZSIGNED_MAGNITUDE (1 << 5)
+#   define R500_STENCIL_REFMASK_FRONT_BACK (1 << 6)
 
 #define R300_ZB_ZSTENCILCNTL                   0x4f04
 	/* functions */
@@ -3312,10 +3349,6 @@ enum {
 
 #define R200_3D_DRAW_IMMD_2      0xC0003500
 
-/* XXX Oh look, stuff not brought over from docs yet */
-
-#define R300_SU_REG_DEST                    0x42C8
-
 #endif /* _R300_REG_H */
 
 /* *INDENT-ON* */
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index cd458d019ae..62e1456ed36 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -20,21 +20,318 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
-#include "draw/draw_pipe.h"
+/* r300_render: Vertex and index buffer primitive emission. Contains both
+ * HW TCL fastpath rendering, and SW TCL Draw-assisted rendering. */
+
+#include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
+
+#include "pipe/p_inlines.h"
+
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 
 #include "r300_cs.h"
 #include "r300_context.h"
+#include "r300_emit.h"
 #include "r300_reg.h"
+#include "r300_render.h"
 #include "r300_state_derived.h"
+#include "r300_vbo.h"
 
 /* r300_render: Vertex and index buffer primitive emission. */
+#define R300_MAX_VBO_SIZE  (1024 * 1024)
+
+uint32_t r300_translate_primitive(unsigned prim)
+{
+    switch (prim) {
+        case PIPE_PRIM_POINTS:
+            return R300_VAP_VF_CNTL__PRIM_POINTS;
+        case PIPE_PRIM_LINES:
+            return R300_VAP_VF_CNTL__PRIM_LINES;
+        case PIPE_PRIM_LINE_LOOP:
+            return R300_VAP_VF_CNTL__PRIM_LINE_LOOP;
+        case PIPE_PRIM_LINE_STRIP:
+            return R300_VAP_VF_CNTL__PRIM_LINE_STRIP;
+        case PIPE_PRIM_TRIANGLES:
+            return R300_VAP_VF_CNTL__PRIM_TRIANGLES;
+        case PIPE_PRIM_TRIANGLE_STRIP:
+            return R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP;
+        case PIPE_PRIM_TRIANGLE_FAN:
+            return R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN;
+        case PIPE_PRIM_QUADS:
+            return R300_VAP_VF_CNTL__PRIM_QUADS;
+        case PIPE_PRIM_QUAD_STRIP:
+            return R300_VAP_VF_CNTL__PRIM_QUAD_STRIP;
+        case PIPE_PRIM_POLYGON:
+            return R300_VAP_VF_CNTL__PRIM_POLYGON;
+        default:
+            return 0;
+    }
+}
+
+static void r300_emit_draw_arrays(struct r300_context *r300,
+                                  unsigned mode,
+                                  unsigned count)
+{
+    CS_LOCALS(r300);
+
+    BEGIN_CS(4);
+    OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, count);
+    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (count << 16) |
+           r300_translate_primitive(mode));
+    END_CS;
+}
+
+static void r300_emit_draw_elements(struct r300_context *r300,
+                                    struct pipe_buffer* indexBuffer,
+                                    unsigned indexSize,
+                                    unsigned minIndex,
+                                    unsigned maxIndex,
+                                    unsigned mode,
+                                    unsigned start,
+                                    unsigned count)
+{
+    uint32_t count_dwords;
+    uint32_t offset_dwords = indexSize * start / sizeof(uint32_t);
+    CS_LOCALS(r300);
+
+    /* XXX most of these are stupid */
+    assert(indexSize == 4 || indexSize == 2);
+    assert((start * indexSize)  % 4 == 0);
+    assert(offset_dwords == 0);
+
+    BEGIN_CS(10);
+    OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, maxIndex);
+    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
+    if (indexSize == 4) {
+        count_dwords = count + start;
+        OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
+               R300_VAP_VF_CNTL__INDEX_SIZE_32bit |
+               r300_translate_primitive(mode));
+    } else {
+        count_dwords = (count + start + 1) / 2;
+        OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
+               r300_translate_primitive(mode));
+    }
+
+    /* INDX_BUFFER is a truly special packet3.
+     * Unlike most other packet3, where the offset is after the count,
+     * the order is reversed, so the relocation ends up carrying the
+     * size of the indexbuf instead of the offset.
+     *
+     * XXX Fix offset
+     */
+    OUT_CS_PKT3(R300_PACKET3_INDX_BUFFER, 2);
+    OUT_CS(R300_INDX_BUFFER_ONE_REG_WR | (R300_VAP_PORT_IDX0 >> 2) |
+           (0 << R300_INDX_BUFFER_SKIP_SHIFT));
+    OUT_CS(offset_dwords);
+    OUT_CS_RELOC(indexBuffer, count_dwords,
+        RADEON_GEM_DOMAIN_GTT, 0, 0);
+
+    END_CS;
+}
+
+
+static boolean r300_setup_vertex_buffers(struct r300_context *r300)
+{
+    struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
+    struct pipe_vertex_element *velem = r300->vertex_element;
+
+validate:
+    for (int i = 0; i < r300->vertex_element_count; i++) {
+        if (!r300->winsys->add_buffer(r300->winsys,
+                vbuf[velem[i].vertex_buffer_index].buffer,
+            RADEON_GEM_DOMAIN_GTT, 0)) {
+            r300->context.flush(&r300->context, 0, NULL);
+            goto validate;
+        }
+    }
+
+    if (!r300->winsys->validate(r300->winsys)) {
+        r300->context.flush(&r300->context, 0, NULL);
+        return r300->winsys->validate(r300->winsys);
+    }
+
+    return TRUE;
+}
+
+/* This is the fast-path drawing & emission for HW TCL. */
+boolean r300_draw_range_elements(struct pipe_context* pipe,
+                                 struct pipe_buffer* indexBuffer,
+                                 unsigned indexSize,
+                                 unsigned minIndex,
+                                 unsigned maxIndex,
+                                 unsigned mode,
+                                 unsigned start,
+                                 unsigned count)
+{
+    struct r300_context* r300 = r300_context(pipe);
+
+    if (!u_trim_pipe_prim(mode, &count)) {
+        return FALSE;
+    }
+
+    if (count > 65535) {
+        return FALSE;
+    }
+
+    r300_update_derived_state(r300);
+
+    if (!r300_setup_vertex_buffers(r300)) {
+        return FALSE;
+    }
+
+    setup_vertex_attributes(r300);
+
+    setup_index_buffer(r300, indexBuffer, indexSize);
+
+    r300_emit_dirty_state(r300);
+
+    r300_emit_aos(r300, 0);
+
+    r300_emit_draw_elements(r300, indexBuffer, indexSize, minIndex, maxIndex,
+                            mode, start, count);
+
+    return TRUE;
+}
+
+/* Simple helpers for context setup. Should probably be moved to util. */
+boolean r300_draw_elements(struct pipe_context* pipe,
+                           struct pipe_buffer* indexBuffer,
+                           unsigned indexSize, unsigned mode,
+                           unsigned start, unsigned count)
+{
+    return pipe->draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0,
+                                     mode, start, count);
+}
+
+boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
+                         unsigned start, unsigned count)
+{
+    struct r300_context* r300 = r300_context(pipe);
+
+    if (!u_trim_pipe_prim(mode, &count)) {
+        return FALSE;
+    }
+
+    if (count > 65535) {
+        return FALSE;
+    }
+
+    r300_update_derived_state(r300);
+
+    if (!r300_setup_vertex_buffers(r300)) {
+        return FALSE;
+    }
+
+    setup_vertex_attributes(r300);
 
+    r300_emit_dirty_state(r300);
+
+    r300_emit_aos(r300, start);
+
+    r300_emit_draw_arrays(r300, mode, count);
+
+    return TRUE;
+}
+
+/****************************************************************************
+ * The rest of this file is for SW TCL rendering only. Please be polite and *
+ * keep these functions separated so that they are easier to locate. ~C.    *
+ ***************************************************************************/
+
+/* SW TCL arrays, using Draw. */
+boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
+                               unsigned mode,
+                               unsigned start,
+                               unsigned count)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    int i;
+
+    if (!u_trim_pipe_prim(mode, &count)) {
+        return FALSE;
+    }
+
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        void* buf = pipe_buffer_map(pipe->screen,
+                                    r300->vertex_buffer[i].buffer,
+                                    PIPE_BUFFER_USAGE_CPU_READ);
+        draw_set_mapped_vertex_buffer(r300->draw, i, buf);
+    }
+
+    draw_set_mapped_element_buffer(r300->draw, 0, NULL);
+
+    draw_set_mapped_constant_buffer(r300->draw,
+            r300->shader_constants[PIPE_SHADER_VERTEX].constants,
+            r300->shader_constants[PIPE_SHADER_VERTEX].count *
+                (sizeof(float) * 4));
+
+    draw_arrays(r300->draw, mode, start, count);
+
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        pipe_buffer_unmap(pipe->screen, r300->vertex_buffer[i].buffer);
+        draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
+    }
+
+    return TRUE;
+}
+
+/* SW TCL elements, using Draw. */
+boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
+                                       struct pipe_buffer* indexBuffer,
+                                       unsigned indexSize,
+                                       unsigned minIndex,
+                                       unsigned maxIndex,
+                                       unsigned mode,
+                                       unsigned start,
+                                       unsigned count)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    int i;
+
+    if (!u_trim_pipe_prim(mode, &count)) {
+        return FALSE;
+    }
+
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        void* buf = pipe_buffer_map(pipe->screen,
+                                    r300->vertex_buffer[i].buffer,
+                                    PIPE_BUFFER_USAGE_CPU_READ);
+        draw_set_mapped_vertex_buffer(r300->draw, i, buf);
+    }
+
+    void* indices = pipe_buffer_map(pipe->screen, indexBuffer,
+                                    PIPE_BUFFER_USAGE_CPU_READ);
+    draw_set_mapped_element_buffer_range(r300->draw, indexSize,
+                                         minIndex, maxIndex, indices);
+
+    draw_set_mapped_constant_buffer(r300->draw,
+            r300->shader_constants[PIPE_SHADER_VERTEX].constants,
+            r300->shader_constants[PIPE_SHADER_VERTEX].count *
+                (sizeof(float) * 4));
+
+    draw_arrays(r300->draw, mode, start, count);
+
+    for (i = 0; i < r300->vertex_buffer_count; i++) {
+        pipe_buffer_unmap(pipe->screen, r300->vertex_buffer[i].buffer);
+        draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
+    }
+
+    pipe_buffer_unmap(pipe->screen, indexBuffer);
+    draw_set_mapped_element_buffer_range(r300->draw, 0, start,
+                                         start + count - 1, NULL);
+
+    return TRUE;
+}
+
+/* Object for rendering using Draw. */
 struct r300_render {
     /* Parent class */
     struct vbuf_render base;
-    
+
     /* Pipe context */
     struct r300_context* r300;
 
@@ -45,7 +342,10 @@ struct r300_render {
 
     /* VBO */
     struct pipe_buffer* vbo;
-    size_t vbo_alloc_size;
+    size_t vbo_size;
+    size_t vbo_offset;
+    size_t vbo_max_used;
+    void * vbo_ptr;
 };
 
 static INLINE struct r300_render*
@@ -62,7 +362,7 @@ r300_render_get_vertex_info(struct vbuf_render* render)
 
     r300_update_derived_state(r300);
 
-    return &r300->vertex_info.vinfo;
+    return &r300->vertex_info->vinfo;
 }
 
 static boolean r300_render_allocate_vertices(struct vbuf_render* render,
@@ -74,19 +374,20 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
     struct pipe_screen* screen = r300->context.screen;
     size_t size = (size_t)vertex_size * (size_t)count;
 
-    if (r300render->vbo && (size > r300render->vbo_alloc_size)) {
-        pipe_buffer_reference(&r300render->vbo, NULL);
-    }
-    
-    if (!r300render->vbo) {
+    if (size + r300render->vbo_offset > r300render->vbo_size)
+    {
+        pipe_buffer_reference(&r300->vbo, NULL);
         r300render->vbo = pipe_buffer_create(screen,
                                              64,
                                              PIPE_BUFFER_USAGE_VERTEX,
-                                             size);
+                                             R300_MAX_VBO_SIZE);
+        r300render->vbo_offset = 0;
+        r300render->vbo_size = R300_MAX_VBO_SIZE;
     }
 
-    r300render->vbo_alloc_size = MAX2(size, r300render->vbo_alloc_size);
     r300render->vertex_size = vertex_size;
+    r300->vbo = r300render->vbo;
+    r300->vbo_offset = r300render->vbo_offset;
 
     return (r300render->vbo) ? TRUE : FALSE;
 }
@@ -96,8 +397,10 @@ static void* r300_render_map_vertices(struct vbuf_render* render)
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
 
-    return (unsigned char*)pipe_buffer_map(screen, r300render->vbo,
-                                           PIPE_BUFFER_USAGE_CPU_WRITE);
+    r300render->vbo_ptr = pipe_buffer_map(screen, r300render->vbo,
+                                          PIPE_BUFFER_USAGE_CPU_WRITE);
+
+    return (r300render->vbo_ptr + r300render->vbo_offset);
 }
 
 static void r300_render_unmap_vertices(struct vbuf_render* render,
@@ -106,7 +409,13 @@ static void r300_render_unmap_vertices(struct vbuf_render* render,
 {
     struct r300_render* r300render = r300_render(render);
     struct pipe_screen* screen = r300render->r300->context.screen;
+    CS_LOCALS(r300render->r300);
+    BEGIN_CS(2);
+    OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, max);
+    END_CS;
 
+    r300render->vbo_max_used = MAX2(r300render->vbo_max_used,
+                                    r300render->vertex_size * (max + 1));
     pipe_buffer_unmap(screen, r300render->vbo);
 }
 
@@ -114,65 +423,21 @@ static void r300_render_release_vertices(struct vbuf_render* render)
 {
     struct r300_render* r300render = r300_render(render);
 
-    pipe_buffer_reference(&r300render->vbo, NULL);
+    r300render->vbo_offset += r300render->vbo_max_used;
+    r300render->vbo_max_used = 0;
 }
 
 static boolean r300_render_set_primitive(struct vbuf_render* render,
                                                unsigned prim)
 {
     struct r300_render* r300render = r300_render(render);
-    r300render->prim = prim;
 
-    switch (prim) {
-        case PIPE_PRIM_POINTS:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_POINTS;
-            break;
-        case PIPE_PRIM_LINES:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_LINES;
-            break;
-        case PIPE_PRIM_LINE_LOOP:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_LINE_LOOP;
-            break;
-        case PIPE_PRIM_LINE_STRIP:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_LINE_STRIP;
-            break;
-        case PIPE_PRIM_TRIANGLES:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_TRIANGLES;
-            break;
-        case PIPE_PRIM_TRIANGLE_STRIP:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP;
-            break;
-        case PIPE_PRIM_TRIANGLE_FAN:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN;
-            break;
-        case PIPE_PRIM_QUADS:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_QUADS;
-            break;
-        case PIPE_PRIM_QUAD_STRIP:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_QUAD_STRIP;
-            break;
-        case PIPE_PRIM_POLYGON:
-            r300render->hwprim = R300_VAP_VF_CNTL__PRIM_POLYGON;
-            break;
-        default:
-            return FALSE;
-            break;
-    }
+    r300render->prim = prim;
+    r300render->hwprim = r300_translate_primitive(prim);
 
     return TRUE;
 }
 
-static void prepare_render(struct r300_render* render, unsigned count)
-{
-    struct r300_context* r300 = render->r300;
-
-    CS_LOCALS(r300);
-
-    r300->vbo = render->vbo;
-
-    r300_emit_dirty_state(r300);
-}
-
 static void r300_render_draw_arrays(struct vbuf_render* render,
                                           unsigned start,
                                           unsigned count)
@@ -182,9 +447,9 @@ static void r300_render_draw_arrays(struct vbuf_render* render,
 
     CS_LOCALS(r300);
 
-    prepare_render(r300render, count);
+    r300_emit_dirty_state(r300);
 
-    debug_printf("r300: Doing vbuf render, count %d\n", count);
+    DBG(r300, DBG_DRAW, "r300: Doing vbuf render, count %d\n", count);
 
     BEGIN_CS(2);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
@@ -199,39 +464,11 @@ static void r300_render_draw(struct vbuf_render* render,
 {
     struct r300_render* r300render = r300_render(render);
     struct r300_context* r300 = r300render->r300;
-    struct pipe_screen* screen = r300->context.screen;
-    struct pipe_buffer* index_buffer;
-    void* index_map;
     int i;
-    uint32_t index;
 
     CS_LOCALS(r300);
 
-    prepare_render(r300render, count);
-
-    /* Send our indices into an index buffer. */
-    index_buffer = pipe_buffer_create(screen, 64, PIPE_BUFFER_USAGE_VERTEX,
-                                      count * 2);
-    if (!index_buffer) {
-        return;
-    }
-
-/*
-    index_map = pipe_buffer_map(screen, index_buffer,
-                                PIPE_BUFFER_USAGE_CPU_WRITE);
-    memcpy(index_map, indices, count);
-    pipe_buffer_unmap(screen, index_buffer);
-
-    debug_printf("r300: Doing indexbuf render, count %d\n", count);
-
-    BEGIN_CS(8);
-    OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
-    OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
-           r300render->hwprim);
-    OUT_CS_PKT3(R300_PACKET3_INDX_BUFFER, 2);
-    OUT_CS(R300_INDX_BUFFER_ONE_REG_WR | (R300_VAP_PORT_IDX0 >> 2));
-    OUT_CS_INDEX_RELOC(index_buffer, 0, count, RADEON_GEM_DOMAIN_GTT, 0, 0);
-    END_CS; */
+    r300_emit_dirty_state(r300);
 
     BEGIN_CS(2 + (count+1)/2);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, (count+1)/2);
@@ -271,6 +508,10 @@ static struct vbuf_render* r300_render_create(struct r300_context* r300)
     r300render->base.release_vertices = r300_render_release_vertices;
     r300render->base.destroy = r300_render_destroy;
 
+    r300render->vbo = NULL;
+    r300render->vbo_size = 0;
+    r300render->vbo_offset = 0;
+
     return &r300render->base;
 }
 
diff --git a/src/gallium/drivers/r300/r300_render.h b/src/gallium/drivers/r300/r300_render.h
new file mode 100644
index 00000000000..da83069083d
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_render.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_RENDER_H
+#define R300_RENDER_H
+
+uint32_t r300_translate_primitive(unsigned prim);
+
+boolean r300_draw_range_elements(struct pipe_context* pipe,
+                                 struct pipe_buffer* indexBuffer,
+                                 unsigned indexSize,
+                                 unsigned minIndex,
+                                 unsigned maxIndex,
+                                 unsigned mode,
+                                 unsigned start,
+                                 unsigned count);
+
+boolean r300_draw_elements(struct pipe_context* pipe,
+                           struct pipe_buffer* indexBuffer,
+                           unsigned indexSize, unsigned mode,
+                           unsigned start, unsigned count);
+
+boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
+                         unsigned start, unsigned count);
+
+boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
+                               unsigned mode,
+                               unsigned start,
+                               unsigned count);
+
+boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
+                                       struct pipe_buffer* indexBuffer,
+                                       unsigned indexSize,
+                                       unsigned minIndex,
+                                       unsigned maxIndex,
+                                       unsigned mode,
+                                       unsigned start,
+                                       unsigned count);
+
+#endif /* R300_RENDER_H */
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 15740f61252..390b63007e5 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -20,7 +20,14 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_simple_screen.h"
+
+#include "r300_context.h"
 #include "r300_screen.h"
+#include "r300_texture.h"
+#include "r300_winsys.h"
 
 /* Return the identifier behind whom the brave coders responsible for this
  * amalgamation of code, sweat, and duct tape, routinely obscure their names.
@@ -73,14 +80,13 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
     struct r300_screen* r300screen = r300_screen(pscreen);
 
     switch (param) {
-        /* XXX cases marked "IN THEORY" are possible on the hardware,
-         * but haven't been implemented yet. */
         case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
             /* XXX I'm told this goes up to 16 */
             return 8;
         case PIPE_CAP_NPOT_TEXTURES:
-            /* IN THEORY */
-            return 0;
+            /* XXX enable now to get GL2.1 API,
+             * figure out later how to emulate this */
+            return 1;
         case PIPE_CAP_TWO_SIDED_STENCIL:
             if (r300screen->caps->is_r500) {
                 return 1;
@@ -88,53 +94,40 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
                 return 0;
             }
         case PIPE_CAP_GLSL:
-            if (r300screen->caps->is_r500) {
-                return 1;
-            } else {
-                return 0;
-            }
-        case PIPE_CAP_S3TC:
+            /* I'll be frank. This is a lie.
+             *
+             * We don't truly support GLSL on any of this driver's chipsets.
+             * To be fair, no chipset supports the full GLSL specification
+             * to the best of our knowledge, but some of the less esoteric
+             * features are still missing here.
+             *
+             * Rather than cripple ourselves intentionally, I'm going to set
+             * this flag, and as Gallium's interface continues to change, I
+             * hope that this single monolithic GLSL enable can slowly get
+             * split down into many different pieces and the state tracker
+             * will handle fallbacks transparently, like it should.
+             *
+             * ~ C.
+             */
             return 1;
         case PIPE_CAP_ANISOTROPIC_FILTER:
             return 1;
         case PIPE_CAP_POINT_SPRITE:
-            /* IN THEORY */
-            return 0;
+            return 1;
         case PIPE_CAP_MAX_RENDER_TARGETS:
             return 4;
         case PIPE_CAP_OCCLUSION_QUERY:
-            /* IN THEORY */
-            return 0;
+            return 1;
         case PIPE_CAP_TEXTURE_SHADOW_MAP:
-            /* IN THEORY */
-            return 0;
+            return 1;
         case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-            if (r300screen->caps->is_r500) {
-                /* 13 == 4096x4096 */
-                return 13;
-            } else {
-                /* 12 == 2048x2048 */
-                return 12;
-            }
         case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-            /* So, technically, the limit is the same as above, but some math
-             * shows why this is silly. Assuming RGBA, 4cpp, we can see that
-             * 4096*4096*4096 = 64.0 GiB exactly, so it's not exactly
-             * practical. However, if at some point a game really wants this,
-             * then we can remove or raise this limit. */
-            if (r300screen->caps->is_r500) {
-                /* 9 == 256x256x256 */
-                return 9;
-            } else {
-                /* 8 == 128*128*128 */
-                return 8;
-            }
         case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
             if (r300screen->caps->is_r500) {
-                /* 13 == 4096x4096 */
+                /* 13 == 4096 */
                 return 13;
             } else {
-                /* 12 == 2048x2048 */
+                /* 12 == 2048 */
                 return 12;
             }
         case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -142,10 +135,8 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
         case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
             return 1;
         case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
-            /* XXX guessing (what a terrible guess) */
-            return 2;
+            return 0;
         case PIPE_CAP_TGSI_CONT_SUPPORTED:
-            /* XXX */
             return 0;
         case PIPE_CAP_BLEND_EQUATION_SEPARATE:
             return 1;
@@ -183,19 +174,22 @@ static float r300_get_paramf(struct pipe_screen* pscreen, int param)
     }
 }
 
-static boolean check_tex_2d_format(enum pipe_format format, uint32_t usage,
-                                   boolean is_r500)
+static boolean check_tex_format(enum pipe_format format, uint32_t usage,
+                                boolean is_r500)
 {
+    uint32_t retval = 0;
+
     switch (format) {
         /* Supported formats. */
         /* Colorbuffer */
         case PIPE_FORMAT_A4R4G4B4_UNORM:
         case PIPE_FORMAT_R5G6B5_UNORM:
         case PIPE_FORMAT_A1R5G5B5_UNORM:
-            return usage &
+            retval = usage &
                 (PIPE_TEXTURE_USAGE_RENDER_TARGET |
                  PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
                  PIPE_TEXTURE_USAGE_PRIMARY);
+            break;
 
         /* Texture */
         case PIPE_FORMAT_A8R8G8B8_SRGB:
@@ -205,31 +199,37 @@ static boolean check_tex_2d_format(enum pipe_format format, uint32_t usage,
         case PIPE_FORMAT_DXT3_RGBA:
         case PIPE_FORMAT_DXT5_RGBA:
         case PIPE_FORMAT_YCBCR:
-            return usage & PIPE_TEXTURE_USAGE_SAMPLER;
+        case PIPE_FORMAT_L8_UNORM:
+        case PIPE_FORMAT_A8L8_UNORM:
+            retval = usage & PIPE_TEXTURE_USAGE_SAMPLER;
+            break;
 
         /* Colorbuffer or texture */
         case PIPE_FORMAT_A8R8G8B8_UNORM:
+        case PIPE_FORMAT_X8R8G8B8_UNORM:
         case PIPE_FORMAT_R8G8B8A8_UNORM:
+        case PIPE_FORMAT_R8G8B8X8_UNORM:
         case PIPE_FORMAT_I8_UNORM:
-            return usage &
+            retval = usage &
                 (PIPE_TEXTURE_USAGE_RENDER_TARGET |
                  PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
                  PIPE_TEXTURE_USAGE_PRIMARY |
                  PIPE_TEXTURE_USAGE_SAMPLER);
+            break;
 
-        /* Z buffer */
+        /* Z buffer or texture */
         case PIPE_FORMAT_Z16_UNORM:
-            return usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL;
-
+        case PIPE_FORMAT_Z24X8_UNORM:
         /* Z buffer with stencil or texture */
         case PIPE_FORMAT_Z24S8_UNORM:
-            return usage &
+            retval = usage &
                 (PIPE_TEXTURE_USAGE_DEPTH_STENCIL |
                  PIPE_TEXTURE_USAGE_SAMPLER);
+            break;
 
         /* Definitely unsupported formats. */
         /* Non-usable Z buffer/stencil formats. */
-        case PIPE_FORMAT_Z24X8_UNORM:
+        case PIPE_FORMAT_Z32_UNORM:
         case PIPE_FORMAT_S8Z24_UNORM:
         case PIPE_FORMAT_X8Z24_UNORM:
             debug_printf("r300: Note: Got unsupported format: %s in %s\n",
@@ -239,7 +239,6 @@ static boolean check_tex_2d_format(enum pipe_format format, uint32_t usage,
         /* XXX These don't even exist
         case PIPE_FORMAT_A32R32G32B32:
         case PIPE_FORMAT_A16R16G16B16: */
-        /* XXX Insert YUV422 packed VYUY and YVYU here */
         /* XXX What the deuce is UV88? (r3xx accel page 14)
             debug_printf("r300: Warning: Got unimplemented format: %s in %s\n",
                 pf_name(format), __FUNCTION__);
@@ -263,10 +262,15 @@ static boolean check_tex_2d_format(enum pipe_format format, uint32_t usage,
             break;
     }
 
-    return FALSE;
+    /* If usage was a mask that contained multiple bits, and not all of them
+     * are supported, this will catch that and return FALSE.
+     * e.g. usage = 2 | 4; retval = 4; (retval >= usage) == FALSE
+     *
+     * This also returns FALSE for any unknown formats.
+     */
+    return (retval >= usage);
 }
 
-/* XXX moar targets */
 static boolean r300_is_format_supported(struct pipe_screen* pscreen,
                                         enum pipe_format format,
                                         enum pipe_texture_target target,
@@ -274,15 +278,13 @@ static boolean r300_is_format_supported(struct pipe_screen* pscreen,
                                         unsigned geom_flags)
 {
     switch (target) {
+        case PIPE_TEXTURE_1D:   /* handle 1D textures as 2D ones */
         case PIPE_TEXTURE_2D:
-            return check_tex_2d_format(format, tex_usage,
-                r300_screen(pscreen)->caps->is_r500);
-        case PIPE_TEXTURE_1D:
         case PIPE_TEXTURE_3D:
         case PIPE_TEXTURE_CUBE:
-            debug_printf("r300: Implementation error: Unsupported format "
-                    "target: %d\n", target);
-            break;
+            return check_tex_format(format, tex_usage,
+                r300_screen(pscreen)->caps->is_r500);
+
         default:
             debug_printf("r300: Fatal: This is not a format target: %d\n",
                 target);
@@ -302,35 +304,29 @@ r300_get_tex_transfer(struct pipe_screen *screen,
 {
     struct r300_texture *tex = (struct r300_texture *)texture;
     struct r300_transfer *trans;
-    unsigned offset;  /* in bytes */
+    unsigned offset;
 
-    /* XXX Add support for these things */
-    if (texture->target == PIPE_TEXTURE_CUBE) {
-        debug_printf("PIPE_TEXTURE_CUBE is not yet supported.\n");
-        /* offset = tex->image_offset[level][face]; */
-    }
-    else if (texture->target == PIPE_TEXTURE_3D) {
-        debug_printf("PIPE_TEXTURE_3D is not yet supported.\n");
-        /* offset = tex->image_offset[level][zslice]; */
-    }
-    else {
-        offset = tex->offset[level];
-        assert(face == 0);
-        assert(zslice == 0);
-    }
+    offset = r300_texture_get_offset(tex, level, zslice, face);  /* in bytes */
 
     trans = CALLOC_STRUCT(r300_transfer);
     if (trans) {
         pipe_texture_reference(&trans->transfer.texture, texture);
         trans->transfer.format = texture->format;
+        trans->transfer.x = x;
+        trans->transfer.y = y;
         trans->transfer.width = w;
         trans->transfer.height = h;
         trans->transfer.block = texture->block;
         trans->transfer.nblocksx = texture->nblocksx[level];
         trans->transfer.nblocksy = texture->nblocksy[level];
-        trans->transfer.stride = align(pf_get_stride(&trans->transfer.block,
-                                                     texture->width[level]), 32);
+        trans->transfer.stride = r300_texture_get_stride(tex, level);
         trans->transfer.usage = usage;
+
+        /* XXX not sure whether it's required to set these two,
+               the driver doesn't use them */
+        trans->transfer.zslice = zslice;
+        trans->transfer.face = face;
+
         trans->offset = offset;
     }
     return &trans->transfer;
@@ -348,16 +344,9 @@ static void* r300_transfer_map(struct pipe_screen* screen,
 {
     struct r300_texture* tex = (struct r300_texture*)transfer->texture;
     char* map;
-    unsigned flags = 0;
 
-    if (transfer->usage != PIPE_TRANSFER_WRITE) {
-        flags |= PIPE_BUFFER_USAGE_CPU_READ;
-    }
-    if (transfer->usage != PIPE_TRANSFER_READ) {
-        flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
-    }
-    
-    map = pipe_buffer_map(screen, tex->buffer, flags);
+    map = pipe_buffer_map(screen, tex->buffer,
+                          pipe_transfer_buffer_flags(transfer));
 
     if (!map) {
         return NULL;
@@ -393,6 +382,7 @@ struct pipe_screen* r300_create_screen(struct r300_winsys* r300_winsys)
 
     caps->pci_id = r300_winsys->pci_id;
     caps->num_frag_pipes = r300_winsys->gb_pipes;
+    caps->num_z_pipes = r300_winsys->z_pipes;
 
     r300_parse_chipset(caps);
 
diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h
index 2a0e41fbc3b..41df31f670f 100644
--- a/src/gallium/drivers/r300/r300_screen.h
+++ b/src/gallium/drivers/r300/r300_screen.h
@@ -23,14 +23,9 @@
 #ifndef R300_SCREEN_H
 #define R300_SCREEN_H
 
-#include "pipe/p_inlines.h"
 #include "pipe/p_screen.h"
-#include "util/u_memory.h"
-#include "util/u_simple_screen.h"
 
 #include "r300_chipset.h"
-#include "r300_texture.h"
-#include "r300_winsys.h"
 
 struct r300_screen {
     /* Parent class */
diff --git a/src/gallium/drivers/r300/r300_shader_inlines.h b/src/gallium/drivers/r300/r300_shader_inlines.h
deleted file mode 100644
index a04f45b03e2..00000000000
--- a/src/gallium/drivers/r300/r300_shader_inlines.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
- *                Joakim Sindholt <opensource@zhasha.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#ifndef R300_SHADER_INLINES_H
-#define R300_SHADER_INLINES_H
-
-/* TGSI constants. TGSI is like XML: If it can't solve your problems, you're
- * not using enough of it. */
-static const struct tgsi_full_src_register r300_constant_zero = {
-    .SrcRegister.Extended = TRUE,
-    .SrcRegister.File = TGSI_FILE_NULL,
-    .SrcRegisterExtSwz.ExtSwizzleX = TGSI_EXTSWIZZLE_ZERO,
-    .SrcRegisterExtSwz.ExtSwizzleY = TGSI_EXTSWIZZLE_ZERO,
-    .SrcRegisterExtSwz.ExtSwizzleZ = TGSI_EXTSWIZZLE_ZERO,
-    .SrcRegisterExtSwz.ExtSwizzleW = TGSI_EXTSWIZZLE_ZERO,
-};
-
-static const struct tgsi_full_src_register r300_constant_one = {
-    .SrcRegister.Extended = TRUE,
-    .SrcRegister.File = TGSI_FILE_NULL,
-    .SrcRegisterExtSwz.ExtSwizzleX = TGSI_EXTSWIZZLE_ONE,
-    .SrcRegisterExtSwz.ExtSwizzleY = TGSI_EXTSWIZZLE_ONE,
-    .SrcRegisterExtSwz.ExtSwizzleZ = TGSI_EXTSWIZZLE_ONE,
-    .SrcRegisterExtSwz.ExtSwizzleW = TGSI_EXTSWIZZLE_ONE,
-};
-
-#endif /* R300_SHADER_INLINES_H */
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index c16cadd0407..d1eced61db1 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -20,16 +20,20 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "draw/draw_context.h"
+
 #include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_pack_color.h"
 
-#include "util/u_debug.h"
+#include "tgsi/tgsi_parse.h"
 
 #include "pipe/p_config.h"
 #include "pipe/internal/p_winsys_screen.h"
 
 #include "r300_context.h"
 #include "r300_reg.h"
+#include "r300_screen.h"
 #include "r300_state_inlines.h"
 #include "r300_fs.h"
 #include "r300_vs.h"
@@ -45,23 +49,49 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
 {
     struct r300_blend_state* blend = CALLOC_STRUCT(r300_blend_state);
 
-    if (state->blend_enable) {
-        /* XXX for now, always do separate alpha...
-         * is it faster to do it with one reg? */
+    if (state->blend_enable)
+    {
+        unsigned eqRGB = state->rgb_func;
+        unsigned srcRGB = state->rgb_src_factor;
+        unsigned dstRGB = state->rgb_dst_factor;
+
+        unsigned eqA = state->alpha_func;
+        unsigned srcA = state->alpha_src_factor;
+        unsigned dstA = state->alpha_dst_factor;
+
+        /* despite the name, ALPHA_BLEND_ENABLE has nothing to do with alpha,
+         * this is just the crappy D3D naming */
         blend->blend_control = R300_ALPHA_BLEND_ENABLE |
-                R300_SEPARATE_ALPHA_ENABLE |
-                R300_READ_ENABLE |
-                r300_translate_blend_function(state->rgb_func) |
-                (r300_translate_blend_factor(state->rgb_src_factor) <<
-                    R300_SRC_BLEND_SHIFT) |
-                (r300_translate_blend_factor(state->rgb_dst_factor) <<
-                    R300_DST_BLEND_SHIFT);
-        blend->alpha_blend_control =
-                r300_translate_blend_function(state->alpha_func) |
-                (r300_translate_blend_factor(state->alpha_src_factor) <<
-                    R300_SRC_BLEND_SHIFT) |
-                (r300_translate_blend_factor(state->alpha_dst_factor) <<
-                    R300_DST_BLEND_SHIFT);
+            r300_translate_blend_function(eqRGB) |
+            ( r300_translate_blend_factor(srcRGB) << R300_SRC_BLEND_SHIFT) |
+            ( r300_translate_blend_factor(dstRGB) << R300_DST_BLEND_SHIFT);
+
+        /* optimization: read the destination only if blending uses it */
+        if (eqRGB == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MIN ||
+            eqRGB == PIPE_BLEND_MAX || eqA == PIPE_BLEND_MAX ||
+            dstRGB != PIPE_BLENDFACTOR_ZERO ||
+            dstA != PIPE_BLENDFACTOR_ZERO ||
+            srcRGB == PIPE_BLENDFACTOR_DST_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_DST_ALPHA ||
+            srcRGB == PIPE_BLENDFACTOR_INV_DST_COLOR ||
+            srcRGB == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_DST_COLOR ||
+            srcA == PIPE_BLENDFACTOR_DST_ALPHA ||
+            srcA == PIPE_BLENDFACTOR_INV_DST_COLOR ||
+            srcA == PIPE_BLENDFACTOR_INV_DST_ALPHA)
+            blend->blend_control |= R300_READ_ENABLE;
+
+        /* XXX implement the optimization with DISCARD_SRC_PIXELS */
+        /* XXX implement the optimization with SRC_ALPHA_?_NO_READ */
+
+        /* separate alpha */
+        if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
+            blend->blend_control |= R300_SEPARATE_ALPHA_ENABLE;
+            blend->alpha_blend_control =
+                r300_translate_blend_function(eqA) |
+                (r300_translate_blend_factor(srcA) << R300_SRC_BLEND_SHIFT) |
+                (r300_translate_blend_factor(dstA) << R300_DST_BLEND_SHIFT);
+        }
     }
 
     /* PIPE_LOGICOP_* don't need to be translated, fortunately. */
@@ -70,6 +100,20 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
                 (state->logicop_func) << R300_RB3D_ROPCNTL_ROP_SHIFT;
     }
 
+    /* Color Channel Mask */
+    if (state->colormask & PIPE_MASK_R) {
+        blend->color_channel_mask |= RB3D_COLOR_CHANNEL_MASK_RED_MASK0;
+    }
+    if (state->colormask & PIPE_MASK_G) {
+        blend->color_channel_mask |= RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0;
+    }
+    if (state->colormask & PIPE_MASK_B) {
+        blend->color_channel_mask |= RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0;
+    }
+    if (state->colormask & PIPE_MASK_A) {
+        blend->color_channel_mask |= RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0;
+    }
+
     if (state->dither) {
         blend->dither = R300_RB3D_DITHER_CTL_DITHER_MODE_LUT |
                 R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_LUT;
@@ -95,25 +139,29 @@ static void r300_delete_blend_state(struct pipe_context* pipe,
     FREE(state);
 }
 
+/* Convert float to a 10-bit integer, clamping to [0, 1023] before the cast. */
+static unsigned float_to_fixed10(float f)
+{
+    return (unsigned)CLAMP(f * 1023.9f, 0.0f, 1023.0f);
+}
+
 /* Set blend color.
  * Setup both R300 and R500 registers, figure out later which one to write. */
 static void r300_set_blend_color(struct pipe_context* pipe,
                                  const struct pipe_blend_color* color)
 {
     struct r300_context* r300 = r300_context(pipe);
-    ubyte ur, ug, ub, ua;
-
-    ur = float_to_ubyte(color->color[0]);
-    ug = float_to_ubyte(color->color[1]);
-    ub = float_to_ubyte(color->color[2]);
-    ua = float_to_ubyte(color->color[3]);
 
     util_pack_color(color->color, PIPE_FORMAT_A8R8G8B8_UNORM,
             &r300->blend_color_state->blend_color);
 
-    /* XXX this is wrong */
-    r300->blend_color_state->blend_color_red_alpha = ur | (ua << 16);
-    r300->blend_color_state->blend_color_green_blue = ub | (ug << 16);
+    /* XXX if FP16 blending is enabled, we should use the FP16 format */
+    r300->blend_color_state->blend_color_red_alpha =
+        float_to_fixed10(color->color[0]) |
+        (float_to_fixed10(color->color[3]) << 16);
+    r300->blend_color_state->blend_color_green_blue =
+        float_to_fixed10(color->color[2]) |
+        (float_to_fixed10(color->color[1]) << 16);
 
     r300->dirty_state |= R300_NEW_BLEND_COLOR;
 }
@@ -132,31 +180,6 @@ static void r300_set_clip_state(struct pipe_context* pipe,
     }
 }
 
-static void
-    r300_set_constant_buffer(struct pipe_context* pipe,
-                             uint shader, uint index,
-                             const struct pipe_constant_buffer* buffer)
-{
-    struct r300_context* r300 = r300_context(pipe);
-
-    /* This entire chunk of code seems ever-so-slightly baked.
-     * It's as if I've got pipe_buffer* matryoshkas... */
-    if (buffer && buffer->buffer && buffer->buffer->size) {
-        void* map = pipe->winsys->buffer_map(pipe->winsys, buffer->buffer,
-                                             PIPE_BUFFER_USAGE_CPU_READ);
-        memcpy(r300->shader_constants[shader].constants,
-            map, buffer->buffer->size);
-        pipe->winsys->buffer_unmap(pipe->winsys, buffer->buffer);
-
-        r300->shader_constants[shader].count =
-            buffer->buffer->size / (sizeof(float) * 4);
-    } else {
-        r300->shader_constants[shader].count = 0;
-    }
-
-    r300->dirty_state |= R300_NEW_CONSTANTS;
-}
-
 /* Create a new depth, stencil, and alpha state based on the CSO dsa state.
  *
  * This contains the depth buffer, stencil buffer, alpha test, and such.
@@ -166,6 +189,8 @@ static void*
         r300_create_dsa_state(struct pipe_context* pipe,
                               const struct pipe_depth_stencil_alpha_state* state)
 {
+    struct r300_capabilities *caps =
+        r300_screen(r300_context(pipe)->context.screen)->caps;
     struct r300_dsa_state* dsa = CALLOC_STRUCT(r300_dsa_state);
 
     /* Depth test setup. */
@@ -210,9 +235,16 @@ static void*
             (r300_translate_stencil_op(state->stencil[1].zfail_op) <<
                 R300_S_BACK_ZFAIL_OP_SHIFT);
 
-            dsa->stencil_ref_bf = (state->stencil[1].ref_value) |
-                (state->stencil[1].valuemask << R300_STENCILMASK_SHIFT) |
-                (state->stencil[1].writemask << R300_STENCILWRITEMASK_SHIFT);
+            /* XXX it seems r3xx doesn't support STENCILREFMASK_BF */
+            if (caps->is_r500)
+            {
+                dsa->z_buffer_control |= R500_STENCIL_REFMASK_FRONT_BACK;
+                dsa->stencil_ref_bf = (state->stencil[1].ref_value) |
+                    (state->stencil[1].valuemask <<
+                    R300_STENCILMASK_SHIFT) |
+                    (state->stencil[1].writemask <<
+                    R300_STENCILWRITEMASK_SHIFT);
+            }
         }
     }
 
@@ -221,11 +253,13 @@ static void*
         dsa->alpha_function =
             r300_translate_alpha_function(state->alpha.func) |
             R300_FG_ALPHA_FUNC_ENABLE;
-        dsa->alpha_reference = CLAMP(state->alpha.ref_value * 1023.0f,
-                                     0, 1023);
-    } else {
-        /* XXX need to fix this to be dynamically set
-        dsa->z_buffer_top = R300_ZTOP_ENABLE; */
+
+        /* XXX figure out why emitting 10bit alpha ref causes CS to dump */
+        /* always use 8bit alpha ref */
+        dsa->alpha_function |= float_to_ubyte(state->alpha.ref_value);
+
+        if (caps->is_r500)
+            dsa->alpha_function |= R500_FG_ALPHA_FUNC_8BIT;
     }
 
     return (void*)dsa;
@@ -261,7 +295,9 @@ static void
 {
     struct r300_context* r300 = r300_context(pipe);
 
-    draw_flush(r300->draw);
+    if (r300->draw) {
+        draw_flush(r300->draw);
+    }
 
     r300->framebuffer_state = *state;
 
@@ -300,7 +336,7 @@ static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
 
     r300->fs = fs;
 
-    r300->dirty_state |= R300_NEW_FRAGMENT_SHADER;
+    r300->dirty_state |= R300_NEW_FRAGMENT_SHADER | R300_NEW_FRAGMENT_SHADER_CONSTANTS;
 }
 
 /* Delete fragment shader state. */
@@ -308,7 +344,7 @@ static void r300_delete_fs_state(struct pipe_context* pipe, void* shader)
 {
     struct r300_fragment_shader* fs = (struct r300_fragment_shader*)shader;
     rc_constants_destroy(&fs->code.constants);
-    FREE(fs->state.tokens);
+    FREE((void*)fs->state.tokens);
     FREE(shader);
 }
 
@@ -362,25 +398,52 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
     rs->line_control = pack_float_16_6x(state->line_width) |
         R300_GA_LINE_CNTL_END_TYPE_COMP;
 
+    /* XXX I think there is something wrong with the polygon mode,
+     * XXX re-test when r300g is in a better shape */
+
+    /* Enable polygon mode */
+    if (state->fill_cw != PIPE_POLYGON_MODE_FILL ||
+        state->fill_ccw != PIPE_POLYGON_MODE_FILL) {
+        rs->polygon_mode = R300_GA_POLY_MODE_DUAL;
+    }
+
     /* Radeons don't think in "CW/CCW", they think in "front/back". */
     if (state->front_winding == PIPE_WINDING_CW) {
         rs->cull_mode = R300_FRONT_FACE_CW;
 
+        /* Polygon offset */
         if (state->offset_cw) {
             rs->polygon_offset_enable |= R300_FRONT_ENABLE;
         }
         if (state->offset_ccw) {
             rs->polygon_offset_enable |= R300_BACK_ENABLE;
         }
+
+        /* Polygon mode */
+        if (rs->polygon_mode) {
+            rs->polygon_mode |=
+                r300_translate_polygon_mode_front(state->fill_cw);
+            rs->polygon_mode |=
+                r300_translate_polygon_mode_back(state->fill_ccw);
+        }
     } else {
         rs->cull_mode = R300_FRONT_FACE_CCW;
 
+        /* Polygon offset */
         if (state->offset_ccw) {
             rs->polygon_offset_enable |= R300_FRONT_ENABLE;
         }
         if (state->offset_cw) {
             rs->polygon_offset_enable |= R300_BACK_ENABLE;
         }
+
+        /* Polygon mode */
+        if (rs->polygon_mode) {
+            rs->polygon_mode |=
+                r300_translate_polygon_mode_front(state->fill_ccw);
+            rs->polygon_mode |=
+                r300_translate_polygon_mode_back(state->fill_cw);
+        }
     }
     if (state->front_winding & state->cull_mode) {
         rs->cull_mode |= R300_CULL_FRONT;
@@ -424,11 +487,17 @@ static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
     struct r300_context* r300 = r300_context(pipe);
     struct r300_rs_state* rs = (struct r300_rs_state*)state;
 
-    draw_flush(r300->draw);
-    draw_set_rasterizer_state(r300->draw, &rs->rs);
+    if (r300->draw) {
+        draw_flush(r300->draw);
+        draw_set_rasterizer_state(r300->draw, &rs->rs);
+    }
 
     r300->rs_state = rs;
+    /* XXX Clean these up when we move to atom emits */
     r300->dirty_state |= R300_NEW_RASTERIZER;
+    r300->dirty_state |= R300_NEW_RS_BLOCK;
+    r300->dirty_state |= R300_NEW_SCISSOR;
+    r300->dirty_state |= R300_NEW_VIEWPORT;
 }
 
 /* Free rasterizer state. */
@@ -508,6 +577,8 @@ static void r300_set_sampler_textures(struct pipe_context* pipe,
     if (count > 8) {
         return;
     }
+    
+    r300->context.flush(&r300->context, 0, NULL);
 
     for (i = 0; i < count; i++) {
         if (r300->textures[i] != (struct r300_texture*)texture[i]) {
@@ -562,17 +633,14 @@ static void r300_set_viewport_state(struct pipe_context* pipe,
     r300->viewport_state->vte_control = R300_VTX_W0_FMT;
 
     if (state->scale[0] != 1.0f) {
-        assert(state->scale[0] != 0.0f);
         r300->viewport_state->xscale = state->scale[0];
         r300->viewport_state->vte_control |= R300_VPORT_X_SCALE_ENA;
     }
     if (state->scale[1] != 1.0f) {
-        assert(state->scale[1] != 0.0f);
         r300->viewport_state->yscale = state->scale[1];
         r300->viewport_state->vte_control |= R300_VPORT_Y_SCALE_ENA;
     }
     if (state->scale[2] != 1.0f) {
-        assert(state->scale[2] != 0.0f);
         r300->viewport_state->zscale = state->scale[2];
         r300->viewport_state->vte_control |= R300_VPORT_Z_SCALE_ENA;
     }
@@ -598,13 +666,14 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
 
-    memcpy(r300->vertex_buffers, buffers,
+    memcpy(r300->vertex_buffer, buffers,
         sizeof(struct pipe_vertex_buffer) * count);
-
     r300->vertex_buffer_count = count;
 
-    draw_flush(r300->draw);
-    draw_set_vertex_buffers(r300->draw, count, buffers);
+    if (r300->draw) {
+        draw_flush(r300->draw);
+        draw_set_vertex_buffers(r300->draw, count, buffers);
+    }
 }
 
 static void r300_set_vertex_elements(struct pipe_context* pipe,
@@ -613,8 +682,15 @@ static void r300_set_vertex_elements(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
 
-    draw_flush(r300->draw);
-    draw_set_vertex_elements(r300->draw, count, elements);
+    memcpy(r300->vertex_element,
+           elements,
+           sizeof(struct pipe_vertex_element) * count);
+    r300->vertex_element_count = count;
+
+    if (r300->draw) {
+        draw_flush(r300->draw);
+        draw_set_vertex_elements(r300->draw, count, elements);
+    }
 }
 
 static void* r300_create_vs_state(struct pipe_context* pipe,
@@ -657,7 +733,7 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
 
         draw_bind_vertex_shader(r300->draw, vs->draw);
         r300->vs = vs;
-        r300->dirty_state |= R300_NEW_VERTEX_SHADER;
+        r300->dirty_state |= R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS;
     } else {
         draw_bind_vertex_shader(r300->draw,
                 (struct draw_vertex_shader*)shader);
@@ -673,7 +749,7 @@ static void r300_delete_vs_state(struct pipe_context* pipe, void* shader)
 
         rc_constants_destroy(&vs->code.constants);
         draw_delete_vertex_shader(r300->draw, vs->draw);
-        FREE(vs->state.tokens);
+        FREE((void*)vs->state.tokens);
         FREE(shader);
     } else {
         draw_delete_vertex_shader(r300->draw,
@@ -681,6 +757,31 @@ static void r300_delete_vs_state(struct pipe_context* pipe, void* shader)
     }
 }
 
+/* Copy buf into the CPU-side constants of `shader` and mark them dirty. */
+static void r300_set_constant_buffer(struct pipe_context *pipe,
+                                     uint shader, uint index,
+                                     const struct pipe_constant_buffer *buf)
+{
+    struct r300_context* r300 = r300_context(pipe);
+    void *mapped;
+    /* Nothing to copy (NULL, empty or unmappable buffer): drop the constants. */
+    if (buf == NULL || buf->buffer == NULL || buf->buffer->size == 0 ||
+        (mapped = pipe_buffer_map(pipe->screen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)) == NULL)
+    {
+        r300->shader_constants[shader].count = 0;
+        return;
+    }
+    assert((buf->buffer->size % (4 * sizeof(float))) == 0); /* whole vec4s */
+    memcpy(r300->shader_constants[shader].constants, mapped, buf->buffer->size);
+    r300->shader_constants[shader].count = buf->buffer->size / (4 * sizeof(float));
+    pipe_buffer_unmap(pipe->screen, buf->buffer);
+    /* Flag only the constants of the shader stage that was updated. */
+    if (shader == PIPE_SHADER_VERTEX)
+        r300->dirty_state |= R300_NEW_VERTEX_SHADER_CONSTANTS;
+    else if (shader == PIPE_SHADER_FRAGMENT)
+        r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+}
+
 void r300_init_state_functions(struct r300_context* r300)
 {
     r300->context.create_blend_state = r300_create_blend_state;
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index c01e61a9b19..7166694edf4 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -20,17 +20,50 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
-#include "r300_state_derived.h"
+#include "draw/draw_context.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
+#include "r300_context.h"
 #include "r300_fs.h"
+#include "r300_screen.h"
+#include "r300_state_derived.h"
+#include "r300_state_inlines.h"
 #include "r300_vs.h"
 
 /* r300_state_derived: Various bits of state which are dependent upon
  * currently bound CSO data. */
 
+struct r300_shader_key {
+    struct r300_vertex_shader* vs;   /* hashed and compared by pointer value */
+    struct r300_fragment_shader* fs; /* hashed and compared by pointer value */
+};
+
+struct r300_shader_derived_value { /* presumably stored per r300_shader_key - TODO confirm */
+    struct r300_vertex_format* vformat; /* NOTE(review): r300_vs_tab_routes() takes r300_vertex_info - confirm type */
+    struct r300_rs_block* rs_block;
+};
+
+/* Hash a shader key: vs pointer bits shifted up by 16, low 16 fs bits below. */
+unsigned r300_shader_key_hash(void* key) {
+    const struct r300_shader_key* shader_key = key;
+    unsigned vs = (unsigned)(uintptr_t)shader_key->vs; /* explicit narrowing */
+    unsigned fs = (unsigned)(uintptr_t)shader_key->fs;
+    return (vs << 16) | (fs & 0xffff);
+}
+
+/* Returns non-zero iff both keys hold the same vs and the same fs pointer.
+ * NOTE(review): memcmp-style comparators return 0 on a match; confirm caller. */
+int r300_shader_key_compare(void* key1, void* key2) {
+    const struct r300_shader_key* a = key1;
+    const struct r300_shader_key* b = key2;
+    return a->vs == b->vs && a->fs == b->fs;
+}
+
 /* Set up the vs_tab and routes. */
 static void r300_vs_tab_routes(struct r300_context* r300,
-                               struct r300_vertex_format* vformat)
+                               struct r300_vertex_info* vformat)
 {
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     struct vertex_info* vinfo = &vformat->vinfo;
@@ -52,7 +85,7 @@ static void r300_vs_tab_routes(struct r300_context* r300,
     if (!r300screen->caps->has_tcl || !r300->rs_state->enable_vte)
     {
         for (i = 0; i < info->num_inputs; i++) {
-            switch (info->input_semantic_name[i]) {
+            switch (r300->vs->code.inputs[i]) {
                 case TGSI_SEMANTIC_POSITION:
                     pos = TRUE;
                     tab[i] = 0;
@@ -62,10 +95,12 @@ static void r300_vs_tab_routes(struct r300_context* r300,
                     cols++;
                     break;
                 case TGSI_SEMANTIC_PSIZE:
+                    assert(psize == FALSE);
                     psize = TRUE;
                     tab[i] = 15;
                     break;
                 case TGSI_SEMANTIC_FOG:
+                    assert(fog == FALSE);
                     fog = TRUE;
                     /* Fall through */
                 case TGSI_SEMANTIC_GENERIC:
@@ -124,7 +159,9 @@ static void r300_vs_tab_routes(struct r300_context* r300,
 
     vinfo->hwfmt[0] = 0x5555; /* XXX this is classic Mesa bonghits */
 
-    if (!pos) {
+    /* We need to add vertex position attribute only for SW TCL case,
+     * for HW TCL case it could be generated by vertex shader */
+    if (!pos && !r300screen->caps->has_tcl) {
         debug_printf("r300: Forcing vertex position attribute emit...\n");
         /* Make room for the position attribute
          * at the beginning of the tab. */
@@ -133,20 +170,30 @@ static void r300_vs_tab_routes(struct r300_context* r300,
         }
         tab[0] = 0;
     }
-    draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
-        draw_find_vs_output(r300->draw, TGSI_SEMANTIC_POSITION, 0));
+
+    /* Position. */
+    if (r300->draw) {
+        draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
+            draw_find_vs_output(r300->draw, TGSI_SEMANTIC_POSITION, 0));
+    }
     vinfo->hwfmt[1] |= R300_INPUT_CNTL_POS;
     vinfo->hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
 
+    /* Point size. */
     if (psize) {
-        draw_emit_vertex_attr(vinfo, EMIT_1F_PSIZE, INTERP_POS,
-            draw_find_vs_output(r300->draw, TGSI_SEMANTIC_PSIZE, 0));
+        if (r300->draw) {
+            draw_emit_vertex_attr(vinfo, EMIT_1F_PSIZE, INTERP_POS,
+                draw_find_vs_output(r300->draw, TGSI_SEMANTIC_PSIZE, 0));
+        }
         vinfo->hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
     }
 
+    /* Colors. */
     for (i = 0; i < cols; i++) {
-        draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR,
-            draw_find_vs_output(r300->draw, TGSI_SEMANTIC_COLOR, i));
+        if (r300->draw) {
+            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR,
+                draw_find_vs_output(r300->draw, TGSI_SEMANTIC_COLOR, i));
+        }
         vinfo->hwfmt[1] |= R300_INPUT_CNTL_COLOR;
         vinfo->hwfmt[2] |= (R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << i);
     }
@@ -155,53 +202,53 @@ static void r300_vs_tab_routes(struct r300_context* r300,
      * This gets around a double-increment problem. */
     i = 0;
 
+    /* Fog. This is a special-cased texcoord. */
     if (fog) {
         i++;
-        draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
-            draw_find_vs_output(r300->draw, TGSI_SEMANTIC_FOG, 0));
+        if (r300->draw) {
+            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
+                draw_find_vs_output(r300->draw, TGSI_SEMANTIC_FOG, 0));
+        }
         vinfo->hwfmt[1] |= (R300_INPUT_CNTL_TC0 << i);
         vinfo->hwfmt[3] |= (4 << (3 * i));
     }
 
-    for (i; i < texs; i++) {
-        draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
-            draw_find_vs_output(r300->draw, TGSI_SEMANTIC_GENERIC, i));
+    /* Texcoords. */
+    for (; i < texs; i++) {
+        if (r300->draw) {
+            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
+                draw_find_vs_output(r300->draw, TGSI_SEMANTIC_GENERIC, i));
+        }
         vinfo->hwfmt[1] |= (R300_INPUT_CNTL_TC0 << i);
         vinfo->hwfmt[3] |= (4 << (3 * i));
     }
 
-    /* Handle the case where the vertex shader will be generating some of
-     * the attribs based on its inputs. */
-    if (r300screen->caps->has_tcl &&
-            info->num_inputs < info->num_outputs) {
-        vinfo->num_attribs = info->num_inputs;
-    }
-
     draw_compute_vertex_size(vinfo);
 }
 
 /* Update the PSC tables. */
 static void r300_vertex_psc(struct r300_context* r300,
-                            struct r300_vertex_format* vformat)
+                            struct r300_vertex_info* vformat)
 {
     struct r300_screen* r300screen = r300_screen(r300->context.screen);
     struct vertex_info* vinfo = &vformat->vinfo;
     int* tab = vformat->vs_tab;
-    uint32_t temp;
-    int i, attrib_count;
+    uint16_t type, swizzle;
+    enum pipe_format format;
+    unsigned i, attrib_count;
 
     /* Vertex shaders have no semantics on their inputs,
      * so PSC should just route stuff based on their info,
      * and not on attrib information. */
     if (r300screen->caps->has_tcl) {
         attrib_count = r300->vs->info.num_inputs;
-        debug_printf("r300: routing %d attribs in psc for vs\n",
+        DBG(r300, DBG_DRAW, "r300: routing %d attribs in psc for vs\n",
                 attrib_count);
     } else {
         attrib_count = vinfo->num_attribs;
-        debug_printf("r300: attrib count: %d\n", attrib_count);
+        DBG(r300, DBG_DRAW, "r300: attrib count: %d\n", attrib_count);
         for (i = 0; i < attrib_count; i++) {
-            debug_printf("r300: attrib: offset %d, interp %d, size %d,"
+            DBG(r300, DBG_DRAW, "r300: attrib: offset %d, interp %d, size %d,"
                    " tab %d\n", vinfo->attrib[i].src_index,
                    vinfo->attrib[i].interp_mode, vinfo->attrib[i].emit,
                    tab[i]);
@@ -209,64 +256,43 @@ static void r300_vertex_psc(struct r300_context* r300,
     }
 
     for (i = 0; i < attrib_count; i++) {
-        /* Make sure we have a proper destination for our attribute */
+        /* Make sure we have a proper destination for our attribute. */
         assert(tab[i] != -1);
 
-        /* Add the attribute to the PSC table. */
-        temp = r300screen->caps->has_tcl ?
-            R300_DATA_TYPE_FLOAT_4 :
-            translate_vertex_data_type(vinfo->attrib[i].emit);
-        temp |= tab[i] << R300_DST_VEC_LOC_SHIFT;
+        format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
+
+        /* Obtain the type of data in this attribute. */
+        type = r300_translate_vertex_data_type(format) |
+            tab[i] << R300_DST_VEC_LOC_SHIFT;
+
+        /* Obtain the swizzle for this attribute. Note that the default
+         * swizzle in the hardware is not XYZW! */
+        swizzle = r300_translate_vertex_data_swizzle(format);
 
+        /* Add the attribute to the PSC table. */
         if (i & 1) {
-            vformat->vap_prog_stream_cntl[i >> 1] &= 0x0000ffff;
-            vformat->vap_prog_stream_cntl[i >> 1] |= temp << 16;
+            vformat->vap_prog_stream_cntl[i >> 1] |= type << 16;
 
-            vformat->vap_prog_stream_cntl_ext[i >> 1] |=
-                (R300_VAP_SWIZZLE_XYZW << 16);
+            vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 16;
         } else {
-            vformat->vap_prog_stream_cntl[i >> 1] &= 0xffff0000;
-            vformat->vap_prog_stream_cntl[i >> 1] |= temp <<  0;
+            vformat->vap_prog_stream_cntl[i >> 1] |= type <<  0;
 
-            vformat->vap_prog_stream_cntl_ext[i >> 1] |=
-                (R300_VAP_SWIZZLE_XYZW <<  0);
+            vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 0;
         }
     }
 
     /* Set the last vector in the PSC. */
-    i--;
+    if (i) {
+        i -= 1;
+    }
     vformat->vap_prog_stream_cntl[i >> 1] |=
         (R300_LAST_VEC << (i & 1 ? 16 : 0));
 }
 
-/* Update the vertex format. */
-static void r300_update_vertex_format(struct r300_context* r300)
-{
-    struct r300_vertex_format vformat;
-    int i;
-
-    memset(&vformat, 0, sizeof(struct r300_vertex_format));
-    for (i = 0; i < 16; i++) {
-        vformat.vs_tab[i] = -1;
-        vformat.fs_tab[i] = -1;
-    }
-
-    r300_vs_tab_routes(r300, &vformat);
-
-    r300_vertex_psc(r300, &vformat);
-
-    if (memcmp(&r300->vertex_info, &vformat,
-                sizeof(struct r300_vertex_format))) {
-        memcpy(&r300->vertex_info, &vformat,
-                sizeof(struct r300_vertex_format));
-        r300->dirty_state |= R300_NEW_VERTEX_FORMAT;
-    }
-}
-
 /* Set up the mappings from GB to US, for RS block. */
-static void r300_update_fs_tab(struct r300_context* r300)
+static void r300_update_fs_tab(struct r300_context* r300,
+                               struct r300_vertex_info* vformat)
 {
-    struct r300_vertex_format* vformat = &r300->vertex_info;
     struct tgsi_shader_info* info = &r300->fs->info;
     int i, cols = 0, texs = 0, cols_emitted = 0;
     int* tab = vformat->fs_tab;
@@ -299,18 +325,18 @@ static void r300_update_fs_tab(struct r300_context* r300)
     }
 
     /* Now that we know where everything is... */
-    debug_printf("r300: fp input count: %d\n", info->num_inputs);
+    DBG(r300, DBG_DRAW, "r300: fp input count: %d\n", info->num_inputs);
     for (i = 0; i < info->num_inputs; i++) {
         switch (tab[i]) {
             case INTERP_LINEAR:
-                debug_printf("r300: attrib: "
+                DBG(r300, DBG_DRAW, "r300: attrib: "
                         "stack offset %d, color,    tab %d\n",
                         i, cols_emitted);
                 tab[i] = cols_emitted;
                 cols_emitted++;
                 break;
             case INTERP_PERSPECTIVE:
-                debug_printf("r300: attrib: "
+                DBG(r300, DBG_DRAW, "r300: attrib: "
                         "stack offset %d, texcoord, tab %d\n",
                         i, cols + texs);
                 tab[i] = cols + texs;
@@ -328,53 +354,39 @@ static void r300_update_fs_tab(struct r300_context* r300)
 /* Set up the RS block. This is the part of the chipset that actually does
  * the rasterization of vertices into fragments. This is also the part of the
  * chipset that locks up if any part of it is even slightly wrong. */
-static void r300_update_rs_block(struct r300_context* r300)
+static void r300_update_rs_block(struct r300_context* r300,
+                                 struct r300_rs_block* rs)
 {
-    struct r300_rs_block* rs = r300->rs_block;
     struct tgsi_shader_info* info = &r300->fs->info;
-    int* tab = r300->vertex_info.fs_tab;
-    int col_count = 0, fp_offset = 0, i, memory_pos, tex_count = 0;
-
-    memset(rs, 0, sizeof(struct r300_rs_block));
+    int col_count = 0, fp_offset = 0, i, tex_count = 0;
+    int rs_tex_comp = 0;
 
     if (r300_screen(r300->context.screen)->caps->is_r500) {
         for (i = 0; i < info->num_inputs; i++) {
-            assert(tab[i] != -1);
-            memory_pos = tab[i] * 4;
             switch (info->input_semantic_name[i]) {
                 case TGSI_SEMANTIC_COLOR:
                     rs->ip[col_count] |=
-                        R500_RS_COL_PTR(memory_pos) |
+                        R500_RS_COL_PTR(col_count) |
                         R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
                     col_count++;
                     break;
                 case TGSI_SEMANTIC_GENERIC:
                     rs->ip[tex_count] |=
-                        R500_RS_SEL_S(memory_pos) |
-                        R500_RS_SEL_T(memory_pos + 1) |
-                        R500_RS_SEL_R(memory_pos + 2) |
-                        R500_RS_SEL_Q(memory_pos + 3);
+                        R500_RS_SEL_S(rs_tex_comp) |
+                        R500_RS_SEL_T(rs_tex_comp + 1) |
+                        R500_RS_SEL_R(rs_tex_comp + 2) |
+                        R500_RS_SEL_Q(rs_tex_comp + 3);
                     tex_count++;
+                    rs_tex_comp += 4;
                     break;
                 default:
                     break;
             }
         }
 
-        if (col_count == 0) {
-            rs->ip[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
-        }
-
-        if (tex_count == 0) {
-            rs->ip[0] |=
-                R500_RS_SEL_S(R500_RS_IP_PTR_K0) |
-                R500_RS_SEL_T(R500_RS_IP_PTR_K0) |
-                R500_RS_SEL_R(R500_RS_IP_PTR_K0) |
-                R500_RS_SEL_Q(R500_RS_IP_PTR_K1);
-        }
-
         /* Rasterize at least one color, or bad things happen. */
         if ((col_count == 0) && (tex_count == 0)) {
+            rs->ip[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
             col_count++;
         }
 
@@ -391,23 +403,22 @@ static void r300_update_rs_block(struct r300_context* r300)
         }
     } else {
         for (i = 0; i < info->num_inputs; i++) {
-            assert(tab[i] != -1);
-            memory_pos = tab[i] * 4;
             switch (info->input_semantic_name[i]) {
                 case TGSI_SEMANTIC_COLOR:
                     rs->ip[col_count] |=
-                        R300_RS_COL_PTR(memory_pos) |
+                        R300_RS_COL_PTR(col_count) |
                         R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
                     col_count++;
                     break;
                 case TGSI_SEMANTIC_GENERIC:
                     rs->ip[tex_count] |=
-                        R300_RS_TEX_PTR(memory_pos) |
+                        R300_RS_TEX_PTR(rs_tex_comp) |
                         R300_RS_SEL_S(R300_RS_SEL_C0) |
                         R300_RS_SEL_T(R300_RS_SEL_C1) |
                         R300_RS_SEL_R(R300_RS_SEL_C2) |
                         R300_RS_SEL_Q(R300_RS_SEL_C3);
                     tex_count++;
+                    rs_tex_comp+=4;
                     break;
                 default:
                     break;
@@ -444,21 +455,112 @@ static void r300_update_rs_block(struct r300_context* r300)
         }
     }
 
-    rs->count = (tex_count * 4) | (col_count << R300_IC_COUNT_SHIFT) |
+    rs->count = (rs_tex_comp) | (col_count << R300_IC_COUNT_SHIFT) |
         R300_HIRES_EN;
 
     rs->inst_count = MAX2(MAX2(col_count - 1, tex_count - 1), 0);
 }
 
+/* Rebuild the state derived from the bound vertex/fragment shader pair:
+ * the vertex format (VS tab, PSC, FS tab) and the RS block.
+ *
+ * Fresh objects are allocated on every call and replace r300->vertex_info
+ * and r300->rs_block, which are freed. Both results are flagged dirty.
+ *
+ * If either allocation fails, the previously derived state is left in
+ * place and nothing is marked dirty, instead of dereferencing NULL.
+ *
+ * XXX This will be refactored ASAP. The intent is to cache the derived
+ * values per (vs, fs) pair in r300->shader_hash_table, keyed by a
+ * struct r300_shader_key and holding a struct r300_shader_derived_value,
+ * so that r300_update_rs_block() only runs on a cache miss. */
+static void r300_update_derived_shader_state(struct r300_context* r300)
+{
+    struct r300_vertex_info* vformat;
+    struct r300_rs_block* rs_block;
+    int i;
+
+    /* Build into fresh zero-initialized objects; the live state is only
+     * touched once both allocations have succeeded. */
+    vformat = CALLOC_STRUCT(r300_vertex_info);
+    rs_block = CALLOC_STRUCT(r300_rs_block);
+    if (!vformat || !rs_block) {
+        debug_printf("r300: Out of memory in %s\n", __FUNCTION__);
+        FREE(vformat);
+        FREE(rs_block);
+        return;
+    }
+
+    /* -1 marks a tab slot as unassigned; r300_vertex_psc() asserts that
+     * every slot it consumes has been given a destination. */
+    for (i = 0; i < 16; i++) {
+        vformat->vs_tab[i] = -1;
+        vformat->fs_tab[i] = -1;
+    }
+
+    /* Vertex format: VS routing, then the PSC built from it, then the
+     * fragment shader input mapping. */
+    r300_vs_tab_routes(r300, vformat);
+    r300_vertex_psc(r300, vformat);
+    r300_update_fs_tab(r300, vformat);
+
+    /* r300_update_rs_block() does not clear its output itself; it relies
+     * on the zeroed allocation above. */
+    r300_update_rs_block(r300, rs_block);
+
+    /* Swap in the new objects and release the old ones. */
+    FREE(r300->vertex_info);
+    FREE(r300->rs_block);
+
+    r300->vertex_info = vformat;
+    r300->rs_block = rs_block;
+    /* Flag both rebuilt pieces as dirty. */
+    r300->dirty_state |= (R300_NEW_VERTEX_FORMAT | R300_NEW_RS_BLOCK);
+}
+
+static void r300_update_ztop(struct r300_context* r300)
+{
+    /* Recompute ztop_state.z_buffer_top from the DSA state, the fragment
+     * shader and the current query.
+     *
+     * According to the docs, ZTOP must be disabled when any of these hold:
+     * 1) Alpha testing enabled
+     * 2) Texture kill instructions in fragment shader
+     * 3) Chroma key culling enabled
+     * 4) W-buffering enabled
+     *
+     * The docs claim ZTOP can still be used in the first three cases if no
+     * ZS writes happen; that relaxation is not applied here.
+     *
+     * Additionally, the following conditions require disabled ZTOP:
+     * ~) Depth writes in fragment shader
+     * ~) Outstanding occlusion queries
+     *
+     * Only alpha function, kill, shader depth writes and a current query
+     * are tested below; chroma key and W-buffering are not. */
+    int ztop_blocked =
+        r300->dsa_state->alpha_function ||
+        r300->fs->info.uses_kill ||
+        r300_fragment_shader_writes_depth(r300->fs) ||
+        r300->query_current;
+
+    if (ztop_blocked) {
+        r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
+    } else {
+        r300->ztop_state.z_buffer_top = R300_ZTOP_ENABLE;
+    }
+}
+
 void r300_update_derived_state(struct r300_context* r300)
 {
-    if (r300->dirty_state &
-            (R300_NEW_FRAGMENT_SHADER | R300_NEW_VERTEX_SHADER)) {
-        r300_update_vertex_format(r300);
+    /* XXX */
+    if (TRUE || r300->dirty_state &
+        (R300_NEW_FRAGMENT_SHADER | R300_NEW_VERTEX_SHADER)) {
+        r300_update_derived_shader_state(r300);
     }
 
-    if (r300->dirty_state & R300_NEW_VERTEX_FORMAT) {
-        r300_update_fs_tab(r300);
-        r300_update_rs_block(r300);
+    if (r300->dirty_state &
+            (R300_NEW_DSA | R300_NEW_FRAGMENT_SHADER | R300_NEW_QUERY)) {
+        r300_update_ztop(r300);
     }
 }
diff --git a/src/gallium/drivers/r300/r300_state_derived.h b/src/gallium/drivers/r300/r300_state_derived.h
index 63ae8eb8d08..05ad535e2de 100644
--- a/src/gallium/drivers/r300/r300_state_derived.h
+++ b/src/gallium/drivers/r300/r300_state_derived.h
@@ -23,11 +23,11 @@
 #ifndef R300_STATE_DERIVED_H
 #define R300_STATE_DERIVED_H
 
-#include "draw/draw_vertex.h"
+struct r300_context;
 
-#include "r300_context.h"
-#include "r300_reg.h"
-#include "r300_state_inlines.h"
+unsigned r300_shader_key_hash(void* key);
+
+int r300_shader_key_compare(void* key1, void* key2);
 
 void r300_update_derived_state(struct r300_context* r300);
 
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
index 91b93fc367e..e6c1cb54dac 100644
--- a/src/gallium/drivers/r300/r300_state_inlines.h
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -24,6 +24,8 @@
 #ifndef R300_STATE_INLINES_H
 #define R300_STATE_INLINES_H
 
+#include "draw/draw_vertex.h"
+
 #include "pipe/p_format.h"
 
 #include "r300_reg.h"
@@ -51,6 +53,7 @@ static INLINE uint32_t r300_translate_blend_function(int blend_func)
             return R300_COMB_FCN_MAX;
         default:
             debug_printf("r300: Unknown blend function %d\n", blend_func);
+            assert(0);
             break;
     }
     return 0;
@@ -98,6 +101,7 @@ static INLINE uint32_t r300_translate_blend_factor(int blend_fact)
         case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: */
         default:
             debug_printf("r300: Unknown blend factor %d\n", blend_fact);
+            assert(0);
             break;
     }
     return 0;
@@ -127,6 +131,7 @@ static INLINE uint32_t r300_translate_depth_stencil_function(int zs_func)
         default:
             debug_printf("r300: Unknown depth/stencil function %d\n",
                 zs_func);
+            assert(0);
             break;
     }
     return 0;
@@ -153,6 +158,7 @@ static INLINE uint32_t r300_translate_stencil_op(int s_op)
             return R300_ZS_INVERT;
         default:
             debug_printf("r300: Unknown stencil op %d", s_op);
+            assert(0);
             break;
     }
     return 0;
@@ -179,11 +185,48 @@ static INLINE uint32_t r300_translate_alpha_function(int alpha_func)
             return R300_FG_ALPHA_FUNC_ALWAYS;
         default:
             debug_printf("r300: Unknown alpha function %d", alpha_func);
+            assert(0);
             break;
     }
     return 0;
 }
 
+static INLINE uint32_t
+r300_translate_polygon_mode_front(unsigned mode) {
+    /* Map a PIPE_POLYGON_MODE_* value to the matching
+     * R300_GA_POLY_MODE_FRONT_PTYPE_* value; bad modes log and use TRI. */
+    if (mode == PIPE_POLYGON_MODE_FILL) {
+        return R300_GA_POLY_MODE_FRONT_PTYPE_TRI;
+    }
+    if (mode == PIPE_POLYGON_MODE_LINE) {
+        return R300_GA_POLY_MODE_FRONT_PTYPE_LINE;
+    }
+    if (mode == PIPE_POLYGON_MODE_POINT) {
+        return R300_GA_POLY_MODE_FRONT_PTYPE_POINT;
+    }
+    debug_printf("r300: Bad polygon mode %i in %s\n", mode,
+        __FUNCTION__);
+    return R300_GA_POLY_MODE_FRONT_PTYPE_TRI;
+}
+
+static INLINE uint32_t
+r300_translate_polygon_mode_back(unsigned mode) {
+    /* Map a PIPE_POLYGON_MODE_* value to the matching
+     * R300_GA_POLY_MODE_BACK_PTYPE_* value; bad modes log and use TRI. */
+    if (mode == PIPE_POLYGON_MODE_FILL) {
+        return R300_GA_POLY_MODE_BACK_PTYPE_TRI;
+    }
+    if (mode == PIPE_POLYGON_MODE_LINE) {
+        return R300_GA_POLY_MODE_BACK_PTYPE_LINE;
+    }
+    if (mode == PIPE_POLYGON_MODE_POINT) {
+        return R300_GA_POLY_MODE_BACK_PTYPE_POINT;
+    }
+    debug_printf("r300: Bad polygon mode %i in %s\n", mode,
+        __FUNCTION__);
+    return R300_GA_POLY_MODE_BACK_PTYPE_TRI;
+}
+
 /* Texture sampler state. */
 
 static INLINE uint32_t r300_translate_wrap(int wrap)
@@ -207,6 +250,7 @@ static INLINE uint32_t r300_translate_wrap(int wrap)
             return R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED;
         default:
             debug_printf("r300: Unknown texture wrap %d", wrap);
+            assert(0);
             return 0;
     }
 }
@@ -226,6 +270,7 @@ static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip)
             break;
         default:
             debug_printf("r300: Unknown texture filter %d\n", min);
+            assert(0);
             break;
     }
     switch (mag) {
@@ -240,6 +285,7 @@ static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip)
             break;
         default:
             debug_printf("r300: Unknown texture filter %d\n", mag);
+            assert(0);
             break;
     }
     switch (mip) {
@@ -254,6 +300,7 @@ static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip)
             break;
         default:
             debug_printf("r300: Unknown texture filter %d\n", mip);
+            assert(0);
             break;
     }
 
@@ -277,6 +324,8 @@ static INLINE uint32_t r300_anisotropy(float max_aniso)
 
 /* Buffer formats. */
 
+/* Colorbuffer formats. This is the unswizzled format of the RB3D block's
+ * output. For the swizzling of the targets, check the shader's format. */
 static INLINE uint32_t r300_translate_colorformat(enum pipe_format format)
 {
     switch (format) {
@@ -292,7 +341,9 @@ static INLINE uint32_t r300_translate_colorformat(enum pipe_format format)
             return R300_COLOR_FORMAT_ARGB4444;
         /* 32-bit buffers */
         case PIPE_FORMAT_A8R8G8B8_UNORM:
-        case PIPE_FORMAT_Z24S8_UNORM:
+        case PIPE_FORMAT_X8R8G8B8_UNORM:
+        case PIPE_FORMAT_R8G8B8A8_UNORM:
+        case PIPE_FORMAT_R8G8B8X8_UNORM:
             return R300_COLOR_FORMAT_ARGB8888;
         /* XXX Not in pipe_format
         case PIPE_FORMAT_A32R32G32B32:
@@ -309,17 +360,21 @@ static INLINE uint32_t r300_translate_colorformat(enum pipe_format format)
             debug_printf("r300: Implementation error: "
                 "Got unsupported color format %s in %s\n",
                 pf_name(format), __FUNCTION__);
+            assert(0);
             break;
     }
     return 0;
 }
 
+/* Depthbuffer and stencilbuffer. Thankfully, we only support two flavors. */
 static INLINE uint32_t r300_translate_zsformat(enum pipe_format format)
 {
     switch (format) {
         /* 16-bit depth, no stencil */
         case PIPE_FORMAT_Z16_UNORM:
             return R300_DEPTHFORMAT_16BIT_INT_Z;
+        /* 24-bit depth, ignored stencil */
+        case PIPE_FORMAT_Z24X8_UNORM:
         /* 24-bit depth, 8-bit stencil */
         case PIPE_FORMAT_Z24S8_UNORM:
             return R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
@@ -327,25 +382,36 @@ static INLINE uint32_t r300_translate_zsformat(enum pipe_format format)
             debug_printf("r300: Implementation error: "
                 "Got unsupported ZS format %s in %s\n",
                 pf_name(format), __FUNCTION__);
+            assert(0);
             break;
     }
     return 0;
 }
 
-/* Translate pipe_format into US_OUT_FMT.
+/* Shader output formats. This is essentially the swizzle from the shader
+ * to the RB3D block.
+ *
  * Note that formats are stored from C3 to C0. */
 static INLINE uint32_t r300_translate_out_fmt(enum pipe_format format)
 {
     switch (format) {
         case PIPE_FORMAT_A8R8G8B8_UNORM:
+        case PIPE_FORMAT_X8R8G8B8_UNORM:
+        /* XXX */
         case PIPE_FORMAT_Z24S8_UNORM:
             return R300_US_OUT_FMT_C4_8 |
                 R300_C0_SEL_B | R300_C1_SEL_G |
                 R300_C2_SEL_R | R300_C3_SEL_A;
+        case PIPE_FORMAT_R8G8B8A8_UNORM:
+        case PIPE_FORMAT_R8G8B8X8_UNORM:
+            return R300_US_OUT_FMT_C4_8 |
+                R300_C0_SEL_A | R300_C1_SEL_B |
+                R300_C2_SEL_G | R300_C3_SEL_R;
         default:
             debug_printf("r300: Implementation error: "
                 "Got unsupported output format %s in %s\n",
                 pf_name(format), __FUNCTION__);
+            assert(0);
             return R300_US_OUT_FMT_UNUSED;
     }
     return 0;
@@ -372,32 +438,114 @@ static INLINE uint32_t r300_translate_gb_pipes(int pipe_count)
     return 0;
 }
 
-static INLINE uint32_t translate_vertex_data_type(int type) {
-    switch (type) {
-        case EMIT_1F:
-        case EMIT_1F_PSIZE:
-            return R300_DATA_TYPE_FLOAT_1;
-            break;
-        case EMIT_2F:
-            return R300_DATA_TYPE_FLOAT_2;
-            break;
-        case EMIT_3F:
-            return R300_DATA_TYPE_FLOAT_3;
-            break;
-        case EMIT_4F:
-            return R300_DATA_TYPE_FLOAT_4;
+/* Utility function to count the components of an RGBAZS-layout format.
+ *
+ * A channel (x, y, z or w) counts as a component when its pf_size_*()
+ * value is nonzero. Formats with any other layout yield 0.
+ *
+ * XXX should go to util or p_format.h */
+static INLINE unsigned pf_component_count(enum pipe_format format) {
+    unsigned present;
+
+    /* Channel sizes are only consulted for the RGBAZS layout. */
+    if (pf_layout(format) != PIPE_FORMAT_LAYOUT_RGBAZS) {
+        return 0;
+    }
+
+    /* Each comparison is 1 when the channel is present and 0 otherwise,
+     * so the sum is the number of channels in use. */
+    present = 0;
+    present += (pf_size_x(format) != 0);
+    present += (pf_size_y(format) != 0);
+    present += (pf_size_z(format) != 0);
+    present += (pf_size_w(format) != 0);
+
+    return present;
+}
+
+/* Translate pipe_formats into PSC vertex types. */
+static INLINE uint16_t
+r300_translate_vertex_data_type(enum pipe_format format) {
+    uint32_t result = 0;
+    unsigned components = pf_component_count(format);
+
+    if (pf_layout(format) != PIPE_FORMAT_LAYOUT_RGBAZS) {
+        debug_printf("r300: Bad format %s in %s:%d\n", pf_name(format),
+            __FUNCTION__, __LINE__);
+        assert(0);
+    }
+
+    switch (pf_type(format)) {
+        /* Half-floats, floats, doubles */
+        case PIPE_FORMAT_TYPE_FLOAT:
+            switch (pf_size_x(format)) {
+                case 4:
+                    result = R300_DATA_TYPE_FLOAT_1 + (components - 1);
+                    break;
+                default:
+                    debug_printf("r300: Bad format %s in %s:%d\n",
+                        pf_name(format), __FUNCTION__, __LINE__);
+                    assert(0);
+            }
             break;
-        case EMIT_4UB:
-            return R300_DATA_TYPE_BYTE;
+        /* Normalized unsigned ints */
+        case PIPE_FORMAT_TYPE_UNORM:
+        /* Normalized signed ints */
+        case PIPE_FORMAT_TYPE_SNORM:
+        /* Non-normalized unsigned ints */
+        case PIPE_FORMAT_TYPE_USCALED:
+        /* Non-normalized signed ints */
+        case PIPE_FORMAT_TYPE_SSCALED:
+            switch (pf_size_x(format)) {
+                case 1:
+                    result = R300_DATA_TYPE_BYTE;
+                    break;
+                case 2:
+                    if (components > 2) {
+                        result = R300_DATA_TYPE_SHORT_4;
+                    } else {
+                        result = R300_DATA_TYPE_SHORT_2;
+                    }
+                    break;
+                default:
+                    debug_printf("r300: Bad format %s in %s:%d\n",
+                        pf_name(format), __FUNCTION__, __LINE__);
+                    debug_printf("r300: pf_size_x(format) == %d\n",
+                        pf_size_x(format));
+                    assert(0);
+            }
             break;
         default:
-            debug_printf("r300: Implementation error: "
-                    "Bad vertex data type!\n");
+            debug_printf("r300: Bad format %s in %s:%d\n",
+                pf_name(format), __FUNCTION__, __LINE__);
             assert(0);
-            break;
     }
 
-    return 0;
+    if (pf_type(format) == PIPE_FORMAT_TYPE_SSCALED) {
+        result |= R300_SIGNED;
+    } else if (pf_type(format) == PIPE_FORMAT_TYPE_UNORM) {
+        result |= R300_NORMALIZE;
+    } else if (pf_type(format) == PIPE_FORMAT_TYPE_SNORM) {
+        result |= (R300_SIGNED | R300_NORMALIZE);
+    }
+
+    return result;
+}
+
+static INLINE uint16_t
+r300_translate_vertex_data_swizzle(enum pipe_format format) {
+
+    if (pf_layout(format) != PIPE_FORMAT_LAYOUT_RGBAZS) {
+        debug_printf("r300: Bad format %s in %s:%d\n",
+            pf_name(format), __FUNCTION__, __LINE__);
+        return 0;
+    }
+
+    return ((pf_swizzle_x(format) << R300_SWIZZLE_SELECT_X_SHIFT) |
+        (pf_swizzle_y(format) << R300_SWIZZLE_SELECT_Y_SHIFT) |
+        (pf_swizzle_z(format) << R300_SWIZZLE_SELECT_Z_SHIFT) |
+        (pf_swizzle_w(format) << R300_SWIZZLE_SELECT_W_SHIFT) |
+        (0xf << R300_WRITE_ENA_SHIFT));
 }
 
 #endif /* R300_STATE_INLINES_H */
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index 7d822fec483..c07e6ae676d 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -21,8 +21,17 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "r300_context.h"
+#include "r300_cs.h"
+#include "r300_reg.h"
+#include "r300_screen.h"
 #include "r300_state_invariant.h"
 
+struct pipe_viewport_state r300_viewport_identity = {
+    .scale = {1.0, 1.0, 1.0, 1.0},     /* unit scale on every component */
+    .translate = {0.0, 0.0, 0.0, 0.0}, /* zero offset on every component */
+};
+
 /* Calculate and emit invariant state. This is data that the 3D engine
  * will probably want at the beginning of every CS, but it's not currently
  * handled by any CSO setup, and in addition it doesn't really change much.
@@ -75,7 +84,7 @@ void r300_emit_invariant_state(struct r300_context* r300)
     END_CS;
 
     /* XXX unsorted stuff from surface_fill */
-    BEGIN_CS(64 + (caps->has_tcl ? 5 : 0) + (caps->is_r500 ? 4 : 0));
+    BEGIN_CS(60 + (caps->has_tcl ? 5 : 0) + (caps->is_r500 ? 4 : 0));
     /* Flush PVS. */
     OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
 
@@ -105,7 +114,6 @@ void r300_emit_invariant_state(struct r300_context* r300)
     /* XXX this big chunk should be refactored into rs_state */
     OUT_CS_REG(R300_GA_SOLID_RG, 0x00000000);
     OUT_CS_REG(R300_GA_SOLID_BA, 0x00000000);
-    OUT_CS_REG(R300_GA_POLY_MODE, 0x00000000);
     OUT_CS_REG(R300_GA_ROUND_MODE, 0x00000001);
     OUT_CS_REG(R300_GA_OFFSET, 0x00000000);
     OUT_CS_REG(R300_GA_FOG_SCALE, 0x3DBF1412);
@@ -116,7 +124,6 @@ void r300_emit_invariant_state(struct r300_context* r300)
     OUT_CS_REG(R300_SC_HYPERZ, 0x0000001C);
     OUT_CS_REG(R300_SC_EDGERULE, 0x2DA49525);
     OUT_CS_REG(R300_RB3D_CCTL, 0x00000000);
-    OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0x0000000F);
     OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x00000000);
     if (caps->is_r500) {
         OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x00000000);
diff --git a/src/gallium/drivers/r300/r300_state_invariant.h b/src/gallium/drivers/r300/r300_state_invariant.h
index 5bea6779fe5..05cff0d6dfe 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.h
+++ b/src/gallium/drivers/r300/r300_state_invariant.h
@@ -23,11 +23,7 @@
 #ifndef R300_STATE_INVARIANT_H
 #define R300_STATE_INVARIANT_H
 
-#include "r300_chipset.h"
-#include "r300_context.h"
-#include "r300_cs.h"
-#include "r300_reg.h"
-#include "r300_state_inlines.h"
+struct r300_context;
 
 void r300_emit_invariant_state(struct r300_context* r300);
 
diff --git a/src/gallium/drivers/r300/r300_surface.c b/src/gallium/drivers/r300/r300_surface.c
deleted file mode 100644
index 96e6e4a77d4..00000000000
--- a/src/gallium/drivers/r300/r300_surface.c
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
- *                Joakim Sindholt <opensource@zhasha.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#include "r300_surface.h"
-
-static void r300_surface_setup(struct r300_context* r300,
-                               struct r300_texture* dest,
-                               unsigned x, unsigned y,
-                               unsigned w, unsigned h)
-{
-    struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
-    unsigned pixpitch = dest->stride / dest->tex.block.size;
-    CS_LOCALS(r300);
-
-    r300_emit_blend_state(r300, &blend_clear_state);
-    r300_emit_blend_color_state(r300, &blend_color_clear_state);
-    r300_emit_dsa_state(r300, &dsa_clear_state);
-    r300_emit_rs_state(r300, &rs_clear_state);
-
-    BEGIN_CS(26);
-
-    /* Viewport setup */
-    OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
-    OUT_CS_32F((float)w);
-    OUT_CS_32F((float)x);
-    OUT_CS_32F((float)h);
-    OUT_CS_32F((float)y);
-    OUT_CS_32F(1.0);
-    OUT_CS_32F(0.0);
-
-    OUT_CS_REG(R300_VAP_VTE_CNTL, R300_VPORT_X_SCALE_ENA |
-            R300_VPORT_X_OFFSET_ENA |
-            R300_VPORT_Y_SCALE_ENA |
-            R300_VPORT_Y_OFFSET_ENA |
-            R300_VTX_XY_FMT | R300_VTX_Z_FMT);
-
-    /* Pixel scissors. */
-    OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
-    if (caps->is_r500) {
-        OUT_CS((x << R300_SCISSORS_X_SHIFT) | (y << R300_SCISSORS_Y_SHIFT));
-        OUT_CS(((w - 1) << R300_SCISSORS_X_SHIFT) | ((h - 1) << R300_SCISSORS_Y_SHIFT));
-    } else {
-        /* Non-R500 chipsets have an offset of 1440 in their scissors. */
-        OUT_CS(((x + 1440) << R300_SCISSORS_X_SHIFT) |
-                ((y + 1440) << R300_SCISSORS_Y_SHIFT));
-        OUT_CS((((w - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
-                (((h - 1) + 1440) << R300_SCISSORS_Y_SHIFT));
-    }
-
-    /* Flush colorbuffer and blend caches. */
-    OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT,
-        R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D |
-        R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_SIGNAL);
-    OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT,
-        R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
-        R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
-
-    /* Setup colorbuffer. */
-    OUT_CS_REG_SEQ(R300_RB3D_COLOROFFSET0, 1);
-    OUT_CS_RELOC(dest->buffer, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
-    OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0, 1);
-    OUT_CS_RELOC(dest->buffer, pixpitch |
-                 r300_translate_colorformat(dest->tex.format), 0,
-                 RADEON_GEM_DOMAIN_VRAM, 0);
-    OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0xf);
-
-    END_CS;
-}
-
-/* Provides pipe_context's "surface_fill". Commonly used for clearing
- * buffers. */
-static void r300_surface_fill(struct pipe_context* pipe,
-                              struct pipe_surface* dest,
-                              unsigned x, unsigned y,
-                              unsigned w, unsigned h,
-                              unsigned color)
-{
-    int i;
-    float r, g, b, a, depth;
-    struct r300_context* r300 = r300_context(pipe);
-    struct r300_capabilities* caps = r300_screen(pipe->screen)->caps;
-    struct r300_texture* tex = (struct r300_texture*)dest->texture;
-    unsigned pixpitch = tex->stride / tex->tex.block.size;
-    boolean invalid = FALSE;
-    CS_LOCALS(r300);
-
-    a = (float)((color >> 24) & 0xff) / 255.0f;
-    r = (float)((color >> 16) & 0xff) / 255.0f;
-    g = (float)((color >>  8) & 0xff) / 255.0f;
-    b = (float)((color >>  0) & 0xff) / 255.0f;
-    debug_printf("r300: Filling surface %p at (%d,%d),"
-        " dimensions %dx%d (pixel pitch %d), color 0x%x\n",
-        dest, x, y, w, h, pixpitch, color);
-
-    /* Fallback? */
-    if (FALSE) {
-fallback:
-        debug_printf("r300: Falling back on surface clear...");
-        util_surface_fill(pipe, dest, x, y, w, h, color);
-        return;
-    }
-
-    /* Make sure our target BO is okay. */
-validate:
-    if (!r300->winsys->add_buffer(r300->winsys, tex->buffer,
-                0, RADEON_GEM_DOMAIN_VRAM)) {
-        r300->context.flush(&r300->context, 0, NULL);
-        goto validate;
-    }
-    if (!r300->winsys->validate(r300->winsys)) {
-        r300->context.flush(&r300->context, 0, NULL);
-        if (invalid) {
-            debug_printf("r300: Stuck in validation loop, gonna fallback.");
-            goto fallback;
-        }
-        invalid = TRUE;
-        goto validate;
-    }
-
-    r300_surface_setup(r300, tex, x, y, w, h);
-
-    /* Vertex shader setup */
-    if (caps->has_tcl) {
-        r300_emit_vertex_program_code(r300, &r300_passthrough_vertex_shader, 0);
-    } else {
-        BEGIN_CS(4);
-        OUT_CS_REG(R300_VAP_CNTL_STATUS,
-#ifdef PIPE_ARCH_BIG_ENDIAN
-                   R300_VC_32BIT_SWAP |
-#endif
-                   R300_VAP_TCL_BYPASS);
-        OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(5) |
-                R300_PVS_NUM_CNTLRS(5) |
-                R300_PVS_NUM_FPUS(caps->num_vert_fpus) |
-                R300_PVS_VF_MAX_VTX_NUM(12));
-        END_CS;
-    }
-
-    /* Fragment shader setup */
-    if (caps->is_r500) {
-        r500_emit_fragment_program_code(r300, &r5xx_passthrough_fragment_shader, 0);
-        r300_emit_rs_block_state(r300, &r5xx_rs_block_clear_state);
-    } else {
-        r300_emit_fragment_program_code(r300, &r3xx_passthrough_fragment_shader, 0);
-        r300_emit_rs_block_state(r300, &r3xx_rs_block_clear_state);
-    }
-
-    BEGIN_CS(26);
-
-    /* VAP stream control, mapping from input memory to PVS/RS memory */
-    if (caps->has_tcl) {
-        OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0,
-            (R300_DATA_TYPE_FLOAT_4 << R300_DATA_TYPE_0_SHIFT) |
-            ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) |
-                R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT));
-    } else {
-        OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0,
-            (R300_DATA_TYPE_FLOAT_4 << R300_DATA_TYPE_0_SHIFT) |
-            ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) |
-                R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT));
-    }
-    OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_EXT_0,
-            (R300_VAP_SWIZZLE_XYZW << R300_SWIZZLE0_SHIFT) |
-            (R300_VAP_SWIZZLE_XYZW << R300_SWIZZLE1_SHIFT));
-
-    /* VAP format controls */
-    OUT_CS_REG(R300_VAP_OUTPUT_VTX_FMT_0,
-            R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
-            R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
-    OUT_CS_REG(R300_VAP_OUTPUT_VTX_FMT_1, 0x0);
-
-    /* Disable textures */
-    OUT_CS_REG(R300_TX_ENABLE, 0x0);
-
-    /* The size of the point we're about to draw, in sixths of pixels */
-    OUT_CS_REG(R300_GA_POINT_SIZE,
-        ((h * 6)  & R300_POINTSIZE_Y_MASK) |
-        ((w * 6) << R300_POINTSIZE_X_SHIFT));
-
-    /* Vertex size. */
-    OUT_CS_REG(R300_VAP_VTX_SIZE, 0x8);
-
-    /* Packet3 with our point vertex */
-    OUT_CS_PKT3(R200_3D_DRAW_IMMD_2, 8);
-    OUT_CS(R300_PRIM_TYPE_POINT | R300_PRIM_WALK_RING |
-            (1 << R300_PRIM_NUM_VERTICES_SHIFT));
-    /* Position */
-    OUT_CS_32F(0.5);
-    OUT_CS_32F(0.5);
-    OUT_CS_32F(1.0);
-    OUT_CS_32F(1.0);
-    /* Color */
-    OUT_CS_32F(r);
-    OUT_CS_32F(g);
-    OUT_CS_32F(b);
-    OUT_CS_32F(a);
-
-    OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT, 0xA);
-
-    END_CS;
-
-    r300->dirty_hw++;
-}
-
-static void r300_surface_copy(struct pipe_context* pipe,
-                              struct pipe_surface* dest,
-                              unsigned destx, unsigned desty,
-                              struct pipe_surface* src,
-                              unsigned srcx, unsigned srcy,
-                              unsigned w, unsigned h)
-{
-    struct r300_context* r300 = r300_context(pipe);
-    struct r300_capabilities* caps = r300_screen(pipe->screen)->caps;
-    struct r300_texture* srctex = (struct r300_texture*)src->texture;
-    struct r300_texture* desttex = (struct r300_texture*)dest->texture;
-    unsigned pixpitch = srctex->stride / srctex->tex.block.size;
-    boolean invalid = FALSE;
-    float fsrcx = srcx, fsrcy = srcy, fdestx = destx, fdesty = desty;
-    CS_LOCALS(r300);
-
-    debug_printf("r300: Copying surface %p at (%d,%d) to %p at (%d, %d),"
-        " dimensions %dx%d (pixel pitch %d)\n",
-        src, srcx, srcy, dest, destx, desty, w, h, pixpitch);
-
-    if ((srctex->buffer == desttex->buffer) &&
-            ((destx < srcx + w) || (srcx < destx + w)) &&
-            ((desty < srcy + h) || (srcy < desty + h))) {
-fallback:
-        debug_printf("r300: Falling back on surface_copy\n");
-        util_surface_copy(pipe, FALSE, dest, destx, desty, src,
-                srcx, srcy, w, h);
-    }
-
-    /* Add our target BOs to the list. */
-validate:
-    if (!r300->winsys->add_buffer(r300->winsys, srctex->buffer,
-                RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0)) {
-        r300->context.flush(&r300->context, 0, NULL);
-        goto validate;
-    }
-    if (!r300->winsys->add_buffer(r300->winsys, desttex->buffer,
-                0, RADEON_GEM_DOMAIN_VRAM)) {
-        r300->context.flush(&r300->context, 0, NULL);
-        goto validate;
-    }
-    if (!r300->winsys->validate(r300->winsys)) {
-        r300->context.flush(&r300->context, 0, NULL);
-        if (invalid) {
-            debug_printf("r300: Stuck in validation loop, gonna fallback.");
-            goto fallback;
-        }
-        invalid = TRUE;
-        goto validate;
-    }
-
-    r300_surface_setup(r300, desttex, destx, desty, w, h);
-
-    /* Setup the texture. */
-    r300_emit_texture(r300, &r300_sampler_copy_state, srctex, 0);
-
-    /* Flush and enable. */
-    r300_flush_textures(r300);
-
-    /* Vertex shader setup */
-    if (caps->has_tcl) {
-        r300_emit_vertex_program_code(r300, &r300_passthrough_vertex_shader, 0);
-    } else {
-        BEGIN_CS(4);
-        OUT_CS_REG(R300_VAP_CNTL_STATUS,
-#ifdef PIPE_ARCH_BIG_ENDIAN
-                   R300_VC_32BIT_SWAP |
-#endif
-                   R300_VAP_TCL_BYPASS);
-        OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(5) |
-                R300_PVS_NUM_CNTLRS(5) |
-                R300_PVS_NUM_FPUS(caps->num_vert_fpus) |
-                R300_PVS_VF_MAX_VTX_NUM(12));
-        END_CS;
-    }
-
-    /* Fragment shader setup */
-    if (caps->is_r500) {
-        r500_emit_fragment_program_code(r300, &r5xx_texture_fragment_shader, 0);
-        r300_emit_rs_block_state(r300, &r5xx_rs_block_copy_state);
-    } else {
-        r300_emit_fragment_program_code(r300, &r3xx_texture_fragment_shader, 0);
-        r300_emit_rs_block_state(r300, &r3xx_rs_block_copy_state);
-    }
-
-    BEGIN_CS(30);
-    /* VAP stream control, mapping from input memory to PVS/RS memory */
-    if (caps->has_tcl) {
-        OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0,
-            (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
-            ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) |
-                R300_DATA_TYPE_FLOAT_2) << R300_DATA_TYPE_1_SHIFT));
-    } else {
-        OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_0,
-            (R300_DATA_TYPE_FLOAT_2 << R300_DATA_TYPE_0_SHIFT) |
-            ((R300_LAST_VEC | (6 << R300_DST_VEC_LOC_SHIFT) |
-                R300_DATA_TYPE_FLOAT_2) << R300_DATA_TYPE_1_SHIFT));
-    }
-    OUT_CS_REG(R300_VAP_PROG_STREAM_CNTL_EXT_0,
-            (R300_VAP_SWIZZLE_XYZW << R300_SWIZZLE0_SHIFT) |
-            (R300_VAP_SWIZZLE_XYZW << R300_SWIZZLE1_SHIFT));
-
-    /* VAP format controls */
-    OUT_CS_REG(R300_VAP_OUTPUT_VTX_FMT_0,
-            R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT);
-    /* Two components of texture 0 */
-    OUT_CS_REG(R300_VAP_OUTPUT_VTX_FMT_1, 0x2);
-
-    /* Vertex size. */
-    OUT_CS_REG(R300_VAP_VTX_SIZE, 0x4);
-
-    /* Packet3 with our texcoords */
-    OUT_CS_PKT3(R200_3D_DRAW_IMMD_2, 16);
-    OUT_CS(R300_PRIM_TYPE_QUADS | R300_PRIM_WALK_RING |
-            (4 << R300_PRIM_NUM_VERTICES_SHIFT));
-    /* (x    , y    ) */
-    OUT_CS_32F(fdestx / dest->width);
-    OUT_CS_32F(fdesty / dest->height);
-    OUT_CS_32F(fsrcx  / src->width);
-    OUT_CS_32F(fsrcy  / src->height);
-    /* (x    , y + h) */
-    OUT_CS_32F(fdestx / dest->width);
-    OUT_CS_32F((fdesty + h) / dest->height);
-    OUT_CS_32F(fsrcx  / src->width);
-    OUT_CS_32F((fsrcy  + h) / src->height);
-    /* (x + w, y + h) */
-    OUT_CS_32F((fdestx + w) / dest->width);
-    OUT_CS_32F((fdesty + h) / dest->height);
-    OUT_CS_32F((fsrcx  + w) / src->width);
-    OUT_CS_32F((fsrcy  + h) / src->height);
-    /* (x + w, y    ) */
-    OUT_CS_32F((fdestx + w) / dest->width);
-    OUT_CS_32F(fdesty / dest->height);
-    OUT_CS_32F((fsrcx  + w) / src->width);
-    OUT_CS_32F(fsrcy  / src->height);
-
-    OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT, 0xA);
-
-    END_CS;
-
-    r300->dirty_hw++;
-}
-
-void r300_init_surface_functions(struct r300_context* r300)
-{
-    r300->context.surface_fill = r300_surface_fill;
-    r300->context.surface_copy = r300_surface_copy;
-}
diff --git a/src/gallium/drivers/r300/r300_surface.h b/src/gallium/drivers/r300/r300_surface.h
deleted file mode 100644
index f9e98b2ec9c..00000000000
--- a/src/gallium/drivers/r300/r300_surface.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#ifndef R300_SURFACE_H
-#define R300_SURFACE_H
-
-#include "pipe/p_context.h"
-#include "pipe/p_screen.h"
-
-#include "util/u_rect.h"
-
-#include "r300_context.h"
-#include "r300_cs.h"
-#include "r300_emit.h"
-#include "r300_fs.h"
-#include "r300_vs.h"
-#include "r300_state_inlines.h"
-
-static struct r300_blend_state blend_clear_state = {
-    .blend_control = 0x0,
-    .alpha_blend_control = 0x0,
-    .rop = 0x0,
-    .dither = 0x0,
-};
-
-static struct r300_blend_color_state blend_color_clear_state = {
-    .blend_color = 0x0,
-    .blend_color_red_alpha = 0x0,
-    .blend_color_green_blue = 0x0,
-};
-
-static struct r300_dsa_state dsa_clear_state = {
-    .alpha_function = 0x0,
-    .alpha_reference = 0x0,
-    .z_buffer_control = 0x0,
-    .z_stencil_control = 0x0,
-    .stencil_ref_mask = R300_STENCILWRITEMASK_MASK,
-    .z_buffer_top = R300_ZTOP_ENABLE,
-    .stencil_ref_bf = 0x0,
-};
-
-static struct r300_rs_state rs_clear_state = {
-    .point_minmax = 0x36000006,
-    .line_control = 0x00030006,
-    .depth_scale_front = 0x0,
-    .depth_offset_front = 0x0,
-    .depth_scale_back = 0x0,
-    .depth_offset_back = 0x0,
-    .polygon_offset_enable = 0x0,
-    .cull_mode = 0x0,
-    .line_stipple_config = 0x3BAAAAAB,
-    .line_stipple_value = 0x0,
-    .color_control = R300_SHADE_MODEL_FLAT,
-};
-
-static struct r300_rs_block r3xx_rs_block_clear_state = {
-    .ip[0] = R500_RS_SEL_S(R300_RS_SEL_C0) |
-        R500_RS_SEL_T(R300_RS_SEL_C0) |
-        R500_RS_SEL_R(R300_RS_SEL_C0) |
-        R500_RS_SEL_Q(R300_RS_SEL_K1),
-    .inst[0] = R300_RS_INST_COL_CN_WRITE,
-    .count = R300_IT_COUNT(0) | R300_IC_COUNT(1) | R300_HIRES_EN,
-    .inst_count = 0,
-};
-
-static struct r300_rs_block r5xx_rs_block_clear_state = {
-    .ip[0] = R500_RS_SEL_S(R500_RS_IP_PTR_K0) |
-        R500_RS_SEL_T(R500_RS_IP_PTR_K0) |
-        R500_RS_SEL_R(R500_RS_IP_PTR_K0) |
-        R500_RS_SEL_Q(R500_RS_IP_PTR_K1),
-    .inst[0] = R500_RS_INST_COL_CN_WRITE,
-    .count = R300_IT_COUNT(0) | R300_IC_COUNT(1) | R300_HIRES_EN,
-    .inst_count = 0,
-};
-
-/* The following state is used for surface_copy only. */
-
-static struct r300_rs_block r3xx_rs_block_copy_state = {
-    .ip[0] = R500_RS_SEL_S(R300_RS_SEL_K0) |
-        R500_RS_SEL_T(R300_RS_SEL_K0) |
-        R500_RS_SEL_R(R300_RS_SEL_K0) |
-        R500_RS_SEL_Q(R300_RS_SEL_K1),
-    .inst[0] = R300_RS_INST_COL_CN_WRITE,
-    .count = R300_IT_COUNT(2) | R300_IC_COUNT(0) | R300_HIRES_EN,
-    .inst_count = R300_RS_TX_OFFSET(0),
-};
-
-static struct r300_rs_block r5xx_rs_block_copy_state = {
-    .ip[0] = R500_RS_SEL_S(0) |
-        R500_RS_SEL_T(1) |
-        R500_RS_SEL_R(R500_RS_IP_PTR_K0) |
-        R500_RS_SEL_Q(R500_RS_IP_PTR_K1),
-    .inst[0] = R500_RS_INST_TEX_CN_WRITE,
-    .count = R300_IT_COUNT(2) | R300_IC_COUNT(0) | R300_HIRES_EN,
-    .inst_count = R300_RS_TX_OFFSET(0),
-};
-
-static struct r300_sampler_state r300_sampler_copy_state = {
-    .filter0 = R300_TX_WRAP_S(R300_TX_CLAMP) |
-        R300_TX_WRAP_T(R300_TX_CLAMP) |
-        R300_TX_MAG_FILTER_NEAREST |
-        R300_TX_MIN_FILTER_NEAREST,
-};
-
-#endif /* R300_SURFACE_H */
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 590052509cc..aea25cf71dd 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -20,43 +20,99 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "pipe/p_screen.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "r300_context.h"
 #include "r300_texture.h"
+#include "r300_screen.h"
 
-static void r300_setup_texture_state(struct r300_texture* tex,
-                                     unsigned width,
-                                     unsigned height,
-                                     unsigned pitch,
-                                     unsigned levels)
+static void r300_setup_texture_state(struct r300_texture* tex, boolean is_r500)
 {
     struct r300_texture_state* state = &tex->state;
+    struct pipe_texture *pt = &tex->tex;
+    /* Level-0 width/height are programmed minus one, masked to 11 bits. */
+    state->format0 = R300_TX_WIDTH((pt->width[0] - 1) & 0x7ff) |
+                     R300_TX_HEIGHT((pt->height[0] - 1) & 0x7ff);
+
+    if (tex->is_npot) {
+        /* NPOT: enable explicit pitch; format2 gets the level-0 pitch minus one */
+        state->format0 |= R300_TX_PITCH_EN;
+        state->format2 = (tex->pitch[0] - 1) & 0x1fff;
+    } else {
+        /* POT: no pitch; program util_logbase2(depth) and last_level instead */
+        state->format0 |= R300_TX_DEPTH(util_logbase2(pt->depth[0]) & 0xf) |
+                          R300_TX_NUM_LEVELS(pt->last_level & 0xf);
+    }
 
-    state->format0 = R300_TX_WIDTH((width - 1) & 0x7ff) |
-        R300_TX_HEIGHT((height - 1) & 0x7ff) |
-        R300_TX_NUM_LEVELS(levels) |
-        R300_TX_PITCH_EN;
+    state->format1 = r300_translate_texformat(pt->format);
+    if (pt->target == PIPE_TEXTURE_CUBE) {
+        state->format1 |= R300_TX_FORMAT_CUBIC_MAP;
+    }
+    if (pt->target == PIPE_TEXTURE_3D) {
+        state->format1 |= R300_TX_FORMAT_3D;
+    }
 
-    /* XXX */
-    state->format1 = r300_translate_texformat(tex->tex.format);
+    /* r500 only: set the extra BIT11 flags for dimensions above 2048 */
+    if (is_r500)
+    {
+        if (pt->width[0] > 2048) {
+            state->format2 |= R500_TXWIDTH_BIT11;
+        }
+        if (pt->height[0] > 2048) {
+            state->format2 |= R500_TXHEIGHT_BIT11;
+        }
+    }
+    assert(is_r500 || (pt->width[0] <= 2048 && pt->height[0] <= 2048)); /* non-r500: at most 2048x2048 */
 
-    state->format2 = pitch - 1;
+    debug_printf("r300: Set texture state (%dx%d, %d levels)\n",
+		 pt->width[0], pt->height[0], pt->last_level);
+}
 
-    /* Assume (somewhat foolishly) that oversized textures will
-     * not be permitted by the state tracker. */
-    if (width > 2048) {
-        state->format2 |= R500_TXWIDTH_BIT11;
+unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
+                                 unsigned zslice, unsigned face)
+{
+    unsigned offset = tex->offset[level];
+    /* Start from the level's offset, then step by layer_size per zslice/face. */
+    switch (tex->tex.target) {
+        case PIPE_TEXTURE_3D:
+            assert(face == 0);
+            return offset + zslice * tex->layer_size[level];
+
+        case PIPE_TEXTURE_CUBE:
+            assert(zslice == 0);
+            return offset + face * tex->layer_size[level];
+
+        default: /* every other target: the level offset itself */
+            assert(zslice == 0 && face == 0);
+            return offset;
     }
-    if (height > 2048) {
-        state->format2 |= R500_TXHEIGHT_BIT11;
+}
+
+/**
+ * Return the stride, in bytes, of the given level's images: stride_override
+ * when set (for any level), 0 for a level past last_level, else aligned to 32.
+ */
+unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level)
+{
+    if (tex->stride_override)
+        return tex->stride_override;
+
+    if (level > tex->tex.last_level) {
+        debug_printf("%s: level (%u) > last_level (%u)\n", __FUNCTION__,
+            level, tex->tex.last_level);
+        return 0;
     }
 
-    debug_printf("r300: Set texture state (%dx%d, pitch %d, %d levels)\n",
-            width, height, pitch, levels);
+    return align(pf_get_stride(&tex->tex.block, tex->tex.width[level]), 32);
 }
 
 static void r300_setup_miptree(struct r300_texture* tex)
 {
     struct pipe_texture* base = &tex->tex;
-    int stride, size, offset;
+    int stride, size, layer_size;
     int i;
 
     for (i = 0; i <= base->last_level; i++) {
@@ -69,28 +125,32 @@ static void r300_setup_miptree(struct r300_texture* tex)
         base->nblocksx[i] = pf_get_nblocksx(&base->block, base->width[i]);
         base->nblocksy[i] = pf_get_nblocksy(&base->block, base->height[i]);
 
-        /* Radeons enjoy things in multiples of 64.
-         *
-         * XXX
-         * POT, uncompressed, unmippmapped textures can be aligned to 32,
-         * instead of 64. */
-        stride = align(pf_get_stride(&base->block, base->width[i]), 32);
-        size = stride * base->nblocksy[i] * base->depth[i];
+        stride = r300_texture_get_stride(tex, i);
+        layer_size = stride * base->nblocksy[i];
+
+        if (base->target == PIPE_TEXTURE_CUBE)
+            size = layer_size * 6;
+        else
+            size = layer_size * base->depth[i];
 
         tex->offset[i] = align(tex->size, 32);
         tex->size = tex->offset[i] + size;
+        tex->layer_size[i] = layer_size;
+        tex->pitch[i] = stride / base->block.size;
 
         debug_printf("r300: Texture miptree: Level %d "
                 "(%dx%dx%d px, pitch %d bytes)\n",
                 i, base->width[i], base->height[i], base->depth[i],
                 stride);
-        /* Save stride of first level to the texture. */
-        if (i == 0) {
-            tex->stride = stride;
-        }
     }
 }
 
+static void r300_setup_flags(struct r300_texture* tex)
+{   /* is_npot: set when either level-0 dimension is not a power of two */
+    tex->is_npot = !util_is_power_of_two(tex->tex.width[0]) ||
+                   !util_is_power_of_two(tex->tex.height[0]);
+}
+
 /* Create a new texture. */
 static struct pipe_texture*
     r300_texture_create(struct pipe_screen* screen,
@@ -106,10 +166,9 @@ static struct pipe_texture*
     pipe_reference_init(&tex->tex.reference, 1);
     tex->tex.screen = screen;
 
+    r300_setup_flags(tex);
     r300_setup_miptree(tex);
-
-    r300_setup_texture_state(tex, template->width[0], template->height[0],
-            template->width[0], template->last_level);
+    r300_setup_texture_state(tex, r300_screen(screen)->caps->is_r500);
 
     tex->buffer = screen->buffer_create(screen, 1024,
                                         PIPE_BUFFER_USAGE_PIXEL,
@@ -143,8 +202,7 @@ static struct pipe_surface* r300_get_tex_surface(struct pipe_screen* screen,
     struct pipe_surface* surface = CALLOC_STRUCT(pipe_surface);
     unsigned offset;
 
-    /* XXX this is certainly dependent on tex target */
-    offset = tex->offset[level];
+    offset = r300_texture_get_offset(tex, level, zslice, face);
 
     if (surface) {
         pipe_reference_init(&surface->reference, 1);
@@ -154,6 +212,10 @@ static struct pipe_surface* r300_get_tex_surface(struct pipe_screen* screen,
         surface->height = texture->height[level];
         surface->offset = offset;
         surface->usage = flags;
+        surface->zslice = zslice;
+        surface->texture = texture;
+        surface->face = face;
+        surface->level = level;
     }
 
     return surface;
@@ -173,10 +235,10 @@ static struct pipe_texture*
 {
     struct r300_texture* tex;
 
-    /* XXX we should start doing mips now... */
+    /* Support only 2D textures without mipmaps */
     if (base->target != PIPE_TEXTURE_2D ||
-        base->last_level != 0 ||
-        base->depth[0] != 1) {
+        base->depth[0] != 1 ||
+        base->last_level != 0) {
         return NULL;
     }
 
@@ -189,17 +251,66 @@ static struct pipe_texture*
     pipe_reference_init(&tex->tex.reference, 1);
     tex->tex.screen = screen;
 
-    tex->stride = *stride;
+    tex->stride_override = *stride;
+    tex->pitch[0] = *stride / base->block.size;
 
-    /* XXX */
-    r300_setup_texture_state(tex, tex->tex.width[0], tex->tex.height[0],
-            tex->stride, 0);
+    r300_setup_flags(tex);
+    r300_setup_texture_state(tex, r300_screen(screen)->caps->is_r500);
 
     pipe_buffer_reference(&tex->buffer, buffer);
 
     return (struct pipe_texture*)tex;
 }
 
+static struct pipe_video_surface *
+r300_video_surface_create(struct pipe_screen *screen,
+                          enum pipe_video_chroma_format chroma_format,
+                          unsigned width, unsigned height)
+{
+    struct r300_video_surface *r300_vsfc;
+    struct pipe_texture template;
+    /* Creates a video surface backed by a 2D texture; returns NULL on failure. */
+    assert(screen);
+    assert(width && height);
+
+    r300_vsfc = CALLOC_STRUCT(r300_video_surface);
+    if (!r300_vsfc)
+       return NULL;
+    /* The base object records the caller's width/height and chroma format as given. */
+    pipe_reference_init(&r300_vsfc->base.reference, 1);
+    r300_vsfc->base.screen = screen;
+    r300_vsfc->base.chroma_format = chroma_format;
+    r300_vsfc->base.width = width;
+    r300_vsfc->base.height = height;
+    /* Backing texture: one-level X8R8G8B8 2D, sized via util_next_power_of_two(). */
+    memset(&template, 0, sizeof(struct pipe_texture));
+    template.target = PIPE_TEXTURE_2D;
+    template.format = PIPE_FORMAT_X8R8G8B8_UNORM;
+    template.last_level = 0;
+    template.width[0] = util_next_power_of_two(width);
+    template.height[0] = util_next_power_of_two(height);
+    template.depth[0] = 1;
+    pf_get_block(template.format, &template.block);
+    template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER |
+                         PIPE_TEXTURE_USAGE_RENDER_TARGET;
+    /* Allocate through the screen's texture_create hook; free the wrapper if it fails. */
+    r300_vsfc->tex = screen->texture_create(screen, &template);
+    if (!r300_vsfc->tex)
+    {
+        FREE(r300_vsfc);
+        return NULL;
+    }
+
+    return &r300_vsfc->base;
+}
+
+static void r300_video_surface_destroy(struct pipe_video_surface *vsfc)
+{
+    struct r300_video_surface *r300_vsfc = r300_video_surface(vsfc);
+    pipe_texture_reference(&r300_vsfc->tex, NULL); /* presumably drops our texture reference -- confirm */
+    FREE(r300_vsfc); /* then free the wrapper itself */
+}
+
 void r300_init_screen_texture_functions(struct pipe_screen* screen)
 {
     screen->texture_create = r300_texture_create;
@@ -207,6 +318,9 @@ void r300_init_screen_texture_functions(struct pipe_screen* screen)
     screen->get_tex_surface = r300_get_tex_surface;
     screen->tex_surface_destroy = r300_tex_surface_destroy;
     screen->texture_blanket = r300_texture_blanket;
+
+    screen->video_surface_create = r300_video_surface_create;
+    screen->video_surface_destroy= r300_video_surface_destroy;
 }
 
 boolean r300_get_texture_buffer(struct pipe_texture* texture,
@@ -221,7 +335,7 @@ boolean r300_get_texture_buffer(struct pipe_texture* texture,
     pipe_buffer_reference(buffer, tex->buffer);
 
     if (stride) {
-        *stride = tex->stride;
+        *stride = r300_texture_get_stride(tex, 0);
     }
 
     return TRUE;
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index 3b56f0307c0..55ceb1a5136 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -23,15 +23,19 @@
 #ifndef R300_TEXTURE_H
 #define R300_TEXTURE_H
 
-#include "pipe/p_screen.h"
+#include "pipe/p_video_state.h"
 
-#include "util/u_math.h"
-
-#include "r300_context.h"
 #include "r300_reg.h"
 
+struct r300_texture;
+
 void r300_init_screen_texture_functions(struct pipe_screen* screen);
 
+unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level);
+
+unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
+                                 unsigned zslice, unsigned face);
+
 /* Note the signature of R300_EASY_TX_FORMAT(A, R, G, B, FORMAT)... */
 static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
 {
@@ -39,11 +43,28 @@ static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
         /* X8 */
         case PIPE_FORMAT_I8_UNORM:
             return R300_EASY_TX_FORMAT(X, X, X, X, X8);
+        case PIPE_FORMAT_L8_UNORM:
+            return R300_EASY_TX_FORMAT(X, X, X, ONE, X8);
+        /* X16 */
+        case PIPE_FORMAT_R16_UNORM:
+            return R300_EASY_TX_FORMAT(X, X, X, X, X16);
+        case PIPE_FORMAT_R16_SNORM:
+            return R300_EASY_TX_FORMAT(X, X, X, X, X16) |
+                R300_TX_FORMAT_SIGNED;
+        case PIPE_FORMAT_Z16_UNORM:
+            return R300_EASY_TX_FORMAT(X, X, X, X, X16);
+        /* Y8X8 */
+        case PIPE_FORMAT_A8L8_UNORM:
+            return R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8);
         /* W8Z8Y8X8 */
         case PIPE_FORMAT_A8R8G8B8_UNORM:
             return R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
         case PIPE_FORMAT_R8G8B8A8_UNORM:
             return R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8);
+        case PIPE_FORMAT_X8R8G8B8_UNORM:
+            return R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+        case PIPE_FORMAT_R8G8B8X8_UNORM:
+            return R300_EASY_TX_FORMAT(Y, Z, ONE, X, W8Z8Y8X8);
         case PIPE_FORMAT_A8R8G8B8_SRGB:
             return R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8) |
                 R300_TX_FORMAT_GAMMA;
@@ -67,7 +88,9 @@ static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
                 R300_TX_FORMAT_YUV_TO_RGB;
         /* W24_FP */
         case PIPE_FORMAT_Z24S8_UNORM:
+        case PIPE_FORMAT_Z24X8_UNORM:
             return R300_EASY_TX_FORMAT(X, X, X, X, W24_FP);
+
         default:
             debug_printf("r300: Implementation error: "
                 "Got unsupported texture format %s in %s\n",
@@ -78,6 +101,18 @@ static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
     return 0;
 }
 
+/* base must stay the first member: r300_video_surface() casts to this type. */
+struct r300_video_surface {
+    struct pipe_video_surface   base;
+    struct pipe_texture         *tex;
+};
+
+static INLINE struct r300_video_surface *
+r300_video_surface(struct pipe_video_surface *pvs)
+{
+    return (struct r300_video_surface *)pvs;
+}
+
 #ifndef R300_WINSYS_H
 
 boolean r300_get_texture_buffer(struct pipe_texture* texture,
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index d68a1041063..589f1984ee3 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -33,151 +33,145 @@
 static unsigned translate_opcode(unsigned opcode)
 {
     switch(opcode) {
-        case TGSI_OPCODE_ARL: return OPCODE_ARL;
-        case TGSI_OPCODE_MOV: return OPCODE_MOV;
-        case TGSI_OPCODE_LIT: return OPCODE_LIT;
-        case TGSI_OPCODE_RCP: return OPCODE_RCP;
-        case TGSI_OPCODE_RSQ: return OPCODE_RSQ;
-        case TGSI_OPCODE_EXP: return OPCODE_EXP;
-        case TGSI_OPCODE_LOG: return OPCODE_LOG;
-        case TGSI_OPCODE_MUL: return OPCODE_MUL;
-        case TGSI_OPCODE_ADD: return OPCODE_ADD;
-        case TGSI_OPCODE_DP3: return OPCODE_DP3;
-        case TGSI_OPCODE_DP4: return OPCODE_DP4;
-        case TGSI_OPCODE_DST: return OPCODE_DST;
-        case TGSI_OPCODE_MIN: return OPCODE_MIN;
-        case TGSI_OPCODE_MAX: return OPCODE_MAX;
-        case TGSI_OPCODE_SLT: return OPCODE_SLT;
-        case TGSI_OPCODE_SGE: return OPCODE_SGE;
-        case TGSI_OPCODE_MAD: return OPCODE_MAD;
-        case TGSI_OPCODE_SUB: return OPCODE_SUB;
-        case TGSI_OPCODE_LRP: return OPCODE_LRP;
-     /* case TGSI_OPCODE_CND: return OPCODE_CND; */
-        case TGSI_OPCODE_DP2A: return OPCODE_DP2A;
+        case TGSI_OPCODE_ARL: return RC_OPCODE_ARL;
+        case TGSI_OPCODE_MOV: return RC_OPCODE_MOV;
+        case TGSI_OPCODE_LIT: return RC_OPCODE_LIT;
+        case TGSI_OPCODE_RCP: return RC_OPCODE_RCP;
+        case TGSI_OPCODE_RSQ: return RC_OPCODE_RSQ;
+        case TGSI_OPCODE_EXP: return RC_OPCODE_EXP;
+        case TGSI_OPCODE_LOG: return RC_OPCODE_LOG;
+        case TGSI_OPCODE_MUL: return RC_OPCODE_MUL;
+        case TGSI_OPCODE_ADD: return RC_OPCODE_ADD;
+        case TGSI_OPCODE_DP3: return RC_OPCODE_DP3;
+        case TGSI_OPCODE_DP4: return RC_OPCODE_DP4;
+        case TGSI_OPCODE_DST: return RC_OPCODE_DST;
+        case TGSI_OPCODE_MIN: return RC_OPCODE_MIN;
+        case TGSI_OPCODE_MAX: return RC_OPCODE_MAX;
+        case TGSI_OPCODE_SLT: return RC_OPCODE_SLT;
+        case TGSI_OPCODE_SGE: return RC_OPCODE_SGE;
+        case TGSI_OPCODE_MAD: return RC_OPCODE_MAD;
+        case TGSI_OPCODE_SUB: return RC_OPCODE_SUB;
+        case TGSI_OPCODE_LRP: return RC_OPCODE_LRP;
+     /* case TGSI_OPCODE_CND: return RC_OPCODE_CND; */
+     /* case TGSI_OPCODE_CND0: return RC_OPCODE_CND0; */
+     /* case TGSI_OPCODE_DP2A: return RC_OPCODE_DP2A; */
                                         /* gap */
-        case TGSI_OPCODE_FRC: return OPCODE_FRC;
-     /* case TGSI_OPCODE_CLAMP: return OPCODE_CLAMP; */
-        case TGSI_OPCODE_FLR: return OPCODE_FLR;
-     /* case TGSI_OPCODE_ROUND: return OPCODE_ROUND; */
-        case TGSI_OPCODE_EX2: return OPCODE_EX2;
-        case TGSI_OPCODE_LG2: return OPCODE_LG2;
-        case TGSI_OPCODE_POW: return OPCODE_POW;
-        case TGSI_OPCODE_XPD: return OPCODE_XPD;
+        case TGSI_OPCODE_FRC: return RC_OPCODE_FRC;
+     /* case TGSI_OPCODE_CLAMP: return RC_OPCODE_CLAMP; */
+        case TGSI_OPCODE_FLR: return RC_OPCODE_FLR;
+     /* case TGSI_OPCODE_ROUND: return RC_OPCODE_ROUND; */
+        case TGSI_OPCODE_EX2: return RC_OPCODE_EX2;
+        case TGSI_OPCODE_LG2: return RC_OPCODE_LG2;
+        case TGSI_OPCODE_POW: return RC_OPCODE_POW;
+        case TGSI_OPCODE_XPD: return RC_OPCODE_XPD;
                                         /* gap */
-        case TGSI_OPCODE_ABS: return OPCODE_ABS;
-        case TGSI_OPCODE_RCC: return OPCODE_RCC;
-        case TGSI_OPCODE_DPH: return OPCODE_DPH;
-        case TGSI_OPCODE_COS: return OPCODE_COS;
-        case TGSI_OPCODE_DDX: return OPCODE_DDX;
-        case TGSI_OPCODE_DDY: return OPCODE_DDY;
-     /* case TGSI_OPCODE_KILP: return OPCODE_KILP; */
-        case TGSI_OPCODE_PK2H: return OPCODE_PK2H;
-        case TGSI_OPCODE_PK2US: return OPCODE_PK2US;
-        case TGSI_OPCODE_PK4B: return OPCODE_PK4B;
-        case TGSI_OPCODE_PK4UB: return OPCODE_PK4UB;
-        case TGSI_OPCODE_RFL: return OPCODE_RFL;
-        case TGSI_OPCODE_SEQ: return OPCODE_SEQ;
-        case TGSI_OPCODE_SFL: return OPCODE_SFL;
-        case TGSI_OPCODE_SGT: return OPCODE_SGT;
-        case TGSI_OPCODE_SIN: return OPCODE_SIN;
-        case TGSI_OPCODE_SLE: return OPCODE_SLE;
-        case TGSI_OPCODE_SNE: return OPCODE_SNE;
-        case TGSI_OPCODE_STR: return OPCODE_STR;
-        case TGSI_OPCODE_TEX: return OPCODE_TEX;
-        case TGSI_OPCODE_TXD: return OPCODE_TXD;
-        case TGSI_OPCODE_TXP: return OPCODE_TXP;
-        case TGSI_OPCODE_UP2H: return OPCODE_UP2H;
-        case TGSI_OPCODE_UP2US: return OPCODE_UP2US;
-        case TGSI_OPCODE_UP4B: return OPCODE_UP4B;
-        case TGSI_OPCODE_UP4UB: return OPCODE_UP4UB;
-        case TGSI_OPCODE_X2D: return OPCODE_X2D;
-        case TGSI_OPCODE_ARA: return OPCODE_ARA;
-        case TGSI_OPCODE_ARR: return OPCODE_ARR;
-        case TGSI_OPCODE_BRA: return OPCODE_BRA;
-        case TGSI_OPCODE_CAL: return OPCODE_CAL;
-        case TGSI_OPCODE_RET: return OPCODE_RET;
-        case TGSI_OPCODE_SSG: return OPCODE_SSG;
-        case TGSI_OPCODE_CMP: return OPCODE_CMP;
-        case TGSI_OPCODE_SCS: return OPCODE_SCS;
-        case TGSI_OPCODE_TXB: return OPCODE_TXB;
-     /* case TGSI_OPCODE_NRM: return OPCODE_NRM; */
-     /* case TGSI_OPCODE_DIV: return OPCODE_DIV; */
-        case TGSI_OPCODE_DP2: return OPCODE_DP2;
-        case TGSI_OPCODE_TXL: return OPCODE_TXL;
-        case TGSI_OPCODE_BRK: return OPCODE_BRK;
-        case TGSI_OPCODE_IF: return OPCODE_IF;
-     /* case TGSI_OPCODE_LOOP: return OPCODE_LOOP; */
-     /* case TGSI_OPCODE_REP: return OPCODE_REP; */
-        case TGSI_OPCODE_ELSE: return OPCODE_ELSE;
-        case TGSI_OPCODE_ENDIF: return OPCODE_ENDIF;
-        case TGSI_OPCODE_ENDLOOP: return OPCODE_ENDLOOP;
-     /* case TGSI_OPCODE_ENDREP: return OPCODE_ENDREP; */
-        case TGSI_OPCODE_PUSHA: return OPCODE_PUSHA;
-        case TGSI_OPCODE_POPA: return OPCODE_POPA;
-     /* case TGSI_OPCODE_CEIL: return OPCODE_CEIL; */
-     /* case TGSI_OPCODE_I2F: return OPCODE_I2F; */
-        case TGSI_OPCODE_NOT: return OPCODE_NOT;
-        case TGSI_OPCODE_TRUNC: return OPCODE_TRUNC;
-     /* case TGSI_OPCODE_SHL: return OPCODE_SHL; */
-     /* case TGSI_OPCODE_SHR: return OPCODE_SHR; */
-        case TGSI_OPCODE_AND: return OPCODE_AND;
-        case TGSI_OPCODE_OR: return OPCODE_OR;
-     /* case TGSI_OPCODE_MOD: return OPCODE_MOD; */
-        case TGSI_OPCODE_XOR: return OPCODE_XOR;
-     /* case TGSI_OPCODE_SAD: return OPCODE_SAD; */
-     /* case TGSI_OPCODE_TXF: return OPCODE_TXF; */
-     /* case TGSI_OPCODE_TXQ: return OPCODE_TXQ; */
-        case TGSI_OPCODE_CONT: return OPCODE_CONT;
-     /* case TGSI_OPCODE_EMIT: return OPCODE_EMIT; */
-     /* case TGSI_OPCODE_ENDPRIM: return OPCODE_ENDPRIM; */
-     /* case TGSI_OPCODE_BGNLOOP2: return OPCODE_BGNLOOP2; */
-        case TGSI_OPCODE_BGNSUB: return OPCODE_BGNSUB;
-     /* case TGSI_OPCODE_ENDLOOP2: return OPCODE_ENDLOOP2; */
-        case TGSI_OPCODE_ENDSUB: return OPCODE_ENDSUB;
-        case TGSI_OPCODE_NOISE1: return OPCODE_NOISE1;
-        case TGSI_OPCODE_NOISE2: return OPCODE_NOISE2;
-        case TGSI_OPCODE_NOISE3: return OPCODE_NOISE3;
-        case TGSI_OPCODE_NOISE4: return OPCODE_NOISE4;
-        case TGSI_OPCODE_NOP: return OPCODE_NOP;
+        case TGSI_OPCODE_ABS: return RC_OPCODE_ABS;
+     /* case TGSI_OPCODE_RCC: return RC_OPCODE_RCC; */
+        case TGSI_OPCODE_DPH: return RC_OPCODE_DPH;
+        case TGSI_OPCODE_COS: return RC_OPCODE_COS;
+        case TGSI_OPCODE_DDX: return RC_OPCODE_DDX;
+        case TGSI_OPCODE_DDY: return RC_OPCODE_DDY;
+     /* case TGSI_OPCODE_KILP: return RC_OPCODE_KILP; */
+     /* case TGSI_OPCODE_PK2H: return RC_OPCODE_PK2H; */
+     /* case TGSI_OPCODE_PK2US: return RC_OPCODE_PK2US; */
+     /* case TGSI_OPCODE_PK4B: return RC_OPCODE_PK4B; */
+     /* case TGSI_OPCODE_PK4UB: return RC_OPCODE_PK4UB; */
+     /* case TGSI_OPCODE_RFL: return RC_OPCODE_RFL; */
+        case TGSI_OPCODE_SEQ: return RC_OPCODE_SEQ;
+        case TGSI_OPCODE_SFL: return RC_OPCODE_SFL;
+        case TGSI_OPCODE_SGT: return RC_OPCODE_SGT;
+        case TGSI_OPCODE_SIN: return RC_OPCODE_SIN;
+        case TGSI_OPCODE_SLE: return RC_OPCODE_SLE;
+        case TGSI_OPCODE_SNE: return RC_OPCODE_SNE;
+     /* case TGSI_OPCODE_STR: return RC_OPCODE_STR; */
+        case TGSI_OPCODE_TEX: return RC_OPCODE_TEX;
+        case TGSI_OPCODE_TXD: return RC_OPCODE_TXD;
+        case TGSI_OPCODE_TXP: return RC_OPCODE_TXP;
+     /* case TGSI_OPCODE_UP2H: return RC_OPCODE_UP2H; */
+     /* case TGSI_OPCODE_UP2US: return RC_OPCODE_UP2US; */
+     /* case TGSI_OPCODE_UP4B: return RC_OPCODE_UP4B; */
+     /* case TGSI_OPCODE_UP4UB: return RC_OPCODE_UP4UB; */
+     /* case TGSI_OPCODE_X2D: return RC_OPCODE_X2D; */
+     /* case TGSI_OPCODE_ARA: return RC_OPCODE_ARA; */
+     /* case TGSI_OPCODE_ARR: return RC_OPCODE_ARR; */
+     /* case TGSI_OPCODE_BRA: return RC_OPCODE_BRA; */
+     /* case TGSI_OPCODE_CAL: return RC_OPCODE_CAL; */
+     /* case TGSI_OPCODE_RET: return RC_OPCODE_RET; */
+     /* case TGSI_OPCODE_SSG: return RC_OPCODE_SSG; */
+        case TGSI_OPCODE_CMP: return RC_OPCODE_CMP;
+        case TGSI_OPCODE_SCS: return RC_OPCODE_SCS;
+        case TGSI_OPCODE_TXB: return RC_OPCODE_TXB;
+     /* case TGSI_OPCODE_NRM: return RC_OPCODE_NRM; */
+     /* case TGSI_OPCODE_DIV: return RC_OPCODE_DIV; */
+     /* case TGSI_OPCODE_DP2: return RC_OPCODE_DP2; */
+        case TGSI_OPCODE_TXL: return RC_OPCODE_TXL;
+     /* case TGSI_OPCODE_BRK: return RC_OPCODE_BRK; */
+        case TGSI_OPCODE_IF: return RC_OPCODE_IF;
+     /* case TGSI_OPCODE_LOOP: return RC_OPCODE_LOOP; */
+     /* case TGSI_OPCODE_REP: return RC_OPCODE_REP; */
+        case TGSI_OPCODE_ELSE: return RC_OPCODE_ELSE;
+        case TGSI_OPCODE_ENDIF: return RC_OPCODE_ENDIF;
+     /* case TGSI_OPCODE_ENDLOOP: return RC_OPCODE_ENDLOOP; */
+     /* case TGSI_OPCODE_ENDREP: return RC_OPCODE_ENDREP; */
+     /* case TGSI_OPCODE_PUSHA: return RC_OPCODE_PUSHA; */
+     /* case TGSI_OPCODE_POPA: return RC_OPCODE_POPA; */
+     /* case TGSI_OPCODE_CEIL: return RC_OPCODE_CEIL; */
+     /* case TGSI_OPCODE_I2F: return RC_OPCODE_I2F; */
+     /* case TGSI_OPCODE_NOT: return RC_OPCODE_NOT; */
+     /* case TGSI_OPCODE_TRUNC: return RC_OPCODE_TRUNC; */
+     /* case TGSI_OPCODE_SHL: return RC_OPCODE_SHL; */
+     /* case TGSI_OPCODE_SHR: return RC_OPCODE_SHR; */
+     /* case TGSI_OPCODE_AND: return RC_OPCODE_AND; */
+     /* case TGSI_OPCODE_OR: return RC_OPCODE_OR; */
+     /* case TGSI_OPCODE_MOD: return RC_OPCODE_MOD; */
+     /* case TGSI_OPCODE_XOR: return RC_OPCODE_XOR; */
+     /* case TGSI_OPCODE_SAD: return RC_OPCODE_SAD; */
+     /* case TGSI_OPCODE_TXF: return RC_OPCODE_TXF; */
+     /* case TGSI_OPCODE_TXQ: return RC_OPCODE_TXQ; */
+     /* case TGSI_OPCODE_CONT: return RC_OPCODE_CONT; */
+     /* case TGSI_OPCODE_EMIT: return RC_OPCODE_EMIT; */
+     /* case TGSI_OPCODE_ENDPRIM: return RC_OPCODE_ENDPRIM; */
+     /* case TGSI_OPCODE_BGNLOOP2: return RC_OPCODE_BGNLOOP2; */
+     /* case TGSI_OPCODE_BGNSUB: return RC_OPCODE_BGNSUB; */
+     /* case TGSI_OPCODE_ENDLOOP2: return RC_OPCODE_ENDLOOP2; */
+     /* case TGSI_OPCODE_ENDSUB: return RC_OPCODE_ENDSUB; */
+        case TGSI_OPCODE_NOP: return RC_OPCODE_NOP;
                                         /* gap */
-        case TGSI_OPCODE_NRM4: return OPCODE_NRM4;
-     /* case TGSI_OPCODE_CALLNZ: return OPCODE_CALLNZ; */
-     /* case TGSI_OPCODE_IFC: return OPCODE_IFC; */
-     /* case TGSI_OPCODE_BREAKC: return OPCODE_BREAKC; */
-        case TGSI_OPCODE_KIL: return OPCODE_KIL;
-        case TGSI_OPCODE_END: return OPCODE_END;
-        case TGSI_OPCODE_SWZ: return OPCODE_SWZ;
+     /* case TGSI_OPCODE_NRM4: return RC_OPCODE_NRM4; */
+     /* case TGSI_OPCODE_CALLNZ: return RC_OPCODE_CALLNZ; */
+     /* case TGSI_OPCODE_IFC: return RC_OPCODE_IFC; */
+     /* case TGSI_OPCODE_BREAKC: return RC_OPCODE_BREAKC; */
+        case TGSI_OPCODE_KIL: return RC_OPCODE_KIL;
     }
 
     fprintf(stderr, "Unknown opcode: %i\n", opcode);
-    abort();
+    return RC_OPCODE_ILLEGAL_OPCODE;
 }
 
 static unsigned translate_saturate(unsigned saturate)
 {
     switch(saturate) {
-        case TGSI_SAT_NONE: return SATURATE_OFF;
-        case TGSI_SAT_ZERO_ONE: return SATURATE_ZERO_ONE;
-        case TGSI_SAT_MINUS_PLUS_ONE: return SATURATE_PLUS_MINUS_ONE;
+        default:
+            fprintf(stderr, "Unknown saturate mode: %i\n", saturate);
+            /* fall-through */
+        case TGSI_SAT_NONE: return RC_SATURATE_NONE;
+        case TGSI_SAT_ZERO_ONE: return RC_SATURATE_ZERO_ONE;
     }
-
-    fprintf(stderr, "Unknown saturate mode: %i\n", saturate);
-    abort();
 }
 
 static unsigned translate_register_file(unsigned file)
 {
     switch(file) {
-        case TGSI_FILE_CONSTANT: return PROGRAM_CONSTANT;
-        case TGSI_FILE_IMMEDIATE: return PROGRAM_CONSTANT;
-        case TGSI_FILE_INPUT: return PROGRAM_INPUT;
-        case TGSI_FILE_OUTPUT: return PROGRAM_OUTPUT;
-        case TGSI_FILE_TEMPORARY: return PROGRAM_TEMPORARY;
-        case TGSI_FILE_ADDRESS: return PROGRAM_ADDRESS;
+        case TGSI_FILE_CONSTANT: return RC_FILE_CONSTANT;
+        case TGSI_FILE_IMMEDIATE: return RC_FILE_CONSTANT;
+        case TGSI_FILE_INPUT: return RC_FILE_INPUT;
+        case TGSI_FILE_OUTPUT: return RC_FILE_OUTPUT;
+        default:
+            fprintf(stderr, "Unhandled register file: %i\n", file);
+            /* fall-through */
+        case TGSI_FILE_TEMPORARY: return RC_FILE_TEMPORARY;
+        case TGSI_FILE_ADDRESS: return RC_FILE_ADDRESS;
     }
-
-    fprintf(stderr, "Unhandled register file: %i\n", file);
-    abort();
 }
 
 static int translate_register_index(
@@ -193,7 +187,7 @@ static int translate_register_index(
 
 static void transform_dstreg(
     struct tgsi_to_rc * ttr,
-    struct prog_dst_register * dst,
+    struct rc_dst_register * dst,
     struct tgsi_full_dst_register * src)
 {
     dst->File = translate_register_file(src->DstRegister.File);
@@ -204,77 +198,73 @@ static void transform_dstreg(
 
 static void transform_srcreg(
     struct tgsi_to_rc * ttr,
-    struct prog_src_register * dst,
+    struct rc_src_register * dst,
     struct tgsi_full_src_register * src)
 {
     dst->File = translate_register_file(src->SrcRegister.File);
     dst->Index = translate_register_index(ttr, src->SrcRegister.File, src->SrcRegister.Index);
     dst->RelAddr = src->SrcRegister.Indirect;
-    dst->Swizzle = tgsi_util_get_full_src_register_extswizzle(src, 0);
-    dst->Swizzle |= tgsi_util_get_full_src_register_extswizzle(src, 1) << 3;
-    dst->Swizzle |= tgsi_util_get_full_src_register_extswizzle(src, 2) << 6;
-    dst->Swizzle |= tgsi_util_get_full_src_register_extswizzle(src, 3) << 9;
+    dst->Swizzle = tgsi_util_get_full_src_register_swizzle(src, 0);
+    dst->Swizzle |= tgsi_util_get_full_src_register_swizzle(src, 1) << 3;
+    dst->Swizzle |= tgsi_util_get_full_src_register_swizzle(src, 2) << 6;
+    dst->Swizzle |= tgsi_util_get_full_src_register_swizzle(src, 3) << 9;
     dst->Abs = src->SrcRegisterExtMod.Absolute;
-    dst->Negate =
-        src->SrcRegisterExtSwz.NegateX |
-        (src->SrcRegisterExtSwz.NegateY << 1) |
-        (src->SrcRegisterExtSwz.NegateZ << 2) |
-        (src->SrcRegisterExtSwz.NegateW << 3);
-    dst->Negate ^= src->SrcRegister.Negate ? NEGATE_XYZW : 0;
+    dst->Negate = src->SrcRegister.Negate ? RC_MASK_XYZW : 0;
 }
 
 static void transform_texture(struct rc_instruction * dst, struct tgsi_instruction_ext_texture src)
 {
     switch(src.Texture) {
         case TGSI_TEXTURE_1D:
-            dst->I.TexSrcTarget = TEXTURE_1D_INDEX;
+            dst->U.I.TexSrcTarget = RC_TEXTURE_1D;
             break;
         case TGSI_TEXTURE_2D:
-            dst->I.TexSrcTarget = TEXTURE_2D_INDEX;
+            dst->U.I.TexSrcTarget = RC_TEXTURE_2D;
             break;
         case TGSI_TEXTURE_3D:
-            dst->I.TexSrcTarget = TEXTURE_3D_INDEX;
+            dst->U.I.TexSrcTarget = RC_TEXTURE_3D;
             break;
         case TGSI_TEXTURE_CUBE:
-            dst->I.TexSrcTarget = TEXTURE_CUBE_INDEX;
+            dst->U.I.TexSrcTarget = RC_TEXTURE_CUBE;
             break;
         case TGSI_TEXTURE_RECT:
-            dst->I.TexSrcTarget = TEXTURE_RECT_INDEX;
+            dst->U.I.TexSrcTarget = RC_TEXTURE_RECT;
             break;
         case TGSI_TEXTURE_SHADOW1D:
-            dst->I.TexSrcTarget = TEXTURE_1D_INDEX;
-            dst->I.TexShadow = 1;
+            dst->U.I.TexSrcTarget = RC_TEXTURE_1D;
+            dst->U.I.TexShadow = 1;
             break;
         case TGSI_TEXTURE_SHADOW2D:
-            dst->I.TexSrcTarget = TEXTURE_2D_INDEX;
-            dst->I.TexShadow = 1;
+            dst->U.I.TexSrcTarget = RC_TEXTURE_2D;
+            dst->U.I.TexShadow = 1;
             break;
         case TGSI_TEXTURE_SHADOWRECT:
-            dst->I.TexSrcTarget = TEXTURE_RECT_INDEX;
-            dst->I.TexShadow = 1;
+            dst->U.I.TexSrcTarget = RC_TEXTURE_RECT;
+            dst->U.I.TexShadow = 1;
             break;
     }
 }
 
 static void transform_instruction(struct tgsi_to_rc * ttr, struct tgsi_full_instruction * src)
 {
+    struct rc_instruction * dst;
+    int i;
+
     if (src->Instruction.Opcode == TGSI_OPCODE_END)
         return;
 
-    struct rc_instruction * dst = rc_insert_new_instruction(ttr->compiler, ttr->compiler->Program.Instructions.Prev);
-    int i;
-
-    dst->I.Opcode = translate_opcode(src->Instruction.Opcode);
-    dst->I.SaturateMode = translate_saturate(src->Instruction.Saturate);
+    dst = rc_insert_new_instruction(ttr->compiler, ttr->compiler->Program.Instructions.Prev);
+    dst->U.I.Opcode = translate_opcode(src->Instruction.Opcode);
+    dst->U.I.SaturateMode = translate_saturate(src->Instruction.Saturate);
 
     if (src->Instruction.NumDstRegs)
-        transform_dstreg(ttr, &dst->I.DstReg, &src->FullDstRegisters[0]);
+        transform_dstreg(ttr, &dst->U.I.DstReg, &src->FullDstRegisters[0]);
 
     for(i = 0; i < src->Instruction.NumSrcRegs; ++i) {
         if (src->FullSrcRegisters[i].SrcRegister.File == TGSI_FILE_SAMPLER)
-            dst->I.TexSrcUnit = src->FullSrcRegisters[i].SrcRegister.Index;
+            dst->U.I.TexSrcUnit = src->FullSrcRegisters[i].SrcRegister.Index;
         else
-            transform_srcreg(ttr, &dst->I.SrcReg[i], &src->FullSrcRegisters[i]);
+            transform_srcreg(ttr, &dst->U.I.SrcReg[i], &src->FullSrcRegisters[i]);
     }
 
     /* Texturing. */
diff --git a/src/gallium/drivers/r300/r300_vbo.c b/src/gallium/drivers/r300/r300_vbo.c
new file mode 100644
index 00000000000..a6a159667a3
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_vbo.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* r300_vbo: Various helpers for emitting vertex buffers. Needs cleanup,
+ * refactoring, etc. */
+
+#include "r300_vbo.h"
+
+#include "pipe/p_format.h"
+
+#include "r300_cs.h"
+#include "r300_context.h"
+#include "r300_state_inlines.h"
+#include "r300_reg.h"
+#include "r300_winsys.h"
+
+/* Pack attribute attr_num's stream-control words into vinfo, two per 32-bit
+ * word: even ones overwrite the word, odd ones are OR'ed into bits 16-31. */
+static INLINE void setup_vertex_attribute(struct r300_vertex_info *vinfo,
+                                          struct pipe_vertex_element *vert_elem,
+                                          unsigned attr_num)
+{
+    /* uint32_t, not uint16_t: a promoted uint16_t << 16 can overflow int (UB) */
+    const uint32_t hw_fmt1 = 0xffff &
+        (r300_translate_vertex_data_type(vert_elem->src_format) |
+         (attr_num << R300_DST_VEC_LOC_SHIFT));
+    const uint32_t hw_fmt2 = 0xffff &
+        r300_translate_vertex_data_swizzle(vert_elem->src_format);
+
+    if (attr_num % 2 == 0) {
+        vinfo->vap_prog_stream_cntl[attr_num >> 1] = hw_fmt1;
+        vinfo->vap_prog_stream_cntl_ext[attr_num >> 1] = hw_fmt2;
+    } else {
+        vinfo->vap_prog_stream_cntl[attr_num >> 1] |= hw_fmt1 << 16;
+        vinfo->vap_prog_stream_cntl_ext[attr_num >> 1] |= hw_fmt2 << 16;
+    }
+}
+
+/* Set R300_LAST_VEC on the last of attribs_num (1..16) attributes. */
+static void finish_vertex_attribs_setup(struct r300_vertex_info *vinfo,
+                                        unsigned attribs_num)
+{
+    assert(attribs_num > 0 && attribs_num <= 16);
+    if (attribs_num - 1 < 16) /* NDEBUG: ignore bad counts; 0 - 1 wraps high */
+        vinfo->vap_prog_stream_cntl[(attribs_num - 1) >> 1] |=
+            (attribs_num % 2 == 0) ? (R300_LAST_VEC << 16) : R300_LAST_VEC;
+}
+
+/* Fill in the stream-control words for each of r300's vertex elements in
+ * order, then let finish_vertex_attribs_setup() flag the final one. */
+void setup_vertex_attributes(struct r300_context *r300)
+{
+    int attr = 0;
+
+    while (attr < r300->vertex_element_count) {
+        setup_vertex_attribute(r300->vertex_info,
+                               &r300->vertex_element[attr], attr);
+        attr++;
+    }
+    finish_vertex_attribs_setup(r300->vertex_info, r300->vertex_element_count);
+}
+
+/* Sum of vertex buffer buf_nr's buffer_offset and the given elem_offset. */
+static INLINE int get_buffer_offset(struct r300_context *r300,
+                                    unsigned buf_nr, unsigned elem_offset)
+{
+    return elem_offset + r300->vertex_buffer[buf_nr].buffer_offset;
+}
+#if 0
+/* XXX not called at all */
+static void setup_vertex_buffers(struct r300_context *r300)
+{
+    struct pipe_vertex_element *vert_elem;
+    int i;
+
+    for (i = 0; i < r300->aos_count; i++)
+    {
+        vert_elem = &r300->vertex_element[i];
+            /* XXX use translate module to convert the data */
+        if (!format_is_supported(vert_elem->src_format,
+                                 vert_elem->nr_components)) {
+            assert(0);
+            /*
+            struct pipe_buffer *buf;
+            const unsigned int max_index = r300->vertex_buffers[vert_elem->vertex_buffer_index].max_index;
+            buf = pipe_buffer_create(r300->context.screen, 4, usage, vert_elem->nr_components * max_index * sizeof(float));
+            */
+        }
+
+        if (get_buffer_offset(r300,
+                              vert_elem->vertex_buffer_index,
+                              vert_elem->src_offset) % 4) {
+            /* XXX need to align buffer */
+            assert(0);
+        }
+    }
+}
+#endif
+/* XXX these shouldn't be asserts since we can work around bad indexbufs */
+/* Passes indexBuffer to the winsys add_buffer hook (GTT domain), then runs
+ * the validate hook; either one failing asserts. indexSize is unused. */
+void setup_index_buffer(struct r300_context *r300,
+                        struct pipe_buffer* indexBuffer,
+                        unsigned indexSize)
+{
+    struct r300_winsys *winsys = r300->winsys;
+
+    if (!winsys->add_buffer(winsys, indexBuffer, RADEON_GEM_DOMAIN_GTT, 0))
+        assert(0);
+    if (!winsys->validate(winsys))
+        assert(0);
+}
diff --git a/src/gallium/drivers/r300/r3xx_fs.h b/src/gallium/drivers/r300/r300_vbo.h
index 51cd245724d..7afa75899cf 100644
--- a/src/gallium/drivers/r300/r3xx_fs.h
+++ b/src/gallium/drivers/r300/r300_vbo.h
@@ -1,6 +1,5 @@
 /*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
- *                Joakim Sindholt <opensource@zhasha.com>
+ * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -19,14 +18,19 @@
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
 
-#ifndef R3XX_FS_H
-#define R3XX_FS_H
+#ifndef R300_VBO_H
+#define R300_VBO_H
 
-#include "radeon_code.h"
+struct r300_context;
+struct pipe_buffer;
 
-struct rX00_fragment_program_code r3xx_passthrough_fragment_shader;
-struct rX00_fragment_program_code r3xx_texture_fragment_shader;
+void setup_vertex_attributes(struct r300_context *r300);
 
-#endif /* R3XX_FS_H */
+void setup_index_buffer(struct r300_context *r300,
+                        struct pipe_buffer* indexBuffer,
+                        unsigned indexSize);
+
+#endif
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index 2cb903bba2f..74ef416dc14 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -35,7 +35,9 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
 {
     struct r300_vertex_shader * vs = c->UserData;
     struct tgsi_shader_info* info = &vs->info;
-    boolean pointsize = false;
+    struct tgsi_parse_context parser;
+    struct tgsi_full_declaration * decl;
+    boolean pointsize = FALSE;
     int out_colors = 0;
     int colors = 0;
     int out_generic = 0;
@@ -50,7 +52,7 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
     for (i = 0; i < info->num_outputs; i++) {
         switch (info->output_semantic_name[i]) {
             case TGSI_SEMANTIC_PSIZE:
-                pointsize = true;
+                pointsize = TRUE;
                 break;
             case TGSI_SEMANTIC_COLOR:
                 out_colors++;
@@ -62,8 +64,6 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
         }
     }
 
-    struct tgsi_parse_context parser;
-
     tgsi_parse_init(&parser, vs->state.tokens);
 
     while (!tgsi_parse_end_of_tokens(&parser)) {
@@ -72,7 +72,7 @@ static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
         if (parser.FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION)
             continue;
 
-        struct tgsi_full_declaration * decl = &parser.FullToken.FullDeclaration;
+        decl = &parser.FullToken.FullDeclaration;
 
         if (decl->Declaration.File != TGSI_FILE_OUTPUT)
             continue;
@@ -116,7 +116,7 @@ void r300_translate_vertex_shader(struct r300_context* r300,
     /* Setup the compiler */
     rc_init(&compiler.Base);
 
-    compiler.Base.Debug = 1;
+    compiler.Base.Debug = DBG_ON(r300, DBG_VP);
     compiler.code = &vs->code;
     compiler.UserData = vs;
 
@@ -146,89 +146,3 @@ void r300_translate_vertex_shader(struct r300_context* r300,
     rc_destroy(&compiler.Base);
     vs->translated = TRUE;
 }
-
-
-/* XXX get these to r300_reg */
-#define R300_PVS_DST_OPCODE(x)   ((x) << 0)
-#   define R300_VE_DOT_PRODUCT            1
-#   define R300_VE_MULTIPLY               2
-#   define R300_VE_ADD                    3
-#   define R300_VE_MAXIMUM                7
-#   define R300_VE_SET_LESS_THAN          10
-#define R300_PVS_DST_MATH_INST     (1 << 6)
-#   define R300_ME_RECIP_DX               6
-#define R300_PVS_DST_MACRO_INST    (1 << 7)
-#   define R300_PVS_MACRO_OP_2CLK_MADD    0
-#define R300_PVS_DST_REG_TYPE(x) ((x) << 8)
-#   define R300_PVS_DST_REG_TEMPORARY     0
-#   define R300_PVS_DST_REG_A0            1
-#   define R300_PVS_DST_REG_OUT           2
-#   define R300_PVS_DST_REG_OUT_REPL_X    3
-#   define R300_PVS_DST_REG_ALT_TEMPORARY 4
-#   define R300_PVS_DST_REG_INPUT         5
-#define R300_PVS_DST_OFFSET(x)   ((x) << 13)
-#define R300_PVS_DST_WE(x)       ((x) << 20)
-#define R300_PVS_DST_WE_XYZW     (0xf << 20)
-
-#define R300_PVS_SRC_REG_TYPE(x) ((x) << 0)
-#   define R300_PVS_SRC_REG_TEMPORARY     0
-#   define R300_PVS_SRC_REG_INPUT         1
-#   define R300_PVS_SRC_REG_CONSTANT      2
-#   define R300_PVS_SRC_REG_ALT_TEMPORARY 3
-#define R300_PVS_SRC_OFFSET(x)   ((x) << 5)
-#define R300_PVS_SRC_SWIZZLE(x)  ((x) << 13)
-#   define R300_PVS_SRC_SELECT_X          0
-#   define R300_PVS_SRC_SELECT_Y          1
-#   define R300_PVS_SRC_SELECT_Z          2
-#   define R300_PVS_SRC_SELECT_W          3
-#   define R300_PVS_SRC_SELECT_FORCE_0    4
-#   define R300_PVS_SRC_SELECT_FORCE_1    5
-#   define R300_PVS_SRC_SWIZZLE_XYZW \
-    ((R300_PVS_SRC_SELECT_X | (R300_PVS_SRC_SELECT_Y << 3) | \
-     (R300_PVS_SRC_SELECT_Z << 6) | (R300_PVS_SRC_SELECT_W << 9)) << 13)
-#   define R300_PVS_SRC_SWIZZLE_ZERO \
-    ((R300_PVS_SRC_SELECT_FORCE_0 | (R300_PVS_SRC_SELECT_FORCE_0 << 3) | \
-     (R300_PVS_SRC_SELECT_FORCE_0 << 6) | \
-      (R300_PVS_SRC_SELECT_FORCE_0 << 9)) << 13)
-#   define R300_PVS_SRC_SWIZZLE_ONE \
-    ((R300_PVS_SRC_SELECT_FORCE_1 | (R300_PVS_SRC_SELECT_FORCE_1 << 3) | \
-     (R300_PVS_SRC_SELECT_FORCE_1 << 6) | \
-      (R300_PVS_SRC_SELECT_FORCE_1 << 9)) << 13)
-#define R300_PVS_MODIFIER_X        (1 << 25)
-#define R300_PVS_MODIFIER_Y        (1 << 26)
-#define R300_PVS_MODIFIER_Z        (1 << 27)
-#define R300_PVS_MODIFIER_W        (1 << 28)
-#define R300_PVS_NEGATE_XYZW \
-    (R300_PVS_MODIFIER_X | R300_PVS_MODIFIER_Y | \
-     R300_PVS_MODIFIER_Z | R300_PVS_MODIFIER_W)
-
-struct r300_vertex_program_code r300_passthrough_vertex_shader = {
-    .length = 8, /* two instructions */
-
-    /* MOV out[0], in[0] */
-    .body.d[0] = R300_PVS_DST_OPCODE(R300_VE_ADD) |
-        R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
-        R300_PVS_DST_OFFSET(0) | R300_PVS_DST_WE_XYZW,
-    .body.d[1] = R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-        R300_PVS_SRC_OFFSET(0) | R300_PVS_SRC_SWIZZLE_XYZW,
-    .body.d[2] = R300_PVS_SRC_SWIZZLE_ZERO,
-    .body.d[3] = 0x0,
-
-    /* MOV out[1], in[1] */
-    .body.d[4] = R300_PVS_DST_OPCODE(R300_VE_ADD) |
-        R300_PVS_DST_REG_TYPE(R300_PVS_DST_REG_OUT) |
-        R300_PVS_DST_OFFSET(1) | R300_PVS_DST_WE_XYZW,
-    .body.d[5] = R300_PVS_SRC_REG_TYPE(R300_PVS_SRC_REG_INPUT) |
-        R300_PVS_SRC_OFFSET(1) | R300_PVS_SRC_SWIZZLE_XYZW,
-    .body.d[6] = R300_PVS_SRC_SWIZZLE_ZERO,
-    .body.d[7] = 0x0,
-
-    .inputs[0] = 0,
-    .inputs[1] = 1,
-    .outputs[0] = 0,
-    .outputs[1] = 1,
-
-    .InputsRead = 3,
-    .OutputsWritten = 3
-};
-
diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h
index f18ad75a47d..864a6146b22 100644
--- a/src/gallium/drivers/r300/r300_winsys.h
+++ b/src/gallium/drivers/r300/r300_winsys.h
@@ -48,6 +48,9 @@ struct r300_winsys {
     /* GB pipe count */
     uint32_t gb_pipes;
 
+    /* Z pipe count (rv530 only) */
+    uint32_t z_pipes;
+
     /* GART size. */
     uint32_t gart_size;
 
@@ -92,6 +95,12 @@ struct r300_winsys {
 
     /* Flush the CS. */
     void (*flush_cs)(struct r300_winsys* winsys);
+
+    /* Register a callback that the winsys invokes when a flush is required. */
+    void (*set_flush_cb)(struct r300_winsys *winsys,
+			 void (*flush_cb)(void *), void *data);
+
+    void (*reset_bos)(struct r300_winsys *winsys);
 };
 
 struct pipe_context* r300_create_context(struct pipe_screen* screen,
diff --git a/src/gallium/drivers/r300/r3xx_fs.c b/src/gallium/drivers/r300/r3xx_fs.c
deleted file mode 100644
index c1c1194d58e..00000000000
--- a/src/gallium/drivers/r300/r3xx_fs.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
- *                Joakim Sindholt <opensource@zhasha.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#include "r3xx_fs.h"
-
-#include "r300_reg.h"
-
-struct rX00_fragment_program_code r3xx_passthrough_fragment_shader = {
-    .code.r300.alu.length = 1,
-    .code.r300.tex.length = 0,
-
-    .code.r300.config = 0,
-    .code.r300.pixsize = 0,
-    .code.r300.code_offset = 0,
-    .code.r300.code_addr[3] = R300_RGBA_OUT,
-
-    .code.r300.alu.inst[0].rgb_inst = R300_RGB_SWIZA(R300_ALU_ARGC_SRC0C_XYZ) |
-        R300_RGB_SWIZB(R300_ALU_ARGC_SRC0C_XYZ) |
-        R300_RGB_SWIZC(R300_ALU_ARGC_ZERO) |
-        R300_ALU_OUTC_CMP,
-    .code.r300.alu.inst[0].rgb_addr = R300_RGB_ADDR0(0) | R300_RGB_ADDR1(0) |
-        R300_RGB_ADDR2(0) | R300_ALU_DSTC_OUTPUT_XYZ,
-    .code.r300.alu.inst[0].alpha_inst = R300_ALPHA_SWIZA(R300_ALU_ARGA_SRC0A) |
-        R300_ALPHA_SWIZB(R300_ALU_ARGA_SRC0A) |
-        R300_ALPHA_SWIZC(R300_ALU_ARGA_ZERO) |
-        R300_ALU_OUTA_CMP,
-    .code.r300.alu.inst[0].alpha_addr = R300_ALPHA_ADDR0(0) |
-        R300_ALPHA_ADDR1(0) | R300_ALPHA_ADDR2(0) | R300_ALU_DSTA_OUTPUT,
-};
-
-struct rX00_fragment_program_code r3xx_texture_fragment_shader = {
-    .code.r300.alu.length = 1,
-    .code.r300.tex.length = 1,
-
-    .code.r300.config = R300_PFS_CNTL_FIRST_NODE_HAS_TEX,
-    .code.r300.pixsize = 0,
-    .code.r300.code_offset = 0,
-    .code.r300.code_addr[3] = R300_RGBA_OUT,
-
-    .code.r300.tex.inst[0] = R300_TEX_OP_LD << R300_TEX_INST_SHIFT,
-
-    .code.r300.alu.inst[0].rgb_inst = R300_RGB_SWIZA(R300_ALU_ARGC_SRC0C_XYZ) |
-        R300_RGB_SWIZB(R300_ALU_ARGC_SRC0C_XYZ) |
-        R300_RGB_SWIZC(R300_ALU_ARGC_ZERO) |
-        R300_ALU_OUTC_CMP,
-    .code.r300.alu.inst[0].rgb_addr = R300_RGB_ADDR0(0) | R300_RGB_ADDR1(0) |
-        R300_RGB_ADDR2(0) | R300_ALU_DSTC_OUTPUT_XYZ,
-    .code.r300.alu.inst[0].alpha_inst = R300_ALPHA_SWIZA(R300_ALU_ARGA_SRC0A) |
-        R300_ALPHA_SWIZB(R300_ALU_ARGA_SRC0A) |
-        R300_ALPHA_SWIZC(R300_ALU_ARGA_ZERO) |
-        R300_ALU_OUTA_CMP,
-    .code.r300.alu.inst[0].alpha_addr = R300_ALPHA_ADDR0(0) |
-        R300_ALPHA_ADDR1(0) | R300_ALPHA_ADDR2(0) | R300_ALU_DSTA_OUTPUT,
-};
diff --git a/src/gallium/drivers/r300/r5xx_fs.c b/src/gallium/drivers/r300/r5xx_fs.c
deleted file mode 100644
index f072deab0d9..00000000000
--- a/src/gallium/drivers/r300/r5xx_fs.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
- *                Joakim Sindholt <opensource@zhasha.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#include "r5xx_fs.h"
-
-#include "r300_reg.h"
-
-/* XXX this all should find its way back to r300_reg */
-/* Swizzle tools */
-#define R500_SWIZZLE_ZERO 4
-#define R500_SWIZZLE_HALF 5
-#define R500_SWIZZLE_ONE 6
-#define R500_SWIZ_RGB_ZERO ((4 << 0) | (4 << 3) | (4 << 6))
-#define R500_SWIZ_RGB_ONE ((6 << 0) | (6 << 3) | (6 << 6))
-#define R500_SWIZ_RGB_RGB ((0 << 0) | (1 << 3) | (2 << 6))
-#define R500_SWIZ_MOD_NEG 1
-#define R500_SWIZ_MOD_ABS 2
-#define R500_SWIZ_MOD_NEG_ABS 3
-/* Swizzles for inst2 */
-#define R500_SWIZ_TEX_STRQ(x) ((x) << 8)
-#define R500_SWIZ_TEX_RGBA(x) ((x) << 24)
-/* Swizzles for inst3 */
-#define R500_SWIZ_RGB_A(x) ((x) << 2)
-#define R500_SWIZ_RGB_B(x) ((x) << 15)
-/* Swizzles for inst4 */
-#define R500_SWIZ_ALPHA_A(x) ((x) << 14)
-#define R500_SWIZ_ALPHA_B(x) ((x) << 21)
-/* Swizzle for inst5 */
-#define R500_SWIZ_RGBA_C(x) ((x) << 14)
-#define R500_SWIZ_ALPHA_C(x) ((x) << 27)
-/* Writemasks */
-#define R500_TEX_WMASK(x) ((x) << 11)
-#define R500_ALU_WMASK(x) ((x) << 11)
-#define R500_ALU_OMASK(x) ((x) << 15)
-#define R500_W_OMASK (1 << 31)
-
-struct rX00_fragment_program_code r5xx_passthrough_fragment_shader = {
-    .code.r500.max_temp_idx = 0,
-    .code.r500.inst_end = 0,
-
-    .code.r500.inst[0].inst0 = R500_INST_TYPE_OUT |
-        R500_INST_TEX_SEM_WAIT | R500_INST_LAST |
-        R500_INST_RGB_OMASK_RGB | R500_INST_ALPHA_OMASK |
-        R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
-    .code.r500.inst[0].inst1 =
-        R500_RGB_ADDR0(0) | R500_RGB_ADDR1(0) | R500_RGB_ADDR1_CONST |
-        R500_RGB_ADDR2(0) | R500_RGB_ADDR2_CONST,
-    .code.r500.inst[0].inst2 =
-        R500_ALPHA_ADDR0(0) | R500_ALPHA_ADDR1(0) | R500_ALPHA_ADDR1_CONST |
-        R500_ALPHA_ADDR2(0) | R500_ALPHA_ADDR2_CONST,
-    .code.r500.inst[0].inst3 =
-        R500_ALU_RGB_SEL_A_SRC0 | R500_ALU_RGB_R_SWIZ_A_R |
-        R500_ALU_RGB_G_SWIZ_A_G | R500_ALU_RGB_B_SWIZ_A_B |
-        R500_ALU_RGB_SEL_B_SRC0 | R500_ALU_RGB_R_SWIZ_B_R |
-        R500_ALU_RGB_B_SWIZ_B_G | R500_ALU_RGB_G_SWIZ_B_B,
-    .code.r500.inst[0].inst4 =
-        R500_ALPHA_OP_CMP | R500_ALPHA_SWIZ_A_A | R500_ALPHA_SWIZ_B_A,
-    .code.r500.inst[0].inst5 =
-        R500_ALU_RGBA_OP_CMP | R500_ALU_RGBA_R_SWIZ_0 |
-        R500_ALU_RGBA_G_SWIZ_0 | R500_ALU_RGBA_B_SWIZ_0 |
-        R500_ALU_RGBA_A_SWIZ_0,
-};
-
-struct rX00_fragment_program_code r5xx_texture_fragment_shader = {
-    .code.r500.max_temp_idx = 0,
-    .code.r500.inst_end = 1,
-
-    .code.r500.inst[0].inst0 = R500_INST_TYPE_TEX |
-        R500_INST_TEX_SEM_WAIT |
-        R500_INST_RGB_WMASK_RGB | R500_INST_ALPHA_WMASK |
-        R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
-    .code.r500.inst[0].inst1 = R500_TEX_ID(0) | R500_TEX_INST_LD |
-        R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED,
-    .code.r500.inst[0].inst2 = R500_TEX_SRC_ADDR(0) |
-        R500_TEX_SRC_S_SWIZ_R | R500_TEX_SRC_T_SWIZ_G |
-        R500_TEX_SRC_R_SWIZ_B | R500_TEX_SRC_Q_SWIZ_A |
-        R500_TEX_DST_ADDR(0) |
-        R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G |
-        R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A,
-    .code.r500.inst[0].inst3 = 0x0,
-    .code.r500.inst[0].inst4 = 0x0,
-    .code.r500.inst[0].inst5 = 0x0,
-
-    .code.r500.inst[1].inst0 = R500_INST_TYPE_OUT |
-        R500_INST_TEX_SEM_WAIT | R500_INST_LAST |
-        R500_INST_RGB_OMASK_RGB | R500_INST_ALPHA_OMASK |
-        R500_INST_RGB_CLAMP | R500_INST_ALPHA_CLAMP,
-    .code.r500.inst[1].inst1 =
-        R500_RGB_ADDR0(0) | R500_RGB_ADDR1(0) | R500_RGB_ADDR1_CONST |
-        R500_RGB_ADDR2(0) | R500_RGB_ADDR2_CONST,
-    .code.r500.inst[1].inst2 =
-        R500_ALPHA_ADDR0(0) | R500_ALPHA_ADDR1(0) | R500_ALPHA_ADDR1_CONST |
-        R500_ALPHA_ADDR2(0) | R500_ALPHA_ADDR2_CONST,
-    .code.r500.inst[1].inst3 =
-        R500_ALU_RGB_SEL_A_SRC0 | R500_ALU_RGB_R_SWIZ_A_R |
-        R500_ALU_RGB_G_SWIZ_A_G | R500_ALU_RGB_B_SWIZ_A_B |
-        R500_ALU_RGB_SEL_B_SRC0 | R500_ALU_RGB_R_SWIZ_B_R |
-        R500_ALU_RGB_B_SWIZ_B_G | R500_ALU_RGB_G_SWIZ_B_B,
-    .code.r500.inst[1].inst4 =
-        R500_ALPHA_OP_CMP | R500_ALPHA_SWIZ_A_A | R500_ALPHA_SWIZ_B_A,
-    .code.r500.inst[1].inst5 =
-        R500_ALU_RGBA_OP_CMP | R500_ALU_RGBA_R_SWIZ_0 |
-        R500_ALU_RGBA_G_SWIZ_0 | R500_ALU_RGBA_B_SWIZ_0 |
-        R500_ALU_RGBA_A_SWIZ_0,
-};
diff --git a/src/gallium/drivers/r300/r5xx_fs.h b/src/gallium/drivers/r300/r5xx_fs.h
deleted file mode 100644
index a4addde32b2..00000000000
--- a/src/gallium/drivers/r300/r5xx_fs.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
- *                Joakim Sindholt <opensource@zhasha.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#ifndef R5XX_FS_H
-#define R5XX_FS_H
-
-#include "radeon_code.h"
-
-struct rX00_fragment_program_code r5xx_passthrough_fragment_shader;
-struct rX00_fragment_program_code r5xx_texture_fragment_shader;
-
-#endif /* R5XX_FS_H */
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index 516e3992fdd..bcb887a0b26 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -6,26 +6,17 @@ LIBNAME = softpipe
 C_SOURCES = \
 	sp_fs_exec.c \
 	sp_fs_sse.c \
-	sp_fs_llvm.c \
 	sp_clear.c \
 	sp_flush.c \
 	sp_query.c \
 	sp_context.c \
 	sp_draw_arrays.c \
-	sp_prim_setup.c \
 	sp_prim_vbuf.c \
 	sp_quad_pipe.c \
-	sp_quad_alpha_test.c \
-	sp_quad_blend.c \
-	sp_quad_colormask.c \
-	sp_quad_coverage.c \
+	sp_quad_stipple.c \
 	sp_quad_depth_test.c \
-	sp_quad_earlyz.c \
 	sp_quad_fs.c \
-	sp_quad_occlusion.c \
-	sp_quad_output.c \
-	sp_quad_stencil.c \
-	sp_quad_stipple.c \
+	sp_quad_blend.c \
 	sp_screen.c \
         sp_setup.c \
 	sp_state_blend.c \
@@ -38,7 +29,9 @@ C_SOURCES = \
 	sp_state_vertex.c \
 	sp_texture.c \
 	sp_tex_sample.c \
+	sp_tex_tile_cache.c \
 	sp_tile_cache.c \
-	sp_surface.c 
+	sp_surface.c \
+	sp_video_context.c
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript
index f8720638a76..aac9edf44e6 100644
--- a/src/gallium/drivers/softpipe/SConscript
+++ b/src/gallium/drivers/softpipe/SConscript
@@ -7,25 +7,16 @@ softpipe = env.ConvenienceLibrary(
 	source = [
 		'sp_fs_exec.c',
 		'sp_fs_sse.c',
-		'sp_fs_llvm.c',
 		'sp_clear.c',
 		'sp_context.c',
 		'sp_draw_arrays.c',
 		'sp_flush.c',
-		'sp_prim_setup.c',
 		'sp_prim_vbuf.c',
 		'sp_setup.c',
-		'sp_quad_alpha_test.c',
 		'sp_quad_blend.c',
 		'sp_quad_pipe.c',
-		'sp_quad_colormask.c',
-		'sp_quad_coverage.c',
 		'sp_quad_depth_test.c',
-		'sp_quad_earlyz.c',
 		'sp_quad_fs.c',
-		'sp_quad_occlusion.c',
-		'sp_quad_output.c',
-		'sp_quad_stencil.c',
 		'sp_quad_stipple.c',
 		'sp_query.c',
 		'sp_screen.c',
@@ -39,8 +30,10 @@ softpipe = env.ConvenienceLibrary(
 		'sp_state_vertex.c',
 		'sp_surface.c',
 		'sp_tex_sample.c',
+		'sp_tex_tile_cache.c',
 		'sp_texture.c',
 		'sp_tile_cache.c',
+		'sp_video_context.c',
 	])
 
-Export('softpipe')
-\ No newline at end of file
+Export('softpipe')
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
index d3af18e162b..8fac8e6e05f 100644
--- a/src/gallium/drivers/softpipe/sp_clear.c
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -36,8 +36,6 @@
 #include "util/u_pack_color.h"
 #include "sp_clear.h"
 #include "sp_context.h"
-#include "sp_surface.h"
-#include "sp_state.h"
 #include "sp_tile_cache.h"
 
 
diff --git a/src/gallium/drivers/softpipe/sp_clear.h b/src/gallium/drivers/softpipe/sp_clear.h
index 2e450672f58..9be3b86fe9f 100644
--- a/src/gallium/drivers/softpipe/sp_clear.h
+++ b/src/gallium/drivers/softpipe/sp_clear.h
@@ -32,7 +32,6 @@
 #ifndef SP_CLEAR_H
 #define SP_CLEAR_H
 
-#include "pipe/p_state.h"
 struct pipe_context;
 
 extern void
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index b4650c0dc58..5f60139968a 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -31,17 +31,18 @@
  */
 
 #include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "sp_clear.h"
 #include "sp_context.h"
 #include "sp_flush.h"
-#include "sp_prim_setup.h"
 #include "sp_prim_vbuf.h"
 #include "sp_state.h"
 #include "sp_surface.h"
 #include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
 #include "sp_texture.h"
 #include "sp_winsys.h"
 #include "sp_query.h"
@@ -72,18 +73,16 @@ softpipe_unmap_transfers(struct softpipe_context *sp)
 {
    uint i;
 
-   for (i = 0; i < sp->framebuffer.nr_cbufs; i++)
-      sp_flush_tile_cache(sp, sp->cbuf_cache[i]);
-   sp_flush_tile_cache(sp, sp->zsbuf_cache);
-
    for (i = 0; i < sp->framebuffer.nr_cbufs; i++) {
       sp_tile_cache_unmap_transfers(sp->cbuf_cache[i]);
    }
+
    sp_tile_cache_unmap_transfers(sp->zsbuf_cache);
 }
 
 
-static void softpipe_destroy( struct pipe_context *pipe )
+static void
+softpipe_destroy( struct pipe_context *pipe )
 {
    struct softpipe_context *softpipe = softpipe_context( pipe );
    uint i;
@@ -91,19 +90,9 @@ static void softpipe_destroy( struct pipe_context *pipe )
    if (softpipe->draw)
       draw_destroy( softpipe->draw );
 
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      softpipe->quad[i].polygon_stipple->destroy( softpipe->quad[i].polygon_stipple );
-      softpipe->quad[i].earlyz->destroy( softpipe->quad[i].earlyz );
-      softpipe->quad[i].shade->destroy( softpipe->quad[i].shade );
-      softpipe->quad[i].alpha_test->destroy( softpipe->quad[i].alpha_test );
-      softpipe->quad[i].depth_test->destroy( softpipe->quad[i].depth_test );
-      softpipe->quad[i].stencil_test->destroy( softpipe->quad[i].stencil_test );
-      softpipe->quad[i].occlusion->destroy( softpipe->quad[i].occlusion );
-      softpipe->quad[i].coverage->destroy( softpipe->quad[i].coverage );
-      softpipe->quad[i].blend->destroy( softpipe->quad[i].blend );
-      softpipe->quad[i].colormask->destroy( softpipe->quad[i].colormask );
-      softpipe->quad[i].output->destroy( softpipe->quad[i].output );
-   }
+      softpipe->quad.shade->destroy( softpipe->quad.shade );
+      softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
+      softpipe->quad.blend->destroy( softpipe->quad.blend );
 
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       sp_destroy_tile_cache(softpipe->cbuf_cache[i]);
@@ -113,7 +102,7 @@ static void softpipe_destroy( struct pipe_context *pipe )
    pipe_surface_reference(&softpipe->framebuffer.zsbuf, NULL);
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      sp_destroy_tile_cache(softpipe->tex_cache[i]);
+      sp_destroy_tex_tile_cache(softpipe->tex_cache[i]);
       pipe_texture_reference(&softpipe->texture[i], NULL);
    }
 
@@ -126,6 +115,15 @@ static void softpipe_destroy( struct pipe_context *pipe )
    FREE( softpipe );
 }
 
+
+/**
+ * if (the texture is being used as a framebuffer surface)
+ *    return PIPE_REFERENCED_FOR_WRITE
+ * else if (the texture is a bound texture source)
+ *    return PIPE_REFERENCED_FOR_READ
+ * else
+ *    return PIPE_UNREFERENCED
+ */
 static unsigned int
 softpipe_is_texture_referenced( struct pipe_context *pipe,
 				struct pipe_texture *texture,
@@ -134,22 +132,31 @@ softpipe_is_texture_referenced( struct pipe_context *pipe,
    struct softpipe_context *softpipe = softpipe_context( pipe );
    unsigned i;
 
-   if(softpipe->dirty_render_cache) {
+   /* check if any of the bound drawing surfaces are this texture */
+   if (softpipe->dirty_render_cache) {
       for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
-         if(softpipe->framebuffer.cbufs[i] && 
-            softpipe->framebuffer.cbufs[i]->texture == texture)
+         if (softpipe->framebuffer.cbufs[i] && 
+             softpipe->framebuffer.cbufs[i]->texture == texture) {
             return PIPE_REFERENCED_FOR_WRITE;
+         }
       }
-      if(softpipe->framebuffer.zsbuf && 
-         softpipe->framebuffer.zsbuf->texture == texture)
+      if (softpipe->framebuffer.zsbuf && 
+          softpipe->framebuffer.zsbuf->texture == texture) {
          return PIPE_REFERENCED_FOR_WRITE;
+      }
    }
    
-   /* FIXME: we also need to do the same for the texture cache */
+   /* check if any of the tex_cache textures are this texture */
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      if (softpipe->tex_cache[i] &&
+          softpipe->tex_cache[i]->texture == texture)
+         return PIPE_REFERENCED_FOR_READ;
+   }
    
    return PIPE_UNREFERENCED;
 }
 
+
 static unsigned int
 softpipe_is_buffer_referenced( struct pipe_context *pipe,
 			       struct pipe_buffer *buf)
@@ -157,6 +164,7 @@ softpipe_is_buffer_referenced( struct pipe_context *pipe,
    return PIPE_UNREFERENCED;
 }
 
+
 struct pipe_context *
 softpipe_create( struct pipe_screen *screen )
 {
@@ -227,7 +235,6 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->pipe.is_buffer_referenced = softpipe_is_buffer_referenced;
 
    softpipe_init_query_funcs( softpipe );
-   softpipe_init_texture_funcs( softpipe );
 
    /*
     * Alloc caches for accessing drawing surfaces and textures.
@@ -238,41 +245,14 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->zsbuf_cache = sp_create_tile_cache( screen );
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-      softpipe->tex_cache[i] = sp_create_tile_cache( screen );
+      softpipe->tex_cache[i] = sp_create_tex_tile_cache( screen );
 
 
    /* setup quad rendering stages */
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      softpipe->quad[i].polygon_stipple = sp_quad_polygon_stipple_stage(softpipe);
-      softpipe->quad[i].earlyz = sp_quad_earlyz_stage(softpipe);
-      softpipe->quad[i].shade = sp_quad_shade_stage(softpipe);
-      softpipe->quad[i].alpha_test = sp_quad_alpha_test_stage(softpipe);
-      softpipe->quad[i].depth_test = sp_quad_depth_test_stage(softpipe);
-      softpipe->quad[i].stencil_test = sp_quad_stencil_test_stage(softpipe);
-      softpipe->quad[i].occlusion = sp_quad_occlusion_stage(softpipe);
-      softpipe->quad[i].coverage = sp_quad_coverage_stage(softpipe);
-      softpipe->quad[i].blend = sp_quad_blend_stage(softpipe);
-      softpipe->quad[i].colormask = sp_quad_colormask_stage(softpipe);
-      softpipe->quad[i].output = sp_quad_output_stage(softpipe);
-   }
-
-   /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      softpipe->tgsi.vert_samplers[i].base.get_samples = sp_get_samples_vertex;
-      softpipe->tgsi.vert_samplers[i].unit = i;
-      softpipe->tgsi.vert_samplers[i].sp = softpipe;
-      softpipe->tgsi.vert_samplers[i].cache = softpipe->tex_cache[i];
-      softpipe->tgsi.vert_samplers_list[i] = &softpipe->tgsi.vert_samplers[i];
-   }
+   softpipe->quad.shade = sp_quad_shade_stage(softpipe);
+   softpipe->quad.depth_test = sp_quad_depth_test_stage(softpipe);
+   softpipe->quad.blend = sp_quad_blend_stage(softpipe);
 
-   /* fragment shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      softpipe->tgsi.frag_samplers[i].base.get_samples = sp_get_samples_fragment;
-      softpipe->tgsi.frag_samplers[i].unit = i;
-      softpipe->tgsi.frag_samplers[i].sp = softpipe;
-      softpipe->tgsi.frag_samplers[i].cache = softpipe->tex_cache[i];
-      softpipe->tgsi.frag_samplers_list[i] = &softpipe->tgsi.frag_samplers[i];
-   }
 
    /*
     * Create drawing context and plug our rendering stage into it.
@@ -286,30 +266,27 @@ softpipe_create( struct pipe_screen *screen )
                          (struct tgsi_sampler **)
                             softpipe->tgsi.vert_samplers_list);
 
-   softpipe->setup = sp_draw_render_stage(softpipe);
-   if (!softpipe->setup)
-      goto fail;
-
    if (debug_get_bool_option( "SP_NO_RAST", FALSE ))
       softpipe->no_rast = TRUE;
 
-   if (debug_get_bool_option( "SP_NO_VBUF", FALSE )) {
-      /* Deprecated path -- vbuf is the intended interface to the draw module:
-       */
-      draw_set_rasterize_stage(softpipe->draw, softpipe->setup);
-   }
-   else {
-      sp_init_vbuf(softpipe);
-   }
+   softpipe->vbuf_backend = sp_create_vbuf_backend(softpipe);
+   if (!softpipe->vbuf_backend)
+      goto fail;
+
+   softpipe->vbuf = draw_vbuf_stage(softpipe->draw, softpipe->vbuf_backend);
+   if (!softpipe->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(softpipe->draw, softpipe->vbuf);
+   draw_set_render(softpipe->draw, softpipe->vbuf_backend);
+
 
    /* plug in AA line/point stages */
    draw_install_aaline_stage(softpipe->draw, &softpipe->pipe);
    draw_install_aapoint_stage(softpipe->draw, &softpipe->pipe);
 
-#if USE_DRAW_STAGE_PSTIPPLE
    /* Do polygon stipple w/ texture map + frag prog? */
    draw_install_pstipple_stage(softpipe->draw, &softpipe->pipe);
-#endif
 
    sp_init_surface_functions(softpipe);
 
@@ -319,4 +296,3 @@ softpipe_create( struct pipe_screen *screen )
    softpipe_destroy(&softpipe->pipe);
    return NULL;
 }
-
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 7888c2f644b..a735573d6fb 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -36,24 +36,13 @@
 #include "draw/draw_vertex.h"
 
 #include "sp_quad_pipe.h"
-#include "sp_tex_sample.h"
 
 
-/**
- * This is a temporary variable for testing draw-stage polygon stipple.
- * If zero, do stipple in sp_quad_stipple.c
- */
-#define USE_DRAW_STAGE_PSTIPPLE 1
-
-/* Number of threads working on individual quads.
- * Setting to 1 disables this feature.
- */
-#define SP_NUM_QUAD_THREADS 1
-
 struct softpipe_vbuf_render;
 struct draw_context;
 struct draw_stage;
 struct softpipe_tile_cache;
+struct softpipe_tex_tile_cache;
 struct sp_fragment_shader;
 struct sp_vertex_shader;
 
@@ -62,12 +51,12 @@ struct softpipe_context {
    struct pipe_context pipe;  /**< base class */
 
    /** Constant state objects */
-   const struct pipe_blend_state *blend;
-   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
-   const struct pipe_depth_stencil_alpha_state *depth_stencil;
-   const struct pipe_rasterizer_state *rasterizer;
-   const struct sp_fragment_shader *fs;
-   const struct sp_vertex_shader *vs;
+   struct pipe_blend_state *blend;
+   struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_depth_stencil_alpha_state *depth_stencil;
+   struct pipe_rasterizer_state *rasterizer;
+   struct sp_fragment_shader *fs;
+   struct sp_vertex_shader *vs;
 
    /** Other rendering state */
    struct pipe_blend_color blend_color;
@@ -96,7 +85,7 @@ struct softpipe_context {
 
    /** Mapped vertex buffers */
    ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS];
-   
+
    /** Mapped constant buffers */
    void *mapped_constants[PIPE_SHADER_TYPES];
 
@@ -107,7 +96,15 @@ struct softpipe_context {
    /** Which vertex shader output slot contains point size */
    int psize_slot;
 
-   unsigned reduced_api_prim;  /**< PIPE_PRIM_POINTS, _LINES or _TRIANGLES */
+   /** The reduced version of the primitive supplied by the state tracker */
+   unsigned reduced_api_prim;
+
+   /**
+    * The reduced primitive after unfilled triangles, wide-line decomposition,
+    * etc, are taken into account.  This is the primitive type that's actually
+    * rasterized.
+    */
+   unsigned reduced_prim;
 
    /** Derived from scissor and surface bounds: */
    struct pipe_scissor_state cliprect;
@@ -116,41 +113,32 @@ struct softpipe_context {
 
    /** Software quad rendering pipeline */
    struct {
-      struct quad_stage *polygon_stipple;
-      struct quad_stage *earlyz;
       struct quad_stage *shade;
-      struct quad_stage *alpha_test;
-      struct quad_stage *stencil_test;
       struct quad_stage *depth_test;
-      struct quad_stage *occlusion;
-      struct quad_stage *coverage;
       struct quad_stage *blend;
-      struct quad_stage *colormask;
-      struct quad_stage *output;
-
       struct quad_stage *first; /**< points to one of the above stages */
-   } quad[SP_NUM_QUAD_THREADS];
+   } quad;
 
    /** TGSI exec things */
    struct {
-      struct sp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS];
+      struct sp_sampler_varient *vert_samplers_list[PIPE_MAX_SAMPLERS];
+      struct sp_sampler_varient *frag_samplers_list[PIPE_MAX_SAMPLERS];
    } tgsi;
 
    /** The primitive drawing context */
    struct draw_context *draw;
-   struct draw_stage *setup;
+
+   /** Draw module backend */
+   struct vbuf_render *vbuf_backend;
    struct draw_stage *vbuf;
-   struct softpipe_vbuf_render *vbuf_render;
 
    boolean dirty_render_cache;
-   
+
    struct softpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS];
    struct softpipe_tile_cache *zsbuf_cache;
 
-   struct softpipe_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
+   unsigned tex_timestamp;
+   struct softpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
 
    unsigned use_sse : 1;
    unsigned dump_fs : 1;
@@ -164,5 +152,8 @@ softpipe_context( struct pipe_context *pipe )
    return (struct softpipe_context *)pipe;
 }
 
-#endif /* SP_CONTEXT_H */
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe);
+
 
+#endif /* SP_CONTEXT_H */
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 4a14d49686e..e38b767cf2c 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -37,6 +37,7 @@
 #include "sp_surface.h"
 #include "sp_state.h"
 #include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
 #include "sp_winsys.h"
 
 
@@ -52,17 +53,19 @@ softpipe_flush( struct pipe_context *pipe,
 
    if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
       for (i = 0; i < softpipe->num_textures; i++) {
-         sp_flush_tile_cache(softpipe, softpipe->tex_cache[i]);
+         sp_flush_tex_tile_cache(softpipe->tex_cache[i]);
       }
    }
 
-   if (flags & PIPE_FLUSH_RENDER_CACHE) {
+   if (flags & PIPE_FLUSH_SWAPBUFFERS) {
+      /* If this is a swapbuffers flush, just flush the color buffers.
+       *
+       * The zbuffer changes are not discarded, but held in the cache
+       * in the hope that a later clear will wipe them out.
+       */
       for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++)
          if (softpipe->cbuf_cache[i])
-            sp_flush_tile_cache(softpipe, softpipe->cbuf_cache[i]);
-
-      if (softpipe->zsbuf_cache)
-         sp_flush_tile_cache(softpipe, softpipe->zsbuf_cache);
+            sp_flush_tile_cache(softpipe->cbuf_cache[i]);
 
       /* Need this call for hardware buffers before swapbuffers.
        *
@@ -71,7 +74,15 @@ softpipe_flush( struct pipe_context *pipe,
        * to unmap surfaces when flushing.
        */
       softpipe_unmap_transfers(softpipe);
-      
+   }
+   else if (flags & PIPE_FLUSH_RENDER_CACHE) {
+      for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++)
+         if (softpipe->cbuf_cache[i])
+            sp_flush_tile_cache(softpipe->cbuf_cache[i]);
+
+      if (softpipe->zsbuf_cache)
+         sp_flush_tile_cache(softpipe->zsbuf_cache);
+     
       softpipe->dirty_render_cache = FALSE;
    }
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 9ee86fe7878..4076114d392 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -59,15 +59,34 @@ sp_exec_fragment_shader(const struct sp_fragment_shader *base)
 }
 
 
+static void
+exec_prepare( const struct sp_fragment_shader *base,
+	      struct tgsi_exec_machine *machine,
+	      struct tgsi_sampler **samplers )
+{
+   /*
+    * Bind tokens/shader to the interpreter's machine state.
+    * Avoid redundant binding.
+    */
+   if (machine->Tokens != base->shader.tokens) {
+      tgsi_exec_machine_bind_shader( machine,
+                                     base->shader.tokens,
+                                     PIPE_MAX_SAMPLERS,
+                                     samplers );
+   }
+}
+
+
+
 /**
  * Compute quad X,Y,Z,W for the four fragments in a quad.
  *
  * This should really be part of the compiled shader.
  */
-void
-sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
-		    float x, float y,
-		    struct tgsi_exec_vector *quadpos)
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+                 float x, float y,
+                 struct tgsi_exec_vector *quadpos)
 {
    uint chan;
    /* do X */
@@ -95,24 +114,6 @@ sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
 }
 
 
-static void
-exec_prepare( const struct sp_fragment_shader *base,
-	      struct tgsi_exec_machine *machine,
-	      struct tgsi_sampler **samplers )
-{
-   /*
-    * Bind tokens/shader to the interpreter's machine state.
-    * Avoid redundant binding.
-    */
-   if (machine->Tokens != base->shader.tokens) {
-      tgsi_exec_machine_bind_shader( machine,
-                                     base->shader.tokens,
-                                     PIPE_MAX_SAMPLERS,
-                                     samplers );
-   }
-}
-
-
 /* TODO: hide the machine struct in here somewhere, remove from this
  * interface:
  */
@@ -122,11 +123,43 @@ exec_run( const struct sp_fragment_shader *base,
 	  struct quad_header *quad )
 {
    /* Compute X, Y, Z, W vals for this quad */
-   sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->input.x0, (float)quad->input.y0, 
-		       &machine->QuadPos);
+   setup_pos_vector(quad->posCoef, 
+                    (float)quad->input.x0, (float)quad->input.y0, 
+                    &machine->QuadPos);
    
-   return tgsi_exec_machine_run( machine );
+   quad->inout.mask &= tgsi_exec_machine_run( machine );
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs */
+   {
+      const ubyte *sem_name = base->info.output_semantic_name;
+      const ubyte *sem_index = base->info.output_semantic_index;
+      const uint n = base->info.num_outputs;
+      uint i;
+      for (i = 0; i < n; i++) {
+         switch (sem_name[i]) {
+         case TGSI_SEMANTIC_COLOR:
+            {
+               uint cbuf = sem_index[i];
+               memcpy(quad->output.color[cbuf],
+                      &machine->Outputs[i].xyzw[0].f[0],
+                      sizeof(quad->output.color[0]) );
+            }
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            {
+               uint j;
+               for (j = 0; j < 4; j++) {
+                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c
deleted file mode 100644
index 95c0d982d12..00000000000
--- a/src/gallium/drivers/softpipe/sp_fs_llvm.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Execute fragment shader using LLVM code generation.
- * Authors:
- *   Zack Rusin
- */
-
-#include "sp_context.h"
-#include "sp_state.h"
-#include "sp_fs.h"
-
-#include "pipe/p_state.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_sse2.h"
-
-#if 0
-
-/**
- * Subclass of sp_fragment_shader
- */
-struct sp_llvm_fragment_shader
-{
-   struct sp_fragment_shader base;
-   struct gallivm_prog *llvm_prog;
-};
-
-
-static void
-shade_quad_llvm(struct quad_stage *qs,
-                struct quad_header *quad)
-{
-   struct quad_shade_stage *qss = quad_shade_stage(qs);
-   struct softpipe_context *softpipe = qs->softpipe;
-   float dests[4][16][4] ALIGN16_ATTRIB;
-   float inputs[4][16][4] ALIGN16_ATTRIB;
-   const float fx = (float) quad->x0;
-   const float fy = (float) quad->y0;
-   struct gallivm_prog *llvm = qss->llvm_prog;
-
-   inputs[0][0][0] = fx;
-   inputs[1][0][0] = fx + 1.0f;
-   inputs[2][0][0] = fx;
-   inputs[3][0][0] = fx + 1.0f;
-
-   inputs[0][0][1] = fy;
-   inputs[1][0][1] = fy;
-   inputs[2][0][1] = fy + 1.0f;
-   inputs[3][0][1] = fy + 1.0f;
-
-
-   gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef);
-
-#if DLLVM
-   debug_printf("MASK = %d\n", quad->mask);
-   for (int i = 0; i < 4; ++i) {
-      for (int j = 0; j < 2; ++j) {
-         debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
-                inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]);
-      }
-   }
-#endif
-
-   quad->mask &=
-      gallivm_fragment_shader_exec(llvm, fx, fy, dests, inputs,
-                                   softpipe->mapped_constants[PIPE_SHADER_FRAGMENT],
-                                   qss->samplers);
-#if DLLVM
-   debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
-          dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3], 
-          dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]);
-#endif
-
-   /* store result color */
-   if (qss->colorOutSlot >= 0) {
-      unsigned i;
-      /* XXX need to handle multiple color outputs someday */
-      allvmrt(qss->stage.softpipe->fs->info.output_semantic_name[qss->colorOutSlot]
-             == TGSI_SEMANTIC_COLOR);
-      for (i = 0; i < QUAD_SIZE; ++i) {
-         quad->outputs.color[0][0][i] = dests[i][qss->colorOutSlot][0];
-         quad->outputs.color[0][1][i] = dests[i][qss->colorOutSlot][1];
-         quad->outputs.color[0][2][i] = dests[i][qss->colorOutSlot][2];
-         quad->outputs.color[0][3][i] = dests[i][qss->colorOutSlot][3];
-      }
-   }
-#if DLLVM
-   for (int i = 0; i < QUAD_SIZE; ++i) {
-      debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
-             quad->outputs.color[0][0][i],
-             quad->outputs.color[0][1][i],
-             quad->outputs.color[0][2][i],
-             quad->outputs.color[0][3][i]);
-   }
-#endif
-
-   /* store result Z */
-   if (qss->depthOutSlot >= 0) {
-      /* output[slot] is new Z */
-      uint i;
-      for (i = 0; i < 4; i++) {
-         quad->outputs.depth[i] = dests[i][0][2];
-      }
-   }
-   else {
-      /* copy input Z (which was interpolated by the executor) to output Z */
-      uint i;
-      for (i = 0; i < 4; i++) {
-         quad->outputs.depth[i] = inputs[i][0][2];
-      }
-   }
-#if DLLVM
-   debug_printf("D [%f, %f, %f, %f] mask = %d\n",
-             quad->outputs.depth[0],
-             quad->outputs.depth[1],
-             quad->outputs.depth[2],
-             quad->outputs.depth[3], quad->mask);
-#endif
-
-   /* shader may cull fragments */
-   if( quad->mask ) {
-      qs->next->run( qs->next, quad );
-   }
-}
-
-
-unsigned 
-run_llvm_fs( const struct sp_fragment_shader *base,
-	     struct foo *machine )
-{
-}
-
-
-void 
-delete_llvm_fs( struct sp_fragment_shader *base )
-{
-   FREE(base);
-}
-
-
-struct sp_fragment_shader *
-softpipe_create_fs_llvm(struct softpipe_context *softpipe,
-                        const struct pipe_shader_state *templ)
-{
-   struct sp_llvm_fragment_shader *shader = NULL;
-
-   /* LLVM fragment shaders currently disabled:
-    */
-   state = CALLOC_STRUCT(sp_llvm_shader_state);
-   if (!state)
-      return NULL;
-
-   state->llvm_prog = 0;
-
-   if (!gallivm_global_cpu_engine()) {
-      gallivm_cpu_engine_create(state->llvm_prog);
-   }
-   else
-      gallivm_cpu_jit_compile(gallivm_global_cpu_engine(), state->llvm_prog);
-   
-   if (shader) {
-      shader->base.run = run_llvm_fs;
-      shader->base.delete = delete_llvm_fs;
-   }
-
-   return shader;
-}
-
-
-#else
-
-struct sp_fragment_shader *
-softpipe_create_fs_llvm(struct softpipe_context *softpipe,
-		       const struct pipe_shader_state *templ)
-{
-   return NULL;
-}
-
-#endif
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 31ccc3bda9a..f9129506585 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -76,6 +76,43 @@ fs_sse_prepare( const struct sp_fragment_shader *base,
 }
 
 
+
+/**
+ * Compute quad X,Y,Z,W for the four fragments in a quad.
+ * Fragments are ordered (x,y), (x+1,y), (x,y+1), (x+1,y+1).
+ * This should really be part of the compiled shader.
+ */
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+                 float x, float y,
+                 struct tgsi_exec_vector *quadpos)
+{
+   uint chan;
+   /* do X: left column gets x, right column gets x + 1 */
+   quadpos->xyzw[0].f[0] = x;
+   quadpos->xyzw[0].f[1] = x + 1;
+   quadpos->xyzw[0].f[2] = x;
+   quadpos->xyzw[0].f[3] = x + 1;
+
+   /* do Y: first row gets y, second row gets y + 1 */
+   quadpos->xyzw[1].f[0] = y;
+   quadpos->xyzw[1].f[1] = y;
+   quadpos->xyzw[1].f[2] = y + 1;
+   quadpos->xyzw[1].f[3] = y + 1;
+
+   /* do Z and W for all fragments in the quad: a0 + dadx*x + dady*y */
+   for (chan = 2; chan < 4; chan++) {
+      const float dadx = coef->dadx[chan];
+      const float dady = coef->dady[chan];
+      const float a0 = coef->a0[chan] + dadx * x + dady * y;
+      quadpos->xyzw[chan].f[0] = a0;
+      quadpos->xyzw[chan].f[1] = a0 + dadx;
+      quadpos->xyzw[chan].f[2] = a0 + dady;
+      quadpos->xyzw[chan].f[3] = a0 + dadx + dady;
+   }
+}
+
+
 /* TODO: codegenerate the whole run function, skip this wrapper.
  * TODO: break dependency on tgsi_exec_machine struct
  * TODO: push Position calculation into the generated shader
@@ -89,9 +126,9 @@ fs_sse_run( const struct sp_fragment_shader *base,
    struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base);
 
    /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */
-   sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->input.x0, (float)quad->input.y0, 
-		       machine->Temps);
+   setup_pos_vector(quad->posCoef, 
+                    (float)quad->input.x0, (float)quad->input.y0, 
+                    machine->Temps);
 
    /* init kill mask */
    tgsi_set_kill_mask(machine, 0x0);
@@ -104,7 +141,39 @@ fs_sse_run( const struct sp_fragment_shader *base,
 		 /*, &machine->QuadPos*/
       );
 
-   return ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
+   quad->inout.mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs: route each shader output register by its semantic */
+   {
+      const ubyte *sem_name = base->info.output_semantic_name;
+      const ubyte *sem_index = base->info.output_semantic_index;
+      const uint n = base->info.num_outputs;
+      uint i;
+      for (i = 0; i < n; i++) {
+         switch (sem_name[i]) {
+         case TGSI_SEMANTIC_COLOR:
+            {
+               uint cbuf = sem_index[i];
+               memcpy(quad->output.color[cbuf],
+                      &machine->Outputs[i].xyzw[0].f[0],
+                      sizeof(quad->output.color[0]) );
+            }
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            {  /* depth = Z channel of output register i (not always 0) */
+               uint j;
+               for (j = 0; j < 4; j++) {
+                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c
deleted file mode 100644
index 038ff04d4f1..00000000000
--- a/src/gallium/drivers/softpipe/sp_prim_setup.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief A draw stage that drives our triangle setup routines from
- * within the draw pipeline.  One of two ways to drive setup, the
- * other being in sp_prim_vbuf.c.
- *
- * \author  Keith Whitwell <keith@tungstengraphics.com>
- * \author  Brian Paul
- */
-
-
-#include "sp_context.h"
-#include "sp_setup.h"
-#include "sp_state.h"
-#include "sp_prim_setup.h"
-#include "draw/draw_pipe.h"
-#include "draw/draw_vertex.h"
-#include "util/u_memory.h"
-
-/**
- * Triangle setup info (derived from draw_stage).
- * Also used for line drawing (taking some liberties).
- */
-struct setup_stage {
-   struct draw_stage stage; /**< This must be first (base class) */
-
-   struct setup_context *setup;
-};
-
-
-
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-
-
-typedef const float (*cptrf4)[4];
-
-static void
-do_tri(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-   
-   setup_tri( setup->setup,
-              (cptrf4)prim->v[0]->data,
-              (cptrf4)prim->v[1]->data,
-              (cptrf4)prim->v[2]->data );
-}
-
-static void
-do_line(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   setup_line( setup->setup,
-               (cptrf4)prim->v[0]->data,
-               (cptrf4)prim->v[1]->data );
-}
-
-static void
-do_point(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   setup_point( setup->setup,
-                (cptrf4)prim->v[0]->data );
-}
-
-
-
-
-static void setup_begin( struct draw_stage *stage )
-{
-   struct setup_stage *setup = setup_stage(stage);
-
-   setup_prepare( setup->setup );
-
-   stage->point = do_point;
-   stage->line = do_line;
-   stage->tri = do_tri;
-}
-
-
-static void setup_first_point( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->point( stage, header );
-}
-
-static void setup_first_line( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->line( stage, header );
-}
-
-
-static void setup_first_tri( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->tri( stage, header );
-}
-
-
-
-static void setup_flush( struct draw_stage *stage,
-			 unsigned flags )
-{
-   stage->point = setup_first_point;
-   stage->line = setup_first_line;
-   stage->tri = setup_first_tri;
-}
-
-
-static void reset_stipple_counter( struct draw_stage *stage )
-{
-}
-
-
-static void render_destroy( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   setup_destroy_context(ssetup->setup);
-   FREE( stage );
-}
-
-
-/**
- * Create a new primitive setup/render stage.
- */
-struct draw_stage *sp_draw_render_stage( struct softpipe_context *softpipe )
-{
-   struct setup_stage *sstage = CALLOC_STRUCT(setup_stage);
-
-   sstage->setup = setup_create_context(softpipe);
-   sstage->stage.draw = softpipe->draw;
-   sstage->stage.point = setup_first_point;
-   sstage->stage.line = setup_first_line;
-   sstage->stage.tri = setup_first_tri;
-   sstage->stage.flush = setup_flush;
-   sstage->stage.reset_stipple_counter = reset_stipple_counter;
-   sstage->stage.destroy = render_destroy;
-
-   return (struct draw_stage *)sstage;
-}
-
-struct setup_context *
-sp_draw_setup_context( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   return ssetup->setup;
-}
-
-void
-sp_draw_flush( struct draw_stage *stage )
-{
-   stage->flush( stage, 0 );
-}
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.h b/src/gallium/drivers/softpipe/sp_prim_setup.h
deleted file mode 100644
index 49bdd98ed87..00000000000
--- a/src/gallium/drivers/softpipe/sp_prim_setup.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef SP_PRIM_SETUP_H
-#define SP_PRIM_SETUP_H
-
-
-/**
- * vbuf is a special stage to gather the stream of triangles, lines, points
- * together and reconstruct vertex buffers for hardware upload.
- *
- * First attempt, work in progress.
- * 
- * TODO:
- *    - separate out vertex buffer building and primitive emit, ie >1 draw per vb.
- *    - tell vbuf stage how to build hw vertices directly
- *    - pass vbuf stage a buffer pointer for direct emit to agp/vram.
- *
- *
- *
- * Vertices are just an array of floats, with all the attributes
- * packed.  We currently assume a layout like:
- *
- * attr[0][0..3] - window position
- * attr[1..n][0..3] - remaining attributes.
- *
- * Attributes are assumed to be 4 floats wide but are packed so that
- * all the enabled attributes run contiguously.
- */
-
-
-struct draw_stage;
-struct softpipe_context;
-
-
-typedef void (*vbuf_draw_func)( struct pipe_context *pipe,
-                                unsigned prim,
-                                const ushort *elements,
-                                unsigned nr_elements,
-                                const void *vertex_buffer,
-                                unsigned nr_vertices );
-
-
-extern struct draw_stage *
-sp_draw_render_stage( struct softpipe_context *softpipe );
-
-extern struct setup_context *
-sp_draw_setup_context( struct draw_stage * );
-
-extern void
-sp_draw_flush( struct draw_stage * );
-
-
-extern struct draw_stage *
-sp_draw_vbuf_stage( struct draw_context *draw_context,
-                    struct pipe_context *pipe,
-                    vbuf_draw_func draw );
-
-
-#endif /* SP_PRIM_SETUP_H */
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 42021789ea8..5fbac06a535 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -37,13 +37,13 @@
 
 
 #include "sp_context.h"
+#include "sp_setup.h"
 #include "sp_state.h"
 #include "sp_prim_vbuf.h"
-#include "sp_prim_setup.h"
-#include "sp_setup.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 
 
 #define SP_MAX_VBUF_INDEXES 1024
@@ -58,6 +58,8 @@ struct softpipe_vbuf_render
 {
    struct vbuf_render base;
    struct softpipe_context *softpipe;
+   struct setup_context *setup;
+
    uint prim;
    uint vertex_size;
    uint nr_vertices;
@@ -74,6 +76,11 @@ softpipe_vbuf_render(struct vbuf_render *vbr)
 }
 
 
+
+
+
+
+
 static const struct vertex_info *
 sp_vbuf_get_vertex_info(struct vbuf_render *vbr)
 {
@@ -104,36 +111,6 @@ sp_vbuf_allocate_vertices(struct vbuf_render *vbr,
 static void
 sp_vbuf_release_vertices(struct vbuf_render *vbr)
 {
-#if 0
-   {
-      struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-      const struct vertex_info *info = 
-         softpipe_get_vbuf_vertex_info(cvbr->softpipe);
-      const float *vtx = (const float *) cvbr->vertex_buffer;
-      uint i, j;
-      debug_printf("%s (vtx_size = %u,  vtx_used = %u)\n",
-             __FUNCTION__, cvbr->vertex_size, cvbr->nr_vertices);
-      for (i = 0; i < cvbr->nr_vertices; i++) {
-         for (j = 0; j < info->num_attribs; j++) {
-            uint k;
-            switch (info->attrib[j].emit) {
-            case EMIT_4F:  k = 4;   break;
-            case EMIT_3F:  k = 3;   break;
-            case EMIT_2F:  k = 2;   break;
-            case EMIT_1F:  k = 1;   break;
-            default: assert(0);
-            }
-            debug_printf("Vert %u attr %u: ", i, j);
-            while (k-- > 0) {
-               debug_printf("%g ", vtx[0]);
-               vtx++;
-            }
-            debug_printf("\n");
-         }
-      }
-   }
-#endif
-
    /* keep the old allocation for next time */
 }
 
@@ -159,14 +136,11 @@ static boolean
 sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct setup_context *setup_ctx = sp_draw_setup_context(cvbr->softpipe->setup);
+   struct setup_context *setup_ctx = cvbr->setup;
    
-   setup_prepare( setup_ctx );
+   sp_setup_prepare( setup_ctx );
 
+   cvbr->softpipe->reduced_prim = u_reduced_prim(prim);
    cvbr->prim = prim;
    return TRUE;
 
@@ -191,25 +165,20 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    struct softpipe_context *softpipe = cvbr->softpipe;
    const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
+   struct setup_context *setup_ctx = cvbr->setup;
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = softpipe->setup;
-   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
-         setup_point( setup_ctx,
+         sp_setup_point( setup_ctx,
                       get_vert(vertex_buffer, indices[i-0], stride) );
       }
       break;
 
    case PIPE_PRIM_LINES:
       for (i = 1; i < nr; i += 2) {
-         setup_line( setup_ctx,
+         sp_setup_line( setup_ctx,
                      get_vert(vertex_buffer, indices[i-1], stride),
                      get_vert(vertex_buffer, indices[i-0], stride) );
       }
@@ -217,7 +186,7 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
 
    case PIPE_PRIM_LINE_STRIP:
       for (i = 1; i < nr; i ++) {
-         setup_line( setup_ctx,
+         sp_setup_line( setup_ctx,
                      get_vert(vertex_buffer, indices[i-1], stride),
                      get_vert(vertex_buffer, indices[i-0], stride) );
       }
@@ -225,27 +194,29 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
 
    case PIPE_PRIM_LINE_LOOP:
       for (i = 1; i < nr; i ++) {
-         setup_line( setup_ctx,
+         sp_setup_line( setup_ctx,
                      get_vert(vertex_buffer, indices[i-1], stride),
                      get_vert(vertex_buffer, indices[i-0], stride) );
       }
       if (nr) {
-         setup_line( setup_ctx,
+         sp_setup_line( setup_ctx,
                      get_vert(vertex_buffer, indices[nr-1], stride),
                      get_vert(vertex_buffer, indices[0], stride) );
       }
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 2; i < nr; i += 3) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-2], stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-0], stride) );
@@ -254,15 +225,17 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_STRIP:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i+(i&1)-1], stride),
                        get_vert(vertex_buffer, indices[i-(i&1)], stride),
                        get_vert(vertex_buffer, indices[i-2], stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i+(i&1)-2], stride),
                        get_vert(vertex_buffer, indices[i-(i&1)-1], stride),
                        get_vert(vertex_buffer, indices[i-0], stride) );
@@ -271,15 +244,17 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-0], stride) );
@@ -288,24 +263,26 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_QUADS:
-      for (i = 3; i < nr; i += 4) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-3], stride) );
-            setup_tri( setup_ctx,
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-3], stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-3], stride),
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-0], stride) );
 
-            setup_tri( setup_ctx,
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-0], stride) );
@@ -314,23 +291,25 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 3; i < nr; i += 2) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-3], stride));
-            setup_tri( setup_ctx,
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-3], stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-3], stride),
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-0], stride) );
-            setup_tri( setup_ctx,
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-3], stride),
                        get_vert(vertex_buffer, indices[i-0], stride) );
@@ -345,7 +324,7 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
        * flatshade_first state makes no difference.
        */
       for (i = 2; i < nr; i += 1) {
-         setup_tri( setup_ctx,
+         sp_setup_tri( setup_ctx,
                     get_vert(vertex_buffer, indices[i-0], stride),
                     get_vert(vertex_buffer, indices[i-1], stride),
                     get_vert(vertex_buffer, indices[0], stride) );
@@ -355,11 +334,6 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    default:
       assert(0);
    }
-
-   /* XXX: why are we calling this???  If we had to call something, it
-    * would be a function in sp_setup.c:
-    */
-   sp_draw_flush( setup );
 }
 
 
@@ -372,28 +346,23 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
+   struct setup_context *setup_ctx = cvbr->setup;
    const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = softpipe->setup;
-   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
-         setup_point( setup_ctx,
+         sp_setup_point( setup_ctx,
                       get_vert(vertex_buffer, i-0, stride) );
       }
       break;
 
    case PIPE_PRIM_LINES:
       for (i = 1; i < nr; i += 2) {
-         setup_line( setup_ctx,
+         sp_setup_line( setup_ctx,
                      get_vert(vertex_buffer, i-1, stride),
                      get_vert(vertex_buffer, i-0, stride) );
       }
@@ -401,7 +370,7 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 
    case PIPE_PRIM_LINE_STRIP:
       for (i = 1; i < nr; i ++) {
-         setup_line( setup_ctx,
+         sp_setup_line( setup_ctx,
                      get_vert(vertex_buffer, i-1, stride),
                      get_vert(vertex_buffer, i-0, stride) );
       }
@@ -409,27 +378,29 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 
    case PIPE_PRIM_LINE_LOOP:
       for (i = 1; i < nr; i ++) {
-         setup_line( setup_ctx,
+         sp_setup_line( setup_ctx,
                      get_vert(vertex_buffer, i-1, stride),
                      get_vert(vertex_buffer, i-0, stride) );
       }
       if (nr) {
-         setup_line( setup_ctx,
+         sp_setup_line( setup_ctx,
                      get_vert(vertex_buffer, nr-1, stride),
                      get_vert(vertex_buffer, 0, stride) );
       }
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 2; i < nr; i += 3) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-2, stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-0, stride) );
@@ -438,15 +409,17 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_STRIP:
-      for (i = 2; i < nr; i++) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i++) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i+(i&1)-1, stride),
                        get_vert(vertex_buffer, i-(i&1), stride),
                        get_vert(vertex_buffer, i-2, stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 2; i < nr; i++) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i+(i&1)-2, stride),
                        get_vert(vertex_buffer, i-(i&1)-1, stride),
                        get_vert(vertex_buffer, i-0, stride) );
@@ -455,15 +428,17 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, 0, stride),
                        get_vert(vertex_buffer, i-1, stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, 0, stride),
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-0, stride) );
@@ -472,23 +447,25 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_QUADS:
-      for (i = 3; i < nr; i += 4) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-3, stride) );
-            setup_tri( setup_ctx,
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-3, stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-3, stride),
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-0, stride) );
-            setup_tri( setup_ctx,
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-0, stride) );
@@ -497,23 +474,25 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 3; i < nr; i += 2) {
-         if (softpipe->rasterizer->flatshade_first) {
-            setup_tri( setup_ctx,
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-3, stride) );
-            setup_tri( setup_ctx,
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-3, stride) );
          }
-         else {
-            setup_tri( setup_ctx,
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-3, stride),
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-0, stride) );
-            setup_tri( setup_ctx,
+            sp_setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-3, stride),
                        get_vert(vertex_buffer, i-0, stride) );
@@ -528,7 +507,7 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
        * flatshade_first state makes no difference.
        */
       for (i = 2; i < nr; i += 1) {
-         setup_tri( setup_ctx,
+         sp_setup_tri( setup_ctx,
                     get_vert(vertex_buffer, i-1, stride),
                     get_vert(vertex_buffer, i-0, stride),
                     get_vert(vertex_buffer, 0, stride) );
@@ -546,40 +525,50 @@ static void
 sp_vbuf_destroy(struct vbuf_render *vbr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-   cvbr->softpipe->vbuf_render = NULL;
+   sp_setup_destroy_context(cvbr->setup);
    FREE(cvbr);
 }
 
 
 /**
- * Initialize the post-transform vertex buffer information for the given
- * context.
+ * Create the post-transform vertex handler for the given context.
+ *
+ * Allocates a softpipe_vbuf_render, fills in its vbuf_render callbacks and
+ * creates the setup context it draws with.  Returns the embedded base
+ * object, or NULL if the allocation or the setup context creation fails.
  */
-void
-sp_init_vbuf(struct softpipe_context *sp)
+struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *sp)
 {
-   assert(sp->draw);
+   struct softpipe_vbuf_render *cvbr = CALLOC_STRUCT(softpipe_vbuf_render);
+
+   /* Allocation can fail; never dereference a NULL object below. */
+   if (!cvbr)
+      return NULL;
 
-   sp->vbuf_render = CALLOC_STRUCT(softpipe_vbuf_render);
+   assert(sp->draw);
 
-   sp->vbuf_render->base.max_indices = SP_MAX_VBUF_INDEXES;
-   sp->vbuf_render->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
 
-   sp->vbuf_render->base.get_vertex_info = sp_vbuf_get_vertex_info;
-   sp->vbuf_render->base.allocate_vertices = sp_vbuf_allocate_vertices;
-   sp->vbuf_render->base.map_vertices = sp_vbuf_map_vertices;
-   sp->vbuf_render->base.unmap_vertices = sp_vbuf_unmap_vertices;
-   sp->vbuf_render->base.set_primitive = sp_vbuf_set_primitive;
-   sp->vbuf_render->base.draw = sp_vbuf_draw;
-   sp->vbuf_render->base.draw_arrays = sp_vbuf_draw_arrays;
-   sp->vbuf_render->base.release_vertices = sp_vbuf_release_vertices;
-   sp->vbuf_render->base.destroy = sp_vbuf_destroy;
+   cvbr->base.max_indices = SP_MAX_VBUF_INDEXES;
+   cvbr->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
 
-   sp->vbuf_render->softpipe = sp;
+   cvbr->base.get_vertex_info = sp_vbuf_get_vertex_info;
+   cvbr->base.allocate_vertices = sp_vbuf_allocate_vertices;
+   cvbr->base.map_vertices = sp_vbuf_map_vertices;
+   cvbr->base.unmap_vertices = sp_vbuf_unmap_vertices;
+   cvbr->base.set_primitive = sp_vbuf_set_primitive;
+   cvbr->base.draw = sp_vbuf_draw;
+   cvbr->base.draw_arrays = sp_vbuf_draw_arrays;
+   cvbr->base.release_vertices = sp_vbuf_release_vertices;
+   cvbr->base.destroy = sp_vbuf_destroy;
 
-   sp->vbuf = draw_vbuf_stage(sp->draw, &sp->vbuf_render->base);
+   cvbr->softpipe = sp;
 
-   draw_set_rasterize_stage(sp->draw, sp->vbuf);
+   cvbr->setup = sp_setup_create_context(cvbr->softpipe);
+   if (!cvbr->setup) {
+      FREE(cvbr);
+      return NULL;
+   }
 
-   draw_set_render(sp->draw, &sp->vbuf_render->base);
+   return &cvbr->base;
 }
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.h b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
index 1de9cc2a894..ad01cc2f289 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.h
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
@@ -31,8 +31,8 @@
 
 struct softpipe_context;
 
-extern void
-sp_init_vbuf(struct softpipe_context *softpipe);
+extern struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *softpipe);
 
 
 #endif /* SP_VBUF_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h
index bd6c6cb9123..a3236bd1169 100644
--- a/src/gallium/drivers/softpipe/sp_quad.h
+++ b/src/gallium/drivers/softpipe/sp_quad.h
@@ -97,10 +97,10 @@ struct quad_header {
    struct quad_header_inout inout;
    struct quad_header_output output;
 
-   const struct tgsi_interp_coef *coef;
+   /* Redundant/duplicated:
+    */
    const struct tgsi_interp_coef *posCoef;
-
-   unsigned nr_attrs;
+   const struct tgsi_interp_coef *coef;
 };
 
 #endif /* SP_QUAD_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
deleted file mode 100644
index 0845bae0e68..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
+++ /dev/null
@@ -1,108 +0,0 @@
-
-/**
- * quad alpha test
- */
-
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-
-
-static void
-alpha_test_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   const float ref = softpipe->depth_stencil->alpha.ref_value;
-   unsigned passMask = 0x0, j;
-   const uint cbuf = 0; /* only output[0].alpha is tested */
-   const float *aaaa = quad->output.color[cbuf][3];
-
-   switch (softpipe->depth_stencil->alpha.func) {
-   case PIPE_FUNC_NEVER:
-      break;
-   case PIPE_FUNC_LESS:
-      /*
-       * If mask were an array [4] we could do this SIMD-style:
-       * passMask = (quad->outputs.color[0][3] <= vec4(ref));
-       */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] < ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_EQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] == ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_LEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] <= ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GREATER:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] > ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] != ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] >= ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_ALWAYS:
-      passMask = MASK_ALL;
-      break;
-   default:
-      assert(0);
-   }
-
-   quad->inout.mask &= passMask;
-
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
-}
-
-
-static void alpha_test_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void alpha_test_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *
-sp_quad_alpha_test_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = alpha_test_begin;
-   stage->run = alpha_test_quad;
-   stage->destroy = alpha_test_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index b1e18805c70..fe6b6cec353 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -117,644 +117,873 @@ do { \
 
 
 static void
-logicop_quad(struct quad_stage *qs, struct quad_header *quad)
+logicop_quad(struct quad_stage *qs, 
+             float (*quadColor)[4],
+             float (*dest)[4])
 {
    struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
+   ubyte src[4][4], dst[4][4], res[4][4];
+   uint *src4 = (uint *) src;
+   uint *dst4 = (uint *) dst;
+   uint *res4 = (uint *) res;
+   uint j;
+
+
+   /* convert to ubyte */
+   for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
+      dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
+      dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
+      dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
+      dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
+
+      src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
+      src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
+      src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
+      src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
+   }
 
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float dest[4][QUAD_SIZE];
-      ubyte src[4][4], dst[4][4], res[4][4];
-      uint *src4 = (uint *) src;
-      uint *dst4 = (uint *) dst;
-      uint *res4 = (uint *) res;
-      struct softpipe_cached_tile *
-         tile = sp_get_cached_tile(softpipe,
-                                   softpipe->cbuf_cache[cbuf],
-                                   quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
+   switch (softpipe->blend->logicop_func) {
+   case PIPE_LOGICOP_CLEAR:
+      for (j = 0; j < 4; j++)
+         res4[j] = 0;
+      break;
+   case PIPE_LOGICOP_NOR:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] | dst4[j]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j] & dst4[j];
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j];
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] & ~dst4[j];
+      break;
+   case PIPE_LOGICOP_INVERT:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~dst4[j];
+      break;
+   case PIPE_LOGICOP_XOR:
+      for (j = 0; j < 4; j++)
+         res4[j] = dst4[j] ^ src4[j];
+      break;
+   case PIPE_LOGICOP_NAND:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] & dst4[j]);
+      break;
+   case PIPE_LOGICOP_AND:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] & dst4[j];
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] ^ dst4[j]);
+      break;
+   case PIPE_LOGICOP_NOOP:
+      for (j = 0; j < 4; j++)
+         res4[j] = dst4[j];
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j] | dst4[j];
+      break;
+   case PIPE_LOGICOP_COPY:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j];
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] | ~dst4[j];
+      break;
+   case PIPE_LOGICOP_OR:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] | dst4[j];
+      break;
+   case PIPE_LOGICOP_SET:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~0;
+      break;
+   default:
+      assert(0);
+   }
 
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
-         }
-      }
+   for (j = 0; j < 4; j++) {
+      quadColor[j][0] = ubyte_to_float(res[j][0]);
+      quadColor[j][1] = ubyte_to_float(res[j][1]);
+      quadColor[j][2] = ubyte_to_float(res[j][2]);
+      quadColor[j][3] = ubyte_to_float(res[j][3]);
+   }
+}
 
-      /* convert to ubyte */
-      for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
-         dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
-         dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
-         dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
-         dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
-
-         src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
-         src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
-         src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
-         src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
-      }
 
-      switch (softpipe->blend->logicop_func) {
-      case PIPE_LOGICOP_CLEAR:
-         for (j = 0; j < 4; j++)
-            res4[j] = 0;
-         break;
-      case PIPE_LOGICOP_NOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] | dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j];
-         break;
-      case PIPE_LOGICOP_AND_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & ~dst4[j];
-         break;
-      case PIPE_LOGICOP_INVERT:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~dst4[j];
-         break;
-      case PIPE_LOGICOP_XOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j] ^ src4[j];
-         break;
-      case PIPE_LOGICOP_NAND:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] & dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_EQUIV:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] ^ dst4[j]);
-         break;
-      case PIPE_LOGICOP_NOOP:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j];
-         break;
-      case PIPE_LOGICOP_OR_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j];
-         break;
-      case PIPE_LOGICOP_OR_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | ~dst4[j];
-         break;
-      case PIPE_LOGICOP_OR:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_SET:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~0;
-         break;
-      default:
-         assert(0);
-      }
 
-      for (j = 0; j < 4; j++) {
-         quadColor[j][0] = ubyte_to_float(res[j][0]);
-         quadColor[j][1] = ubyte_to_float(res[j][1]);
-         quadColor[j][2] = ubyte_to_float(res[j][2]);
-         quadColor[j][3] = ubyte_to_float(res[j][3]);
-      }
+static void
+blend_quad(struct quad_stage *qs, 
+           float (*quadColor)[4],
+           float (*dest)[4])
+{
+   static const float zero[4] = { 0, 0, 0, 0 };
+   static const float one[4] = { 1, 1, 1, 1 };
+   struct softpipe_context *softpipe = qs->softpipe;
+   float source[4][QUAD_SIZE];
+
+   /*
+    * Compute src/first term RGB
+    */
+   switch (softpipe->blend->rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[0], quadColor[0]); /* R */
+      VEC4_COPY(source[1], quadColor[1]); /* G */
+      VEC4_COPY(source[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+   {
+      const float *alpha = dest[3];
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+   {
+      const float *alpha = quadColor[3];
+      float diff[4], temp[4];
+      VEC4_SUB(diff, one, dest[3]);
+      VEC4_MIN(temp, alpha, diff);
+      VEC4_MUL(source[0], quadColor[0], temp); /* R */
+      VEC4_MUL(source[1], quadColor[1], temp); /* G */
+      VEC4_MUL(source[2], quadColor[2], temp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float alpha[4];
+      VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[0], zero); /* R */
+      VEC4_COPY(source[1], zero); /* G */
+      VEC4_COPY(source[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, dest[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(source[0], quadColor[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(source[1], quadColor[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(source[2], quadColor[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Compute src/first term A
+    */
+   switch (softpipe->blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* multiply alpha by 1.0 */
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(source[3], quadColor[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, dest[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      /* A */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
    }
 
-   /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
-}
 
+   /*
+    * Compute dest/second term RGB
+    */
+   switch (softpipe->blend->rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
+      VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
+      VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
+      VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
+      VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+   {
+      const float *alpha = quadColor[3];
+      float diff[4], temp[4];
+      VEC4_SUB(diff, one, dest[3]);
+      VEC4_MIN(temp, alpha, diff);
+      VEC4_MUL(dest[0], quadColor[0], temp); /* R */
+      VEC4_MUL(dest[1], quadColor[1], temp); /* G */
+      VEC4_MUL(dest[2], quadColor[2], temp); /* B */
+   }
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[0], zero); /* R */
+      VEC4_COPY(dest[1], zero); /* G */
+      VEC4_COPY(dest[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      /* XXX dual-source blend factors (second shader color); to do */
+      assert(0);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[3]); /* A */
+      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* XXX inverse dual-source blend factors: unsupported here */
+      assert(0);
+      break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Compute dest/second term A
+    */
+   switch (softpipe->blend->alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[3], dest[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[3]); /* A */
+      VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[3], dest[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Combine RGB terms
+    */
+   switch (softpipe->blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Combine A terms
+    */
+   switch (softpipe->blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   default:
+      assert(0);
+   }
+}
 
 static void
-blend_quad(struct quad_stage *qs, struct quad_header *quad)
+colormask_quad(struct quad_stage *qs,
+               float (*quadColor)[4],
+               float (*dest)[4])
 {
-   static const float zero[4] = { 0, 0, 0, 0 };
-   static const float one[4] = { 1, 1, 1, 1 };
+   struct softpipe_context *softpipe = qs->softpipe;
+
+   /* R */
+   if (!(softpipe->blend->colormask & PIPE_MASK_R))
+      COPY_4V(quadColor[0], dest[0]);
+
+   /* G */
+   if (!(softpipe->blend->colormask & PIPE_MASK_G))
+      COPY_4V(quadColor[1], dest[1]);
+
+   /* B */
+   if (!(softpipe->blend->colormask & PIPE_MASK_B))
+      COPY_4V(quadColor[2], dest[2]);
 
+   /* A */
+   if (!(softpipe->blend->colormask & PIPE_MASK_A))
+      COPY_4V(quadColor[3], dest[3]);
+}
+
+
+static void
+blend_fallback(struct quad_stage *qs, 
+               struct quad_header *quads[],
+               unsigned nr)
+{
    struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
+   const struct pipe_blend_state *blend = softpipe->blend;
+   unsigned cbuf;
+
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) 
+   {
+      float dest[4][QUAD_SIZE];
+      struct softpipe_cached_tile *tile
+         = sp_get_cached_tile(softpipe->cbuf_cache[cbuf],
+                              quads[0]->input.x0, 
+                              quads[0]->input.y0);
+      uint q, i, j;
+
+      for (q = 0; q < nr; q++) {
+         struct quad_header *quad = quads[q];
+         float (*quadColor)[4] = quad->output.color[cbuf];
+         const int itx = (quad->input.x0 & (TILE_SIZE-1));
+         const int ity = (quad->input.y0 & (TILE_SIZE-1));
+
+         /* get/swizzle dest colors 
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) {
+               dest[i][j] = tile->data.color[y][x][i];
+            }
+         }
+
+
+         if (blend->logicop_enable) {
+            logicop_quad( qs, quadColor, dest );
+         }
+         else if (blend->blend_enable) {
+            blend_quad( qs, quadColor, dest );
+         }
 
-   if (softpipe->blend->logicop_enable) {
-      logicop_quad(qs, quad);
-      return;
+         if (blend->colormask != 0xf)
+            colormask_quad( qs, quadColor, dest );
+   
+         /* Output color values
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            if (quad->inout.mask & (1 << j)) {
+               int x = itx + (j & 1);
+               int y = ity + (j >> 1);
+               for (i = 0; i < 4; i++) { /* loop over color chans */
+                  tile->data.color[y][x][i] = quadColor[i][j];
+               }
+            }
+         }
+      }
    }
+}
 
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float source[4][QUAD_SIZE], dest[4][QUAD_SIZE];
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
 
+static void
+blend_single_add_src_alpha_inv_src_alpha(struct quad_stage *qs, 
+                                         struct quad_header *quads[],
+                                         unsigned nr)
+{
+   static const float one[4] = { 1, 1, 1, 1 };
+   float one_minus_alpha[QUAD_SIZE];
+   float dest[4][QUAD_SIZE];
+   float source[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const float *alpha = quadColor[3];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
       /* get/swizzle dest colors */
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
          for (i = 0; i < 4; i++) {
             dest[i][j] = tile->data.color[y][x][i];
          }
       }
 
-      /*
-       * Compute src/first term RGB
-       */
-      switch (softpipe->blend->rgb_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[0], quadColor[0]); /* R */
-         VEC4_COPY(source[1], quadColor[1]); /* G */
-         VEC4_COPY(source[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         {
-            const float *alpha = quadColor[3];
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         {
-            const float *alpha = dest[3];
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         {
-            const float *alpha = quadColor[3];
-            float diff[4], temp[4];
-            VEC4_SUB(diff, one, dest[3]);
-            VEC4_MIN(temp, alpha, diff);
-            VEC4_MUL(source[0], quadColor[0], temp); /* R */
-            VEC4_MUL(source[1], quadColor[1], temp); /* G */
-            VEC4_MUL(source[2], quadColor[2], temp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], comp); /* R */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], comp); /* G */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float alpha[4];
-            VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[0], zero); /* R */
-         VEC4_COPY(source[1], zero); /* G */
-         VEC4_COPY(source[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, quadColor[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, dest[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, dest[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, dest[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         {
-            float inv_comp[4];
-            /* R */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-            VEC4_MUL(source[0], quadColor[0], inv_comp);
-            /* G */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-            VEC4_MUL(source[1], quadColor[1], inv_comp);
-            /* B */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-            VEC4_MUL(source[2], quadColor[2], inv_comp);
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+
+      VEC4_SUB(one_minus_alpha, one, alpha);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
+
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      default:
-         assert(0);
       }
+   }
+}
 
-      /*
-       * Compute src/first term A
-       */
-      switch (softpipe->blend->alpha_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         {
-            const float *alpha = quadColor[3];
-            VEC4_MUL(source[3], quadColor[3], alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         /* multiply alpha by 1.0 */
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(source[3], quadColor[3], comp); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, quadColor[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, dest[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            /* A */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_comp);
+static void
+blend_single_add_one_one(struct quad_stage *qs, 
+                         struct quad_header *quads[],
+                         unsigned nr)
+{
+   float dest[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      /* get/swizzle dest colors */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
+         for (i = 0; i < 4; i++) {
+            dest[i][j] = tile->data.color[y][x][i];
          }
-         break;
-      default:
-         assert(0);
       }
+     
+      VEC4_ADD_SAT(quadColor[0], quadColor[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], quadColor[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], quadColor[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], quadColor[3], dest[3]); /* A */
 
-
-      /*
-       * Compute dest/second term RGB
-       */
-      switch (softpipe->blend->rgb_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-            VEC4_MUL(dest[0], dest[0], comp); /* R */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-            VEC4_MUL(dest[1], dest[1], comp); /* G */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-            VEC4_MUL(dest[2], dest[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(dest[0], dest[0], comp); /* R */
-            VEC4_MUL(dest[1], dest[1], comp); /* G */
-            VEC4_MUL(dest[2], dest[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[0], zero); /* R */
-         VEC4_COPY(dest[1], zero); /* G */
-         VEC4_COPY(dest[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float one_minus_alpha[QUAD_SIZE];
-            VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-            VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
-            VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
-            VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[3]); /* A */
-            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[0]); /* R */
-            VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, dest[1]); /* G */
-            VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, dest[2]); /* B */
-            VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         {
-            float inv_comp[4];
-            /* R */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-            VEC4_MUL(dest[0], dest[0], inv_comp);
-            /* G */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-            VEC4_MUL(dest[1], dest[1], inv_comp);
-            /* B */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-            VEC4_MUL(dest[2], dest[2], inv_comp);
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(dest[0], dest[0], inv_comp);
-            VEC4_MUL(dest[1], dest[1], inv_comp);
-            VEC4_MUL(dest[2], dest[2], inv_comp);
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      default:
-         assert(0);
       }
+   }
+}
 
-      /*
-       * Compute dest/second term A
-       */
-      switch (softpipe->blend->alpha_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(dest[3], dest[3], comp); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float one_minus_alpha[QUAD_SIZE];
-            VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-            VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[3]); /* A */
-            VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(dest[3], dest[3], inv_comp);
+
+static void
+single_output_color(struct quad_stage *qs, 
+                    struct quad_header *quads[],
+                    unsigned nr)
+{
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      default:
-         assert(0);
       }
+   }
+}
+
+static void
+blend_noop(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+}
 
-      /*
-       * Combine RGB terms
-       */
-      switch (softpipe->blend->rgb_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      default:
-         assert(0);
-      }
 
-      /*
-       * Combine A terms
-       */
-      switch (softpipe->blend->alpha_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      default:
-         assert(0);
+static void
+choose_blend_quad(struct quad_stage *qs, 
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   const struct pipe_blend_state *blend = softpipe->blend;
+
+   qs->run = blend_fallback;
+   
+   if (softpipe->framebuffer.nr_cbufs == 0) {
+      qs->run = blend_noop;
+   }
+   else if (!softpipe->blend->logicop_enable &&
+            softpipe->blend->colormask == 0xf &&
+            softpipe->framebuffer.nr_cbufs == 1)
+   {
+      if (!blend->blend_enable) {
+         qs->run = single_output_color;
       }
+      else if (blend->rgb_src_factor == blend->alpha_src_factor &&
+               blend->rgb_dst_factor == blend->alpha_dst_factor &&
+               blend->rgb_func == blend->alpha_func)
+      {
+         if (blend->alpha_func == PIPE_BLEND_ADD) {
+            if (blend->rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
+                blend->rgb_dst_factor == PIPE_BLENDFACTOR_ONE) {
+               qs->run = blend_single_add_one_one;
+            }
+            else if (blend->rgb_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA &&
+                blend->rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
+               qs->run = blend_single_add_src_alpha_inv_src_alpha;
 
-   } /* cbuf loop */
+         }
+      }
+   }
 
-   /* pass blended quad to next stage */
-   qs->next->run(qs->next, quad);
+   qs->run(qs, quads, nr);
 }
 
 
 static void blend_begin(struct quad_stage *qs)
 {
-   qs->next->begin(qs->next);
+   qs->run = choose_blend_quad;
 }
 
 
@@ -770,7 +999,7 @@ struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe )
 
    stage->softpipe = softpipe;
    stage->begin = blend_begin;
-   stage->run = blend_quad;
+   stage->run = choose_blend_quad;
    stage->destroy = blend_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c
deleted file mode 100644
index 953d8516b90..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_bufloop.c
+++ /dev/null
@@ -1,74 +0,0 @@
-
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * Loop over colorbuffers, passing quad to next stage each time.
- */
-static void
-cbuf_loop_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   float tmp[PIPE_MAX_COLOR_BUFS][4][QUAD_SIZE];
-   unsigned i;
-
-   assert(sizeof(quad->outputs.color) == sizeof(tmp));
-   assert(softpipe->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
-
-   /* make copy of original colors since they can get modified
-    * by blending and masking.
-    * XXX we won't have to do this if the fragment program actually emits
-    * N separate colors and we're drawing to N color buffers (MRT).
-    * But if we emitted one color and glDrawBuffer(GL_FRONT_AND_BACK) is
-    * in effect, we need to save/restore colors like this.
-    */
-   memcpy(tmp, quad->outputs.color, sizeof(tmp));
-
-   for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
-      /* set current cbuffer */
-#if 0 /* obsolete & going away */
-      softpipe->current_cbuf = i;
-#endif
-
-      /* pass blended quad to next stage */
-      qs->next->run(qs->next, quad);
-
-      /* restore quad's colors for next buffer */
-      memcpy(quad->outputs.color, tmp, sizeof(tmp));
-   }
-}
-
-
-static void cbuf_loop_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void cbuf_loop_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-/**
- * Create the colorbuffer loop stage.
- * This is used to implement multiple render targets and GL_FRONT_AND_BACK
- * rendering.
- */
-struct quad_stage *sp_quad_bufloop_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = cbuf_loop_begin;
-   stage->run = cbuf_loop_quad;
-   stage->destroy = cbuf_loop_destroy;
-
-   return stage;
-}
-
diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c
deleted file mode 100644
index dc90e5d5e99..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_colormask.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief  quad colormask stage
- * \author Brian Paul
- */
-
-#include "pipe/p_defines.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-#include "sp_tile_cache.h"
-
-
-
-/**
- * XXX colormask could be rolled into blending...
- */
-static void
-colormask_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
-
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float dest[4][QUAD_SIZE];
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
-
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
-         }
-      }
-
-      /* R */
-      if (!(softpipe->blend->colormask & PIPE_MASK_R))
-          COPY_4V(quadColor[0], dest[0]);
-
-      /* G */
-      if (!(softpipe->blend->colormask & PIPE_MASK_G))
-          COPY_4V(quadColor[1], dest[1]);
-
-      /* B */
-      if (!(softpipe->blend->colormask & PIPE_MASK_B))
-          COPY_4V(quadColor[2], dest[2]);
-
-      /* A */
-      if (!(softpipe->blend->colormask & PIPE_MASK_A))
-          COPY_4V(quadColor[3], dest[3]);
-   }
-
-   /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
-}
-
-
-static void colormask_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void colormask_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = colormask_begin;
-   stage->run = colormask_quad;
-   stage->destroy = colormask_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c
deleted file mode 100644
index 4aeee858705..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_coverage.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-/**
- * \brief  Apply AA coverage to quad alpha valus
- * \author  Brian Paul
- */
-
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * Multiply quad's alpha values by the fragment coverage.
- */
-static void
-coverage_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   const uint prim = quad->input.prim;
-
-   if ((softpipe->rasterizer->poly_smooth && prim == QUAD_PRIM_TRI) ||
-       (softpipe->rasterizer->line_smooth && prim == QUAD_PRIM_LINE) ||
-       (softpipe->rasterizer->point_smooth && prim == QUAD_PRIM_POINT)) {
-      uint cbuf;
-
-      /* loop over colorbuffer outputs */
-      for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-         float (*quadColor)[4] = quad->output.color[cbuf];
-         unsigned j;
-         for (j = 0; j < QUAD_SIZE; j++) {
-            assert(quad->input.coverage[j] >= 0.0);
-            assert(quad->input.coverage[j] <= 1.0);
-         quadColor[3][j] *= quad->input.coverage[j];
-         }
-      }
-   }
-
-   qs->next->run(qs->next, quad);
-}
-
-
-static void coverage_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void coverage_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = coverage_begin;
-   stage->run = coverage_quad;
-   stage->destroy = coverage_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index d463930bae1..0ca86c4e1cb 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -31,61 +31,109 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
+#include "tgsi/tgsi_scan.h"
 #include "sp_context.h"
 #include "sp_quad.h"
 #include "sp_surface.h"
 #include "sp_quad_pipe.h"
 #include "sp_tile_cache.h"
+#include "sp_state.h"           /* for sp_fragment_shader */
 
 
-/**
- * Do depth testing for a quad.
- * Not static since it's used by the stencil code.
- */
+struct depth_data {   /* working state handed to the depth/stencil helper functions in this file */
+   struct pipe_surface *ps;   /**< NOTE(review): presumably the depth/stencil surface -- confirm where this is filled in */
+   enum pipe_format format;   /**< selects the per-format path in the get/convert/write helpers */
+   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
+   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
+   ubyte stencilVals[QUAD_SIZE];   /**< stencil values unpacked from the tile; updated in place by apply_stencil_op */
+   struct softpipe_cached_tile *tile;   /**< cached tile the Z/stencil values are read from and written back to */
+};
 
-/*
- * To increase efficiency, we should probably have multiple versions
- * of this function that are specifically for Z16, Z32 and FP Z buffers.
- * Try to effectively do that with codegen...
- */
 
-void
-sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
+
+static void
+get_depth_stencil_values( struct depth_data *data,
+                          const struct quad_header *quad )
 {
-   struct softpipe_context *softpipe = qs->softpipe;
-   struct pipe_surface *ps = softpipe->framebuffer.zsbuf;
-   const enum pipe_format format = ps->format;
-   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
-   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
-   unsigned zmask = 0;
    unsigned j;
-   struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
+   const struct softpipe_cached_tile *tile = data->tile;   /* data->tile must already be set by the caller */
+
+   switch (data->format) {   /* unpack Z (and stencil, where the format has it) for the quad's pixels */
+   case PIPE_FORMAT_Z16_UNORM:   /* 16-bit Z; data->stencilVals is not written */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);   /* pixel j is offset by (j & 1, j >> 1) from the quad origin */
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth16[y][x];
+      }
+      break;
+   case PIPE_FORMAT_Z32_UNORM:   /* 32-bit Z; data->stencilVals is not written */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x];
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:   /* Z in bits 0..23; bits 24..31 go to stencilVals (for X8Z24 that is the X8 byte) */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] & 0xffffff;
+         data->stencilVals[j] = tile->data.depth32[y][x] >> 24;
+      }
+   break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:   /* Z in bits 8..31; bits 0..7 go to stencilVals (for Z24X8 that is the X8 byte) */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] >> 8;
+         data->stencilVals[j] = tile->data.depth32[y][x] & 0xff;
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
 
-   assert(ps); /* shouldn't get here if there's no zbuffer */
+/* If the shader has not been run, interpolate the depth values ourselves:
+ * evaluate posCoef's Z plane (a0/dadx/dady, index 2) at (x0,y0), then step by
+ * dzdx / dzdy for the other three pixels (1: x+1, 2: y+1, 3: x+1,y+1). */
+static void
+interpolate_quad_depth( struct quad_header *quad )
+{
+   const float fx = (float) quad->input.x0;
+   const float fy = (float) quad->input.y0;
+   const float dzdx = quad->posCoef->dadx[2];
+   const float dzdy = quad->posCoef->dady[2];
+   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
 
-   /*
-    * Convert quad's float depth values to int depth values (qzzzz).
+   quad->output.depth[0] = z0;
+   quad->output.depth[1] = z0 + dzdx;
+   quad->output.depth[2] = z0 + dzdy;
+   quad->output.depth[3] = z0 + dzdx + dzdy;
+}
+
+
+static void
+convert_quad_depth( struct depth_data *data, 
+                    const struct quad_header *quad )
+{
+   unsigned j;
+
+   /* Convert quad's float depth values to int depth values (qzzzz).
     * If the Z buffer stores integer values, we _have_ to do the depth
     * compares with integers (not floats).  Otherwise, the float->int->float
     * conversion of Z values (which isn't an identity function) will cause
     * Z-fighting errors.
-    *
-    * Also, get the zbuffer values (bzzzz) from the cached tile.
     */
-   switch (format) {
+   switch (data->format) {
    case PIPE_FORMAT_Z16_UNORM:
       {
          float scale = 65535.0;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth16[y][x];
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
@@ -94,47 +142,247 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          double scale = (double) (uint) ~0UL;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x];
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
    case PIPE_FORMAT_X8Z24_UNORM:
-      /* fall-through */
    case PIPE_FORMAT_S8Z24_UNORM:
       {
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x] & 0xffffff;
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
    case PIPE_FORMAT_Z24X8_UNORM:
-      /* fall-through */
    case PIPE_FORMAT_Z24S8_UNORM:
       {
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
 
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x] >> 8;
+
+
+static void
+write_depth_stencil_values( struct depth_data *data,
+                            struct quad_header *quad )
+{
+   struct softpipe_cached_tile *tile = data->tile;
+   unsigned j;
+
+   /* pack data->bzzzz (and data->stencilVals for S8Z24 / Z24S8) back into the cached tile, per format */
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:   /* bzzzz truncated to 16 bits */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth16[y][x] = (ushort) data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM:   /* handled like Z32: bzzzz stored as-is, stencilVals not merged in */
+   case PIPE_FORMAT_Z32_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:   /* stencil in bits 24..31, Z in bits 0..23 */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = (data->stencilVals[j] << 24) | data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:   /* Z in bits 8..31, stencil in bits 0..7 */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = (data->bzzzz[j] << 8) | data->stencilVals[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24X8_UNORM:   /* Z in bits 8..31; low byte written as zero */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j] << 8;
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+
+
+/** Only 8-bit stencil supported */
+#define STENCIL_MAX 0xff
+
+
+/**
+ * Do the basic stencil test: compare the masked reference value against each
+ * masked stencil value, evaluated as (ref & valMask) FUNC (stencil & valMask).
+ *
+ * \param data  supplies data->stencilVals, the stencil values from the stencil buffer
+ * \param func  the stencil func (PIPE_FUNC_x)
+ * \param ref  the stencil reference value
+ * \param valMask  the stencil value mask indicating which bits of the stencil
+ *                 values and ref value are to be used.
+ * \return bitmask with bit j set when pixel j of the quad passed the stencil test
+ */
+static unsigned
+do_stencil_test(struct depth_data *data,
+                unsigned func,
+                unsigned ref, unsigned valMask)
+{
+   unsigned passMask = 0x0;
+   unsigned j;
+
+   ref &= valMask;   /* mask the reference once; stencil values are masked per pixel below */
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:
+      /* passMask = 0x0 */
+      break;
+   case PIPE_FUNC_LESS:   /* ref is the left-hand operand: passes when ref < stencil */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref < (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_EQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref == (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_LEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref <= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GREATER:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref > (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref != (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref >= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_ALWAYS:
+      passMask = MASK_ALL;
+      break;
+   default:
+      assert(0);
+   }
+
+   return passMask;
+}
+
+
+/**
+ * Apply the stencil operator to stencil values.
+ *
+ * \param data->stencilVals  the stencil buffer values (read and written)
+ * \param mask  indicates which pixels to update
+ * \param op  the stencil operator (PIPE_STENCIL_OP_x)
+ * \param ref  the stencil reference value
+ * \param wrtMask  writemask controlling which bits are changed in the
+ *                 stencil values
+ */
+static void
+apply_stencil_op(struct depth_data *data,
+                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask)
+{
+   unsigned j;
+   ubyte newstencil[QUAD_SIZE];
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      newstencil[j] = data->stencilVals[j];
+   }
+
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* no-op */
+      break;
+   case PIPE_STENCIL_OP_ZERO:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = 0;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ref;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] < STENCIL_MAX) {
+               newstencil[j] = data->stencilVals[j] + 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] > 0) {
+               newstencil[j] = data->stencilVals[j] - 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] + 1;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] - 1;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ~data->stencilVals[j];
          }
       }
       break;
@@ -142,6 +390,39 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
       assert(0);
    }
 
+   /*
+    * update the stencil values
+    */
+   if (wrtMask != STENCIL_MAX) {
+      /* apply bit-wise stencil buffer writemask */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & data->stencilVals[j]);
+      }
+   }
+   else {
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = newstencil[j];
+      }
+   }
+}
+
+   
+
+/*
+ * To increase efficiency, we should probably have multiple versions
+ * of this function that are specifically for Z16, Z32 and FP Z buffers.
+ * Try to effectively do that with codegen...
+ */
+
+static boolean
+depth_test_quad(struct quad_stage *qs, 
+                struct depth_data *data,
+                struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned zmask = 0;
+   unsigned j;
+
    switch (softpipe->depth_stencil->depth.func) {
    case PIPE_FUNC_NEVER:
       /* zmask = 0 */
@@ -151,37 +432,37 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
        * Like this:  quad->mask &= (quad->outputs.depth < zzzz);
        */
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] < bzzzz[j]) 
+	 if (data->qzzzz[j] < data->bzzzz[j]) 
 	    zmask |= 1 << j;
       }
       break;
    case PIPE_FUNC_EQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] == bzzzz[j]) 
+	 if (data->qzzzz[j] == data->bzzzz[j]) 
 	    zmask |= 1 << j;
       }
       break;
    case PIPE_FUNC_LEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] <= bzzzz[j]) 
+	 if (data->qzzzz[j] <= data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_GREATER:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] > bzzzz[j]) 
+	 if (data->qzzzz[j] > data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_NOTEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] != bzzzz[j]) 
+	 if (data->qzzzz[j] != data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_GEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] >= bzzzz[j]) 
+	 if (data->qzzzz[j] >= data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
@@ -193,80 +474,480 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
    }
 
    quad->inout.mask &= zmask;
+   if (quad->inout.mask == 0)
+      return FALSE;
 
+   /* Update our internal copy only if writemask set.  Even if
+    * depth.writemask is FALSE, may still need to write out buffer
+    * data due to stencil changes.
+    */
    if (softpipe->depth_stencil->depth.writemask) {
-      
-      /* This is also efficient with sse / spe instructions: 
-       */
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (quad->inout.mask & (1 << j)) {
-	    bzzzz[j] = qzzzz[j];
-	 }
+         if (quad->inout.mask & (1 << j)) {
+            data->bzzzz[j] = data->qzzzz[j];
+         }
       }
+   }
 
-      /* put updated Z values back into cached tile */
-      switch (format) {
-      case PIPE_FORMAT_Z16_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth16[y][x] = (ushort) bzzzz[j];
+   return TRUE;
+}
+
+
+
+/**
+ * Do stencil (and depth) testing.  Stenciling depends on the outcome of
+ * depth testing.
+ */
+static void
+depth_stencil_test_quad(struct quad_stage *qs, 
+                        struct depth_data *data,
+                        struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned func, zFailOp, zPassOp, failOp;
+   ubyte ref, wrtMask, valMask;
+   uint face = quad->input.facing;
+
+   if (!softpipe->depth_stencil->stencil[1].enabled) {
+      /* single-sided stencil test, use front (face=0) state */
+      face = 0;
+   }
+
+   /* choose front or back face function, operator, etc */
+   /* XXX we could do these initializations once per primitive */
+   func    = softpipe->depth_stencil->stencil[face].func;
+   failOp  = softpipe->depth_stencil->stencil[face].fail_op;
+   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op;
+   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op;
+   ref     = softpipe->depth_stencil->stencil[face].ref_value;
+   wrtMask = softpipe->depth_stencil->stencil[face].writemask;
+   valMask = softpipe->depth_stencil->stencil[face].valuemask;
+
+
+   /* do the stencil test first */
+   {
+      unsigned passMask, failMask;
+      passMask = do_stencil_test(data, func, ref, valMask);
+      failMask = quad->inout.mask & ~passMask;
+      quad->inout.mask &= passMask;
+
+      if (failOp != PIPE_STENCIL_OP_KEEP) {
+         apply_stencil_op(data, failMask, failOp, ref, wrtMask);
+      }
+   }
+
+   if (quad->inout.mask) {
+      /* now the pixels that passed the stencil test are depth tested */
+      if (softpipe->depth_stencil->depth.enabled) {
+         const unsigned origMask = quad->inout.mask;
+
+         depth_test_quad(qs, data, quad);  /* quad->mask is updated */
+
+         /* update stencil buffer values according to z pass/fail result */
+         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zFailMask = origMask & ~quad->inout.mask;
+            apply_stencil_op(data, zFailMask, zFailOp, ref, wrtMask);
          }
-         break;
-      case PIPE_FORMAT_X8Z24_UNORM:
-         /* fall-through */
-         /* (yes, this falls through to a different case than above) */
-      case PIPE_FORMAT_Z32_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth32[y][x] = bzzzz[j];
+
+         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zPassMask = origMask & quad->inout.mask;
+            apply_stencil_op(data, zPassMask, zPassOp, ref, wrtMask);
          }
-         break;
-      case PIPE_FORMAT_S8Z24_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            uint s8z24 = tile->data.depth32[y][x];
-            s8z24 = (s8z24 & 0xff000000) | bzzzz[j];
-            tile->data.depth32[y][x] = s8z24;
+      }
+      else {
+         /* no depth test, apply Zpass operator to stencil buffer values */
+         apply_stencil_op(data, quad->inout.mask, zPassOp, ref, wrtMask);
+      }
+   }
+}
+
+/* Emit alpha_test_quads_FUNC(): mask pixels by "alpha COMP ref", compact quads[] in place, return count kept. */
+#define ALPHATEST( FUNC, COMP )                                         \
+   static int                                                          \
+   alpha_test_quads_##FUNC( struct quad_stage *qs,                      \
+                           struct quad_header *quads[],                 \
+                           unsigned nr )                                \
+   {                                                                    \
+      const float ref = qs->softpipe->depth_stencil->alpha.ref_value;   \
+      const uint cbuf = 0; /* only output[0].alpha is tested */         \
+      unsigned pass_nr = 0;                                             \
+      unsigned i;                                                       \
+                                                                        \
+      for (i = 0; i < nr; i++) {                                        \
+         const float *aaaa = quads[i]->output.color[cbuf][3];           \
+         unsigned passMask = 0;                                         \
+                                                                        \
+         if (aaaa[0] COMP ref) passMask |= (1 << 0);                    \
+         if (aaaa[1] COMP ref) passMask |= (1 << 1);                    \
+         if (aaaa[2] COMP ref) passMask |= (1 << 2);                    \
+         if (aaaa[3] COMP ref) passMask |= (1 << 3);                    \
+                                                                        \
+         quads[i]->inout.mask &= passMask;                              \
+                                                                        \
+         if (quads[i]->inout.mask)                                      \
+            quads[pass_nr++] = quads[i];                                \
+      }                                                                 \
+                                                                        \
+      return pass_nr;                                                   \
+   }
+
+
+ALPHATEST( LESS,     < )
+ALPHATEST( EQUAL,    == )
+ALPHATEST( LEQUAL,   <= )
+ALPHATEST( GREATER,  > )
+ALPHATEST( NOTEQUAL, != )
+ALPHATEST( GEQUAL,   >= )
+
+
+/* Run the alpha test selected by alpha.func over the batch; returns the
+ * number of quads left in quads[].  XXX: Incorporate into shader using KILP.
+ */
+static int
+alpha_test_quads(struct quad_stage *qs,
+                 struct quad_header *quads[],
+                 unsigned nr)
+{
+   switch (qs->softpipe->depth_stencil->alpha.func) {
+   case PIPE_FUNC_LESS:
+      return alpha_test_quads_LESS( qs, quads, nr );
+   case PIPE_FUNC_EQUAL:
+      return alpha_test_quads_EQUAL( qs, quads, nr );
+   case PIPE_FUNC_LEQUAL:
+      return alpha_test_quads_LEQUAL( qs, quads, nr );
+   case PIPE_FUNC_GREATER:
+      return alpha_test_quads_GREATER( qs, quads, nr );
+   case PIPE_FUNC_NOTEQUAL:
+      return alpha_test_quads_NOTEQUAL( qs, quads, nr );
+   case PIPE_FUNC_GEQUAL:
+      return alpha_test_quads_GEQUAL( qs, quads, nr );
+   case PIPE_FUNC_ALWAYS:
+      return nr;
+   case PIPE_FUNC_NEVER:
+   default:
+      return 0;
+   }
+}
+
+/* Number of set bits in each 4-bit quad mask (index = mask); used for occlusion counting. */
+static const unsigned mask_count[16] = {
+   0,                           /* 0x0 */
+   1,                           /* 0x1 */
+   1,                           /* 0x2 */
+   2,                           /* 0x3 */
+   1,                           /* 0x4 */
+   2,                           /* 0x5 */
+   2,                           /* 0x6 */
+   3,                           /* 0x7 */
+   1,                           /* 0x8 */
+   2,                           /* 0x9 */
+   2,                           /* 0xa */
+   3,                           /* 0xb */
+   2,                           /* 0xc */
+   3,                           /* 0xd */
+   3,                           /* 0xe */
+   4,                           /* 0xf */
+};
+
+
+
+static void
+depth_test_quads_fallback(struct quad_stage *qs, 
+                          struct quad_header *quads[],
+                          unsigned nr)
+{
+   unsigned i, pass = 0;
+   const struct sp_fragment_shader *fs = qs->softpipe->fs;
+   boolean interp_depth = !fs->info.writes_z;
+   struct depth_data data;
+
+   /* General path: alpha test, per-quad depth and/or stencil test, occlusion count, then forward. */
+   if (qs->softpipe->depth_stencil->alpha.enabled) {
+      nr = alpha_test_quads(qs, quads, nr);
+   }
+
+   if (qs->softpipe->framebuffer.zsbuf && 
+       (qs->softpipe->depth_stencil->depth.enabled ||
+        qs->softpipe->depth_stencil->stencil[0].enabled)) {
+      /* One tile lookup serves the batch; NOTE(review): assumes all quads share quads[0]'s tile. */
+      data.ps = qs->softpipe->framebuffer.zsbuf;
+      data.format = data.ps->format;
+      data.tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, 
+                                     quads[0]->input.x0, 
+                                     quads[0]->input.y0);
+
+      for (i = 0; i < nr; i++) {
+         get_depth_stencil_values(&data, quads[i]);
+
+         if (qs->softpipe->depth_stencil->depth.enabled) {
+            if (interp_depth)
+               interpolate_quad_depth(quads[i]);
+
+            convert_quad_depth(&data, quads[i]);
          }
-         break;
-      case PIPE_FORMAT_Z24S8_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            uint z24s8 = tile->data.depth32[y][x];
-            z24s8 = (z24s8 & 0xff) | (bzzzz[j] << 8);
-            tile->data.depth32[y][x] = z24s8;
+
+         if (qs->softpipe->depth_stencil->stencil[0].enabled) {
+            depth_stencil_test_quad(qs, &data, quads[i]);
+            write_depth_stencil_values(&data, quads[i]);
          }
-         break;
-      case PIPE_FORMAT_Z24X8_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth32[y][x] = bzzzz[j] << 8;
+         else {
+            if (!depth_test_quad(qs, &data, quads[i]))
+               continue;
+
+            if (qs->softpipe->depth_stencil->depth.writemask)
+               write_depth_stencil_values(&data, quads[i]);
          }
-         break;
-      default:
-         assert(0);
+
+         /* stencil path forwards the quad even if its mask is now empty */
+         quads[pass++] = quads[i];
+      }
+
+      nr = pass;
+   }
+
+   if (qs->softpipe->active_query_count) {
+      for (i = 0; i < nr; i++) 
+         qs->softpipe->occlusion_count += mask_count[quads[i]->inout.mask];
+   }
+
+   if (nr)
+      qs->next->run(qs->next, quads, nr);
+}
+
+/* Fast path: interpolated Z, Z16 buffer, PIPE_FUNC_LESS, depth writes enabled.
+ * XXX: this function assumes setup function actually emits linear
+ * spans of quads.  It seems a lot more natural to do (early)
+ * depth-testing on spans rather than quads.
+ */
+static void
+depth_interp_z16_less_write(struct quad_stage *qs,
+                            struct quad_header *quads[],
+                            unsigned nr)
+{
+   unsigned i, pass = 0;
+   const unsigned ix = quads[0]->input.x0;
+   const unsigned iy = quads[0]->input.y0;
+   const float fx = (float) ix;
+   const float fy = (float) iy;
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   const float z0 = quads[0]->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   struct softpipe_cached_tile *tile;
+   ushort (*depth16)[TILE_SIZE];
+   ushort idepth[4], depth_step;
+   const float scale = 65535.0;
+
+   idepth[0] = (ushort)((z0) * scale);
+   idepth[1] = (ushort)((z0 + dzdx) * scale);
+   idepth[2] = (ushort)((z0 + dzdy) * scale);
+   idepth[3] = (ushort)((z0 + dzdx + dzdy) * scale);
+
+   /* Z step per 2-pixel-wide quad; cast via int so a negative slope wraps (float->ushort is UB) */
+   depth_step = (ushort)(int)(dzdx * 2 * scale);
+
+   tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, ix, iy);
+
+   depth16 = (ushort (*)[TILE_SIZE])
+      &tile->data.depth16[iy % TILE_SIZE][ix % TILE_SIZE];
+
+   for (i = 0; i < nr; i++) {
+      unsigned outmask = quads[i]->inout.mask;
+      unsigned mask = 0;
+      if ((outmask & 1) && idepth[0] < depth16[0][0]) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if ((outmask & 2) && idepth[1] < depth16[0][1]) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if ((outmask & 4) && idepth[2] < depth16[1][0]) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if ((outmask & 8) && idepth[3] < depth16[1][1]) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
       }
+
+      idepth[0] += depth_step;
+      idepth[1] += depth_step;
+      idepth[2] += depth_step;
+      idepth[3] += depth_step;
+
+      depth16 = (ushort (*)[TILE_SIZE]) &depth16[0][2];
+      quads[i]->inout.mask = mask;
+      if (quads[i]->inout.mask)
+         quads[pass++] = quads[i];
    }
+
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
+
 }
 
 
 static void
-depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
+depth_interp_z16_lequal_write(struct quad_stage *qs, 
+                            struct quad_header *quads[],
+                            unsigned nr)
 {
-   sp_depth_test_quad(qs, quad);
+   unsigned i, pass = 0;
+   const unsigned ix = quads[0]->input.x0;
+   const unsigned iy = quads[0]->input.y0;
+   const float fx = (float) ix;
+   const float fy = (float) iy;
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   const float z0 = quads[0]->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   struct softpipe_cached_tile *tile;
+   ushort (*depth16)[TILE_SIZE];
+   ushort idepth[4], depth_step;
+   const float scale = 65535.0;
+
+   idepth[0] = (ushort)((z0) * scale);
+   idepth[1] = (ushort)((z0 + dzdx) * scale);
+   idepth[2] = (ushort)((z0 + dzdy) * scale);
+   idepth[3] = (ushort)((z0 + dzdx + dzdy) * scale);
+
+   depth_step = (ushort)(dzdx * 2 * scale);
+
+   tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, ix, iy);
+
+   depth16 = (ushort (*)[TILE_SIZE])
+      &tile->data.depth16[iy % TILE_SIZE][ix % TILE_SIZE];
+
+   for (i = 0; i < nr; i++) {
+      unsigned outmask = quads[i]->inout.mask;
+      unsigned mask = 0;
+      
+      if ((outmask & 1) && idepth[0] <= depth16[0][0]) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if ((outmask & 2) && idepth[1] <= depth16[0][1]) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if ((outmask & 4) && idepth[2] <= depth16[1][0]) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if ((outmask & 8) && idepth[3] <= depth16[1][1]) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
+      }
+
+      idepth[0] += depth_step;
+      idepth[1] += depth_step;
+      idepth[2] += depth_step;
+      idepth[3] += depth_step;
+
+      depth16 = (ushort (*)[TILE_SIZE]) &depth16[0][2];
+
+      quads[i]->inout.mask = mask;
+      if (quads[i]->inout.mask)
+         quads[pass++] = quads[i];
+   }
+
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
 }
 
 
+
+
+/* Pass-through stage action: hand the batch to the next stage unchanged. */
+static void
+depth_noop(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+   qs->next->run(qs->next, quads, nr);
+}
+
+
+/* Pick the run() variant matching the current state, install it in qs->run and run this batch. */
+static void
+choose_depth_test(struct quad_stage *qs,
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   boolean interp_depth = !qs->softpipe->fs->info.writes_z;
+
+   boolean alpha = qs->softpipe->depth_stencil->alpha.enabled;
+
+   boolean depth = (qs->softpipe->framebuffer.zsbuf &&
+                    qs->softpipe->depth_stencil->depth.enabled);
+
+   unsigned depthfunc = qs->softpipe->depth_stencil->depth.func;
+
+   boolean stencil = qs->softpipe->depth_stencil->stencil[0].enabled;
+
+   boolean depthwrite = qs->softpipe->depth_stencil->depth.writemask;
+
+   boolean occlusion = qs->softpipe->active_query_count;
+
+   /* depth_noop does no occlusion counting, so it is only valid with no active query */
+   if (!alpha &&
+       !depth && !occlusion &&
+       !stencil) {
+      qs->run = depth_noop;
+   }
+   else if (!alpha &&
+            interp_depth &&
+            depth &&
+            depthwrite &&
+            !occlusion &&
+            !stencil)
+   {
+      switch (depthfunc) {
+      case PIPE_FUNC_LESS:
+         switch (qs->softpipe->framebuffer.zsbuf->format) {
+         case PIPE_FORMAT_Z16_UNORM:
+            qs->run = depth_interp_z16_less_write;
+            break;
+         default:
+            qs->run = depth_test_quads_fallback;
+            break;
+         }
+         break;
+      case PIPE_FUNC_LEQUAL:
+         switch (qs->softpipe->framebuffer.zsbuf->format) {
+         case PIPE_FORMAT_Z16_UNORM:
+            qs->run = depth_interp_z16_lequal_write;
+            break;
+         default:
+            qs->run = depth_test_quads_fallback;
+            break;
+         }
+         break;
+      default:
+         qs->run = depth_test_quads_fallback;
+      }
+   }
+   else {
+      qs->run = depth_test_quads_fallback;
+   }
+
+   /* run this batch with the variant just installed; later batches call it directly */
+   qs->run( qs, quads, nr );
+}
+
+
+
+
+
 static void depth_test_begin(struct quad_stage *qs)
 {
+   qs->run = choose_depth_test;   /* re-select the run() variant after each begin */
    qs->next->begin(qs->next);
 }
 
@@ -283,7 +964,7 @@ struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe )
 
    stage->softpipe = softpipe;
    stage->begin = depth_test_begin;
-   stage->run = depth_test_quad;
+   stage->run = choose_depth_test;
    stage->destroy = depth_test_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
deleted file mode 100644
index 496fd39ed1a..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_earlyz.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief  Quad early-z testing
- */
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * All this stage does is compute the quad's Z values (which is normally
- * done by the shading stage).
- * The next stage will do the actual depth test.
- */
-static void
-earlyz_quad(
-   struct quad_stage    *qs,
-   struct quad_header   *quad )
-{
-   const float fx = (float) quad->input.x0;
-   const float fy = (float) quad->input.y0;
-   const float dzdx = quad->posCoef->dadx[2];
-   const float dzdy = quad->posCoef->dady[2];
-   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
-
-   quad->output.depth[0] = z0;
-   quad->output.depth[1] = z0 + dzdx;
-   quad->output.depth[2] = z0 + dzdy;
-   quad->output.depth[3] = z0 + dzdx + dzdy;
-
-   qs->next->run( qs->next, quad );
-}
-
-static void
-earlyz_begin(
-   struct quad_stage *qs )
-{
-   qs->next->begin( qs->next );
-}
-
-static void
-earlyz_destroy(
-   struct quad_stage *qs )
-{
-   FREE( qs );
-}
-
-struct quad_stage *
-sp_quad_earlyz_stage(
-   struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT( quad_stage );
-
-   stage->softpipe = softpipe;
-   stage->begin = earlyz_begin;
-   stage->run = earlyz_quad;
-   stage->destroy = earlyz_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 28f8d1a60ea..1e7533d0f9e 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -68,72 +68,69 @@ quad_shade_stage(struct quad_stage *qs)
 /**
  * Execute fragment shader for the four fragments in the quad.
  */
-static void
+static INLINE boolean
 shade_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct quad_shade_stage *qss = quad_shade_stage( qs );
    struct softpipe_context *softpipe = qs->softpipe;
    struct tgsi_exec_machine *machine = qss->machine;
-   boolean z_written;
-   
-   /* Consts do not require 16 byte alignment. */
-   machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
-
-   machine->InterpCoefs = quad->coef;
 
    /* run shader */
-   quad->inout.mask &= softpipe->fs->run( softpipe->fs, machine, quad );
-
-   /* store outputs */
-   z_written = FALSE;
-   {
-      const ubyte *sem_name = softpipe->fs->info.output_semantic_name;
-      const ubyte *sem_index = softpipe->fs->info.output_semantic_index;
-      const uint n = qss->stage.softpipe->fs->info.num_outputs;
-      uint i;
-      for (i = 0; i < n; i++) {
-         switch (sem_name[i]) {
-         case TGSI_SEMANTIC_COLOR:
-            {
-               uint cbuf = sem_index[i];
-               memcpy(quad->output.color[cbuf],
-                      &machine->Outputs[i].xyzw[0].f[0],
-                      sizeof(quad->output.color[0]) );
-            }
-            break;
-         case TGSI_SEMANTIC_POSITION:
-            {
-               uint j;
-               for (j = 0; j < 4; j++) {
-                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j];
-               }
-               z_written = TRUE;
-            }
-            break;
-         }
+   return softpipe->fs->run( softpipe->fs, machine, quad );
+}
+
+
+
+static void
+coverage_quad(struct quad_stage *qs, struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   uint cbuf;
+
+   /* loop over colorbuffer outputs */
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
+      float (*quadColor)[4] = quad->output.color[cbuf];
+      unsigned j;
+      for (j = 0; j < QUAD_SIZE; j++) {
+         assert(quad->input.coverage[j] >= 0.0);
+         assert(quad->input.coverage[j] <= 1.0);
+         quadColor[3][j] *= quad->input.coverage[j];
       }
    }
+}
+
 
-   if (!z_written) {
-      /* compute Z values now, as in the quad earlyz stage */
-      /* XXX we should really only do this if the earlyz stage is not used */
-      const float fx = (float) quad->input.x0;
-      const float fy = (float) quad->input.y0;
-      const float dzdx = quad->posCoef->dadx[2];
-      const float dzdy = quad->posCoef->dady[2];
-      const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
-
-      quad->output.depth[0] = z0;
-      quad->output.depth[1] = z0 + dzdx;
-      quad->output.depth[2] = z0 + dzdy;
-      quad->output.depth[3] = z0 + dzdx + dzdy;
-   }
 
-   /* shader may cull fragments */
-   if (quad->inout.mask) {
-      qs->next->run( qs->next, quad );
+static void
+shade_quads(struct quad_stage *qs, 
+                 struct quad_header *quads[],
+                 unsigned nr)
+{
+   struct quad_shade_stage *qss = quad_shade_stage( qs );
+   struct softpipe_context *softpipe = qs->softpipe;
+   struct tgsi_exec_machine *machine = qss->machine;
+
+   unsigned i, pass = 0;
+   
+   machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
+   machine->InterpCoefs = quads[0]->coef;
+
+   for (i = 0; i < nr; i++) {
+      if (!shade_quad(qs, quads[i]))
+         continue;
+
+      if (/*do_coverage*/ 0)
+         coverage_quad( qs, quads[i] );
+
+      quads[pass++] = quads[i];
    }
+   
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 }
+   
+
+
 
 
 /**
@@ -174,7 +171,7 @@ sp_quad_shade_stage( struct softpipe_context *softpipe )
 
    qss->stage.softpipe = softpipe;
    qss->stage.begin = shade_begin;
-   qss->stage.run = shade_quad;
+   qss->stage.run = shade_quads;
    qss->stage.destroy = shade_destroy;
 
    qss->machine = tgsi_exec_machine_create();
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
deleted file mode 100644
index 92d5f9f3c1a..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-#include "sp_tile_cache.h"
-
-
-/**
- * Last step of quad processing: write quad colors to the framebuffer,
- * taking mask into account.
- */
-static void
-output_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   /* in-tile pos: */
-   const int itx = quad->input.x0 % TILE_SIZE;
-   const int ity = quad->input.y0 % TILE_SIZE;
-
-   struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
-
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      int i, j;
-
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (quad->inout.mask & (1 << j)) {
-            int x = itx + (j & 1);
-            int y = ity + (j >> 1);
-            for (i = 0; i < 4; i++) { /* loop over color chans */
-               tile->data.color[y][x][i] = quadColor[i][j];
-            }
-            if (0) {
-               debug_printf("sp write pixel %d,%d: %g, %g, %g\n",
-                            quad->input.x0 + x,
-                            quad->input.y0 + y,
-                            quadColor[0][j],
-                            quadColor[1][j],
-                            quadColor[2][j]);
-            }
-         }
-      }
-   }
-}
-
-
-static void output_begin(struct quad_stage *qs)
-{
-   assert(qs->next == NULL);
-}
-
-
-static void output_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = output_begin;
-   stage->run = output_quad;
-   stage->destroy = output_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c
index b5f69b74264..1b5bab4eca6 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.c
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c
@@ -31,88 +31,33 @@
 #include "pipe/p_shader_tokens.h"
 
 static void
-sp_push_quad_first(
-   struct softpipe_context *sp,
-   struct quad_stage *quad,
-   uint i )
+sp_push_quad_first( struct softpipe_context *sp,
+                    struct quad_stage *quad )
 {
-   quad->next = sp->quad[i].first;
-   sp->quad[i].first = quad;
+   quad->next = sp->quad.first;
+   sp->quad.first = quad;
 }
 
-static void
-sp_build_depth_stencil(
-   struct softpipe_context *sp,
-   uint i )
-{
-   if (sp->depth_stencil->stencil[0].enabled ||
-       sp->depth_stencil->stencil[1].enabled) {
-      sp_push_quad_first( sp, sp->quad[i].stencil_test, i );
-   }
-   else if (sp->depth_stencil->depth.enabled &&
-            sp->framebuffer.zsbuf) {
-      sp_push_quad_first( sp, sp->quad[i].depth_test, i );
-   }
-}
 
 void
 sp_build_quad_pipeline(struct softpipe_context *sp)
 {
-   uint i;
-
    boolean early_depth_test =
-               sp->depth_stencil->depth.enabled &&
-               sp->framebuffer.zsbuf &&
-               !sp->depth_stencil->alpha.enabled &&
-               !sp->fs->info.uses_kill &&
-               !sp->fs->info.writes_z;
-
-   /* build up the pipeline in reverse order... */
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      sp->quad[i].first = sp->quad[i].output;
-
-      if (sp->blend->colormask != 0xf) {
-         sp_push_quad_first( sp, sp->quad[i].colormask, i );
-      }
+      sp->depth_stencil->depth.enabled &&
+      sp->framebuffer.zsbuf &&
+      !sp->depth_stencil->alpha.enabled &&
+      !sp->fs->info.uses_kill &&
+      !sp->fs->info.writes_z;
 
-      if (sp->blend->blend_enable ||
-          sp->blend->logicop_enable) {
-         sp_push_quad_first( sp, sp->quad[i].blend, i );
-      }
+   sp->quad.first = sp->quad.blend;
 
-      if (sp->active_query_count) {
-         sp_push_quad_first( sp, sp->quad[i].occlusion, i );
-      }
-
-      if (sp->rasterizer->poly_smooth ||
-          sp->rasterizer->line_smooth ||
-          sp->rasterizer->point_smooth) {
-         sp_push_quad_first( sp, sp->quad[i].coverage, i );
-      }
-
-      if (!early_depth_test) {
-         sp_build_depth_stencil( sp, i );
-      }
-
-      if (sp->depth_stencil->alpha.enabled) {
-         sp_push_quad_first( sp, sp->quad[i].alpha_test, i );
-      }
-
-      /* XXX always enable shader? */
-      if (1) {
-         sp_push_quad_first( sp, sp->quad[i].shade, i );
-      }
-
-      if (early_depth_test) {
-         sp_build_depth_stencil( sp, i );
-         sp_push_quad_first( sp, sp->quad[i].earlyz, i );
-      }
-
-#if !USE_DRAW_STAGE_PSTIPPLE
-      if (sp->rasterizer->poly_stipple_enable) {
-         sp_push_quad_first( sp, sp->quad[i].polygon_stipple, i );
-      }
-#endif
+   if (early_depth_test) {
+      sp_push_quad_first( sp, sp->quad.shade );
+      sp_push_quad_first( sp, sp->quad.depth_test );
+   }
+   else {
+      sp_push_quad_first( sp, sp->quad.depth_test );
+      sp_push_quad_first( sp, sp->quad.shade );
    }
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.h b/src/gallium/drivers/softpipe/sp_quad_pipe.h
index 0e40586ffc8..c0aa1348319 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.h
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.h
@@ -49,7 +49,7 @@ struct quad_stage {
    void (*begin)(struct quad_stage *qs);
 
    /** the stage action */
-   void (*run)(struct quad_stage *qs, struct quad_header *quad);
+   void (*run)(struct quad_stage *qs, struct quad_header *quad[], unsigned nr);
 
    void (*destroy)(struct quad_stage *qs);
 };
@@ -69,6 +69,4 @@ struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe );
 
 void sp_build_quad_pipeline(struct softpipe_context *sp);
 
-void sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad);
-
 #endif /* SP_QUAD_PIPE_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c
deleted file mode 100644
index 5e9d447737d..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_stencil.c
+++ /dev/null
@@ -1,352 +0,0 @@
-
-/**
- * \brief Quad stencil testing
- */
-
-
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_tile_cache.h"
-#include "sp_quad_pipe.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-
-
-/** Only 8-bit stencil supported */
-#define STENCIL_MAX 0xff
-
-
-/**
- * Do the basic stencil test (compare stencil buffer values against the
- * reference value.
- *
- * \param stencilVals  the stencil values from the stencil buffer
- * \param func  the stencil func (PIPE_FUNC_x)
- * \param ref  the stencil reference value
- * \param valMask  the stencil value mask indicating which bits of the stencil
- *                 values and ref value are to be used.
- * \return mask indicating which pixels passed the stencil test
- */
-static unsigned
-do_stencil_test(const ubyte stencilVals[QUAD_SIZE], unsigned func,
-                unsigned ref, unsigned valMask)
-{
-   unsigned passMask = 0x0;
-   unsigned j;
-
-   ref &= valMask;
-
-   switch (func) {
-   case PIPE_FUNC_NEVER:
-      /* passMask = 0x0 */
-      break;
-   case PIPE_FUNC_LESS:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref < (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_EQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref == (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_LEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref <= (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GREATER:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref > (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref != (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref >= (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_ALWAYS:
-      passMask = MASK_ALL;
-      break;
-   default:
-      assert(0);
-   }
-
-   return passMask;
-}
-
-
-/**
- * Apply the stencil operator to stencil values.
- *
- * \param stencilVals  the stencil buffer values (read and written)
- * \param mask  indicates which pixels to update
- * \param op  the stencil operator (PIPE_STENCIL_OP_x)
- * \param ref  the stencil reference value
- * \param wrtMask  writemask controlling which bits are changed in the
- *                 stencil values
- */
-static void
-apply_stencil_op(ubyte stencilVals[QUAD_SIZE],
-                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask)
-{
-   unsigned j;
-   ubyte newstencil[QUAD_SIZE];
-
-   for (j = 0; j < QUAD_SIZE; j++) {
-      newstencil[j] = stencilVals[j];
-   }
-
-   switch (op) {
-   case PIPE_STENCIL_OP_KEEP:
-      /* no-op */
-      break;
-   case PIPE_STENCIL_OP_ZERO:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = 0;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_REPLACE:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = ref;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INCR:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            if (stencilVals[j] < STENCIL_MAX) {
-               newstencil[j] = stencilVals[j] + 1;
-            }
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_DECR:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            if (stencilVals[j] > 0) {
-               newstencil[j] = stencilVals[j] - 1;
-            }
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INCR_WRAP:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = stencilVals[j] + 1;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_DECR_WRAP:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = stencilVals[j] - 1;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INVERT:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = ~stencilVals[j];
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   /*
-    * update the stencil values
-    */
-   if (wrtMask != STENCIL_MAX) {
-      /* apply bit-wise stencil buffer writemask */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & stencilVals[j]);
-      }
-   }
-   else {
-      for (j = 0; j < QUAD_SIZE; j++) {
-         stencilVals[j] = newstencil[j];
-      }
-   }
-}
-
-
-/**
- * Do stencil (and depth) testing.  Stenciling depends on the outcome of
- * depth testing.
- */
-static void
-stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   struct pipe_surface *ps = softpipe->framebuffer.zsbuf;
-   unsigned func, zFailOp, zPassOp, failOp;
-   ubyte ref, wrtMask, valMask;
-   ubyte stencilVals[QUAD_SIZE];
-   struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
-   uint j;
-   uint face = quad->input.facing;
-
-   if (!softpipe->depth_stencil->stencil[1].enabled) {
-      /* single-sided stencil test, use front (face=0) state */
-      face = 0;
-   }
-
-   /* choose front or back face function, operator, etc */
-   /* XXX we could do these initializations once per primitive */
-   func    = softpipe->depth_stencil->stencil[face].func;
-   failOp  = softpipe->depth_stencil->stencil[face].fail_op;
-   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op;
-   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op;
-   ref     = softpipe->depth_stencil->stencil[face].ref_value;
-   wrtMask = softpipe->depth_stencil->stencil[face].writemask;
-   valMask = softpipe->depth_stencil->stencil[face].valuemask;
-
-   assert(ps); /* shouldn't get here if there's no stencil buffer */
-
-   /* get stencil values from cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] >> 24;
-      }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] & 0xff;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.stencil8[y][x];
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   /* do the stencil test first */
-   {
-      unsigned passMask, failMask;
-      passMask = do_stencil_test(stencilVals, func, ref, valMask);
-      failMask = quad->inout.mask & ~passMask;
-      quad->inout.mask &= passMask;
-
-      if (failOp != PIPE_STENCIL_OP_KEEP) {
-         apply_stencil_op(stencilVals, failMask, failOp, ref, wrtMask);
-      }
-   }
-
-   if (quad->inout.mask) {
-      /* now the pixels that passed the stencil test are depth tested */
-      if (softpipe->depth_stencil->depth.enabled) {
-         const unsigned origMask = quad->inout.mask;
-
-         sp_depth_test_quad(qs, quad);  /* quad->mask is updated */
-
-         /* update stencil buffer values according to z pass/fail result */
-         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned failMask = origMask & ~quad->inout.mask;
-            apply_stencil_op(stencilVals, failMask, zFailOp, ref, wrtMask);
-         }
-
-         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned passMask = origMask & quad->inout.mask;
-            apply_stencil_op(stencilVals, passMask, zPassOp, ref, wrtMask);
-         }
-      }
-      else {
-         /* no depth test, apply Zpass operator to stencil buffer values */
-         apply_stencil_op(stencilVals, quad->inout.mask, zPassOp, ref, wrtMask);
-      }
-
-   }
-
-   /* put new stencil values into cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint s8z24 = tile->data.depth32[y][x];
-         s8z24 = (stencilVals[j] << 24) | (s8z24 & 0xffffff);
-         tile->data.depth32[y][x] = s8z24;
-      }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint z24s8 = tile->data.depth32[y][x];
-         z24s8 = (z24s8 & 0xffffff00) | stencilVals[j];
-         tile->data.depth32[y][x] = z24s8;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         tile->data.stencil8[y][x] = stencilVals[j];
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
-}
-
-
-static void stencil_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void stencil_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = stencil_begin;
-   stage->run = stencil_test_quad;
-   stage->destroy = stencil_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c
index 07162db7b6e..a0527a596a6 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stipple.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c
@@ -14,14 +14,20 @@
  * Apply polygon stipple to quads produced by triangle rasterization
  */
 static void
-stipple_quad(struct quad_stage *qs, struct quad_header *quad)
+stipple_quad(struct quad_stage *qs, struct quad_header *quads[], unsigned nr)
 {
    static const uint bit31 = 1 << 31;
    static const uint bit30 = 1 << 30;
+   unsigned pass = nr;
+
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned q;
+
+   pass = 0;
+
+   for (q = 0; q < nr; q++)  {
+      struct quad_header *quad = quads[q];
 
-   if (quad->input.prim == QUAD_PRIM_TRI) {
-      struct softpipe_context *softpipe = qs->softpipe;
-      /* need to invert Y to index into OpenGL's stipple pattern */
       const int col0 = quad->input.x0 % 32;
       const int y0 = quad->input.y0;
       const int y1 = y0 + 1;
@@ -41,13 +47,11 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad)
       if ((stipple1 & (bit30 >> col0)) == 0)
          quad->inout.mask &= ~MASK_BOTTOM_RIGHT;
 
-      if (!quad->inout.mask) {
-         /* all fragments failed stipple test, end of quad pipeline */
-         return;
-      }
+      if (quad->inout.mask)
+         quads[pass++] = quad;
    }
 
-   qs->next->run(qs->next, quad);
+   qs->next->run(qs->next, quads, pass);
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 769425bd12c..81fb7aa20c6 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -40,7 +40,7 @@
 static const char *
 softpipe_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
@@ -65,8 +65,6 @@ softpipe_get_param(struct pipe_screen *screen, int param)
       return 1;
    case PIPE_CAP_GLSL:
       return 1;
-   case PIPE_CAP_S3TC:
-      return 0;
    case PIPE_CAP_ANISOTROPIC_FILTER:
       return 0;
    case PIPE_CAP_POINT_SPRITE:
@@ -137,10 +135,14 @@ softpipe_is_format_supported( struct pipe_screen *screen,
           target == PIPE_TEXTURE_CUBE);
 
    switch(format) {
+   case PIPE_FORMAT_L16_UNORM:
+   case PIPE_FORMAT_YCBCR_REV:
+   case PIPE_FORMAT_YCBCR:
    case PIPE_FORMAT_DXT1_RGB:
    case PIPE_FORMAT_DXT1_RGBA:
    case PIPE_FORMAT_DXT3_RGBA:
    case PIPE_FORMAT_DXT5_RGBA:
+   case PIPE_FORMAT_Z32_FLOAT:
       return FALSE;
    default:
       return TRUE;
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index de3ae3c3696..615581b95f9 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -33,7 +33,6 @@
  */
 
 #include "sp_context.h"
-#include "sp_prim_setup.h"
 #include "sp_quad.h"
 #include "sp_quad_pipe.h"
 #include "sp_setup.h"
@@ -61,87 +60,9 @@ struct edge {
    int lines;		/**< number of lines on this edge */
 };
 
-#if SP_NUM_QUAD_THREADS > 1
 
-/* Set to 1 if you want other threads to be instantly
- * notified of pending jobs.
- */
-#define INSTANT_NOTEMPTY_NOTIFY 0
-
-struct thread_info
-{
-   struct setup_context *setup;
-   uint id;
-   pipe_thread handle;
-};
-
-struct quad_job;
-
-typedef void (* quad_job_routine)( struct setup_context *setup, uint thread, struct quad_job *job );
-
-struct quad_job
-{
-   struct quad_header_input input;
-   struct quad_header_inout inout;
-   quad_job_routine routine;
-};
-
-#define NUM_QUAD_JOBS 64
-
-struct quad_job_que
-{
-   struct quad_job jobs[NUM_QUAD_JOBS];
-   uint first;
-   uint last;
-   pipe_mutex que_mutex;
-   pipe_condvar que_notfull_condvar;
-   pipe_condvar que_notempty_condvar;
-   uint jobs_added;
-   uint jobs_done;
-   pipe_condvar que_done_condvar;
-};
+#define MAX_QUADS 16
 
-static void
-add_quad_job( struct quad_job_que *que, struct quad_header *quad, quad_job_routine routine )
-{
-#if INSTANT_NOTEMPTY_NOTIFY
-   boolean empty;
-#endif
-
-   /* Wait for empty slot, see if the que is empty.
-    */
-   pipe_mutex_lock( que->que_mutex );
-   while ((que->last + 1) % NUM_QUAD_JOBS == que->first) {
-#if !INSTANT_NOTEMPTY_NOTIFY
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-#endif
-      pipe_condvar_wait( que->que_notfull_condvar, que->que_mutex );
-   }
-#if INSTANT_NOTEMPTY_NOTIFY
-   empty = que->last == que->first;
-#endif
-   que->jobs_added++;
-   pipe_mutex_unlock( que->que_mutex );
-
-   /* Submit new job.
-    */
-   que->jobs[que->last].input = quad->input;
-   que->jobs[que->last].inout = quad->inout;
-   que->jobs[que->last].routine = routine;
-   que->last = (que->last + 1) % NUM_QUAD_JOBS;
-
-#if INSTANT_NOTEMPTY_NOTIFY
-   /* If the que was empty, notify consumers there's a job to be done.
-    */
-   if (empty) {
-      pipe_mutex_lock( que->que_mutex );
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-   }
-#endif
-}
-
-#endif
 
 /**
  * Triangle setup info (derived from draw_stage).
@@ -164,22 +85,21 @@ struct setup_context {
    struct edge emaj;
 
    float oneoverarea;
+   int facing;
+
+   float pixel_offset;
+
+   struct quad_header quad[MAX_QUADS];
+   struct quad_header *quad_ptrs[MAX_QUADS];
+   unsigned count;
 
    struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
    struct tgsi_interp_coef posCoef;  /* For Z, W */
-   struct quad_header quad;
-
-#if SP_NUM_QUAD_THREADS > 1
-   struct quad_job_que que;
-   struct thread_info threads[SP_NUM_QUAD_THREADS];
-#endif
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
       int right[2];
       int y;
-      unsigned y_flags;
-      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
    } span;
 
 #if DEBUG_FRAGS
@@ -188,69 +108,9 @@ struct setup_context {
 #endif
 
    unsigned winding;		/* which winding to cull */
+   unsigned nr_vertex_attrs;
 };
 
-#if SP_NUM_QUAD_THREADS > 1
-
-static PIPE_THREAD_ROUTINE( quad_thread, param )
-{
-   struct thread_info *info = (struct thread_info *) param;
-   struct quad_job_que *que = &info->setup->que;
-
-   for (;;) {
-      struct quad_job job;
-      boolean full;
-
-      /* Wait for an available job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      while (que->last == que->first)
-         pipe_condvar_wait( que->que_notempty_condvar, que->que_mutex );
-
-      /* See if the que is full.
-       */
-      full = (que->last + 1) % NUM_QUAD_JOBS == que->first;
-
-      /* Take a job and remove it from que.
-       */
-      job = que->jobs[que->first];
-      que->first = (que->first + 1) % NUM_QUAD_JOBS;
-
-      /* Notify the producer if the que is not full.
-       */
-      if (full)
-         pipe_condvar_signal( que->que_notfull_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-
-      job.routine( info->setup, info->id, &job );
-
-      /* Notify the producer if that's the last finished job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      que->jobs_done++;
-      if (que->jobs_added == que->jobs_done)
-         pipe_condvar_signal( que->que_done_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-   }
-
-   return NULL;
-}
-
-#define WAIT_FOR_COMPLETION(setup) \
-   do {\
-      pipe_mutex_lock( setup->que.que_mutex );\
-      if (!INSTANT_NOTEMPTY_NOTIFY)\
-         pipe_condvar_broadcast( setup->que.que_notempty_condvar );\
-      while (setup->que.jobs_added != setup->que.jobs_done)\
-         pipe_condvar_wait( setup->que.que_done_condvar, setup->que.que_mutex );\
-      pipe_mutex_unlock( setup->que.que_mutex );\
-   } while (0)
-
-#else
-
-#define WAIT_FOR_COMPLETION(setup) ((void) 0)
-
-#endif
 
 
 
@@ -313,98 +173,18 @@ quad_clip( struct setup_context *setup, struct quad_header *quad )
  * Emit a quad (pass to next stage) with clipping.
  */
 static INLINE void
-clip_emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
+clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
 {
    quad_clip( setup, quad );
+
    if (quad->inout.mask) {
       struct softpipe_context *sp = setup->softpipe;
 
-      sp->quad[thread].first->run( sp->quad[thread].first, quad );
+      sp->quad.first->run( sp->quad.first, &quad, 1 );
    }
 }
 
-#if SP_NUM_QUAD_THREADS > 1
-
-static void
-clip_emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
-{
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   clip_emit_quad( setup, &quad, thread );
-}
 
-#define CLIP_EMIT_QUAD(setup) add_quad_job( &setup->que, &setup->quad, clip_emit_quad_job )
-
-#else
-
-#define CLIP_EMIT_QUAD(setup) clip_emit_quad( setup, &setup->quad, 0 )
-
-#endif
-
-/**
- * Emit a quad (pass to next stage).  No clipping is done.
- */
-static INLINE void
-emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
-{
-   struct softpipe_context *sp = setup->softpipe;
-#if DEBUG_FRAGS
-   uint mask = quad->inout.mask;
-#endif
-
-#if DEBUG_FRAGS
-   if (mask & 1) setup->numFragsEmitted++;
-   if (mask & 2) setup->numFragsEmitted++;
-   if (mask & 4) setup->numFragsEmitted++;
-   if (mask & 8) setup->numFragsEmitted++;
-#endif
-   sp->quad[thread].first->run( sp->quad[thread].first, quad );
-#if DEBUG_FRAGS
-   mask = quad->inout.mask;
-   if (mask & 1) setup->numFragsWritten++;
-   if (mask & 2) setup->numFragsWritten++;
-   if (mask & 4) setup->numFragsWritten++;
-   if (mask & 8) setup->numFragsWritten++;
-#endif
-}
-
-#if SP_NUM_QUAD_THREADS > 1
-
-static void
-emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
-{
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   emit_quad( setup, &quad, thread );
-}
-
-#define EMIT_QUAD(setup,x,y,mask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = mask;\
-      add_quad_job( &setup->que, &setup->quad, emit_quad_job );\
-   } while (0)
-
-#else
-
-#define EMIT_QUAD(setup,x,y,mask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = mask;\
-      emit_quad( setup, &setup->quad, 0 );\
-   } while (0)
-
-#endif
 
 /**
  * Given an X or Y coordinate, return the block/quad coordinate that it
@@ -412,7 +192,12 @@ emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
  */
 static INLINE int block( int x )
 {
-   return x & ~1;
+   return x & ~(2-1);
+}
+
+static INLINE int block_x( int x )
+{
+   return x & ~(16-1);
 }
 
 
@@ -421,72 +206,63 @@ static INLINE int block( int x )
  */
 static void flush_spans( struct setup_context *setup )
 {
+   const int step = 16;
    const int xleft0 = setup->span.left[0];
    const int xleft1 = setup->span.left[1];
    const int xright0 = setup->span.right[0];
    const int xright1 = setup->span.right[1];
-   int minleft, maxright;
+   struct quad_stage *pipe = setup->softpipe->quad.first;
+
+
+   int minleft = block_x(MIN2(xleft0, xleft1));
+   int maxright = MAX2(xright0, xright1);
    int x;
 
-   switch (setup->span.y_flags) {
-   case 0x3:
-      /* both odd and even lines written (both quad rows) */
-      minleft = block(MIN2(xleft0, xleft1));
-      maxright = block(MAX2(xright0, xright1));
-      for (x = minleft; x <= maxright; x += 2) {
-         /* determine which of the four pixels is inside the span bounds */
-         uint mask = 0x0;
-         if (x >= xleft0 && x < xright0)
-            mask |= MASK_TOP_LEFT;
-         if (x >= xleft1 && x < xright1)
-            mask |= MASK_BOTTOM_LEFT;
-         if (x+1 >= xleft0 && x+1 < xright0)
-            mask |= MASK_TOP_RIGHT;
-         if (x+1 >= xleft1 && x+1 < xright1)
-            mask |= MASK_BOTTOM_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
-
-   case 0x1:
-      /* only even line written (quad top row) */
-      minleft = block(xleft0);
-      maxright = block(xright0);
-      for (x = minleft; x <= maxright; x += 2) {
-         uint mask = 0x0;
-         if (x >= xleft0 && x < xright0)
-            mask |= MASK_TOP_LEFT;
-         if (x+1 >= xleft0 && x+1 < xright0)
-            mask |= MASK_TOP_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
-
-   case 0x2:
-      /* only odd line written (quad bottom row) */
-      minleft = block(xleft1);
-      maxright = block(xright1);
-      for (x = minleft; x <= maxright; x += 2) {
-         uint mask = 0x0;
-         if (x >= xleft1 && x < xright1)
-            mask |= MASK_BOTTOM_LEFT;
-         if (x+1 >= xleft1 && x+1 < xright1)
-            mask |= MASK_BOTTOM_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
+   for (x = minleft; x < maxright; x += step) {
+      unsigned skip_left0 = CLAMP(xleft0 - x, 0, step);
+      unsigned skip_left1 = CLAMP(xleft1 - x, 0, step);
+      unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);
+      unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);
+      unsigned lx = x;
+      unsigned q = 0;
 
-   default:
-      return;
+      unsigned skipmask_left0 = (1U << skip_left0) - 1U;
+      unsigned skipmask_left1 = (1U << skip_left1) - 1U;
+
+      /* These calculations fail when step == 32 and skip_right == 0: the
+       * shift count is then 32, undefined where unsigned is 32 bits wide. */
+      unsigned skipmask_right0 = ~0U << (unsigned)(step - skip_right0);
+      unsigned skipmask_right1 = ~0U << (unsigned)(step - skip_right1);
+
+      unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;
+      unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;
+
+      if (mask0 | mask1) {
+         do {
+            unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
+            if (quadmask) {
+               setup->quad[q].input.x0 = lx;
+               setup->quad[q].input.y0 = setup->span.y;
+               setup->quad[q].input.facing = setup->facing;
+               setup->quad[q].inout.mask = quadmask;
+               setup->quad_ptrs[q] = &setup->quad[q];
+               q++;
+            }
+            mask0 >>= 2;
+            mask1 >>= 2;
+            lx += 2;
+         } while (mask0 | mask1);
+
+         pipe->run( pipe, setup->quad_ptrs, q );
+      }
    }
 
+
    setup->span.y = 0;
-   setup->span.y_flags = 0;
    setup->span.right[0] = 0;
    setup->span.right[1] = 0;
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
 }
 
 
@@ -495,8 +271,8 @@ static void print_vertex(const struct setup_context *setup,
                          const float (*v)[4])
 {
    int i;
-   debug_printf("   Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad.nr_attrs; i++) {
+   debug_printf("   Vertex: (%p)\n", (void *) v);
+   for (i = 0; i < setup->nr_vertex_attrs; i++) {
       debug_printf("     %d: %f %f %f %f\n",  i,
               v[i][0], v[i][1], v[i][2], v[i][3]);
       if (util_is_inf_or_nan(v[i][0])) {
@@ -601,7 +377,19 @@ static boolean setup_sort_vertices( struct setup_context *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.input.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup->facing = 
+      ((det > 0.0) ^ 
+       (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW));
+
+   /* Prepare pixel offset for rasterisation:
+    *  - pixel center (0.5, 0.5) for GL, or
+    *  - assume (0.0, 0.0) for other APIs.
+    */
+   if (setup->softpipe->rasterizer->gl_rasterization_rules) {
+      setup->pixel_offset = 0.5f;
+   } else {
+      setup->pixel_offset = 0.0f;
+   }
 
    return TRUE;
 }
@@ -651,7 +439,7 @@ static void tri_linear_coeff( struct setup_context *setup,
 
    /* calculate a0 as the value which would be sampled for the
     * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
+    * pixel centers, in other words (pixel_offset, pixel_offset).
     *
     * this is neat but unfortunately not a good way to do things for
     * triangles with very large values of dadx or dady as it will
@@ -662,8 +450,8 @@ static void tri_linear_coeff( struct setup_context *setup,
     * instead - i'll switch to this later.
     */
    coef->a0[i] = (setup->vmin[vertSlot][i] -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+                  (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                   dady * (setup->vmin[0][1] - setup->pixel_offset)));
 
    /*
    debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
@@ -711,8 +499,8 @@ static void tri_persp_coeff( struct setup_context *setup,
    coef->dadx[i] = dadx;
    coef->dady[i] = dady;
    coef->a0[i] = (mina -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+                  (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                   dady * (setup->vmin[0][1] - setup->pixel_offset)));
 }
 
 
@@ -788,7 +576,7 @@ static void setup_tri_coefficients( struct setup_context *setup )
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
@@ -799,12 +587,12 @@ static void setup_tri_coefficients( struct setup_context *setup )
 
 static void setup_tri_edges( struct setup_context *setup )
 {
-   float vmin_x = setup->vmin[0][0] + 0.5f;
-   float vmid_x = setup->vmid[0][0] + 0.5f;
+   float vmin_x = setup->vmin[0][0] + setup->pixel_offset;
+   float vmid_x = setup->vmid[0][0] + setup->pixel_offset;
 
-   float vmin_y = setup->vmin[0][1] - 0.5f;
-   float vmid_y = setup->vmid[0][1] - 0.5f;
-   float vmax_y = setup->vmax[0][1] - 0.5f;
+   float vmin_y = setup->vmin[0][1] - setup->pixel_offset;
+   float vmid_y = setup->vmid[0][1] - setup->pixel_offset;
+   float vmax_y = setup->vmax[0][1] - setup->pixel_offset;
 
    setup->emaj.sy = ceilf(vmin_y);
    setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
@@ -844,11 +632,10 @@ static void subtriangle( struct setup_context *setup,
 
    /* clip top/bottom */
    start_y = sy;
-   finish_y = sy + lines;
-
    if (start_y < miny)
       start_y = miny;
 
+   finish_y = sy + lines;
    if (finish_y > maxy)
       finish_y = maxy;
 
@@ -885,7 +672,6 @@ static void subtriangle( struct setup_context *setup,
 
          setup->span.left[_y&1] = left;
          setup->span.right[_y&1] = right;
-         setup->span.y_flags |= 1<<(_y&1);
       }
    }
 
@@ -923,7 +709,7 @@ calc_det( const float (*v0)[4],
 /**
  * Do setup for triangle rasterization, then render the triangle.
  */
-void setup_tri( struct setup_context *setup,
+void sp_setup_tri( struct setup_context *setup,
                 const float (*v0)[4],
                 const float (*v1)[4],
                 const float (*v2)[4] )
@@ -958,10 +744,9 @@ void setup_tri( struct setup_context *setup,
    setup_tri_coefficients( setup );
    setup_tri_edges( setup );
 
-   setup->quad.input.prim = QUAD_PRIM_TRI;
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
 
    setup->span.y = 0;
-   setup->span.y_flags = 0;
    setup->span.right[0] = 0;
    setup->span.right[1] = 0;
    /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
@@ -983,8 +768,6 @@ void setup_tri( struct setup_context *setup,
 
    flush_spans( setup );
 
-   WAIT_FOR_COMPLETION(setup);
-
 #if DEBUG_FRAGS
    printf("Tri: %u frags emitted, %u written\n",
           setup->numFragsEmitted,
@@ -1009,8 +792,8 @@ line_linear_coeff(const struct setup_context *setup,
    coef->dadx[i] = dadx;
    coef->dady[i] = dady;
    coef->a0[i] = (setup->vmin[vertSlot][i] -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+                  (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                   dady * (setup->vmin[0][1] - setup->pixel_offset)));
 }
 
 
@@ -1032,8 +815,8 @@ line_persp_coeff(const struct setup_context *setup,
    coef->dadx[i] = dadx;
    coef->dady[i] = dady;
    coef->a0[i] = (setup->vmin[vertSlot][i] -
-                  (dadx * (setup->vmin[0][0] - 0.5f) +
-                   dady * (setup->vmin[0][1] - 0.5f)));
+                  (dadx * (setup->vmin[0][0] - setup->pixel_offset) +
+                   dady * (setup->vmin[0][1] - setup->pixel_offset)));
 }
 
 
@@ -1101,7 +884,7 @@ setup_line_coefficients(struct setup_context *setup,
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
@@ -1122,20 +905,20 @@ plot(struct setup_context *setup, int x, int y)
    const int quadY = y - iy;
    const int mask = (1 << ix) << (2 * iy);
 
-   if (quadX != setup->quad.input.x0 ||
-       quadY != setup->quad.input.y0)
+   if (quadX != setup->quad[0].input.x0 ||
+       quadY != setup->quad[0].input.y0)
    {
       /* flush prev quad, start new quad */
 
-      if (setup->quad.input.x0 != -1)
-         CLIP_EMIT_QUAD(setup);
+      if (setup->quad[0].input.x0 != -1)
+         clip_emit_quad( setup, &setup->quad[0] );
 
-      setup->quad.input.x0 = quadX;
-      setup->quad.input.y0 = quadY;
-      setup->quad.inout.mask = 0x0;
+      setup->quad[0].input.x0 = quadX;
+      setup->quad[0].input.y0 = quadY;
+      setup->quad[0].inout.mask = 0x0;
    }
 
-   setup->quad.inout.mask |= mask;
+   setup->quad[0].inout.mask |= mask;
 }
 
 
@@ -1145,7 +928,7 @@ plot(struct setup_context *setup, int x, int y)
  * to handle stippling and wide lines.
  */
 void
-setup_line(struct setup_context *setup,
+sp_setup_line(struct setup_context *setup,
            const float (*v0)[4],
            const float (*v1)[4])
 {
@@ -1195,17 +978,18 @@ setup_line(struct setup_context *setup,
 
    assert(dx >= 0);
    assert(dy >= 0);
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_LINES);
+
+   setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;
+   setup->quad[0].inout.mask = 0x0;
 
-   setup->quad.input.x0 = setup->quad.input.y0 = -1;
-   setup->quad.inout.mask = 0x0;
-   setup->quad.input.prim = QUAD_PRIM_LINE;
    /* XXX temporary: set coverage to 1.0 so the line appears
     * if AA mode happens to be enabled.
     */
-   setup->quad.input.coverage[0] =
-   setup->quad.input.coverage[1] =
-   setup->quad.input.coverage[2] =
-   setup->quad.input.coverage[3] = 1.0;
+   setup->quad[0].input.coverage[0] =
+   setup->quad[0].input.coverage[1] =
+   setup->quad[0].input.coverage[2] =
+   setup->quad[0].input.coverage[3] = 1.0;
 
    if (dx > dy) {
       /*** X-major line ***/
@@ -1249,11 +1033,9 @@ setup_line(struct setup_context *setup,
    }
 
    /* draw final quad */
-   if (setup->quad.inout.mask) {
-      CLIP_EMIT_QUAD(setup);
+   if (setup->quad[0].inout.mask) {
+      clip_emit_quad( setup, &setup->quad[0] );
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
 
@@ -1276,7 +1058,7 @@ point_persp_coeff(const struct setup_context *setup,
  * XXX could optimize a lot for 1-pixel points.
  */
 void
-setup_point( struct setup_context *setup,
+sp_setup_point( struct setup_context *setup,
              const float (*v0)[4] )
 {
    struct softpipe_context *softpipe = setup->softpipe;
@@ -1300,6 +1082,8 @@ setup_point( struct setup_context *setup,
    if (softpipe->no_rast)
       return;
 
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_POINTS);
+
    /* For points, all interpolants are constant-valued.
     * However, for point sprites, we'll need to setup texcoords appropriately.
     * XXX: which coefficients are the texcoords???
@@ -1346,22 +1130,21 @@ setup_point( struct setup_context *setup,
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
    }
 
-   setup->quad.input.prim = QUAD_PRIM_POINT;
 
    if (halfSize <= 0.5 && !round) {
       /* special case for 1-pixel points */
       const int ix = ((int) x) & 1;
       const int iy = ((int) y) & 1;
-      setup->quad.input.x0 = (int) x - ix;
-      setup->quad.input.y0 = (int) y - iy;
-      setup->quad.inout.mask = (1 << ix) << (2 * iy);
-      CLIP_EMIT_QUAD(setup);
+      setup->quad[0].input.x0 = (int) x - ix;
+      setup->quad[0].input.y0 = (int) y - iy;
+      setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
+      clip_emit_quad( setup, &setup->quad[0] );
    }
    else {
       if (round) {
@@ -1381,15 +1164,15 @@ setup_point( struct setup_context *setup,
             for (ix = ixmin; ix <= ixmax; ix += 2) {
                float dx, dy, dist2, cover;
 
-               setup->quad.inout.mask = 0x0;
+               setup->quad[0].inout.mask = 0x0;
 
                dx = (ix + 0.5f) - x;
                dy = (iy + 0.5f) - y;
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_LEFT;
+                  setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1397,8 +1180,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
                }
 
                dx = (ix + 0.5f) - x;
@@ -1406,8 +1189,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_LEFT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1415,14 +1198,14 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
                }
 
-               if (setup->quad.inout.mask) {
-                  setup->quad.input.x0 = ix;
-                  setup->quad.input.y0 = iy;
-                  CLIP_EMIT_QUAD(setup);
+               if (setup->quad[0].inout.mask) {
+                  setup->quad[0].input.x0 = ix;
+                  setup->quad[0].input.y0 = iy;
+                  clip_emit_quad( setup, &setup->quad[0] );
                }
             }
          }
@@ -1466,33 +1249,28 @@ setup_point( struct setup_context *setup,
                   mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
                }
 
-               setup->quad.inout.mask = mask;
-               setup->quad.input.x0 = ix;
-               setup->quad.input.y0 = iy;
-               CLIP_EMIT_QUAD(setup);
+               setup->quad[0].inout.mask = mask;
+               setup->quad[0].input.x0 = ix;
+               setup->quad[0].input.y0 = iy;
+               clip_emit_quad( setup, &setup->quad[0] );
             }
          }
       }
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
-void setup_prepare( struct setup_context *setup )
+void sp_setup_prepare( struct setup_context *setup )
 {
    struct softpipe_context *sp = setup->softpipe;
-   unsigned i;
 
    if (sp->dirty) {
       softpipe_update_derived(sp);
    }
 
    /* Note: nr_attrs is only used for debugging (vertex printing) */
-   setup->quad.nr_attrs = draw_num_vs_outputs(sp->draw);
+   setup->nr_vertex_attrs = draw_num_vs_outputs(sp->draw);
 
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      sp->quad[i].first->begin( sp->quad[i].first );
-   }
+   sp->quad.first->begin( sp->quad.first );
 
    if (sp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
        sp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
@@ -1508,7 +1286,7 @@ void setup_prepare( struct setup_context *setup )
 
 
 
-void setup_destroy_context( struct setup_context *setup )
+void sp_setup_destroy_context( struct setup_context *setup )
 {
    FREE( setup );
 }
@@ -1517,33 +1295,20 @@ void setup_destroy_context( struct setup_context *setup )
 /**
  * Create a new primitive setup/render stage.
  */
-struct setup_context *setup_create_context( struct softpipe_context *softpipe )
+struct setup_context *sp_setup_create_context( struct softpipe_context *softpipe )
 {
    struct setup_context *setup = CALLOC_STRUCT(setup_context);
-#if SP_NUM_QUAD_THREADS > 1
-   uint i;
-#endif
+   unsigned i;
 
    setup->softpipe = softpipe;
 
-   setup->quad.coef = setup->coef;
-   setup->quad.posCoef = &setup->posCoef;
-
-#if SP_NUM_QUAD_THREADS > 1
-   setup->que.first = 0;
-   setup->que.last = 0;
-   pipe_mutex_init( setup->que.que_mutex );
-   pipe_condvar_init( setup->que.que_notfull_condvar );
-   pipe_condvar_init( setup->que.que_notempty_condvar );
-   setup->que.jobs_added = 0;
-   setup->que.jobs_done = 0;
-   pipe_condvar_init( setup->que.que_done_condvar );
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      setup->threads[i].setup = setup;
-      setup->threads[i].id = i;
-      setup->threads[i].handle = pipe_thread_create( quad_thread, &setup->threads[i] );
+   for (i = 0; i < MAX_QUADS; i++) {
+      setup->quad[i].coef = setup->coef;
+      setup->quad[i].posCoef = &setup->posCoef;
    }
-#endif
+
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
 
    return setup;
 }
diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h
index d54f3344288..9c8844d2e8e 100644
--- a/src/gallium/drivers/softpipe/sp_setup.h
+++ b/src/gallium/drivers/softpipe/sp_setup.h
@@ -31,23 +31,23 @@ struct setup_context;
 struct softpipe_context;
 
 void 
-setup_tri( struct setup_context *setup,
+sp_setup_tri( struct setup_context *setup,
 	   const float (*v0)[4],
 	   const float (*v1)[4],
 	   const float (*v2)[4] );
 
 void
-setup_line(struct setup_context *setup,
+sp_setup_line(struct setup_context *setup,
            const float (*v0)[4],
            const float (*v1)[4]);
 
 void
-setup_point( struct setup_context *setup,
+sp_setup_point( struct setup_context *setup,
              const float (*v0)[4] );
 
 
-struct setup_context *setup_create_context( struct softpipe_context *softpipe );
-void setup_prepare( struct setup_context *setup );
-void setup_destroy_context( struct setup_context *setup );
+struct setup_context *sp_setup_create_context( struct softpipe_context *softpipe );
+void sp_setup_prepare( struct setup_context *setup );
+void sp_setup_destroy_context( struct setup_context *setup );
 
 #endif
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 9776e978e3e..77ee3c1136b 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -87,6 +87,7 @@ struct sp_fragment_shader {
 struct sp_vertex_shader {
    struct pipe_shader_state shader;
    struct draw_vertex_shader *draw_data;
+   int max_sampler;             /* -1 if no samplers */
 };
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c
index 384fe559afd..efed082f823 100644
--- a/src/gallium/drivers/softpipe/sp_state_blend.c
+++ b/src/gallium/drivers/softpipe/sp_state_blend.c
@@ -45,7 +45,7 @@ void softpipe_bind_blend_state( struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->blend = (const struct pipe_blend_state *)blend;
+   softpipe->blend = (struct pipe_blend_state *)blend;
 
    softpipe->dirty |= SP_NEW_BLEND;
 }
@@ -86,7 +86,7 @@ softpipe_bind_depth_stencil_state(struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->depth_stencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil;
+   softpipe->depth_stencil = (struct pipe_depth_stencil_alpha_state *)depth_stencil;
 
    softpipe->dirty |= SP_NEW_DEPTH_STENCIL_ALPHA;
 }
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 75551000c9b..3bc96b95385 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -32,7 +32,10 @@
 #include "draw/draw_vertex.h"
 #include "draw/draw_private.h"
 #include "sp_context.h"
+#include "sp_screen.h"
 #include "sp_state.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
 
 
 /**
@@ -63,26 +66,19 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
    if (vinfo->num_attribs == 0) {
       /* compute vertex layout now */
       const struct sp_fragment_shader *spfs = softpipe->fs;
-      const enum interp_mode colorInterp
-         = softpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+      struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
+      const uint num = draw_num_vs_outputs(softpipe->draw);
       uint i;
 
-      if (softpipe->vbuf) {
-         /* if using the post-transform vertex buffer, tell draw_vbuf to
-          * simply emit the whole post-xform vertex as-is:
-          */
-         struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-         const uint num = draw_num_vs_outputs(softpipe->draw);
-         uint i;
-
-         /* No longer any need to try and emit draw vertex_header info.
-          */
-         vinfo_vbuf->num_attribs = 0;
-         for (i = 0; i < num; i++) {
-            draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-         }
-         draw_compute_vertex_size(vinfo_vbuf);
+      /* Tell draw_vbuf to simply emit the whole post-xform vertex
+       * as-is.  No longer any need to try and emit draw vertex_header
+       * info.
+       */
+      vinfo_vbuf->num_attribs = 0;
+      for (i = 0; i < num; i++) {
+	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
       }
+      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
        * Loop over fragment shader inputs, searching for the matching output
@@ -91,35 +87,40 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
       vinfo->num_attribs = 0;
       for (i = 0; i < spfs->info.num_inputs; i++) {
          int src;
-         switch (spfs->info.input_semantic_name[i]) {
-         case TGSI_SEMANTIC_POSITION:
-            src = draw_find_vs_output(softpipe->draw,
-                                      TGSI_SEMANTIC_POSITION, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src);
-            break;
+         enum interp_mode interp;
 
-         case TGSI_SEMANTIC_COLOR:
-            src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_COLOR, 
-                                 spfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         switch (spfs->info.input_interpolate[i]) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = INTERP_CONSTANT;
             break;
-
-         case TGSI_SEMANTIC_FOG:
-            src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_FOG, 0);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = INTERP_LINEAR;
             break;
-
-         case TGSI_SEMANTIC_GENERIC:
-         case TGSI_SEMANTIC_FACE:
-            /* this includes texcoords and varying vars */
-            src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_GENERIC,
-                                      spfs->info.input_semantic_index[i]);
-            draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = INTERP_PERSPECTIVE;
             break;
-
          default:
             assert(0);
+            interp = INTERP_LINEAR;
+         }
+
+         switch (spfs->info.input_semantic_name[i]) {
+         case TGSI_SEMANTIC_POSITION:
+            interp = INTERP_POS;
+            break;
+
+         case TGSI_SEMANTIC_COLOR:
+            if (softpipe->rasterizer->flatshade) {
+               interp = INTERP_CONSTANT;
+            }
+            break;
          }
+
+         /* this includes texcoords and varying vars */
+         src = draw_find_vs_output(softpipe->draw,
+                                   spfs->info.input_semantic_name[i],
+                                   spfs->info.input_semantic_index[i]);
+         draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
       }
 
       softpipe->psize_slot = draw_find_vs_output(softpipe->draw,
@@ -164,11 +165,19 @@ softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 static void
 compute_cliprect(struct softpipe_context *sp)
 {
+   /* SP_NEW_FRAMEBUFFER
+    */
    uint surfWidth = sp->framebuffer.width;
    uint surfHeight = sp->framebuffer.height;
 
+   /* SP_NEW_RASTERIZER
+    */
    if (sp->rasterizer->scissor) {
-      /* clip to scissor rect */
+
+      /* SP_NEW_SCISSOR
+       *
+       * clip to scissor rect:
+       */
       sp->cliprect.minx = MAX2(sp->scissor.minx, 0);
       sp->cliprect.miny = MAX2(sp->scissor.miny, 0);
       sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth);
@@ -184,27 +193,63 @@ compute_cliprect(struct softpipe_context *sp)
 }
 
 
+/* Rebind the sampler varients, then revalidate every texture tile cache
+ * whose timestamp differs from that of the texture it holds. */
+static void
+update_tgsi_samplers( struct softpipe_context *softpipe )
+{
+   unsigned unit;
+
+   softpipe_reset_sampler_varients( softpipe );
+
+   for (unit = 0; unit < PIPE_MAX_SAMPLERS; unit++) {
+      struct softpipe_tex_tile_cache *cache = softpipe->tex_cache[unit];
+      struct softpipe_texture *tex;
+      if (!cache->texture)
+         continue;              /* no texture set on this cache */
+      tex = softpipe_texture(cache->texture);
+      if (tex->timestamp != cache->timestamp) {
+         sp_tex_tile_cache_validate_texture( cache );
+         cache->timestamp = tex->timestamp;
+      }
+   }
+}
+
+
 /* Hopefully this will remain quite simple, otherwise need to pull in
  * something like the state tracker mechanism.
  */
 void softpipe_update_derived( struct softpipe_context *softpipe )
 {
+   struct softpipe_screen *sp_screen = softpipe_screen(softpipe->pipe.screen);
+
+   /* Check for updated textures.
+    */
+   if (softpipe->tex_timestamp != sp_screen->timestamp) {
+      softpipe->tex_timestamp = sp_screen->timestamp;
+      softpipe->dirty |= SP_NEW_TEXTURE;
+   }
+      
+   if (softpipe->dirty & (SP_NEW_SAMPLER |
+                          SP_NEW_TEXTURE |
+                          SP_NEW_FS | 
+                          SP_NEW_VS))
+      update_tgsi_samplers( softpipe );
+
    if (softpipe->dirty & (SP_NEW_RASTERIZER |
                           SP_NEW_FS |
                           SP_NEW_VS))
       invalidate_vertex_layout( softpipe );
 
    if (softpipe->dirty & (SP_NEW_SCISSOR |
-                          SP_NEW_DEPTH_STENCIL_ALPHA |
+                          SP_NEW_RASTERIZER |
                           SP_NEW_FRAMEBUFFER))
       compute_cliprect(softpipe);
 
    if (softpipe->dirty & (SP_NEW_BLEND |
                           SP_NEW_DEPTH_STENCIL_ALPHA |
                           SP_NEW_FRAMEBUFFER |
-                          SP_NEW_RASTERIZER |
-                          SP_NEW_FS | 
-			  SP_NEW_QUERY))
+                          SP_NEW_FS))
       sp_build_quad_pipeline(softpipe);
 
    softpipe->dirty = 0;
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 4330c203935..b41f7e8ab72 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -31,9 +31,8 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
+#include "draw/draw_vs.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_scan.h"
 #include "tgsi/tgsi_parse.h"
@@ -51,12 +50,9 @@ softpipe_create_fs_state(struct pipe_context *pipe,
       tgsi_dump(templ->tokens, 0);
 
    /* codegen */
-   state = softpipe_create_fs_llvm( softpipe, templ );
+   state = softpipe_create_fs_sse( softpipe, templ );
    if (!state) {
-      state = softpipe_create_fs_sse( softpipe, templ );
-      if (!state) {
-         state = softpipe_create_fs_exec( softpipe, templ );
-      }
+      state = softpipe_create_fs_exec( softpipe, templ );
    }
 
    assert(state);
@@ -111,6 +107,8 @@ softpipe_create_vs_state(struct pipe_context *pipe,
    if (state->draw_data == NULL) 
       goto fail;
 
+   state->max_sampler = state->draw_data->info.file_max[TGSI_FILE_SAMPLER];
+
    return state;
 
 fail:
@@ -128,7 +126,7 @@ softpipe_bind_vs_state(struct pipe_context *pipe, void *vs)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->vs = (const struct sp_vertex_shader *)vs;
+   softpipe->vs = (struct sp_vertex_shader *) vs;
 
    draw_bind_vertex_shader(softpipe->draw,
                            (softpipe->vs ? softpipe->vs->draw_data : NULL));
@@ -142,10 +140,10 @@ softpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   struct sp_vertex_shader *state =
-      (struct sp_vertex_shader *)vs;
+   struct sp_vertex_shader *state = (struct sp_vertex_shader *) vs;
 
    draw_delete_vertex_shader(softpipe->draw, state->draw_data);
+   FREE( (void *)state->shader.tokens );
    FREE( state );
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index cb517b02e44..db0b8ab76b1 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -32,21 +32,37 @@
 #include "util/u_memory.h"
 
 #include "draw/draw_context.h"
+#include "draw/draw_context.h"
 
 #include "sp_context.h"
-#include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
-#include "sp_tile_cache.h"
-#include "draw/draw_context.h"
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
 
 
+struct sp_sampler {
+   struct pipe_sampler_state base;       /* copy of the state given at create time */
+   struct sp_sampler_varient *varients;  /* head of the varient list (linked via ->next) */
+   struct sp_sampler_varient *current;   /* varient last returned by get_sampler_varient() */
+};
+/* Downcast a pipe_sampler_state pointer; 'base' is the first member of sp_sampler. */
+static struct sp_sampler *sp_sampler( struct pipe_sampler_state *sampler )
+{
+   return (struct sp_sampler *)sampler;
+}
+
 
 void *
 softpipe_create_sampler_state(struct pipe_context *pipe,
                               const struct pipe_sampler_state *sampler)
 {
-   return mem_dup(sampler, sizeof(*sampler));
+   struct sp_sampler *sp_sampler = CALLOC_STRUCT(sp_sampler);
+
+   /* May be NULL on allocation failure; zero-fill leaves the lists NULL. */
+   if (sp_sampler)
+      sp_sampler->base = *sampler;
+   return (void *)sp_sampler;
 }
 
 
@@ -97,7 +113,7 @@ softpipe_set_sampler_textures(struct pipe_context *pipe,
       struct pipe_texture *tex = i < num ? texture[i] : NULL;
 
       pipe_texture_reference(&softpipe->texture[i], tex);
-      sp_tile_cache_set_texture(pipe, softpipe->tex_cache[i], tex);
+      sp_tex_tile_cache_set_texture(softpipe->tex_cache[i], tex);
    }
 
    softpipe->num_textures = num;
@@ -106,10 +122,111 @@ softpipe_set_sampler_textures(struct pipe_context *pipe,
 }
 
 
+/**
+ * Find/create an sp_sampler_varient object for sampling the given texture,
+ * sampler and tex unit.
+ *
+ * Note that the tex unit is significant.  We can't re-use a sampler
+ * varient for multiple texture units because the sampler varient contains
+ * the texture object pointer.  If the texture object pointer were stored
+ * somewhere outside the sampler varient, we could re-use samplers for
+ * multiple texture units.
+ */
+static struct sp_sampler_varient *
+get_sampler_varient( unsigned unit,
+                     struct sp_sampler *sampler,
+                     struct pipe_texture *texture,
+                     unsigned processor )
+{
+   struct softpipe_texture *sp_texture = softpipe_texture(texture);
+   struct sp_sampler_varient *found;
+   union sp_sampler_key key;
+
+   /* if this fails, widen the key.unit field and update this assertion */
+   assert(PIPE_MAX_SAMPLERS <= 16);
+
+   /* Fill in every key field (pad included): lookups compare key.value. */
+   key.bits.target = sp_texture->base.target;
+   key.bits.is_pot = sp_texture->pot;
+   key.bits.processor = processor;
+   key.bits.unit = unit;
+   key.bits.pad = 0;
+
+   /* Fast path: the varient returned last time still matches. */
+   found = sampler->current;
+   if (found && found->key.value == key.value)
+      return found;
+
+   /* Otherwise search the varients already created for this sampler. */
+   for (found = sampler->varients; found; found = found->next)
+      if (found->key.value == key.value)
+         break;
+
+   /* No match: create a new varient and link it at the head of the list. */
+   if (!found) {
+      found = sp_create_sampler_varient( &sampler->base, key );
+      found->next = sampler->varients;
+      sampler->varients = found;
+   }
+   sampler->current = found;
+   return found;
+}
+
+
+
+
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe)
+{
+   int unit;
+
+   /* Varients are looked up here, at update time, because it is hard to
+    * know ahead of time which samplers the vertex and fragment programs
+    * are going to use.
+    */
+   for (unit = 0; unit <= softpipe->vs->max_sampler; unit++) {
+      if (!softpipe->sampler[unit])
+         continue;
+
+      softpipe->tgsi.vert_samplers_list[unit] =
+         get_sampler_varient( unit,
+                              sp_sampler(softpipe->sampler[unit]),
+                              softpipe->texture[unit],
+                              TGSI_PROCESSOR_VERTEX );
+      sp_sampler_varient_bind_texture( softpipe->tgsi.vert_samplers_list[unit],
+                                       softpipe->tex_cache[unit],
+                                       softpipe->texture[unit] );
+   }
+
+   for (unit = 0; unit <= softpipe->fs->info.file_max[TGSI_FILE_SAMPLER]; unit++) {
+      if (!softpipe->sampler[unit])
+         continue;
+
+      softpipe->tgsi.frag_samplers_list[unit] =
+         get_sampler_varient( unit,
+                              sp_sampler(softpipe->sampler[unit]),
+                              softpipe->texture[unit],
+                              TGSI_PROCESSOR_FRAGMENT );
+      sp_sampler_varient_bind_texture( softpipe->tgsi.frag_samplers_list[unit],
+                                       softpipe->tex_cache[unit],
+                                       softpipe->texture[unit] );
+   }
+}
+
+
+
 void
 softpipe_delete_sampler_state(struct pipe_context *pipe,
                               void *sampler)
 {
+   struct sp_sampler_varient *v = ((struct sp_sampler *)sampler)->varients;
+
+   while (v) {   /* read ->next before this varient is destroyed */
+      struct sp_sampler_varient *next = v->next;
+      sp_sampler_varient_destroy(v);
+      v = next;
+   }
+
    FREE( sampler );
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c
index 181bff8f75c..bc0e2011300 100644
--- a/src/gallium/drivers/softpipe/sp_state_surface.c
+++ b/src/gallium/drivers/softpipe/sp_state_surface.c
@@ -53,7 +53,7 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
       /* check if changing cbuf */
       if (sp->framebuffer.cbufs[i] != fb->cbufs[i]) {
          /* flush old */
-         sp_flush_tile_cache(sp, sp->cbuf_cache[i]);
+         sp_flush_tile_cache(sp->cbuf_cache[i]);
 
          /* assign new */
          pipe_surface_reference(&sp->framebuffer.cbufs[i], fb->cbufs[i]);
@@ -68,58 +68,28 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
    /* zbuf changing? */
    if (sp->framebuffer.zsbuf != fb->zsbuf) {
       /* flush old */
-      sp_flush_tile_cache(sp, sp->zsbuf_cache);
+      sp_flush_tile_cache(sp->zsbuf_cache);
 
       /* assign new */
       pipe_surface_reference(&sp->framebuffer.zsbuf, fb->zsbuf);
 
       /* update cache */
       sp_tile_cache_set_surface(sp->zsbuf_cache, fb->zsbuf);
-   }
-
-#if 0
-   /* XXX combined depth/stencil here */
-
-   /* sbuf changing? */
-   if (sp->framebuffer.sbuf != fb->sbuf) {
-      /* flush old */
-      sp_flush_tile_cache(sp, sp->sbuf_cache_sep);
-
-      /* assign new */
-      sp->framebuffer.sbuf = fb->sbuf;
-
-      /* update cache */
-      if (fb->sbuf != fb->zbuf) {
-         /* separate stencil buf */
-         sp->sbuf_cache = sp->sbuf_cache_sep;
-         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf);
-      }
-      else {
-         /* combined depth/stencil */
-         sp->sbuf_cache = sp->zbuf_cache;
-         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf);
-      }
-   }
-#endif
 
-   /* Tell draw module how deep the Z/depth buffer is */
-   {
-      int depth_bits;
-      double mrd;
+      /* Tell draw module how deep the Z/depth buffer is */
       if (sp->framebuffer.zsbuf) {
+         int depth_bits;
+         double mrd;
          depth_bits = pf_get_component_bits(sp->framebuffer.zsbuf->format,
                                             PIPE_FORMAT_COMP_Z);
+         if (depth_bits > 16) {
+            mrd = 0.0000001;
+         }
+         else {
+            mrd = 0.00002;
+         }
+         draw_set_mrd(sp->draw, mrd);
       }
-      else {
-         depth_bits = 0;
-      }
-      if (depth_bits > 16) {
-         mrd = 0.0000001;
-      }
-      else {
-         mrd = 0.00002;
-      }
-      draw_set_mrd(sp->draw, mrd);
    }
 
    sp->framebuffer.width = fb->width;
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index f99a30277dd..c22ee86b66c 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -31,29 +31,33 @@
  *
  * Authors:
  *   Brian Paul
+ *   Keith Whitwell
  */
 
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_texture.h"
-#include "sp_tex_sample.h"
-#include "sp_tile_cache.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "sp_quad.h"   /* only for #define QUAD_* tokens */
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
 
 
 
 /*
- * Note, the FRAC macro has to work perfectly.  Otherwise you'll sometimes
- * see 1-pixel bands of improperly weighted linear-filtered textures.
+ * Return fractional part of 'f'.  Used for computing interpolation weights.
+ * Need to be careful with negative values.
+ * Note, if this function isn't perfect you'll sometimes see 1-pixel bands
+ * of improperly weighted linear-filtered textures.
  * The tests/texwrap.c demo is a good test.
- * Also note, FRAC(x) doesn't truly return the fractional part of x for x < 0.
- * Instead, if x < 0 then FRAC(x) = 1 - true_frac(x).
  */
-#define FRAC(f)  ((f) - util_ifloor(f))
+static INLINE float
+frac(float f)
+{
+   const int whole = util_ifloor(f);
+   return f - whole;
+}
 
 
 /**
@@ -100,10 +104,16 @@ lerp_3d(float a, float b, float c,
 
 
 /**
- * If A is a signed integer, A % B doesn't give the right value for A < 0
- * (in terms of texture repeat).  Just casting to unsigned fixes that.
+ * Compute coord % size for repeat wrap modes; the result is in [0, size-1].
+ * C's % truncates toward zero, so a negative coord gives a negative
+ * remainder: add size and reduce again to fold it back into range.  (A
+ * plain cast to unsigned is only correct when size is a power of two.)
  */
-#define REMAINDER(A, B) ((unsigned) (A) % (unsigned) (B))
+static INLINE int
+repeat(int coord, unsigned size)
+{
+   return (coord % (int) size + (int) size) % (int) size;
+}
 
 
 /**
@@ -115,133 +125,153 @@ lerp_3d(float a, float b, float c,
  * \param icoord  returns the integer texcoords
  * \return  integer texture index
  */
-static INLINE void
-nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
-                   int icoord[4])
+static void
+wrap_nearest_repeat(const float s[4], unsigned size, int icoord[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      /* s limited to [0,1) */
-      /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch] * size);
-         icoord[ch] = REMAINDER(i, size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP:
+   /* s limited to [0,1) */
+   /* i limited to [0,size-1] */
+   for (ch = 0; ch < 4; ch++) {
+      int i = util_ifloor(s[ch] * size);
+      icoord[ch] = repeat(i, size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [0,1] */
+   /* i limited to [0,size-1] */
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] <= 0.0F)
+         icoord[ch] = 0;
+      else if (s[ch] >= 1.0F)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp_to_edge(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] < min)
+         icoord[ch] = 0;
+      else if (s[ch] > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp_to_border(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [-1, size] */
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] <= min)
+         icoord[ch] = -1;
+      else if (s[ch] >= max)
+         icoord[ch] = size;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_repeat(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const int flr = util_ifloor(s[ch]);
+      float u;
+      if (flr & 1)
+         u = 1.0F - (s[ch] - (float) flr);
+      else
+         u = s[ch] - (float) flr;
+      if (u < min)
+         icoord[ch] = 0;
+      else if (u > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
       /* s limited to [0,1] */
       /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         if (s[ch] <= 0.0F)
-            icoord[ch] = 0;
-         else if (s[ch] >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(s[ch] * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] < min)
-               icoord[ch] = 0;
-            else if (s[ch] > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [-1, size] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] <= min)
-               icoord[ch] = -1;
-            else if (s[ch] >= max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      {
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const int flr = util_ifloor(s[ch]);
-            float u;
-            if (flr & 1)
-               u = 1.0F - (s[ch] - (float) flr);
-            else
-               u = s[ch] - (float) flr;
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* s limited to [0,1] */
-         /* i limited to [0,size-1] */
-         const float u = fabsf(s[ch]);
-         if (u <= 0.0F)
-            icoord[ch] = 0;
-         else if (u >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(u * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = -1;
-            else if (u > max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   default:
-      assert(0);
+      const float u = fabsf(s[ch]);
+      if (u <= 0.0F)
+         icoord[ch] = 0;
+      else if (u >= 1.0F)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                  int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const float u = fabsf(s[ch]);
+      if (u < min)
+         icoord[ch] = 0;
+      else if (u > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp_to_border(const float s[4], unsigned size,
+                                    int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [-1, size] */
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const float u = fabsf(s[ch]);
+      if (u < min)
+         icoord[ch] = -1;
+      else if (u > max)
+         icoord[ch] = size;
+      else
+         icoord[ch] = util_ifloor(u * size);
    }
 }
 
@@ -256,125 +286,156 @@ nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
  * \param w  returns blend factor/weight between texture indexes
  * \param icoord  returns the computed integer texture coords
  */
-static INLINE void
-linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
+static void
+wrap_linear_repeat(const float s[4], unsigned size,
+                   int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = s[ch] * size - 0.5F;
+      icoord0[ch] = repeat(util_ifloor(u), size);
+      icoord1[ch] = repeat(icoord0[ch] + 1, size);
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp(const float s[4], unsigned size,
                   int icoord0[4], int icoord1[4], float w[4])
 {
    uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.0F, 1.0F);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
 
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         float u = s[ch] * size - 0.5F;
-         icoord0[ch] = REMAINDER(util_ifloor(u), size);
-         icoord1[ch] = REMAINDER(icoord0[ch] + 1, size);
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = CLAMP(s[ch], min, max);
-            u = u * size - 0.5f;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         const int flr = util_ifloor(s[ch]);
-         float u;
-         if (flr & 1)
-            u = 1.0F - (s[ch] - (float) flr);
-         else
-            u = s[ch] - (float) flr;
-         u = u * size - 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = fabsf(s[ch]);
-            if (u <= min)
-               u = min * size;
-            else if (u >= max)
-               u = max * size;
-            else
-               u *= size;
-            u -= 0.5F;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   default:
-      assert(0);
+
+static void
+wrap_linear_clamp_to_edge(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.0F, 1.0F);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp_to_border(const float s[4], unsigned size,
+                            int icoord0[4], int icoord1[4], float w[4])
+{
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], min, max);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_repeat(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      const int flr = util_ifloor(s[ch]);
+      float u;
+      if (flr & 1)
+         u = 1.0F - (s[ch] - (float) flr);
+      else
+         u = s[ch] - (float) flr;
+      u = u * size - 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp(const float s[4], unsigned size,
+                         int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u >= 1.0F)
+         u = (float) size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                 int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u >= 1.0F)
+         u = (float) size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord0[4], int icoord1[4], float w[4])
+{
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u <= min)
+         u = min * size;
+      else if (u >= max)
+         u = max * size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
    }
 }
 
@@ -383,27 +444,27 @@ linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
  * For RECT textures / unnormalized texcoords
  * Only a subset of wrap modes supported.
  */
-static INLINE void
-nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                          int icoord[4])
+static void
+wrap_nearest_unorm_clamp(const float s[4], unsigned size, int icoord[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch]);
-         icoord[ch]= CLAMP(i, 0, (int) size-1);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
-      }
-      return;
-   default:
-      assert(0);
+   for (ch = 0; ch < 4; ch++) {
+      int i = util_ifloor(s[ch]);
+      icoord[ch]= CLAMP(i, 0, (int) size-1);
+   }
+}
+
+
+/**
+ * Handles clamp_to_edge and clamp_to_border:
+ */
+static void
+wrap_nearest_unorm_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
    }
 }
 
@@ -412,358 +473,971 @@ nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
  * For RECT textures / unnormalized texcoords.
  * Only a subset of wrap modes supported.
  */
-static INLINE void
-linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                         int icoord0[4], int icoord1[4], float w[4])
+static void
+wrap_linear_unorm_clamp(const float s[4], unsigned size,
+                        int icoord0[4], int icoord1[4], float w[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* Not exactly what the spec says, but it matches NVIDIA output */
-         float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord1[ch] > (int) size - 1)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;
-   default:
-      assert(0);
+   for (ch = 0; ch < 4; ch++) {
+      /* Not exactly what the spec says, but it matches NVIDIA output */
+      float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
    }
 }
 
 
-static unsigned
-choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+static void
+wrap_linear_unorm_clamp_to_border(const float s[4], unsigned size,
+                                  int icoord0[4], int icoord1[4], float w[4])
 {
-   /*
-      major axis
-      direction     target                             sc     tc    ma
-      ----------    -------------------------------    ---    ---   ---
-       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
-       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
-       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
-       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
-       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
-       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
-   */
-   const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
-   unsigned face;
-   float sc, tc, ma;
-
-   if (arx >= ary && arx >= arz) {
-      if (rx >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_X;
-         sc = -rz;
-         tc = -ry;
-         ma = arx;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord1[ch] > (int) size - 1)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+
+/**
+ * Examine the quad's texture coordinates to compute the partial
+ * derivatives w.r.t X and Y, then compute lambda (level of detail).
+ */
+static float
+compute_lambda_1d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float rho = MAX2(dsdx, dsdy) * texture->width[0];
+   float lambda;
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+static float
+compute_lambda_2d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
+   float dtdy = fabsf(t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]);
+   float maxx = MAX2(dsdx, dsdy) * texture->width[0];
+   float maxy = MAX2(dtdx, dtdy) * texture->height[0];
+   float rho  = MAX2(maxx, maxy);
+   float lambda;
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+static float
+compute_lambda_3d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
+   float dtdy = fabsf(t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]);
+   float dpdx = fabsf(p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT]);
+   float dpdy = fabsf(p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT]);
+   float maxx = MAX2(dsdx, dsdy) * texture->width[0];
+   float maxy = MAX2(dtdx, dtdy) * texture->height[0];
+   float maxz = MAX2(dpdx, dpdy) * texture->depth[0];
+   float rho, lambda;
+
+   rho = MAX2(maxx, maxy);
+   rho = MAX2(rho, maxz);
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+/**
+ * Compute lambda for a vertex texture sampler.
+ * Since there aren't derivatives to use, just return the LOD bias.
+ */
+static float
+compute_lambda_vert(const struct sp_sampler_varient *samp,
+                    const float s[QUAD_SIZE],
+                    const float t[QUAD_SIZE],
+                    const float p[QUAD_SIZE],
+                    float lodbias)
+{
+   return lodbias;
+}
+
+
+
+/**
+ * Get a texel from a texture, using the texture tile cache.
+ *
+ * \param addr  the template tex address containing cube, z, face info.
+ * \param x  the x coord of texel within 2D image
+ * \param y  the y coord of texel within 2D image
+ * \return  pointer to the float color values of the texel
+ *
+ * XXX maybe move this into sp_tex_tile_cache.c and merge with the
+ * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
+ */
+
+
+
+
+static INLINE const float *
+get_texel_2d_no_border(const struct sp_sampler_varient *samp,
+		       union tex_tile_address addr, int x, int y)
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+
+   return &tile->data.color[y][x][0];
+}
+
+
+static INLINE const float *
+get_texel_2d(const struct sp_sampler_varient *samp,
+	     union tex_tile_address addr, int x, int y)
+{
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level = addr.bits.level;
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level]) {
+      return samp->sampler->border_color;
+   }
+   else {
+      return get_texel_2d_no_border( samp, addr, x, y );
+   }
+}
+
+
+/* Gather a quad of adjacent texels within a tile:
+ */
+static INLINE void
+get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_varient *samp,
+					union tex_tile_address addr, 
+					unsigned x, unsigned y, 
+					const float *out[4])
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+      
+   out[0] = &tile->data.color[y  ][x  ][0];
+   out[1] = &tile->data.color[y  ][x+1][0];
+   out[2] = &tile->data.color[y+1][x  ][0];
+   out[3] = &tile->data.color[y+1][x+1][0];
+}
+
+
+/* Gather a quad of potentially non-adjacent texels:
+ */
+static INLINE void
+get_texel_quad_2d_no_border(const struct sp_sampler_varient *samp,
+			    union tex_tile_address addr,
+			    int x0, int y0, 
+			    int x1, int y1,
+			    const float *out[4])
+{
+   out[0] = get_texel_2d_no_border( samp, addr, x0, y0 );
+   out[1] = get_texel_2d_no_border( samp, addr, x1, y0 );
+   out[2] = get_texel_2d_no_border( samp, addr, x0, y1 );
+   out[3] = get_texel_2d_no_border( samp, addr, x1, y1 );
+}
+
+/* Can involve a lot of unnecessary checks for border color:
+ */
+static INLINE void
+get_texel_quad_2d(const struct sp_sampler_varient *samp,
+		  union tex_tile_address addr,
+		  int x0, int y0, 
+		  int x1, int y1,
+		  const float *out[4])
+{
+   out[0] = get_texel_2d( samp, addr, x0, y0 );
+   out[1] = get_texel_2d( samp, addr, x1, y0 );
+   out[3] = get_texel_2d( samp, addr, x1, y1 );
+   out[2] = get_texel_2d( samp, addr, x0, y1 );
+}
+
+
+
+/* 3d variants:
+ */
+static INLINE const float *
+get_texel_3d_no_border(const struct sp_sampler_varient *samp,
+                       union tex_tile_address addr, int x, int y, int z)
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   addr.bits.z = z;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+
+   return &tile->data.color[y][x][0];
+}
+
+
+static INLINE const float *
+get_texel_3d(const struct sp_sampler_varient *samp,
+	     union tex_tile_address addr, int x, int y, int z)
+{
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level = addr.bits.level;
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level] ||
+       z < 0 || z >= (int) texture->depth[level]) {
+      return samp->sampler->border_color;
+   }
+   else {
+      return get_texel_3d_no_border( samp, addr, x, y, z );
+   }
+}
+
+
+/**
+ * Given the logbase2 of a mipmap's base level size and a mipmap level,
+ * return the size (in texels) of that mipmap level.
+ * For example, if level[0].width = 256 then base_pot will be 8.
+ * If level = 2, then we'll return 64 (the width at level=2).
+ * Return 1 if level > base_pot.
+ */
+static INLINE unsigned
+pot_level_size(unsigned base_pot, unsigned level)
+{
+   return (base_pot >= level) ? (1 << (base_pot - level)) : 1;
+}
+
+
+/* Some image-filter fastpaths:
+ */
+static INLINE void
+img_filter_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],
+                                float lodbias,
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   unsigned xmax = (xpot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, xpot) - 1; */
+   unsigned ymax = (ypot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, ypot) - 1; */
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot - 0.5F;
+      float v = t[j] * ypot - 0.5F;
+
+      int uflr = util_ifloor(u);
+      int vflr = util_ifloor(v);
+
+      float xw = u - (float)uflr;
+      float yw = v - (float)vflr;
+
+      int x0 = uflr & (xpot - 1);
+      int y0 = vflr & (ypot - 1);
+
+      const float *tx[4];      
+      
+      /* Can we fetch all four at once:
+       */
+      if (x0 < xmax && y0 < ymax) {
+         get_texel_quad_2d_no_border_single_tile(samp, addr, x0, y0, tx);
       }
       else {
-         face = PIPE_TEX_FACE_NEG_X;
-         sc = rz;
-         tc = -ry;
-         ma = arx;
+         unsigned x1 = (x0 + 1) & (xpot - 1);
+         unsigned y1 = (y0 + 1) & (ypot - 1);
+         get_texel_quad_2d_no_border(samp, addr, x0, y0, x1, y1, tx);
+      }
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw, yw, 
+                              tx[0][c], tx[1][c], 
+                              tx[2][c], tx[3][c]);
       }
    }
-   else if (ary >= arx && ary >= arz) {
-      if (ry >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_Y;
-         sc = rx;
-         tc = rz;
-         ma = ary;
+}
+
+
+static INLINE void
+img_filter_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                 const float s[QUAD_SIZE],
+                                 const float t[QUAD_SIZE],
+                                 const float p[QUAD_SIZE],
+                                 float lodbias,
+                                 float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot;
+      float v = t[j] * ypot;
+
+      int uflr = util_ifloor(u);
+      int vflr = util_ifloor(v);
+
+      int x0 = uflr & (xpot - 1);
+      int y0 = vflr & (ypot - 1);
+
+      const float *out = get_texel_2d_no_border(samp, addr, x0, y0);
+
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
-      else {
-         face = PIPE_TEX_FACE_NEG_Y;
-         sc = rx;
-         tc = -rz;
-         ma = ary;
+   }
+}
+
+/* Nearest 2D fetch per quad pixel; texel coords are clamped to [0, size-1]. */
+static INLINE void
+img_filter_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],   /* not read here */
+                                float lodbias,              /* not read here */
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);   /* scales s; bounds x0 */
+   unsigned ypot = pot_level_size(samp->ypot, level);   /* scales t; bounds y0 */
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;   /* same value as 'level' above */
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot;
+      float v = t[j] * ypot;
+
+      int x0, y0;
+      const float *out;
+
+      x0 = util_ifloor(u);
+      if (x0 < 0) 
+         x0 = 0;
+      else if (x0 > xpot - 1)   /* x0 >= 0 here; compare is against unsigned xpot */
+         x0 = xpot - 1;
+
+      y0 = util_ifloor(v);
+      if (y0 < 0) 
+         y0 = 0;
+      else if (y0 > ypot - 1)   /* y0 >= 0 here; compare is against unsigned ypot */
+         y0 = ypot - 1;
+      
+      out = get_texel_2d_no_border(samp, addr, x0, y0);
+
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];   /* rgba is indexed [channel][quad pixel] */
       }
    }
-   else {
-      if (rz > 0.0F) {
-         face = PIPE_TEX_FACE_POS_Z;
-         sc = rx;
-         tc = -ry;
-         ma = arz;
+}
+
+/* Nearest 1D fetch: copy one texel (row 0) per quad pixel into rgba. */
+static void
+img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],   /* not read here */
+                        const float p[QUAD_SIZE],   /* not read here */
+                        float lodbias,              /* not read here */
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width;
+   int x[4];   /* texel x per quad pixel; presumably filled by nearest_texcoord_s */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];   /* width of the currently selected level */
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, addr, x[j], 0);   /* y fixed at 0 */
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];   /* rgba is indexed [channel][quad pixel] */
       }
-      else {
-         face = PIPE_TEX_FACE_NEG_Z;
-         sc = -rx;
-         tc = -ry;
-         ma = arz;
+   }
+}
+
+/* Nearest 2D fetch: copy one texel per quad pixel into rgba. */
+static void
+img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],   /* not read here */
+                      float lodbias,              /* not read here */
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height;
+   int x[4], y[4];   /* texel x/y per quad pixel; presumably filled by the calls below */
+   union tex_tile_address addr;
+
+
+   level0 = samp->level;
+   width = texture->width[level0];     /* dimensions of the selected level */
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+ 
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+   samp->nearest_texcoord_t(t, height, y);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, addr, x[j], y[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];   /* rgba is indexed [channel][quad pixel] */
       }
    }
+}
 
-   *newS = ( sc / ma + 1.0F ) * 0.5F;
-   *newT = ( tc / ma + 1.0F ) * 0.5F;
 
-   return face;
+static INLINE union tex_tile_address
+face(union tex_tile_address addr, unsigned face )
+{
+   addr.bits.face = face;   /* addr is a by-value copy; the caller's address is not modified */
+   return addr;             /* same address with only the face field replaced */
 }
 
 
-/**
- * Examine the quad's texture coordinates to compute the partial
- * derivatives w.r.t X and Y, then compute lambda (level of detail).
- *
- * This is only done for fragment shaders, not vertex shaders.
- */
-static float
-compute_lambda(const struct pipe_texture *tex,
-               const struct pipe_sampler_state *sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias)
+static void
+img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],
+                        const float p[QUAD_SIZE],   /* not read here */
+                        float lodbias,              /* not read here */
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   float rho, lambda;
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   unsigned level0, j;
+   int width, height;
+   int x[4], y[4];   /* texel x/y per quad pixel; presumably filled by the calls below */
+   union tex_tile_address addr;
 
-   assert(sampler->normalized_coords);
+   level0 = samp->level;
+   width = texture->width[level0];     /* dimensions of the selected level */
+   height = texture->height[level0];
 
-   assert(s);
-   {
-      float dsdx = s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT];
-      float dsdy = s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT];
-      dsdx = fabsf(dsdx);
-      dsdy = fabsf(dsdy);
-      rho = MAX2(dsdx, dsdy) * tex->width[0];
-   }
-   if (t) {
-      float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT];
-      float dtdy = t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT];
-      float max;
-      dtdx = fabsf(dtdx);
-      dtdy = fabsf(dtdy);
-      max = MAX2(dtdx, dtdy) * tex->height[0];
-      rho = MAX2(rho, max);
+   assert(width > 0);
+   assert(height > 0);
+ 
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+   samp->nearest_texcoord_t(t, height, y);
+   /* Nearest cube fetch: one texel per quad pixel, read from that pixel's face (faces[j]). */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, face(addr, faces[j]), x[j], y[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];   /* rgba is indexed [channel][quad pixel] */
+      }
    }
-   if (p) {
-      float dpdx = p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT];
-      float dpdy = p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT];
-      float max;
-      dpdx = fabsf(dpdx);
-      dpdy = fabsf(dpdy);
-      max = MAX2(dpdx, dpdy) * tex->depth[0];
-      rho = MAX2(rho, max);
+}
+
+/* Nearest 3D fetch: copy one texel per quad pixel into rgba. */
+static void
+img_filter_3d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],
+                      float lodbias,              /* not read here */
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height, depth;
+   int x[4], y[4], z[4];   /* texel x/y/z per quad pixel; presumably filled by the calls below */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];     /* dimensions of the selected level */
+   height = texture->height[level0];
+   depth = texture->depth[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   samp->nearest_texcoord_s(s, width,  x);
+   samp->nearest_texcoord_t(t, height, y);
+   samp->nearest_texcoord_p(p, depth,  z);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_3d(samp, addr, x[j], y[j], z[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];   /* rgba is indexed [channel][quad pixel] */
+      }      
    }
+}
 
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
 
-   return lambda;
+static void
+img_filter_1d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],   /* not read here */
+                     const float p[QUAD_SIZE],   /* not read here */
+                     float lodbias,              /* not read here */
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width;
+   int x0[4], x1[4];   /* the two texel x indices to blend, per quad pixel */
+   float xw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];   /* width of the currently selected level */
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width, x0, x1, xw);
+   /* Linear 1D fetch: per quad pixel, blend texels x0[j] and x1[j] (row 0) by xw[j]. */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], 0);
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], 0);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp(xw[j], tx0[c], tx1[c]);
+      }
+   }
 }
 
 
-/**
- * Do several things here:
- * 1. Compute lambda from the texcoords, if needed
- * 2. Determine if we're minifying or magnifying
- * 3. If minifying, choose mipmap levels
- * 4. Return image filter to use within mipmap images
- * \param level0  Returns first mipmap level to sample from
- * \param level1  Returns second mipmap level to sample from
- * \param levelBlend  Returns blend factor between levels, in [0,1]
- * \param imgFilter  Returns either the min or mag filter, depending on lambda
- */
 static void
-choose_mipmap_levels(const struct pipe_texture *texture,
-                     const struct pipe_sampler_state *sampler,
+img_filter_2d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     boolean computeLambda,
                      float lodbias,
-                     unsigned *level0, unsigned *level1, float *levelBlend,
-                     unsigned *imgFilter)
-{
-   if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
-      /* no mipmap selection needed */
-      *level0 = *level1 = CLAMP((int) sampler->min_lod,
-                                0, (int) texture->last_level);
-
-      if (sampler->min_img_filter != sampler->mag_img_filter) {
-         /* non-mipmapped texture, but still need to determine if doing
-          * minification or magnification.
-          */
-         float lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
-         if (lambda <= 0.0) {
-            *imgFilter = sampler->mag_img_filter;
-         }
-         else {
-            *imgFilter = sampler->min_img_filter;
-         }
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];   /* corner texel indices per quad pixel */
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];     /* dimensions of the selected level */
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+   /* Linear 2D fetch: per quad pixel, blend a 2x2 texel block; p and lodbias are not read. */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], y0[j]);   /* (x0, y0) */
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], y0[j]);   /* (x1, y0) */
+      const float *tx2 = get_texel_2d(samp, addr, x0[j], y1[j]);   /* (x0, y1) */
+      const float *tx3 = get_texel_2d(samp, addr, x1[j], y1[j]);   /* (x1, y1) */
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
       }
-      else {
-         *imgFilter = sampler->mag_img_filter;
+   }
+}
+
+/* Linear cube fetch: blend a 2x2 texel block on faces[j] for each quad pixel. */
+static void
+img_filter_cube_linear(struct tgsi_sampler *tgsi_sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],   /* not read here */
+                       float lodbias,              /* not read here */
+                       float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];   /* corner texel indices per quad pixel */
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];     /* dimensions of the selected level */
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      union tex_tile_address addrj = face(addr, faces[j]);   /* all four taps use this face */
+      const float *tx0 = get_texel_2d(samp, addrj, x0[j], y0[j]);   /* (x0, y0) */
+      const float *tx1 = get_texel_2d(samp, addrj, x1[j], y0[j]);   /* (x1, y0) */
+      const float *tx2 = get_texel_2d(samp, addrj, x0[j], y1[j]);   /* (x0, y1) */
+      const float *tx3 = get_texel_2d(samp, addrj, x1[j], y1[j]);   /* (x1, y1) */
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
       }
    }
-   else {
-      float lambda;
+}
 
-      if (computeLambda)
-         /* fragment shader */
-         lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
-      else
-         /* vertex shader */
-         lambda = lodbias; /* not really a bias, but absolute LOD */
 
-      if (lambda <= 0.0) { /* XXX threshold depends on the filter */
-         /* magnifying */
-         *imgFilter = sampler->mag_img_filter;
-         *level0 = *level1 = 0;
+static void
+img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     float lodbias,              /* not read here */
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height, depth;
+   int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];   /* corner texel indices per quad pixel */
+   float xw[4], yw[4], zw[4]; /* interpolation weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];     /* dimensions of the selected level */
+   height = texture->height[level0];
+   depth = texture->depth[level0];
+
+   addr.value = 0;
+   addr.bits.level = level0;   /* level0 was just read from samp->level */
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+   samp->linear_texcoord_p(p, depth,  z0, z1, zw);
+   /* Linear 3D fetch: per quad pixel, blend the 2x2x2 texel block spanned by the indices. */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      const float *tx00 = get_texel_3d(samp, addr, x0[j], y0[j], z0[j]);   /* z0 slice */
+      const float *tx01 = get_texel_3d(samp, addr, x1[j], y0[j], z0[j]);
+      const float *tx02 = get_texel_3d(samp, addr, x0[j], y1[j], z0[j]);
+      const float *tx03 = get_texel_3d(samp, addr, x1[j], y1[j], z0[j]);
+      
+      const float *tx10 = get_texel_3d(samp, addr, x0[j], y0[j], z1[j]);   /* z1 slice */
+      const float *tx11 = get_texel_3d(samp, addr, x1[j], y0[j], z1[j]);
+      const float *tx12 = get_texel_3d(samp, addr, x0[j], y1[j], z1[j]);
+      const float *tx13 = get_texel_3d(samp, addr, x1[j], y1[j], z1[j]);
+      
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
+                              tx00[c], tx01[c],
+                              tx02[c], tx03[c],
+                              tx10[c], tx11[c],
+                              tx12[c], tx13[c]);
       }
-      else {
-         /* minifying */
-         *imgFilter = sampler->min_img_filter;
-
-         /* choose mipmap level(s) and compute the blend factor between them */
-         if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-            /* Nearest mipmap level */
-            const int lvl = (int) (lambda + 0.5);
-            *level0 =
-            *level1 = CLAMP(lvl, 0, (int) texture->last_level);
-         }
-         else {
-            /* Linear interpolation between mipmap levels */
-            const int lvl = (int) lambda;
-            *level0 = CLAMP(lvl,     0, (int) texture->last_level);
-            *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level);
-            *levelBlend = FRAC(lambda);  /* blending weight between levels */
+   }
+}
+
+/* Pick mipmap level(s) from lambda; between levels, blend the two nearest linearly. */
+static void
+mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias,
+                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   int level0;
+   float lambda;
+
+   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+   level0 = (int)lambda;   /* float-to-int conversion truncates toward zero */
+
+   if (lambda < 0.0) { 
+      samp->level = 0;   /* the image filters read the level from samp->level */
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );   /* lodbias passed as 0 */
+   }
+   else if (level0 >= texture->last_level) {
+      samp->level = texture->last_level;   /* at or past the last level: no blend */
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else {
+      float levelBlend = lambda - level0;   /* fractional part; lambda >= 0 on this path */
+      float rgba0[4][4];   /* samples from level0 */
+      float rgba1[4][4];   /* samples from level0 + 1 */
+      int c,j;
+
+      samp->level = level0;
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba0 );
+
+      samp->level = level0+1;   /* samp->level is left at level0 + 1 on return */
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba1 );
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
          }
       }
    }
 }
 
 
-/**
- * Get a texel from a texture, using the texture tile cache.
- *
- * \param face  the cube face in 0..5
- * \param level  the mipmap level
- * \param x  the x coord of texel within 2D image
- * \param y  the y coord of texel within 2D image
- * \param z  which slice of a 3D texture
- * \param rgba  the quad to put the texel/color into
- * \param j  which element of the rgba quad to write to
- *
- * XXX maybe move this into sp_tile_cache.c and merge with the
- * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
- */
 static void
-get_texel(const struct tgsi_sampler *tgsi_sampler,
-          unsigned face, unsigned level, int x, int y, int z,
-          float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j)
+mip_filter_nearest(struct tgsi_sampler *tgsi_sampler,
+                   const float s[QUAD_SIZE],
+                   const float t[QUAD_SIZE],
+                   const float p[QUAD_SIZE],
+                   float lodbias,
+                   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   float lambda;
 
-   if (x < 0 || x >= (int) texture->width[level] ||
-       y < 0 || y >= (int) texture->height[level] ||
-       z < 0 || z >= (int) texture->depth[level]) {
-      rgba[0][j] = sampler->border_color[0];
-      rgba[1][j] = sampler->border_color[1];
-      rgba[2][j] = sampler->border_color[2];
-      rgba[3][j] = sampler->border_color[3];
+   lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+   /* Sample a single mipmap level: level 0 when lambda < 0, else the level nearest lambda. */
+   if (lambda < 0.0) { 
+      samp->level = 0;   /* the image filters read the level from samp->level */
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );   /* lodbias passed as 0 */
    }
    else {
-      const int tx = x % TILE_SIZE;
-      const int ty = y % TILE_SIZE;
-      const struct softpipe_cached_tile *tile
-         = sp_get_cached_tile_tex(sp, samp->cache,
-                                  x, y, z, face, level);
-      rgba[0][j] = tile->data.color[ty][tx][0];
-      rgba[1][j] = tile->data.color[ty][tx][1];
-      rgba[2][j] = tile->data.color[ty][tx][2];
-      rgba[3][j] = tile->data.color[ty][tx][3];
-      if (0)
-      {
-         debug_printf("Get texel %f %f %f %f from %s\n",
-                      rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j],
-                      pf_name(texture->format));
-      }
+      samp->level = (int)(lambda + 0.5) ;   /* lambda >= 0 here, so this rounds to nearest */
+      samp->level = MIN2(samp->level, (int)texture->last_level);   /* cap at the last level */
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+
+#if 0
+   printf("RGBA %g %g %g %g, %g %g %g %g, %g %g %g %g, %g %g %g %g\n",
+          rgba[0][0], rgba[1][0], rgba[2][0], rgba[3][0],
+          rgba[0][1], rgba[1][1], rgba[2][1], rgba[3][1],
+          rgba[0][2], rgba[1][2], rgba[2][2], rgba[3][2],
+          rgba[0][3], rgba[1][3], rgba[2][3], rgba[3][3]);
+#endif
+}
+
+/* No mip level selection: lambda only chooses between the mag and min image filter. */
+static void
+mip_filter_none(struct tgsi_sampler *tgsi_sampler,
+                const float s[QUAD_SIZE],
+                const float t[QUAD_SIZE],
+                const float p[QUAD_SIZE],
+                float lodbias,
+                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   float lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+   /* samp->level is not written here. NOTE(review): presumably set elsewhere - confirm. */
+   if (lambda < 0.0) { 
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );   /* lodbias passed as 0 */
+   }
+   else {
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );   /* lodbias passed as 0 */
    }
 }
 
 
+
 /**
- * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
- * When we sampled the depth texture, the depth value was put into all
- * RGBA channels.  We look at the red channel here.
- * \param rgba  quad of (depth) texel values
- * \param p  texture 'P' components for four pixels in quad
- * \param j  which pixel in the quad to test [0..3]
+ * Specialized version of mip_filter_linear: hard-wired 2d lambda calculation
+ * and 2d_linear_repeat_POT img filter.  lambda < 0 samples level 0 only.
  */
-static INLINE void
-shadow_compare(const struct pipe_sampler_state *sampler,
-               float rgba[NUM_CHANNELS][QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               uint j)
+static void
+mip_filter_linear_2d_linear_repeat_POT(
+   struct tgsi_sampler *tgsi_sampler,
+   const float s[QUAD_SIZE],
+   const float t[QUAD_SIZE],
+   const float p[QUAD_SIZE],
+   float lodbias,
+   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   int k;
-   switch (sampler->compare_func) {
-   case PIPE_FUNC_LESS:
-      k = p[j] < rgba[0][j];
-      break;
-   case PIPE_FUNC_LEQUAL:
-      k = p[j] <= rgba[0][j];
-      break;
-   case PIPE_FUNC_GREATER:
-      k = p[j] > rgba[0][j];
-      break;
-   case PIPE_FUNC_GEQUAL:
-      k = p[j] >= rgba[0][j];
-      break;
-   case PIPE_FUNC_EQUAL:
-      k = p[j] == rgba[0][j];
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      k = p[j] != rgba[0][j];
-      break;
-   case PIPE_FUNC_ALWAYS:
-      k = 1;
-      break;
-   case PIPE_FUNC_NEVER:
-      k = 0;
-      break;
-   default:
-      k = 0;
-      assert(0);
-      break;
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   int level0;
+   float lambda;
+
+   lambda = compute_lambda_2d(samp, s, t, p, lodbias);
+   level0 = (int)lambda;   /* float-to-int conversion truncates toward zero */
+
+   /* (int)lambda is 0 for lambda in (-1,0), so test lambda itself (as
+    * mip_filter_linear does); the unsigned compare still catches large level0. */
+   if (lambda < 0.0 || (unsigned)level0 >= texture->last_level) {
+      if (lambda < 0.0)
+         samp->level = 0;
+      else
+         samp->level = texture->last_level;
+
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba );
    }
+   else {
+      float levelBlend = lambda - level0;   /* in [0,1): lambda >= 0 on this path */
+      float rgba0[4][4];   /* samples from level0 */
+      float rgba1[4][4];   /* samples from level0 + 1 */
+      int c,j;
 
-   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k;
-   rgba[3][j] = 1.0F;
+      samp->level = level0;
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba0 );
+
+      samp->level = level0+1;   /* samp->level is left at level0 + 1 on return */
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba1 );
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
+         }
+      }
+   }
 }
 
 
+
 /**
- * As above, but do four z/texture comparisons.
+ * Do shadow/depth comparisons.
  */
-static INLINE void
-shadow_compare4(const struct pipe_sampler_state *sampler,
-                float rgba[NUM_CHANNELS][QUAD_SIZE],
-                const float p[QUAD_SIZE])
+static void
+sample_compare(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_sampler_state *sampler = samp->sampler;
    int j, k0, k1, k2, k3;
    float val;
 
+   samp->mip_filter( tgsi_sampler, s, t, p, lodbias, rgba );
+
+   /**
+    * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
+    * When we sampled the depth texture, the depth value was put into all
+    * RGBA channels.  We look at the red channel here.
+    */
+
    /* compare four texcoords vs. four texture samples */
    switch (sampler->compare_func) {
    case PIPE_FUNC_LESS:
@@ -826,470 +1500,392 @@ shadow_compare4(const struct pipe_sampler_state *sampler,
 
 
 /**
- * Common code for sampling 1D/2D/cube textures.
- * Could probably extend for 3D...
+ * Choose the cube face hit by each of the four (s,t,p) directions, record it
+ * in samp->faces[], then sample with the per-face 2D coords via samp->compare.
  */
 static void
-sp_get_samples_2d_common(const struct tgsi_sampler *tgsi_sampler,
-                         const float s[QUAD_SIZE],
-                         const float t[QUAD_SIZE],
-                         const float p[QUAD_SIZE],
-                         boolean computeLambda,
-                         float lodbias,
-                         float rgba[NUM_CHANNELS][QUAD_SIZE],
-                         const unsigned faces[4])
-{
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
+sample_cube(struct tgsi_sampler *tgsi_sampler,
+            const float s[QUAD_SIZE],
+            const float t[QUAD_SIZE],
+            const float p[QUAD_SIZE],
+            float lodbias,
+            float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned j;
+   float ssss[4], tttt[4];
 
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0,
-                         rgba2, j);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare(sampler, rgba2, p, j);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
+   /*
+     major axis
+     direction     target                             sc     tc    ma
+     ----------    -------------------------------    ---    ---   ---
+     +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+     -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+     +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+     -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+     +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+     -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      float rx = s[j];
+      float ry = t[j];
+      float rz = p[j];
+      const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
+      unsigned face;
+      float sc, tc, ma;
+
+      if (arx >= ary && arx >= arz) {
+         if (rx >= 0.0F) {
+            face = PIPE_TEX_FACE_POS_X;
+            sc = -rz;
+            tc = -ry;
+            ma = arx;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_X;
+            sc = rz;
+            tc = -ry;
+            ma = arx;
          }
       }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
+      else if (ary >= arx && ary >= arz) {
+         if (ry >= 0.0F) {
+            face = PIPE_TEX_FACE_POS_Y;
+            sc = rx;
+            tc = rz;
+            ma = ary;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_Y;
+            sc = rx;
+            tc = -rz;
+            ma = ary;
+         }
+      }
+      else {
+         if (rz > 0.0F) {
+            face = PIPE_TEX_FACE_POS_Z;
+            sc = rx;
+            tc = -ry;
+            ma = arz;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_Z;
+            sc = -rx;
+            tc = -ry;
+            ma = arz;
+         }
+      }
+
       {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-
-         linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
-            }
+	 const float ima = 1.0 / ma;   /* ma is the magnitude of the dominant component */
+	 ssss[j] = ( sc * ima + 1.0F ) * 0.5F;   /* remap sc/ma from [-1,1] to [0,1] */
+	 tttt[j] = ( tc * ima + 1.0F ) * 0.5F;   /* remap tc/ma from [-1,1] to [0,1] */
+	 samp->faces[j] = face;
+      }
+   }
 
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1],
-                                    tx[c][2], tx[c][3]);
-            }
+   /* The compare stage is next.  If compare is not active, samp->compare
+    * points deeper into the pipeline, e.g. to mip_filter or an img_filter.
+    * NOTE(review): p is NULL here although sample_compare() documents
+    * comparing against 'p' -- confirm cube maps never use compare_mode. */
+   samp->compare(tgsi_sampler, ssss, tttt, NULL, lodbias, rgba);
+}
 
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1);
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare4(sampler, tx, p);
-               }
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_2d(xw[j], yw[j],
-                                        tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
+
+/* Select the nearest-filter wrap function used for unnormalized texcoords. */
+static wrap_nearest_func
+get_nearest_unorm_wrap(unsigned mode)
+{
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_nearest_unorm_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:   /* NOTE(review): shares the border variant -- confirm intended */
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_nearest_unorm_clamp_to_border;
    default:
       assert(0);
+      return wrap_nearest_unorm_clamp;   /* fallback when assert() does not abort */
    }
 }
 
 
-static INLINE void
-sp_get_samples_1d(const struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_nearest_func
+get_nearest_wrap(unsigned mode)   /* nearest-filter wrap function for normalized texcoords */
 {
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   static const float tzero[4] = {0, 0, 0, 0};
-   sp_get_samples_2d_common(sampler, s, tzero, NULL,
-                            computeLambda, lodbias, rgba, faces);
+   switch (mode) {   /* mode is one of the sampler's wrap_s / wrap_t / wrap_r values */
+   case PIPE_TEX_WRAP_REPEAT:
+      return wrap_nearest_repeat;
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_nearest_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return wrap_nearest_clamp_to_edge;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_nearest_clamp_to_border;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      return wrap_nearest_mirror_repeat;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      return wrap_nearest_mirror_clamp;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return wrap_nearest_mirror_clamp_to_edge;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return wrap_nearest_mirror_clamp_to_border;
+   default:
+      assert(0);
+      return wrap_nearest_repeat;   /* fallback when assert() does not abort */
+   }
 }
 
 
-static INLINE void
-sp_get_samples_2d(const struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_linear_func
+get_linear_unorm_wrap(unsigned mode)   /* linear-filter wrap function for unnormalized texcoords */
 {
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   sp_get_samples_2d_common(sampler, s, t, p,
-                            computeLambda, lodbias, rgba, faces);
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_linear_unorm_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:   /* NOTE(review): shares the border variant -- confirm intended */
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_linear_unorm_clamp_to_border;
+   default:
+      assert(0);
+      return wrap_linear_unorm_clamp;   /* fallback when assert() does not abort */
+   }
 }
 
 
-static INLINE void
-sp_get_samples_3d(const struct tgsi_sampler *tgsi_sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_linear_func
+get_linear_wrap(unsigned mode)   /* linear-filter wrap function for normalized texcoords */
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   /* get/map pipe_surfaces corresponding to 3D tex slices */
-   unsigned level0, level1, j, imgFilter;
-   int width, height, depth;
-   float levelBlend;
-   const uint face = 0;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-   depth = texture->depth[level0];
-
-   assert(width > 0);
-   assert(height > 0);
-   assert(depth > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4], z[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-         nearest_texcoord_4(sampler->wrap_r, p, depth, z);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j);
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               z[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j);
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
-         float xw[4], yw[4], zw[4]; /* interpolation weights */
-         linear_texcoord_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-         linear_texcoord_4(sampler->wrap_r, p, depth,  z0, z1, zw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int c;
-            float tx0[4][4], tx1[4][4];
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3);
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3);
-
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                    tx0[c][0], tx0[c][1],
-                                    tx0[c][2], tx0[c][3],
-                                    tx1[c][0], tx1[c][1],
-                                    tx1[c][2], tx1[c][3]);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               z0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               z1[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3);
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3);
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                        tx0[c][0], tx0[c][1],
-                                        tx0[c][2], tx0[c][3],
-                                        tx1[c][0], tx1[c][1],
-                                        tx1[c][2], tx1[c][3]);
-               }
-
-               /* blend mipmap levels */
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
+   switch (mode) {   /* mode is one of the sampler's wrap_s / wrap_t / wrap_r values */
+   case PIPE_TEX_WRAP_REPEAT:
+      return wrap_linear_repeat;
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_linear_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return wrap_linear_clamp_to_edge;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_linear_clamp_to_border;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      return wrap_linear_mirror_repeat;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      return wrap_linear_mirror_clamp;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return wrap_linear_mirror_clamp_to_edge;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return wrap_linear_mirror_clamp_to_border;
    default:
       assert(0);
+      return wrap_linear_repeat;   /* fallback when assert() does not abort */
    }
 }
 
 
-static void
-sp_get_samples_cube(const struct tgsi_sampler *sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    boolean computeLambda,
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
+static compute_lambda_func
+get_lambda_func(const union sp_sampler_key key)   /* pick the lambda routine from key.bits */
 {
-   unsigned faces[QUAD_SIZE], j;
-   float ssss[4], tttt[4];
-   for (j = 0; j < QUAD_SIZE; j++) {
-      faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j);
+   if (key.bits.processor == TGSI_PROCESSOR_VERTEX)
+      return compute_lambda_vert;
+   /* Any other processor: choose by texture target (cube shares the 2D routine). */
+   switch (key.bits.target) {
+   case PIPE_TEXTURE_1D:
+      return compute_lambda_1d;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:
+      return compute_lambda_2d;
+   case PIPE_TEXTURE_3D:
+      return compute_lambda_3d;
+   default:
+      assert(0);
+      return compute_lambda_1d;   /* fallback when assert() does not abort */
    }
-   sp_get_samples_2d_common(sampler, ssss, tttt, NULL,
-                            computeLambda, lodbias, rgba, faces);
 }
 
 
-static void
-sp_get_samples_rect(const struct tgsi_sampler *tgsi_sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    boolean computeLambda,
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   const uint face = 0;
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   /* texture RECTS cannot be mipmapped */
-   assert(level0 == level1);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
-         }
-      }
+static filter_func
+get_img_filter(const union sp_sampler_key key,
+               unsigned filter,
+               const struct pipe_sampler_state *sampler)
+{  /* Returns the per-image filter routine for key.bits.target and 'filter'. */
+   switch (key.bits.target) {
+   case PIPE_TEXTURE_1D:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_1d_nearest;
+      else
+         return img_filter_1d_linear;
       break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
+   case PIPE_TEXTURE_2D:
+      /* Try for fast path: needs key.bits.is_pot, wrap_s == wrap_t and
+       * normalized coords; only REPEAT and CLAMP (nearest) have one. */
+      if (key.bits.is_pot &&
+          sampler->wrap_s == sampler->wrap_t &&
+          sampler->normalized_coords) 
       {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-         linear_texcoord_unnorm_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
+         switch (sampler->wrap_s) {
+         case PIPE_TEX_WRAP_REPEAT:
+            switch (filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               return img_filter_2d_nearest_repeat_POT;
+            case PIPE_TEX_FILTER_LINEAR:
+               return img_filter_2d_linear_repeat_POT;
+            default:
+               break;
             }
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
+            break;
+         case PIPE_TEX_WRAP_CLAMP:
+            switch (filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               return img_filter_2d_nearest_clamp_POT;
+            default:
+               break;
             }
          }
       }
+      /* Otherwise use default versions; any filter other than NEAREST
+       * (not only LINEAR) gets the linear variant: */
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_2d_nearest;
+      else
+         return img_filter_2d_linear;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_cube_nearest;
+      else
+         return img_filter_cube_linear;
+      break;
+   case PIPE_TEXTURE_3D:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_3d_nearest;
+      else
+         return img_filter_3d_linear;
       break;
    default:
       assert(0);
+      return img_filter_1d_nearest;   /* fallback when assert() does not abort */
    }
 }
 
 
 /**
- * Common code for vertex/fragment program texture sampling.
+ * Bind 'texture' and 'tex_cache' to the varient and refresh the size/level fields derived from them.
  */
-static INLINE void
-sp_get_samples(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               boolean computeLambda,
-               float lodbias,
-               float rgba[NUM_CHANNELS][QUAD_SIZE])
+void
+sp_sampler_varient_bind_texture( struct sp_sampler_varient *samp,
+                                 struct softpipe_tex_tile_cache *tex_cache,
+                                 const struct pipe_texture *texture )
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-
-   if (!texture)
-      return;
+   const struct pipe_sampler_state *sampler = samp->sampler;
 
-   switch (texture->target) {
-   case PIPE_TEXTURE_1D:
-      assert(sampler->normalized_coords);
-      sp_get_samples_1d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_2D:
-      if (sampler->normalized_coords)
-         sp_get_samples_2d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      else
-         sp_get_samples_rect(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_3D:
-      assert(sampler->normalized_coords);
-      sp_get_samples_3d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_CUBE:
-      assert(sampler->normalized_coords);
-      sp_get_samples_cube(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   default:
-      assert(0);
-   }
-
-#if 0 /* DEBUG */
-   {
-      int i;
-      printf("Sampled at %f, %f, %f:\n", s[0], t[0], p[0]);
-      for (i = 0; i < 4; i++) {
-         printf("Frag %d: %f %f %f %f\n", i,
-                rgba[0][i],
-                rgba[1][i],
-                rgba[2][i],
-                rgba[3][i]);
-      }
-   }
-#endif
+   samp->texture = texture;   /* stored by pointer; 'texture' is dereferenced below, so it must not be NULL */
+   samp->cache = tex_cache;
+   samp->xpot = util_unsigned_logbase2( texture->width[0] );   /* presumably log2 of the level-0 width -- confirm helper semantics */
+   samp->ypot = util_unsigned_logbase2( texture->height[0] );   /* presumably log2 of the level-0 height -- confirm helper semantics */
+   samp->level = CLAMP((int) sampler->min_lod, 0, (int) texture->last_level);   /* min_lod cast to int, clamped to [0, last_level] */
 }
 
 
-/**
- * Called via tgsi_sampler::get_samples() when running a fragment shader.
- * Get four filtered RGBA values from the sampler's texture.
- */
 void
-sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler,
-                        const float s[QUAD_SIZE],
-                        const float t[QUAD_SIZE],
-                        const float p[QUAD_SIZE],
-                        float lodbias,
-                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+sp_sampler_varient_destroy( struct sp_sampler_varient *samp )
 {
-   sp_get_samples(tgsi_sampler, s, t, p, TRUE, lodbias, rgba);
+   FREE(samp);   /* releases only the varient itself; the sampler, texture and cache it points to are not touched here */
 }
 
 
 /**
- * Called via tgsi_sampler::get_samples() when running a vertex shader.
- * Get four filtered RGBA values from the sampler's texture.
+ * Create a sampler varient for the given sampler state and key; returns NULL if allocation fails.
  */
-void
-sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler,
-                      const float s[QUAD_SIZE],
-                      const float t[QUAD_SIZE],
-                      const float p[QUAD_SIZE],
-                      float lodbias,
-                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key )
 {
-   sp_get_samples(tgsi_sampler, s, t, p, FALSE, lodbias, rgba);
+   struct sp_sampler_varient *samp = CALLOC_STRUCT(sp_sampler_varient);
+   if (!samp)
+      return NULL;
+   /* 'sampler' is stored by pointer, not copied. */
+   samp->sampler = sampler;
+   samp->key = key;
+
+   /* Note that (for instance) linear_texcoord_s and
+    * nearest_texcoord_s may be active at the same time, if the
+    * sampler min_img_filter differs from its mag_img_filter.
+    */
+   if (sampler->normalized_coords) {
+      samp->linear_texcoord_s = get_linear_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_wrap( sampler->wrap_r );
+   }
+   else {
+      samp->linear_texcoord_s = get_linear_unorm_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_unorm_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_unorm_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_unorm_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_unorm_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_unorm_wrap( sampler->wrap_r );
+   }
+   
+   samp->compute_lambda = get_lambda_func( key );
+
+   samp->min_img_filter = get_img_filter(key, sampler->min_img_filter, sampler);
+   samp->mag_img_filter = get_img_filter(key, sampler->mag_img_filter, sampler);
+   /* The stages are chained back to front: mip_filter, then compare, then get_samples. */
+   switch (sampler->min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NONE:
+      if (sampler->min_img_filter == sampler->mag_img_filter) 
+         samp->mip_filter = samp->min_img_filter;         
+      else
+         samp->mip_filter = mip_filter_none;
+      break;
+
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      samp->mip_filter = mip_filter_nearest;
+      break;
+
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      if (key.bits.is_pot &&
+          sampler->min_img_filter == sampler->mag_img_filter &&
+          sampler->normalized_coords &&
+          sampler->wrap_s == PIPE_TEX_WRAP_REPEAT &&
+          sampler->wrap_t == PIPE_TEX_WRAP_REPEAT &&
+          sampler->min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      {
+         samp->mip_filter = mip_filter_linear_2d_linear_repeat_POT;
+      }
+      else 
+      {
+         samp->mip_filter = mip_filter_linear;
+      }
+      break;
+   }   /* NOTE(review): no default case -- mip_filter is never assigned for any other min_mip_filter value */
+
+   if (sampler->compare_mode != FALSE) {
+      samp->compare = sample_compare;
+   }
+   else {
+      /* Skip compare operation by promoting the mip_filter function
+       * pointer:
+       */
+      samp->compare = samp->mip_filter;
+   }
+   
+   if (key.bits.target == PIPE_TEXTURE_CUBE) {
+      samp->base.get_samples = sample_cube;
+   }
+   else {
+      samp->faces[0] = 0;
+      samp->faces[1] = 0;
+      samp->faces[2] = 0;
+      samp->faces[3] = 0;
+
+      /* Skip cube face determination by promoting the compare
+       * function pointer:
+       */
+      samp->base.get_samples = samp->compare;
+   }
+
+   return samp;
 }
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index 40d8eb2c2a8..b0797711d37 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -31,43 +31,122 @@
 
 #include "tgsi/tgsi_exec.h"
 
+struct sp_sampler_varient;
+/* Nearest-filter wrap callback; presumably maps four texcoords to texel indices for a dimension of 'size' -- confirm. */
+typedef void (*wrap_nearest_func)(const float s[4],
+                                  unsigned size,
+                                  int icoord[4]);
+/* Linear-filter wrap callback; presumably yields two texel indices and a blend weight per texcoord -- confirm. */
+typedef void (*wrap_linear_func)(const float s[4], 
+                                 unsigned size,
+                                 int icoord0[4],
+                                 int icoord1[4],
+                                 float w[4]);
+/* Returns a single float for QUAD_SIZE texcoords plus lodbias; presumably the LOD "lambda" -- confirm. */
+typedef float (*compute_lambda_func)(const struct sp_sampler_varient *sampler,
+                                     const float s[QUAD_SIZE],
+                                     const float t[QUAD_SIZE],
+                                     const float p[QUAD_SIZE],
+                                     float lodbias);
+/* Sampling-stage callback; a value of this type is also assigned to the tgsi_sampler get_samples hook. */
+typedef void (*filter_func)(struct tgsi_sampler *tgsi_sampler,
+                            const float s[QUAD_SIZE],
+                            const float t[QUAD_SIZE],
+                            const float p[QUAD_SIZE],
+                            float lodbias,
+                            float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+/* Key passed to sp_create_sampler_varient(); 'value' aliases the bitfields (3+1+2+4+22 = 32 bits). */
+union sp_sampler_key {
+   struct {
+      unsigned target:3;      /* compared against PIPE_TEXTURE_x values */
+      unsigned is_pot:1;      /* enables the *_POT filter fast paths */
+      unsigned processor:2;   /* compared against TGSI_PROCESSOR_VERTEX */
+      unsigned unit:4;        /* NOTE(review): presumably the sampler unit index -- not read in the visible code */
+      unsigned pad:22;
+   } bits;
+   unsigned value;
+};
 
 /**
  * Subclass of tgsi_sampler
  */
-struct sp_shader_sampler
+struct sp_sampler_varient
 {
    struct tgsi_sampler base;  /**< base class */
 
-   uint unit;
-   struct softpipe_context *sp;
-   struct softpipe_tile_cache *cache;
+   union sp_sampler_key key;
+
+   /* The owner of this struct:
+    */
+   const struct pipe_sampler_state *sampler;
+
+
+   /* Currently bound texture:
+    */
+   const struct pipe_texture *texture;
+   struct softpipe_tex_tile_cache *cache;
+
+   unsigned processor;   /* NOTE(review): not written by sp_create_sampler_varient(); get_lambda_func() reads key.bits.processor */
+
+   /* Cached by sp_sampler_varient_bind_texture(); presumably consumed by
+    * the *_POT filter fast paths -- confirm: */
+   unsigned xpot;
+   unsigned ypot;
+   unsigned level;
+
+   unsigned faces[4];   /* written per fragment by sample_cube(); set to zero for non-cube targets */
+   
+   wrap_nearest_func nearest_texcoord_s;
+   wrap_nearest_func nearest_texcoord_t;
+   wrap_nearest_func nearest_texcoord_p;
+
+   wrap_linear_func linear_texcoord_s;
+   wrap_linear_func linear_texcoord_t;
+   wrap_linear_func linear_texcoord_p;
+
+   filter_func min_img_filter;
+   filter_func mag_img_filter;
+
+   compute_lambda_func compute_lambda;
+
+   filter_func mip_filter;
+   filter_func compare;
+   
+   /* Linked list:
+    */
+   struct sp_sampler_varient *next;
 };
 
+struct sp_sampler;
 
+/* Create a sampler varient for a given set of non-orthogonal state.
+ */
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key );
 
-static INLINE const struct sp_shader_sampler *
-sp_shader_sampler(const struct tgsi_sampler *sampler)
-{
-   return (const struct sp_shader_sampler *) sampler;
-}
+void sp_sampler_varient_bind_texture( struct sp_sampler_varient *varient,
+                                      struct softpipe_tex_tile_cache *tex_cache,
+                                      const struct pipe_texture *tex );
 
+void sp_sampler_varient_destroy( struct sp_sampler_varient * );
 
-extern void
-sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler,
-                        const float s[QUAD_SIZE],
-                        const float t[QUAD_SIZE],
-                        const float p[QUAD_SIZE],
-                        float lodbias,
-                        float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+/* Downcast helper: 'base' (struct tgsi_sampler) is the first member of struct sp_sampler_varient. */
+static INLINE struct sp_sampler_varient *
+sp_sampler_varient(const struct tgsi_sampler *sampler)
+{
+   return (struct sp_sampler_varient *) sampler;   /* note: the cast also drops the const qualifier */
+}
 
 extern void
-sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler,
-                      const float s[QUAD_SIZE],
-                      const float t[QUAD_SIZE],
-                      const float p[QUAD_SIZE],
-                      float lodbias,
-                      float rgba[NUM_CHANNELS][QUAD_SIZE]);
+sp_get_samples(struct tgsi_sampler *tgsi_sampler,   /* NOTE(review): the static sp_get_samples() in sp_tex_sample.c is removed by this patch -- confirm a definition remains */
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE]);
 
 
 #endif /* SP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
new file mode 100644
index 00000000000..407a22a9f4b
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -0,0 +1,273 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture tile caching.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_tile.h"
+#include "sp_context.h"
+#include "sp_surface.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
+
+   
+
+struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_screen *screen )
+{
+   struct softpipe_tex_tile_cache *tc;
+   uint pos;
+
+   tc = CALLOC_STRUCT( softpipe_tex_tile_cache );
+   if (tc) {
+      tc->screen = screen;
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;
+      }
+      tc->last_tile = &tc->entries[0]; /* any tile */
+   }
+   return tc;
+}
+
+
+void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   struct pipe_screen *screen;
+   uint pos;
+
+   for (pos = 0; pos < NUM_ENTRIES; pos++) {
+      /*assert(tc->entries[pos].x < 0);*/
+   }
+   if (tc->transfer) {
+      screen = tc->transfer->texture->screen;
+      screen->tex_transfer_destroy(tc->transfer);
+   }
+   if (tc->tex_trans) {
+      screen = tc->tex_trans->texture->screen;
+      screen->tex_transfer_destroy(tc->tex_trans);
+   }
+
+   FREE( tc );
+}
+
+
+
+
+void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (tc->tex_trans && !tc->tex_trans_map)
+      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
+}
+
+
+void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (tc->tex_trans_map) {
+      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+      tc->tex_trans_map = NULL;
+   }
+}
+
+/**
+ * Invalidate all cached tiles for the cached texture.
+ * Should be called when the texture is modified.
+ */
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc)
+{
+   unsigned i;
+
+   assert(tc);
+   assert(tc->texture);
+
+   for (i = 0; i < NUM_ENTRIES; i++) {
+      tc->entries[i].addr.bits.invalid = 1;
+   }
+}
+
+/**
+ * Specify the texture to cache.
+ */
+void
+sp_tex_tile_cache_set_texture(struct softpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture)
+{
+   uint i;
+
+   assert(!tc->transfer);
+
+   if (tc->texture != texture) {
+      pipe_texture_reference(&tc->texture, texture);
+
+      if (tc->tex_trans) {
+         struct pipe_screen *screen = tc->tex_trans->texture->screen;
+         
+         if (tc->tex_trans_map) {
+            screen->transfer_unmap(screen, tc->tex_trans);
+            tc->tex_trans_map = NULL;
+         }
+
+         screen->tex_transfer_destroy(tc->tex_trans);
+         tc->tex_trans = NULL;
+      }
+
+      /* mark all entries as invalid/empty */
+      /* XXX we should try to avoid this when the teximage hasn't changed */
+      for (i = 0; i < NUM_ENTRIES; i++) {
+         tc->entries[i].addr.bits.invalid = 1;
+      }
+
+      tc->tex_face = -1; /* any invalid value here */
+   }
+}
+
+
+
+
+/**
+ * Flush the texture tile cache: if a texture is bound, mark every cached
+ * tile as invalid so it is re-read on next use.  Nothing is written back.
+ */
+void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   int pos;
+
+   if (tc->texture) {
+      /* caching a texture, mark all entries as empty */
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;
+      }
+      tc->tex_face = -1;
+   }
+
+}
+
+
+/**
+ * Given the texture face, level, zslice, x and y values, compute
+ * the cache entry position/index where we'd hope to find the
+ * cached texture tile.
+ * This is basically a direct-map cache.
+ * XXX There's probably lots of ways in which we can improve this.
+ */
+static INLINE uint
+tex_cache_pos( union tex_tile_address addr )
+{
+   uint entry = (addr.bits.x + 
+                 addr.bits.y * 9 + 
+                 addr.bits.z * 3 + 
+                 addr.bits.face + 
+                 addr.bits.level * 7);
+
+   return entry % NUM_ENTRIES;
+}
+
+/**
+ * Similar to sp_get_cached_tile() but for textures.
+ * Tiles are read-only and indexed with more params.
+ */
+const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                        union tex_tile_address addr )
+{
+   struct pipe_screen *screen = tc->screen;
+   struct softpipe_tex_cached_tile *tile;
+   
+   tile = tc->entries + tex_cache_pos( addr );
+
+   if (addr.value != tile->addr.value) {
+
+      /* cache miss.  Most misses are because we've invalidated the
+       * texture cache previously -- most commonly on binding a new
+       * texture.  Currently we effectively flush the cache on texture
+       * bind.
+       */
+#if 0
+      _debug_printf("miss at %u:  x=%d y=%d z=%d face=%d level=%d\n"
+                    "   tile %u:  x=%d y=%d z=%d face=%d level=%d\n",
+                    pos, x/TILE_SIZE, y/TILE_SIZE, z, face, level,
+                    pos, tile->addr.bits.x, tile->addr.bits.y, tile->z, tile->face, tile->level);
+#endif
+
+      /* check if we need to get a new transfer */
+      if (!tc->tex_trans ||
+          tc->tex_face != addr.bits.face ||
+          tc->tex_level != addr.bits.level ||
+          tc->tex_z != addr.bits.z) {
+         /* get new transfer (view into texture) */
+
+         if (tc->tex_trans) {
+            if (tc->tex_trans_map) {
+               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+               tc->tex_trans_map = NULL;
+            }
+
+            screen->tex_transfer_destroy(tc->tex_trans);
+            tc->tex_trans = NULL;
+         }
+
+         tc->tex_trans = 
+            screen->get_tex_transfer(screen, tc->texture, 
+                                     addr.bits.face, 
+                                     addr.bits.level, 
+                                     addr.bits.z, 
+                                     PIPE_TRANSFER_READ, 0, 0,
+                                     tc->texture->width[addr.bits.level],
+                                     tc->texture->height[addr.bits.level]);
+
+         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
+
+         tc->tex_face = addr.bits.face;
+         tc->tex_level = addr.bits.level;
+         tc->tex_z = addr.bits.z;
+      }
+
+      /* get tile from the transfer (view into texture) */
+      pipe_get_tile_rgba(tc->tex_trans,
+                         addr.bits.x * TILE_SIZE, 
+                         addr.bits.y * TILE_SIZE,
+                         TILE_SIZE, TILE_SIZE,
+                         (float *) tile->data.color);
+      tile->addr = addr;
+   }
+
+   tc->last_tile = tile;
+   return tile;
+}
+
+
+
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
new file mode 100644
index 00000000000..ac6886a3df1
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -0,0 +1,155 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_TEX_TILE_CACHE_H
+#define SP_TEX_TILE_CACHE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+struct softpipe_context;
+struct softpipe_tex_tile_cache;
+
+
+/**
+ * Cache tile size (width and height). This needs to be a power of two.
+ */
+#define TILE_SIZE 64
+
+
+/* If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tex_tile_address {
+   struct {
+      unsigned x:6;             /* 4096 / TILE_SIZE */
+      unsigned y:6;             /* 4096 / TILE_SIZE */
+      unsigned z:12;            /* 4096 -- z not tiled */
+      unsigned face:3;
+      unsigned level:4;
+      unsigned invalid:1;
+   } bits;
+   unsigned value;
+};
+
+
+struct softpipe_tex_cached_tile
+{
+   union tex_tile_address addr;
+   union {
+      float color[TILE_SIZE][TILE_SIZE][4];
+   } data;
+};
+
+#define NUM_ENTRIES 50
+
+struct softpipe_tex_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct pipe_texture *texture;  /**< if caching a texture */
+   unsigned timestamp;
+
+   struct softpipe_tex_cached_tile entries[NUM_ENTRIES];
+
+   struct pipe_transfer *tex_trans;
+   void *tex_trans_map;
+   int tex_face, tex_level, tex_z;
+
+   struct softpipe_tex_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
+extern struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_screen *screen );
+
+extern void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+
+extern void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_set_texture(struct softpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture);
+
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+
+
+extern const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr );
+
+static INLINE union tex_tile_address
+tex_tile_address( unsigned x,
+		  unsigned y,
+		  unsigned z,
+		  unsigned face,
+		  unsigned level )
+{
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   addr.bits.z = z;
+   addr.bits.face = face;
+   addr.bits.level = level;
+      
+   return addr;
+}
+
+/* Quickly retrieve tile if it matches last lookup.
+ */
+static INLINE const struct softpipe_tex_cached_tile *
+sp_get_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr )
+{
+   if (tc->last_tile->addr.value == addr.value)
+      return tc->last_tile;
+
+   return sp_find_cached_tile_tex( tc, addr );
+}
+
+
+
+
+
+#endif /* SP_TEX_TILE_CACHE_H */
+
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 70f09324311..7caf2928b4b 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -30,26 +30,21 @@
   *   Michel Dänzer <michel@tungstengraphics.com>
   */
 
-#include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/internal/p_winsys_screen.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
 #include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
-#include "sp_tile_cache.h"
 #include "sp_screen.h"
 #include "sp_winsys.h"
 
 
-/* Simple, maximally packed layout.
- */
-
-
-/* Conventional allocation path for non-display textures:
+/**
+ * Conventional allocation path for non-display textures:
+ * Use a simple, maximally packed layout.
  */
 static boolean
 softpipe_texture_layout(struct pipe_screen *screen,
@@ -89,6 +84,10 @@ softpipe_texture_layout(struct pipe_screen *screen,
    return spt->buffer != NULL;
 }
 
+
+/**
+ * Texture layout for simple color buffers.
+ */
 static boolean
 softpipe_displaytarget_layout(struct pipe_screen *screen,
                               struct softpipe_texture * spt)
@@ -112,21 +111,25 @@ softpipe_displaytarget_layout(struct pipe_screen *screen,
 }
 
 
-
-
-
+/**
+ * Create new pipe_texture given the template information.
+ */
 static struct pipe_texture *
 softpipe_texture_create(struct pipe_screen *screen,
-                        const struct pipe_texture *templat)
+                        const struct pipe_texture *template)
 {
    struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture);
    if (!spt)
       return NULL;
 
-   spt->base = *templat;
+   spt->base = *template;
    pipe_reference_init(&spt->base.reference, 1);
    spt->base.screen = screen;
 
+   spt->pot = (util_is_power_of_two(template->width[0]) &&
+               util_is_power_of_two(template->height[0]) &&
+               util_is_power_of_two(template->depth[0]));
+
    if (spt->base.tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
                               PIPE_TEXTURE_USAGE_PRIMARY)) {
       if (!softpipe_displaytarget_layout(screen, spt))
@@ -145,6 +148,9 @@ softpipe_texture_create(struct pipe_screen *screen,
 }
 
 
+/**
+ * Create a new pipe_texture which wraps an existing buffer.
+ */
 static struct pipe_texture *
 softpipe_texture_blanket(struct pipe_screen * screen,
                          const struct pipe_texture *base,
@@ -188,6 +194,9 @@ softpipe_texture_destroy(struct pipe_texture *pt)
 }
 
 
+/**
+ * Get a pipe_surface "view" into a texture.
+ */
 static struct pipe_surface *
 softpipe_get_tex_surface(struct pipe_screen *screen,
                          struct pipe_texture *pt,
@@ -222,6 +231,13 @@ softpipe_get_tex_surface(struct pipe_screen *screen,
       if (ps->usage & PIPE_BUFFER_USAGE_GPU_READ)
          ps->usage |= PIPE_BUFFER_USAGE_CPU_READ;
 
+      if (ps->usage & (PIPE_BUFFER_USAGE_CPU_WRITE |
+                       PIPE_BUFFER_USAGE_GPU_WRITE)) {
+         /* Mark the surface as dirty.  The tile cache will look for this. */
+         spt->timestamp++;
+         softpipe_screen(screen)->timestamp++;
+      }
+
       ps->face = face;
       ps->level = level;
       ps->zslice = zslice;
@@ -241,6 +257,9 @@ softpipe_get_tex_surface(struct pipe_screen *screen,
 }
 
 
+/**
+ * Free a pipe_surface which was created with softpipe_get_tex_surface().
+ */
 static void 
 softpipe_tex_surface_destroy(struct pipe_surface *surf)
 {
@@ -254,6 +273,18 @@ softpipe_tex_surface_destroy(struct pipe_surface *surf)
 }
 
 
+/**
+ * Get a pipe_transfer object which is used for moving data in/out of
+ * a texture object.
+ * \param face  one of PIPE_TEX_FACE_x or 0
+ * \param level  texture mipmap level
+ * \param zslice  2D slice of a 3D texture
+ * \param usage  one of PIPE_TRANSFER_READ/WRITE/READ_WRITE
+ * \param x  X position of region to read/write
+ * \param y  Y position of region to read/write
+ * \param width  width of region to read/write
+ * \param height  height of region to read/write
+ */
 static struct pipe_transfer *
 softpipe_get_tex_transfer(struct pipe_screen *screen,
                           struct pipe_texture *texture,
@@ -303,6 +334,10 @@ softpipe_get_tex_transfer(struct pipe_screen *screen,
 }
 
 
+/**
+ * Free a pipe_transfer object which was created with
+ * softpipe_get_tex_transfer().
+ */
 static void 
 softpipe_tex_transfer_destroy(struct pipe_transfer *transfer)
 {
@@ -316,40 +351,33 @@ softpipe_tex_transfer_destroy(struct pipe_transfer *transfer)
 }
 
 
+/**
+ * Create memory mapping for given pipe_transfer object.
+ */
 static void *
 softpipe_transfer_map( struct pipe_screen *screen,
                        struct pipe_transfer *transfer )
 {
    ubyte *map, *xfer_map;
    struct softpipe_texture *spt;
-   unsigned flags = 0;
 
    assert(transfer->texture);
    spt = softpipe_texture(transfer->texture);
 
-   if (transfer->usage != PIPE_TRANSFER_READ) {
-      flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
-   }
-
-   if (transfer->usage != PIPE_TRANSFER_WRITE) {
-      flags |= PIPE_BUFFER_USAGE_CPU_READ;
-   }
-
-   map = pipe_buffer_map(screen, spt->buffer, flags);
+   map = pipe_buffer_map(screen, spt->buffer, pipe_transfer_buffer_flags(transfer));
    if (map == NULL)
       return NULL;
 
    /* May want to different things here depending on read/write nature
     * of the map:
     */
-   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) 
-   {
+   if (transfer->texture && (transfer->usage & PIPE_TRANSFER_WRITE)) {
       /* Do something to notify sharing contexts of a texture change.
        * In softpipe, that would mean flushing the texture cache.
        */
       softpipe_screen(screen)->timestamp++;
    }
-   
+
    xfer_map = map + softpipe_transfer(transfer)->offset +
       transfer->y / transfer->block.height * transfer->stride +
       transfer->x / transfer->block.width * transfer->block.size;
@@ -358,9 +386,12 @@ softpipe_transfer_map( struct pipe_screen *screen,
 }
 
 
+/**
+ * Unmap memory mapping for given pipe_transfer object.
+ */
 static void
 softpipe_transfer_unmap(struct pipe_screen *screen,
-                       struct pipe_transfer *transfer)
+                        struct pipe_transfer *transfer)
 {
    struct softpipe_texture *spt;
 
@@ -369,16 +400,63 @@ softpipe_transfer_unmap(struct pipe_screen *screen,
 
    pipe_buffer_unmap( screen, spt->buffer );
 
-   if (transfer->usage != PIPE_TRANSFER_READ) {
+   if (transfer->usage & PIPE_TRANSFER_WRITE) {
       /* Mark the texture as dirty to expire the tile caches. */
-      spt->modified = TRUE;
+      spt->timestamp++;
    }
 }
 
 
-void
-softpipe_init_texture_funcs(struct softpipe_context *sp)
+static struct pipe_video_surface*
+softpipe_video_surface_create(struct pipe_screen *screen,
+                              enum pipe_video_chroma_format chroma_format,
+                              unsigned width, unsigned height)
+{
+   struct softpipe_video_surface *sp_vsfc;
+   struct pipe_texture template;
+
+   assert(screen);
+   assert(width && height);
+
+   sp_vsfc = CALLOC_STRUCT(softpipe_video_surface);
+   if (!sp_vsfc)
+      return NULL;
+
+   pipe_reference_init(&sp_vsfc->base.reference, 1);
+   sp_vsfc->base.screen = screen;
+   sp_vsfc->base.chroma_format = chroma_format;
+   /*sp_vsfc->base.surface_format = PIPE_VIDEO_SURFACE_FORMAT_VUYA;*/
+   sp_vsfc->base.width = width;
+   sp_vsfc->base.height = height;
+
+   memset(&template, 0, sizeof(struct pipe_texture));
+   template.target = PIPE_TEXTURE_2D;
+   template.format = PIPE_FORMAT_X8R8G8B8_UNORM;
+   template.last_level = 0;
+   /* vl_mpeg12_mc_renderer expects this when it's initialized with pot_buffers=true */
+   template.width[0] = util_next_power_of_two(width);
+   template.height[0] = util_next_power_of_two(height);
+   template.depth[0] = 1;
+   pf_get_block(template.format, &template.block);
+   template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+   sp_vsfc->tex = screen->texture_create(screen, &template);
+   if (!sp_vsfc->tex) {
+      FREE(sp_vsfc);
+      return NULL;
+   }
+
+   return &sp_vsfc->base;
+}
+
+
+static void
+softpipe_video_surface_destroy(struct pipe_video_surface *vsfc)
 {
+   struct softpipe_video_surface *sp_vsfc = softpipe_video_surface(vsfc);
+
+   pipe_texture_reference(&sp_vsfc->tex, NULL);
+   FREE(sp_vsfc);
 }
 
 
@@ -396,15 +474,22 @@ softpipe_init_screen_texture_funcs(struct pipe_screen *screen)
    screen->tex_transfer_destroy = softpipe_tex_transfer_destroy;
    screen->transfer_map = softpipe_transfer_map;
    screen->transfer_unmap = softpipe_transfer_unmap;
+
+   screen->video_surface_create = softpipe_video_surface_create;
+   screen->video_surface_destroy = softpipe_video_surface_destroy;
 }
 
 
+/**
+ * Return pipe_buffer handle and stride for given texture object.
+ * XXX used for???
+ */
 boolean
 softpipe_get_texture_buffer( struct pipe_texture *texture,
                              struct pipe_buffer **buf,
                              unsigned *stride )
 {
-   struct softpipe_texture *tex = (struct softpipe_texture *)texture;
+   struct softpipe_texture *tex = (struct softpipe_texture *) texture;
 
    if (!tex)
       return FALSE;
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index 893aa7d11d8..2ef64e1e7c3 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -30,6 +30,7 @@
 
 
 #include "pipe/p_state.h"
+#include "pipe/p_video_state.h"
 
 
 struct pipe_context;
@@ -48,7 +49,11 @@ struct softpipe_texture
     */
    struct pipe_buffer *buffer;
 
-   boolean modified;
+   /* True if texture images are power-of-two in all dimensions:
+    */
+   boolean pot;
+
+   unsigned timestamp;
 };
 
 struct softpipe_transfer
@@ -58,6 +63,15 @@ struct softpipe_transfer
    unsigned long offset;
 };
 
+struct softpipe_video_surface
+{
+   struct pipe_video_surface base;
+
+   /* The data is held here:
+    */
+   struct pipe_texture *tex;
+};
+
 
 /** cast wrappers */
 static INLINE struct softpipe_texture *
@@ -72,9 +86,12 @@ softpipe_transfer(struct pipe_transfer *pt)
    return (struct softpipe_transfer *) pt;
 }
 
+static INLINE struct softpipe_video_surface *
+softpipe_video_surface(struct pipe_video_surface *pvs)
+{
+   return (struct softpipe_video_surface *) pvs;
+}
 
-extern void
-softpipe_init_texture_funcs( struct softpipe_context *softpipe );
 
 extern void
 softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index b2195ec6b59..65872cecc4f 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 /**
- * Texture tile caching.
+ * Render target tile caching.
  *
  * Author:
  *    Brian Paul
@@ -35,38 +35,8 @@
 #include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_tile.h"
-#include "sp_context.h"
-#include "sp_surface.h"
-#include "sp_texture.h"
 #include "sp_tile_cache.h"
 
-#define NUM_ENTRIES 50
-
-
-/** XXX move these */
-#define MAX_WIDTH 4096
-#define MAX_HEIGHT 4096
-
-
-struct softpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-   struct pipe_texture *texture;  /**< if caching a texture */
-   struct softpipe_cached_tile entries[NUM_ENTRIES];
-   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];
-   float clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-   boolean depth_stencil; /**< Is the surface a depth/stencil format? */
-
-   struct pipe_transfer *tex_trans;
-   void *tex_trans_map;
-   int tex_face, tex_level, tex_z;
-
-   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
-};
 
 
 /**
@@ -76,7 +46,7 @@ struct softpipe_tile_cache
  * a LRU replacement policy.
  */
 #define CACHE_POS(x, y) \
-   (((x) / TILE_SIZE + ((y) / TILE_SIZE) * 5) % NUM_ENTRIES)
+   (((x) + (y) * 5) % NUM_ENTRIES)
 
 
 
@@ -84,12 +54,10 @@ struct softpipe_tile_cache
  * Is the tile at (x,y) in cleared state?
  */
 static INLINE uint
-is_clear_flag_set(const uint *bitvec, int x, int y)
+is_clear_flag_set(const uint *bitvec, union tile_address addr)
 {
    int pos, bit;
-   x /= TILE_SIZE;
-   y /= TILE_SIZE;
-   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
    assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
    bit = bitvec[pos / 32] & (1 << (pos & 31));
    return bit;
@@ -100,12 +68,10 @@ is_clear_flag_set(const uint *bitvec, int x, int y)
  * Mark the tile at (x,y) as not cleared.
  */
 static INLINE void
-clear_clear_flag(uint *bitvec, int x, int y)
+clear_clear_flag(uint *bitvec, union tile_address addr)
 {
    int pos;
-   x /= TILE_SIZE;
-   y /= TILE_SIZE;
-   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
    assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
    bitvec[pos / 32] &= ~(1 << (pos & 31));
 }
@@ -127,9 +93,9 @@ sp_create_tile_cache( struct pipe_screen *screen )
    if (tc) {
       tc->screen = screen;
       for (pos = 0; pos < NUM_ENTRIES; pos++) {
-         tc->entries[pos].x =
-         tc->entries[pos].y = -1;
+         tc->entries[pos].addr.bits.invalid = 1;
       }
+      tc->last_tile = &tc->entries[0]; /* any tile */
 
       /* XXX this code prevents valgrind warnings about use of uninitialized
        * memory in programs that don't clear the surface before rendering.
@@ -158,10 +124,6 @@ sp_destroy_tile_cache(struct softpipe_tile_cache *tc)
       screen = tc->transfer->texture->screen;
       screen->tex_transfer_destroy(tc->transfer);
    }
-   if (tc->tex_trans) {
-      screen = tc->tex_trans->texture->screen;
-      screen->tex_transfer_destroy(tc->tex_trans);
-   }
 
    FREE( tc );
 }
@@ -174,8 +136,6 @@ void
 sp_tile_cache_set_surface(struct softpipe_tile_cache *tc,
                           struct pipe_surface *ps)
 {
-   assert(!tc->texture);
-
    if (tc->transfer) {
       struct pipe_screen *screen = tc->transfer->texture->screen;
 
@@ -227,9 +187,6 @@ sp_tile_cache_map_transfers(struct softpipe_tile_cache *tc)
 {
    if (tc->transfer && !tc->transfer_map)
       tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer);
-
-   if (tc->tex_trans && !tc->tex_trans_map)
-      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
 }
 
 
@@ -240,47 +197,6 @@ sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc)
       tc->screen->transfer_unmap(tc->screen, tc->transfer);
       tc->transfer_map = NULL;
    }
-
-   if (tc->tex_trans_map) {
-      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-      tc->tex_trans_map = NULL;
-   }
-}
-
-
-/**
- * Specify the texture to cache.
- */
-void
-sp_tile_cache_set_texture(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc,
-                          struct pipe_texture *texture)
-{
-   uint i;
-
-   assert(!tc->transfer);
-
-   pipe_texture_reference(&tc->texture, texture);
-
-   if (tc->tex_trans) {
-      struct pipe_screen *screen = tc->tex_trans->texture->screen;
-
-      if (tc->tex_trans_map) {
-         screen->transfer_unmap(screen, tc->tex_trans);
-         tc->tex_trans_map = NULL;
-      }
-
-      screen->tex_transfer_destroy(tc->tex_trans);
-      tc->tex_trans = NULL;
-   }
-
-   /* mark as entries as invalid/empty */
-   /* XXX we should try to avoid this when the teximage hasn't changed */
-   for (i = 0; i < NUM_ENTRIES; i++) {
-      tc->entries[i].x = -1;
-   }
-
-   tc->tex_face = -1; /* any invalid value here */
 }
 
 
@@ -324,7 +240,7 @@ clear_tile(struct softpipe_cached_tile *tile,
 
    switch (pf_get_size(format)) {
    case 1:
-      memset(tile->data.any, 0, TILE_SIZE * TILE_SIZE);
+      memset(tile->data.any, clear_value, TILE_SIZE * TILE_SIZE);
       break;
    case 2:
       if (clear_value == 0) {
@@ -360,8 +276,7 @@ clear_tile(struct softpipe_cached_tile *tile,
  * Actually clear the tiles which were flagged as being in a clear state.
  */
 static void
-sp_tile_cache_flush_clear(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc)
+sp_tile_cache_flush_clear(struct softpipe_tile_cache *tc)
 {
    struct pipe_transfer *pt = tc->transfer;
    const uint w = tc->transfer->width;
@@ -375,13 +290,15 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
    /* push the tile to all positions marked as clear */
    for (y = 0; y < h; y += TILE_SIZE) {
       for (x = 0; x < w; x += TILE_SIZE) {
-         if (is_clear_flag_set(tc->clear_flags, x, y)) {
+         union tile_address addr = tile_address(x, y);
+
+         if (is_clear_flag_set(tc->clear_flags, addr)) {
             pipe_put_tile_raw(pt,
                               x, y, TILE_SIZE, TILE_SIZE,
                               tc->tile.data.color32, 0/*STRIDE*/);
 
             /* do this? */
-            clear_clear_flag(tc->clear_flags, x, y);
+            clear_clear_flag(tc->clear_flags, addr);
 
             numCleared++;
          }
@@ -398,8 +315,7 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
  * any tiles "flagged" as cleared will be "really" cleared.
  */
 void
-sp_flush_tile_cache(struct softpipe_context *softpipe,
-                    struct softpipe_tile_cache *tc)
+sp_flush_tile_cache(struct softpipe_tile_cache *tc)
 {
    struct pipe_transfer *pt = tc->transfer;
    int inuse = 0, pos;
@@ -408,33 +324,30 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
       /* caching a drawing transfer */
       for (pos = 0; pos < NUM_ENTRIES; pos++) {
          struct softpipe_cached_tile *tile = tc->entries + pos;
-         if (tile->x >= 0) {
+         if (!tile->addr.bits.invalid) {
             if (tc->depth_stencil) {
                pipe_put_tile_raw(pt,
-                                 tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                                 tile->addr.bits.x * TILE_SIZE, 
+                                 tile->addr.bits.y * TILE_SIZE, 
+                                 TILE_SIZE, TILE_SIZE,
                                  tile->data.depth32, 0/*STRIDE*/);
             }
             else {
                pipe_put_tile_rgba(pt,
-                                  tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                                  tile->addr.bits.x * TILE_SIZE, 
+                                  tile->addr.bits.y * TILE_SIZE, 
+                                  TILE_SIZE, TILE_SIZE,
                                   (float *) tile->data.color);
             }
-            tile->x = tile->y = -1;  /* mark as empty */
+            tile->addr.bits.invalid = 1;  /* mark as empty */
             inuse++;
          }
       }
 
 #if TILE_CLEAR_OPTIMIZATION
-      sp_tile_cache_flush_clear(&softpipe->pipe, tc);
+      sp_tile_cache_flush_clear(tc);
 #endif
    }
-   else if (tc->texture) {
-      /* caching a texture, mark all entries as empty */
-      for (pos = 0; pos < NUM_ENTRIES; pos++) {
-         tc->entries[pos].x = -1;
-      }
-      tc->tex_face = -1;
-   }
 
 #if 0
    debug_printf("flushed tiles in use: %d\n", inuse);
@@ -447,40 +360,39 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
  * \param x, y  position of tile, in pixels
  */
 struct softpipe_cached_tile *
-sp_get_cached_tile(struct softpipe_context *softpipe,
-                   struct softpipe_tile_cache *tc, int x, int y)
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr )
 {
    struct pipe_transfer *pt = tc->transfer;
-
-   /* tile pos in framebuffer: */
-   const int tile_x = x & ~(TILE_SIZE - 1);
-   const int tile_y = y & ~(TILE_SIZE - 1);
-
+   
    /* cache pos/entry: */
-   const int pos = CACHE_POS(x, y);
+   const int pos = CACHE_POS(addr.bits.x,
+                             addr.bits.y);
    struct softpipe_cached_tile *tile = tc->entries + pos;
 
-   if (tile_x != tile->x ||
-       tile_y != tile->y) {
+   if (addr.value != tile->addr.value) {
 
-      if (tile->x != -1) {
+      if (tile->addr.bits.invalid == 0) {
          /* put dirty tile back in framebuffer */
          if (tc->depth_stencil) {
             pipe_put_tile_raw(pt,
-                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->addr.bits.x * TILE_SIZE,
+                              tile->addr.bits.y * TILE_SIZE,
+                              TILE_SIZE, TILE_SIZE,
                               tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_put_tile_rgba(pt,
-                               tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                               tile->addr.bits.x * TILE_SIZE,
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
                                (float *) tile->data.color);
          }
       }
 
-      tile->x = tile_x;
-      tile->y = tile_y;
+      tile->addr = addr;
 
-      if (is_clear_flag_set(tc->clear_flags, x, y)) {
+      if (is_clear_flag_set(tc->clear_flags, addr)) {
          /* don't get tile from framebuffer, just clear it */
          if (tc->depth_stencil) {
             clear_tile(tile, pt->format, tc->clear_val);
@@ -488,125 +400,33 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
          else {
             clear_tile_rgba(tile, pt->format, tc->clear_color);
          }
-         clear_clear_flag(tc->clear_flags, x, y);
+         clear_clear_flag(tc->clear_flags, addr);
       }
       else {
          /* get new tile data from transfer */
          if (tc->depth_stencil) {
             pipe_get_tile_raw(pt,
-                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->addr.bits.x * TILE_SIZE, 
+                              tile->addr.bits.y * TILE_SIZE, 
+                              TILE_SIZE, TILE_SIZE,
                               tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_get_tile_rgba(pt,
-                               tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                               tile->addr.bits.x * TILE_SIZE, 
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
                                (float *) tile->data.color);
          }
       }
    }
 
+   tc->last_tile = tile;
    return tile;
 }
 
 
-/**
- * Given the texture face, level, zslice, x and y values, compute
- * the cache entry position/index where we'd hope to find the
- * cached texture tile.
- * This is basically a direct-map cache.
- * XXX There's probably lots of ways in which we can improve this.
- */
-static INLINE uint
-tex_cache_pos(int x, int y, int z, int face, int level)
-{
-   uint entry = x + y * 9 + z * 3 + face + level * 7;
-   return entry % NUM_ENTRIES;
-}
-
 
-/**
- * Similar to sp_get_cached_tile() but for textures.
- * Tiles are read-only and indexed with more params.
- */
-const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct softpipe_context *sp,
-                       struct softpipe_tile_cache *tc, int x, int y, int z,
-                       int face, int level)
-{
-   struct pipe_screen *screen = sp->pipe.screen;
-   /* tile pos in framebuffer: */
-   const int tile_x = x & ~(TILE_SIZE - 1);
-   const int tile_y = y & ~(TILE_SIZE - 1);
-   /* cache pos/entry: */
-   const uint pos = tex_cache_pos(x / TILE_SIZE, y / TILE_SIZE, z,
-                                  face, level);
-   struct softpipe_cached_tile *tile = tc->entries + pos;
-
-   if (tc->texture) {
-      struct softpipe_texture *spt = softpipe_texture(tc->texture);
-      if (spt->modified) {
-         /* texture was modified, invalidate all cached tiles */
-         uint p;
-         for (p = 0; p < NUM_ENTRIES; p++) {
-            tile = tc->entries + p;
-            tile->x = -1;
-         }
-         spt->modified = FALSE;
-      }
-   }
-
-   if (tile_x != tile->x ||
-       tile_y != tile->y ||
-       z != tile->z ||
-       face != tile->face ||
-       level != tile->level) {
-      /* cache miss */
-
-#if 0
-      printf("miss at %u  x=%d y=%d z=%d face=%d level=%d\n", pos,
-             x/TILE_SIZE, y/TILE_SIZE, z, face, level);
-#endif
-      /* check if we need to get a new transfer */
-      if (!tc->tex_trans ||
-          tc->tex_face != face ||
-          tc->tex_level != level ||
-          tc->tex_z != z) {
-         /* get new transfer (view into texture) */
-
-         if (tc->tex_trans) {
-            if (tc->tex_trans_map) {
-               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-               tc->tex_trans_map = NULL;
-            }
-
-            screen->tex_transfer_destroy(tc->tex_trans);
-            tc->tex_trans = NULL;
-         }
-
-         tc->tex_trans = screen->get_tex_transfer(screen, tc->texture, face, level, z, 
-                                                  PIPE_TRANSFER_READ, 0, 0,
-                                                  tc->texture->width[level],
-                                                  tc->texture->height[level]);
-         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
-
-         tc->tex_face = face;
-         tc->tex_level = level;
-         tc->tex_z = z;
-      }
-
-      /* get tile from the transfer (view into texture) */
-      pipe_get_tile_rgba(tc->tex_trans,
-                         tile_x, tile_y, TILE_SIZE, TILE_SIZE,
-                         (float *) tile->data.color);
-      tile->x = tile_x;
-      tile->y = tile_y;
-      tile->z = z;
-      tile->face = face;
-      tile->level = level;
-   }
-
-   return tile;
-}
 
 
 /**
@@ -637,6 +457,6 @@ sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
 
    for (pos = 0; pos < NUM_ENTRIES; pos++) {
       struct softpipe_cached_tile *tile = tc->entries + pos;
-      tile->x = tile->y = -1;
+      tile->addr.bits.invalid = 1;
    }
 }
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h
index 8f247d0e580..a12092702a6 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.h
@@ -34,7 +34,6 @@
 #include "pipe/p_compiler.h"
 
 
-struct softpipe_context;
 struct softpipe_tile_cache;
 
 
@@ -44,11 +43,23 @@ struct softpipe_tile_cache;
 #define TILE_SIZE 64
 
 
+/* If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tile_address {
+   struct {
+      unsigned x:6;             /* 4096 / TILE_SIZE */
+      unsigned y:6;             /* 4096 / TILE_SIZE */
+      unsigned invalid:1;
+      unsigned pad:19;
+   } bits;
+   unsigned value;
+};
+
 
 struct softpipe_cached_tile
 {
-   int x, y;           /**< pos of tile in window coords */
-   int z, face, level; /**< Extra texture indexes */
+   union tile_address addr;
    union {
       float color[TILE_SIZE][TILE_SIZE][4];
       uint color32[TILE_SIZE][TILE_SIZE];
@@ -59,6 +70,32 @@ struct softpipe_cached_tile
    } data;
 };
 
+#define NUM_ENTRIES 50
+
+
+/** XXX move these */
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+struct softpipe_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct softpipe_cached_tile entries[NUM_ENTRIES];
+   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];
+   float clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+   boolean depth_stencil; /**< Is the surface a depth/stencil format? */
+
+   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
+
+   struct softpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
 
 extern struct softpipe_tile_cache *
 sp_create_tile_cache( struct pipe_screen *screen );
@@ -80,26 +117,45 @@ extern void
 sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc);
 
 extern void
-sp_tile_cache_set_texture(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc,
-                          struct pipe_texture *texture);
-
-extern void
-sp_flush_tile_cache(struct softpipe_context *softpipe,
-                    struct softpipe_tile_cache *tc);
+sp_flush_tile_cache(struct softpipe_tile_cache *tc);
 
 extern void
 sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
                     uint clearValue);
 
 extern struct softpipe_cached_tile *
-sp_get_cached_tile(struct softpipe_context *softpipe,
-                   struct softpipe_tile_cache *tc, int x, int y);
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr );
+
+
+static INLINE union tile_address
+tile_address( unsigned x,
+              unsigned y )
+{
+   union tile_address addr;
+
+   addr.value = 0;
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+      
+   return addr;
+}
+
+/* Quickly retrieve tile if it matches last lookup.
+ */
+static INLINE struct softpipe_cached_tile *
+sp_get_cached_tile(struct softpipe_tile_cache *tc, 
+                   int x, int y )
+{
+   union tile_address addr = tile_address( x, y );
+
+   if (tc->last_tile->addr.value == addr.value)
+      return tc->last_tile;
+
+   return sp_find_cached_tile( tc, addr );
+}
+
 
-extern const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct softpipe_context *softpipe,
-                       struct softpipe_tile_cache *tc, int x, int y, int z,
-                       int face, int level);
 
 
 #endif /* SP_TILE_CACHE_H */
diff --git a/src/gallium/drivers/softpipe/sp_video_context.c b/src/gallium/drivers/softpipe/sp_video_context.c
new file mode 100644
index 00000000000..cae2d3efc58
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_video_context.c
@@ -0,0 +1,304 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "sp_video_context.h"
+#include <pipe/p_inlines.h>
+#include <util/u_memory.h>
+#include "softpipe/sp_winsys.h"
+#include "softpipe/sp_texture.h"
+
+static void
+sp_mpeg12_destroy(struct pipe_video_context *vpipe)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+	
+   /* Asserted in softpipe_delete_fs_state() for some reason */
+   ctx->pipe->bind_vs_state(ctx->pipe, NULL);
+   ctx->pipe->bind_fs_state(ctx->pipe, NULL);
+
+   ctx->pipe->delete_blend_state(ctx->pipe, ctx->blend);
+   ctx->pipe->delete_rasterizer_state(ctx->pipe, ctx->rast);
+   ctx->pipe->delete_depth_stencil_alpha_state(ctx->pipe, ctx->dsa);
+
+   pipe_video_surface_reference(&ctx->decode_target, NULL);
+   vl_compositor_cleanup(&ctx->compositor);
+   vl_mpeg12_mc_renderer_cleanup(&ctx->mc_renderer);
+   ctx->pipe->destroy(ctx->pipe);
+
+   FREE(ctx);
+}
+
+static void
+sp_mpeg12_decode_macroblocks(struct pipe_video_context *vpipe,
+                             struct pipe_video_surface *past,
+                             struct pipe_video_surface *future,
+                             unsigned num_macroblocks,
+                             struct pipe_macroblock *macroblocks,
+                             struct pipe_fence_handle **fence)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+   struct pipe_mpeg12_macroblock *mpeg12_macroblocks = (struct pipe_mpeg12_macroblock*)macroblocks;
+
+   assert(vpipe);
+   assert(num_macroblocks);
+   assert(macroblocks);
+   assert(macroblocks->codec == PIPE_VIDEO_CODEC_MPEG12);
+   assert(ctx->decode_target);
+
+   vl_mpeg12_mc_renderer_render_macroblocks(&ctx->mc_renderer,
+                                            softpipe_video_surface(ctx->decode_target)->tex,
+                                            past ? softpipe_video_surface(past)->tex : NULL,
+                                            future ? softpipe_video_surface(future)->tex : NULL,
+                                            num_macroblocks, mpeg12_macroblocks, fence);
+}
+
+static void
+sp_mpeg12_clear_surface(struct pipe_video_context *vpipe,
+                        unsigned x, unsigned y,
+                        unsigned width, unsigned height,
+                        unsigned value,
+                        struct pipe_surface *surface)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+   assert(surface);
+
+   ctx->pipe->surface_fill(ctx->pipe, surface, x, y, width, height, value);
+}
+
+static void
+sp_mpeg12_render_picture(struct pipe_video_context     *vpipe,
+                         /*struct pipe_surface         *backround,
+                         struct pipe_video_rect        *backround_area,*/
+                         struct pipe_video_surface     *src_surface,
+                         enum pipe_mpeg12_picture_type picture_type,
+                         /*unsigned                    num_past_surfaces,
+                         struct pipe_video_surface     *past_surfaces,
+                         unsigned                      num_future_surfaces,
+                         struct pipe_video_surface     *future_surfaces,*/
+                         struct pipe_video_rect        *src_area,
+                         struct pipe_surface           *dst_surface,
+                         struct pipe_video_rect        *dst_area,
+                         /*unsigned                      num_layers,
+                         struct pipe_surface           *layers,
+                         struct pipe_video_rect        *layer_src_areas,
+                         struct pipe_video_rect        *layer_dst_areas*/
+                         struct pipe_fence_handle      **fence)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+	
+   assert(vpipe);
+   assert(src_surface);
+   assert(src_area);
+   assert(dst_surface);
+   assert(dst_area);
+	
+   vl_compositor_render(&ctx->compositor, softpipe_video_surface(src_surface)->tex,
+                        picture_type, src_area, dst_surface->texture, dst_area, fence);
+}
+
+static void
+sp_mpeg12_set_decode_target(struct pipe_video_context *vpipe,
+                            struct pipe_video_surface *dt)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+   assert(dt);
+
+   pipe_video_surface_reference(&ctx->decode_target, dt);
+}
+
+static void sp_mpeg12_set_csc_matrix(struct pipe_video_context *vpipe, const float *mat)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+
+   vl_compositor_set_csc_matrix(&ctx->compositor, mat);
+}
+
+/* Create and bind the rasterizer, blend and depth/stencil/alpha state
+ * objects, saving the handles in ctx for later deletion.  Returns true. */
+static bool
+init_pipe_state(struct sp_mpeg12_context *ctx)
+{
+   struct pipe_rasterizer_state rast = {0};
+   struct pipe_blend_state blend = {0};
+   struct pipe_depth_stencil_alpha_state dsa = {0};
+   unsigned i;
+
+   assert(ctx);
+   /* All templates start zeroed, so members not assigned below are defined. */
+   rast.flatshade = 1;
+   rast.flatshade_first = 0;
+   rast.light_twoside = 0;
+   rast.front_winding = PIPE_WINDING_CCW;
+   rast.cull_mode = PIPE_WINDING_CW;
+   rast.fill_cw = PIPE_POLYGON_MODE_FILL;
+   rast.fill_ccw = PIPE_POLYGON_MODE_FILL;
+   rast.offset_cw = 0;
+   rast.offset_ccw = 0;
+   rast.scissor = 0;
+   rast.poly_smooth = 0;
+   rast.poly_stipple_enable = 0;
+   rast.point_sprite = 0;
+   rast.point_size_per_vertex = 0;
+   rast.multisample = 0;
+   rast.line_smooth = 0;
+   rast.line_stipple_enable = 0;
+   rast.line_stipple_factor = 0;
+   rast.line_stipple_pattern = 0;
+   rast.line_last_pixel = 0;
+   rast.bypass_vs_clip_and_viewport = 0;
+   rast.line_width = 1;
+   rast.point_smooth = 0;
+   rast.point_size = 1;
+   rast.offset_units = 1;
+   rast.offset_scale = 1;
+   ctx->rast = ctx->pipe->create_rasterizer_state(ctx->pipe, &rast);
+   ctx->pipe->bind_rasterizer_state(ctx->pipe, ctx->rast);
+
+   blend.blend_enable = 0;
+   blend.rgb_func = PIPE_BLEND_ADD;
+   blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE;
+   blend.rgb_dst_factor = PIPE_BLENDFACTOR_ONE;
+   blend.alpha_func = PIPE_BLEND_ADD;
+   blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE;
+   blend.alpha_dst_factor = PIPE_BLENDFACTOR_ONE;
+   blend.logicop_enable = 0;
+   blend.logicop_func = PIPE_LOGICOP_CLEAR;
+   /* Needed to allow color writes to FB, even if blending disabled */
+   blend.colormask = PIPE_MASK_RGBA;
+   blend.dither = 0;
+   ctx->blend = ctx->pipe->create_blend_state(ctx->pipe, &blend);
+   ctx->pipe->bind_blend_state(ctx->pipe, ctx->blend);
+
+   dsa.depth.enabled = 0;
+   dsa.depth.writemask = 0;
+   dsa.depth.func = PIPE_FUNC_ALWAYS;
+   for (i = 0; i < 2; ++i) {
+      dsa.stencil[i].enabled = 0;
+      dsa.stencil[i].func = PIPE_FUNC_ALWAYS;
+      dsa.stencil[i].fail_op = PIPE_STENCIL_OP_KEEP;
+      dsa.stencil[i].zpass_op = PIPE_STENCIL_OP_KEEP;
+      dsa.stencil[i].zfail_op = PIPE_STENCIL_OP_KEEP;
+      dsa.stencil[i].ref_value = 0;
+      dsa.stencil[i].valuemask = 0;
+      dsa.stencil[i].writemask = 0;
+   }
+   dsa.alpha.enabled = 0;
+   dsa.alpha.func = PIPE_FUNC_ALWAYS;
+   dsa.alpha.ref_value = 0;
+   ctx->dsa = ctx->pipe->create_depth_stencil_alpha_state(ctx->pipe, &dsa);
+   ctx->pipe->bind_depth_stencil_alpha_state(ctx->pipe, ctx->dsa);
+   return true;
+}
+
+static struct pipe_video_context *
+sp_mpeg12_create(struct pipe_screen *screen, enum pipe_video_profile profile,
+                 enum pipe_video_chroma_format chroma_format,
+                 unsigned width, unsigned height)
+{
+   struct sp_mpeg12_context *ctx;
+
+   assert(u_reduce_video_profile(profile) == PIPE_VIDEO_CODEC_MPEG12);
+
+   ctx = CALLOC_STRUCT(sp_mpeg12_context);
+
+   if (!ctx)
+      return NULL;
+
+   ctx->base.profile = profile;
+   ctx->base.chroma_format = chroma_format;
+   ctx->base.width = width;
+   ctx->base.height = height;
+
+   ctx->base.screen = screen;
+   ctx->base.destroy = sp_mpeg12_destroy;
+   ctx->base.decode_macroblocks = sp_mpeg12_decode_macroblocks;
+   ctx->base.clear_surface = sp_mpeg12_clear_surface;
+   ctx->base.render_picture = sp_mpeg12_render_picture;
+   ctx->base.set_decode_target = sp_mpeg12_set_decode_target;
+   ctx->base.set_csc_matrix = sp_mpeg12_set_csc_matrix;
+
+   ctx->pipe = softpipe_create(screen);
+   if (!ctx->pipe) {
+      FREE(ctx);
+      return NULL;
+   }
+
+   /* TODO: Use slice buffering for softpipe when implemented, no advantage to buffering an entire picture */
+   if (!vl_mpeg12_mc_renderer_init(&ctx->mc_renderer, ctx->pipe,
+                                   width, height, chroma_format,
+                                   VL_MPEG12_MC_RENDERER_BUFFER_PICTURE,
+                                   /* TODO: Use XFER_NONE when implemented */
+                                   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE,
+                                   true)) {
+      ctx->pipe->destroy(ctx->pipe);
+      FREE(ctx);
+      return NULL;
+   }
+	
+   if (!vl_compositor_init(&ctx->compositor, ctx->pipe)) {
+      vl_mpeg12_mc_renderer_cleanup(&ctx->mc_renderer);
+      ctx->pipe->destroy(ctx->pipe);
+      FREE(ctx);
+      return NULL;
+   }
+	
+   if (!init_pipe_state(ctx)) {
+      vl_compositor_cleanup(&ctx->compositor);
+      vl_mpeg12_mc_renderer_cleanup(&ctx->mc_renderer);
+      ctx->pipe->destroy(ctx->pipe);
+      FREE(ctx);
+      return NULL;
+   }
+
+   return &ctx->base;
+}
+
+struct pipe_video_context *
+sp_video_create(struct pipe_screen *screen, enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height)
+{
+   assert(screen);
+   assert(width && height);
+
+   switch (u_reduce_video_profile(profile)) {
+      case PIPE_VIDEO_CODEC_MPEG12:
+         return sp_mpeg12_create(screen, profile,
+                                 chroma_format,
+                                 width, height);
+      default:
+         return NULL;
+   }
+}
diff --git a/src/gallium/drivers/i965simple/brw_draw.h b/src/gallium/drivers/softpipe/sp_video_context.h
index 62fe0d5d0ee..ccbd1ffe4c8 100644
--- a/src/gallium/drivers/i965simple/brw_draw.h
+++ b/src/gallium/drivers/softpipe/sp_video_context.h
@@ -1,6 +1,6 @@
- /**************************************************************************
+/**************************************************************************
  * 
- * Copyright 2005 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Younes Manton.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,31 +25,33 @@
  * 
  **************************************************************************/
 
-#ifndef BRW_DRAW_H
-#define BRW_DRAW_H
+#ifndef SP_VIDEO_CONTEXT_H
+#define SP_VIDEO_CONTEXT_H
 
-#include "pipe/p_context.h"
+#include <pipe/p_video_context.h>
+#include <vl/vl_mpeg12_mc_renderer.h>
+#include <vl/vl_compositor.h>
 
-struct brw_context;
+struct pipe_screen;
+struct pipe_context;
+struct pipe_video_surface;
 
+struct sp_mpeg12_context
+{
+   struct pipe_video_context base;
+   struct pipe_context *pipe;
+   struct pipe_video_surface *decode_target;
+   struct vl_mpeg12_mc_renderer mc_renderer;
+   struct vl_compositor compositor;
 
+   void *rast;
+   void *dsa;
+   void *blend;
+};
 
-void brw_init_draw_functions( struct brw_context *brw );
+struct pipe_video_context *
+sp_video_create(struct pipe_screen *screen, enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height);
 
-
-boolean brw_upload_vertices( struct brw_context *brw,
-			       unsigned min_index,
-			       unsigned max_index );
-
-boolean brw_upload_indices(struct brw_context *brw,
-                           const struct pipe_buffer *index_buffer,
-                           int ib_size, int start, int count);
-
-boolean brw_upload_vertex_buffers( struct brw_context *brw );
-boolean brw_upload_vertex_elements( struct brw_context *brw );
-
-unsigned brw_translate_surface_format( unsigned id );
-
-
-
-#endif
+#endif /* SP_VIDEO_CONTEXT_H */
diff --git a/src/gallium/drivers/svga/Makefile b/src/gallium/drivers/svga/Makefile
new file mode 100644
index 00000000000..f3619081875
--- /dev/null
+++ b/src/gallium/drivers/svga/Makefile
@@ -0,0 +1,60 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = svga
+
+C_SOURCES = \
+	svgadump/svga_shader_dump.c \
+	svgadump/svga_shader_op.c \
+	svgadump/svga_dump.c \
+	svga_cmd.c \
+	svga_context.c \
+	svga_draw.c \
+	svga_draw_arrays.c \
+	svga_draw_elements.c \
+	svga_pipe_blend.c \
+	svga_pipe_blit.c \
+	svga_pipe_clear.c \
+	svga_pipe_constants.c \
+	svga_pipe_depthstencil.c \
+	svga_pipe_draw.c \
+	svga_pipe_flush.c \
+	svga_pipe_fs.c \
+	svga_pipe_misc.c \
+	svga_pipe_query.c \
+	svga_pipe_rasterizer.c \
+	svga_pipe_sampler.c \
+	svga_pipe_vertex.c \
+	svga_pipe_vs.c \
+	svga_screen.c \
+	svga_screen_buffer.c \
+	svga_screen_texture.c \
+	svga_screen_cache.c \
+	svga_state.c \
+	svga_state_need_swtnl.c \
+	svga_state_constants.c \
+	svga_state_framebuffer.c \
+	svga_state_rss.c \
+	svga_state_tss.c \
+	svga_state_vdecl.c \
+	svga_state_fs.c \
+	svga_state_vs.c \
+	svga_swtnl_backend.c \
+	svga_swtnl_draw.c \
+	svga_swtnl_state.c \
+	svga_tgsi.c \
+	svga_tgsi_decl_sm20.c \
+	svga_tgsi_decl_sm30.c \
+	svga_tgsi_insn.c
+
+LIBRARY_INCLUDES = \
+	-I$(TOP)/src/gallium/drivers/svga/include
+
+# With linux-debug we get lots of warnings; filter out the bad flags.
+CFLAGS := $(filter-out -pedantic, $(filter-out -ansi, $(CFLAGS)))
+
+LIBRARY_DEFINES = \
+	-std=gnu99 -fvisibility=hidden \
+	-DHAVE_STDINT_H -DHAVE_SYS_TYPES_H
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/svga/SConscript b/src/gallium/drivers/svga/SConscript
new file mode 100644
index 00000000000..737b791ceb0
--- /dev/null
+++ b/src/gallium/drivers/svga/SConscript
@@ -0,0 +1,72 @@
+Import('*')
+
+env = env.Clone()
+
+if env['platform'] in ['linux']:
+	env.Append(CCFLAGS = ['-fvisibility=hidden'])
+
+if env['gcc']:
+	env.Append(CPPDEFINES = [
+		'HAVE_STDINT_H', 
+		'HAVE_SYS_TYPES_H',
+	])
+	
+env.Prepend(CPPPATH = [
+	'include',
+])
+
+env.Append(CPPDEFINES = [
+])
+
+sources = [
+    'svga_cmd.c',
+    'svga_context.c',
+    'svga_draw.c',
+    'svga_draw_arrays.c',
+    'svga_draw_elements.c',
+    'svga_pipe_blend.c',
+    'svga_pipe_blit.c',
+    'svga_pipe_clear.c',
+    'svga_pipe_constants.c',
+    'svga_pipe_depthstencil.c',
+    'svga_pipe_draw.c',
+    'svga_pipe_flush.c',
+    'svga_pipe_fs.c',
+    'svga_pipe_misc.c',
+    'svga_pipe_query.c',
+    'svga_pipe_rasterizer.c',
+    'svga_pipe_sampler.c',
+    'svga_pipe_vertex.c',
+    'svga_pipe_vs.c',
+    'svga_screen.c',
+    'svga_screen_buffer.c',
+    'svga_screen_cache.c',
+    'svga_screen_texture.c',
+    'svga_state.c',
+    'svga_state_constants.c',
+    'svga_state_framebuffer.c',
+    'svga_state_need_swtnl.c',
+    'svga_state_rss.c',
+    'svga_state_tss.c',
+    'svga_state_vdecl.c',
+    'svga_state_fs.c',
+    'svga_state_vs.c',
+    'svga_swtnl_backend.c',
+    'svga_swtnl_draw.c',
+    'svga_swtnl_state.c',
+    'svga_tgsi.c',
+    'svga_tgsi_decl_sm20.c',
+    'svga_tgsi_decl_sm30.c',
+    'svga_tgsi_insn.c',
+    
+    'svgadump/svga_dump.c',
+    'svgadump/svga_shader_dump.c',
+    'svgadump/svga_shader_op.c',
+]
+
+svga = env.ConvenienceLibrary(
+	target = 'svga',
+	source = sources,
+)
+
+Export('svga')
diff --git a/src/gallium/drivers/svga/include/README b/src/gallium/drivers/svga/include/README
new file mode 100644
index 00000000000..a0b8916104e
--- /dev/null
+++ b/src/gallium/drivers/svga/include/README
@@ -0,0 +1,3 @@
+This directory contains the headers from the VMware SVGA Device Developer Kit:
+
+   https://vmware-svga.svn.sourceforge.net/svnroot/vmware-svga/trunk/lib/vmware/
diff --git a/src/gallium/drivers/svga/include/svga3d_caps.h b/src/gallium/drivers/svga/include/svga3d_caps.h
new file mode 100644
index 00000000000..714ce9f45fb
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga3d_caps.h
@@ -0,0 +1,139 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga3d_caps.h --
+ *
+ *       Definitions for SVGA3D hardware capabilities.  Capabilities
+ *       are used to query for optional rendering features during
+ *       driver initialization. The capability data is stored as a very
+ *       basic key/value dictionary within the "FIFO register" memory
+ *       area at the beginning of BAR2.
+ *
+ *       Note that these definitions are only for 3D capabilities.
+ *       The SVGA device also has "device capabilities" and "FIFO
+ *       capabilities", which are non-3D-specific and are stored as
+ *       bitfields rather than key/value pairs.
+ */
+
+#ifndef _SVGA3D_CAPS_H_
+#define _SVGA3D_CAPS_H_
+
+#define SVGA_FIFO_3D_CAPS_SIZE   (SVGA_FIFO_3D_CAPS_LAST - \
+                                  SVGA_FIFO_3D_CAPS + 1)
+
+
+/*
+ * SVGA3dCapsRecordType
+ *
+ *    Record types that can be found in the caps block.
+ *    Related record types are grouped together numerically so that
+ *    SVGA3dCaps_FindRecord() can be applied on a range of record
+ *    types.
+ */
+
+typedef enum {
+   SVGA3DCAPS_RECORD_UNKNOWN        = 0,
+   SVGA3DCAPS_RECORD_DEVCAPS_MIN    = 0x100,
+   SVGA3DCAPS_RECORD_DEVCAPS        = 0x100,
+   SVGA3DCAPS_RECORD_DEVCAPS_MAX    = 0x1ff,
+} SVGA3dCapsRecordType;
+
+
+/*
+ * SVGA3dCapsRecordHeader
+ *
+ *    Header field leading each caps block record. Contains the offset (in
+ *    register words, NOT bytes) to the next caps block record (or the end
+ *    of caps block records which will be a zero word) and the record type
+ *    as defined above.
+ */
+
+typedef
+struct SVGA3dCapsRecordHeader {
+   uint32 length;
+   SVGA3dCapsRecordType type;
+}
+SVGA3dCapsRecordHeader;
+
+
+/*
+ * SVGA3dCapsRecord
+ *
+ *    Caps block record; "data" is a placeholder for the actual data structure
+ *    contained within the record; for example a record containing a FOOBAR
+ *    structure would be of size "sizeof(SVGA3dCapsRecordHeader) +
+ *    sizeof(FOOBAR)".
+ */
+
+typedef
+struct SVGA3dCapsRecord {
+   SVGA3dCapsRecordHeader header;
+   uint32 data[1];
+}
+SVGA3dCapsRecord;
+
+
+typedef uint32 SVGA3dCapPair[2];
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3dCaps_FindRecord
+ *
+ *    Finds the record with the highest-valued type within the given range
+ *    in the caps block.
+ *
+ *    Result: pointer to found record, or NULL if not found.
+ *
+ *----------------------------------------------------------------------
+ */
+
+static INLINE SVGA3dCapsRecord *
+SVGA3dCaps_FindRecord(const uint32 *capsBlock,
+                      SVGA3dCapsRecordType recordTypeMin,
+                      SVGA3dCapsRecordType recordTypeMax)
+{
+   SVGA3dCapsRecord *record, *found = NULL;
+   uint32 offset;
+
+   /*
+    * Search linearly through the caps block records for the specified type.
+    */
+   for (offset = 0; capsBlock[offset] != 0; offset += capsBlock[offset]) {
+      record = (SVGA3dCapsRecord *) (capsBlock + offset);
+      if ((record->header.type >= recordTypeMin) &&
+          (record->header.type <= recordTypeMax) &&
+          (!found || (record->header.type > found->header.type))) {
+         found = record;
+      }
+   }
+
+   return found;
+}
+
+
+#endif // _SVGA3D_CAPS_H_
diff --git a/src/gallium/drivers/svga/include/svga3d_reg.h b/src/gallium/drivers/svga/include/svga3d_reg.h
new file mode 100644
index 00000000000..77cb4533100
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga3d_reg.h
@@ -0,0 +1,1793 @@
+/**********************************************************
+ * Copyright 1998-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga3d_reg.h --
+ *
+ *       SVGA 3D hardware definitions
+ */
+
+#ifndef _SVGA3D_REG_H_
+#define _SVGA3D_REG_H_
+
+#include "svga_reg.h"
+
+
+/*
+ * 3D Hardware Version
+ *
+ *   The hardware version is stored in the SVGA_FIFO_3D_HWVERSION fifo
+ *   register.  It is set by the host and read by the guest.  This lets
+ *   us make new guest drivers which are backwards-compatible with old
+ *   SVGA hardware revisions.  It does not let us support old guest
+ *   drivers.  Good enough for now.
+ *
+ */
+
+#define SVGA3D_MAKE_HWVERSION(major, minor)      (((major) << 16) | ((minor) & 0xFF))
+#define SVGA3D_MAJOR_HWVERSION(version)          ((version) >> 16)
+#define SVGA3D_MINOR_HWVERSION(version)          ((version) & 0xFF)
+
+typedef enum {
+   SVGA3D_HWVERSION_WS5_RC1   = SVGA3D_MAKE_HWVERSION(0, 1),
+   SVGA3D_HWVERSION_WS5_RC2   = SVGA3D_MAKE_HWVERSION(0, 2),
+   SVGA3D_HWVERSION_WS51_RC1  = SVGA3D_MAKE_HWVERSION(0, 3),
+   SVGA3D_HWVERSION_WS6_B1    = SVGA3D_MAKE_HWVERSION(1, 1),
+   SVGA3D_HWVERSION_FUSION_11 = SVGA3D_MAKE_HWVERSION(1, 4),
+   SVGA3D_HWVERSION_WS65_B1   = SVGA3D_MAKE_HWVERSION(2, 0),
+   SVGA3D_HWVERSION_CURRENT   = SVGA3D_HWVERSION_WS65_B1,
+} SVGA3dHardwareVersion;
+
+/*
+ * Generic Types
+ */
+
+typedef uint32 SVGA3dBool; /* 32-bit Bool definition */
+#define SVGA3D_NUM_CLIPPLANES                   6
+#define SVGA3D_MAX_SIMULTANEOUS_RENDER_TARGETS  8
+
+
+/*
+ * Surface formats.
+ *
+ * If you modify this list, be sure to keep GLUtil.c in sync. It
+ * includes the internal format definition of each surface in
+ * GLUtil_ConvertSurfaceFormat, and it contains a table of
+ * human-readable names in GLUtil_GetFormatName.
+ */
+
+typedef enum SVGA3dSurfaceFormat {
+   SVGA3D_FORMAT_INVALID = 0,
+
+   SVGA3D_X8R8G8B8       = 1,
+   SVGA3D_A8R8G8B8       = 2,
+
+   SVGA3D_R5G6B5         = 3,
+   SVGA3D_X1R5G5B5       = 4,
+   SVGA3D_A1R5G5B5       = 5,
+   SVGA3D_A4R4G4B4       = 6,
+
+   SVGA3D_Z_D32          = 7,
+   SVGA3D_Z_D16          = 8,
+   SVGA3D_Z_D24S8        = 9,
+   SVGA3D_Z_D15S1        = 10,
+
+   SVGA3D_LUMINANCE8            = 11,
+   SVGA3D_LUMINANCE4_ALPHA4     = 12,
+   SVGA3D_LUMINANCE16           = 13,
+   SVGA3D_LUMINANCE8_ALPHA8     = 14,
+
+   SVGA3D_DXT1           = 15,
+   SVGA3D_DXT2           = 16,
+   SVGA3D_DXT3           = 17,
+   SVGA3D_DXT4           = 18,
+   SVGA3D_DXT5           = 19,
+
+   SVGA3D_BUMPU8V8       = 20,
+   SVGA3D_BUMPL6V5U5     = 21,
+   SVGA3D_BUMPX8L8V8U8   = 22,
+   SVGA3D_BUMPL8V8U8     = 23,
+
+   SVGA3D_ARGB_S10E5     = 24,   /* 16-bit floating-point ARGB */
+   SVGA3D_ARGB_S23E8     = 25,   /* 32-bit floating-point ARGB */
+
+   SVGA3D_A2R10G10B10    = 26,
+
+   /* signed formats */
+   SVGA3D_V8U8           = 27,
+   SVGA3D_Q8W8V8U8       = 28,
+   SVGA3D_CxV8U8         = 29,
+
+   /* mixed formats */
+   SVGA3D_X8L8V8U8       = 30,
+   SVGA3D_A2W10V10U10    = 31,
+
+   SVGA3D_ALPHA8         = 32,
+
+   /* Single- and dual-component floating point formats */
+   SVGA3D_R_S10E5        = 33,
+   SVGA3D_R_S23E8        = 34,
+   SVGA3D_RG_S10E5       = 35,
+   SVGA3D_RG_S23E8       = 36,
+
+   /*
+    * Any surface can be used as a buffer object, but SVGA3D_BUFFER is
+    * the most efficient format to use when creating new surfaces
+    * expressly for index or vertex data.
+    */
+   SVGA3D_BUFFER         = 37,
+
+   SVGA3D_Z_D24X8        = 38,
+
+   SVGA3D_V16U16         = 39,
+
+   SVGA3D_G16R16         = 40,
+   SVGA3D_A16B16G16R16   = 41,
+
+   /* Packed Video formats */
+   SVGA3D_UYVY           = 42,
+   SVGA3D_YUY2           = 43,
+
+   SVGA3D_FORMAT_MAX
+} SVGA3dSurfaceFormat;
+
+typedef uint32 SVGA3dColor; /* a, r, g, b */
+
+/*
+ * These match the D3DFORMAT_OP definitions used by Direct3D. We need
+ * them so that we can query the host for what the supported surface
+ * operations are (when we're using the D3D backend, in particular),
+ * and so we can send those operations to the guest.
+ */
+typedef enum {
+   SVGA3DFORMAT_OP_TEXTURE                               = 0x00000001,
+   SVGA3DFORMAT_OP_VOLUMETEXTURE                         = 0x00000002,
+   SVGA3DFORMAT_OP_CUBETEXTURE                           = 0x00000004,
+   SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET                = 0x00000008,
+   SVGA3DFORMAT_OP_SAME_FORMAT_RENDERTARGET              = 0x00000010,
+   SVGA3DFORMAT_OP_ZSTENCIL                              = 0x00000040,
+   SVGA3DFORMAT_OP_ZSTENCIL_WITH_ARBITRARY_COLOR_DEPTH   = 0x00000080,
+
+/*
+ * This format can be used as a render target if the current display mode
+ * is the same depth if the alpha channel is ignored. e.g. if the device
+ * can render to A8R8G8B8 when the display mode is X8R8G8B8, then the
+ * format op list entry for A8R8G8B8 should have this cap.
+ */
+   SVGA3DFORMAT_OP_SAME_FORMAT_UP_TO_ALPHA_RENDERTARGET  = 0x00000100,
+
+/*
+ * This format contains DirectDraw support (including Flip).  This flag
+ * should not be set on alpha formats.
+ */
+   SVGA3DFORMAT_OP_DISPLAYMODE                           = 0x00000400,
+
+/*
+ * The rasterizer can support some level of Direct3D support in this format
+ * and implies that the driver can create a Context in this mode (for some
+ * render target format).  When this flag is set, the SVGA3DFORMAT_OP_DISPLAYMODE
+ * flag must also be set.
+ */
+   SVGA3DFORMAT_OP_3DACCELERATION                        = 0x00000800,
+
+/*
+ * This is set for a private format when the driver has put the bpp in
+ * the structure.
+ */
+   SVGA3DFORMAT_OP_PIXELSIZE                             = 0x00001000,
+
+/*
+ * Indicates that this format can be converted to any RGB format for which
+ * SVGA3DFORMAT_OP_MEMBEROFGROUP_ARGB is specified
+ */
+   SVGA3DFORMAT_OP_CONVERT_TO_ARGB                       = 0x00002000,
+
+/*
+ * Indicates that this format can be used to create offscreen plain surfaces.
+ */
+   SVGA3DFORMAT_OP_OFFSCREENPLAIN                        = 0x00004000,
+
+/*
+ * Indicates that this format can be read as an SRGB texture (meaning that the
+ * sampler will linearize the looked up data)
+ */
+   SVGA3DFORMAT_OP_SRGBREAD                              = 0x00008000,
+
+/*
+ * Indicates that this format can be used in the bumpmap instructions
+ */
+   SVGA3DFORMAT_OP_BUMPMAP                               = 0x00010000,
+
+/*
+ * Indicates that this format can be sampled by the displacement map sampler
+ */
+   SVGA3DFORMAT_OP_DMAP                                  = 0x00020000,
+
+/*
+ * Indicates that this format cannot be used with texture filtering
+ */
+   SVGA3DFORMAT_OP_NOFILTER                              = 0x00040000,
+
+/*
+ * Indicates that format conversions are supported to this RGB format if
+ * SVGA3DFORMAT_OP_CONVERT_TO_ARGB is specified in the source format.
+ */
+   SVGA3DFORMAT_OP_MEMBEROFGROUP_ARGB                    = 0x00080000,
+
+/*
+ * Indicates that this format can be written as an SRGB target (meaning that the
+ * pixel pipe will DE-linearize data on output to format)
+ */
+   SVGA3DFORMAT_OP_SRGBWRITE                             = 0x00100000,
+
+/*
+ * Indicates that this format cannot be used with alpha blending
+ */
+   SVGA3DFORMAT_OP_NOALPHABLEND                          = 0x00200000,
+
+/*
+ * Indicates that the device can auto-generate sublevels for resources
+ * of this format
+ */
+   SVGA3DFORMAT_OP_AUTOGENMIPMAP                         = 0x00400000,
+
+/*
+ * Indicates that this format can be used by vertex texture sampler
+ */
+   SVGA3DFORMAT_OP_VERTEXTEXTURE                         = 0x00800000,
+
+/*
+ * Indicates that this format supports neither texture coordinate wrap
+ * modes, nor mipmapping
+ */
+   SVGA3DFORMAT_OP_NOTEXCOORDWRAPNORMIP                  = 0x01000000
+} SVGA3dFormatOp;
+
+/*
+ * This structure is a conversion of SVGA3DFORMAT_OP_*.
+ * Entries must be located at the same position.
+ */
+typedef union {
+   uint32 value;                          /* all flags viewed as one 32-bit word */
+   struct {                               /* masks below assume LSB-first bit-field allocation -- TODO confirm per compiler */
+      uint32 texture : 1;                 /* 0x00000001 SVGA3DFORMAT_OP_TEXTURE */
+      uint32 volumeTexture : 1;           /* 0x00000002 SVGA3DFORMAT_OP_VOLUMETEXTURE */
+      uint32 cubeTexture : 1;             /* 0x00000004 SVGA3DFORMAT_OP_CUBETEXTURE */
+      uint32 offscreenRenderTarget : 1;   /* 0x00000008 SVGA3DFORMAT_OP_OFFSCREEN_RENDERTARGET */
+      uint32 sameFormatRenderTarget : 1;  /* 0x00000010 SVGA3DFORMAT_OP_SAME_FORMAT_RENDERTARGET */
+      uint32 unknown1 : 1;                /* 0x00000020 placeholder: no SVGA3DFORMAT_OP_* value defined */
+      uint32 zStencil : 1;                /* 0x00000040 SVGA3DFORMAT_OP_ZSTENCIL */
+      uint32 zStencilArbitraryDepth : 1;  /* 0x00000080 SVGA3DFORMAT_OP_ZSTENCIL_WITH_ARBITRARY_COLOR_DEPTH */
+      uint32 sameFormatUpToAlpha : 1;     /* 0x00000100 SVGA3DFORMAT_OP_SAME_FORMAT_UP_TO_ALPHA_RENDERTARGET */
+      uint32 unknown2 : 1;                /* 0x00000200 placeholder: no SVGA3DFORMAT_OP_* value defined */
+      uint32 displayMode : 1;             /* 0x00000400 SVGA3DFORMAT_OP_DISPLAYMODE */
+      uint32 acceleration3d : 1;          /* 0x00000800 SVGA3DFORMAT_OP_3DACCELERATION */
+      uint32 pixelSize : 1;               /* 0x00001000 SVGA3DFORMAT_OP_PIXELSIZE */
+      uint32 convertToARGB : 1;           /* 0x00002000 SVGA3DFORMAT_OP_CONVERT_TO_ARGB */
+      uint32 offscreenPlain : 1;          /* 0x00004000 SVGA3DFORMAT_OP_OFFSCREENPLAIN */
+      uint32 sRGBRead : 1;                /* 0x00008000 SVGA3DFORMAT_OP_SRGBREAD */
+      uint32 bumpMap : 1;                 /* 0x00010000 SVGA3DFORMAT_OP_BUMPMAP */
+      uint32 dmap : 1;                    /* 0x00020000 SVGA3DFORMAT_OP_DMAP */
+      uint32 noFilter : 1;                /* 0x00040000 SVGA3DFORMAT_OP_NOFILTER */
+      uint32 memberOfGroupARGB : 1;       /* 0x00080000 SVGA3DFORMAT_OP_MEMBEROFGROUP_ARGB */
+      uint32 sRGBWrite : 1;               /* 0x00100000 SVGA3DFORMAT_OP_SRGBWRITE */
+      uint32 noAlphaBlend : 1;            /* 0x00200000 SVGA3DFORMAT_OP_NOALPHABLEND */
+      uint32 autoGenMipMap : 1;           /* 0x00400000 SVGA3DFORMAT_OP_AUTOGENMIPMAP */
+      uint32 vertexTexture : 1;           /* 0x00800000 SVGA3DFORMAT_OP_VERTEXTEXTURE */
+      uint32 noTexCoordWrapNorMip : 1;    /* 0x01000000 SVGA3DFORMAT_OP_NOTEXCOORDWRAPNORMIP */
+   };
+} SVGA3dSurfaceFormatCaps;
+
+/*
+ * SVGA_3D_CMD_SETRENDERSTATE Types.  All value types
+ * must fit in a uint32.
+ */
+
+typedef enum {
+   SVGA3D_RS_INVALID                   = 0,
+   SVGA3D_RS_ZENABLE                   = 1,     /* SVGA3dBool */
+   SVGA3D_RS_ZWRITEENABLE              = 2,     /* SVGA3dBool */
+   SVGA3D_RS_ALPHATESTENABLE           = 3,     /* SVGA3dBool */
+   SVGA3D_RS_DITHERENABLE              = 4,     /* SVGA3dBool */
+   SVGA3D_RS_BLENDENABLE               = 5,     /* SVGA3dBool */
+   SVGA3D_RS_FOGENABLE                 = 6,     /* SVGA3dBool */
+   SVGA3D_RS_SPECULARENABLE            = 7,     /* SVGA3dBool */
+   SVGA3D_RS_STENCILENABLE             = 8,     /* SVGA3dBool */
+   SVGA3D_RS_LIGHTINGENABLE            = 9,     /* SVGA3dBool */
+   SVGA3D_RS_NORMALIZENORMALS          = 10,    /* SVGA3dBool */
+   SVGA3D_RS_POINTSPRITEENABLE         = 11,    /* SVGA3dBool */
+   SVGA3D_RS_POINTSCALEENABLE          = 12,    /* SVGA3dBool */
+   SVGA3D_RS_STENCILREF                = 13,    /* uint32 */
+   SVGA3D_RS_STENCILMASK               = 14,    /* uint32 */
+   SVGA3D_RS_STENCILWRITEMASK          = 15,    /* uint32 */
+   SVGA3D_RS_FOGSTART                  = 16,    /* float */
+   SVGA3D_RS_FOGEND                    = 17,    /* float */
+   SVGA3D_RS_FOGDENSITY                = 18,    /* float */
+   SVGA3D_RS_POINTSIZE                 = 19,    /* float */
+   SVGA3D_RS_POINTSIZEMIN              = 20,    /* float */
+   SVGA3D_RS_POINTSIZEMAX              = 21,    /* float */
+   SVGA3D_RS_POINTSCALE_A              = 22,    /* float */
+   SVGA3D_RS_POINTSCALE_B              = 23,    /* float */
+   SVGA3D_RS_POINTSCALE_C              = 24,    /* float */
+   SVGA3D_RS_FOGCOLOR                  = 25,    /* SVGA3dColor */
+   SVGA3D_RS_AMBIENT                   = 26,    /* SVGA3dColor */
+   SVGA3D_RS_CLIPPLANEENABLE           = 27,    /* SVGA3dClipPlanes */
+   SVGA3D_RS_FOGMODE                   = 28,    /* SVGA3dFogMode */
+   SVGA3D_RS_FILLMODE                  = 29,    /* SVGA3dFillMode */
+   SVGA3D_RS_SHADEMODE                 = 30,    /* SVGA3dShadeMode */
+   SVGA3D_RS_LINEPATTERN               = 31,    /* SVGA3dLinePattern */
+   SVGA3D_RS_SRCBLEND                  = 32,    /* SVGA3dBlendOp */
+   SVGA3D_RS_DSTBLEND                  = 33,    /* SVGA3dBlendOp */
+   SVGA3D_RS_BLENDEQUATION             = 34,    /* SVGA3dBlendEquation */
+   SVGA3D_RS_CULLMODE                  = 35,    /* SVGA3dFace */
+   SVGA3D_RS_ZFUNC                     = 36,    /* SVGA3dCmpFunc */
+   SVGA3D_RS_ALPHAFUNC                 = 37,    /* SVGA3dCmpFunc */
+   SVGA3D_RS_STENCILFUNC               = 38,    /* SVGA3dCmpFunc */
+   SVGA3D_RS_STENCILFAIL               = 39,    /* SVGA3dStencilOp */
+   SVGA3D_RS_STENCILZFAIL              = 40,    /* SVGA3dStencilOp */
+   SVGA3D_RS_STENCILPASS               = 41,    /* SVGA3dStencilOp */
+   SVGA3D_RS_ALPHAREF                  = 42,    /* float (0.0 .. 1.0) */
+   SVGA3D_RS_FRONTWINDING              = 43,    /* SVGA3dFrontWinding */
+   SVGA3D_RS_COORDINATETYPE            = 44,    /* SVGA3dCoordinateType */
+   SVGA3D_RS_ZBIAS                     = 45,    /* float */
+   SVGA3D_RS_RANGEFOGENABLE            = 46,    /* SVGA3dBool */
+   SVGA3D_RS_COLORWRITEENABLE          = 47,    /* SVGA3dColorMask */
+   SVGA3D_RS_VERTEXMATERIALENABLE      = 48,    /* SVGA3dBool */
+   SVGA3D_RS_DIFFUSEMATERIALSOURCE     = 49,    /* SVGA3dVertexMaterial */
+   SVGA3D_RS_SPECULARMATERIALSOURCE    = 50,    /* SVGA3dVertexMaterial */
+   SVGA3D_RS_AMBIENTMATERIALSOURCE     = 51,    /* SVGA3dVertexMaterial */
+   SVGA3D_RS_EMISSIVEMATERIALSOURCE    = 52,    /* SVGA3dVertexMaterial */
+   SVGA3D_RS_TEXTUREFACTOR             = 53,    /* SVGA3dColor */
+   SVGA3D_RS_LOCALVIEWER               = 54,    /* SVGA3dBool */
+   SVGA3D_RS_SCISSORTESTENABLE         = 55,    /* SVGA3dBool */
+   SVGA3D_RS_BLENDCOLOR                = 56,    /* SVGA3dColor */
+   SVGA3D_RS_STENCILENABLE2SIDED       = 57,    /* SVGA3dBool */
+   SVGA3D_RS_CCWSTENCILFUNC            = 58,    /* SVGA3dCmpFunc */
+   SVGA3D_RS_CCWSTENCILFAIL            = 59,    /* SVGA3dStencilOp */
+   SVGA3D_RS_CCWSTENCILZFAIL           = 60,    /* SVGA3dStencilOp */
+   SVGA3D_RS_CCWSTENCILPASS            = 61,    /* SVGA3dStencilOp */
+   SVGA3D_RS_VERTEXBLEND               = 62,    /* SVGA3dVertexBlendFlags */
+   SVGA3D_RS_SLOPESCALEDEPTHBIAS       = 63,    /* float */
+   SVGA3D_RS_DEPTHBIAS                 = 64,    /* float */
+
+
+   /*
+    * Output Gamma Level
+    *
+    * Output gamma affects the gamma curve of colors that are output from the
+    * rendering pipeline.  A value of 1.0 specifies a linear color space. If the
+    * value is <= 0.0, gamma correction is ignored and linear color space is
+    * used.
+    */
+
+   SVGA3D_RS_OUTPUTGAMMA               = 65,    /* float */
+   SVGA3D_RS_ZVISIBLE                  = 66,    /* SVGA3dBool */
+   SVGA3D_RS_LASTPIXEL                 = 67,    /* SVGA3dBool */
+   SVGA3D_RS_CLIPPING                  = 68,    /* SVGA3dBool */
+   SVGA3D_RS_WRAP0                     = 69,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP1                     = 70,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP2                     = 71,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP3                     = 72,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP4                     = 73,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP5                     = 74,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP6                     = 75,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP7                     = 76,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP8                     = 77,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP9                     = 78,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP10                    = 79,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP11                    = 80,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP12                    = 81,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP13                    = 82,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP14                    = 83,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_WRAP15                    = 84,    /* SVGA3dWrapFlags */
+   SVGA3D_RS_MULTISAMPLEANTIALIAS      = 85,    /* SVGA3dBool */
+   SVGA3D_RS_MULTISAMPLEMASK           = 86,    /* uint32 */
+   SVGA3D_RS_INDEXEDVERTEXBLENDENABLE  = 87,    /* SVGA3dBool */
+   SVGA3D_RS_TWEENFACTOR               = 88,    /* float */
+   SVGA3D_RS_ANTIALIASEDLINEENABLE     = 89,    /* SVGA3dBool */
+   SVGA3D_RS_COLORWRITEENABLE1         = 90,    /* SVGA3dColorMask */
+   SVGA3D_RS_COLORWRITEENABLE2         = 91,    /* SVGA3dColorMask */
+   SVGA3D_RS_COLORWRITEENABLE3         = 92,    /* SVGA3dColorMask */
+   SVGA3D_RS_SEPARATEALPHABLENDENABLE  = 93,    /* SVGA3dBool */
+   SVGA3D_RS_SRCBLENDALPHA             = 94,    /* SVGA3dBlendOp */
+   SVGA3D_RS_DSTBLENDALPHA             = 95,    /* SVGA3dBlendOp */
+   SVGA3D_RS_BLENDEQUATIONALPHA        = 96,    /* SVGA3dBlendEquation */
+   SVGA3D_RS_MAX
+} SVGA3dRenderStateName;
+
+typedef enum {
+   SVGA3D_VERTEXMATERIAL_NONE     = 0,    /* Use the value in the current material */
+   SVGA3D_VERTEXMATERIAL_DIFFUSE  = 1,    /* Use the value in the diffuse component */
+   SVGA3D_VERTEXMATERIAL_SPECULAR = 2,    /* Use the value in the specular component */
+} SVGA3dVertexMaterial;
+
+typedef enum {
+   SVGA3D_FILLMODE_INVALID = 0,
+   SVGA3D_FILLMODE_POINT   = 1,
+   SVGA3D_FILLMODE_LINE    = 2,
+   SVGA3D_FILLMODE_FILL    = 3,
+   SVGA3D_FILLMODE_MAX
+} SVGA3dFillModeType;
+
+
+typedef
+union {
+   struct {
+      uint16   mode;       /* SVGA3dFillModeType */
+      uint16   face;       /* SVGA3dFace */
+   };
+   uint32 uintValue;
+} SVGA3dFillMode;
+
+typedef enum {
+   SVGA3D_SHADEMODE_INVALID = 0,
+   SVGA3D_SHADEMODE_FLAT    = 1,
+   SVGA3D_SHADEMODE_SMOOTH  = 2,
+   SVGA3D_SHADEMODE_PHONG   = 3,     /* Not supported */
+   SVGA3D_SHADEMODE_MAX
+} SVGA3dShadeMode;
+
+typedef
+union {
+   struct {
+      uint16 repeat;
+      uint16 pattern;
+   };
+   uint32 uintValue;
+} SVGA3dLinePattern;
+
+typedef enum {
+   SVGA3D_BLENDOP_INVALID            = 0,
+   SVGA3D_BLENDOP_ZERO               = 1,
+   SVGA3D_BLENDOP_ONE                = 2,
+   SVGA3D_BLENDOP_SRCCOLOR           = 3,
+   SVGA3D_BLENDOP_INVSRCCOLOR        = 4,
+   SVGA3D_BLENDOP_SRCALPHA           = 5,
+   SVGA3D_BLENDOP_INVSRCALPHA        = 6,
+   SVGA3D_BLENDOP_DESTALPHA          = 7,
+   SVGA3D_BLENDOP_INVDESTALPHA       = 8,
+   SVGA3D_BLENDOP_DESTCOLOR          = 9,
+   SVGA3D_BLENDOP_INVDESTCOLOR       = 10,
+   SVGA3D_BLENDOP_SRCALPHASAT        = 11,
+   SVGA3D_BLENDOP_BLENDFACTOR        = 12,
+   SVGA3D_BLENDOP_INVBLENDFACTOR     = 13,
+   SVGA3D_BLENDOP_MAX
+} SVGA3dBlendOp;
+
+typedef enum {
+   SVGA3D_BLENDEQ_INVALID            = 0,
+   SVGA3D_BLENDEQ_ADD                = 1,
+   SVGA3D_BLENDEQ_SUBTRACT           = 2,
+   SVGA3D_BLENDEQ_REVSUBTRACT        = 3,
+   SVGA3D_BLENDEQ_MINIMUM            = 4,
+   SVGA3D_BLENDEQ_MAXIMUM            = 5,
+   SVGA3D_BLENDEQ_MAX
+} SVGA3dBlendEquation;
+
+typedef enum {
+   SVGA3D_FRONTWINDING_INVALID = 0,
+   SVGA3D_FRONTWINDING_CW      = 1,
+   SVGA3D_FRONTWINDING_CCW     = 2,
+   SVGA3D_FRONTWINDING_MAX
+} SVGA3dFrontWinding;
+
+typedef enum {
+   SVGA3D_FACE_INVALID  = 0,
+   SVGA3D_FACE_NONE     = 1,
+   SVGA3D_FACE_FRONT    = 2,
+   SVGA3D_FACE_BACK     = 3,
+   SVGA3D_FACE_FRONT_BACK = 4,
+   SVGA3D_FACE_MAX
+} SVGA3dFace;
+
+/*
+ * The order and the values should not be changed
+ */
+
+typedef enum {
+   SVGA3D_CMP_INVALID              = 0,
+   SVGA3D_CMP_NEVER                = 1,
+   SVGA3D_CMP_LESS                 = 2,
+   SVGA3D_CMP_EQUAL                = 3,
+   SVGA3D_CMP_LESSEQUAL            = 4,
+   SVGA3D_CMP_GREATER              = 5,
+   SVGA3D_CMP_NOTEQUAL             = 6,
+   SVGA3D_CMP_GREATEREQUAL         = 7,
+   SVGA3D_CMP_ALWAYS               = 8,
+   SVGA3D_CMP_MAX
+} SVGA3dCmpFunc;
+
+/*
+ * SVGA3D_FOGFUNC_* specifies the fog equation, or PER_VERTEX which allows
+ * the fog factor to be specified in the alpha component of the specular
+ * (a.k.a. secondary) vertex color.
+ */
+typedef enum {
+   SVGA3D_FOGFUNC_INVALID          = 0,
+   SVGA3D_FOGFUNC_EXP              = 1,
+   SVGA3D_FOGFUNC_EXP2             = 2,
+   SVGA3D_FOGFUNC_LINEAR           = 3,
+   SVGA3D_FOGFUNC_PER_VERTEX       = 4
+} SVGA3dFogFunction;
+
+/*
+ * SVGA3D_FOGTYPE_* specifies if fog factors are computed on a per-vertex
+ * or per-pixel basis.
+ */
+typedef enum {
+   SVGA3D_FOGTYPE_INVALID          = 0,
+   SVGA3D_FOGTYPE_VERTEX           = 1,
+   SVGA3D_FOGTYPE_PIXEL            = 2,
+   SVGA3D_FOGTYPE_MAX              = 3
+} SVGA3dFogType;
+
+/*
+ * SVGA3D_FOGBASE_* selects depth or range-based fog. Depth-based fog is
+ * computed using the eye Z value of each pixel (or vertex), whereas range-
+ * based fog is computed using the actual distance (range) to the eye.
+ */
+typedef enum {
+   SVGA3D_FOGBASE_INVALID          = 0,
+   SVGA3D_FOGBASE_DEPTHBASED       = 1,
+   SVGA3D_FOGBASE_RANGEBASED       = 2,
+   SVGA3D_FOGBASE_MAX              = 3
+} SVGA3dFogBase;
+
+typedef enum {
+   SVGA3D_STENCILOP_INVALID        = 0,
+   SVGA3D_STENCILOP_KEEP           = 1,
+   SVGA3D_STENCILOP_ZERO           = 2,
+   SVGA3D_STENCILOP_REPLACE        = 3,
+   SVGA3D_STENCILOP_INCRSAT        = 4,
+   SVGA3D_STENCILOP_DECRSAT        = 5,
+   SVGA3D_STENCILOP_INVERT         = 6,
+   SVGA3D_STENCILOP_INCR           = 7,
+   SVGA3D_STENCILOP_DECR           = 8,
+   SVGA3D_STENCILOP_MAX
+} SVGA3dStencilOp;
+
+typedef enum {
+   SVGA3D_CLIPPLANE_0              = (1 << 0),
+   SVGA3D_CLIPPLANE_1              = (1 << 1),
+   SVGA3D_CLIPPLANE_2              = (1 << 2),
+   SVGA3D_CLIPPLANE_3              = (1 << 3),
+   SVGA3D_CLIPPLANE_4              = (1 << 4),
+   SVGA3D_CLIPPLANE_5              = (1 << 5),
+} SVGA3dClipPlanes;
+
+typedef enum {
+   SVGA3D_CLEAR_COLOR              = 0x1,
+   SVGA3D_CLEAR_DEPTH              = 0x2,
+   SVGA3D_CLEAR_STENCIL            = 0x4
+} SVGA3dClearFlag;
+
+typedef enum {
+   SVGA3D_RT_DEPTH                 = 0,
+   SVGA3D_RT_STENCIL               = 1,
+   SVGA3D_RT_COLOR0                = 2,
+   SVGA3D_RT_COLOR1                = 3,
+   SVGA3D_RT_COLOR2                = 4,
+   SVGA3D_RT_COLOR3                = 5,
+   SVGA3D_RT_COLOR4                = 6,
+   SVGA3D_RT_COLOR5                = 7,
+   SVGA3D_RT_COLOR6                = 8,
+   SVGA3D_RT_COLOR7                = 9,
+   SVGA3D_RT_MAX,
+   SVGA3D_RT_INVALID               = ((uint32)-1),
+} SVGA3dRenderTargetType;
+
+#define SVGA3D_MAX_RT_COLOR (SVGA3D_RT_COLOR7 - SVGA3D_RT_COLOR0 + 1)
+
+typedef
+union {
+   struct {
+      uint32  red   : 1;
+      uint32  green : 1;
+      uint32  blue  : 1;
+      uint32  alpha : 1;
+   };
+   uint32 uintValue;
+} SVGA3dColorMask;
+
+typedef enum {
+   SVGA3D_VBLEND_DISABLE            = 0,
+   SVGA3D_VBLEND_1WEIGHT            = 1,
+   SVGA3D_VBLEND_2WEIGHT            = 2,
+   SVGA3D_VBLEND_3WEIGHT            = 3,
+} SVGA3dVertexBlendFlags;
+
+typedef enum {
+   SVGA3D_WRAPCOORD_0   = 1 << 0,
+   SVGA3D_WRAPCOORD_1   = 1 << 1,
+   SVGA3D_WRAPCOORD_2   = 1 << 2,
+   SVGA3D_WRAPCOORD_3   = 1 << 3,
+   SVGA3D_WRAPCOORD_ALL = 0xF,
+} SVGA3dWrapFlags;
+
+/*
+ * SVGA_3D_CMD_TEXTURESTATE Types.  All value types
+ * must fit in a uint32.
+ */
+
+typedef enum {
+   SVGA3D_TS_INVALID                    = 0,
+   SVGA3D_TS_BIND_TEXTURE               = 1,    /* SVGA3dSurfaceId */
+   SVGA3D_TS_COLOROP                    = 2,    /* SVGA3dTextureCombiner */
+   SVGA3D_TS_COLORARG1                  = 3,    /* SVGA3dTextureArgData */
+   SVGA3D_TS_COLORARG2                  = 4,    /* SVGA3dTextureArgData */
+   SVGA3D_TS_ALPHAOP                    = 5,    /* SVGA3dTextureCombiner */
+   SVGA3D_TS_ALPHAARG1                  = 6,    /* SVGA3dTextureArgData */
+   SVGA3D_TS_ALPHAARG2                  = 7,    /* SVGA3dTextureArgData */
+   SVGA3D_TS_ADDRESSU                   = 8,    /* SVGA3dTextureAddress */
+   SVGA3D_TS_ADDRESSV                   = 9,    /* SVGA3dTextureAddress */
+   SVGA3D_TS_MIPFILTER                  = 10,   /* SVGA3dTextureFilter */
+   SVGA3D_TS_MAGFILTER                  = 11,   /* SVGA3dTextureFilter */
+   SVGA3D_TS_MINFILTER                  = 12,   /* SVGA3dTextureFilter */
+   SVGA3D_TS_BORDERCOLOR                = 13,   /* SVGA3dColor */
+   SVGA3D_TS_TEXCOORDINDEX              = 14,   /* uint32 */
+   SVGA3D_TS_TEXTURETRANSFORMFLAGS      = 15,   /* SVGA3dTexTransformFlags */
+   SVGA3D_TS_TEXCOORDGEN                = 16,   /* SVGA3dTextureCoordGen */
+   SVGA3D_TS_BUMPENVMAT00               = 17,   /* float */
+   SVGA3D_TS_BUMPENVMAT01               = 18,   /* float */
+   SVGA3D_TS_BUMPENVMAT10               = 19,   /* float */
+   SVGA3D_TS_BUMPENVMAT11               = 20,   /* float */
+   SVGA3D_TS_TEXTURE_MIPMAP_LEVEL       = 21,   /* uint32 */
+   SVGA3D_TS_TEXTURE_LOD_BIAS           = 22,   /* float */
+   SVGA3D_TS_TEXTURE_ANISOTROPIC_LEVEL  = 23,   /* uint32 */
+   SVGA3D_TS_ADDRESSW                   = 24,   /* SVGA3dTextureAddress */
+
+
+   /*
+    * Sampler Gamma Level
+    *
+    * Sampler gamma affects the color of samples taken from the sampler.  A
+    * value of 1.0 will produce linear samples.  If the value is <= 0.0 the
+    * gamma value is ignored and a linear space is used.
+    */
+
+   SVGA3D_TS_GAMMA                      = 25,   /* float */
+   SVGA3D_TS_BUMPENVLSCALE              = 26,   /* float */
+   SVGA3D_TS_BUMPENVLOFFSET             = 27,   /* float */
+   SVGA3D_TS_COLORARG0                  = 28,   /* SVGA3dTextureArgData */
+   SVGA3D_TS_ALPHAARG0                  = 29,   /* SVGA3dTextureArgData */
+   SVGA3D_TS_MAX
+} SVGA3dTextureStateName;
+
+typedef enum {
+   SVGA3D_TC_INVALID                   = 0,
+   SVGA3D_TC_DISABLE                   = 1,
+   SVGA3D_TC_SELECTARG1                = 2,
+   SVGA3D_TC_SELECTARG2                = 3,
+   SVGA3D_TC_MODULATE                  = 4,
+   SVGA3D_TC_ADD                       = 5,
+   SVGA3D_TC_ADDSIGNED                 = 6,
+   SVGA3D_TC_SUBTRACT                  = 7,
+   SVGA3D_TC_BLENDTEXTUREALPHA         = 8,
+   SVGA3D_TC_BLENDDIFFUSEALPHA         = 9,
+   SVGA3D_TC_BLENDCURRENTALPHA         = 10,
+   SVGA3D_TC_BLENDFACTORALPHA          = 11,
+   SVGA3D_TC_MODULATE2X                = 12,
+   SVGA3D_TC_MODULATE4X                = 13,
+   SVGA3D_TC_DSDT                      = 14,
+   SVGA3D_TC_DOTPRODUCT3               = 15,
+   SVGA3D_TC_BLENDTEXTUREALPHAPM       = 16,
+   SVGA3D_TC_ADDSIGNED2X               = 17,
+   SVGA3D_TC_ADDSMOOTH                 = 18,
+   SVGA3D_TC_PREMODULATE               = 19,
+   SVGA3D_TC_MODULATEALPHA_ADDCOLOR    = 20,
+   SVGA3D_TC_MODULATECOLOR_ADDALPHA    = 21,
+   SVGA3D_TC_MODULATEINVALPHA_ADDCOLOR = 22,
+   SVGA3D_TC_MODULATEINVCOLOR_ADDALPHA = 23,
+   SVGA3D_TC_BUMPENVMAPLUMINANCE       = 24,
+   SVGA3D_TC_MULTIPLYADD               = 25,
+   SVGA3D_TC_LERP                      = 26,
+   SVGA3D_TC_MAX
+} SVGA3dTextureCombiner;
+/* Capability mask bit for a texture combiner op: bit (op - 1), or 0 when op is 0 (SVGA3D_TC_INVALID). */
+#define SVGA3D_TC_CAP_BIT(svga3d_tc_op) ((svga3d_tc_op) ? (1 << ((svga3d_tc_op) - 1)) : 0)
+
+typedef enum {
+   SVGA3D_TEX_ADDRESS_INVALID    = 0,
+   SVGA3D_TEX_ADDRESS_WRAP       = 1,
+   SVGA3D_TEX_ADDRESS_MIRROR     = 2,
+   SVGA3D_TEX_ADDRESS_CLAMP      = 3,
+   SVGA3D_TEX_ADDRESS_BORDER     = 4,
+   SVGA3D_TEX_ADDRESS_MIRRORONCE = 5,
+   SVGA3D_TEX_ADDRESS_EDGE       = 6,
+   SVGA3D_TEX_ADDRESS_MAX
+} SVGA3dTextureAddress;
+
+/*
+ * SVGA3D_TEX_FILTER_NONE as the minification filter means mipmapping is
+ * disabled, and the rasterizer should use the magnification filter instead.
+ */
+typedef enum {
+   SVGA3D_TEX_FILTER_NONE           = 0,
+   SVGA3D_TEX_FILTER_NEAREST        = 1,
+   SVGA3D_TEX_FILTER_LINEAR         = 2,
+   SVGA3D_TEX_FILTER_ANISOTROPIC    = 3,
+   SVGA3D_TEX_FILTER_FLATCUBIC      = 4, // Deprecated, not implemented
+   SVGA3D_TEX_FILTER_GAUSSIANCUBIC  = 5, // Deprecated, not implemented
+   SVGA3D_TEX_FILTER_PYRAMIDALQUAD  = 6, // Not currently implemented
+   SVGA3D_TEX_FILTER_GAUSSIANQUAD   = 7, // Not currently implemented
+   SVGA3D_TEX_FILTER_MAX
+} SVGA3dTextureFilter;
+
+typedef enum {
+   SVGA3D_TEX_TRANSFORM_OFF    = 0,
+   SVGA3D_TEX_TRANSFORM_S      = (1 << 0),
+   SVGA3D_TEX_TRANSFORM_T      = (1 << 1),
+   SVGA3D_TEX_TRANSFORM_R      = (1 << 2),
+   SVGA3D_TEX_TRANSFORM_Q      = (1 << 3),
+   SVGA3D_TEX_PROJECTED        = (1 << 15),
+} SVGA3dTexTransformFlags;
+
+typedef enum {
+   SVGA3D_TEXCOORD_GEN_OFF              = 0,
+   SVGA3D_TEXCOORD_GEN_EYE_POSITION     = 1,
+   SVGA3D_TEXCOORD_GEN_EYE_NORMAL       = 2,
+   SVGA3D_TEXCOORD_GEN_REFLECTIONVECTOR = 3,
+   SVGA3D_TEXCOORD_GEN_SPHERE           = 4,
+   SVGA3D_TEXCOORD_GEN_MAX
+} SVGA3dTextureCoordGen;
+
+/*
+ * Texture argument constants for texture combiner
+ */
+typedef enum {
+   SVGA3D_TA_INVALID    = 0,
+   SVGA3D_TA_CONSTANT   = 1,
+   SVGA3D_TA_PREVIOUS   = 2,
+   SVGA3D_TA_DIFFUSE    = 3,
+   SVGA3D_TA_TEXTURE    = 4,
+   SVGA3D_TA_SPECULAR   = 5,
+   SVGA3D_TA_MAX
+} SVGA3dTextureArgData;
+
+#define SVGA3D_TM_MASK_LEN 4
+
+/* Modifiers for texture argument constants defined above. */
+typedef enum {
+   SVGA3D_TM_NONE       = 0,
+   SVGA3D_TM_ALPHA      = (1 << SVGA3D_TM_MASK_LEN),
+   SVGA3D_TM_ONE_MINUS  = (2 << SVGA3D_TM_MASK_LEN),
+} SVGA3dTextureArgModifier;
+
+#define SVGA3D_INVALID_ID         ((uint32)-1)
+#define SVGA3D_MAX_CLIP_PLANES    6
+
+/*
+ * This is the limit to the number of fixed-function texture
+ * transforms and texture coordinates we can support. It does *not*
+ * correspond to the number of texture image units (samplers) we
+ * support!
+ */
+#define SVGA3D_MAX_TEXTURE_COORDS 8
+
+/*
+ * Vertex declarations
+ *
+ * Notes:
+ *
+ * SVGA3D_DECLUSAGE_POSITIONT is for pre-transformed vertices. If you
+ * draw with any POSITIONT vertex arrays, the programmable vertex
+ * pipeline will be implicitly disabled. Drawing will take place as if
+ * no vertex shader was bound.
+ */
+
+typedef enum {
+   SVGA3D_DECLUSAGE_POSITION     = 0,
+   SVGA3D_DECLUSAGE_BLENDWEIGHT,       //  1
+   SVGA3D_DECLUSAGE_BLENDINDICES,      //  2
+   SVGA3D_DECLUSAGE_NORMAL,            //  3
+   SVGA3D_DECLUSAGE_PSIZE,             //  4
+   SVGA3D_DECLUSAGE_TEXCOORD,          //  5
+   SVGA3D_DECLUSAGE_TANGENT,           //  6
+   SVGA3D_DECLUSAGE_BINORMAL,          //  7
+   SVGA3D_DECLUSAGE_TESSFACTOR,        //  8
+   SVGA3D_DECLUSAGE_POSITIONT,         //  9
+   SVGA3D_DECLUSAGE_COLOR,             // 10
+   SVGA3D_DECLUSAGE_FOG,               // 11
+   SVGA3D_DECLUSAGE_DEPTH,             // 12
+   SVGA3D_DECLUSAGE_SAMPLE,            // 13
+   SVGA3D_DECLUSAGE_MAX
+} SVGA3dDeclUsage;
+
+typedef enum {
+   SVGA3D_DECLMETHOD_DEFAULT     = 0,
+   SVGA3D_DECLMETHOD_PARTIALU,
+   SVGA3D_DECLMETHOD_PARTIALV,
+   SVGA3D_DECLMETHOD_CROSSUV,          // Normal
+   SVGA3D_DECLMETHOD_UV,
+   SVGA3D_DECLMETHOD_LOOKUP,           // Lookup a displacement map
+   SVGA3D_DECLMETHOD_LOOKUPPRESAMPLED, // Lookup a pre-sampled displacement map
+} SVGA3dDeclMethod;
+
+typedef enum {
+   SVGA3D_DECLTYPE_FLOAT1        =  0,
+   SVGA3D_DECLTYPE_FLOAT2        =  1,
+   SVGA3D_DECLTYPE_FLOAT3        =  2,
+   SVGA3D_DECLTYPE_FLOAT4        =  3,
+   SVGA3D_DECLTYPE_D3DCOLOR      =  4,
+   SVGA3D_DECLTYPE_UBYTE4        =  5,
+   SVGA3D_DECLTYPE_SHORT2        =  6,
+   SVGA3D_DECLTYPE_SHORT4        =  7,
+   SVGA3D_DECLTYPE_UBYTE4N       =  8,
+   SVGA3D_DECLTYPE_SHORT2N       =  9,
+   SVGA3D_DECLTYPE_SHORT4N       = 10,
+   SVGA3D_DECLTYPE_USHORT2N      = 11,
+   SVGA3D_DECLTYPE_USHORT4N      = 12,
+   SVGA3D_DECLTYPE_UDEC3         = 13,
+   SVGA3D_DECLTYPE_DEC3N         = 14,
+   SVGA3D_DECLTYPE_FLOAT16_2     = 15,
+   SVGA3D_DECLTYPE_FLOAT16_4     = 16,
+   SVGA3D_DECLTYPE_MAX,
+} SVGA3dDeclType;
+
+/*
+ * This structure is used for the divisor for geometry instancing;
+ * it's a direct translation of the Direct3D equivalent.
+ */
+typedef union {
+   struct {
+      /*
+       * For index data, this number represents the number of instances to draw.
+       * For instance data, this number represents the number of
+       * instances/vertex in this stream
+       */
+      uint32 count : 30;
+
+      /*
+       * This is 1 if this is supposed to be the data that is repeated for
+       * every instance.
+       */
+      uint32 indexedData : 1;
+
+      /*
+       * This is 1 if this is supposed to be the per-instance data.
+       */
+      uint32 instanceData : 1;
+   };
+
+   uint32 value;
+} SVGA3dVertexDivisor;
+
+typedef enum {
+   SVGA3D_PRIMITIVE_INVALID                     = 0,
+   SVGA3D_PRIMITIVE_TRIANGLELIST                = 1,
+   SVGA3D_PRIMITIVE_POINTLIST                   = 2,
+   SVGA3D_PRIMITIVE_LINELIST                    = 3,
+   SVGA3D_PRIMITIVE_LINESTRIP                   = 4,
+   SVGA3D_PRIMITIVE_TRIANGLESTRIP               = 5,
+   SVGA3D_PRIMITIVE_TRIANGLEFAN                 = 6,
+   SVGA3D_PRIMITIVE_MAX
+} SVGA3dPrimitiveType;
+
+typedef enum {
+   SVGA3D_COORDINATE_INVALID                   = 0,
+   SVGA3D_COORDINATE_LEFTHANDED                = 1,
+   SVGA3D_COORDINATE_RIGHTHANDED               = 2,
+   SVGA3D_COORDINATE_MAX
+} SVGA3dCoordinateType;
+
+typedef enum {
+   SVGA3D_TRANSFORM_INVALID                     = 0,
+   SVGA3D_TRANSFORM_WORLD                       = 1,
+   SVGA3D_TRANSFORM_VIEW                        = 2,
+   SVGA3D_TRANSFORM_PROJECTION                  = 3,
+   SVGA3D_TRANSFORM_TEXTURE0                    = 4,
+   SVGA3D_TRANSFORM_TEXTURE1                    = 5,
+   SVGA3D_TRANSFORM_TEXTURE2                    = 6,
+   SVGA3D_TRANSFORM_TEXTURE3                    = 7,
+   SVGA3D_TRANSFORM_TEXTURE4                    = 8,
+   SVGA3D_TRANSFORM_TEXTURE5                    = 9,
+   SVGA3D_TRANSFORM_TEXTURE6                    = 10,
+   SVGA3D_TRANSFORM_TEXTURE7                    = 11,
+   SVGA3D_TRANSFORM_WORLD1                      = 12,
+   SVGA3D_TRANSFORM_WORLD2                      = 13,
+   SVGA3D_TRANSFORM_WORLD3                      = 14,
+   SVGA3D_TRANSFORM_MAX
+} SVGA3dTransformType;
+
+typedef enum {
+   SVGA3D_LIGHTTYPE_INVALID                     = 0,
+   SVGA3D_LIGHTTYPE_POINT                       = 1,
+   SVGA3D_LIGHTTYPE_SPOT1                       = 2, /* 1-cone, in degrees */
+   SVGA3D_LIGHTTYPE_SPOT2                       = 3, /* 2-cone, in radians */
+   SVGA3D_LIGHTTYPE_DIRECTIONAL                 = 4,
+   SVGA3D_LIGHTTYPE_MAX
+} SVGA3dLightType;
+
+typedef enum {
+   SVGA3D_CUBEFACE_POSX                         = 0,
+   SVGA3D_CUBEFACE_NEGX                         = 1,
+   SVGA3D_CUBEFACE_POSY                         = 2,
+   SVGA3D_CUBEFACE_NEGY                         = 3,
+   SVGA3D_CUBEFACE_POSZ                         = 4,
+   SVGA3D_CUBEFACE_NEGZ                         = 5,
+} SVGA3dCubeFace;
+
+typedef enum {
+   SVGA3D_SHADERTYPE_COMPILED_DX8               = 0,
+   SVGA3D_SHADERTYPE_VS                         = 1,
+   SVGA3D_SHADERTYPE_PS                         = 2,
+   SVGA3D_SHADERTYPE_MAX
+} SVGA3dShaderType;
+
+typedef enum {
+   SVGA3D_CONST_TYPE_FLOAT                      = 0,
+   SVGA3D_CONST_TYPE_INT                        = 1,
+   SVGA3D_CONST_TYPE_BOOL                       = 2,
+} SVGA3dShaderConstType;
+
+#define SVGA3D_MAX_SURFACE_FACES                6
+
+typedef enum {
+   SVGA3D_STRETCH_BLT_POINT                     = 0,
+   SVGA3D_STRETCH_BLT_LINEAR                    = 1,
+   SVGA3D_STRETCH_BLT_MAX
+} SVGA3dStretchBltMode;
+
+typedef enum {
+   SVGA3D_QUERYTYPE_OCCLUSION                   = 0,
+   SVGA3D_QUERYTYPE_MAX
+} SVGA3dQueryType;
+
+typedef enum {
+   SVGA3D_QUERYSTATE_PENDING     = 0,      /* Waiting on the host (set by guest) */
+   SVGA3D_QUERYSTATE_SUCCEEDED   = 1,      /* Completed successfully (set by host) */
+   SVGA3D_QUERYSTATE_FAILED      = 2,      /* Completed unsuccessfully (set by host) */
+   SVGA3D_QUERYSTATE_NEW         = 3,      /* Never submitted (For guest use only) */
+} SVGA3dQueryState;
+
+typedef enum {
+   SVGA3D_WRITE_HOST_VRAM        = 1,
+   SVGA3D_READ_HOST_VRAM         = 2,
+} SVGA3dTransferType;
+
+/*
+ * The maximum number of vertex arrays we're guaranteed to support in
+ * SVGA_3D_CMD_DRAW_PRIMITIVES.
+ */
+#define SVGA3D_MAX_VERTEX_ARRAYS   32
+
+/*
+ * Identifiers for commands in the command FIFO.
+ *
+ * IDs between 1000 and 1039 (inclusive) were used by obsolete versions of
+ * the SVGA3D protocol and remain reserved; they should not be used in the
+ * future.
+ *
+ * IDs between 1040 and 1999 (inclusive) are available for use by the
+ * current SVGA3D protocol.
+ *
+ * FIFO clients other than SVGA3D should stay below 1000, or at 2000
+ * and up.
+ */
+
+#define SVGA_3D_CMD_LEGACY_BASE            1000
+#define SVGA_3D_CMD_BASE                   1040
+
+#define SVGA_3D_CMD_SURFACE_DEFINE         (SVGA_3D_CMD_BASE + 0)
+#define SVGA_3D_CMD_SURFACE_DESTROY        (SVGA_3D_CMD_BASE + 1)
+#define SVGA_3D_CMD_SURFACE_COPY           (SVGA_3D_CMD_BASE + 2)
+#define SVGA_3D_CMD_SURFACE_STRETCHBLT     (SVGA_3D_CMD_BASE + 3)
+#define SVGA_3D_CMD_SURFACE_DMA            (SVGA_3D_CMD_BASE + 4)
+#define SVGA_3D_CMD_CONTEXT_DEFINE         (SVGA_3D_CMD_BASE + 5)
+#define SVGA_3D_CMD_CONTEXT_DESTROY        (SVGA_3D_CMD_BASE + 6)
+#define SVGA_3D_CMD_SETTRANSFORM           (SVGA_3D_CMD_BASE + 7)
+#define SVGA_3D_CMD_SETZRANGE              (SVGA_3D_CMD_BASE + 8)
+#define SVGA_3D_CMD_SETRENDERSTATE         (SVGA_3D_CMD_BASE + 9)
+#define SVGA_3D_CMD_SETRENDERTARGET        (SVGA_3D_CMD_BASE + 10)
+#define SVGA_3D_CMD_SETTEXTURESTATE        (SVGA_3D_CMD_BASE + 11)
+#define SVGA_3D_CMD_SETMATERIAL            (SVGA_3D_CMD_BASE + 12)
+#define SVGA_3D_CMD_SETLIGHTDATA           (SVGA_3D_CMD_BASE + 13)
+#define SVGA_3D_CMD_SETLIGHTENABLED        (SVGA_3D_CMD_BASE + 14)
+#define SVGA_3D_CMD_SETVIEWPORT            (SVGA_3D_CMD_BASE + 15)
+#define SVGA_3D_CMD_SETCLIPPLANE           (SVGA_3D_CMD_BASE + 16)
+#define SVGA_3D_CMD_CLEAR                  (SVGA_3D_CMD_BASE + 17)
+#define SVGA_3D_CMD_PRESENT                (SVGA_3D_CMD_BASE + 18)  // Deprecated
+#define SVGA_3D_CMD_SHADER_DEFINE          (SVGA_3D_CMD_BASE + 19)
+#define SVGA_3D_CMD_SHADER_DESTROY         (SVGA_3D_CMD_BASE + 20)
+#define SVGA_3D_CMD_SET_SHADER             (SVGA_3D_CMD_BASE + 21)
+#define SVGA_3D_CMD_SET_SHADER_CONST       (SVGA_3D_CMD_BASE + 22)
+#define SVGA_3D_CMD_DRAW_PRIMITIVES        (SVGA_3D_CMD_BASE + 23)
+#define SVGA_3D_CMD_SETSCISSORRECT         (SVGA_3D_CMD_BASE + 24)
+#define SVGA_3D_CMD_BEGIN_QUERY            (SVGA_3D_CMD_BASE + 25)
+#define SVGA_3D_CMD_END_QUERY              (SVGA_3D_CMD_BASE + 26)
+#define SVGA_3D_CMD_WAIT_FOR_QUERY         (SVGA_3D_CMD_BASE + 27)
+#define SVGA_3D_CMD_PRESENT_READBACK       (SVGA_3D_CMD_BASE + 28)  // Deprecated
+#define SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN (SVGA_3D_CMD_BASE + 29)
+#define SVGA_3D_CMD_MAX                    (SVGA_3D_CMD_BASE + 30)
+
+#define SVGA_3D_CMD_FUTURE_MAX             2000
+
+/*
+ * Common substructures used in multiple FIFO commands:
+ */
+
+typedef struct {
+   union {
+      struct {
+         uint16  function;       // SVGA3dFogFunction
+         uint8   type;           // SVGA3dFogType
+         uint8   base;           // SVGA3dFogBase
+      };
+      uint32     uintValue;
+   };
+} SVGA3dFogMode;
+
+/*
+ * Uniquely identify one image (a 1D/2D/3D array) from a surface. This
+ * is a surface ID as well as face/mipmap indices.
+ */
+
+typedef
+struct SVGA3dSurfaceImageId {
+   uint32               sid;
+   uint32               face;
+   uint32               mipmap;
+} SVGA3dSurfaceImageId;
+
+typedef
+struct SVGA3dGuestImage {
+   SVGAGuestPtr         ptr;
+
+   /*
+    * A note on interpretation of pitch: This value of pitch is the
+    * number of bytes between vertically adjacent image
+    * blocks. Normally this is the number of bytes between the first
+    * pixel of two adjacent scanlines. With compressed textures,
+    * however, this may represent the number of bytes between
+    * compression blocks rather than between rows of pixels.
+    *
+    * XXX: Compressed textures currently must be tightly packed in guest memory.
+    *
+    * If the image is 1-dimensional, pitch is ignored.
+    *
+    * If 'pitch' is zero, the SVGA3D device calculates a pitch value
+    * assuming each row of blocks is tightly packed.
+    */
+   uint32 pitch;
+} SVGA3dGuestImage;
+
+
+/*
+ * FIFO command format definitions:
+ */
+
+/*
+ * The data size header following cmdNum for every 3d command
+ */
+typedef
+struct {
+   uint32               id;
+   uint32               size;
+} SVGA3dCmdHeader;
+
+/*
+ * A surface is a hierarchy of host VRAM surfaces: 1D, 2D, or 3D, with
+ * optional mipmaps and cube faces.
+ */
+
+typedef
+struct {
+   uint32               width;
+   uint32               height;
+   uint32               depth;
+} SVGA3dSize;
+
+typedef enum {
+   SVGA3D_SURFACE_CUBEMAP              = (1 << 0),
+   SVGA3D_SURFACE_HINT_STATIC          = (1 << 1),
+   SVGA3D_SURFACE_HINT_DYNAMIC         = (1 << 2),
+   SVGA3D_SURFACE_HINT_INDEXBUFFER     = (1 << 3),
+   SVGA3D_SURFACE_HINT_VERTEXBUFFER    = (1 << 4),
+   SVGA3D_SURFACE_HINT_TEXTURE         = (1 << 5),
+   SVGA3D_SURFACE_HINT_RENDERTARGET    = (1 << 6),
+   SVGA3D_SURFACE_HINT_DEPTHSTENCIL    = (1 << 7),
+   SVGA3D_SURFACE_HINT_WRITEONLY       = (1 << 8),
+} SVGA3dSurfaceFlags;
+
+typedef
+struct {
+   uint32               numMipLevels;
+} SVGA3dSurfaceFace;
+
+typedef
+struct {
+   uint32                      sid;
+   SVGA3dSurfaceFlags          surfaceFlags;
+   SVGA3dSurfaceFormat         format;
+   SVGA3dSurfaceFace           face[SVGA3D_MAX_SURFACE_FACES];
+   /*
+    * Followed by an SVGA3dSize structure for each mip level in each face.
+    *
+    * A note on surface sizes: Sizes are always specified in pixels,
+    * even if the true surface size is not a multiple of the minimum
+    * block size of the surface's format. For example, a 3x3x1 DXT1
+    * compressed texture would actually be stored as a 4x4x1 image in
+    * memory.
+    */
+} SVGA3dCmdDefineSurface;       /* SVGA_3D_CMD_SURFACE_DEFINE */
+
+typedef
+struct {
+   uint32               sid;
+} SVGA3dCmdDestroySurface;      /* SVGA_3D_CMD_SURFACE_DESTROY */
+
+typedef
+struct {
+   uint32               cid;
+} SVGA3dCmdDefineContext;       /* SVGA_3D_CMD_CONTEXT_DEFINE */
+
+typedef
+struct {
+   uint32               cid;
+} SVGA3dCmdDestroyContext;      /* SVGA_3D_CMD_CONTEXT_DESTROY */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dClearFlag      clearFlag;
+   uint32               color;
+   float                depth;
+   uint32               stencil;
+   /* Followed by variable number of SVGA3dRect structures */
+} SVGA3dCmdClear;               /* SVGA_3D_CMD_CLEAR */
+
+typedef
+struct SVGA3dCopyRect {
+   uint32               x;
+   uint32               y;
+   uint32               w;
+   uint32               h;
+   uint32               srcx;
+   uint32               srcy;
+} SVGA3dCopyRect;
+
+typedef
+struct SVGA3dCopyBox {
+   uint32               x;
+   uint32               y;
+   uint32               z;
+   uint32               w;
+   uint32               h;
+   uint32               d;
+   uint32               srcx;
+   uint32               srcy;
+   uint32               srcz;
+} SVGA3dCopyBox;
+
+typedef
+struct {
+   uint32               x;
+   uint32               y;
+   uint32               w;
+   uint32               h;
+} SVGA3dRect;
+
+typedef
+struct {
+   uint32               x;
+   uint32               y;
+   uint32               z;
+   uint32               w;
+   uint32               h;
+   uint32               d;
+} SVGA3dBox;
+
+typedef
+struct {
+   uint32               x;
+   uint32               y;
+   uint32               z;
+} SVGA3dPoint;
+
+typedef
+struct {
+   SVGA3dLightType      type;
+   SVGA3dBool           inWorldSpace;
+   float                diffuse[4];
+   float                specular[4];
+   float                ambient[4];
+   float                position[4];
+   float                direction[4];
+   float                range;
+   float                falloff;
+   float                attenuation0;
+   float                attenuation1;
+   float                attenuation2;
+   float                theta;
+   float                phi;
+} SVGA3dLightData;
+
+typedef
+struct {
+   uint32               sid;
+   /* Followed by variable number of SVGA3dCopyRect structures */
+} SVGA3dCmdPresent;             /* SVGA_3D_CMD_PRESENT */
+
+typedef
+struct {
+   SVGA3dRenderStateName   state;
+   union {
+      uint32               uintValue;
+      float                floatValue;
+   };
+} SVGA3dRenderState;
+
+typedef
+struct {
+   uint32               cid;
+   /* Followed by variable number of SVGA3dRenderState structures */
+} SVGA3dCmdSetRenderState;      /* SVGA_3D_CMD_SETRENDERSTATE */
+
+typedef
+struct {
+   uint32                 cid;
+   SVGA3dRenderTargetType type;
+   SVGA3dSurfaceImageId   target;
+} SVGA3dCmdSetRenderTarget;     /* SVGA_3D_CMD_SETRENDERTARGET */
+
+typedef
+struct {
+   SVGA3dSurfaceImageId  src;
+   SVGA3dSurfaceImageId  dest;
+   /* Followed by variable number of SVGA3dCopyBox structures */
+} SVGA3dCmdSurfaceCopy;               /* SVGA_3D_CMD_SURFACE_COPY */
+
+typedef
+struct {
+   SVGA3dSurfaceImageId  src;
+   SVGA3dSurfaceImageId  dest;
+   SVGA3dBox             boxSrc;
+   SVGA3dBox             boxDest;
+   SVGA3dStretchBltMode  mode;
+} SVGA3dCmdSurfaceStretchBlt;         /* SVGA_3D_CMD_SURFACE_STRETCHBLT */
+
+typedef
+struct {
+   /*
+    * If the discard flag is present in a surface DMA operation, the host may
+    * discard the contents of the current mipmap level and face of the target
+    * surface before applying the surface DMA contents.
+    */
+   uint32 discard : 1;
+
+   /*
+    * If the unsynchronized flag is present, the host may perform this upload
+    * without syncing to pending reads on this surface.
+    */
+   uint32 unsynchronized : 1;
+
+   /*
+    * Guests *MUST* set the reserved bits to 0 before submitting the command
+    * suffix as future flags may occupy these bits.
+    */
+   uint32 reserved : 30;
+} SVGA3dSurfaceDMAFlags;
+
+typedef
+struct {
+   SVGA3dGuestImage      guest;
+   SVGA3dSurfaceImageId  host;
+   SVGA3dTransferType    transfer;
+   /*
+    * Followed by variable number of SVGA3dCopyBox structures. For consistency
+    * in all clipping logic and coordinate translation, we define the
+    * "source" in each copyBox as the guest image and the
+    * "destination" as the host image, regardless of transfer
+    * direction.
+    *
+    * For efficiency, the SVGA3D device is free to copy more data than
+    * specified. For example, it may round copy boxes outwards such
+    * that they lie on particular alignment boundaries.
+    */
+} SVGA3dCmdSurfaceDMA;                /* SVGA_3D_CMD_SURFACE_DMA */
+
+/*
+ * SVGA3dCmdSurfaceDMASuffix --
+ *
+ *    This is a command suffix that will appear after a SurfaceDMA command in
+ *    the FIFO.  It contains some extra information that hosts may use to
+ *    optimize performance or protect the guest.  This suffix exists to preserve
+ *    backwards compatibility while also allowing for new functionality to be
+ *    implemented.
+ */
+
+typedef
+struct {
+   uint32 suffixSize;
+
+   /*
+    * The maximum offset is used to determine the maximum offset from the
+    * guestPtr base address that will be accessed or written to during this
+    * surfaceDMA.  If the suffix is supported, the host will respect this
+    * boundary while performing surface DMAs.
+    *
+    * Defaults to MAX_UINT32
+    */
+   uint32 maximumOffset;
+
+   /*
+    * A set of flags that describes optimizations that the host may perform
+    * while performing this surface DMA operation.  The guest should never rely
+    * on behaviour that is different when these flags are set for correctness.
+    *
+    * Defaults to 0
+    */
+   SVGA3dSurfaceDMAFlags flags;
+} SVGA3dCmdSurfaceDMASuffix;
+
+/*
+ * SVGA_3D_CMD_DRAW_PRIMITIVES --
+ *
+ *   This command is the SVGA3D device's generic drawing entry point.
+ *   It can draw multiple ranges of primitives, optionally using an
+ *   index buffer, using an arbitrary collection of vertex buffers.
+ *
+ *   Each SVGA3dVertexDecl defines a distinct vertex array to bind
+ *   during this draw call. The declarations specify which surface
+ *   the vertex data lives in, what that vertex data is used for,
+ *   and how to interpret it.
+ *
+ *   Each SVGA3dPrimitiveRange defines a collection of primitives
+ *   to render using the same vertex arrays. An index buffer is
+ *   optional.
+ */
+
+typedef
+struct {
+   /*
+    * A range hint is an optional specification for the range of indices
+    * in an SVGA3dArray that will be used. If 'last' is zero, it is assumed
+    * that the entire array will be used.
+    *
+    * These are only hints. The SVGA3D device may use them for
+    * performance optimization if possible, but it's also allowed to
+    * ignore these values.
+    */
+   uint32               first;
+   uint32               last;
+} SVGA3dArrayRangeHint;
+
+typedef
+struct {
+   /*
+    * Define the origin and shape of a vertex or index array. Both
+    * 'offset' and 'stride' are in bytes. The provided surface will be
+    * reinterpreted as a flat array of bytes in the same format used
+    * by surface DMA operations. To avoid unnecessary conversions, the
+    * surface should be created with the SVGA3D_BUFFER format.
+    *
+    * Index 0 in the array starts 'offset' bytes into the surface.
+    * Index 1 begins at byte 'offset + stride', etc. Array indices may
+    * not be negative.
+    */
+   uint32               surfaceId;
+   uint32               offset;
+   uint32               stride;
+} SVGA3dArray;
+
+typedef
+struct {
+   /*
+    * Describe a vertex array's data type, and define how it is to be
+    * used by the fixed function pipeline or the vertex shader. It
+    * isn't useful to have two VertexDecls with the same
+    * VertexArrayIdentity in one draw call.
+    */
+   SVGA3dDeclType       type;
+   SVGA3dDeclMethod     method;
+   SVGA3dDeclUsage      usage;
+   uint32               usageIndex;
+} SVGA3dVertexArrayIdentity;
+
+typedef
+struct {
+   SVGA3dVertexArrayIdentity  identity;
+   SVGA3dArray                array;
+   SVGA3dArrayRangeHint       rangeHint;
+} SVGA3dVertexDecl;
+
+typedef
+struct {
+   /*
+    * Define a group of primitives to render, from sequential indices.
+    *
+    * The values of 'primType' and 'primitiveCount' imply the
+    * total number of vertices that will be rendered.
+    */
+   SVGA3dPrimitiveType  primType;
+   uint32               primitiveCount;
+
+   /*
+    * Optional index buffer. If indexArray.surfaceId is
+    * SVGA3D_INVALID_ID, we render without an index buffer. Rendering
+    * without an index buffer is identical to rendering with an index
+    * buffer containing the sequence [0, 1, 2, 3, ...].
+    *
+    * If an index buffer is in use, indexWidth specifies the width in
+    * bytes of each index value. It must be less than or equal to
+    * indexArray.stride.
+    *
+    * (Currently, the SVGA3D device requires index buffers to be tightly
+    * packed. In other words, indexWidth == indexArray.stride)
+    */
+   SVGA3dArray          indexArray;
+   uint32               indexWidth;
+
+   /*
+    * Optional index bias. This number is added to all indices from
+    * indexArray before they are used as vertex array indices. This
+    * can be used in multiple ways:
+    *
+    *  - When not using an indexArray, this bias can be used to
+    *    specify where in the vertex arrays to begin rendering.
+    *
+    *  - A positive number here is equivalent to increasing the
+    *    offset in each vertex array.
+    *
+    *  - A negative number can be used to render using a small
+    *    vertex array and an index buffer that contains large
+    *    values. This may be used by some applications that
+    *    crop a vertex buffer without modifying their index
+    *    buffer.
+    *
+    * Note that rendering with a negative bias value may be slower and
+    * use more memory than rendering with a positive or zero bias.
+    */
+   int32                indexBias;
+} SVGA3dPrimitiveRange;
+
+typedef
+struct {
+   uint32               cid;
+   uint32               numVertexDecls;
+   uint32               numRanges;
+
+   /*
+    * There are up to three variable size arrays after the
+    * SVGA3dCmdDrawPrimitives structure. In order,
+    * they are:
+    *
+    * 1. SVGA3dVertexDecl, quantity 'numVertexDecls'
+    * 2. SVGA3dPrimitiveRange, quantity 'numRanges'
+    * 3. Optionally, SVGA3dVertexDivisor, quantity 'numVertexDecls' (contains
+    *    the frequency divisor for the corresponding vertex decl)
+    */
+} SVGA3dCmdDrawPrimitives;      /* SVGA_3D_CMD_DRAW_PRIMITIVES */
+
+typedef
+struct {
+   uint32                   stage;
+   SVGA3dTextureStateName   name;
+   union {
+      uint32                value;
+      float                 floatValue;
+   };
+} SVGA3dTextureState;
+
+typedef
+struct {
+   uint32               cid;
+   /* Followed by variable number of SVGA3dTextureState structures */
+} SVGA3dCmdSetTextureState;      /* SVGA_3D_CMD_SETTEXTURESTATE */
+
+typedef
+struct {
+   uint32                   cid;
+   SVGA3dTransformType      type;
+   float                    matrix[16];
+} SVGA3dCmdSetTransform;          /* SVGA_3D_CMD_SETTRANSFORM */
+
+typedef
+struct {
+   float                min;
+   float                max;
+} SVGA3dZRange;
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dZRange         zRange;
+} SVGA3dCmdSetZRange;             /* SVGA_3D_CMD_SETZRANGE */
+
+typedef
+struct {
+   float                diffuse[4];
+   float                ambient[4];
+   float                specular[4];
+   float                emissive[4];
+   float                shininess;
+} SVGA3dMaterial;
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dFace           face;
+   SVGA3dMaterial       material;
+} SVGA3dCmdSetMaterial;           /* SVGA_3D_CMD_SETMATERIAL */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               index;
+   SVGA3dLightData      data;
+} SVGA3dCmdSetLightData;           /* SVGA_3D_CMD_SETLIGHTDATA */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               index;
+   uint32               enabled;
+} SVGA3dCmdSetLightEnabled;      /* SVGA_3D_CMD_SETLIGHTENABLED */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dRect           rect;
+} SVGA3dCmdSetViewport;           /* SVGA_3D_CMD_SETVIEWPORT */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dRect           rect;
+} SVGA3dCmdSetScissorRect;         /* SVGA_3D_CMD_SETSCISSORRECT */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               index;
+   float                plane[4];
+} SVGA3dCmdSetClipPlane;           /* SVGA_3D_CMD_SETCLIPPLANE */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               shid;
+   SVGA3dShaderType     type;
+   /* Followed by variable number of DWORDs for shader bytecode */
+} SVGA3dCmdDefineShader;           /* SVGA_3D_CMD_SHADER_DEFINE */
+
+typedef
+struct {
+   uint32               cid;
+   uint32               shid;
+   SVGA3dShaderType     type;
+} SVGA3dCmdDestroyShader;         /* SVGA_3D_CMD_SHADER_DESTROY */
+
+typedef
+struct {
+   uint32                  cid;
+   uint32                  reg;     /* register number */
+   SVGA3dShaderType        type;
+   SVGA3dShaderConstType   ctype;
+   uint32                  values[4];
+} SVGA3dCmdSetShaderConst;        /* SVGA_3D_CMD_SET_SHADER_CONST */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dShaderType     type;
+   uint32               shid;
+} SVGA3dCmdSetShader;             /* SVGA_3D_CMD_SET_SHADER */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dQueryType      type;
+} SVGA3dCmdBeginQuery;           /* SVGA_3D_CMD_BEGIN_QUERY */
+
+typedef
+struct {
+   uint32               cid;
+   SVGA3dQueryType      type;
+   SVGAGuestPtr         guestResult;  /* Points to an SVGA3dQueryResult structure */
+} SVGA3dCmdEndQuery;                  /* SVGA_3D_CMD_END_QUERY */
+
+typedef
+struct {
+   uint32               cid;          /* Same parameters passed to END_QUERY */
+   SVGA3dQueryType      type;
+   SVGAGuestPtr         guestResult;
+} SVGA3dCmdWaitForQuery;              /* SVGA_3D_CMD_WAIT_FOR_QUERY */
+
+typedef
+struct {
+   uint32               totalSize;    /* Set by guest before query is ended. */
+   SVGA3dQueryState     state;        /* Set by host or guest. See SVGA3dQueryState. */
+   union {                            /* Set by host on exit from PENDING state */
+      uint32            result32;
+   };
+} SVGA3dQueryResult;
+
+/*
+ * SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN --
+ *
+ *    This is a blit from an SVGA3D surface to a Screen Object. Just
+ *    like GMR-to-screen blits, this blit may be directed at a
+ *    specific screen or to the virtual coordinate space.
+ *
+ *    The blit copies from a rectangular region of an SVGA3D surface
+ *    image to a rectangular region of a screen or screens.
+ *
+ *    This command takes an optional variable-length list of clipping
+ *    rectangles after the body of the command. If no rectangles are
+ *    specified, there is no clipping region. The entire destRect is
+ *    drawn to. If one or more rectangles are included, they describe
+ *    a clipping region. The clip rectangle coordinates are measured
+ *    relative to the top-left corner of destRect.
+ *
+ *    This clipping region serves multiple purposes:
+ *
+ *      - It can be used to perform an irregularly shaped blit more
+ *        efficiently than by issuing many separate blit commands.
+ *
+ *      - It is equivalent to allowing blits with non-integer
+ *        source coordinates. You could blit just one half-pixel
+ *        of a source, for example, by specifying a larger
+ *        destination rectangle than you need, then removing
+ *        part of it using a clip rectangle.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ *
+ * Limitations:
+ *
+ *    - Currently, no backend supports blits from a mipmap or face
+ *      other than the first one.
+ */
+
+typedef
+struct {
+   SVGA3dSurfaceImageId srcImage;
+   SVGASignedRect       srcRect;
+   uint32               destScreenId; /* Screen ID or SVGA_ID_INVALID for virt. coords */
+   SVGASignedRect       destRect;     /* Supports scaling if src/dest different size */
+   /* Clipping: zero or more SVGASignedRects follow */
+} SVGA3dCmdBlitSurfaceToScreen;         /* SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN */
+
+
+/*
+ * Capability query index.
+ *
+ * Notes:
+ *
+ *   1. SVGA3D_DEVCAP_MAX_TEXTURES reflects the maximum number of
+ *      fixed-function texture units available. Each of these units
+ *      work in both FFP and Shader modes, and they support texture
+ *      transforms and texture coordinates. The host may have additional
+ *      texture image units that are only usable with shaders.
+ *
+ *   2. The BUFFER_FORMAT capabilities are deprecated, and they always
+ *      return TRUE. Even on physical hardware that does not support
+ *      these formats natively, the SVGA3D device will provide an emulation
+ *      which should be invisible to the guest OS.
+ *
+ *      In general, the SVGA3D device should support any operation on
+ *      any surface format, it just may perform some of these
+ *      operations in software depending on the capabilities of the
+ *      available physical hardware.
+ *
+ *      XXX: In the future, we will add capabilities that describe in
+ *      detail what formats are supported in hardware for what kinds
+ *      of operations.
+ */
+
+typedef enum {
+   SVGA3D_DEVCAP_3D                                = 0,
+   SVGA3D_DEVCAP_MAX_LIGHTS                        = 1,
+   SVGA3D_DEVCAP_MAX_TEXTURES                      = 2,  /* See note (1) */
+   SVGA3D_DEVCAP_MAX_CLIP_PLANES                   = 3,
+   SVGA3D_DEVCAP_VERTEX_SHADER_VERSION             = 4,
+   SVGA3D_DEVCAP_VERTEX_SHADER                     = 5,
+   SVGA3D_DEVCAP_FRAGMENT_SHADER_VERSION           = 6,
+   SVGA3D_DEVCAP_FRAGMENT_SHADER                   = 7,
+   SVGA3D_DEVCAP_MAX_RENDER_TARGETS                = 8,
+   SVGA3D_DEVCAP_S23E8_TEXTURES                    = 9,
+   SVGA3D_DEVCAP_S10E5_TEXTURES                    = 10,
+   SVGA3D_DEVCAP_MAX_FIXED_VERTEXBLEND             = 11,
+   SVGA3D_DEVCAP_D16_BUFFER_FORMAT                 = 12, /* See note (2) */
+   SVGA3D_DEVCAP_D24S8_BUFFER_FORMAT               = 13, /* See note (2) */
+   SVGA3D_DEVCAP_D24X8_BUFFER_FORMAT               = 14, /* See note (2) */
+   SVGA3D_DEVCAP_QUERY_TYPES                       = 15,
+   SVGA3D_DEVCAP_TEXTURE_GRADIENT_SAMPLING         = 16,
+   SVGA3D_DEVCAP_MAX_POINT_SIZE                    = 17,
+   SVGA3D_DEVCAP_MAX_SHADER_TEXTURES               = 18,
+   SVGA3D_DEVCAP_MAX_TEXTURE_WIDTH                 = 19,
+   SVGA3D_DEVCAP_MAX_TEXTURE_HEIGHT                = 20,
+   SVGA3D_DEVCAP_MAX_VOLUME_EXTENT                 = 21,
+   SVGA3D_DEVCAP_MAX_TEXTURE_REPEAT                = 22,
+   SVGA3D_DEVCAP_MAX_TEXTURE_ASPECT_RATIO          = 23,
+   SVGA3D_DEVCAP_MAX_TEXTURE_ANISOTROPY            = 24,
+   SVGA3D_DEVCAP_MAX_PRIMITIVE_COUNT               = 25,
+   SVGA3D_DEVCAP_MAX_VERTEX_INDEX                  = 26,
+   SVGA3D_DEVCAP_MAX_VERTEX_SHADER_INSTRUCTIONS    = 27,
+   SVGA3D_DEVCAP_MAX_FRAGMENT_SHADER_INSTRUCTIONS  = 28,
+   SVGA3D_DEVCAP_MAX_VERTEX_SHADER_TEMPS           = 29,
+   SVGA3D_DEVCAP_MAX_FRAGMENT_SHADER_TEMPS         = 30,
+   SVGA3D_DEVCAP_TEXTURE_OPS                       = 31,
+   SVGA3D_DEVCAP_SURFACEFMT_X8R8G8B8               = 32,
+   SVGA3D_DEVCAP_SURFACEFMT_A8R8G8B8               = 33,
+   SVGA3D_DEVCAP_SURFACEFMT_A2R10G10B10            = 34,
+   SVGA3D_DEVCAP_SURFACEFMT_X1R5G5B5               = 35,
+   SVGA3D_DEVCAP_SURFACEFMT_A1R5G5B5               = 36,
+   SVGA3D_DEVCAP_SURFACEFMT_A4R4G4B4               = 37,
+   SVGA3D_DEVCAP_SURFACEFMT_R5G6B5                 = 38,
+   SVGA3D_DEVCAP_SURFACEFMT_LUMINANCE16            = 39,
+   SVGA3D_DEVCAP_SURFACEFMT_LUMINANCE8_ALPHA8      = 40,
+   SVGA3D_DEVCAP_SURFACEFMT_ALPHA8                 = 41,
+   SVGA3D_DEVCAP_SURFACEFMT_LUMINANCE8             = 42,
+   SVGA3D_DEVCAP_SURFACEFMT_Z_D16                  = 43,
+   SVGA3D_DEVCAP_SURFACEFMT_Z_D24S8                = 44,
+   SVGA3D_DEVCAP_SURFACEFMT_Z_D24X8                = 45,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT1                   = 46,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT2                   = 47,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT3                   = 48,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT4                   = 49,
+   SVGA3D_DEVCAP_SURFACEFMT_DXT5                   = 50,
+   SVGA3D_DEVCAP_SURFACEFMT_BUMPX8L8V8U8           = 51,
+   SVGA3D_DEVCAP_SURFACEFMT_A2W10V10U10            = 52,
+   SVGA3D_DEVCAP_SURFACEFMT_BUMPU8V8               = 53,
+   SVGA3D_DEVCAP_SURFACEFMT_Q8W8V8U8               = 54,
+   SVGA3D_DEVCAP_SURFACEFMT_CxV8U8                 = 55,
+   SVGA3D_DEVCAP_SURFACEFMT_R_S10E5                = 56,
+   SVGA3D_DEVCAP_SURFACEFMT_R_S23E8                = 57,
+   SVGA3D_DEVCAP_SURFACEFMT_RG_S10E5               = 58,
+   SVGA3D_DEVCAP_SURFACEFMT_RG_S23E8               = 59,
+   SVGA3D_DEVCAP_SURFACEFMT_ARGB_S10E5             = 60,
+   SVGA3D_DEVCAP_SURFACEFMT_ARGB_S23E8             = 61,
+   SVGA3D_DEVCAP_MAX_VERTEX_SHADER_TEXTURES        = 63,
+
+   /*
+    * Note that MAX_SIMULTANEOUS_RENDER_TARGETS is a maximum count of color
+    * render targets.  This does not include the depth or stencil targets.
+    */
+   SVGA3D_DEVCAP_MAX_SIMULTANEOUS_RENDER_TARGETS   = 64,
+
+   SVGA3D_DEVCAP_SURFACEFMT_V16U16                 = 65,
+   SVGA3D_DEVCAP_SURFACEFMT_G16R16                 = 66,
+   SVGA3D_DEVCAP_SURFACEFMT_A16B16G16R16           = 67,
+   SVGA3D_DEVCAP_SURFACEFMT_UYVY                   = 68,
+   SVGA3D_DEVCAP_SURFACEFMT_YUY2                   = 69,
+
+   /*
+    * Don't add new caps into the previous section; the values in this
+    * enumeration must not change. You can put new values right before
+    * SVGA3D_DEVCAP_MAX.
+    */
+   SVGA3D_DEVCAP_MAX                                  /* This must be the last index. */
+} SVGA3dDevCapIndex;
+
+typedef union {
+   Bool   b;
+   uint32 u;
+   int32  i;
+   float  f;
+} SVGA3dDevCapResult;
+
+#endif /* _SVGA3D_REG_H_ */
diff --git a/src/gallium/drivers/svga/include/svga3d_shaderdefs.h b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
new file mode 100644
index 00000000000..2078c4a8a44
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga3d_shaderdefs.h
@@ -0,0 +1,519 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga3d_shaderdefs.h --
+ *
+ * SVGA3D byte code format and limit definitions.
+ *
+ * The format of the byte code directly corresponds to that defined
+ * by Microsoft DirectX SDK 9.0c (file d3d9types.h). The format can
+ * also be extended so that different shader formats can be supported
+ * for example GLSL, ARB vp/fp, NV/ATI shader formats, etc.
+ *
+ */
+
+#ifndef __SVGA3D_SHADER_DEFS__
+#define __SVGA3D_SHADER_DEFS__
+
+/* SVGA3D shader hardware limits. */
+
+#define SVGA3D_INPUTREG_MAX            16
+#define SVGA3D_OUTPUTREG_MAX           12
+#define SVGA3D_VERTEX_SAMPLERREG_MAX   4
+#define SVGA3D_PIXEL_SAMPLERREG_MAX    16
+#define SVGA3D_SAMPLERREG_MAX          (SVGA3D_PIXEL_SAMPLERREG_MAX+\
+                                        SVGA3D_VERTEX_SAMPLERREG_MAX)
+#define SVGA3D_TEMPREG_MAX             32
+#define SVGA3D_CONSTREG_MAX            256
+#define SVGA3D_CONSTINTREG_MAX         16
+#define SVGA3D_CONSTBOOLREG_MAX        16
+#define SVGA3D_ADDRREG_MAX             1
+#define SVGA3D_PREDREG_MAX             1
+
+/* SVGA3D byte code specific limits */
+
+#define SVGA3D_MAX_SRC_REGS      4
+#define SVGA3D_MAX_NESTING_LEVEL 32
+
+/* SVGA3D version information. */
+
+#define SVGA3D_VS_TYPE  0xFFFE
+#define SVGA3D_PS_TYPE  0xFFFF
+
+typedef struct {
+   union {
+      struct {
+         uint32 minor : 8;    /* minor version number */
+         uint32 major : 8;    /* major version number */
+         uint32 type : 16;    /* SVGA3D_VS_TYPE or SVGA3D_PS_TYPE */
+      };
+
+      uint32 value;           /* whole word; matches SVGA3D_VS_xx / SVGA3D_PS_xx assuming LSB-first bit-fields */
+   };
+} SVGA3dShaderVersion;
+
+#define SVGA3D_VS_10 ((SVGA3D_VS_TYPE << 16) | 1 << 8)
+#define SVGA3D_VS_11 (SVGA3D_VS_10 | 1)
+#define SVGA3D_VS_20 ((SVGA3D_VS_TYPE << 16) | 2 << 8)
+#define SVGA3D_VS_30 ((SVGA3D_VS_TYPE << 16) | 3 << 8)
+
+#define SVGA3D_PS_10 ((SVGA3D_PS_TYPE << 16) | 1 << 8)
+#define SVGA3D_PS_11 (SVGA3D_PS_10 | 1)
+#define SVGA3D_PS_12 (SVGA3D_PS_10 | 2)
+#define SVGA3D_PS_13 (SVGA3D_PS_10 | 3)
+#define SVGA3D_PS_14 (SVGA3D_PS_10 | 4)
+#define SVGA3D_PS_20 ((SVGA3D_PS_TYPE << 16) | 2 << 8)
+#define SVGA3D_PS_30 ((SVGA3D_PS_TYPE << 16) | 3 << 8)
+
+/* The *_ENABLED are for backwards compatibility with old drivers */
+typedef enum {
+   SVGA3DPSVERSION_NONE = 0,
+   SVGA3DPSVERSION_ENABLED = 1,
+   SVGA3DPSVERSION_11 = 3,
+   SVGA3DPSVERSION_12 = 5,
+   SVGA3DPSVERSION_13 = 7,
+   SVGA3DPSVERSION_14 = 9,
+   SVGA3DPSVERSION_20 = 11,
+   SVGA3DPSVERSION_30 = 13,
+   SVGA3DPSVERSION_40 = 15,
+   SVGA3DPSVERSION_MAX
+} SVGA3dPixelShaderVersion;
+
+typedef enum {
+   SVGA3DVSVERSION_NONE = 0,
+   SVGA3DVSVERSION_ENABLED = 1,
+   SVGA3DVSVERSION_11 = 3,
+   SVGA3DVSVERSION_20 = 5,
+   SVGA3DVSVERSION_30 = 7,
+   SVGA3DVSVERSION_40 = 9,
+   SVGA3DVSVERSION_MAX
+} SVGA3dVertexShaderVersion;
+
+/* SVGA3D instruction op codes. */
+
+typedef enum {
+   SVGA3DOP_NOP = 0,
+   SVGA3DOP_MOV,
+   SVGA3DOP_ADD,
+   SVGA3DOP_SUB,
+   SVGA3DOP_MAD,
+   SVGA3DOP_MUL,
+   SVGA3DOP_RCP,
+   SVGA3DOP_RSQ,
+   SVGA3DOP_DP3,
+   SVGA3DOP_DP4,
+   SVGA3DOP_MIN,
+   SVGA3DOP_MAX,
+   SVGA3DOP_SLT,
+   SVGA3DOP_SGE,
+   SVGA3DOP_EXP,
+   SVGA3DOP_LOG,
+   SVGA3DOP_LIT,
+   SVGA3DOP_DST,
+   SVGA3DOP_LRP,
+   SVGA3DOP_FRC,
+   SVGA3DOP_M4x4,
+   SVGA3DOP_M4x3,
+   SVGA3DOP_M3x4,
+   SVGA3DOP_M3x3,
+   SVGA3DOP_M3x2,
+   SVGA3DOP_CALL,
+   SVGA3DOP_CALLNZ,
+   SVGA3DOP_LOOP,
+   SVGA3DOP_RET,
+   SVGA3DOP_ENDLOOP,
+   SVGA3DOP_LABEL,
+   SVGA3DOP_DCL,
+   SVGA3DOP_POW,
+   SVGA3DOP_CRS,
+   SVGA3DOP_SGN,
+   SVGA3DOP_ABS,
+   SVGA3DOP_NRM,
+   SVGA3DOP_SINCOS,
+   SVGA3DOP_REP,
+   SVGA3DOP_ENDREP,
+   SVGA3DOP_IF,
+   SVGA3DOP_IFC,
+   SVGA3DOP_ELSE,
+   SVGA3DOP_ENDIF,
+   SVGA3DOP_BREAK,
+   SVGA3DOP_BREAKC,
+   SVGA3DOP_MOVA,
+   SVGA3DOP_DEFB,
+   SVGA3DOP_DEFI,
+   SVGA3DOP_TEXCOORD = 64,
+   SVGA3DOP_TEXKILL,
+   SVGA3DOP_TEX,
+   SVGA3DOP_TEXBEM,
+   SVGA3DOP_TEXBEML,
+   SVGA3DOP_TEXREG2AR,
+   SVGA3DOP_TEXREG2GB = 70,
+   SVGA3DOP_TEXM3x2PAD,
+   SVGA3DOP_TEXM3x2TEX,
+   SVGA3DOP_TEXM3x3PAD,
+   SVGA3DOP_TEXM3x3TEX,
+   SVGA3DOP_RESERVED0,
+   SVGA3DOP_TEXM3x3SPEC,
+   SVGA3DOP_TEXM3x3VSPEC,
+   SVGA3DOP_EXPP,
+   SVGA3DOP_LOGP,
+   SVGA3DOP_CND = 80,
+   SVGA3DOP_DEF,
+   SVGA3DOP_TEXREG2RGB,
+   SVGA3DOP_TEXDP3TEX,
+   SVGA3DOP_TEXM3x2DEPTH,
+   SVGA3DOP_TEXDP3,
+   SVGA3DOP_TEXM3x3,
+   SVGA3DOP_TEXDEPTH,
+   SVGA3DOP_CMP,
+   SVGA3DOP_BEM,
+   SVGA3DOP_DP2ADD = 90,
+   SVGA3DOP_DSX,
+   SVGA3DOP_DSY,
+   SVGA3DOP_TEXLDD,
+   SVGA3DOP_SETP,
+   SVGA3DOP_TEXLDL,
+   SVGA3DOP_BREAKP = 96,
+   SVGA3DOP_LAST_INST,
+   SVGA3DOP_PHASE = 0xFFFD,
+   SVGA3DOP_COMMENT = 0xFFFE,
+   SVGA3DOP_END = 0xFFFF,
+} SVGA3dShaderOpCodeType;
+
+/* SVGA3D operation control/comparison function types */
+
+typedef enum {
+   SVGA3DOPCONT_NONE,
+   SVGA3DOPCONT_PROJECT,   /* Projective texturing */
+   SVGA3DOPCONT_BIAS,      /* Texturing with a LOD bias */
+} SVGA3dShaderOpCodeControlFnType;
+
+typedef enum {
+   SVGA3DOPCOMP_RESERVED0 = 0,
+   SVGA3DOPCOMP_GT,
+   SVGA3DOPCOMP_EQ,
+   SVGA3DOPCOMP_GE,
+   SVGA3DOPCOMP_LT,
+   SVGA3DOPCOMPC_NE,
+   SVGA3DOPCOMP_LE,
+   SVGA3DOPCOMP_RESERVED1
+} SVGA3dShaderOpCodeCompFnType;
+
+/* SVGA3D register types */
+
+typedef enum {
+    SVGA3DREG_TEMP = 0,       /* Temporary register file */
+    SVGA3DREG_INPUT,          /* Input register file */
+    SVGA3DREG_CONST,          /* Constant register file */
+    SVGA3DREG_ADDR,           /* Address register for VS (implicitly 3) */
+    SVGA3DREG_TEXTURE = 3,    /* Texture register file for PS; same value as SVGA3DREG_ADDR */
+    SVGA3DREG_RASTOUT,        /* Rasterizer register file */
+    SVGA3DREG_ATTROUT,        /* Attribute output register file */
+    SVGA3DREG_TEXCRDOUT,      /* Texture coordinate output register file (implicitly 6) */
+    SVGA3DREG_OUTPUT = 6,     /* Output register file for VS 3.0+; same value as SVGA3DREG_TEXCRDOUT */
+    SVGA3DREG_CONSTINT,       /* Constant integer vector register file */
+    SVGA3DREG_COLOROUT,       /* Color output register file */
+    SVGA3DREG_DEPTHOUT,       /* Depth output register file */
+    SVGA3DREG_SAMPLER,        /* Sampler state register file */
+    SVGA3DREG_CONST2,         /* Constant register file 2048 - 4095 */
+    SVGA3DREG_CONST3,         /* Constant register file 4096 - 6143 */
+    SVGA3DREG_CONST4,         /* Constant register file 6144 - 8191 */
+    SVGA3DREG_CONSTBOOL,      /* Constant boolean register file */
+    SVGA3DREG_LOOP,           /* Loop counter register file */
+    SVGA3DREG_TEMPFLOAT16,    /* 16-bit float temp register file */
+    SVGA3DREG_MISCTYPE,       /* Miscellaneous (single) registers */
+    SVGA3DREG_LABEL,          /* Label */
+    SVGA3DREG_PREDICATE,      /* Predicate register (value 19, last defined type) */
+} SVGA3dShaderRegType;
+
+/* SVGA3D rasterizer output register types */
+
+typedef enum {
+   SVGA3DRASTOUT_POSITION = 0,
+   SVGA3DRASTOUT_FOG,
+   SVGA3DRASTOUT_PSIZE
+} SVGA3dShaderRastOutRegType;
+
+/* SVGA3D miscellaneous register types */
+
+typedef enum {
+   SVGA3DMISCREG_POSITION = 0,   /* Input position x,y,z,rhw (PS) */
+   SVGA3DMISCREG_FACE            /* Floating point primitive area (PS) */
+} SVGA3DShaderMiscRegType;
+
+/* SVGA3D sampler types */
+
+typedef enum {
+   SVGA3DSAMP_UNKNOWN = 0, /* Uninitialized value */
+   SVGA3DSAMP_2D = 2,      /* dcl_2d s# (for declaring a 2-D texture) */
+   SVGA3DSAMP_CUBE,        /* dcl_cube s# (for declaring a cube texture) */
+   SVGA3DSAMP_VOLUME,      /* dcl_volume s# (for declaring a volume texture) */
+} SVGA3dShaderSamplerType;
+
+/* SVGA3D sampler format classes */
+
+typedef enum {
+   SVGA3DSAMPFORMAT_ARGB,        /* ARGB formats */
+   SVGA3DSAMPFORMAT_V8U8,        /* Sign and normalize (SNORM) V & U */
+   SVGA3DSAMPFORMAT_Q8W8V8U8,    /* SNORM all */
+   SVGA3DSAMPFORMAT_CxV8U8,      /* SNORM V & U, C=SQRT(1-U^2-V^2) */
+   SVGA3DSAMPFORMAT_X8L8V8U8,    /* SNORM V & U */
+   SVGA3DSAMPFORMAT_A2W10V10U10, /* SNORM W, V & U */
+   SVGA3DSAMPFORMAT_DXT_PMA,     /* DXT pre-multiplied alpha */
+   SVGA3DSAMPFORMAT_YUV,         /* YUV video format */
+   SVGA3DSAMPFORMAT_UYVY,        /* UYVY video format */
+   SVGA3DSAMPFORMAT_Rx,          /* R16F/32F */
+   SVGA3DSAMPFORMAT_RxGx,        /* R16FG16F, R32FG32F */
+   SVGA3DSAMPFORMAT_V16U16,      /* SNORM all */
+} SVGA3DShaderSamplerFormatClass;
+
+/* SVGA3D write mask */
+
+#define SVGA3DWRITEMASK_0    1 /* Component 0 (X;Red) */
+#define SVGA3DWRITEMASK_1    2 /* Component 1 (Y;Green) */
+#define SVGA3DWRITEMASK_2    4 /* Component 2 (Z;Blue) */
+#define SVGA3DWRITEMASK_3    8 /* Component 3 (W;Alpha) */
+#define SVGA3DWRITEMASK_ALL 15 /* All components */
+
+/* SVGA3D destination modifiers */
+
+#define SVGA3DDSTMOD_NONE              0 /* nop */
+#define SVGA3DDSTMOD_SATURATE          1 /* clamp to [0, 1] */
+#define SVGA3DDSTMOD_PARTIALPRECISION  2 /* Partial precision hint */
+
+/*
+ * Relevant to multisampling only:
+ * When the pixel center is not covered, sample
+ * attribute or compute gradients/LOD
+ * using multisample "centroid" location.
+ * "Centroid" is some location within the covered
+ * region of the pixel.
+ */
+
+#define SVGA3DDSTMOD_MSAMPCENTROID     4
+
+/* SVGA3D source swizzle */
+
+#define SVGA3DSWIZZLE_REPLICATEX 0x00
+#define SVGA3DSWIZZLE_REPLICATEY 0x55
+#define SVGA3DSWIZZLE_REPLICATEZ 0xAA
+#define SVGA3DSWIZZLE_REPLICATEW 0xFF
+#define SVGA3DSWIZZLE_NONE       0xE4
+#define SVGA3DSWIZZLE_YZXW       0xC9
+#define SVGA3DSWIZZLE_ZXYW       0xD2
+#define SVGA3DSWIZZLE_WXYZ       0x1B
+
+/* SVGA3D source modifiers */
+
+typedef enum {
+    SVGA3DSRCMOD_NONE = 0, /* nop */
+    SVGA3DSRCMOD_NEG,      /* negate */
+    SVGA3DSRCMOD_BIAS,     /* bias */
+    SVGA3DSRCMOD_BIASNEG,  /* bias and negate */
+    SVGA3DSRCMOD_SIGN,     /* sign */
+    SVGA3DSRCMOD_SIGNNEG,  /* sign and negate */
+    SVGA3DSRCMOD_COMP,     /* complement */
+    SVGA3DSRCMOD_X2,       /* x2 */
+    SVGA3DSRCMOD_X2NEG,    /* x2 and negate */
+    SVGA3DSRCMOD_DZ,       /* divide through by z component */
+    SVGA3DSRCMOD_DW,       /* divide through by w component */
+    SVGA3DSRCMOD_ABS,      /* abs() */
+    SVGA3DSRCMOD_ABSNEG,   /* -abs() */
+    SVGA3DSRCMOD_NOT,      /* ! (for predicate register) */
+} SVGA3dShaderSrcModType;
+
+/* SVGA3D instruction token */
+
+typedef struct {
+   union {
+      struct {
+         uint32 comment_op : 16;
+         uint32 comment_size : 16;
+      };
+
+      struct {
+         uint32 op : 16;
+         uint32 control : 3;
+         uint32 reserved2 : 5;
+         uint32 size : 4;
+         uint32 predicated : 1;
+         uint32 reserved1 : 1;
+         uint32 coissue : 1;
+         uint32 reserved0 : 1;
+      };
+
+      uint32 value;
+   };
+} SVGA3dShaderInstToken;
+
+/* SVGA3D destination parameter token */
+
+typedef struct {
+   union {
+      struct {
+         uint32 num : 11;
+         uint32 type_upper : 2;
+         uint32 relAddr : 1;
+         uint32 reserved1 : 2;
+         uint32 mask : 4;
+         uint32 dstMod : 4;
+         uint32 shfScale : 4;
+         uint32 type_lower : 3;
+         uint32 reserved0 : 1;
+      };
+
+      uint32 value;
+   };
+} SVGA3dShaderDestToken;
+
+/* SVGA3D source parameter token */
+
+typedef struct {
+   union {
+      struct {
+         uint32 num : 11;
+         uint32 type_upper : 2;
+         uint32 relAddr : 1;
+         uint32 reserved1 : 2;
+         uint32 swizzle : 8;
+         uint32 srcMod : 4;
+         uint32 type_lower : 3;
+         uint32 reserved0 : 1;
+      };
+
+      uint32 value;
+   };
+} SVGA3dShaderSrcToken;
+
+/* SVGA3DOP_DCL parameter tokens */
+
+typedef struct {
+   union {
+      struct {
+         union {
+            struct {
+               uint32 usage : 5;
+               uint32 reserved1 : 11;
+               uint32 index : 4;
+               uint32 reserved0 : 12;
+            }; /* input / output declaration */
+
+            struct {
+               uint32 reserved3 : 27;
+               uint32 type : 4;
+               uint32 reserved2 : 1;
+            }; /* sampler declaration */
+         };
+
+         SVGA3dShaderDestToken dst;
+      };
+
+      uint32 values[2];
+   };
+} SVGA3DOpDclArgs;
+
+/* SVGA3DOP_DEF parameter tokens */
+
+typedef struct {
+   union {
+      struct {
+         SVGA3dShaderDestToken dst;
+
+         union {
+            float constValues[4];
+            int constIValues[4];
+            Bool constBValue;
+         };
+      };
+
+      uint32 values[5];
+   };
+} SVGA3DOpDefArgs;
+
+/* SVGA3D shader token */
+
+typedef union {
+   uint32 value;
+   SVGA3dShaderInstToken inst;
+   SVGA3dShaderDestToken dest;
+   SVGA3dShaderSrcToken src;
+} SVGA3dShaderToken;
+
+/* SVGA3D shader program */
+
+typedef struct {
+   SVGA3dShaderVersion version;
+   /* SVGA3dShaderToken stream */
+} SVGA3dShaderProgram;
+
+/* SVGA3D version specific register assignments */
+
+static const uint32 SVGA3D_INPUT_REG_POSITION_VS11 = 0;
+static const uint32 SVGA3D_INPUT_REG_PSIZE_VS11 = 1;
+static const uint32 SVGA3D_INPUT_REG_FOG_VS11 = 3;
+static const uint32 SVGA3D_INPUT_REG_FOG_MASK_VS11 = SVGA3DWRITEMASK_3;
+static const uint32 SVGA3D_INPUT_REG_COLOR_BASE_VS11 = 2;
+static const uint32 SVGA3D_INPUT_REG_TEXCOORD_BASE_VS11 = 4;
+
+static const uint32 SVGA3D_INPUT_REG_COLOR_BASE_PS11 = 0;
+static const uint32 SVGA3D_INPUT_REG_TEXCOORD_BASE_PS11 = 2;
+static const uint32 SVGA3D_OUTPUT_REG_DEPTH_PS11 = 0;
+static const uint32 SVGA3D_OUTPUT_REG_COLOR_PS11 = 1;
+
+static const uint32 SVGA3D_INPUT_REG_COLOR_BASE_PS20 = 0;
+static const uint32 SVGA3D_INPUT_REG_COLOR_NUM_PS20 = 2;
+static const uint32 SVGA3D_INPUT_REG_TEXCOORD_BASE_PS20 = 2;
+static const uint32 SVGA3D_INPUT_REG_TEXCOORD_NUM_PS20 = 8;
+static const uint32 SVGA3D_OUTPUT_REG_COLOR_BASE_PS20 = 1;
+static const uint32 SVGA3D_OUTPUT_REG_COLOR_NUM_PS20 = 4;
+static const uint32 SVGA3D_OUTPUT_REG_DEPTH_BASE_PS20 = 0;
+static const uint32 SVGA3D_OUTPUT_REG_DEPTH_NUM_PS20 = 1;
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3dShaderGetRegType --
+ *
+ *      As the register type is split into two non sequential fields,
+ *      this function provides a useful way of accessing the actual
+ *      register type without having to manually concatenate the
+ *      type_upper and type_lower fields.
+ *
+ * Results:
+ *      Returns the register type.
+ *
+ *----------------------------------------------------------------------
+ */
+
+static INLINE SVGA3dShaderRegType
+SVGA3dShaderGetRegType(uint32 token)
+{
+   SVGA3dShaderSrcToken decoded;   /* src and dest tokens declare type_upper/type_lower at the same positions */
+   decoded.value = token;
+   return (SVGA3dShaderRegType)((decoded.type_upper << 3) | decoded.type_lower);
+}
+
+#endif /* __SVGA3D_SHADER_DEFS__ */
diff --git a/src/gallium/drivers/svga/include/svga_escape.h b/src/gallium/drivers/svga/include/svga_escape.h
new file mode 100644
index 00000000000..7b85e9b8c85
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga_escape.h
@@ -0,0 +1,89 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga_escape.h --
+ *
+ *    Definitions for our own (vendor-specific) SVGA Escape commands.
+ */
+
+#ifndef _SVGA_ESCAPE_H_
+#define _SVGA_ESCAPE_H_
+
+
+/*
+ * Namespace IDs for the escape command
+ */
+
+#define SVGA_ESCAPE_NSID_VMWARE 0x00000000
+#define SVGA_ESCAPE_NSID_DEVEL  0xFFFFFFFF
+
+
+/*
+ * Within SVGA_ESCAPE_NSID_VMWARE, we multiplex commands according to
+ * the first DWORD of escape data (after the nsID and size). As a
+ * guideline we're using the high word and low word as a major and
+ * minor command number, respectively.
+ *
+ * Major command number allocation:
+ *
+ *   0000: Reserved
+ *   0001: SVGA_ESCAPE_VMWARE_LOG (svga_binary_logger.h)
+ *   0002: SVGA_ESCAPE_VMWARE_VIDEO (svga_overlay.h)
+ *   0003: SVGA_ESCAPE_VMWARE_HINT (svga_escape.h)
+ */
+
+#define SVGA_ESCAPE_VMWARE_MAJOR_MASK  0xFFFF0000
+
+
+/*
+ * SVGA Hint commands.
+ *
+ * These escapes let the SVGA driver provide optional information to
+ * the host about the state of the guest or guest applications. The
+ * host can use these hints to make user interface or performance
+ * decisions.
+ *
+ * Notes:
+ *
+ *   - SVGA_ESCAPE_VMWARE_HINT_FULLSCREEN is deprecated for guests
+ *     that use the SVGA Screen Object extension. Instead of sending
+ *     this escape, use the SVGA_SCREEN_FULLSCREEN_HINT flag on your
+ *     Screen Object.
+ */
+
+#define SVGA_ESCAPE_VMWARE_HINT               0x00030000
+#define SVGA_ESCAPE_VMWARE_HINT_FULLSCREEN    0x00030001  // Deprecated
+
+typedef
+struct {
+   uint32 command;
+   uint32 fullscreen;
+   struct {
+      int32 x, y;
+   } monitorPosition;
+} SVGAEscapeHintFullscreen;
+
+#endif /* _SVGA_ESCAPE_H_ */
diff --git a/src/gallium/drivers/svga/include/svga_overlay.h b/src/gallium/drivers/svga/include/svga_overlay.h
new file mode 100644
index 00000000000..82c1d3ff3e2
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga_overlay.h
@@ -0,0 +1,201 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga_overlay.h --
+ *
+ *    Definitions for video-overlay support.
+ */
+
+#ifndef _SVGA_OVERLAY_H_
+#define _SVGA_OVERLAY_H_
+
+#include "svga_reg.h"
+
+/*
+ * Video formats we support
+ */
+
+#define VMWARE_FOURCC_YV12 0x32315659 // 'Y' 'V' '1' '2'
+#define VMWARE_FOURCC_YUY2 0x32595559 // 'Y' 'U' 'Y' '2'
+#define VMWARE_FOURCC_UYVY 0x59565955 // 'U' 'Y' 'V' 'Y'
+
+typedef enum {
+   SVGA_OVERLAY_FORMAT_INVALID = 0,
+   SVGA_OVERLAY_FORMAT_YV12 = VMWARE_FOURCC_YV12,
+   SVGA_OVERLAY_FORMAT_YUY2 = VMWARE_FOURCC_YUY2,
+   SVGA_OVERLAY_FORMAT_UYVY = VMWARE_FOURCC_UYVY,
+} SVGAOverlayFormat;
+
+#define SVGA_VIDEO_COLORKEY_MASK             0x00ffffff
+
+#define SVGA_ESCAPE_VMWARE_VIDEO             0x00020000
+
+#define SVGA_ESCAPE_VMWARE_VIDEO_SET_REGS    0x00020001
+        /* FIFO escape layout:
+         * Type, Stream Id, (Register Id, Value) pairs */
+
+#define SVGA_ESCAPE_VMWARE_VIDEO_FLUSH       0x00020002
+        /* FIFO escape layout:
+         * Type, Stream Id */
+
+typedef
+struct SVGAEscapeVideoSetRegs {
+   struct {
+      uint32 cmdType;
+      uint32 streamId;
+   } header;
+
+   // May include zero or more items.
+   struct {
+      uint32 registerId;
+      uint32 value;
+   } items[1];
+} SVGAEscapeVideoSetRegs;
+
+typedef
+struct SVGAEscapeVideoFlush {
+   uint32 cmdType;
+   uint32 streamId;
+} SVGAEscapeVideoFlush;
+
+
+/*
+ * Struct definitions for the video overlay commands built on
+ * SVGAFifoCmdEscape.
+ */
+typedef
+struct {
+   uint32 command;
+   uint32 overlay;
+} SVGAFifoEscapeCmdVideoBase;
+
+typedef
+struct {
+   SVGAFifoEscapeCmdVideoBase videoCmd;
+} SVGAFifoEscapeCmdVideoFlush;
+
+typedef
+struct {
+   SVGAFifoEscapeCmdVideoBase videoCmd;
+   struct {
+      uint32 regId;
+      uint32 value;
+   } items[1];
+} SVGAFifoEscapeCmdVideoSetRegs;
+
+typedef
+struct {
+   SVGAFifoEscapeCmdVideoBase videoCmd;
+   struct {
+      uint32 regId;
+      uint32 value;
+   } items[SVGA_VIDEO_NUM_REGS];
+} SVGAFifoEscapeCmdVideoSetAllRegs;
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * VMwareVideoGetAttributes --
+ *
+ *      Computes the size, pitches and offsets for YUV frames.
+ *
+ * Results:
+ *      TRUE on success; FALSE if 'format' is not a supported FOURCC.
+ *
+ * Side effects:
+ *      Pitches and offsets for the given YUV frame are put in 'pitches'
+ *      and 'offsets' respectively. They are both optional though.
+ *
+ *----------------------------------------------------------------------
+ */
+
+static INLINE Bool
+VMwareVideoGetAttributes(const SVGAOverlayFormat format,    // IN
+                         uint32 *width,                     // IN / OUT
+                         uint32 *height,                    // IN / OUT
+                         uint32 *size,                      // OUT
+                         uint32 *pitches,                   // OUT (optional)
+                         uint32 *offsets)                   // OUT (optional)
+{
+    uint32 tmp;   // unsigned, like every value it is combined with
+
+    *width = (*width + 1) & ~1;   // round width up to an even value
+
+    if (offsets) {
+        offsets[0] = 0;
+    }
+
+    switch (format) {
+    case VMWARE_FOURCC_YV12:
+       *height = (*height + 1) & ~1;   // round height up to an even value
+       *size = (*width + 3) & ~3;      // plane 0 pitch: width rounded up to a multiple of 4
+
+       if (pitches) {
+          pitches[0] = *size;
+       }
+
+       *size *= *height;               // size of plane 0
+
+       if (offsets) {
+          offsets[1] = *size;
+       }
+
+       tmp = ((*width >> 1) + 3) & ~3; // planes 1 and 2 pitch: half width, rounded up to 4
+
+       if (pitches) {
+          pitches[1] = pitches[2] = tmp;
+       }
+
+       tmp *= (*height >> 1);          // size of one half-height plane
+       *size += tmp;
+
+       if (offsets) {
+          offsets[2] = *size;
+       }
+
+       *size += tmp;
+       break;
+
+    case VMWARE_FOURCC_YUY2:
+    case VMWARE_FOURCC_UYVY:
+       *size = *width * 2;             // single plane; pitch is twice the (even) width
+
+       if (pitches) {
+          pitches[0] = *size;
+       }
+
+       *size *= *height;
+       break;
+
+    default:
+       return FALSE;                   // unrecognised format; *size is not written
+    }
+
+    return TRUE;
+}
+
+#endif // _SVGA_OVERLAY_H_
diff --git a/src/gallium/drivers/svga/include/svga_reg.h b/src/gallium/drivers/svga/include/svga_reg.h
new file mode 100644
index 00000000000..1b96c2ec07d
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga_reg.h
@@ -0,0 +1,1346 @@
+/**********************************************************
+ * Copyright 1998-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga_reg.h --
+ *
+ *    Virtual hardware definitions for the VMware SVGA II device.
+ */
+
+#ifndef _SVGA_REG_H_
+#define _SVGA_REG_H_
+
+/*
+ * PCI device IDs.
+ */
+#define PCI_VENDOR_ID_VMWARE            0x15AD   /* PCI vendor ID used by VMware devices */
+#define PCI_DEVICE_ID_VMWARE_SVGA2      0x0405   /* PCI device ID of the SVGA II device */
+
+/*
+ * Legal values for the SVGA_REG_CURSOR_ON register in old-fashioned
+ * cursor bypass mode. This is still supported, but no new guest
+ * drivers should use it (the register is marked deprecated below).
+ */
+#define SVGA_CURSOR_ON_HIDE            0x0   /* Must be 0 to maintain backward compatibility */
+#define SVGA_CURSOR_ON_SHOW            0x1   /* Must be 1 to maintain backward compatibility */
+#define SVGA_CURSOR_ON_REMOVE_FROM_FB  0x2   /* Remove the cursor from the framebuffer because we need to see what's under it */
+#define SVGA_CURSOR_ON_RESTORE_TO_FB   0x3   /* Put the cursor back in the framebuffer so the user can see it */
+
+/*
+ * The maximum framebuffer size that can be traced for e.g. guests in VESA mode.
+ * The changeMap in the monitor is proportional to this number. Therefore, we'd
+ * like to keep it as small as possible to reduce monitor overhead (using
+ * SVGA_VRAM_MAX_SIZE for this increases the size of the shared area by over
+ * 4k!).
+ *
+ * NB: For compatibility reasons, this value must be greater than 0xff0000.
+ *     See bug 335072.
+ */
+#define SVGA_FB_MAX_TRACEABLE_SIZE      0x1000000
+
+#define SVGA_MAX_PSEUDOCOLOR_DEPTH      8
+#define SVGA_MAX_PSEUDOCOLORS           (1 << SVGA_MAX_PSEUDOCOLOR_DEPTH)   /* == 256 */
+#define SVGA_NUM_PALETTE_REGS           (3 * SVGA_MAX_PSEUDOCOLORS)         /* == 768 colormap registers */
+/* An SVGA ID is SVGA_MAGIC shifted left by 8 bits, OR'ed with the version number. */
+#define SVGA_MAGIC         0x900000UL
+#define SVGA_MAKE_ID(ver)  (SVGA_MAGIC << 8 | (ver))
+
+/* Version 2 let the address of the frame buffer be unsigned on Win32 */
+#define SVGA_VERSION_2     2
+#define SVGA_ID_2          SVGA_MAKE_ID(SVGA_VERSION_2)
+
+/* Version 1 has new registers starting with SVGA_REG_CAPABILITIES so
+   PALETTE_BASE has moved */
+#define SVGA_VERSION_1     1
+#define SVGA_ID_1          SVGA_MAKE_ID(SVGA_VERSION_1)
+
+/* Version 0 is the initial version */
+#define SVGA_VERSION_0     0
+#define SVGA_ID_0          SVGA_MAKE_ID(SVGA_VERSION_0)
+
+/* "Invalid" value for all SVGA IDs. (Version ID, screen object ID, surface ID...) */
+#define SVGA_ID_INVALID    0xFFFFFFFF
+
+/* Port offsets, relative to BAR0 */
+#define SVGA_INDEX_PORT         0x0
+#define SVGA_VALUE_PORT         0x1
+#define SVGA_BIOS_PORT          0x2
+#define SVGA_IRQSTATUS_PORT     0x8   /* Uses the SVGA_IRQFLAG_* bits below */
+
+/*
+ * Interrupt source flags for IRQSTATUS_PORT and IRQMASK.
+ *
+ * Interrupts are only supported when the
+ * SVGA_CAP_IRQMASK capability is present.
+ */
+#define SVGA_IRQFLAG_ANY_FENCE            0x1    /* Any fence was passed */
+#define SVGA_IRQFLAG_FIFO_PROGRESS        0x2    /* Made forward progress in the FIFO */
+#define SVGA_IRQFLAG_FENCE_GOAL           0x4    /* SVGA_FIFO_FENCE_GOAL reached */
+
+/*
+ * Registers
+ */
+
+enum {                                /* Numbers identifying the SVGA device registers */
+   SVGA_REG_ID = 0,
+   SVGA_REG_ENABLE = 1,
+   SVGA_REG_WIDTH = 2,                /* Framebuffer register; Screen Objects can replace it */
+   SVGA_REG_HEIGHT = 3,               /* Framebuffer register; Screen Objects can replace it */
+   SVGA_REG_MAX_WIDTH = 4,
+   SVGA_REG_MAX_HEIGHT = 5,
+   SVGA_REG_DEPTH = 6,
+   SVGA_REG_BITS_PER_PIXEL = 7,       /* Current bpp in the guest */
+   SVGA_REG_PSEUDOCOLOR = 8,
+   SVGA_REG_RED_MASK = 9,
+   SVGA_REG_GREEN_MASK = 10,
+   SVGA_REG_BLUE_MASK = 11,
+   SVGA_REG_BYTES_PER_LINE = 12,
+   SVGA_REG_FB_START = 13,            /* (Deprecated) */
+   SVGA_REG_FB_OFFSET = 14,           /* The GFB is at BAR1 + this offset (see SVGA_CMD_UPDATE) */
+   SVGA_REG_VRAM_SIZE = 15,
+   SVGA_REG_FB_SIZE = 16,
+
+   /* ID 0 implementation only had the above registers, then the palette */
+
+   SVGA_REG_CAPABILITIES = 17,        /* SVGA_CAP_* bits */
+   SVGA_REG_MEM_START = 18,           /* (Deprecated) */
+   SVGA_REG_MEM_SIZE = 19,
+   SVGA_REG_CONFIG_DONE = 20,         /* Set when memory area configured */
+   SVGA_REG_SYNC = 21,                /* See "FIFO Synchronization Registers" */
+   SVGA_REG_BUSY = 22,                /* See "FIFO Synchronization Registers" */
+   SVGA_REG_GUEST_ID = 23,            /* Set guest OS identifier */
+   SVGA_REG_CURSOR_ID = 24,           /* (Deprecated) */
+   SVGA_REG_CURSOR_X = 25,            /* (Deprecated) */
+   SVGA_REG_CURSOR_Y = 26,            /* (Deprecated) */
+   SVGA_REG_CURSOR_ON = 27,           /* (Deprecated) */
+   SVGA_REG_HOST_BITS_PER_PIXEL = 28, /* (Deprecated) */
+   SVGA_REG_SCRATCH_SIZE = 29,        /* Number of scratch registers */
+   SVGA_REG_MEM_REGS = 30,            /* Number of FIFO registers */
+   SVGA_REG_NUM_DISPLAYS = 31,        /* (Deprecated) */
+   SVGA_REG_PITCHLOCK = 32,           /* Fixed pitch for all modes */
+   SVGA_REG_IRQMASK = 33,             /* Interrupt mask */
+
+   /* Legacy multi-monitor support */
+   SVGA_REG_NUM_GUEST_DISPLAYS = 34,/* Number of guest displays in X/Y direction */
+   SVGA_REG_DISPLAY_ID = 35,        /* Display ID for the following display attributes */
+   SVGA_REG_DISPLAY_IS_PRIMARY = 36,/* Whether this is a primary display */
+   SVGA_REG_DISPLAY_POSITION_X = 37,/* The display position x */
+   SVGA_REG_DISPLAY_POSITION_Y = 38,/* The display position y */
+   SVGA_REG_DISPLAY_WIDTH = 39,     /* The display's width */
+   SVGA_REG_DISPLAY_HEIGHT = 40,    /* The display's height */
+
+   /* See "Guest memory regions" below. */
+   SVGA_REG_GMR_ID = 41,            /* Read/write: ID of the GMR to create, delete or redefine */
+   SVGA_REG_GMR_DESCRIPTOR = 42,    /* Write-only: descriptor PPN, or zero to delete the GMR */
+   SVGA_REG_GMR_MAX_IDS = 43,       /* Read-only: number of supported GMR IDs */
+   SVGA_REG_GMR_MAX_DESCRIPTOR_LENGTH = 44, /* Read-only: limit on descriptors per GMR */
+
+   SVGA_REG_TRACES = 45,            /* Enable trace-based updates even when FIFO is on */
+   SVGA_REG_TOP = 46,               /* Must be 1 more than the last register */
+
+   SVGA_PALETTE_BASE = 1024,        /* Base of SVGA color map */
+   /* Next 768 (== 256*3) registers exist for colormap */
+
+   SVGA_SCRATCH_BASE = SVGA_PALETTE_BASE + SVGA_NUM_PALETTE_REGS
+                                    /* Base of scratch registers (== 1792) */
+   /* Next reg[SVGA_REG_SCRATCH_SIZE] registers exist for scratch usage:
+      First 4 are reserved for VESA BIOS Extension; any remaining are for
+      the use of the current SVGA driver. */
+};
+
+
+/*
+ * Guest memory regions (GMRs):
+ *
+ * This is a new memory mapping feature available in SVGA devices
+ * which have the SVGA_CAP_GMR bit set. Previously, there were two
+ * fixed memory regions available with which to share data between the
+ * device and the driver: the FIFO ('MEM') and the framebuffer. GMRs
+ * are our name for an extensible way of providing arbitrary DMA
+ * buffers for use between the driver and the SVGA device. They are a
+ * new alternative to framebuffer memory, usable for both 2D and 3D
+ * graphics operations.
+ *
+ * Since GMR mapping must be done synchronously with guest CPU
+ * execution, we use a new pair of SVGA registers:
+ *
+ *   SVGA_REG_GMR_ID --
+ *
+ *     Read/write.
+ *     This register holds the 32-bit ID (a small positive integer)
+ *     of a GMR to create, delete, or redefine. Writing this register
+ *     has no side-effects.
+ *
+ *   SVGA_REG_GMR_DESCRIPTOR --
+ *
+ *     Write-only.
+ *     Writing this register will create, delete, or redefine the GMR
+ *     specified by the above ID register. If this register is zero,
+ *     the GMR is deleted. Any pointers into this GMR (including those
+ *     currently being processed by FIFO commands) will be
+ *     synchronously invalidated.
+ *
+ *     If this register is nonzero, it must be the physical page
+ *     number (PPN) of a data structure which describes the physical
+ *     layout of the memory region this GMR should describe. The
+ *     descriptor structure will be read synchronously by the SVGA
+ *     device when this register is written. The descriptor need not
+ *     remain allocated for the lifetime of the GMR.
+ *
+ *     The guest driver should write SVGA_REG_GMR_ID first, then
+ *     SVGA_REG_GMR_DESCRIPTOR.
+ *
+ *   SVGA_REG_GMR_MAX_IDS --
+ *
+ *     Read-only.
+ *     The SVGA device may choose to support a maximum number of
+ *     user-defined GMR IDs. This register holds the number of supported
+ *     IDs. (The maximum supported ID plus 1)
+ *
+ *   SVGA_REG_GMR_MAX_DESCRIPTOR_LENGTH --
+ *
+ *     Read-only.
+ *     The SVGA device may choose to put a limit on the total number
+ *     of SVGAGuestMemDescriptor structures it will read when defining
+ *     a single GMR.
+ *
+ * The descriptor structure is an array of SVGAGuestMemDescriptor
+ * structures. Each structure may do one of three things:
+ *
+ *   - Terminate the GMR descriptor list.
+ *     (ppn==0, numPages==0)
+ *
+ *   - Add a PPN or range of PPNs to the GMR's virtual address space.
+ *     (ppn != 0, numPages != 0)
+ *
+ *   - Provide the PPN of the next SVGAGuestMemDescriptor, in order to
+ *     support multi-page GMR descriptor tables without forcing the
+ *     driver to allocate physically contiguous memory.
+ *     (ppn != 0, numPages == 0)
+ *
+ * Note that each physical page of SVGAGuestMemDescriptor structures
+ * can describe at least 2MB of guest memory. If the driver needs to
+ * use more than one page of descriptor structures, it must use one of
+ * its SVGAGuestMemDescriptors to point to an additional page.  The
+ * device will never automatically cross a page boundary.
+ *
+ * Once the driver has described a GMR, it is immediately available
+ * for use via any FIFO command that uses an SVGAGuestPtr structure.
+ * These pointers include a GMR identifier plus an offset into that
+ * GMR.
+ *
+ * The driver must check the SVGA_CAP_GMR bit before using the GMR
+ * registers.
+ */
+
+/*
+ * Special GMR IDs, allowing SVGAGuestPtrs to point to framebuffer
+ * memory as well.  In the future, these IDs could even be used to
+ * allow legacy memory regions to be redefined by the guest as GMRs.
+ *
+ * Using the guest framebuffer (GFB) at BAR1 for general purpose DMA
+ * is being phased out. Please try to use user-defined GMRs whenever
+ * possible.
+ */
+#define SVGA_GMR_NULL         ((uint32) -1)  // Special ID: -1 converted to uint32
+#define SVGA_GMR_FRAMEBUFFER  ((uint32) -2)  // Guest Framebuffer (GFB); default for SVGA_VIDEO_DATA_GMRID
+
+typedef
+struct SVGAGuestMemDescriptor {
+   uint32 ppn;       // Physical page number; 0 together with numPages == 0 ends the list
+   uint32 numPages;  // Pages in the range; 0 with ppn != 0 makes ppn point to the next descriptor
+} SVGAGuestMemDescriptor;
+
+typedef
+struct SVGAGuestPtr {
+   uint32 gmrId;     // GMR identifier; may be one of the special SVGA_GMR_* IDs
+   uint32 offset;    // Offset into that GMR
+} SVGAGuestPtr;
+
+
+/*
+ * SVGAGMRImageFormat --
+ *
+ *    This is a packed representation of the source 2D image format
+ *    for a GMR-to-screen blit. Currently it is defined as an encoding
+ *    of the screen's color depth and bits-per-pixel, however, 16 bits
+ *    are reserved for future use to identify other encodings (such as
+ *    RGBA or higher-precision images).
+ *
+ *    Currently supported formats:
+ *
+ *       bpp depth  Format Name
+ *       --- -----  -----------
+ *        32    24  32-bit BGRX
+ *        24    24  24-bit BGR
+ *        16    16  RGB 5-6-5
+ *        16    15  RGB 5-5-5
+ *
+ */
+
+typedef
+struct SVGAGMRImageFormat {
+   union {
+      struct {
+         uint32 bitsPerPixel : 8;   // 'bpp' column of the format table above
+         uint32 colorDepth   : 8;   // 'depth' column of the format table above
+         uint32 reserved     : 16;  // Must be zero
+      };
+
+      uint32 value;                 // The three bit-fields accessed as one uint32
+   };
+} SVGAGMRImageFormat;
+
+/*
+ * SVGAColorBGRX --
+ *
+ *    A 24-bit color format (BGRX), which does not depend on the
+ *    format of the legacy guest framebuffer (GFB) or the current
+ *    GMRFB state.
+ */
+
+typedef
+struct SVGAColorBGRX {
+   union {
+      struct {
+         uint32 b : 8;  // Blue
+         uint32 g : 8;  // Green
+         uint32 r : 8;  // Red
+         uint32 x : 8;  // Unused
+      };
+
+      uint32 value;     // The four 8-bit fields accessed as one uint32
+   };
+} SVGAColorBGRX;
+
+
+/*
+ * SVGASignedRect --
+ * SVGASignedPoint --
+ *
+ *    Signed rectangle and point primitives. These are used by the new
+ *    2D primitives for drawing to Screen Objects, which can occupy a
+ *    signed virtual coordinate space.
+ *
+ *    SVGASignedRect specifies a half-open interval: the (left, top)
+ *    pixel is part of the rectangle, but the (right, bottom) pixel is
+ *    not.
+ */
+
+typedef
+struct SVGASignedRect {
+   int32  left;    // Inclusive: the (left, top) pixel is part of the rectangle
+   int32  top;     // Inclusive
+   int32  right;   // Exclusive: the (right, bottom) pixel is not part of the rectangle
+   int32  bottom;  // Exclusive
+} SVGASignedRect;
+
+typedef
+struct SVGASignedPoint {
+   int32  x;  // Signed: Screen Objects can occupy a signed virtual coordinate space
+   int32  y;  // Signed, as for x
+} SVGASignedPoint;
+
+
+/*
+ *  Capabilities
+ *
+ *  Note the holes in the bitfield. Missing bits have been deprecated,
+ *  and must not be reused. Those capabilities will never be reported
+ *  by new versions of the SVGA device.
+ */
+
+#define SVGA_CAP_NONE               0x00000000
+#define SVGA_CAP_RECT_COPY          0x00000002
+#define SVGA_CAP_CURSOR             0x00000020
+#define SVGA_CAP_CURSOR_BYPASS      0x00000040   // Legacy (Use Cursor Bypass 3 instead)
+#define SVGA_CAP_CURSOR_BYPASS_2    0x00000080   // Legacy (Use Cursor Bypass 3 instead)
+#define SVGA_CAP_8BIT_EMULATION     0x00000100
+#define SVGA_CAP_ALPHA_CURSOR       0x00000200
+#define SVGA_CAP_3D                 0x00004000
+#define SVGA_CAP_EXTENDED_FIFO      0x00008000   // Block 2 (extended) FIFO registers exist
+#define SVGA_CAP_MULTIMON           0x00010000   // Legacy multi-monitor support
+#define SVGA_CAP_PITCHLOCK          0x00020000
+#define SVGA_CAP_IRQMASK            0x00040000   // Interrupts are supported
+#define SVGA_CAP_DISPLAY_TOPOLOGY   0x00080000   // Legacy multi-monitor support
+#define SVGA_CAP_GMR                0x00100000   // Must be set before using the GMR registers
+#define SVGA_CAP_TRACES             0x00200000
+
+
+/*
+ * FIFO register indices.
+ *
+ * The FIFO is a chunk of device memory mapped into guest physmem.  It
+ * is always treated as 32-bit words.
+ *
+ * The guest driver gets to decide how to partition it between
+ * - FIFO registers (there are always at least 4, specifying where the
+ *   following data area is and how much data it contains; there may be
+ *   more registers following these, depending on the FIFO protocol
+ *   version in use)
+ * - FIFO data, written by the guest and slurped out by the VMX.
+ * These indices are 32-bit word offsets into the FIFO.
+ */
+
+enum {
+   /*
+    * Block 1 (basic registers): The originally defined FIFO registers.
+    * These exist and are valid for all versions of the FIFO protocol.
+    */
+
+   SVGA_FIFO_MIN = 0,
+   SVGA_FIFO_MAX,       /* The distance from MIN to MAX must be at least 10K */
+   SVGA_FIFO_NEXT_CMD,
+   SVGA_FIFO_STOP,
+
+   /*
+    * Block 2 (extended registers): Mandatory registers for the extended
+    * FIFO.  These exist if the SVGA caps register includes
+    * SVGA_CAP_EXTENDED_FIFO; some of them are valid only if their
+    * associated capability bit is enabled.
+    *
+    * Note that when originally defined, SVGA_CAP_EXTENDED_FIFO implied
+    * support only for (FIFO registers) CAPABILITIES, FLAGS, and FENCE.
+    * This means that the guest has to test individually (in most cases
+    * using FIFO caps) for the presence of registers after this; the VMX
+    * can define "extended FIFO" to mean whatever it wants, and currently
+    * won't enable it unless there's room for that set and much more.
+    */
+
+   SVGA_FIFO_CAPABILITIES = 4,   /* SVGA_FIFO_CAP_* bits */
+   SVGA_FIFO_FLAGS,              /* SVGA_FIFO_FLAG_* bits */
+   // Valid with SVGA_FIFO_CAP_FENCE:
+   SVGA_FIFO_FENCE,
+
+   /*
+    * Block 3a (optional extended registers): Additional registers for the
+    * extended FIFO, whose presence isn't actually implied by
+    * SVGA_CAP_EXTENDED_FIFO; these exist if SVGA_FIFO_MIN is high enough to
+    * leave room for them.
+    *
+    * The VMX currently considers the registers in block 3a mandatory
+    * for the extended FIFO.
+    */
+
+   // Valid if exists (i.e. if extended FIFO enabled):
+   SVGA_FIFO_3D_HWVERSION,       /* See SVGA3dHardwareVersion in svga3d_reg.h */
+   // Valid with SVGA_FIFO_CAP_PITCHLOCK:
+   SVGA_FIFO_PITCHLOCK,
+
+   // Valid with SVGA_FIFO_CAP_CURSOR_BYPASS_3:
+   SVGA_FIFO_CURSOR_ON,          /* Cursor bypass 3 show/hide register */
+   SVGA_FIFO_CURSOR_X,           /* Cursor bypass 3 x register */
+   SVGA_FIFO_CURSOR_Y,           /* Cursor bypass 3 y register */
+   SVGA_FIFO_CURSOR_COUNT,       /* Incremented when any of the other 3 change */
+   SVGA_FIFO_CURSOR_LAST_UPDATED,/* Last time the host updated the cursor */
+
+   // Valid with SVGA_FIFO_CAP_RESERVE:
+   SVGA_FIFO_RESERVED,           /* Bytes past NEXT_CMD with real contents */
+
+   /*
+    * Valid with SVGA_FIFO_CAP_SCREEN_OBJECT:
+    *
+    * By default this is SVGA_ID_INVALID, to indicate that the cursor
+    * coordinates are specified relative to the virtual root. If this
+    * is set to a specific screen ID, cursor position is reinterpreted
+    * as a signed offset relative to that screen's origin. This is the
+    * only way to place the cursor on a non-rooted screen.
+    */
+   SVGA_FIFO_CURSOR_SCREEN_ID,
+
+   /*
+    * XXX: The gap here, up until SVGA_FIFO_3D_CAPS, can be used for new
+    * registers, but this must be done carefully and with judicious use of
+    * capability bits, since comparisons based on SVGA_FIFO_MIN aren't
+    * enough to tell you whether the register exists: we've shipped drivers
+    * and products that used SVGA_FIFO_3D_CAPS but didn't know about some of
+    * the earlier ones.  The actual order of introduction was:
+    * - PITCHLOCK
+    * - 3D_CAPS
+    * - CURSOR_* (cursor bypass 3)
+    * - RESERVED
+    * So, code that wants to know whether it can use any of the
+    * aforementioned registers, or anything else added after PITCHLOCK and
+    * before 3D_CAPS, needs to reason about something other than
+    * SVGA_FIFO_MIN.
+    */
+
+   /*
+    * 3D caps block space; valid with 3D hardware version >=
+    * SVGA3D_HWVERSION_WS6_B1.
+    */
+   SVGA_FIFO_3D_CAPS      = 32,
+   SVGA_FIFO_3D_CAPS_LAST = 32 + 255,   /* == 287; the block spans 256 registers */
+
+   /*
+    * End of VMX's current definition of "extended-FIFO registers".
+    * Registers before here are always enabled/disabled as a block; either
+    * the extended FIFO is enabled and includes all preceding registers, or
+    * it's disabled entirely.
+    *
+    * Block 3b (truly optional extended registers): Additional registers for
+    * the extended FIFO, which the VMX already knows how to enable and
+    * disable with correct granularity.
+    *
+    * Registers after here exist if and only if the guest SVGA driver
+    * sets SVGA_FIFO_MIN high enough to leave room for them.
+    */
+
+   // Valid if register exists:
+   SVGA_FIFO_GUEST_3D_HWVERSION, /* Guest driver's 3D version */
+   SVGA_FIFO_FENCE_GOAL,         /* Matching target for SVGA_IRQFLAG_FENCE_GOAL */
+   SVGA_FIFO_BUSY,               /* See "FIFO Synchronization Registers" */
+
+   /*
+    * Always keep this last.  This defines the maximum number of
+    * registers we know about.  At power-on, this value is placed in
+    * the SVGA_REG_MEM_REGS register, and we expect the guest driver
+    * to allocate this much space in FIFO memory for registers.
+    */
+    SVGA_FIFO_NUM_REGS
+};
+
+
+/*
+ * Definition of registers included in extended FIFO support.
+ *
+ * The guest SVGA driver gets to allocate the FIFO between registers
+ * and data.  It must always allocate at least 4 registers, but old
+ * drivers stopped there.
+ *
+ * The VMX will enable extended FIFO support if and only if the guest
+ * left enough room for all registers defined as part of the mandatory
+ * set for the extended FIFO.
+ *
+ * Note that the guest drivers typically allocate the FIFO only at
+ * initialization time, not at mode switches, so it's likely that the
+ * number of FIFO registers won't change without a reboot.
+ *
+ * All registers less than this value are guaranteed to be present if
+ * svgaUser->fifo.extended is set. Any later registers must be tested
+ * individually for compatibility at each use (in the VMX).
+ *
+ * This value is used only by the VMX, so it can change without
+ * affecting driver compatibility; keep it that way?
+ */
+#define SVGA_FIFO_EXTENDED_MANDATORY_REGS  (SVGA_FIFO_3D_CAPS_LAST + 1)  // == 288
+
+
+/*
+ * FIFO Synchronization Registers
+ *
+ *  This explains the relationship between the various FIFO
+ *  sync-related registers in IOSpace and in FIFO space.
+ *
+ *  SVGA_REG_SYNC --
+ *
+ *       The SYNC register can be used in two different ways by the guest:
+ *
+ *         1. If the guest wishes to fully sync (drain) the FIFO,
+ *            it will write once to SYNC then poll on the BUSY
+ *            register. The FIFO is sync'ed once BUSY is zero.
+ *
+ *         2. If the guest wants to asynchronously wake up the host,
+ *            it will write once to SYNC without polling on BUSY.
+ *            Ideally it will do this after some new commands have
+ *            been placed in the FIFO, and after reading a zero
+ *            from SVGA_FIFO_BUSY.
+ *
+ *       (1) is the original behaviour that SYNC was designed to
+ *       support.  Originally, a write to SYNC would implicitly
+ *       trigger a read from BUSY. This causes us to synchronously
+ *       process the FIFO.
+ *
+ *       This behaviour has since been changed so that writing SYNC
+ *       will *not* implicitly cause a read from BUSY. Instead, it
+ *       makes a channel call which asynchronously wakes up the MKS
+ *       thread.
+ *
+ *       New guests can use this new behaviour to implement (2)
+ *       efficiently. This lets guests get the host's attention
+ *       without waiting for the MKS to poll, which gives us much
+ *       better CPU utilization on SMP hosts and on UP hosts while
+ *       we're blocked on the host GPU.
+ *
+ *       Old guests shouldn't notice the behaviour change. SYNC was
+ *       never guaranteed to process the entire FIFO, since it was
+ *       bounded to a particular number of CPU cycles. Old guests will
+ *       still loop on the BUSY register until the FIFO is empty.
+ *
+ *       Writing to SYNC currently has the following side-effects:
+ *
+ *         - Sets SVGA_REG_BUSY to TRUE (in the monitor)
+ *         - Asynchronously wakes up the MKS thread for FIFO processing
+ *         - The value written to SYNC is recorded as a "reason", for
+ *           stats purposes.
+ *
+ *       If SVGA_FIFO_BUSY is available, drivers are advised to only
+ *       write to SYNC if SVGA_FIFO_BUSY is FALSE. Drivers should set
+ *       SVGA_FIFO_BUSY to TRUE after writing to SYNC. The MKS will
+ *       eventually set SVGA_FIFO_BUSY on its own, but this approach
+ *       lets the driver avoid sending multiple asynchronous wakeup
+ *       messages to the MKS thread.
+ *
+ *  SVGA_REG_BUSY --
+ *
+ *       This register is set to TRUE when SVGA_REG_SYNC is written,
+ *       and it reads as FALSE when the FIFO has been completely
+ *       drained.
+ *
+ *       Every read from this register causes us to synchronously
+ *       process FIFO commands. There is no guarantee as to how many
+ *       commands each read will process.
+ *
+ *       CPU time spent processing FIFO commands will be billed to
+ *       the guest.
+ *
+ *       New drivers should avoid using this register unless they
+ *       need to guarantee that the FIFO is completely drained. It
+ *       is overkill for performing a sync-to-fence. Older drivers
+ *       will use this register for any type of synchronization.
+ *
+ *  SVGA_FIFO_BUSY --
+ *
+ *       This register is a fast way for the guest driver to check
+ *       whether the FIFO is already being processed. It reads and
+ *       writes at normal RAM speeds, with no monitor intervention.
+ *
+ *       If this register reads as TRUE, the host is guaranteeing that
+ *       any new commands written into the FIFO will be noticed before
+ *       the MKS goes back to sleep.
+ *
+ *       If this register reads as FALSE, no such guarantee can be
+ *       made.
+ *
+ *       The guest should use this register to quickly determine
+ *       whether or not it needs to wake up the host. If the guest
+ *       just wrote a command or group of commands that it would like
+ *       the host to begin processing, it should:
+ *
+ *         1. Read SVGA_FIFO_BUSY. If it reads as TRUE, no further
+ *            action is necessary.
+ *
+ *         2. Write TRUE to SVGA_FIFO_BUSY. This informs future guest
+ *            code that we've already sent a SYNC to the host and we
+ *            don't need to send a duplicate.
+ *
+ *         3. Write a reason to SVGA_REG_SYNC. This will send an
+ *            asynchronous wakeup to the MKS thread.
+ */
+
+
+/*
+ * FIFO Capabilities
+ *
+ *      Fence -- Fence register and command are supported
+ *      Accel Front -- Front buffer only commands are supported
+ *      Pitch Lock -- Pitch lock register is supported
+ *      Video -- SVGA Video overlay units are supported
+ *      Escape -- Escape command is supported
+ *
+ * XXX: Add longer descriptions for each capability, including a list
+ *      of the new features that each capability provides.
+ *
+ * SVGA_FIFO_CAP_SCREEN_OBJECT --
+ *
+ *    Provides dynamic multi-screen rendering, for improved Unity and
+ *    multi-monitor modes. With Screen Object, the guest can
+ *    dynamically create and destroy 'screens', which can represent
+ *    Unity windows or virtual monitors. Screen Object also provides
+ *    strong guarantees that DMA operations happen only when
+ *    guest-initiated. Screen Object deprecates the BAR1 guest
+ *    framebuffer (GFB) and all commands that work only with the GFB.
+ *
+ *    New registers:
+ *       FIFO_CURSOR_SCREEN_ID, VIDEO_DATA_GMRID, VIDEO_DST_SCREEN_ID
+ *
+ *    New 2D commands:
+ *       DEFINE_SCREEN, DESTROY_SCREEN, DEFINE_GMRFB, BLIT_GMRFB_TO_SCREEN,
+ *       BLIT_SCREEN_TO_GMRFB, ANNOTATION_FILL, ANNOTATION_COPY
+ *
+ *    New 3D commands:
+ *       BLIT_SURFACE_TO_SCREEN
+ *
+ *    New guarantees:
+ *
+ *       - The host will not read or write guest memory, including the GFB,
+ *         except when explicitly initiated by a DMA command.
+ *
+ *       - All DMA, including legacy DMA like UPDATE and PRESENT_READBACK,
+ *         is guaranteed to complete before any subsequent FENCEs.
+ *
+ *       - All legacy commands which affect a Screen (UPDATE, PRESENT,
+ *         PRESENT_READBACK) as well as new Screen blit commands will
+ *         all behave consistently as blits, and memory will be read
+ *         or written in FIFO order.
+ *
+ *         For example, if you PRESENT from one SVGA3D surface to multiple
+ *         places on the screen, the data copied will always be from the
+ *         SVGA3D surface at the time the PRESENT was issued in the FIFO.
+ *         This was not necessarily true on devices without Screen Object.
+ *
+ *         This means that on devices that support Screen Object, the
+ *         PRESENT_READBACK command should not be necessary unless you
+ *         actually want to read back the results of 3D rendering into
+ *         system memory. (And for that, the BLIT_SCREEN_TO_GMRFB
+ *         command provides a strict superset of functionality.)
+ *
+ *       - When a screen is resized, either using Screen Object commands or
+ *         legacy multimon registers, its contents are preserved.
+ */
+
+#define SVGA_FIFO_CAP_NONE                  0
+#define SVGA_FIFO_CAP_FENCE             (1<<0)  // Fence register and command are supported
+#define SVGA_FIFO_CAP_ACCELFRONT        (1<<1)  // Front buffer only commands are supported
+#define SVGA_FIFO_CAP_PITCHLOCK         (1<<2)  // SVGA_FIFO_PITCHLOCK is valid
+#define SVGA_FIFO_CAP_VIDEO             (1<<3)  // SVGA Video overlay units are supported
+#define SVGA_FIFO_CAP_CURSOR_BYPASS_3   (1<<4)  // SVGA_FIFO_CURSOR_* registers are valid
+#define SVGA_FIFO_CAP_ESCAPE            (1<<5)  // Escape command is supported
+#define SVGA_FIFO_CAP_RESERVE           (1<<6)  // SVGA_FIFO_RESERVED is valid
+#define SVGA_FIFO_CAP_SCREEN_OBJECT     (1<<7)  // See SVGA_FIFO_CAP_SCREEN_OBJECT above
+
+
+/*
+ * FIFO Flags
+ *
+ *      Accel Front -- Driver should use front buffer only commands
+ */
+
+#define SVGA_FIFO_FLAG_NONE                 0
+#define SVGA_FIFO_FLAG_ACCELFRONT       (1<<0)  // Driver should use front buffer only commands
+#define SVGA_FIFO_FLAG_RESERVED        (1u<<31) // Internal use only (unsigned: 1<<31 overflows a 32-bit int)
+
+/*
+ * FIFO reservation sentinel value
+ */
+
+#define SVGA_FIFO_RESERVED_UNKNOWN      0xffffffff  // NOTE(review): presumably a value for SVGA_FIFO_RESERVED - confirm
+
+
+/*
+ * Video overlay support
+ */
+
+#define SVGA_NUM_OVERLAY_UNITS 32  // NOTE(review): presumably the number of SVGAOverlayUnit instances - confirm
+
+
+/*
+ * Video capabilities that the guest is currently using
+ */
+
+#define SVGA_VIDEO_FLAG_COLORKEY        0x0001  // NOTE(review): presumably a bit for SVGA_VIDEO_FLAGS - confirm
+
+
+/*
+ * Offsets for the video overlay registers
+ */
+
+enum {                       // Entries are in the same order as the fields of SVGAOverlayUnit
+   SVGA_VIDEO_ENABLED = 0,
+   SVGA_VIDEO_FLAGS,
+   SVGA_VIDEO_DATA_OFFSET,
+   SVGA_VIDEO_FORMAT,
+   SVGA_VIDEO_COLORKEY,
+   SVGA_VIDEO_SIZE,          // Deprecated
+   SVGA_VIDEO_WIDTH,
+   SVGA_VIDEO_HEIGHT,
+   SVGA_VIDEO_SRC_X,
+   SVGA_VIDEO_SRC_Y,
+   SVGA_VIDEO_SRC_WIDTH,
+   SVGA_VIDEO_SRC_HEIGHT,
+   SVGA_VIDEO_DST_X,         // Signed int32
+   SVGA_VIDEO_DST_Y,         // Signed int32
+   SVGA_VIDEO_DST_WIDTH,
+   SVGA_VIDEO_DST_HEIGHT,
+   SVGA_VIDEO_PITCH_1,
+   SVGA_VIDEO_PITCH_2,
+   SVGA_VIDEO_PITCH_3,
+   SVGA_VIDEO_DATA_GMRID,    // Optional, defaults to SVGA_GMR_FRAMEBUFFER
+   SVGA_VIDEO_DST_SCREEN_ID, // Optional, defaults to virtual coords (SVGA_ID_INVALID)
+   SVGA_VIDEO_NUM_REGS       // == 21, the number of registers listed above
+};
+
+
+/*
+ * SVGA Overlay Units
+ *
+ *      width and height relate to the entire source video frame.
+ *      srcX, srcY, srcWidth and srcHeight represent subset of the source
+ *      video frame to be displayed.
+ */
+
+typedef struct SVGAOverlayUnit {   // Fields follow the order of the SVGA_VIDEO_* registers
+   uint32 enabled;
+   uint32 flags;
+   uint32 dataOffset;
+   uint32 format;
+   uint32 colorKey;
+   uint32 size;          // SVGA_VIDEO_SIZE is marked deprecated
+   uint32 width;         // Width of the entire source video frame
+   uint32 height;        // Height of the entire source video frame
+   uint32 srcX;          // srcX, srcY, srcWidth, srcHeight: displayed subset of the source frame
+   uint32 srcY;
+   uint32 srcWidth;
+   uint32 srcHeight;
+   int32  dstX;          // Signed
+   int32  dstY;          // Signed
+   uint32 dstWidth;
+   uint32 dstHeight;
+   uint32 pitches[3];    // SVGA_VIDEO_PITCH_1 .. SVGA_VIDEO_PITCH_3
+   uint32 dataGMRId;     // Optional, defaults to SVGA_GMR_FRAMEBUFFER
+   uint32 dstScreenId;   // Optional, defaults to virtual coords (SVGA_ID_INVALID)
+} SVGAOverlayUnit;
+
+
+/*
+ * SVGAScreenObject --
+ *
+ *    This is a new way to represent a guest's multi-monitor screen or
+ *    Unity window. Screen objects are only supported if the
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT capability bit is set.
+ *
+ *    If Screen Objects are supported, they can be used to fully
+ *    replace the functionality provided by the framebuffer registers
+ *    (SVGA_REG_WIDTH, HEIGHT, etc.) and by SVGA_CAP_DISPLAY_TOPOLOGY.
+ *
+ *    The screen object is a struct with guaranteed binary
+ *    compatibility. New flags can be added, and the struct may grow,
+ *    but existing fields must retain their meaning.
+ *
+ */
+
+#define SVGA_SCREEN_HAS_ROOT    (1 << 0)  // Screen is present in the virtual coord space
+#define SVGA_SCREEN_IS_PRIMARY  (1 << 1)  // Guest considers this screen to be 'primary'
+#define SVGA_SCREEN_FULLSCREEN_HINT (1 << 2)   // Guest is running a fullscreen app here
+
+typedef
+struct SVGAScreenObject {
+   uint32 structSize;   // sizeof(SVGAScreenObject)
+   uint32 id;           // Screen ID
+   uint32 flags;        // SVGA_SCREEN_* flags
+   struct {
+      uint32 width;
+      uint32 height;
+   } size;              // Dimensions of the screen
+   struct {
+      int32 x;
+      int32 y;
+   } root;              // Only used if SVGA_SCREEN_HAS_ROOT is set.
+} SVGAScreenObject;
+
+
+/*
+ *  Commands in the command FIFO:
+ *
+ *  Command IDs defined below are used for the traditional 2D FIFO
+ *  communication (not all commands are available for all versions of the
+ *  SVGA FIFO protocol).
+ *
+ *  Note the holes in the command ID numbers: These commands have been
+ *  deprecated, and the old IDs must not be reused.
+ *
+ *  Command IDs from 1000 to 1999 are reserved for use by the SVGA3D
+ *  protocol.
+ *
+ *  Each command's parameters are described by the comments and
+ *  structs below.
+ */
+
+typedef enum {
+   SVGA_CMD_INVALID_CMD           = 0,
+   SVGA_CMD_UPDATE                = 1,   // Always available
+   SVGA_CMD_RECT_COPY             = 3,   // Needs SVGA_CAP_RECT_COPY
+   SVGA_CMD_DEFINE_CURSOR         = 19,  // Needs SVGA_CAP_CURSOR
+   SVGA_CMD_DEFINE_ALPHA_CURSOR   = 22,  // Needs SVGA_CAP_ALPHA_CURSOR
+   SVGA_CMD_UPDATE_VERBOSE        = 25,  // Needs SVGA_CAP_EXTENDED_FIFO
+   SVGA_CMD_FRONT_ROP_FILL        = 29,  // Needs SVGA_FIFO_CAP_ACCELFRONT
+   SVGA_CMD_FENCE                 = 30,
+   SVGA_CMD_ESCAPE                = 33,  // Needs SVGA_FIFO_CAP_ESCAPE
+   SVGA_CMD_DEFINE_SCREEN         = 34,  // 34..40 need SVGA_FIFO_CAP_SCREEN_OBJECT
+   SVGA_CMD_DESTROY_SCREEN        = 35,
+   SVGA_CMD_DEFINE_GMRFB          = 36,
+   SVGA_CMD_BLIT_GMRFB_TO_SCREEN  = 37,
+   SVGA_CMD_BLIT_SCREEN_TO_GMRFB  = 38,
+   SVGA_CMD_ANNOTATION_FILL       = 39,
+   SVGA_CMD_ANNOTATION_COPY       = 40,
+   SVGA_CMD_MAX                          // One past the highest ID defined above
+} SVGAFifoCmdId;
+
+#define SVGA_CMD_MAX_ARGS           64
+
+
+/*
+ * SVGA_CMD_UPDATE --
+ *
+ *    This is a DMA transfer which copies from the Guest Framebuffer
+ *    (GFB) at BAR1 + SVGA_REG_FB_OFFSET to any screens which
+ *    intersect with the provided virtual rectangle.
+ *
+ *    This command does not support using arbitrary guest memory as a
+ *    data source- it only works with the pre-defined GFB memory.
+ *    This command also does not support signed virtual coordinates.
+ *    If you have defined screens (using SVGA_CMD_DEFINE_SCREEN) with
+ *    negative root x/y coordinates, the negative portion of those
+ *    screens will not be reachable by this command.
+ *
+ *    This command is not necessary when using framebuffer
+ *    traces. Traces are automatically enabled if the SVGA FIFO is
+ *    disabled, and you may explicitly enable/disable traces using
+ *    SVGA_REG_TRACES. With traces enabled, any write to the GFB will
+ *    automatically act as if a subsequent SVGA_CMD_UPDATE was issued.
+ *
+ *    Traces and SVGA_CMD_UPDATE are the only supported ways to render
+ *    pseudocolor screen updates. The newer Screen Object commands
+ *    only support true color formats.
+ *
+ * Availability:
+ *    Always available.
+ */
+
+typedef
+struct {
+   uint32 x;        // Rectangle origin; unsigned, so negative virtual
+   uint32 y;        //   coordinates cannot be expressed
+   uint32 width;
+   uint32 height;
+} SVGAFifoCmdUpdate;
+
+
+/*
+ * SVGA_CMD_RECT_COPY --
+ *
+ *    Perform a rectangular DMA transfer from one area of the GFB to
+ *    another, and copy the result to any screens which intersect it.
+ *
+ * Availability:
+ *    SVGA_CAP_RECT_COPY
+ */
+
+typedef
+struct {
+   uint32 srcX;     // Source origin within the GFB
+   uint32 srcY;
+   uint32 destX;    // Destination origin within the GFB
+   uint32 destY;
+   uint32 width;    // Extent shared by source and destination
+   uint32 height;
+} SVGAFifoCmdRectCopy;
+
+
+/*
+ * SVGA_CMD_DEFINE_CURSOR --
+ *
+ *    Provide a new cursor image, as an AND/XOR mask.
+ *
+ *    The recommended way to position the cursor overlay is by using
+ *    the SVGA_FIFO_CURSOR_* registers, supported by the
+ *    SVGA_FIFO_CAP_CURSOR_BYPASS_3 capability.
+ *
+ * Availability:
+ *    SVGA_CAP_CURSOR
+ */
+
+typedef
+struct {
+   uint32 id;             // Reserved, must be zero.
+   uint32 hotspotX;
+   uint32 hotspotY;
+   uint32 width;          // Cursor image size
+   uint32 height;
+   uint32 andMaskDepth;   // Value must be 1 or equal to BITS_PER_PIXEL
+   uint32 xorMaskDepth;   // Value must be 1 or equal to BITS_PER_PIXEL
+   /*
+    * Followed by scanline data for AND mask, then XOR mask.
+    * Each scanline is padded to a 32-bit boundary.
+    */
+} SVGAFifoCmdDefineCursor;
+
+
+/*
+ * SVGA_CMD_DEFINE_ALPHA_CURSOR --
+ *
+ *    Provide a new cursor image, in 32-bit BGRA format.
+ *
+ *    The recommended way to position the cursor overlay is by using
+ *    the SVGA_FIFO_CURSOR_* registers, supported by the
+ *    SVGA_FIFO_CAP_CURSOR_BYPASS_3 capability.
+ *
+ * Availability:
+ *    SVGA_CAP_ALPHA_CURSOR
+ */
+
+typedef
+struct {
+   uint32 id;             // Reserved, must be zero.
+   uint32 hotspotX;
+   uint32 hotspotY;
+   uint32 width;          // Cursor image size
+   uint32 height;
+   /* Followed by scanline data (32-bit BGRA, per the command description) */
+} SVGAFifoCmdDefineAlphaCursor;
+
+
+/*
+ * SVGA_CMD_UPDATE_VERBOSE --
+ *
+ *    Just like SVGA_CMD_UPDATE, but also provide a per-rectangle
+ *    'reason' value, an opaque cookie which is used by internal
+ *    debugging tools. Third party drivers should not use this
+ *    command.
+ *
+ * Availability:
+ *    SVGA_CAP_EXTENDED_FIFO
+ */
+
+typedef
+struct {
+   uint32 x;        // Same rectangle fields as SVGAFifoCmdUpdate
+   uint32 y;
+   uint32 width;
+   uint32 height;
+   uint32 reason;   // Opaque cookie for internal debugging tools
+} SVGAFifoCmdUpdateVerbose;
+
+
+/*
+ * SVGA_CMD_FRONT_ROP_FILL --
+ *
+ *    This is a hint which tells the SVGA device that the driver has
+ *    just filled a rectangular region of the GFB with a solid
+ *    color. Instead of reading these pixels from the GFB, the device
+ *    can assume that they all equal 'color'. This is primarily used
+ *    for remote desktop protocols.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_ACCELFRONT
+ */
+
+#define  SVGA_ROP_COPY                    0x03
+
+typedef
+struct {
+   uint32 color;     // In the same format as the GFB
+   uint32 x;         // Rectangle of the GFB that the driver just filled
+   uint32 y;
+   uint32 width;
+   uint32 height;
+   uint32 rop;       // Must be SVGA_ROP_COPY
+} SVGAFifoCmdFrontRopFill;
+
+
+/*
+ * SVGA_CMD_FENCE --
+ *
+ *    Insert a synchronization fence.  When the SVGA device reaches
+ *    this command, it will copy the 'fence' value into the
+ *    SVGA_FIFO_FENCE register. It will also compare the fence against
+ *    SVGA_FIFO_FENCE_GOAL. If the fence matches the goal and the
+ *    SVGA_IRQFLAG_FENCE_GOAL interrupt is enabled, the device will
+ *    raise this interrupt.
+ *
+ * Availability:
+ *    SVGA_FIFO_FENCE for this command,
+ *    SVGA_CAP_IRQMASK for SVGA_FIFO_FENCE_GOAL.
+ */
+
+typedef
+struct {
+   uint32 fence;   // Copied into SVGA_FIFO_FENCE when the device reaches this command
+} SVGAFifoCmdFence;
+
+
+/*
+ * SVGA_CMD_ESCAPE --
+ *
+ *    Send an extended or vendor-specific variable length command.
+ *    This is used for video overlay, third party plugins, and
+ *    internal debugging tools. See svga_escape.h
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_ESCAPE
+ */
+
+typedef
+struct {
+   uint32 nsid;   // NOTE(review): presumably a namespace ID for the escape -- confirm in svga_escape.h
+   uint32 size;   // Length in bytes of the data that follows
+   /* followed by 'size' bytes of data */
+} SVGAFifoCmdEscape;
+
+
+/*
+ * SVGA_CMD_DEFINE_SCREEN --
+ *
+ *    Define or redefine an SVGAScreenObject. See the description of
+ *    SVGAScreenObject above.  The video driver is responsible for
+ *    generating new screen IDs. They should be small positive
+ *    integers. The virtual device will have an implementation
+ *    specific upper limit on the number of screen IDs
+ *    supported. Drivers are responsible for recycling IDs. The first
+ *    valid ID is zero.
+ *
+ *    - Interaction with other registers:
+ *
+ *    For backwards compatibility, when the GFB mode registers (WIDTH,
+ *    HEIGHT, PITCHLOCK, BITS_PER_PIXEL) are modified, the SVGA device
+ *    deletes all screens other than screen #0, and redefines screen
+ *    #0 according to the specified mode. Drivers that use
+ *    SVGA_CMD_DEFINE_SCREEN should destroy or redefine screen #0.
+ *
+ *    If you use screen objects, do not use the legacy multi-mon
+ *    registers (SVGA_REG_NUM_GUEST_DISPLAYS, SVGA_REG_DISPLAY_*).
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGAScreenObject screen;   // Variable-length according to version (see screen.structSize)
+} SVGAFifoCmdDefineScreen;
+
+
+/*
+ * SVGA_CMD_DESTROY_SCREEN --
+ *
+ *    Destroy an SVGAScreenObject. Its ID is immediately available for
+ *    re-use.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   uint32 screenId;   // ID of the screen object to destroy
+} SVGAFifoCmdDestroyScreen;
+
+
+/*
+ * SVGA_CMD_DEFINE_GMRFB --
+ *
+ *    This command sets a piece of SVGA device state called the
+ *    Guest Memory Region Framebuffer, or GMRFB. The GMRFB is a
+ *    piece of light-weight state which identifies the location and
+ *    format of an image in guest memory or in BAR1. The GMRFB has
+ *    an arbitrary size, and it doesn't need to match the geometry
+ *    of the GFB or any screen object.
+ *
+ *    The GMRFB can be redefined as often as you like. You could
+ *    always use the same GMRFB, you could redefine it before
+ *    rendering from a different guest screen, or you could even
+ *    redefine it before every blit.
+ *
+ *    There are multiple ways to use this command. The simplest way is
+ *    to use it to move the framebuffer either to elsewhere in the GFB
+ *    (BAR1) memory region, or to a user-defined GMR. This lets a
+ *    driver use a framebuffer allocated entirely out of normal system
+ *    memory, which we encourage.
+ *
+ *    Another way to use this command is to set up a ring buffer of
+ *    updates in GFB memory. If a driver wants to ensure that no
+ *    frames are skipped by the SVGA device, it is important that the
+ *    driver not modify the source data for a blit until the device is
+ *    done processing the command. One efficient way to accomplish
+ *    this is to use a ring of small DMA buffers. Each buffer is used
+ *    for one blit, then we move on to the next buffer in the
+ *    ring. The FENCE mechanism is used to protect each buffer from
+ *    re-use until the device is finished with that buffer's
+ *    corresponding blit.
+ *
+ *    This command does not affect the meaning of SVGA_CMD_UPDATE.
+ *    UPDATEs always occur from the legacy GFB memory area. This
+ *    command has no support for pseudocolor GMRFBs. Currently only
+ *    true-color 15, 16, and 24-bit depths are supported. Future
+ *    devices may expose capabilities for additional framebuffer
+ *    formats.
+ *
+ *    The default GMRFB value is undefined. Drivers must always send
+ *    this command at least once before performing any blit from the
+ *    GMRFB.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGAGuestPtr        ptr;            // Location of the image (guest memory or BAR1)
+   uint32              bytesPerLine;
+   SVGAGMRImageFormat  format;         // Format of the image
+} SVGAFifoCmdDefineGMRFB;
+
+
+/*
+ * SVGA_CMD_BLIT_GMRFB_TO_SCREEN --
+ *
+ *    This is a guest-to-host blit. It performs a DMA operation to
+ *    copy a rectangular region of pixels from the current GMRFB to
+ *    one or more Screen Objects.
+ *
+ *    The destination coordinate may be specified relative to a
+ *    screen's origin (if a screen ID is specified) or relative to the
+ *    virtual coordinate system's origin (if the screen ID is
+ *    SVGA_ID_INVALID). The actual destination may span zero or more
+ *    screens, in the case of a virtual destination rect or a rect
+ *    which extends off the edge of the specified screen.
+ *
+ *    This command writes to the screen's "base layer": the underlying
+ *    framebuffer which exists below any cursor or video overlays. No
+ *    action is necessary to explicitly hide or update any overlays
+ *    which exist on top of the updated region.
+ *
+ *    The SVGA device is guaranteed to finish reading from the GMRFB
+ *    by the time any subsequent FENCE commands are reached.
+ *
+ *    This command consumes an annotation. See the
+ *    SVGA_CMD_ANNOTATION_* commands for details.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGASignedPoint  srcOrigin;      // Origin within the current GMRFB
+   SVGASignedRect   destRect;       // Relative to the screen's origin or the virtual origin
+   uint32           destScreenId;   // SVGA_ID_INVALID selects virtual coordinates
+} SVGAFifoCmdBlitGMRFBToScreen;
+
+
+/*
+ * SVGA_CMD_BLIT_SCREEN_TO_GMRFB --
+ *
+ *    This is a host-to-guest blit. It performs a DMA operation to
+ *    copy a rectangular region of pixels from a single Screen Object
+ *    back to the current GMRFB.
+ *
+ *    Usage note: This command should be used rarely. It will
+ *    typically be inefficient, but it is necessary for some types of
+ *    synchronization between 3D (GPU) and 2D (CPU) rendering into
+ *    overlapping areas of a screen.
+ *
+ *    The source coordinate is specified relative to a screen's
+ *    origin. The provided screen ID must be valid. If any parameters
+ *    are invalid, the resulting pixel values are undefined.
+ *
+ *    This command reads the screen's "base layer". Overlays like
+ *    video and cursor are not included, but any data which was sent
+ *    using a blit-to-screen primitive will be available, no matter
+ *    whether the data's original source was the GMRFB or the 3D
+ *    acceleration hardware.
+ *
+ *    Note that our guest-to-host blits and host-to-guest blits aren't
+ *    symmetric in their current implementation. While the parameters
+ *    are identical, host-to-guest blits are a lot less featureful.
+ *    They do not support clipping: If the source parameters don't
+ *    fully fit within a screen, the blit fails. They must originate
+ *    from exactly one screen. Virtual coordinates are not directly
+ *    supported.
+ *
+ *    Host-to-guest blits do support the same set of GMRFB formats
+ *    offered by guest-to-host blits.
+ *
+ *    The SVGA device is guaranteed to finish writing to the GMRFB by
+ *    the time any subsequent FENCE commands are reached.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGASignedPoint  destOrigin;    // Origin within the current GMRFB
+   SVGASignedRect   srcRect;       // Relative to the source screen's origin
+   uint32           srcScreenId;   // Must be a valid screen ID
+} SVGAFifoCmdBlitScreenToGMRFB;
+
+
+/*
+ * SVGA_CMD_ANNOTATION_FILL --
+ *
+ *    This is a blit annotation. This command stores a small piece of
+ *    device state which is consumed by the next blit-to-screen
+ *    command. The state is only cleared by commands which are
+ *    specifically documented as consuming an annotation. Other
+ *    commands (such as ESCAPEs for debugging) may intervene between
+ *    the annotation and its associated blit.
+ *
+ *    This annotation is a promise about the contents of the next
+ *    blit: The video driver is guaranteeing that all pixels in that
+ *    blit will have the same value, specified here as a color in
+ *    SVGAColorBGRX format.
+ *
+ *    The SVGA device can still render the blit correctly even if it
+ *    ignores this annotation, but the annotation may allow it to
+ *    perform the blit more efficiently, for example by ignoring the
+ *    source data and performing a fill in hardware.
+ *
+ *    This annotation is most important for performance when the
+ *    user's display is being remoted over a network connection.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGAColorBGRX  color;   // Value promised for every pixel of the next blit
+} SVGAFifoCmdAnnotationFill;
+
+
+/*
+ * SVGA_CMD_ANNOTATION_COPY --
+ *
+ *    This is a blit annotation. See SVGA_CMD_ANNOTATION_FILL for more
+ *    information about annotations.
+ *
+ *    This annotation is a promise about the contents of the next
+ *    blit: The video driver is guaranteeing that all pixels in that
+ *    blit will have the same value as those which already exist at an
+ *    identically-sized region on the same or a different screen.
+ *
+ *    Note that the source pixels for the COPY in this annotation are
+ *    sampled before applying the annotation's associated blit. They
+ *    are allowed to overlap with the blit's destination pixels.
+ *
+ *    The copy source rectangle is specified the same way as the blit
+ *    destination: it can be a rectangle which spans zero or more
+ *    screens, specified relative to either a screen or to the virtual
+ *    coordinate system's origin. If the source rectangle includes
+ *    pixels which are not from exactly one screen, the results are
+ *    undefined.
+ *
+ * Availability:
+ *    SVGA_FIFO_CAP_SCREEN_OBJECT
+ */
+
+typedef
+struct {
+   SVGASignedPoint  srcOrigin;     // Origin of the copy source rectangle
+   uint32           srcScreenId;   // NOTE(review): presumably SVGA_ID_INVALID selects virtual coords, as for blits -- confirm
+} SVGAFifoCmdAnnotationCopy;
+
+#endif
diff --git a/src/gallium/drivers/svga/include/svga_types.h b/src/gallium/drivers/svga/include/svga_types.h
new file mode 100644
index 00000000000..7fd9bab03a5
--- /dev/null
+++ b/src/gallium/drivers/svga/include/svga_types.h
@@ -0,0 +1,46 @@
+/**********************************************************
+ * Copyright 1998-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef _SVGA_TYPES_H_
+#define _SVGA_TYPES_H_
+
+#include "pipe/p_compiler.h"
+
+typedef int64_t int64;     /* short aliases for the fixed-width integer types */
+typedef uint64_t uint64;
+
+typedef int32_t int32;
+typedef uint32_t uint32;
+
+typedef int16_t int16;
+typedef uint16_t uint16;
+
+typedef int8_t int8;
+typedef uint8_t uint8;
+
+typedef uint8_t Bool;      /* boolean stored in a single unsigned byte */
+
+#endif /* _SVGA_TYPES_H_ */
+
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
new file mode 100644
index 00000000000..a0da7d7e5d5
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -0,0 +1,1427 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * svga_cmd.c --
+ *
+ *      Command construction utility for the SVGA3D protocol used by
+ *      the VMware SVGA device, based on the svgautil library.
+ */
+
+#include "svga_winsys.h"
+#include "svga_screen_buffer.h"
+#include "svga_screen_texture.h"
+#include "svga_cmd.h"
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * surface_to_surfaceid --
+ *
+ *      Utility function for surface ids.
+ *      Can handle null surface. Does a surface_relocation so you need
+ *      to have allocated the fifo space before converting.
+ *
+ * Results:
+ *      id is filled out.
+ *
+ * Side effects:
+ *      One surface relocation is performed for the texture handle.
+ *
+ *----------------------------------------------------------------------
+ */
+
+static INLINE
+void surface_to_surfaceid(struct svga_winsys_context *swc, // IN
+                          struct pipe_surface *surface,    // IN
+                          SVGA3dSurfaceImageId *id,        // OUT
+                          unsigned flags)                  // IN
+{
+   struct svga_surface *svga_surf;
+   if (!surface) {
+      /* Null surface: emit the invalid id, no relocation needed. */
+      id->sid = SVGA3D_INVALID_ID;
+      id->face = id->mipmap = 0;
+      return;
+   }
+   svga_surf = svga_surface(surface);
+   swc->surface_relocation(swc, &id->sid, svga_surf->handle, flags);
+   id->face = svga_surf->real_face;   /* face order matches */
+   id->mipmap = svga_surf->real_level;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_FIFOReserve --
+ *
+ *      Reserve space for an SVGA3D FIFO command.
+ *
+ *      The 2D SVGA commands have been around for a while, so they
+ *      have a rather asymmetric structure. The SVGA3D protocol is
+ *      more uniform: each command begins with a header containing the
+ *      command number and the full size.
+ *
+ *      This is a convenience wrapper around SVGA_FIFOReserve. We
+ *      reserve space for the whole command, and write the header.
+ *
+ *      This function must be paired with SVGA_FIFOCommitAll().
+ *
+ * Results:
+ *      Returns a pointer to the space reserved for command-specific
+ *      data. It must be 'cmdSize' bytes long.
+ *
+ * Side effects:
+ *      Begins a FIFO reservation.
+ *
+ *----------------------------------------------------------------------
+ */
+
+void *
+SVGA3D_FIFOReserve(struct svga_winsys_context *swc,
+                   uint32 cmd,       // IN
+                   uint32 cmdSize,   // IN
+                   uint32 nr_relocs) // IN
+{
+   /* Header and payload are reserved as one contiguous span. */
+   SVGA3dCmdHeader *hdr = swc->reserve(swc, sizeof *hdr + cmdSize, nr_relocs);
+
+   if (hdr == NULL)
+      return NULL;
+   hdr->id = cmd;
+   hdr->size = cmdSize;   /* payload bytes only, header excluded */
+
+   /* The command payload starts right after the header. */
+   return hdr + 1;
+}
+
+
+void
+SVGA_FIFOCommitAll(struct svga_winsys_context *swc)
+{
+   swc->commit(swc);   /* forwards to the winsys context's commit hook */
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DefineContext --
+ *
+ *      Create a new context, to be referred to with the provided ID.
+ *
+ *      Context objects encapsulate all render state, and shader
+ *      objects are per-context.
+ *
+ *      Surfaces are not per-context. The same surface can be shared
+ *      between multiple contexts, and surface operations can occur
+ *      without a context.
+ *
+ *      If the provided context ID already existed, it is redefined.
+ *
+ *      Context IDs are arbitrary small non-negative integers,
+ *      global to the entire SVGA device.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DefineContext(struct svga_winsys_context *swc)  // IN
+{
+   /* Payload is just the context id; no relocations are needed. */
+   SVGA3dCmdDefineContext *define =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_CONTEXT_DEFINE, sizeof *define, 0);
+
+   if (define == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* The id comes from the winsys context itself. */
+   define->cid = swc->cid;
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DestroyContext --
+ *
+ *      Delete a context created with SVGA3D_DefineContext.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DestroyContext(struct svga_winsys_context *swc)  // IN
+{
+   SVGA3dCmdDestroyContext *destroy;
+
+   /* Reserve a fixed-size command with no relocations. */
+   destroy = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_CONTEXT_DESTROY,
+                                sizeof *destroy, 0);
+   if (destroy == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* Name the context being torn down, then submit. */
+   destroy->cid = swc->cid;
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginDefineSurface --
+ *
+ *      Begin a SURFACE_DEFINE command. This reserves space for it in
+ *      the FIFO, and returns pointers to the command's faces and
+ *      mipsizes arrays.
+ *
+ *      This function must be paired with SVGA_FIFOCommitAll().
+ *      The faces and mipSizes arrays are initialized to zero.
+ *
+ *      This creates a "surface" object in the SVGA3D device,
+ *      with the provided surface ID (sid). Surfaces are generic
+ *      containers for host VRAM objects like textures, vertex
+ *      buffers, and depth/stencil buffers.
+ *
+ *      Surfaces are hierarchical:
+ *
+ *        - Surface may have multiple faces (for cube maps)
+ *
+ *          - Each face has a list of mipmap levels
+ *
+ *             - Each mipmap image may have multiple volume
+ *               slices, if the image is three dimensional.
+ *
+ *                - Each slice is a 2D array of 'blocks'
+ *
+ *                   - Each block may be one or more pixels.
+ *                     (Usually 1, more for DXT or YUV formats.)
+ *
+ *      Surfaces are generic host VRAM objects. The SVGA3D device
+ *      may optimize surfaces according to the format they were
+ *      created with, but this format does not limit the ways in
+ *      which the surface may be used. For example, a depth surface
+ *      can be used as a texture, or a floating point image may
+ *      be used as a vertex buffer. Some surface usages may be
+ *      lower performance, due to software emulation, but any
+ *      usage should work with any surface.
+ *
+ *      If 'sid' is already defined, the old surface is deleted
+ *      and this new surface replaces it.
+ *
+ *      Surface IDs are arbitrary small non-negative integers,
+ *      global to the entire SVGA device.
+ *
+ * Results:
+ *      Returns pointers to arrays allocated in the FIFO for 'faces'
+ *      and 'mipSizes'.
+ *
+ * Side effects:
+ *      Begins a FIFO reservation.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginDefineSurface(struct svga_winsys_context *swc,
+                          struct svga_winsys_surface *sid, // IN
+                          SVGA3dSurfaceFlags flags,    // IN
+                          SVGA3dSurfaceFormat format,  // IN
+                          SVGA3dSurfaceFace **faces,   // OUT
+                          SVGA3dSize **mipSizes,       // OUT
+                          uint32 numMipSizes)          // IN
+{
+   const size_t mipBytes = sizeof **mipSizes * numMipSizes;
+   SVGA3dCmdDefineSurface *define;
+
+   /* Fixed command body plus the trailing mip-size array; one relocation. */
+   define = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SURFACE_DEFINE,
+                               sizeof *define + mipBytes, 1);
+   if (define == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   swc->surface_relocation(swc, &define->sid, sid, PIPE_BUFFER_USAGE_GPU_WRITE);
+   define->surfaceFlags = flags;
+   define->format = format;
+
+   /* Hand back zeroed arrays for the caller to fill in before committing. */
+   *faces = define->face;
+   memset(*faces, 0, sizeof **faces * SVGA3D_MAX_SURFACE_FACES);
+   *mipSizes = (SVGA3dSize *)(define + 1);
+   memset(*mipSizes, 0, mipBytes);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DefineSurface2D --
+ *
+ *      This is a simplified version of SVGA3D_BeginDefineSurface(),
+ *      which does not support cube maps, mipmaps, or volume textures.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DefineSurface2D(struct svga_winsys_context *swc,    // IN
+                       struct svga_winsys_surface *sid, // IN
+                       uint32 width,                // IN
+                       uint32 height,               // IN
+                       SVGA3dSurfaceFormat format)  // IN
+{
+   SVGA3dSize *mipSizes;
+   SVGA3dSurfaceFace *faces;
+   enum pipe_error ret;
+
+   /* One mip-size entry, no surface flags. */
+   ret = SVGA3D_BeginDefineSurface(swc,
+                                   sid, 0, format, &faces, &mipSizes, 1);
+   if (ret != PIPE_OK)
+      return ret;   /* nothing was reserved, so there is nothing to commit */
+
+   faces[0].numMipLevels = 1;
+   /* A 2D image is a single slice: depth is always 1. */
+   mipSizes[0].width = width;
+   mipSizes[0].height = height;
+   mipSizes[0].depth = 1;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DestroySurface --
+ *
+ *      Release the host VRAM encapsulated by a particular surface ID.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DestroySurface(struct svga_winsys_context *swc,
+                      struct svga_winsys_surface *sid)  // IN
+{
+   SVGA3dCmdDestroySurface *cmd;
+
+   /* One relocation slot: the id of the surface being destroyed. */
+   cmd = SVGA3D_FIFOReserve(swc,
+                            SVGA_3D_CMD_SURFACE_DESTROY, sizeof *cmd, 1);
+   if (!cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   swc->surface_relocation(swc, &cmd->sid, sid, PIPE_BUFFER_USAGE_GPU_READ);
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SurfaceDMA --
+ *
+ *      Emit a complete SURFACE_DMA command. This reserves space for
+ *      it in the FIFO, copies in the caller's copybox list, and
+ *      commits the command before returning.
+ *
+ *      When the SVGA3D device asynchronously processes this FIFO
+ *      command, a DMA operation is performed between host VRAM and
+ *      a generic SVGAGuestPtr. The guest pointer may refer to guest
+ *      VRAM (provided by the SVGA PCI device) or to guest system
+ *      memory that has been set up as a Guest Memory Region (GMR)
+ *      by the SVGA device.
+ *
+ *      The guest's DMA buffer must remain valid (not freed, paged out,
+ *      or overwritten) until the host has finished processing this
+ *      command. The guest can determine that the host has finished
+ *      by using the SVGA device's FIFO Fence mechanism.
+ *
+ *      The guest's image buffer can be an arbitrary size and shape.
+ *      Guest image data is interpreted according to the SVGA3D surface
+ *      format specified when the surface was defined.
+ *
+ *      The caller may optionally define the guest image's pitch.
+ *      guestImage->pitch can either be zero (assume image is tightly
+ *      packed) or it must be the number of bytes between vertically
+ *      adjacent image blocks.
+ *
+ *      The provided copybox list specifies which regions of the source
+ *      image are to be copied, and where they appear on the destination.
+ *
+ *      NOTE: srcx/srcy are always on the guest image and x/y are
+ *      always on the host image, regardless of the actual transfer
+ *      direction!
+ *
+ *      For efficiency, the SVGA3D device is free to copy more data
+ *      than specified. For example, it may round copy boxes outwards
+ *      such that they lie on particular alignment boundaries.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
+                  struct svga_transfer *st,         // IN
+                  SVGA3dTransferType transfer,      // IN
+                  const SVGA3dCopyBox *boxes,       // IN
+                  uint32 numBoxes)                  // IN
+{
+   struct svga_texture *tex = svga_texture(st->base.texture);
+   const uint32 boxBytes = sizeof *boxes * numBoxes;
+   SVGA3dCmdSurfaceDMASuffix *suffix;
+   SVGA3dCmdSurfaceDMA *dma;
+   unsigned guest_usage;
+   unsigned host_usage;
+
+   /* Guest buffer and host surface are accessed in opposite directions. */
+   switch (transfer) {
+   case SVGA3D_WRITE_HOST_VRAM:
+      guest_usage = PIPE_BUFFER_USAGE_GPU_READ;
+      host_usage = PIPE_BUFFER_USAGE_GPU_WRITE;
+      break;
+   case SVGA3D_READ_HOST_VRAM:
+      guest_usage = PIPE_BUFFER_USAGE_GPU_WRITE;
+      host_usage = PIPE_BUFFER_USAGE_GPU_READ;
+      break;
+   default:
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+
+   /* Layout: fixed command, then the copy boxes, then the suffix. */
+   dma = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SURFACE_DMA,
+                            sizeof *dma + boxBytes + sizeof *suffix, 2);
+   if (dma == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   swc->region_relocation(swc, &dma->guest.ptr, st->hwbuf, 0, guest_usage);
+   dma->guest.pitch = st->base.stride;
+
+   swc->surface_relocation(swc, &dma->host.sid, tex->handle, host_usage);
+   dma->host.face = st->base.face; /* PIPE_TEX_FACE_* and SVGA3D_CUBEFACE_* match */
+   dma->host.mipmap = st->base.level;
+   dma->transfer = transfer;
+
+   memcpy(dma + 1, boxes, boxBytes);
+
+   suffix = (SVGA3dCmdSurfaceDMASuffix *)((uint8_t *)(dma + 1) + boxBytes);
+   suffix->suffixSize = sizeof *suffix;
+   suffix->maximumOffset = st->hw_nblocksy * st->base.stride;
+   memset(&suffix->flags, 0, sizeof suffix->flags);
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+enum pipe_error
+SVGA3D_BufferDMA(struct svga_winsys_context *swc,
+                 struct svga_winsys_buffer *guest,
+                 struct svga_winsys_surface *host,
+                 SVGA3dTransferType transfer,      // IN
+                 uint32 size,                      // IN
+                 uint32 offset,                    // IN
+                 SVGA3dSurfaceDMAFlags flags)      // IN
+{
+   SVGA3dCmdSurfaceDMA *cmd;
+   SVGA3dCopyBox *box;
+   SVGA3dCmdSurfaceDMASuffix *pSuffix;
+   unsigned region_flags;
+   unsigned surface_flags;
+   /* Relocation usage flags: the side being copied from is read, the other written. */
+   if(transfer == SVGA3D_WRITE_HOST_VRAM) {
+      region_flags = PIPE_BUFFER_USAGE_GPU_READ;
+      surface_flags = PIPE_BUFFER_USAGE_GPU_WRITE;
+   }
+   else if(transfer == SVGA3D_READ_HOST_VRAM) {
+      region_flags = PIPE_BUFFER_USAGE_GPU_WRITE;
+      surface_flags = PIPE_BUFFER_USAGE_GPU_READ;
+   }
+   else {
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+   /* Command layout: header, exactly one copy box, then the suffix; two relocations. */
+   cmd = SVGA3D_FIFOReserve(swc,
+                            SVGA_3D_CMD_SURFACE_DMA,
+                            sizeof *cmd + sizeof *box + sizeof *pSuffix,
+                            2);
+   if(!cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   /* Guest side: the caller's buffer, with the pitch left at zero. */
+   swc->region_relocation(swc, &cmd->guest.ptr, guest, 0, region_flags);
+   cmd->guest.pitch = 0;
+   /* Host side: the caller's surface, face 0 and mipmap 0. */
+   swc->surface_relocation(swc, &cmd->host.sid, host, surface_flags);
+   cmd->host.face = 0;
+   cmd->host.mipmap = 0;
+
+   cmd->transfer = transfer;
+   /* Single box: w = size, h = d = 1, with x and srcx both set to offset. */
+   box = (SVGA3dCopyBox *)&cmd[1];
+   box->x = offset;
+   box->y = 0;
+   box->z = 0;
+   box->w = size;
+   box->h = 1;
+   box->d = 1;
+   box->srcx = offset;
+   box->srcy = 0;
+   box->srcz = 0;
+   /* The suffix follows the box directly; maximumOffset is offset + size. */
+   pSuffix = (SVGA3dCmdSurfaceDMASuffix *)((uint8_t*)cmd + sizeof *cmd + sizeof *box);
+   pSuffix->suffixSize = sizeof *pSuffix;
+   pSuffix->maximumOffset = offset + size;
+   pSuffix->flags = flags;
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetRenderTarget --
+ *
+ *      Bind a surface object to a particular render target attachment
+ *      point on the current context. Render target attachment points
+ *      exist for color buffers, a depth buffer, and a stencil buffer.
+ *
+ *      The SVGA3D device is quite lenient about the types of surfaces
+ *      that may be used as render targets. The color buffers must
+ *      all be the same size, but the depth and stencil buffers do not
+ *      have to be the same size as the color buffer. All attachments
+ *      are optional.
+ *
+ *      Some combinations of render target formats may require software
+ *      emulation, depending on the capabilities of the host graphics
+ *      API and graphics hardware.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetRenderTarget(struct svga_winsys_context *swc,
+                       SVGA3dRenderTargetType type,   // IN
+                       struct pipe_surface *surface)  // IN
+{
+   /* One relocation is needed for the render target's surface id. */
+   SVGA3dCmdSetRenderTarget *rt_cmd =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETRENDERTARGET,
+                         sizeof *rt_cmd, 1);
+
+   if (!rt_cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   rt_cmd->cid = swc->cid;
+   rt_cmd->type = type;
+
+   /* The target surface is relocated with GPU-write usage. */
+   surface_to_surfaceid(swc, surface, &rt_cmd->target,
+                        PIPE_BUFFER_USAGE_GPU_WRITE);
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+
+
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DefineShader --
+ *
+ *      Upload the bytecode for a new shader. The bytecode is "SVGA3D
+ *      format", which is theoretically a binary-compatible superset
+ *      of Microsoft's DirectX shader bytecode. In practice, the
+ *      SVGA3D bytecode doesn't yet have any extensions to DirectX's
+ *      bytecode format.
+ *
+ *      The SVGA3D device supports shader models 1.1 through 2.0.
+ *
+ *      The caller chooses a shader ID (small positive integer) by
+ *      which this shader will be identified in future commands. This
+ *      ID is in a namespace which is per-context and per-shader-type.
+ *
+ *      'bytecodeLen' is specified in bytes. It must be a multiple of 4.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DefineShader(struct svga_winsys_context *swc,
+                    uint32 shid,                  // IN
+                    SVGA3dShaderType type,        // IN
+                    const uint32 *bytecode,       // IN
+                    uint32 bytecodeLen)           // IN
+{
+   SVGA3dCmdDefineShader *def_cmd;
+
+   /* bytecodeLen is in bytes and must cover whole 32-bit words. */
+   assert((bytecodeLen % 4) == 0);
+
+   /* The bytecode is appended directly after the fixed-size header. */
+   def_cmd = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SHADER_DEFINE,
+                                sizeof *def_cmd + bytecodeLen, 0);
+   if (!def_cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   def_cmd->cid = swc->cid;
+   def_cmd->shid = shid;
+   def_cmd->type = type;
+   memcpy(def_cmd + 1, bytecode, bytecodeLen);
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_DestroyShader --
+ *
+ *      Delete a shader that was created by SVGA3D_DefineShader. If
+ *      the shader was the current vertex or pixel shader for its
+ *      context, rendering results are undefined until a new shader is
+ *      bound.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_DestroyShader(struct svga_winsys_context *swc,
+                     uint32 shid,            // IN
+                     SVGA3dShaderType type)  // IN
+{
+   /* Fixed-size command with no relocations. */
+   SVGA3dCmdDestroyShader *destroy_cmd =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SHADER_DESTROY,
+                         sizeof *destroy_cmd, 0);
+
+   if (!destroy_cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   destroy_cmd->cid = swc->cid;
+   destroy_cmd->shid = shid;
+   destroy_cmd->type = type;
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetShaderConst --
+ *
+ *      Set the value of a shader constant.
+ *
+ *      Shader constants are analogous to uniform variables in GLSL,
+ *      except that they belong to the render context rather than to
+ *      an individual shader.
+ *
+ *      Constants may have one of three types: A 4-vector of floats,
+ *      a 4-vector of integers, or a single boolean flag.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetShaderConst(struct svga_winsys_context *swc,
+                      uint32 reg,                   // IN
+                      SVGA3dShaderType type,        // IN
+                      SVGA3dShaderConstType ctype,  // IN
+                      const void *value)            // IN
+{
+   SVGA3dCmdSetShaderConst *cmd;
+
+   cmd = SVGA3D_FIFOReserve(swc,
+                            SVGA_3D_CMD_SET_SHADER_CONST, sizeof *cmd,
+                            0);
+   if(!cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   cmd->cid = swc->cid;
+   cmd->reg = reg;
+   cmd->type = type;
+   cmd->ctype = ctype;
+   /* The payload depends on ctype; every path below writes all of cmd->values. */
+   switch (ctype) {
+   /* FLOAT and INT: 'value' supplies the complete values array. */
+   case SVGA3D_CONST_TYPE_FLOAT:
+   case SVGA3D_CONST_TYPE_INT:
+      memcpy(&cmd->values, value, sizeof cmd->values);
+      break;
+   /* BOOL: a single 32-bit flag in values[0], remaining entries zeroed. */
+   case SVGA3D_CONST_TYPE_BOOL:
+      memset(&cmd->values, 0, sizeof cmd->values);
+      cmd->values[0] = *(const uint32 *)value;
+      break;
+   /* Unknown type: commit zeros rather than whatever the reserved memory held. */
+   default:
+      assert(0);
+      memset(&cmd->values, 0, sizeof cmd->values);
+      break;
+   }
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetShader --
+ *
+ *      Switch active shaders. This binds a new vertex or pixel shader
+ *      to the specified context.
+ *
+ *      A shader ID of SVGA3D_INVALID_ID unbinds any shader, switching
+ *      back to the fixed function vertex or pixel pipeline.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetShader(struct svga_winsys_context *swc,
+                 SVGA3dShaderType type,  // IN
+                 uint32 shid)            // IN
+{
+   /* Reserve FIFO space for the fixed-size command; no relocations. */
+   SVGA3dCmdSetShader *set_shader =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SET_SHADER, sizeof *set_shader, 0);
+
+   if (!set_shader)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* Fill in the command body, then commit it through the winsys. */
+   set_shader->cid = swc->cid;
+   set_shader->type = type;
+   set_shader->shid = shid;
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginClear --
+ *
+ *      Begin a CLEAR command. This reserves space for it in the FIFO,
+ *      and returns a pointer to the command's rectangle array.  This
+ *      function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      Clear is a rendering operation which fills a list of
+ *      rectangles with constant values on all render target types
+ *      indicated by 'flags'.
+ *
+ *      Clear is not affected by clipping, depth test, or other
+ *      render state which affects the fragment pipeline.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      May write to attached render target surfaces.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginClear(struct svga_winsys_context *swc,
+                  SVGA3dClearFlag flags,  // IN
+                  uint32 color,           // IN
+                  float depth,            // IN
+                  uint32 stencil,         // IN
+                  SVGA3dRect **rects,     // OUT
+                  uint32 numRects)        // IN
+{
+   SVGA3dCmdClear *clear;
+
+   /* Header followed by room for numRects rectangles; no relocations. */
+   clear = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_CLEAR,
+                              sizeof *clear + sizeof **rects * numRects, 0);
+   if (!clear)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   clear->cid = swc->cid;
+   clear->clearFlag = flags;
+   clear->color = color;
+   clear->depth = depth;
+   clear->stencil = stencil;
+
+   /* Not committed here: hand back the rectangle array trailing the header. */
+   *rects = (SVGA3dRect *)(clear + 1);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_ClearRect --
+ *
+ *      This is a simplified version of SVGA3D_BeginClear().
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_ClearRect(struct svga_winsys_context *swc,
+                 SVGA3dClearFlag flags,  // IN
+                 uint32 color,           // IN
+                 float depth,            // IN
+                 uint32 stencil,         // IN
+                 uint32 x,               // IN
+                 uint32 y,               // IN
+                 uint32 w,               // IN
+                 uint32 h)               // IN
+{
+   SVGA3dRect *rect;
+   enum pipe_error ret;
+
+   ret = SVGA3D_BeginClear(swc, flags, color, depth, stencil, &rect, 1);
+   if(ret != PIPE_OK)
+      return ret; /* propagate the actual failure code instead of assuming OOM */
+   /* BeginClear left the command uncommitted; fill in its single rectangle. */
+   memset(rect, 0, sizeof *rect);
+   rect->x = x;
+   rect->y = y;
+   rect->w = w;
+   rect->h = h;
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginDrawPrimitives --
+ *
+ *      Begin a DRAW_PRIMITIVES command. This reserves space for it in
+ *      the FIFO, and returns a pointer to the command's arrays.
+ *      This function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      Drawing commands consist of two variable-length arrays:
+ *      SVGA3dVertexDecl elements declare a set of vertex buffers to
+ *      use while rendering, and SVGA3dPrimitiveRange elements specify
+ *      groups of primitives each with an optional index buffer.
+ *
+ *      The decls and ranges arrays are initialized to zero.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      May write to attached render target surfaces.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc,
+                           SVGA3dVertexDecl **decls,      // OUT
+                           uint32 numVertexDecls,         // IN
+                           SVGA3dPrimitiveRange **ranges, // OUT
+                           uint32 numRanges)              // IN
+{
+   const uint32 declBytes = sizeof **decls * numVertexDecls;
+   const uint32 rangeBytes = sizeof **ranges * numRanges;
+   SVGA3dCmdDrawPrimitives *draw;
+   SVGA3dVertexDecl *firstDecl;
+   SVGA3dPrimitiveRange *firstRange;
+
+   /* One relocation slot is reserved per vertex decl and per range. */
+   draw = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_DRAW_PRIMITIVES,
+                             sizeof *draw + declBytes + rangeBytes,
+                             numVertexDecls + numRanges);
+   if (!draw)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   draw->cid = swc->cid;
+   draw->numVertexDecls = numVertexDecls;
+   draw->numRanges = numRanges;
+
+   /* Layout: header, then the decl array, then the range array. */
+   firstDecl = (SVGA3dVertexDecl *)(draw + 1);
+   firstRange = (SVGA3dPrimitiveRange *)(firstDecl + numVertexDecls);
+
+   /* Both arrays are handed back zeroed; the command is left uncommitted. */
+   memset(firstDecl, 0, declBytes);
+   memset(firstRange, 0, rangeBytes);
+   *decls = firstDecl;
+   *ranges = firstRange;
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginSurfaceCopy --
+ *
+ *      Begin a SURFACE_COPY command. This reserves space for it in
+ *      the FIFO, and returns a pointer to the command's arrays.  This
+ *      function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      The box array is initialized with zeroes.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      Asynchronously copies a list of boxes from surface to surface.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginSurfaceCopy(struct svga_winsys_context *swc,
+                        struct pipe_surface *src,    // IN
+                        struct pipe_surface *dest,   // IN
+                        SVGA3dCopyBox **boxes,       // OUT
+                        uint32 numBoxes)             // IN
+{
+   const uint32 boxBytes = sizeof **boxes * numBoxes;
+   SVGA3dCmdSurfaceCopy *copy;
+
+   /* Two relocations: one each for the source and destination surfaces. */
+   copy = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SURFACE_COPY,
+                             sizeof *copy + boxBytes, 2);
+   if (!copy)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   surface_to_surfaceid(swc, src, &copy->src, PIPE_BUFFER_USAGE_GPU_READ);
+   surface_to_surfaceid(swc, dest, &copy->dest, PIPE_BUFFER_USAGE_GPU_WRITE);
+
+   /* The zeroed box array trails the header; the command stays uncommitted. */
+   *boxes = (SVGA3dCopyBox *)(copy + 1);
+   memset(*boxes, 0, boxBytes);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SurfaceStretchBlt --
+ *
+ *      Issue a SURFACE_STRETCHBLT command: an asynchronous
+ *      surface-to-surface blit, with scaling.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      Asynchronously copies one box from surface to surface.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SurfaceStretchBlt(struct svga_winsys_context *swc,
+                         struct pipe_surface *src,    // IN
+                         struct pipe_surface *dest,   // IN
+                         SVGA3dBox *boxSrc,           // IN
+                         SVGA3dBox *boxDest,          // IN
+                         SVGA3dStretchBltMode mode)   // IN
+{
+   /* Source and destination surfaces each take one relocation. */
+   SVGA3dCmdSurfaceStretchBlt *blt =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SURFACE_STRETCHBLT,
+                         sizeof *blt, 2);
+
+   if (!blt)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   surface_to_surfaceid(swc, src, &blt->src, PIPE_BUFFER_USAGE_GPU_READ);
+   surface_to_surfaceid(swc, dest, &blt->dest, PIPE_BUFFER_USAGE_GPU_WRITE);
+
+   blt->boxSrc = *boxSrc;
+   blt->boxDest = *boxDest;
+   blt->mode = mode;
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetViewport --
+ *
+ *      Set the current context's viewport rectangle. The viewport
+ *      is clipped to the dimensions of the current render target,
+ *      then all rendering is clipped to the viewport.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetViewport(struct svga_winsys_context *swc,
+                   SVGA3dRect *rect)  // IN
+{
+   /* Fixed-size command, no relocations. */
+   SVGA3dCmdSetViewport *viewport =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETVIEWPORT, sizeof *viewport, 0);
+
+   if (!viewport)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   viewport->cid = swc->cid;
+   viewport->rect = *rect;   /* copied by value into the command */
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetScissorRect --
+ *
+ *      Set the current context's scissor rectangle. If scissor
+ *      is enabled then all rendering is clipped to the scissor.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetScissorRect(struct svga_winsys_context *swc,
+                      SVGA3dRect *rect)  // IN
+{
+   /* Fixed-size command, no relocations. */
+   SVGA3dCmdSetScissorRect *scissor =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETSCISSORRECT, sizeof *scissor, 0);
+
+   if (!scissor)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   scissor->cid = swc->cid;
+   scissor->rect = *rect;   /* copied by value into the command */
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetClipPlane --
+ *
+ *      Set one of the current context's clip planes. If the clip
+ *      plane is enabled then all 3d rendering is clipped against
+ *      the plane.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetClipPlane(struct svga_winsys_context *swc,
+                    uint32 index, const float *plane)
+{
+   SVGA3dCmdSetClipPlane *clip;
+   unsigned i;
+
+   clip = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETCLIPPLANE, sizeof *clip, 0);
+   if (!clip)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   clip->cid = swc->cid;
+   clip->index = index;
+
+   /* Copy the four plane coefficients from 'plane'. */
+   for (i = 0; i < 4; i++)
+      clip->plane[i] = plane[i];
+
+   swc->commit(swc);
+   return PIPE_OK;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_SetZRange --
+ *
+ *      Set the range of the depth buffer to use. 'zMin' and 'zMax'
+ *      are values between 0.0 and 1.0.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_SetZRange(struct svga_winsys_context *swc,
+                 float zMin,  // IN
+                 float zMax)  // IN
+{
+   /* Fixed-size command, no relocations. */
+   SVGA3dCmdSetZRange *zrange =
+      SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETZRANGE, sizeof *zrange, 0);
+
+   if (!zrange)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   zrange->cid = swc->cid;
+   zrange->zRange.min = zMin;
+   zrange->zRange.max = zMax;
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginSetTextureState --
+ *
+ *      Begin a SETTEXTURESTATE command. This reserves space for it in
+ *      the FIFO, and returns a pointer to the command's texture state
+ *      array.  This function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      This command sets rendering state which is per-texture-unit.
+ *
+ *      XXX: Individual texture states need documentation. However,
+ *           they are very similar to the texture states defined by
+ *           Direct3D. The D3D documentation is a good starting point
+ *           for understanding SVGA3D texture states.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginSetTextureState(struct svga_winsys_context *swc,
+                            SVGA3dTextureState **states,  // OUT
+                            uint32 numStates)             // IN
+{
+   SVGA3dCmdSetTextureState *tss;
+
+   /* Room for numStates entries, with one relocation slot per entry. */
+   tss = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETTEXTURESTATE,
+                            sizeof *tss + sizeof **states * numStates,
+                            numStates);
+   if (!tss)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   tss->cid = swc->cid;
+   /* Not committed here; return the state array that follows the header. */
+   *states = (SVGA3dTextureState *)(tss + 1);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginSetRenderState --
+ *
+ *      Begin a SETRENDERSTATE command. This reserves space for it in
+ *      the FIFO, and returns a pointer to the command's render state
+ *      array.  This function must be paired with SVGA_FIFOCommitAll().
+ *
+ *      This command sets rendering state which is global to the context.
+ *
+ *      XXX: Individual render states need documentation. However,
+ *           they are very similar to the render states defined by
+ *           Direct3D. The D3D documentation is a good starting point
+ *           for understanding SVGA3D render states.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginSetRenderState(struct svga_winsys_context *swc,
+                           SVGA3dRenderState **states,  // OUT
+                           uint32 numStates)            // IN
+{
+   SVGA3dCmdSetRenderState *rs;
+
+   /* Room for numStates entries after the header; no relocations. */
+   rs = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_SETRENDERSTATE,
+                           sizeof *rs + sizeof **states * numStates,
+                           0);
+   if (!rs)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   rs->cid = swc->cid;
+   /* Not committed here; return the state array that follows the header. */
+   *states = (SVGA3dRenderState *)(rs + 1);
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_BeginQuery--
+ *
+ *      Issues a SVGA_3D_CMD_BEGIN_QUERY command.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      Commits space in the FIFO memory.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_BeginQuery(struct svga_winsys_context *swc,
+                  SVGA3dQueryType type) // IN
+{
+   SVGA3dCmdBeginQuery *begin;
+
+   /* Fixed-size command, no relocations. */
+   begin = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_BEGIN_QUERY,
+                              sizeof *begin, 0);
+   if (!begin)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* Record the context id and the query type in the command. */
+   begin->cid = swc->cid;
+   begin->type = type;
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_EndQuery--
+ *
+ *      Issues a SVGA_3D_CMD_END_QUERY command.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      Commits space in the FIFO memory.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_EndQuery(struct svga_winsys_context *swc,
+                SVGA3dQueryType type,              // IN
+                struct svga_winsys_buffer *buffer) // IN/OUT
+{
+   SVGA3dCmdEndQuery *end;
+
+   /* One relocation: the guest buffer referenced by guestResult. */
+   end = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_END_QUERY,
+                            sizeof *end, 1);
+   if (!end)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   end->cid = swc->cid;
+   end->type = type;
+
+   /* The buffer is relocated at offset 0 with GPU-write usage. */
+   swc->region_relocation(swc, &end->guestResult, buffer, 0,
+                          PIPE_BUFFER_USAGE_GPU_WRITE);
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * SVGA3D_WaitForQuery--
+ *
+ *      Issues a SVGA_3D_CMD_WAIT_FOR_QUERY command.  This reserves space
+ *      for it in the FIFO.  This doesn't actually wait for the query to
+ *      finish but instead tells the host to start a wait at the driver
+ *      level.  The caller can wait on the status variable in the
+ *      guestPtr memory or send an insert fence instruction after this
+ *      command and wait on the fence.
+ *
+ * Results:
+ *      PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the FIFO reservation fails.
+ *
+ * Side effects:
+ *      Commits space in the FIFO memory.
+ *
+ *----------------------------------------------------------------------
+ */
+
+enum pipe_error
+SVGA3D_WaitForQuery(struct svga_winsys_context *swc,
+                    SVGA3dQueryType type,              // IN
+                    struct svga_winsys_buffer *buffer) // IN/OUT
+{
+   SVGA3dCmdWaitForQuery *wait;
+
+   /* One relocation: the guest buffer referenced by guestResult. */
+   wait = SVGA3D_FIFOReserve(swc, SVGA_3D_CMD_WAIT_FOR_QUERY,
+                             sizeof *wait, 1);
+   if (!wait)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   wait->cid = swc->cid;
+   wait->type = type;
+
+   /* The buffer is relocated at offset 0 with GPU-write usage. */
+   swc->region_relocation(swc, &wait->guestResult, buffer, 0,
+                          PIPE_BUFFER_USAGE_GPU_WRITE);
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
diff --git a/src/gallium/drivers/svga/svga_cmd.h b/src/gallium/drivers/svga/svga_cmd.h
new file mode 100644
index 00000000000..80410547690
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_cmd.h
@@ -0,0 +1,235 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/*
+ * svga_cmd.h --
+ *
+ *      Command construction utility for the SVGA3D protocol used by
+ *      the VMware SVGA device, based on the svgautil library.
+ */
+
+#ifndef __SVGA3D_H__
+#define __SVGA3D_H__
+
+
+#include "svga_types.h"
+#include "svga_reg.h"
+#include "svga3d_reg.h"
+
+#include "pipe/p_defines.h"
+
+
+struct pipe_buffer;
+struct pipe_surface;
+struct svga_transfer;
+struct svga_winsys_context;
+struct svga_winsys_buffer;
+struct svga_winsys_surface;
+
+
+/*
+ * SVGA Device Interoperability
+ */
+
+void *
+SVGA3D_FIFOReserve(struct svga_winsys_context *swc, uint32 cmd, uint32 cmdSize, uint32 nr_relocs);
+
+void
+SVGA_FIFOCommitAll(struct svga_winsys_context *swc);
+
+
+/*
+ * Context Management
+ */
+
+enum pipe_error
+SVGA3D_DefineContext(struct svga_winsys_context *swc);
+
+enum pipe_error
+SVGA3D_DestroyContext(struct svga_winsys_context *swc);
+
+
+/*
+ * Surface Management
+ */
+
+enum pipe_error
+SVGA3D_BeginDefineSurface(struct svga_winsys_context *swc,
+                          struct svga_winsys_surface *sid,
+                          SVGA3dSurfaceFlags flags,
+                          SVGA3dSurfaceFormat format,
+                          SVGA3dSurfaceFace **faces,
+                          SVGA3dSize **mipSizes,
+                          uint32 numMipSizes);
+enum pipe_error
+SVGA3D_DefineSurface2D(struct svga_winsys_context *swc,
+                       struct svga_winsys_surface *sid,
+                       uint32 width,
+                       uint32 height,
+                       SVGA3dSurfaceFormat format);
+enum pipe_error
+SVGA3D_DestroySurface(struct svga_winsys_context *swc,
+                      struct svga_winsys_surface *sid);
+
+
+/*
+ * Surface Operations
+ */
+
+enum pipe_error
+SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
+                  struct svga_transfer *st,
+                  SVGA3dTransferType transfer,
+                  const SVGA3dCopyBox *boxes,
+                  uint32 numBoxes);
+
+enum pipe_error
+SVGA3D_BufferDMA(struct svga_winsys_context *swc,
+                 struct svga_winsys_buffer *guest,
+                 struct svga_winsys_surface *host,
+                 SVGA3dTransferType transfer,
+                 uint32 size,
+                 uint32 offset,
+                 SVGA3dSurfaceDMAFlags flags);
+
+/*
+ * Drawing Operations
+ */
+
+
+enum pipe_error
+SVGA3D_BeginClear(struct svga_winsys_context *swc,
+                  SVGA3dClearFlag flags,
+                  uint32 color, float depth, uint32 stencil,
+                  SVGA3dRect **rects, uint32 numRects);
+
+enum pipe_error
+SVGA3D_ClearRect(struct svga_winsys_context *swc,
+                 SVGA3dClearFlag flags, uint32 color, float depth,
+                 uint32 stencil, uint32 x, uint32 y, uint32 w, uint32 h);
+
+enum pipe_error
+SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc,
+                           SVGA3dVertexDecl **decls,
+                           uint32 numVertexDecls,
+                           SVGA3dPrimitiveRange **ranges,
+                           uint32 numRanges);
+
+/*
+ * Blits
+ */
+
+enum pipe_error
+SVGA3D_BeginSurfaceCopy(struct svga_winsys_context *swc,
+                        struct pipe_surface *src,
+                        struct pipe_surface *dest,
+                        SVGA3dCopyBox **boxes, uint32 numBoxes);
+
+
+enum pipe_error
+SVGA3D_SurfaceStretchBlt(struct svga_winsys_context *swc,
+                         struct pipe_surface *src,
+                         struct pipe_surface *dest,
+                         SVGA3dBox *boxSrc, SVGA3dBox *boxDest,
+                         SVGA3dStretchBltMode mode);
+
+/*
+ * Shared FFP/Shader Render State
+ */
+
+enum pipe_error
+SVGA3D_SetRenderTarget(struct svga_winsys_context *swc,
+                       SVGA3dRenderTargetType type,
+                       struct pipe_surface *surface);
+
+enum pipe_error
+SVGA3D_SetZRange(struct svga_winsys_context *swc,
+                 float zMin, float zMax);
+
+enum pipe_error
+SVGA3D_SetViewport(struct svga_winsys_context *swc,
+                   SVGA3dRect *rect);
+
+enum pipe_error
+SVGA3D_SetScissorRect(struct svga_winsys_context *swc,
+                      SVGA3dRect *rect);
+
+enum pipe_error
+SVGA3D_SetClipPlane(struct svga_winsys_context *swc,
+                    uint32 index, const float *plane);
+
+enum pipe_error
+SVGA3D_BeginSetTextureState(struct svga_winsys_context *swc,
+                            SVGA3dTextureState **states,
+                            uint32 numStates);
+
+enum pipe_error
+SVGA3D_BeginSetRenderState(struct svga_winsys_context *swc,
+                           SVGA3dRenderState **states,
+                           uint32 numStates);
+
+
+/*
+ * Shaders
+ */
+
+enum pipe_error
+SVGA3D_DefineShader(struct svga_winsys_context *swc,
+                    uint32 shid, SVGA3dShaderType type,
+                    const uint32 *bytecode, uint32 bytecodeLen);
+
+enum pipe_error
+SVGA3D_DestroyShader(struct svga_winsys_context *swc,
+                     uint32 shid, SVGA3dShaderType type);
+
+enum pipe_error
+SVGA3D_SetShaderConst(struct svga_winsys_context *swc,
+                      uint32 reg, SVGA3dShaderType type,
+                      SVGA3dShaderConstType ctype, const void *value);
+
+enum pipe_error
+SVGA3D_SetShader(struct svga_winsys_context *swc,
+                 SVGA3dShaderType type, uint32 shid);
+
+
+/*
+ * Queries
+ */
+
+enum pipe_error
+SVGA3D_BeginQuery(struct svga_winsys_context *swc,
+                  SVGA3dQueryType type);
+
+enum pipe_error
+SVGA3D_EndQuery(struct svga_winsys_context *swc,
+                SVGA3dQueryType type,
+                struct svga_winsys_buffer *buffer);
+
+enum pipe_error
+SVGA3D_WaitForQuery(struct svga_winsys_context *swc,
+                    SVGA3dQueryType type,
+                    struct svga_winsys_buffer *buffer);
+
+#endif /* __SVGA3D_H__ */
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
new file mode 100644
index 00000000000..c3de12b4a39
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -0,0 +1,271 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_screen.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_screen_texture.h"
+#include "svga_screen_buffer.h"
+#include "svga_winsys.h"
+#include "svga_swtnl.h"
+#include "svga_draw.h"
+#include "svga_debug.h"
+#include "svga_state.h"
+
+/* pipe_context::destroy hook: tears down an svga_context and frees it. */
+static void svga_destroy( struct pipe_context *pipe )
+{
+   struct svga_context *svga = svga_context( pipe );
+   unsigned shader;
+
+   svga_cleanup_framebuffer( svga );
+   svga_cleanup_tss_binding( svga );
+
+   svga_hwtnl_destroy( svga->hwtnl );
+
+   svga_cleanup_vertex_state(svga);
+   
+   svga->swc->destroy(svga->swc);
+   
+   svga_destroy_swtnl( svga );
+
+   u_upload_destroy( svga->upload_vb );
+   u_upload_destroy( svga->upload_ib );
+   /* Drop the references held on the current constant buffers. */
+   for(shader = 0; shader < PIPE_SHADER_TYPES; ++shader)
+      pipe_buffer_reference( &svga->curr.cb[shader], NULL );
+   /* Finally free the context structure itself. */
+   FREE( svga );
+}
+
+static unsigned int
+svga_is_texture_referenced( struct pipe_context *pipe,
+                            struct pipe_texture *texture,
+                            unsigned face, unsigned level)
+{
+   struct svga_texture *stex = svga_texture(texture);
+   struct svga_screen *screen = svga_screen(pipe->screen);
+   unsigned int refs = PIPE_UNREFERENCED;
+
+   /**
+    * The screen does not cache texture writes, so the winsys flush state
+    * of the surface is all that needs checking.  A texture without a
+    * surface handle is reported as unreferenced.
+    */
+   if (stex->handle &&
+       !screen->sws->surface_is_flushed(screen->sws, stex->handle)) {
+      /* surface_is_flushed() does not distinguish read references from
+       * write references, so an unflushed surface counts as both. */
+      refs = PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+   }
+   return refs;
+}
+
+static unsigned int
+svga_is_buffer_referenced( struct pipe_context *pipe,
+                           struct pipe_buffer *buf)
+{
+   struct svga_screen *screen = svga_screen(pipe->screen);
+   struct svga_buffer *sbuffer = svga_buffer(buf);
+   unsigned int refs;
+
+   /**
+    * XXX: Check this.
+    * The screen may cache buffer writes, but when we map, we map out
+    * of those cached writes, so we don't need to set a
+    * PIPE_REFERENCED_FOR_WRITE flag for cached buffers.
+    */
+   if (!sbuffer->handle)
+      return PIPE_UNREFERENCED;
+   if (screen->sws->surface_is_flushed(screen->sws, sbuffer->handle))
+      return PIPE_UNREFERENCED;
+   /**
+    * surface_is_flushed() does not tell read references from write
+    * references, so a pending reference is normally reported as both.
+    * Index and vertex buffers are the exception: they are reported as
+    * read-only, to avoid a flush in st_bufferobj_get_subdata during
+    * display list replay.
+    */
+   refs = PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
+   if (sbuffer->base.usage & (PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_INDEX))
+      refs = PIPE_REFERENCED_FOR_READ;
+   return refs;
+}
+
+/* Create an svga pipe_context on 'screen'.  Returns NULL on any failure. */
+struct pipe_context *svga_context_create( struct pipe_screen *screen )
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   struct svga_context *svga = NULL;
+   enum pipe_error ret;
+
+   svga = CALLOC_STRUCT(svga_context);
+   if (svga == NULL)
+      goto error1;
+
+   /* Set up the list head before any call below can reach
+    * svga_context_flush(), whose buffer flush presumably walks this list. */
+   LIST_INITHEAD(&svga->dirty_buffers);
+
+   svga->pipe.winsys = screen->winsys;
+   svga->pipe.screen = screen;
+   svga->pipe.destroy = svga_destroy;
+   svga->pipe.clear = svga_clear;
+
+   svga->pipe.is_texture_referenced = svga_is_texture_referenced;
+   svga->pipe.is_buffer_referenced = svga_is_buffer_referenced;
+
+   svga->swc = svgascreen->sws->context_create(svgascreen->sws);
+   if(!svga->swc)
+      goto error2;
+
+   svga_init_blend_functions(svga);
+   svga_init_blit_functions(svga);
+   svga_init_depth_stencil_functions(svga);
+   svga_init_draw_functions(svga);
+   svga_init_flush_functions(svga);
+   svga_init_misc_functions(svga);
+   svga_init_rasterizer_functions(svga);
+   svga_init_sampler_functions(svga);
+   svga_init_fs_functions(svga);
+   svga_init_vs_functions(svga);
+   svga_init_vertex_functions(svga);
+   svga_init_constbuffer_functions(svga);
+   svga_init_query_functions(svga);
+
+   /* debug */
+   svga->debug.no_swtnl = debug_get_bool_option("SVGA_NO_SWTNL", FALSE);
+   svga->debug.force_swtnl = debug_get_bool_option("SVGA_FORCE_SWTNL", FALSE);
+   svga->debug.use_min_mipmap = debug_get_bool_option("SVGA_USE_MIN_MIPMAP", FALSE);
+   svga->debug.disable_shader = debug_get_num_option("SVGA_DISABLE_SHADER", ~0);
+
+   if (!svga_init_swtnl(svga))
+      goto error3;
+
+   svga->upload_ib = u_upload_create( svga->pipe.screen,
+                                      32 * 1024,
+                                      16,
+                                      PIPE_BUFFER_USAGE_INDEX );
+   if (svga->upload_ib == NULL)
+      goto error4;
+
+   svga->upload_vb = u_upload_create( svga->pipe.screen,
+                                      128 * 1024,
+                                      16,
+                                      PIPE_BUFFER_USAGE_VERTEX );
+   if (svga->upload_vb == NULL)
+      goto error5;
+
+   svga->hwtnl = svga_hwtnl_create( svga,
+                                    svga->upload_ib,
+                                    svga->swc );
+   if (svga->hwtnl == NULL)
+      goto error6;
+
+   ret = svga_emit_initial_state( svga );
+   if (ret)
+      goto error7;
+
+   /* Avoid shortcircuiting state with initial value of zero. */
+   memset(&svga->state.hw_clear, 0xcd, sizeof(svga->state.hw_clear));
+   memset(&svga->state.hw_clear.framebuffer, 0x0,
+          sizeof(svga->state.hw_clear.framebuffer));
+
+   memset(&svga->state.hw_draw, 0xcd, sizeof(svga->state.hw_draw));
+   memset(&svga->state.hw_draw.views, 0x0, sizeof(svga->state.hw_draw.views));
+   svga->state.hw_draw.num_views = 0;
+
+   svga->dirty = ~0;
+   svga->state.white_fs_id = SVGA3D_INVALID_ID;
+
+   return &svga->pipe;
+   /* Failure: unwind in reverse order of construction. */
+error7:
+   svga_hwtnl_destroy( svga->hwtnl );
+error6:
+   u_upload_destroy( svga->upload_vb );
+error5:
+   u_upload_destroy( svga->upload_ib );
+error4:
+   svga_destroy_swtnl(svga);
+error3:
+   svga->swc->destroy(svga->swc);
+error2:
+   FREE(svga);
+error1:
+   return NULL;
+}
+
+/* Flush all queued work of 'svga'; 'pfence' is passed on to the winsys flush. */
+void svga_context_flush( struct svga_context *svga,
+                         struct pipe_fence_handle **pfence )
+{
+   struct pipe_screen *screen = svga->pipe.screen;
+   struct svga_screen *ss = svga_screen(screen);
+
+   /* Restart the count of render targets used between flushes. */
+   svga->curr.nr_fbs = 0;
+
+   /* Unmap upload manager buffers. */
+   u_upload_flush(svga->upload_vb);
+   u_upload_flush(svga->upload_ib);
+
+   /* The screen goes first, so that texture dma uploads are processed
+    * before the commands submitted below.
+    */
+   svga_screen_flush(ss, NULL);
+
+   svga_context_flush_buffers(svga);
+
+   /* Hand the pending commands to the hardware. */
+   svga->swc->flush(svga->swc, pfence);
+
+   /* DEBUG_SYNC requests a sync after every flush: wait on the fence,
+    * provided the caller asked for one and one was returned. */
+   if ((SVGA_DEBUG & DEBUG_SYNC) && pfence && *pfence)
+      screen->fence_finish( screen, *pfence, 0);
+}
+
+/* Flush the hwtnl queue, retrying once after a context flush on OOM. */
+void svga_hwtnl_flush_retry( struct svga_context *svga )
+{
+   enum pipe_error status = svga_hwtnl_flush( svga->hwtnl );
+   /* Out of memory is the only error a context flush is tried for. */
+   if (status == PIPE_ERROR_OUT_OF_MEMORY) {
+      svga_context_flush( svga, NULL );
+      status = svga_hwtnl_flush( svga->hwtnl );
+   }
+
+   /* Any remaining failure is not handled, only asserted on. */
+   assert(status == 0);
+}
+
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
new file mode 100644
index 00000000000..e650a251d19
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -0,0 +1,448 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_CONTEXT_H
+#define SVGA_CONTEXT_H
+
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "util/u_double_list.h"
+
+#include "tgsi/tgsi_scan.h"
+
+
+#define SVGA_TEX_UNITS 8
+
+struct draw_vertex_shader;
+struct svga_shader_result;
+struct SVGACmdMemory;
+struct u_upload_mgr;
+
+
+struct svga_shader
+{
+   const struct tgsi_token *tokens;
+
+   struct tgsi_shader_info info;
+
+   struct svga_shader_result *results;
+
+   unsigned id;
+
+   boolean use_sm30;
+};
+
+struct svga_fragment_shader
+{
+   struct svga_shader base;
+};
+
+struct svga_vertex_shader
+{
+   struct svga_shader base;
+
+   struct draw_vertex_shader *draw_shader;
+};
+
+
+struct svga_cache_context;
+struct svga_tracked_state;
+
+struct svga_blend_state {
+
+   boolean need_white_fragments;
+
+   /* Should be per-render-target:
+    */
+   struct {
+      uint8_t writemask;
+
+      boolean blend_enable;
+      uint8_t srcblend;
+      uint8_t dstblend;
+      uint8_t blendeq;
+      
+      boolean separate_alpha_blend_enable;
+      uint8_t srcblend_alpha;
+      uint8_t dstblend_alpha;
+      uint8_t blendeq_alpha;
+
+   } rt[1];
+};
+
+struct svga_depth_stencil_state {
+   unsigned zfunc:8;
+   unsigned zenable:1;
+   unsigned zwriteenable:1;
+
+   unsigned alphatestenable:1;
+   unsigned alphafunc:8;
+  
+   struct {
+      unsigned enabled:1;
+      unsigned func:8;
+      unsigned fail:8;
+      unsigned zfail:8;
+      unsigned pass:8;
+   } stencil[2];
+   
+   /* SVGA3D has one ref/mask/writemask triple shared between front &
+    * back face stencil.  We really need two:
+    */
+   unsigned stencil_ref:8;
+   unsigned stencil_mask:8;
+   unsigned stencil_writemask:8;
+
+   float    alpharef;
+};
+
+#define SVGA_UNFILLED_DISABLE 0
+#define SVGA_UNFILLED_LINE    1
+#define SVGA_UNFILLED_POINT   2
+
+#define SVGA_PIPELINE_FLAG_POINTS   (1<<PIPE_PRIM_POINTS)
+#define SVGA_PIPELINE_FLAG_LINES    (1<<PIPE_PRIM_LINES)
+#define SVGA_PIPELINE_FLAG_TRIS     (1<<PIPE_PRIM_TRIANGLES)
+
+struct svga_rasterizer_state {
+   struct pipe_rasterizer_state templ; /* needed for draw module */
+
+   unsigned shademode:8;
+   unsigned cullmode:8;
+   unsigned scissortestenable:1;
+   unsigned multisampleantialias:1;
+   unsigned antialiasedlineenable:1;
+   unsigned lastpixel:1;
+
+   unsigned linepattern;
+
+   float slopescaledepthbias;
+   float depthbias;
+   float pointsize;
+   float pointsize_min;
+   float pointsize_max;
+   
+   unsigned hw_unfilled:16;         /* PIPE_POLYGON_MODE_x */
+   unsigned need_pipeline:16;    /* which prims do we need help for? */
+};
+
+struct svga_sampler_state {
+   unsigned mipfilter;
+   unsigned magfilter;
+   unsigned minfilter;
+   unsigned aniso_level;
+   float lod_bias;
+   unsigned addressu;
+   unsigned addressv;
+   unsigned addressw;
+   unsigned bordercolor;
+   unsigned normalized_coords:1;
+   unsigned compare_mode:1;
+   unsigned compare_func:3;
+
+   unsigned min_lod;
+   unsigned view_min_lod;
+   unsigned view_max_lod;
+};
+
+/* Use to calculate differences between state emitted to hardware and
+ * current driver-calculated state.  
+ */
+struct svga_state 
+{
+   const struct svga_blend_state *blend;
+   const struct svga_depth_stencil_state *depth;
+   const struct svga_rasterizer_state *rast;
+   const struct svga_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+
+   struct pipe_texture *texture[PIPE_MAX_SAMPLERS]; /* or texture ID's? */
+   struct svga_fragment_shader *fs;
+   struct svga_vertex_shader *vs;
+
+   struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS];
+   struct pipe_vertex_element ve[PIPE_MAX_ATTRIBS];
+   struct pipe_buffer *cb[PIPE_SHADER_TYPES];
+
+   struct pipe_framebuffer_state framebuffer;
+   float depthscale;
+
+   /* Hack to limit the number of different render targets between
+    * flushes.  Helps avoid blowing out our surface cache in EXA.
+    */
+   int nr_fbs;
+
+   struct pipe_poly_stipple poly_stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_blend_color blend_color;
+   struct pipe_clip_state clip;
+   struct pipe_viewport_state viewport;
+
+   const unsigned *edgeflags;
+
+   unsigned num_samplers;
+   unsigned num_textures;
+   unsigned num_vertex_elements;
+   unsigned num_vertex_buffers;
+   unsigned reduced_prim;
+
+   struct {
+      unsigned flag_1d;
+      unsigned flag_srgb;
+   } tex_flags;
+
+   boolean any_user_vertex_buffers;
+
+   unsigned zero_stride_vertex_elements;
+   unsigned num_zero_stride_vertex_elements;
+   /* ### maybe dynamically allocate this */
+   float zero_stride_constants[PIPE_MAX_ATTRIBS*4];
+};
+
+#define RS_MAX 97
+#define TS_MAX 30
+#define CB_MAX 256
+
+struct svga_prescale {
+   float translate[4];
+   float scale[4];
+   boolean enabled;
+};
+
+
+/* Updated by calling svga_update_state( SVGA_STATE_HW_VIEWPORT )
+ */
+struct svga_hw_clear_state
+{
+   struct {
+      unsigned x,y,w,h;
+   } viewport;
+
+   struct {
+      float zmin, zmax;
+   } depthrange;
+   
+   struct pipe_framebuffer_state framebuffer;
+   struct svga_prescale prescale;
+};
+
+struct svga_hw_view_state
+{
+   struct pipe_texture *texture;
+   struct svga_sampler_view *v;
+   unsigned min_lod;
+   unsigned max_lod;
+   int dirty;
+};
+
+/* Updated by calling svga_update_state( SVGA_STATE_HW_DRAW )
+ */
+struct svga_hw_draw_state
+{
+   unsigned rs[RS_MAX];
+   unsigned ts[16][TS_MAX];
+   float cb[PIPE_SHADER_TYPES][CB_MAX][4];
+
+   unsigned shader_id[PIPE_SHADER_TYPES];
+   
+   struct svga_shader_result *fs;
+   struct svga_shader_result *vs;
+   struct svga_hw_view_state views[PIPE_MAX_SAMPLERS];
+
+   unsigned num_views;
+};
+
+
+/* Updated by calling svga_update_state( SVGA_STATE_NEED_SWTNL )
+ */
+struct svga_sw_state
+{
+   unsigned ve_format[PIPE_MAX_ATTRIBS]; /* NEW_VELEMENT */
+
+   /* which parts we need */
+   boolean need_swvfetch;
+   boolean need_pipeline;
+   boolean need_swtnl;
+};
+
+
+/* Queue some state updates (like rss) and submit them to hardware in
+ * a single packet.
+ */
+struct svga_hw_queue;
+
+struct svga_query;
+
+struct svga_context
+{
+   struct pipe_context pipe;
+   struct svga_winsys_context *swc;
+
+   struct {
+      boolean no_swtnl;
+      boolean force_swtnl;
+      boolean use_min_mipmap;
+
+      /* incremented for each shader */
+      unsigned shader_id;
+
+      unsigned disable_shader;
+   } debug;
+
+   struct {
+      struct draw_context *draw;
+      struct vbuf_render *backend;
+      unsigned hw_prim;
+      boolean new_vbuf;
+      boolean new_vdecl;
+   } swtnl;
+
+   struct {
+      unsigned dirty[4];
+
+      unsigned texture_timestamp;
+      unsigned next_fs_id;
+      unsigned next_vs_id;
+
+      /* Internally generated shaders:
+       */
+      unsigned white_fs_id;
+
+      /* 
+       */
+      struct svga_sw_state          sw;
+      struct svga_hw_draw_state     hw_draw;
+      struct svga_hw_clear_state    hw_clear;
+   } state;
+
+   struct svga_state curr;      /* state from the state tracker */
+   unsigned dirty;              /* statechanges since last update_state() */
+
+   struct u_upload_mgr *upload_ib;
+   struct u_upload_mgr *upload_vb;
+   struct svga_hwtnl *hwtnl;
+
+   /** The occlusion query currently in progress */
+   struct svga_query *sq;
+
+   /** List of buffers with queued transfers */
+   struct list_head dirty_buffers;
+};
+
+/* A flag for each state_tracker state object.  Every flag must be its
+ * own single bit so that flags can be combined and tested in a mask: */
+#define SVGA_NEW_BLEND               0x1
+#define SVGA_NEW_DEPTH_STENCIL       0x2
+#define SVGA_NEW_RAST                0x4
+#define SVGA_NEW_SAMPLER             0x8
+#define SVGA_NEW_TEXTURE             0x10
+#define SVGA_NEW_VBUFFER             0x20
+#define SVGA_NEW_VELEMENT            0x40
+#define SVGA_NEW_FS                  0x80
+#define SVGA_NEW_VS                  0x100
+#define SVGA_NEW_FS_CONST_BUFFER     0x200
+#define SVGA_NEW_VS_CONST_BUFFER     0x400
+#define SVGA_NEW_FRAME_BUFFER        0x800
+#define SVGA_NEW_STIPPLE             0x1000
+#define SVGA_NEW_SCISSOR             0x2000
+#define SVGA_NEW_BLEND_COLOR         0x4000
+#define SVGA_NEW_CLIP                0x8000
+#define SVGA_NEW_VIEWPORT            0x10000
+#define SVGA_NEW_PRESCALE            0x20000
+#define SVGA_NEW_REDUCED_PRIMITIVE   0x40000
+#define SVGA_NEW_TEXTURE_BINDING     0x80000
+#define SVGA_NEW_NEED_PIPELINE       0x100000
+#define SVGA_NEW_NEED_SWVFETCH       0x200000
+#define SVGA_NEW_NEED_SWTNL          0x400000
+#define SVGA_NEW_FS_RESULT           0x800000
+#define SVGA_NEW_VS_RESULT           0x1000000
+#define SVGA_NEW_EDGEFLAGS           0x2000000
+#define SVGA_NEW_ZERO_STRIDE         0x4000000
+#define SVGA_NEW_TEXTURE_FLAGS       0x8000000
+
+
+
+
+
+/***********************************************************************
+ * svga_clear.c: 
+ */
+void svga_clear(struct pipe_context *pipe, 
+                unsigned buffers,
+                const float *rgba,
+                double depth,
+                unsigned stencil);
+
+
+/***********************************************************************
+ * svga_screen_texture.c: 
+ */
+void svga_mark_surfaces_dirty(struct svga_context *svga);
+
+
+
+
+void svga_init_state_functions( struct svga_context *svga );
+void svga_init_flush_functions( struct svga_context *svga );
+void svga_init_string_functions( struct svga_context *svga );
+void svga_init_blit_functions(struct svga_context *svga);
+
+void svga_init_blend_functions( struct svga_context *svga );
+void svga_init_depth_stencil_functions( struct svga_context *svga );
+void svga_init_misc_functions( struct svga_context *svga );
+void svga_init_rasterizer_functions( struct svga_context *svga );
+void svga_init_sampler_functions( struct svga_context *svga );
+void svga_init_fs_functions( struct svga_context *svga );
+void svga_init_vs_functions( struct svga_context *svga );
+void svga_init_vertex_functions( struct svga_context *svga );
+void svga_init_constbuffer_functions( struct svga_context *svga );
+void svga_init_draw_functions( struct svga_context *svga );
+void svga_init_query_functions( struct svga_context *svga );
+
+void svga_cleanup_vertex_state( struct svga_context *svga );
+void svga_cleanup_tss_binding( struct svga_context *svga );
+void svga_cleanup_framebuffer( struct svga_context *svga );
+
+void svga_context_flush( struct svga_context *svga,
+                         struct pipe_fence_handle **pfence );
+
+void svga_hwtnl_flush_retry( struct svga_context *svga );
+
+
+/***********************************************************************
+ * Inline conversion functions (better-typed than the macros used
+ * previously).  The cast is valid because 'pipe' is the first member
+ * of struct svga_context, so the two pointers share an address. */
+static INLINE struct svga_context *
+svga_context( struct pipe_context *pipe )
+{
+   return (struct svga_context *)pipe;
+}
+
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_debug.h b/src/gallium/drivers/svga/svga_debug.h
new file mode 100644
index 00000000000..3a3fcd8fae2
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_debug.h
@@ -0,0 +1,75 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_DEBUG_H
+#define SVGA_DEBUG_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+
+#define DEBUG_DMA      0x1
+#define DEBUG_TGSI     0x4
+#define DEBUG_PIPE     0x8
+#define DEBUG_STATE    0x10
+#define DEBUG_SCREEN   0x20
+#define DEBUG_TEX      0x40
+#define DEBUG_SWTNL    0x80
+#define DEBUG_CONSTS   0x100
+#define DEBUG_VIEWPORT 0x200
+#define DEBUG_VIEWS    0x400
+#define DEBUG_PERF     0x800    /* print something when we hit any slow path operation */
+#define DEBUG_FLUSH    0x1000   /* flush after every draw */
+#define DEBUG_SYNC     0x2000   /* sync after every flush */
+#define DEBUG_QUERY    0x4000
+#define DEBUG_CACHE    0x8000
+
+#ifdef DEBUG
+extern int SVGA_DEBUG;
+#define DBSTR(x) x
+#else
+#define SVGA_DEBUG 0
+#define DBSTR(x) ""
+#endif
+/* Print a debug message if 'flag' is set in SVGA_DEBUG; no output without DEBUG. */
+static INLINE void
+SVGA_DBG( unsigned flag, const char *fmt, ... )
+{
+#ifdef DEBUG 
+    if (SVGA_DEBUG & flag)
+    {
+        va_list args;
+        /* Forward the variadic arguments to debug_vprintf(). */
+        va_start( args, fmt );
+        debug_vprintf( fmt, args );
+        va_end( args );
+    }
+#else
+    (void)flag;
+    (void)fmt;
+#endif
+}
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
new file mode 100644
index 00000000000..8db40d0fd57
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -0,0 +1,377 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "svga_context.h"
+#include "svga_draw.h"
+#include "svga_draw_private.h"
+#include "svga_debug.h"
+#include "svga_screen.h"
+#include "svga_screen_buffer.h"
+#include "svga_screen_texture.h"
+#include "svga_winsys.h"
+#include "svga_cmd.h"
+
+/* Allocate a hwtnl object for 'svga'.  Returns NULL if allocation fails. */
+struct svga_hwtnl *svga_hwtnl_create( struct svga_context *svga,
+                                      struct u_upload_mgr *upload_ib,
+                                      struct svga_winsys_context *swc )
+{
+   struct svga_hwtnl *tnl = CALLOC_STRUCT(svga_hwtnl);
+
+   if (tnl != NULL) {
+      /* Only pointers are recorded here.  Neither the upload manager
+       * nor the winsys context is released by svga_hwtnl_destroy(),
+       * so both stay the caller's to destroy. */
+      tnl->svga = svga;
+      tnl->upload_ib = upload_ib;
+      tnl->cmd.swc = swc;
+   }
+
+   /* NULL when the allocation failed. */
+   return tnl;
+}
+/* Drop every buffer reference held by 'hwtnl', then free the object. */
+void svga_hwtnl_destroy( struct svga_hwtnl *hwtnl )
+{
+   int prim, slot, n;
+
+   /* Cached index buffers: every entry of the index cache table. */
+   for (prim = 0; prim < PIPE_PRIM_MAX; prim++) {
+      for (slot = 0; slot < IDX_CACHE_MAX; slot++)
+         pipe_buffer_reference( &hwtnl->index_cache[prim][slot].buffer, NULL );
+   }
+
+   /* Vertex buffers of the current vertex declarations. */
+   for (n = 0; n < hwtnl->cmd.vdecl_count; n++)
+      pipe_buffer_reference(&hwtnl->cmd.vdecl_vb[n], NULL);
+
+   /* Index buffers of primitives that are still queued. */
+   for (n = 0; n < hwtnl->cmd.prim_count; n++)
+      pipe_buffer_reference(&hwtnl->cmd.prim_ib[n], NULL);
+
+   FREE(hwtnl);
+}
+
+/* hw_pv is always PV_FIRST; api_pv is PV_LAST only for flatshade && !first. */
+void svga_hwtnl_set_flatshade( struct svga_hwtnl *hwtnl,
+                               boolean flatshade,
+                               boolean flatshade_first )
+{
+   hwtnl->api_pv = (!flatshade || flatshade_first) ? PV_FIRST : PV_LAST;
+   hwtnl->hw_pv = PV_FIRST;
+}
+/* Store 'mode' as the hwtnl's api_fillmode; nothing is emitted here. */
+void svga_hwtnl_set_unfilled( struct svga_hwtnl *hwtnl,
+                              unsigned mode )
+{
+   hwtnl->api_fillmode = mode;
+}                               
+/* Set the number of vertex declarations to 'count', unreferencing extras. */
+void svga_hwtnl_reset_vdecl( struct svga_hwtnl *hwtnl,
+                             unsigned count )
+{
+   unsigned slot = count;
+
+   /* Declarations may only change while no primitives are queued. */
+   assert(hwtnl->cmd.prim_count == 0);
+
+   /* Slots at or beyond the new count give up their vertex buffer. */
+   for (; slot < hwtnl->cmd.vdecl_count; slot++)
+      pipe_buffer_reference(&hwtnl->cmd.vdecl_vb[slot], NULL);
+
+   hwtnl->cmd.vdecl_count = count;
+}
+
+/* Store vertex declaration 'i' and reference 'vb' as its vertex buffer. */
+void svga_hwtnl_vdecl( struct svga_hwtnl *hwtnl,
+                          unsigned i,
+                          const SVGA3dVertexDecl *decl,
+                          struct pipe_buffer *vb)
+{
+   assert(hwtnl->cmd.prim_count == 0);
+   /* The slot must already exist: see svga_hwtnl_reset_vdecl(). */
+   assert( i < hwtnl->cmd.vdecl_count );
+
+   hwtnl->cmd.vdecl[i] = *decl;
+
+   pipe_buffer_reference(&hwtnl->cmd.vdecl_vb[i],
+                         vb);   
+}
+
+
+/* Emit all queued primitives in one draw command; no-op if none are queued. */
+enum pipe_error
+svga_hwtnl_flush( struct svga_hwtnl *hwtnl )
+{
+   struct svga_winsys_context *swc = hwtnl->cmd.swc;
+   struct svga_context *svga = hwtnl->svga;
+   enum pipe_error ret;
+
+   if (hwtnl->cmd.prim_count) {
+      struct svga_winsys_surface *vb_handle[SVGA3D_INPUTREG_MAX];
+      struct svga_winsys_surface *ib_handle[QSZ];
+      struct svga_winsys_surface *handle;
+      SVGA3dVertexDecl *vdecl;
+      SVGA3dPrimitiveRange *prim;
+      unsigned i;
+      /* Look up every surface first: failing here leaves the queue intact. */
+      for (i = 0; i < hwtnl->cmd.vdecl_count; i++) {
+         handle = svga_buffer_handle(svga, hwtnl->cmd.vdecl_vb[i]);
+         if (handle == NULL)
+            return PIPE_ERROR_OUT_OF_MEMORY;
+
+         vb_handle[i] = handle;
+      }
+
+      for (i = 0; i < hwtnl->cmd.prim_count; i++) {
+         if (hwtnl->cmd.prim_ib[i]) {
+            handle = svga_buffer_handle(svga, hwtnl->cmd.prim_ib[i]);
+            if (handle == NULL)
+               return PIPE_ERROR_OUT_OF_MEMORY;
+         }
+         else
+            handle = NULL;
+
+         ib_handle[i] = handle;
+      }
+
+      /* SVGA_DBG() is a function, so its arguments are evaluated even when
+       * nothing is printed: guard against colour buffer 0 being unset. */
+      SVGA_DBG(DEBUG_DMA, "draw to sid %p, %d prims\n",
+               svga->curr.framebuffer.cbufs[0] ?
+               svga_surface(svga->curr.framebuffer.cbufs[0])->handle : NULL,
+               hwtnl->cmd.prim_count);
+
+      ret = SVGA3D_BeginDrawPrimitives(swc,
+                                       &vdecl, hwtnl->cmd.vdecl_count,
+                                       &prim, hwtnl->cmd.prim_count);
+      if (ret != PIPE_OK)
+         return ret;
+
+      memcpy( vdecl,
+              hwtnl->cmd.vdecl,
+              hwtnl->cmd.vdecl_count * sizeof hwtnl->cmd.vdecl[0]);
+
+      for (i = 0; i < hwtnl->cmd.vdecl_count; i++) {
+         /* Given rangeHint is considered to be relative to indexBias, and 
+          * indexBias varies per primitive, we cannot accurately supply an 
+          * rangeHint when emitting more than one primitive per draw command.
+          */
+         if (hwtnl->cmd.prim_count == 1) {
+            vdecl[i].rangeHint.first = hwtnl->cmd.min_index[0];
+            vdecl[i].rangeHint.last = hwtnl->cmd.max_index[0] + 1;
+         }
+         else {
+            vdecl[i].rangeHint.first = 0;
+            vdecl[i].rangeHint.last = 0;
+         }
+
+         swc->surface_relocation(swc,
+                                 &vdecl[i].array.surfaceId,
+                                 vb_handle[i],
+                                 PIPE_BUFFER_USAGE_GPU_READ);
+      }
+
+      memcpy( prim,
+              hwtnl->cmd.prim,
+              hwtnl->cmd.prim_count * sizeof hwtnl->cmd.prim[0]);
+
+      for (i = 0; i < hwtnl->cmd.prim_count; i++) {
+         swc->surface_relocation(swc,
+                                 &prim[i].indexArray.surfaceId,
+                                 ib_handle[i],
+                                 PIPE_BUFFER_USAGE_GPU_READ);
+         pipe_buffer_reference(&hwtnl->cmd.prim_ib[i], NULL);
+      }
+
+      SVGA_FIFOCommitAll( swc );
+      hwtnl->cmd.prim_count = 0;
+   }
+
+   return PIPE_OK;
+}
+
+
+
+
+
+/***********************************************************************
+ * Internal functions:
+ */
+/* Queue one primitive range (with optional index buffer) for the next flush. */
+enum pipe_error svga_hwtnl_prim( struct svga_hwtnl *hwtnl,
+                                 const SVGA3dPrimitiveRange *range,
+                                 unsigned min_index,
+                                 unsigned max_index,
+                                 struct pipe_buffer *ib )
+{
+   int ret = PIPE_OK;
+   /* Debug builds validate the range against the vertex and index buffers. */
+#ifdef DEBUG
+   {
+      unsigned i;
+      for (i = 0; i < hwtnl->cmd.vdecl_count; i++) {
+         struct pipe_buffer *vb = hwtnl->cmd.vdecl_vb[i];
+         unsigned size = vb ? vb->size : 0;
+         unsigned offset = hwtnl->cmd.vdecl[i].array.offset;
+         unsigned stride = hwtnl->cmd.vdecl[i].array.stride;
+         unsigned index_bias = range->indexBias;
+         unsigned width;
+         /* NOTE(review): index_bias is unsigned, so 'index_bias >= 0' always holds. */
+         assert(vb);
+         assert(size);
+         assert(offset < size);
+         assert(index_bias >= 0);
+         assert(min_index <= max_index);
+         assert(offset + index_bias*stride < size);
+         assert(offset + (index_bias + min_index)*stride < size);
+         /* Byte width of one element of this declaration's type. */
+         switch (hwtnl->cmd.vdecl[i].identity.type) {
+         case SVGA3D_DECLTYPE_FLOAT1:
+            width = 4;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT2:
+            width = 4*2;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT3:
+            width = 4*3;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT4:
+            width = 4*4;
+            break;
+         case SVGA3D_DECLTYPE_D3DCOLOR:
+            width = 4;
+            break;
+         case SVGA3D_DECLTYPE_UBYTE4:
+            width = 1*4;
+            break;
+         case SVGA3D_DECLTYPE_SHORT2:
+            width = 2*2;
+            break;
+         case SVGA3D_DECLTYPE_SHORT4:
+            width = 2*4;
+            break;
+         case SVGA3D_DECLTYPE_UBYTE4N:
+            width = 1*4;
+            break;
+         case SVGA3D_DECLTYPE_SHORT2N:
+            width = 2*2;
+            break;
+         case SVGA3D_DECLTYPE_SHORT4N:
+            width = 2*4;
+            break;
+         case SVGA3D_DECLTYPE_USHORT2N:
+            width = 2*2;
+            break;
+         case SVGA3D_DECLTYPE_USHORT4N:
+            width = 2*4;
+            break;
+         case SVGA3D_DECLTYPE_UDEC3:
+            width = 4;
+            break;
+         case SVGA3D_DECLTYPE_DEC3N:
+            width = 4;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT16_2:
+            width = 2*2;
+            break;
+         case SVGA3D_DECLTYPE_FLOAT16_4:
+            width = 2*4;
+            break;
+         default:
+            assert(0);
+            width = 0;
+            break;
+         }
+         /* Zero stride is accepted; the last element read must end inside the buffer. */
+         assert(!stride || width <= stride);
+         assert(offset + (index_bias + max_index)*stride + width <= size);
+      }
+
+      assert(range->indexWidth == range->indexArray.stride);
+
+      if(ib) {
+         unsigned size = ib->size;
+         unsigned offset = range->indexArray.offset;
+         unsigned stride = range->indexArray.stride;
+         unsigned count;
+
+         assert(size);
+         assert(offset < size);
+         assert(stride);
+         /* Number of indices that the primitive range consumes. */
+         switch (range->primType) {
+         case SVGA3D_PRIMITIVE_POINTLIST:
+            count = range->primitiveCount;
+            break;
+         case SVGA3D_PRIMITIVE_LINELIST:
+            count = range->primitiveCount * 2;
+            break;
+         case SVGA3D_PRIMITIVE_LINESTRIP:
+            count = range->primitiveCount + 1;
+            break;
+         case SVGA3D_PRIMITIVE_TRIANGLELIST:
+            count = range->primitiveCount * 3;
+            break;
+         case SVGA3D_PRIMITIVE_TRIANGLESTRIP:
+            count = range->primitiveCount + 2;
+            break;
+         case SVGA3D_PRIMITIVE_TRIANGLEFAN:
+            count = range->primitiveCount + 2;
+            break;
+         default:
+            assert(0);
+            count = 0;
+            break;
+         }
+
+         assert(offset + count*stride <= size);
+      }
+   }
+#endif
+   /* Flush first when the queue is within one entry of QSZ. */
+   if (hwtnl->cmd.prim_count+1 >= QSZ) {
+      ret = svga_hwtnl_flush( hwtnl );
+      if (ret != PIPE_OK)
+         return ret;
+   }
+   
+   /* min/max indices are relative to bias */
+   hwtnl->cmd.min_index[hwtnl->cmd.prim_count] = min_index;
+   hwtnl->cmd.max_index[hwtnl->cmd.prim_count] = max_index;
+
+   hwtnl->cmd.prim[hwtnl->cmd.prim_count] = *range;
+   /* The queue entry references 'ib' (which may be NULL) until it is flushed. */
+   pipe_buffer_reference(&hwtnl->cmd.prim_ib[hwtnl->cmd.prim_count], ib);
+   hwtnl->cmd.prim_count++;
+
+   return ret;
+}
diff --git a/src/gallium/drivers/svga/svga_draw.h b/src/gallium/drivers/svga/svga_draw.h
new file mode 100644
index 00000000000..14553b17b58
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw.h
@@ -0,0 +1,83 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_DRAW_H
+#define SVGA_DRAW_H
+
+#include "pipe/p_compiler.h"
+
+#include "svga_hw_reg.h"
+
+struct svga_hwtnl;
+struct svga_winsys_context;
+struct svga_screen;
+struct svga_context;
+struct pipe_buffer;
+struct u_upload_mgr;
+
+struct svga_hwtnl *svga_hwtnl_create( struct svga_context *svga,
+                                      struct u_upload_mgr *upload_ib,
+                                      struct svga_winsys_context *swc );
+
+void svga_hwtnl_destroy( struct svga_hwtnl *hwtnl );
+
+void svga_hwtnl_set_flatshade( struct svga_hwtnl *hwtnl,
+                               boolean flatshade,
+                               boolean flatshade_first );
+
+void svga_hwtnl_set_unfilled( struct svga_hwtnl *hwtnl,
+                              unsigned mode );
+
+void svga_hwtnl_vdecl( struct svga_hwtnl *hwtnl,
+                       unsigned i,
+                       const SVGA3dVertexDecl *decl,
+                       struct pipe_buffer *vb);
+
+void svga_hwtnl_reset_vdecl( struct svga_hwtnl *hwtnl,
+                             unsigned count );
+
+/* Draw non-indexed vertices; an index buffer is generated when needed. */
+enum pipe_error 
+svga_hwtnl_draw_arrays( struct svga_hwtnl *hwtnl,
+                        unsigned prim, 
+                        unsigned start, 
+                        unsigned count);
+/* Draw indexed primitives; indices are translated unless usable as-is. */
+enum pipe_error
+svga_hwtnl_draw_range_elements( struct svga_hwtnl *hwtnl,
+                                struct pipe_buffer *indexBuffer,
+                                unsigned index_size,
+                                unsigned min_index,
+                                unsigned max_index,
+                                unsigned prim, 
+                                unsigned start, 
+                                unsigned count,
+                                unsigned bias );
+
+enum pipe_error
+svga_hwtnl_flush( struct svga_hwtnl *hwtnl );
+
+
+#endif /* SVGA_DRAW_H */
diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c
new file mode 100644
index 00000000000..75492dffca2
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw_arrays.c
@@ -0,0 +1,297 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_inlines.h"
+#include "util/u_prim.h"
+#include "indices/u_indices.h"
+
+#include "svga_hw_reg.h"
+#include "svga_draw.h"
+#include "svga_draw_private.h"
+#include "svga_context.h"
+
+
+#define DBG 0
+
+
+
+
+/**
+ * Allocate a fresh index buffer and fill it by running a generator.
+ *
+ * \param hwtnl       supplies the pipe_screen used for every buffer call
+ * \param nr          number of indices, passed straight to \p generate
+ * \param index_size  the new buffer is index_size * nr bytes large
+ * \param generate    callback, invoked as generate(nr, mapping)
+ * \param out_buf     receives the new buffer; written only on success
+ * \return PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if create or map fails
+ */
+static enum pipe_error generate_indices( struct svga_hwtnl *hwtnl,
+                                         unsigned nr,
+                                         unsigned index_size,
+                                         u_generate_func generate,
+                                         struct pipe_buffer **out_buf )
+{
+   struct pipe_screen *screen = hwtnl->svga->pipe.screen;
+   const unsigned usage = (PIPE_BUFFER_USAGE_INDEX |
+                           PIPE_BUFFER_USAGE_CPU_WRITE |
+                           PIPE_BUFFER_USAGE_GPU_READ);
+   struct pipe_buffer *ibuf;
+   void *map;
+
+   ibuf = screen->buffer_create( screen, 32, usage, index_size * nr );
+   if (!ibuf)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   map = pipe_buffer_map( screen, ibuf, PIPE_BUFFER_USAGE_CPU_WRITE );
+   if (!map) {
+      /* Nothing is mapped at this point, so only the buffer needs undoing. */
+      screen->buffer_destroy( ibuf );
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   generate( nr, map );
+   pipe_buffer_unmap( screen, ibuf );
+
+   *out_buf = ibuf;
+   return PIPE_OK;
+}
+
+/* Cache-hit test: a reusable index run may be longer than requested,
+ * every other generator type must match the count exactly.
+ */
+static boolean compare( unsigned cached_nr, unsigned nr, unsigned type )
+{
+   const boolean reusable = (type == U_GENERATE_REUSABLE);
+
+   return reusable ? (cached_nr >= nr) : (cached_nr == nr);
+}
+
+/**
+ * Look up, or create and remember, the index buffer that results from
+ * running \p generate for \p gen_nr indices.
+ *
+ * The cache row hwtnl->index_cache[prim] is searched for an entry made by
+ * the same generator, and compare() decides whether it can be returned.
+ * A too-short U_GENERATE_REUSABLE entry is released and its slot refilled.
+ * When no entry matches, an empty slot, or else the slot holding the
+ * fewest indices, is (re)used for the newly generated buffer.
+ *
+ * \param prim      selects the cache row
+ * \param gen_type  U_GENERATE_x class of \p generate
+ * \param gen_nr    number of indices to generate
+ * \param gen_size  size in bytes of one index
+ * \param generate  generator callback; also acts as the cache key
+ * \param out_buf   receives a reference to the cached or new buffer
+ * \return PIPE_OK, or the error from generate_indices()
+ */
+static enum pipe_error retrieve_or_generate_indices( struct svga_hwtnl *hwtnl,
+                                                     unsigned prim,
+                                                     unsigned gen_type,
+                                                     unsigned gen_nr,
+                                                     unsigned gen_size,
+                                                     u_generate_func generate,
+                                                     struct pipe_buffer **out_buf )
+{
+   struct index_cache *cache = hwtnl->index_cache[prim];
+   enum pipe_error ret = PIPE_OK;
+   int i;
+
+   for (i = 0; i < IDX_CACHE_MAX; i++) {
+      if (cache[i].buffer == NULL || cache[i].generate != generate)
+         continue;
+
+      if (compare(cache[i].gen_nr, gen_nr, gen_type)) {
+         pipe_buffer_reference( out_buf, cache[i].buffer );
+         if (DBG)
+            debug_printf("%s retrieve %d/%d\n", __FUNCTION__, i, gen_nr);
+         return PIPE_OK;
+      }
+
+      if (gen_type == U_GENERATE_REUSABLE) {
+         /* Too short to reuse: release it and regenerate into slot i. */
+         pipe_buffer_reference( &cache[i].buffer, NULL );
+         if (DBG)
+            debug_printf("%s discard %d/%d\n", __FUNCTION__,
+                         i, cache[i].gen_nr);
+         break;
+      }
+   }
+
+   if (i == IDX_CACHE_MAX) {
+      unsigned smallest = 0;
+      unsigned smallest_size = ~0;
+
+      /* The scan stops as soon as the best size so far has dropped to 0. */
+      for (i = 0; i < IDX_CACHE_MAX && smallest_size; i++) {
+         if (cache[i].buffer == NULL) {
+            smallest = i;
+            smallest_size = 0;
+         }
+         else if (cache[i].gen_nr < smallest_size) {
+            /* Compare against the best size so far, not the slot index. */
+            smallest = i;
+            smallest_size = cache[i].gen_nr;
+         }
+      }
+
+      assert (smallest != IDX_CACHE_MAX);
+      pipe_buffer_reference( &cache[smallest].buffer, NULL );
+      if (DBG)
+         debug_printf("%s discard smallest %d/%d\n", __FUNCTION__,
+                      smallest, smallest_size);
+      i = smallest;
+   }
+
+   ret = generate_indices( hwtnl, gen_nr, gen_size, generate, out_buf );
+   if (ret != PIPE_OK)
+      return ret;
+
+   /* The cache takes its own reference in addition to the caller's. */
+   cache[i].generate = generate;
+   cache[i].gen_nr = gen_nr;
+   pipe_buffer_reference( &cache[i].buffer, *out_buf );
+
+   if (DBG)
+      debug_printf("%s cache %d/%d\n", __FUNCTION__, i, cache[i].gen_nr);
+
+   return PIPE_OK;
+}
+
+
+
+/* Queue a non-indexed draw of `count` vertices beginning at `start`.
+ * Returns PIPE_ERROR_BAD_INPUT if the count yields no hardware primitives.
+ */
+static enum pipe_error
+simple_draw_arrays( struct svga_hwtnl *hwtnl,
+                    unsigned prim, unsigned start, unsigned count )
+{
+   SVGA3dPrimitiveRange r;
+   unsigned nr_prims;
+   const unsigned type = svga_translate_prim(prim, count, &nr_prims);
+
+   if (!nr_prims)
+      return PIPE_ERROR_BAD_INPUT;
+
+   /* No index array is attached; `start` is carried as the index bias. */
+   r.indexArray.surfaceId = SVGA3D_INVALID_ID;
+   r.indexArray.offset = 0;
+   r.indexArray.stride = 0;
+   r.indexWidth = 0;
+   r.indexBias = start;
+   r.primitiveCount = nr_prims;
+   r.primType = type;
+
+   /* min_index/max_index are stated before the bias is applied, hence
+    * 0 .. count - 1; consumers adjust them by r.indexBias themselves. */
+   return svga_hwtnl_prim( hwtnl, &r, 0, count - 1, NULL );
+}
+
+
+
+
+
+
+
+
+
+
+/**
+ * Draw \p count consecutive vertices starting at \p start.
+ *
+ * A u_indices generator is selected first: u_unfilled_generator() when
+ * the API fill mode is not PIPE_POLYGON_MODE_FILL and \p prim is
+ * PIPE_PRIM_TRIANGLES or a later enum value, u_index_generator()
+ * otherwise.  A U_GENERATE_LINEAR result is queued without indices.
+ * Any other result is drawn through a generated index buffer, taken
+ * from the per-primitive cache when possible, with \p start passed
+ * as the index bias.
+ *
+ * \param hwtnl  draw module state (fill mode, api_pv/hw_pv, index cache)
+ * \param prim   PIPE_PRIM_x type requested by the caller
+ * \param start  first vertex
+ * \param count  number of vertices
+ * \return PIPE_OK, or the first error reported by a helper.
+ */
+enum pipe_error
+svga_hwtnl_draw_arrays( struct svga_hwtnl *hwtnl,
+                        unsigned prim,
+                        unsigned start,
+                        unsigned count)
+{
+   const boolean unfilled =
+      (hwtnl->api_fillmode != PIPE_POLYGON_MODE_FILL &&
+       prim >= PIPE_PRIM_TRIANGLES);
+   struct pipe_buffer *indices = NULL;
+   unsigned out_prim, idx_size, idx_nr, kind;
+   u_generate_func func;
+   enum pipe_error ret;
+
+   if (unfilled) {
+      kind = u_unfilled_generator( prim, start, count,
+                                   hwtnl->api_fillmode,
+                                   &out_prim, &idx_size, &idx_nr,
+                                   &func );
+   }
+   else {
+      kind = u_index_generator( svga_hw_prims,
+                                prim, start, count,
+                                hwtnl->api_pv, hwtnl->hw_pv,
+                                &out_prim, &idx_size, &idx_nr,
+                                &func );
+   }
+
+   /* LINEAR result: draw directly, without any index buffer. */
+   if (kind == U_GENERATE_LINEAR)
+      return simple_draw_arrays( hwtnl, out_prim, start, count );
+
+   /* Need to draw as indexed primitive.  The cache row is selected by
+    * the incoming prim; the generator only runs on a cache miss.
+    */
+   ret = retrieve_or_generate_indices( hwtnl, prim, kind,
+                                       idx_nr, idx_size, func,
+                                       &indices );
+   if (!ret) {
+      /* min/max index are given as 0 .. count - 1 and `start` is
+       * passed as the bias; the index buffer is read from offset 0.
+       */
+      ret = svga_hwtnl_simple_draw_range_elements( hwtnl,
+                                                   indices, idx_size,
+                                                   0, count - 1,
+                                                   out_prim,
+                                                   0, idx_nr,
+                                                   start );
+   }
+
+   /* Release the local reference; the cache holds its own. */
+   if (indices)
+      pipe_buffer_reference( &indices, NULL );
+
+   return ret;
+}
+
diff --git a/src/gallium/drivers/svga/svga_draw_elements.c b/src/gallium/drivers/svga/svga_draw_elements.c
new file mode 100644
index 00000000000..167d8178315
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw_elements.c
@@ -0,0 +1,255 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "util/u_prim.h"
+#include "util/u_upload_mgr.h"
+#include "indices/u_indices.h"
+
+#include "svga_cmd.h"
+#include "svga_draw.h"
+#include "svga_draw_private.h"
+#include "svga_screen_buffer.h"
+#include "svga_winsys.h"
+#include "svga_context.h"
+
+#include "svga_hw_reg.h"
+
+
+/**
+ * Build a new index buffer by running a translate function over part of
+ * an existing one.
+ * \param hwtnl       supplies the pipe_screen used for every buffer call
+ * \param src         buffer that is mapped for reading as the input
+ * \param offset      byte offset added to the mapping of \p src
+ * \param nr          count handed to \p translate
+ * \param index_size  the new buffer is index_size * nr bytes large
+ * \param translate   callback, invoked as translate(in, nr, out)
+ * \param out_buf     receives the new buffer; written only on success
+ * \return PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY on any create/map failure
+ */
+static enum pipe_error
+translate_indices( struct svga_hwtnl *hwtnl,
+                   struct pipe_buffer *src,
+                   unsigned offset,
+                   unsigned nr,
+                   unsigned index_size,
+                   u_translate_func translate,
+                   struct pipe_buffer **out_buf )
+{
+   struct pipe_screen *screen = hwtnl->svga->pipe.screen;
+   const unsigned usage = (PIPE_BUFFER_USAGE_INDEX |
+                           PIPE_BUFFER_USAGE_CPU_WRITE |
+                           PIPE_BUFFER_USAGE_GPU_READ);
+   struct pipe_buffer *out;
+   const void *in_map;
+   void *out_map;
+
+   out = screen->buffer_create( screen, 32, usage, index_size * nr );
+   if (!out)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   in_map = pipe_buffer_map( screen, src, PIPE_BUFFER_USAGE_CPU_READ );
+   if (!in_map) {
+      screen->buffer_destroy( out );
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   out_map = pipe_buffer_map( screen, out, PIPE_BUFFER_USAGE_CPU_WRITE );
+   if (!out_map) {
+      screen->buffer_unmap( screen, src );
+      screen->buffer_destroy( out );
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   translate( (const char *)in_map + offset, nr, out_map );
+   pipe_buffer_unmap( screen, src );
+   pipe_buffer_unmap( screen, out );
+
+   *out_buf = out;
+   return PIPE_OK;
+}
+
+
+
+
+
+/**
+ * Queue one indexed range whose indices need no further translation.
+ * A user index buffer is first pushed through hwtnl->upload_ib; the draw
+ * then uses the buffer and offset reported by the upload manager.
+ *
+ * \param index_size  bytes per index (becomes stride and indexWidth)
+ * \param start       first index to draw, in units of indices
+ * \param count       number of indices to draw
+ * \param bias        becomes the range's indexBias
+ * \return PIPE_OK (also when \p count forms no primitive and nothing is
+ *         queued), else the error from the upload or svga_hwtnl_prim().
+ */
+enum pipe_error
+svga_hwtnl_simple_draw_range_elements( struct svga_hwtnl *hwtnl,
+                                       struct pipe_buffer *index_buffer,
+                                       unsigned index_size,
+                                       unsigned min_index,
+                                       unsigned max_index,
+                                       unsigned prim,
+                                       unsigned start,
+                                       unsigned count,
+                                       unsigned bias )
+{
+   struct pipe_buffer *uploaded = NULL;
+   SVGA3dPrimitiveRange r;
+   unsigned offset = start * index_size;
+   unsigned nr_prims;
+   const unsigned type = svga_translate_prim(prim, count, &nr_prims);
+   int ret = PIPE_OK;
+
+   if (nr_prims == 0)
+      return ret;
+
+   if (index_buffer && svga_buffer_is_user_buffer(index_buffer)) {
+      assert( index_buffer->size >= offset + count * index_size );
+      ret = u_upload_buffer( hwtnl->upload_ib, offset, count * index_size,
+                             index_buffer, &offset, &uploaded );
+
+      /* index_buffer is a plain parameter, not a counted reference, so
+       * redirecting it is safe; the caller keeps its own reference. */
+      if (!ret)
+         index_buffer = uploaded;
+   }
+
+   if (!ret) {
+      r.primType = type;
+      r.primitiveCount = nr_prims;
+      r.indexArray.offset = offset;
+      r.indexArray.stride = index_size;
+      r.indexWidth = index_size;
+      r.indexBias = bias;
+      ret = svga_hwtnl_prim( hwtnl, &r, min_index, max_index, index_buffer );
+   }
+
+   /* Drop the upload reference, whether or not the draw was queued. */
+   if (uploaded)
+      pipe_buffer_reference( &uploaded, NULL );
+
+   return ret;
+}
+
+
+
+
+/**
+ * Draw \p count indices of \p index_buffer, beginning at index \p start.
+ *
+ * A u_indices translator is selected first: u_unfilled_translator()
+ * when the API fill mode is not PIPE_POLYGON_MODE_FILL and \p prim is
+ * PIPE_PRIM_TRIANGLES or a later enum value, u_index_translator()
+ * otherwise.  A U_TRANSLATE_MEMCPY result means the indices are passed
+ * to the hardware path unchanged; for any other result a temporary
+ * index buffer is created, filled by the translate function and drawn
+ * from its beginning.
+ *
+ * \param index_buffer  source indices
+ * \param index_size    bytes per source index
+ * \param min_index     forwarded unchanged to the hardware path
+ * \param max_index     forwarded unchanged to the hardware path
+ * \param prim          PIPE_PRIM_x type requested by the caller
+ * \param start         first index to use, in units of indices
+ * \param count         number of indices
+ * \param bias          forwarded unchanged as the index bias
+ * \return PIPE_OK, or the first error reported by a helper.
+ */
+enum pipe_error
+svga_hwtnl_draw_range_elements( struct svga_hwtnl *hwtnl,
+                                struct pipe_buffer *index_buffer,
+                                unsigned index_size,
+                                unsigned min_index,
+                                unsigned max_index,
+                                unsigned prim, unsigned start, unsigned count,
+                                unsigned bias)
+{
+   const boolean unfilled =
+      (hwtnl->api_fillmode != PIPE_POLYGON_MODE_FILL &&
+       prim >= PIPE_PRIM_TRIANGLES);
+   struct pipe_buffer *translated = NULL;
+   unsigned out_prim, out_size, out_nr, kind;
+   u_translate_func func;
+   enum pipe_error ret;
+
+   if (unfilled) {
+      kind = u_unfilled_translator( prim, index_size, count,
+                                    hwtnl->api_fillmode,
+                                    &out_prim, &out_size, &out_nr,
+                                    &func );
+   }
+   else {
+      kind = u_index_translator( svga_hw_prims,
+                                 prim, index_size, count,
+                                 hwtnl->api_pv, hwtnl->hw_pv,
+                                 &out_prim, &out_size, &out_nr,
+                                 &func );
+   }
+
+   /* No need for translation, just pass through to hardware: */
+   if (kind == U_TRANSLATE_MEMCPY)
+      return svga_hwtnl_simple_draw_range_elements( hwtnl, index_buffer,
+                                                    index_size,
+                                                    min_index, max_index,
+                                                    out_prim,
+                                                    start, count, bias );
+
+   /* Need to allocate a new index buffer and run the translate func
+    * to populate it.  Could potentially cache this translated index
+    * buffer with the original to avoid future re-translations.  Not
+    * much point if we're just accelerating GL though, as index
+    * buffers are typically used only once there.
+    */
+   ret = translate_indices( hwtnl, index_buffer,
+                            start * index_size,
+                            out_nr, out_size, func,
+                            &translated );
+   if (!ret) {
+      /* Translation already started reading at `start`, so the new
+       * buffer is drawn from index 0 with the translator's count/size.
+       */
+      ret = svga_hwtnl_simple_draw_range_elements( hwtnl,
+                                                   translated, out_size,
+                                                   min_index, max_index,
+                                                   out_prim,
+                                                   0, out_nr,
+                                                   bias );
+   }
+
+   /* Drop this function's reference to the temporary buffer. */
+   if (translated)
+      pipe_buffer_reference( &translated, NULL );
+
+   return ret;
+}
+
+
+
+
+
diff --git a/src/gallium/drivers/svga/svga_draw_private.h b/src/gallium/drivers/svga/svga_draw_private.h
new file mode 100644
index 00000000000..9aa40e16642
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_draw_private.h
@@ -0,0 +1,158 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_DRAW_H_
+#define SVGA_DRAW_H_
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+#include "indices/u_indices.h"
+#include "svga_hw_reg.h"
+#include "svga3d_shaderdefs.h"
+
+struct svga_context;
+struct u_upload_mgr;
+
+/* PIPE_PRIM_x bitmask passed to the u_indices helpers; the same six types
+ * svga_translate_prim() maps to SVGA3D primitives.  Should include polygon? */
+static const unsigned svga_hw_prims = 
+   ((1 << PIPE_PRIM_POINTS) |
+    (1 << PIPE_PRIM_LINES) |
+    (1 << PIPE_PRIM_LINE_STRIP) |
+    (1 << PIPE_PRIM_TRIANGLES) |
+    (1 << PIPE_PRIM_TRIANGLE_STRIP) |
+    (1 << PIPE_PRIM_TRIANGLE_FAN));
+
+
+/* Map a PIPE_PRIM_x mode to its SVGA3D_PRIMITIVE_x type.  *out_count gets
+ * the number of primitives `count` vertices form: 0 when there are too few
+ * vertices or the mode is unsupported (callers bail out on a 0 count).
+ * Returns the SVGA3D type, or 0 for an unsupported mode.
+ */
+static INLINE unsigned svga_translate_prim(unsigned mode,
+                                           unsigned count,
+                                           unsigned *out_count)
+{
+   switch (mode) {
+   case PIPE_PRIM_POINTS:
+      *out_count = count;
+      return SVGA3D_PRIMITIVE_POINTLIST;
+   case PIPE_PRIM_LINES:
+      *out_count = count / 2;
+      return SVGA3D_PRIMITIVE_LINELIST;
+   case PIPE_PRIM_LINE_STRIP:
+      /* Guard the subtraction: count is unsigned and would wrap around. */
+      *out_count = count > 1 ? count - 1 : 0;
+      return SVGA3D_PRIMITIVE_LINESTRIP;
+   case PIPE_PRIM_TRIANGLES:
+      *out_count = count / 3;
+      return SVGA3D_PRIMITIVE_TRIANGLELIST;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      *out_count = count > 2 ? count - 2 : 0;
+      return SVGA3D_PRIMITIVE_TRIANGLESTRIP;
+   case PIPE_PRIM_TRIANGLE_FAN:
+      *out_count = count > 2 ? count - 2 : 0;
+      return SVGA3D_PRIMITIVE_TRIANGLEFAN;
+   default:
+      assert(0);
+      *out_count = 0;
+      return 0;
+   }
+}
+
+
+struct index_cache {   /* One slot of the generated-index-buffer cache */
+   u_generate_func generate;   /* generator recorded for `buffer`; lookup key */
+   unsigned gen_nr;            /* index count `buffer` was generated for */
+
+   /* NULL marks an unused slot.  If non-null, this buffer is filled by
+    * calling generate(nr, map(buffer))
+    */
+   struct pipe_buffer *buffer;
+};
+
+#define QSZ 32
+
+struct draw_cmd {   /* Draw state accumulated until the next flush */
+   struct svga_winsys_context *swc;
+   /* Vertex declarations; vdecl_vb presumably holds each one's buffer: */
+   SVGA3dVertexDecl vdecl[SVGA3D_INPUTREG_MAX];
+   struct pipe_buffer *vdecl_vb[SVGA3D_INPUTREG_MAX];
+   unsigned vdecl_count;
+   /* Queued ranges: parallel arrays, entries [0, prim_count) are valid. */
+   SVGA3dPrimitiveRange prim[QSZ];
+   struct pipe_buffer *prim_ib[QSZ];   /* index buffer, NULL if non-indexed */
+   unsigned prim_count;
+   unsigned min_index[QSZ];            /* relative to prim[i].indexBias */
+   unsigned max_index[QSZ];            /* relative to prim[i].indexBias */
+};
+
+#define IDX_CACHE_MAX  8
+
+struct svga_hwtnl {
+   struct svga_context *svga;
+   struct u_upload_mgr *upload_ib;   /* used to upload user index buffers */
+
+   /* Flatshade and fill-mode state: api_pv/hw_pv are handed to the
+    * u_indices helpers, api_fillmode selects the unfilled path. */
+   unsigned api_pv;
+   unsigned hw_pv;
+   unsigned api_fillmode;
+
+   /* Cache the results of running a particular generate func on each
+    * primitive type.
+    */
+   struct index_cache index_cache[PIPE_PRIM_MAX][IDX_CACHE_MAX];
+
+   /* Try to build the maximal draw command packet before emitting:
+    */
+   struct draw_cmd cmd;
+};
+
+
+
+/***********************************************************************
+ * Internal functions
+ */
+enum pipe_error 
+svga_hwtnl_prim( struct svga_hwtnl *hwtnl,
+                 const SVGA3dPrimitiveRange *range,
+                 unsigned min_index,   /* relative to range->indexBias */
+                 unsigned max_index,   /* relative to range->indexBias */
+                 struct pipe_buffer *ib );   /* NULL for non-indexed ranges */
+/* Queue an indexed draw whose indices need no translation. */
+enum pipe_error
+svga_hwtnl_simple_draw_range_elements( struct svga_hwtnl *hwtnl,
+                                       struct pipe_buffer *indexBuffer,
+                                       unsigned index_size,
+                                       unsigned min_index,
+                                       unsigned max_index,
+                                       unsigned prim, 
+                                       unsigned start,
+                                       unsigned count,
+                                       unsigned bias );
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_hw_reg.h b/src/gallium/drivers/svga/svga_hw_reg.h
new file mode 100644
index 00000000000..183f4b918e0
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_hw_reg.h
@@ -0,0 +1,42 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_HW_REG_H
+#define SVGA_HW_REG_H
+
+#include "pipe/p_compiler.h"
+
+#if defined(PIPE_CC_GCC)
+#ifndef HAVE_STDINT_H
+#define HAVE_STDINT_H
+#endif
+#endif
+
+#include "svga_types.h"
+
+#include "svga3d_reg.h"
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c
new file mode 100644
index 00000000000..855d228755f
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_blend.c
@@ -0,0 +1,246 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+
+#include "svga_hw_reg.h"
+
+
+/* Map a PIPE_BLENDFACTOR_x value onto the matching SVGA3D_BLENDOP_x code. */
+static INLINE unsigned svga_translate_blend_factor(unsigned factor)
+{
+   unsigned op = SVGA3D_BLENDOP_ZERO;   /* also the result for unknown input */
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ZERO:               op = SVGA3D_BLENDOP_ZERO;           break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:          op = SVGA3D_BLENDOP_SRCALPHA;       break;
+   case PIPE_BLENDFACTOR_ONE:                op = SVGA3D_BLENDOP_ONE;            break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:          op = SVGA3D_BLENDOP_SRCCOLOR;       break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:      op = SVGA3D_BLENDOP_INVSRCCOLOR;    break;
+   case PIPE_BLENDFACTOR_DST_COLOR:          op = SVGA3D_BLENDOP_DESTCOLOR;      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:      op = SVGA3D_BLENDOP_INVDESTCOLOR;   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:      op = SVGA3D_BLENDOP_INVSRCALPHA;    break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:          op = SVGA3D_BLENDOP_DESTALPHA;      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:      op = SVGA3D_BLENDOP_INVDESTALPHA;   break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: op = SVGA3D_BLENDOP_SRCALPHASAT;    break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:        op = SVGA3D_BLENDOP_BLENDFACTOR;    break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:    op = SVGA3D_BLENDOP_INVBLENDFACTOR; break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:        op = SVGA3D_BLENDOP_BLENDFACTOR;    break; /* ? */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:    op = SVGA3D_BLENDOP_INVBLENDFACTOR; break; /* ? */
+   default: assert(0); break;   /* unhandled blend factor */
+   }
+   return op;
+}
+
+/* Map a PIPE_BLEND_x equation onto the matching SVGA3D_BLENDEQ_x code. */
+static INLINE unsigned svga_translate_blend_func(unsigned mode)
+{
+   unsigned eq = SVGA3D_BLENDEQ_ADD;   /* also the result for unknown input */
+   switch (mode) {
+   case PIPE_BLEND_ADD:              eq = SVGA3D_BLENDEQ_ADD;         break;
+   case PIPE_BLEND_SUBTRACT:         eq = SVGA3D_BLENDEQ_SUBTRACT;    break;
+   case PIPE_BLEND_REVERSE_SUBTRACT: eq = SVGA3D_BLENDEQ_REVSUBTRACT; break;
+   case PIPE_BLEND_MIN:              eq = SVGA3D_BLENDEQ_MINIMUM;     break;
+   case PIPE_BLEND_MAX:              eq = SVGA3D_BLENDEQ_MAXIMUM;     break;
+   default: assert(0); break;   /* unhandled blend equation */
+   }
+   return eq;
+}
+
+
+/* Build the driver's blend state from a gallium blend template; NULL on OOM. */
+static void *svga_create_blend_state(struct pipe_context *pipe,
+                                     const struct pipe_blend_state *templ)
+{
+   struct svga_blend_state *blend = CALLOC_STRUCT( svga_blend_state );
+   unsigned i;
+
+   if (!blend)
+      return NULL;
+
+   /* Fill in the per-rendertarget blend state.  We currently only
+    * have one rendertarget. */
+   for (i = 0; i < 1; i++) {
+      /* No way to set this in SVGA3D, and no way to correctly implement it on
+       * top of D3D9 API.  Instead we try to simulate with various blend modes. */
+      if (templ->logicop_enable) {
+         switch (templ->logicop_func) {
+         case PIPE_LOGICOP_XOR:
+            blend->need_white_fragments = TRUE;
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_ONE;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ONE;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_SUBTRACT;
+            break;
+         case PIPE_LOGICOP_CLEAR:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_ZERO;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ZERO;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MINIMUM;
+            break;
+         case PIPE_LOGICOP_COPY:
+            blend->rt[i].blend_enable = FALSE;
+            break;
+         case PIPE_LOGICOP_COPY_INVERTED:
+            blend->rt[i].blend_enable   = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_INVSRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ZERO;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_ADD;
+            break;
+         case PIPE_LOGICOP_NOOP:
+            blend->rt[i].blend_enable   = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_ZERO;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_ADD;
+            break;
+         case PIPE_LOGICOP_SET:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_ONE;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ONE;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MAXIMUM;
+            break;
+         case PIPE_LOGICOP_INVERT:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_INVSRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_ZERO;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_ADD;
+            break;
+         case PIPE_LOGICOP_AND:
+            /* Approximate with minimum - works for the 0 & anything case: */
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_SRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MINIMUM;
+            break;
+         case PIPE_LOGICOP_AND_REVERSE:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_SRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_INVDESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MINIMUM;
+            break;
+         case PIPE_LOGICOP_AND_INVERTED:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_INVSRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MINIMUM;
+            break;
+         case PIPE_LOGICOP_OR:
+            /* Approximate with maximum - works for the 1 | anything case: */
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_SRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MAXIMUM;
+            break;
+         case PIPE_LOGICOP_OR_REVERSE:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_SRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_INVDESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MAXIMUM;
+            break;
+         case PIPE_LOGICOP_OR_INVERTED:
+            blend->rt[i].blend_enable = TRUE;
+            blend->rt[i].srcblend       = SVGA3D_BLENDOP_INVSRCCOLOR;
+            blend->rt[i].dstblend       = SVGA3D_BLENDOP_DESTCOLOR;
+            blend->rt[i].blendeq        = SVGA3D_BLENDEQ_MAXIMUM;
+            break;
+         case PIPE_LOGICOP_NAND:
+         case PIPE_LOGICOP_NOR:
+         case PIPE_LOGICOP_EQUIV:
+            /* Fill these in with plausible values */
+            blend->rt[i].blend_enable = FALSE;
+            break;
+         default:
+            assert(0);
+            break;
+         }
+      }
+      else {
+         blend->rt[i].blend_enable   = templ->blend_enable;
+
+         if (templ->blend_enable) {
+            blend->rt[i].srcblend       = svga_translate_blend_factor(templ->rgb_src_factor);
+            blend->rt[i].dstblend       = svga_translate_blend_factor(templ->rgb_dst_factor);
+            blend->rt[i].blendeq        = svga_translate_blend_func(templ->rgb_func);
+            blend->rt[i].srcblend_alpha = svga_translate_blend_factor(templ->alpha_src_factor);
+            blend->rt[i].dstblend_alpha = svga_translate_blend_factor(templ->alpha_dst_factor);
+            blend->rt[i].blendeq_alpha  = svga_translate_blend_func(templ->alpha_func);
+
+            if (blend->rt[i].srcblend_alpha != blend->rt[i].srcblend ||
+                blend->rt[i].dstblend_alpha != blend->rt[i].dstblend ||
+                blend->rt[i].blendeq_alpha  != blend->rt[i].blendeq)
+            {
+               blend->rt[i].separate_alpha_blend_enable = TRUE;
+            }
+         }
+      }
+
+      blend->rt[i].writemask = templ->colormask;
+   }
+
+   return blend;
+}
+
+/* Record `blend` as the current blend state and set the SVGA_NEW_BLEND bit. */
+static void
+svga_bind_blend_state(struct pipe_context *pipe, void *blend)
+{
+   struct svga_context *ctx = svga_context(pipe);
+   ctx->curr.blend = (struct svga_blend_state *) blend;
+   ctx->dirty |= SVGA_NEW_BLEND;
+}
+
+/* Release a blend state object with FREE(); the pipe argument is not used. */
+static void svga_delete_blend_state(struct pipe_context *pipe, void *blend)
+{
+   FREE(blend);
+}
+
+/* Store a copy of *blend_color in the context and set SVGA_NEW_BLEND. */
+static void
+svga_set_blend_color(struct pipe_context *pipe,
+                     const struct pipe_blend_color *blend_color)
+{
+   struct svga_context *ctx = svga_context(pipe);
+   ctx->curr.blend_color = *blend_color;
+   ctx->dirty |= SVGA_NEW_BLEND;
+}
+
+
+/* Install this file's blend-state callbacks into svga->pipe. */
+void svga_init_blend_functions( struct svga_context *svga )
+{
+   svga->pipe.set_blend_color = svga_set_blend_color;
+   svga->pipe.delete_blend_state = svga_delete_blend_state;
+   svga->pipe.bind_blend_state = svga_bind_blend_state;
+   svga->pipe.create_blend_state = svga_create_blend_state;
+}
+
+
+
diff --git a/src/gallium/drivers/svga/svga_pipe_blit.c b/src/gallium/drivers/svga/svga_pipe_blit.c
new file mode 100644
index 00000000000..4f575b06e62
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_blit.c
@@ -0,0 +1,92 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_screen_texture.h"
+#include "svga_context.h"
+#include "svga_debug.h"
+#include "svga_cmd.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLIT
+
+
+/* Copy a width x height region of src to dest; retries once after a flush. */
+static void svga_surface_copy(struct pipe_context *pipe,
+                              struct pipe_surface *dest,
+                              unsigned destx, unsigned desty,
+                              struct pipe_surface *src,
+                              unsigned srcx, unsigned srcy,
+                              unsigned width, unsigned height)
+{
+   struct svga_context *svga = svga_context(pipe);
+   SVGA3dCopyBox *box;
+   enum pipe_error ret;
+
+   svga_hwtnl_flush_retry( svga );
+
+   SVGA_DBG(DEBUG_DMA, "blit to sid %p (%d,%d), from sid %p (%d,%d) sz %dx%d\n",
+            svga_surface(dest)->handle,
+            destx, desty,
+            svga_surface(src)->handle,
+            srcx, srcy,
+            width, height);
+
+   ret = SVGA3D_BeginSurfaceCopy(svga->swc,
+                                 src,
+                                 dest,
+                                 &box,
+                                 1);
+   if(ret != PIPE_OK) {
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_BeginSurfaceCopy(svga->swc,
+                                    src,
+                                    dest,
+                                    &box,
+                                    1);
+      assert(ret == PIPE_OK);
+      if (ret != PIPE_OK)
+         return;   /* box was never set up: do not write through it */
+   }
+
+   box->x = destx;
+   box->y = desty;
+   box->z = 0;
+   box->w = width;
+   box->h = height;
+   box->d = 1;
+   box->srcx = srcx;
+   box->srcy = srcy;
+   box->srcz = 0;
+
+   SVGA_FIFOCommitAll(svga->swc);
+   svga_surface(dest)->dirty = TRUE;
+   svga_propagate_surface(pipe, dest);
+}
+
+
+/* Install svga_surface_copy as the surface_copy callback of svga->pipe. */
+void svga_init_blit_functions(struct svga_context *svga)
+{
+   svga->pipe.surface_copy = svga_surface_copy;
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_clear.c b/src/gallium/drivers/svga/svga_pipe_clear.c
new file mode 100644
index 00000000000..6195c3897ed
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_clear.c
@@ -0,0 +1,125 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+#include "svga_debug.h"
+
+#include "pipe/p_defines.h"
+#include "util/u_pack_color.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_screen_texture.h"
+
+/* Emit one clear of the bound color and/or depth-stencil buffer; returns a pipe_error code. */
+static enum pipe_error
+try_clear(struct svga_context *svga, 
+          unsigned buffers,
+          const float *rgba,
+          double depth,
+          unsigned stencil)
+{
+   int ret = PIPE_OK;
+   SVGA3dRect rect = { 0, 0, 0, 0 };
+   boolean restore_viewport = FALSE;
+   SVGA3dClearFlag flags = 0;
+   struct pipe_framebuffer_state *fb = &svga->curr.framebuffer;
+   unsigned color = 0;
+
+   ret = svga_update_state(svga, SVGA_STATE_HW_CLEAR);
+   if (ret)
+      return ret;
+
+   if ((buffers & PIPE_CLEAR_COLOR) && fb->cbufs[0]) {
+      flags |= SVGA3D_CLEAR_COLOR;
+      util_pack_color(rgba, PIPE_FORMAT_A8R8G8B8_UNORM, &color);
+      /* The clear rectangle starts out as the size of color buffer 0. */
+      rect.w = fb->cbufs[0]->width;
+      rect.h = fb->cbufs[0]->height;
+   }
+
+   if ((buffers & PIPE_CLEAR_DEPTHSTENCIL) && fb->zsbuf) {
+      flags |= SVGA3D_CLEAR_DEPTH;
+      /* Stencil is only cleared when the buffer format is Z24S8. */
+      if (svga->curr.framebuffer.zsbuf->format == PIPE_FORMAT_Z24S8_UNORM)
+         flags |= SVGA3D_CLEAR_STENCIL;
+      /* Grow the rectangle so it also covers the depth/stencil buffer. */
+      rect.w = MAX2(rect.w, fb->zsbuf->width);
+      rect.h = MAX2(rect.h, fb->zsbuf->height);
+   }
+   /* If rect differs from the tracked hw_clear viewport, set it temporarily. */
+   if (memcmp(&rect, &svga->state.hw_clear.viewport, sizeof(rect)) != 0) {
+      restore_viewport = TRUE;
+      ret = SVGA3D_SetViewport(svga->swc, &rect);
+      if (ret)
+         return ret;
+   }
+
+   ret = SVGA3D_ClearRect(svga->swc, flags, color, depth, stencil,
+                          rect.x, rect.y, rect.w, rect.h);
+   if (ret != PIPE_OK)
+      return ret;
+   /* Put back the viewport that was replaced above. */
+   if (restore_viewport) {
+      memcpy(&rect, &svga->state.hw_clear.viewport, sizeof rect);
+      ret = SVGA3D_SetViewport(svga->swc, &rect);
+   }
+   
+   return ret;
+}
+
+/**
+ * Clear the color and/or depth-stencil buffers of the currently bound
+ * framebuffer, as selected by 'buffers'.  No masking, no scissor.
+ */
+void
+svga_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
+           double depth, unsigned stencil)
+{
+   struct svga_context *svga = svga_context( pipe );
+   int ret;
+
+   /* Only log when color buffer 0 is bound: it may be NULL (try_clear checks). */
+   if ((buffers & PIPE_CLEAR_COLOR) && svga->curr.framebuffer.cbufs[0])
+      SVGA_DBG(DEBUG_DMA, "clear sid %p\n",
+               svga_surface(svga->curr.framebuffer.cbufs[0])->handle);
+
+   ret = try_clear( svga, buffers, rgba, depth, stencil );
+
+   if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
+      /* Flush command buffer and retry: */
+      svga_context_flush( svga, NULL );
+
+      ret = try_clear( svga, buffers, rgba, depth, stencil );
+   }
+
+   /*
+    * Mark target surfaces as dirty
+    * TODO Mark only cleared surfaces.
+    */
+   svga_mark_surfaces_dirty(svga);
+
+   assert (ret == PIPE_OK);
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_constants.c b/src/gallium/drivers/svga/svga_pipe_constants.c
new file mode 100644
index 00000000000..10e7a121892
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_constants.c
@@ -0,0 +1,74 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_hw_reg.h"
+#include "svga_cmd.h"
+
+/***********************************************************************
+ * Constant buffers 
+ */
+
+struct svga_constbuf 
+{
+   unsigned type;      /* NOTE(review): presumably the shader type -- confirm */
+   float (*data)[4];   /* pointer to rows of four floats */
+   unsigned count;     /* NOTE(review): presumably the number of rows in data -- confirm */
+};
+
+
+
+/* Reference buf->buffer as the constant buffer of `shader` and flag it dirty. */
+static void svga_set_constant_buffer(struct pipe_context *pipe,
+                                     uint shader, uint index,
+                                     const struct pipe_constant_buffer *buf)
+{
+   struct svga_context *svga = svga_context(pipe);
+
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+   pipe_buffer_reference( &svga->curr.cb[shader],
+                          buf ? buf->buffer : NULL );   /* buf may be NULL */
+
+   if (shader == PIPE_SHADER_FRAGMENT)
+      svga->dirty |= SVGA_NEW_FS_CONST_BUFFER;
+   else
+      svga->dirty |= SVGA_NEW_VS_CONST_BUFFER;
+}
+
+/* Install svga_set_constant_buffer as the callback in svga->pipe. */
+void
+svga_init_constbuffer_functions( struct svga_context *svga )
+{
+   svga->pipe.set_constant_buffer = svga_set_constant_buffer;
+}
+
diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
new file mode 100644
index 00000000000..df636c08a05
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
@@ -0,0 +1,153 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_hw_reg.h"
+
+
+/* Map a PIPE_FUNC_x comparison onto the matching SVGA3D_CMP_x code. */
+static INLINE unsigned svga_translate_compare_func(unsigned func)
+{
+   unsigned cmp = SVGA3D_CMP_ALWAYS;   /* also the result for unknown input */
+   switch (func) {
+   case PIPE_FUNC_NEVER:    cmp = SVGA3D_CMP_NEVER;        break;
+   case PIPE_FUNC_LESS:     cmp = SVGA3D_CMP_LESS;         break;
+   case PIPE_FUNC_LEQUAL:   cmp = SVGA3D_CMP_LESSEQUAL;    break;
+   case PIPE_FUNC_GREATER:  cmp = SVGA3D_CMP_GREATER;      break;
+   case PIPE_FUNC_GEQUAL:   cmp = SVGA3D_CMP_GREATEREQUAL; break;
+   case PIPE_FUNC_NOTEQUAL: cmp = SVGA3D_CMP_NOTEQUAL;     break;
+   case PIPE_FUNC_EQUAL:    cmp = SVGA3D_CMP_EQUAL;        break;
+   case PIPE_FUNC_ALWAYS:   cmp = SVGA3D_CMP_ALWAYS;       break;
+   default: assert(0); break;   /* unhandled compare function */
+   }
+   return cmp;
+}
+
+/* Map PIPE_STENCIL_OP_x to SVGA3D_STENCILOP_x (SVGA3D uses D3D9-style names). */
+static INLINE unsigned svga_translate_stencil_op(unsigned op)
+{
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:      return SVGA3D_STENCILOP_KEEP;
+   case PIPE_STENCIL_OP_ZERO:      return SVGA3D_STENCILOP_ZERO;
+   case PIPE_STENCIL_OP_REPLACE:   return SVGA3D_STENCILOP_REPLACE;
+   case PIPE_STENCIL_OP_INCR:      return SVGA3D_STENCILOP_INCRSAT; /* clamps */
+   case PIPE_STENCIL_OP_DECR:      return SVGA3D_STENCILOP_DECRSAT; /* clamps */
+   case PIPE_STENCIL_OP_INCR_WRAP: return SVGA3D_STENCILOP_INCR;    /* wraps */
+   case PIPE_STENCIL_OP_DECR_WRAP: return SVGA3D_STENCILOP_DECR;    /* wraps */
+   case PIPE_STENCIL_OP_INVERT:    return SVGA3D_STENCILOP_INVERT;
+   default:
+      assert(0);
+      return SVGA3D_STENCILOP_KEEP;
+   }
+}
+
+
+/* Translate a gallium depth/stencil/alpha template to driver state; NULL on OOM. */
+static void *
+svga_create_depth_stencil_state(struct pipe_context *pipe,
+                                const struct pipe_depth_stencil_alpha_state *templ)
+{
+   struct svga_depth_stencil_state *ds = CALLOC_STRUCT( svga_depth_stencil_state );
+   if (!ds)
+      return NULL;
+
+   /* Don't try to figure out CW/CCW correspondence with
+    * stencil[0]/[1] at this point.  Presumably this can change as
+    * back/front face are modified. */
+   ds->stencil[0].enabled = templ->stencil[0].enabled;
+   if (ds->stencil[0].enabled) {
+      ds->stencil[0].func  = svga_translate_compare_func(templ->stencil[0].func);
+      ds->stencil[0].fail  = svga_translate_stencil_op(templ->stencil[0].fail_op);
+      ds->stencil[0].zfail = svga_translate_stencil_op(templ->stencil[0].zfail_op);
+      ds->stencil[0].pass  = svga_translate_stencil_op(templ->stencil[0].zpass_op);
+
+      /* SVGA3D has one ref/mask/writemask triple shared between front &
+       * back face stencil.  We really need two:
+       */
+      ds->stencil_ref       = templ->stencil[0].ref_value & 0xff;
+      ds->stencil_mask      = templ->stencil[0].valuemask & 0xff;
+      ds->stencil_writemask = templ->stencil[0].writemask & 0xff;
+   }
+
+   ds->stencil[1].enabled = templ->stencil[1].enabled;
+   if (templ->stencil[1].enabled) {
+      ds->stencil[1].func   = svga_translate_compare_func(templ->stencil[1].func);
+      ds->stencil[1].fail   = svga_translate_stencil_op(templ->stencil[1].fail_op);
+      ds->stencil[1].zfail  = svga_translate_stencil_op(templ->stencil[1].zfail_op);
+      ds->stencil[1].pass   = svga_translate_stencil_op(templ->stencil[1].zpass_op);
+      /* Overwrites the triple taken from stencil[0]: only one is stored. */
+      ds->stencil_ref       = templ->stencil[1].ref_value & 0xff;
+      ds->stencil_mask      = templ->stencil[1].valuemask & 0xff;
+      ds->stencil_writemask = templ->stencil[1].writemask & 0xff;
+   }
+
+   ds->zenable = templ->depth.enabled;
+   if (ds->zenable) {
+      ds->zfunc = svga_translate_compare_func(templ->depth.func);
+      ds->zwriteenable = templ->depth.writemask;
+   }
+
+   ds->alphatestenable = templ->alpha.enabled;
+   if (ds->alphatestenable) {
+      ds->alphafunc = svga_translate_compare_func(templ->alpha.func);
+      ds->alpharef = templ->alpha.ref_value;
+   }
+
+   return ds;
+}
+
+/* Record the depth/stencil state as current and set SVGA_NEW_DEPTH_STENCIL. */
+static void
+svga_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil)
+{
+   struct svga_context *ctx = svga_context(pipe);
+   ctx->curr.depth = (const struct svga_depth_stencil_state *) depth_stencil;
+   ctx->dirty |= SVGA_NEW_DEPTH_STENCIL;
+}
+
+/* Release a depth/stencil state object; the pipe argument is not used. */
+static void svga_delete_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil)
+{
+   FREE(depth_stencil);
+}
+
+
+/* Install this file's depth/stencil/alpha callbacks into svga->pipe. */
+void svga_init_depth_stencil_functions( struct svga_context *svga )
+{
+   svga->pipe.delete_depth_stencil_alpha_state = svga_delete_depth_stencil_state;
+   svga->pipe.bind_depth_stencil_alpha_state = svga_bind_depth_stencil_state;
+   svga->pipe.create_depth_stencil_alpha_state = svga_create_depth_stencil_state;
+}
+
+
+
+
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
new file mode 100644
index 00000000000..71a552862e9
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -0,0 +1,261 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_inlines.h"
+#include "util/u_prim.h"
+#include "util/u_time.h"
+#include "indices/u_indices.h"
+
+#include "svga_hw_reg.h"
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_winsys.h"
+#include "svga_draw.h"
+#include "svga_state.h"
+#include "svga_swtnl.h"
+#include "svga_debug.h"
+
+
+/* Indexed hardware draw; on any failure flushes the context and, if do_retry, tries once more. */
+static enum pipe_error
+retry_draw_range_elements( struct svga_context *svga,
+                           struct pipe_buffer *index_buffer,
+                           unsigned index_size,
+                           unsigned min_index,
+                           unsigned max_index,
+                           unsigned prim, 
+                           unsigned start, 
+                           unsigned count,
+                           boolean do_retry )
+{
+   enum pipe_error ret = 0;
+   /* Pass the rasterizer's unfilled and flatshade settings to the hwtnl. */
+   svga_hwtnl_set_unfilled( svga->hwtnl,
+                            svga->curr.rast->hw_unfilled );
+
+   svga_hwtnl_set_flatshade( svga->hwtnl,
+                             svga->curr.rast->templ.flatshade,
+                             svga->curr.rast->templ.flatshade_first );
+
+   /* Update hardware draw state; any non-zero result takes the retry path. */
+   ret = svga_update_state( svga, SVGA_STATE_HW_DRAW );
+   if (ret)
+      goto retry;
+
+   ret = svga_hwtnl_draw_range_elements( svga->hwtnl,
+                                         index_buffer, index_size,
+                                         min_index, max_index,
+                                         prim, start, count, 0 );
+   if (ret)
+      goto retry;
+   /* With user vertex buffers bound, flush the hwtnl right away. */
+   if (svga->curr.any_user_vertex_buffers) {
+      ret = svga_hwtnl_flush( svga->hwtnl );
+      if (ret)
+         goto retry;
+   }
+
+   return PIPE_OK;
+
+retry:
+   svga_context_flush( svga, NULL );
+   /* Recurse once with do_retry = FALSE so a second failure is returned. */
+   if (do_retry)
+   {
+      return retry_draw_range_elements( svga,
+                                        index_buffer, index_size,
+                                        min_index, max_index,
+                                        prim, start, count,
+                                        FALSE );
+   }
+
+   return ret;
+}
+
+/* Non-indexed hardware draw; flushes and retries once only on out-of-memory with do_retry set. */
+static enum pipe_error
+retry_draw_arrays( struct svga_context *svga,
+                   unsigned prim, 
+                   unsigned start, 
+                   unsigned count,
+                   boolean do_retry )
+{
+   enum pipe_error ret;
+   /* Pass the rasterizer's unfilled and flatshade settings to the hwtnl. */
+   svga_hwtnl_set_unfilled( svga->hwtnl,
+                            svga->curr.rast->hw_unfilled );
+
+   svga_hwtnl_set_flatshade( svga->hwtnl,
+                             svga->curr.rast->templ.flatshade,
+                             svga->curr.rast->templ.flatshade_first );
+   /* Update hardware draw state; any non-zero result takes the retry path. */
+   ret = svga_update_state( svga, SVGA_STATE_HW_DRAW );
+   if (ret)
+      goto retry;
+
+   ret = svga_hwtnl_draw_arrays( svga->hwtnl, prim,
+                                 start, count );
+   if (ret)
+      goto retry;
+   /* With user vertex buffers bound, flush the hwtnl right away. */
+   if (svga->curr.any_user_vertex_buffers) {
+      ret = svga_hwtnl_flush( svga->hwtnl );
+      if (ret)
+         goto retry;
+   }
+
+   return 0;
+
+retry:
+   if (ret == PIPE_ERROR_OUT_OF_MEMORY && do_retry) 
+   {
+      svga_context_flush( svga, NULL );
+
+      return retry_draw_arrays( svga,
+                                prim,
+                                start,
+                                count,
+                                FALSE );
+   }
+
+   return ret;
+}
+
+
+
+
+/* Common draw entry point; a NULL index_buffer selects a non-indexed draw. */
+static boolean
+svga_draw_range_elements( struct pipe_context *pipe,
+                          struct pipe_buffer *index_buffer,
+                          unsigned index_size,
+                          unsigned min_index,
+                          unsigned max_index,
+                          unsigned prim, unsigned start, unsigned count)
+{
+   struct svga_context *svga = svga_context( pipe );
+   unsigned reduced_prim = u_reduced_prim(prim);
+   enum pipe_error ret = 0;
+
+   if (!u_trim_pipe_prim( prim, &count ))
+      return TRUE;
+
+   /*
+    * Mark currently bound target surfaces as dirty
+    * doesn't really matter if it is done before drawing.
+    *
+    * TODO If we ever normally return something other than
+    * true we should not mark it as dirty then.
+    */
+   svga_mark_surfaces_dirty(svga_context(pipe));
+
+   if (svga->curr.reduced_prim != reduced_prim) {
+      svga->curr.reduced_prim = reduced_prim;
+      svga->dirty |= SVGA_NEW_REDUCED_PRIMITIVE;
+   }
+   /* Presumably refreshes state.sw.need_swtnl, which is tested below. */
+   svga_update_state_retry( svga, SVGA_STATE_NEED_SWTNL );
+   /* Debug builds: skip draws whose VS or FS id matches debug.disable_shader. */
+#ifdef DEBUG
+   if (svga->curr.vs->base.id == svga->debug.disable_shader ||
+       svga->curr.fs->base.id == svga->debug.disable_shader)
+      return 0;
+#endif
+
+   if (svga->state.sw.need_swtnl)
+   {
+      ret = svga_swtnl_draw_range_elements( svga, 
+                                            index_buffer, 
+                                            index_size,
+                                            min_index, max_index,
+                                            prim,
+                                            start, count );
+   }
+   else {
+      if (index_buffer) {
+         ret = retry_draw_range_elements( svga,
+                                          index_buffer,
+                                          index_size,
+                                          min_index,
+                                          max_index,
+                                          prim,
+                                          start,
+                                          count,
+                                          TRUE );
+      }
+      else {
+         ret = retry_draw_arrays( svga, 
+                                  prim, 
+                                  start, 
+                                  count,
+                                  TRUE );
+      }
+   }
+   /* DEBUG_FLUSH: log each draw, sleep once past 1300 draws, then flush. */
+   if (SVGA_DEBUG & DEBUG_FLUSH) {
+      static unsigned id;
+      debug_printf("%s %d\n", __FUNCTION__, id++);
+      if (id > 1300)
+         util_time_sleep( 2000 );
+
+      svga_hwtnl_flush_retry( svga );
+      svga_context_flush(svga, NULL);
+   }
+
+   return ret == PIPE_OK;
+}
+
+
+/* Indexed draw without known index bounds: forwards the range 0..0xffffffff. */
+static boolean
+svga_draw_elements( struct pipe_context *pipe,
+                    struct pipe_buffer *index_buffer,
+                    unsigned index_size,
+                    unsigned prim, unsigned start, unsigned count)
+{
+   return svga_draw_range_elements( pipe, index_buffer, index_size,
+                                    0, 0xffffffff,   /* min_index, max_index */
+                                    prim, start, count );
+}
+
+/* Non-indexed draw: forwards with a NULL index buffer and start..last range. */
+static boolean
+svga_draw_arrays( struct pipe_context *pipe,
+                  unsigned prim, unsigned start, unsigned count)
+{
+   const unsigned last_index = start + count - 1;
+   return svga_draw_range_elements( pipe, NULL, 0, start, last_index,
+                                    prim, start, count );
+}
+
+/* Install this file's draw callbacks into svga->pipe. */
+void svga_init_draw_functions( struct svga_context *svga )
+{
+   svga->pipe.draw_range_elements = svga_draw_range_elements;
+   svga->pipe.draw_elements = svga_draw_elements;
+   svga->pipe.draw_arrays = svga_draw_arrays;
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_flush.c b/src/gallium/drivers/svga/svga_pipe_flush.c
new file mode 100644
index 00000000000..0becb0765ac
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_flush.c
@@ -0,0 +1,71 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_defines.h"
+#include "svga_screen.h"
+#include "svga_screen_texture.h"
+#include "svga_context.h"
+#include "svga_winsys.h"
+#include "svga_draw.h"
+#include "svga_debug.h"
+
+#include "svga_hw_reg.h"
+
+
+
+
+static void svga_flush( struct pipe_context *pipe,
+                        unsigned flags,
+                        struct pipe_fence_handle **fence )
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct pipe_framebuffer_state *fb = &svga->curr.framebuffer;
+   int buf;
+
+   /* Emit any drawing commands that are still buffered. */
+   svga_hwtnl_flush_retry( svga );
+
+   /* Back-copy each bound render target view (colour buffers first,
+    * then depth/stencil) to its texture.
+    */
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++) {
+      if (fb->cbufs[buf])
+         svga_propagate_surface(pipe, fb->cbufs[buf]);
+   }
+   if (fb->zsbuf)
+      svga_propagate_surface(pipe, fb->zsbuf);
+
+   /* Flush the command queue; the caller's fence pointer is passed on. */
+   svga_context_flush(svga, fence);
+
+   SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s flags %x fence_ptr %p\n",
+            __FUNCTION__, flags, fence ? *fence : 0x0);
+}
+
+
+void svga_init_flush_functions( struct svga_context *svga )
+{
+   svga->pipe.flush = svga_flush;   /* install the flush entry point */
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_fs.c b/src/gallium/drivers/svga/svga_pipe_fs.c
new file mode 100644
index 00000000000..e3be840d920
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_fs.c
@@ -0,0 +1,124 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_text.h"
+
+#include "svga_screen.h"
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_tgsi.h"
+#include "svga_hw_reg.h"
+#include "svga_cmd.h"
+#include "svga_draw.h"
+#include "svga_debug.h"
+
+
+/***********************************************************************
+ * Fragment shaders 
+ */
+
+static void *
+svga_create_fs_state(struct pipe_context *pipe,
+                     const struct pipe_shader_state *templ)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_fragment_shader *fs = CALLOC_STRUCT(svga_fragment_shader);
+   if (!fs)
+      return NULL;
+
+   /* Private copy of the tokens; bail out cleanly if duplication fails. */
+   fs->base.tokens = tgsi_dup_tokens(templ->tokens);
+   if (!fs->base.tokens) {
+      FREE(fs);
+      return NULL;
+   }
+
+   /* Collect basic info that we'll need later: */
+   tgsi_scan_shader(fs->base.tokens, &fs->base.info);
+   fs->base.id = svga->debug.shader_id++;
+   fs->base.use_sm30 = svgascreen->use_ps30;
+
+   if (SVGA_DEBUG & DEBUG_TGSI || 0) {
+      debug_printf("%s id: %u, inputs: %u, outputs: %u\n", __FUNCTION__,
+                   fs->base.id, fs->base.info.num_inputs,
+                   fs->base.info.num_outputs);
+   }
+   return fs;
+}
+
+static void
+svga_bind_fs_state(struct pipe_context *pipe, void *shader)
+{
+   struct svga_context *svga = svga_context(pipe);
+
+   /* Record the shader as current and flag the change in the dirty mask. */
+   svga->curr.fs = (struct svga_fragment_shader *) shader;
+   svga->dirty |= SVGA_NEW_FS;
+}
+
+static
+void svga_delete_fs_state(struct pipe_context *pipe, void *shader)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_fragment_shader *fs = (struct svga_fragment_shader *) shader;
+   struct svga_shader_result *variant, *next;
+   enum pipe_error ret;
+
+   /* Emit buffered drawing before any shader result is destroyed. */
+   svga_hwtnl_flush_retry( svga );
+
+   variant = fs->base.results;
+   while (variant) {
+      next = variant->next;   /* saved before the node is handed off below */
+      ret = SVGA3D_DestroyShader(svga->swc, variant->id,
+                                 SVGA3D_SHADERTYPE_PS );
+      if (ret != PIPE_OK) {
+         /* Flush the context, then issue the command once more. */
+         svga_context_flush(svga, NULL);
+         ret = SVGA3D_DestroyShader(svga->swc, variant->id,
+                                    SVGA3D_SHADERTYPE_PS );
+         assert(ret == PIPE_OK);
+      }
+      svga_destroy_shader_result( variant );
+      variant = next;
+   }
+
+   FREE((void *)fs->base.tokens);
+   FREE(fs);
+}
+
+
+void svga_init_fs_functions( struct svga_context *svga )
+{  /* Install the fragment-shader create/bind/delete entry points. */
+   svga->pipe.delete_fs_state = svga_delete_fs_state;
+   svga->pipe.bind_fs_state = svga_bind_fs_state;
+   svga->pipe.create_fs_state = svga_create_fs_state;
+}
+
diff --git a/src/gallium/drivers/svga/svga_pipe_misc.c b/src/gallium/drivers/svga/svga_pipe_misc.c
new file mode 100644
index 00000000000..58cb1e6e230
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_misc.c
@@ -0,0 +1,187 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "svga_context.h"
+#include "svga_screen_texture.h"
+#include "svga_state.h"
+#include "svga_winsys.h"
+
+#include "svga_hw_reg.h"
+
+
+
+
+static void svga_set_scissor_state( struct pipe_context *pipe,
+                                 const struct pipe_scissor_state *scissor )
+{
+   struct svga_context *svga = svga_context(pipe);
+   /* Keep a private copy of the rectangle and flag it as changed. */
+   memcpy( &svga->curr.scissor, scissor, sizeof *scissor );
+   svga->dirty |= SVGA_NEW_SCISSOR;
+}
+
+
+static void svga_set_polygon_stipple( struct pipe_context *pipe,
+                                      const struct pipe_poly_stipple *stipple )
+{
+   /* Intentionally empty: overridden by the draw module. */
+}
+
+
+void svga_cleanup_framebuffer(struct svga_context *svga)
+{
+   struct pipe_framebuffer_state *curr_fb = &svga->curr.framebuffer;
+   struct pipe_framebuffer_state *hw_fb = &svga->state.hw_clear.framebuffer;
+   int slot;
+
+   /* Unreference every colour buffer in both the current and hw state. */
+   for (slot = 0; slot < PIPE_MAX_COLOR_BUFS; ++slot) {
+      pipe_surface_reference(&curr_fb->cbufs[slot], NULL);
+      pipe_surface_reference(&hw_fb->cbufs[slot], NULL);
+   }
+   pipe_surface_reference(&curr_fb->zsbuf, NULL);   /* depth/stencil too */
+   pipe_surface_reference(&hw_fb->zsbuf, NULL);
+}
+
+
+#define DEPTH_BIAS_SCALE_FACTOR_D16    ((float)(1<<15))
+#define DEPTH_BIAS_SCALE_FACTOR_D24S8  ((float)(1<<23))
+#define DEPTH_BIAS_SCALE_FACTOR_D32    ((float)(1u<<31)) /* 1<<31 overflows int */
+
+
+static void svga_set_framebuffer_state(struct pipe_context *pipe,
+				       const struct pipe_framebuffer_state *fb)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct pipe_framebuffer_state *dst = &svga->curr.framebuffer;
+   boolean propagate = FALSE;
+   int i;
+   /* Adopt the scalar fields first; surfaces are re-referenced below. */
+   dst->width = fb->width;
+   dst->height = fb->height;
+   dst->nr_cbufs = fb->nr_cbufs;
+
+   /* check if we need to propagate any of the target surfaces */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      if (dst->cbufs[i] && dst->cbufs[i] != fb->cbufs[i])
+         if (svga_surface_needs_propagation(dst->cbufs[i]))
+            propagate = TRUE;
+   }
+
+   if (propagate) {
+      /* make sure that drawing calls come before propagation calls */
+      svga_hwtnl_flush_retry( svga );
+   
+      for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+         if (dst->cbufs[i] && dst->cbufs[i] != fb->cbufs[i])
+            svga_propagate_surface(pipe, dst->cbufs[i]);
+   }
+
+   /* XXX: Actually the virtual hardware may support rendertargets with
+    * different sizes, depending on the host API and driver, but since we
+    * cannot know that, make no such assumption here. */
+   for(i = 0; i < fb->nr_cbufs; ++i) {
+      if (fb->zsbuf && fb->cbufs[i]) {
+         assert(fb->zsbuf->width == fb->cbufs[i]->width); 
+         assert(fb->zsbuf->height == fb->cbufs[i]->height); 
+      }
+   }
+
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+      pipe_surface_reference(&dst->cbufs[i], fb->cbufs[i]);
+   pipe_surface_reference(&dst->zsbuf, fb->zsbuf);
+
+   /* Pick depthscale from the new zsbuf format; 0.0 if none or unknown. */
+   if (svga->curr.framebuffer.zsbuf)
+   {
+      switch (svga->curr.framebuffer.zsbuf->format) {
+      case PIPE_FORMAT_Z16_UNORM:
+         svga->curr.depthscale = 1.0f / DEPTH_BIAS_SCALE_FACTOR_D16;
+         break;
+      case PIPE_FORMAT_S8Z24_UNORM:
+      case PIPE_FORMAT_X8Z24_UNORM:
+      case PIPE_FORMAT_Z24S8_UNORM:
+      case PIPE_FORMAT_Z24X8_UNORM:
+         svga->curr.depthscale = 1.0f / DEPTH_BIAS_SCALE_FACTOR_D24S8;
+         break;
+      case PIPE_FORMAT_Z32_UNORM:
+         svga->curr.depthscale = 1.0f / DEPTH_BIAS_SCALE_FACTOR_D32;
+         break;
+      case PIPE_FORMAT_Z32_FLOAT:
+         svga->curr.depthscale = 1.0f / ((float)(1<<23));
+         break;
+      default:
+         svga->curr.depthscale = 0.0f;
+         break;
+      }
+   }
+   else {
+      svga->curr.depthscale = 0.0f;
+   }
+
+   svga->dirty |= SVGA_NEW_FRAME_BUFFER;
+}
+
+
+
+static void svga_set_clip_state( struct pipe_context *pipe,
+                                 const struct pipe_clip_state *clip )
+{
+   struct svga_context *svga = svga_context(pipe);
+
+   /* Store a by-value copy of the clip state and flag it as changed. */
+   svga->curr.clip = *clip;
+   svga->dirty |= SVGA_NEW_CLIP;
+}
+
+
+
+/* Called when the driver's state tracker notices a change to the
+ * viewport matrix.  Stores a copy and flags the viewport as dirty.
+ */
+static void svga_set_viewport_state( struct pipe_context *pipe,
+                                     const struct pipe_viewport_state *viewport )
+{
+   struct svga_context *svga = svga_context(pipe);
+
+   /* by-value copy of the caller's state */
+   svga->curr.viewport = *viewport;
+   svga->dirty |= SVGA_NEW_VIEWPORT;
+}
+
+
+
+void svga_init_misc_functions( struct svga_context *svga )
+{  /* Install the miscellaneous state setters defined in this file. */
+   svga->pipe.set_viewport_state = svga_set_viewport_state;
+   svga->pipe.set_clip_state = svga_set_clip_state;
+   svga->pipe.set_framebuffer_state = svga_set_framebuffer_state;
+   svga->pipe.set_polygon_stipple = svga_set_polygon_stipple;
+   svga->pipe.set_scissor_state = svga_set_scissor_state;
+}
+
+
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
new file mode 100644
index 00000000000..01336b0a2c3
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -0,0 +1,267 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "util/u_memory.h"
+
+#include "svga_cmd.h"
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_screen_buffer.h"
+#include "svga_winsys.h"
+#include "svga_draw.h"
+#include "svga_debug.h"
+
+
+/* FIXME: want a public base class for all pipe structs, even if there
+ * isn't much in them.  Until then, a placeholder definition lives here.
+ */
+struct pipe_query {
+   int dummy;
+};
+
+struct svga_query {
+   struct pipe_query base;                   /* first member: pointers are cast to/from pipe_query */
+   SVGA3dQueryType type;
+   struct svga_winsys_buffer *hwbuf;         /* buffer that queryResult is mapped from */
+   volatile SVGA3dQueryResult *queryResult;  /* result record inside hwbuf */
+   struct pipe_fence_handle *fence;          /* obtained from the flush in svga_get_query_result */
+};
+
+/***********************************************************************
+ * Inline conversion functions.  These are better-typed than the
+ * macros used previously:
+ */
+static INLINE struct svga_query *
+svga_query( struct pipe_query *q )
+{
+   return (struct svga_query *)q;   /* base is the first member of svga_query */
+}
+
+static boolean svga_get_query_result(struct pipe_context *pipe, 
+                                     struct pipe_query *q,
+                                     boolean wait,
+                                     uint64_t *result);
+
+static struct pipe_query *svga_create_query( struct pipe_context *pipe,
+                                             unsigned query_type )
+{
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_query *sq;
+
+   SVGA_DBG(DEBUG_QUERY, "%s\n", __FUNCTION__);
+
+   sq = CALLOC_STRUCT(svga_query);
+   if (!sq)
+      return NULL;
+
+   /* NOTE(review): query_type is not consulted here; every query is
+    * set up as an occlusion query.
+    */
+   sq->type = SVGA3D_QUERYTYPE_OCCLUSION;
+
+   /* Pinned winsys buffer sized for one SVGA3dQueryResult. */
+   sq->hwbuf = svga_winsys_buffer_create(svgascreen,
+                                         1,
+                                         SVGA_BUFFER_USAGE_PINNED,
+                                         sizeof *sq->queryResult);
+   if (!sq->hwbuf) {
+      FREE(sq);
+      return NULL;
+   }
+
+   sq->queryResult = (SVGA3dQueryResult *)
+      sws->buffer_map(sws, sq->hwbuf, PIPE_BUFFER_USAGE_CPU_WRITE);
+   if (!sq->queryResult) {
+      sws->buffer_destroy(sws, sq->hwbuf);
+      FREE(sq);
+      return NULL;
+   }
+
+   /* Fill in the result header before the query is first used. */
+   sq->queryResult->totalSize = sizeof *sq->queryResult;
+   sq->queryResult->state = SVGA3D_QUERYSTATE_NEW;
+
+   /* The buffer was requested pinned and is assumed to stay mapped, so
+    * the pointer obtained above remains in use after this unmap.  The
+    * reason is that checking the query status should not wait on fences.
+    */
+   sws->buffer_unmap(sws, sq->hwbuf);
+
+   return &sq->base;
+}
+
+static void svga_destroy_query(struct pipe_context *pipe,
+                               struct pipe_query *q)
+{
+   struct svga_winsys_screen *sws = svga_screen(pipe->screen)->sws;
+   struct svga_query *sq = svga_query( q );
+
+   SVGA_DBG(DEBUG_QUERY, "%s\n", __FUNCTION__);
+   /* Release the result buffer and any fence reference, then the query. */
+   sws->buffer_destroy(sws, sq->hwbuf);
+   sws->fence_reference(sws, &sq->fence, NULL);
+   FREE(sq);
+}
+
+static void svga_begin_query(struct pipe_context *pipe, 
+                             struct pipe_query *q)
+{
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_context *svga = svga_context( pipe );
+   struct svga_query *sq = svga_query( q );
+   enum pipe_error ret;
+
+   SVGA_DBG(DEBUG_QUERY, "%s\n", __FUNCTION__);
+   /* No other query may already be active on this context (svga->sq). */
+   assert(!svga->sq);
+
+   /* Need to flush out buffered drawing commands so that they don't
+    * get counted in the query results.
+    */
+   svga_hwtnl_flush_retry(svga);
+   
+   if(sq->queryResult->state == SVGA3D_QUERYSTATE_PENDING) {
+      /* The application doesn't care for the pending query result. We cannot
+       * let go of the existing buffer and just get a new one because its
+       * storage may be reused for other purposes and clobbered by the host
+       * when it determines the query result. So the only option here is to
+       * wait for the existing query's result -- not a big deal, given that
+       * no sane application would do this.
+       */
+      uint64_t result;
+
+      svga_get_query_result(pipe, q, TRUE, &result);
+      
+      assert(sq->queryResult->state != SVGA3D_QUERYSTATE_PENDING);
+   }
+   /* Reset the result state and drop any fence from an earlier use. */
+   sq->queryResult->state = SVGA3D_QUERYSTATE_NEW;
+   sws->fence_reference(sws, &sq->fence, NULL);
+   /* Emit BeginQuery; on failure flush the context and retry once. */
+   ret = SVGA3D_BeginQuery(svga->swc, sq->type);
+   if(ret != PIPE_OK) {
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_BeginQuery(svga->swc, sq->type);
+      assert(ret == PIPE_OK);
+   }
+   /* This query is now the context's active query. */
+   svga->sq = sq;
+}
+
+static void svga_end_query(struct pipe_context *pipe, 
+                           struct pipe_query *q)
+{
+   struct svga_context *svga = svga_context( pipe );
+   struct svga_query *sq = svga_query( q );
+   enum pipe_error ret;
+
+   SVGA_DBG(DEBUG_QUERY, "%s\n", __FUNCTION__);
+   assert(svga->sq == sq);
+   /* Emit buffered drawing commands ahead of the EndQuery command. */
+   svga_hwtnl_flush_retry(svga);
+   
+   /* Set to PENDING before sending EndQuery. */
+   sq->queryResult->state = SVGA3D_QUERYSTATE_PENDING;
+   /* Emit EndQuery; on failure flush the context and retry once. */
+   ret = SVGA3D_EndQuery( svga->swc, sq->type, sq->hwbuf);
+   if(ret != PIPE_OK) {
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_EndQuery( svga->swc, sq->type, sq->hwbuf);
+      assert(ret == PIPE_OK);
+   }
+   
+   /* TODO: Delay flushing. We don't really need to flush here, just ensure 
+    * that there is one flush before svga_get_query_result attempts to get the
+    * result */
+   svga_context_flush(svga, NULL);
+   /* No query is active on this context any more. */
+   svga->sq = NULL;
+}
+
+static boolean svga_get_query_result(struct pipe_context *pipe,
+                                     struct pipe_query *q,
+                                     boolean wait,
+                                     uint64_t *result)
+{
+   struct svga_context *svga = svga_context( pipe );
+   struct svga_screen *svgascreen = svga_screen( pipe->screen );
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_query *sq = svga_query( q );
+   SVGA3dQueryState state;
+
+   /* Returns TRUE with *result set; FALSE if still pending and !wait. */
+   SVGA_DBG(DEBUG_QUERY, "%s wait: %d\n", __FUNCTION__, wait);
+
+   /* The query status won't be updated by the host unless
+    * SVGA_3D_CMD_WAIT_FOR_QUERY is emitted. Unfortunately this will cause a
+    * synchronous wait on the host */
+   if (!sq->fence) {
+      enum pipe_error ret;
+
+      ret = SVGA3D_WaitForQuery( svga->swc, sq->type, sq->hwbuf);
+      if (ret != PIPE_OK) {
+         svga_context_flush(svga, NULL);
+         ret = SVGA3D_WaitForQuery( svga->swc, sq->type, sq->hwbuf);
+         assert(ret == PIPE_OK);
+      }
+
+      /* Flush the command out and keep the resulting fence to wait on. */
+      svga_context_flush(svga, &sq->fence);
+      assert(sq->fence);
+   }
+
+   state = sq->queryResult->state;
+   if (state == SVGA3D_QUERYSTATE_PENDING) {
+      if (!wait)
+         return FALSE;
+
+      /* Wait on the fence, then re-read the state. */
+      sws->fence_finish(sws, sq->fence, 0);
+      state = sq->queryResult->state;
+   }
+
+   assert(state == SVGA3D_QUERYSTATE_SUCCEEDED ||
+          state == SVGA3D_QUERYSTATE_FAILED);
+
+   *result = (uint64_t)sq->queryResult->result32;
+   SVGA_DBG(DEBUG_QUERY, "%s result %u\n", __FUNCTION__, (unsigned)*result);
+
+   return TRUE;
+}
+
+
+
+void svga_init_query_functions( struct svga_context *svga )
+{  /* Install the query entry points defined in this file. */
+   svga->pipe.get_query_result = svga_get_query_result;
+   svga->pipe.end_query = svga_end_query;
+   svga->pipe.begin_query = svga_begin_query;
+   svga->pipe.destroy_query = svga_destroy_query;
+   svga->pipe.create_query = svga_create_query;
+}
diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
new file mode 100644
index 00000000000..b03f8eb9cf3
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -0,0 +1,250 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_context.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+
+#include "svga_hw_reg.h"
+
+/* Hardware frontwinding is always set up as SVGA3D_FRONTWINDING_CW.
+ */
+static SVGA3dFace svga_translate_cullmode( unsigned mode,
+                                           unsigned front_winding )
+{
+   /* front_winding is not consulted by this translation. */
+   if (mode == PIPE_WINDING_NONE)
+      return SVGA3D_FACE_NONE;
+   if (mode == PIPE_WINDING_CCW)
+      return SVGA3D_FACE_BACK;
+   if (mode == PIPE_WINDING_CW)
+      return SVGA3D_FACE_FRONT;
+   if (mode == PIPE_WINDING_BOTH)
+      return SVGA3D_FACE_FRONT_BACK;
+
+   /* Unknown cull mode: trap in debug builds, cull nothing otherwise. */
+   assert(0);
+   return SVGA3D_FACE_NONE;
+}
+
+static SVGA3dShadeMode svga_translate_flatshade( unsigned mode )
+{
+   return (mode == 0) ? SVGA3D_SHADEMODE_SMOOTH : SVGA3D_SHADEMODE_FLAT; /* non-zero selects flat */
+}
+
+
+/* Build the driver's rasterizer state from a pipe template and decide which
+ * primitive types need the draw module.  Returns NULL if allocation fails. */
+static void *
+svga_create_rasterizer_state(struct pipe_context *pipe,
+                             const struct pipe_rasterizer_state *templ)
+{
+   struct svga_rasterizer_state *rast = CALLOC_STRUCT( svga_rasterizer_state );
+   if (!rast)
+      return NULL;
+
+   /* need this for draw module. */
+   rast->templ = *templ;
+
+   /* light_twoside          - XXX: need fragment shader varient */
+   /* poly_smooth            - XXX: no fallback available */
+   /* poly_stipple_enable    - draw module */
+   /* point_sprite           - ? */
+   /* point_size_per_vertex  - ? */
+   /* sprite_coord_mode      - ??? */
+   /* bypass_vs_viewport_and_clip        - handled by viewport setup */
+   /* flatshade_first        - handled by index translation */
+   /* gl_rasterization_rules - XXX - viewport code */
+   /* line_width             - draw module */
+   /* fill_cw, fill_ccw      - draw module or index translation */
+
+   rast->shademode = svga_translate_flatshade( templ->flatshade );
+   rast->cullmode = svga_translate_cullmode( templ->cull_mode, 
+                                             templ->front_winding );
+   rast->scissortestenable = templ->scissor;
+   rast->multisampleantialias = templ->multisample;
+   rast->antialiasedlineenable = templ->line_smooth;
+   rast->lastpixel = templ->line_last_pixel;
+   rast->pointsize = templ->point_size;
+   rast->pointsize_min = templ->point_size_min;
+   rast->pointsize_max = templ->point_size_max;
+   rast->hw_unfilled = PIPE_POLYGON_MODE_FILL;
+
+   /* Use swtnl + decomposition to implement these: */
+   if (templ->poly_stipple_enable)
+      rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+
+   if (templ->line_width != 1.0 &&
+       templ->line_width != 0.0)
+      rast->need_pipeline |= SVGA_PIPELINE_FLAG_LINES;
+
+   if (templ->line_stipple_enable) {
+      /* LinePattern not implemented on all backends. */
+      if (0) {
+         SVGA3dLinePattern lp;
+         lp.repeat = templ->line_stipple_factor + 1;
+         lp.pattern = templ->line_stipple_pattern;
+         rast->linepattern = lp.uintValue;
+      }
+      else {
+         rast->need_pipeline |= SVGA_PIPELINE_FLAG_LINES;
+      }
+   } 
+
+   if (templ->point_smooth)
+      rast->need_pipeline |= SVGA_PIPELINE_FLAG_POINTS;
+
+   {
+      boolean offset_cw = templ->offset_cw;
+      boolean offset_ccw = templ->offset_ccw;
+      boolean offset  = 0;
+      int fill_cw = templ->fill_cw;
+      int fill_ccw = templ->fill_ccw;
+      int fill = PIPE_POLYGON_MODE_FILL;
+
+      switch (templ->cull_mode) {
+      case PIPE_WINDING_BOTH:
+         offset = 0;
+         fill = PIPE_POLYGON_MODE_FILL;
+         break;
+
+      case PIPE_WINDING_CW:
+         offset = offset_ccw;
+         fill = fill_ccw;
+         break;
+
+      case PIPE_WINDING_CCW:
+         offset = offset_cw;
+         fill = fill_cw;
+         break;
+
+      case PIPE_WINDING_NONE:
+         if (fill_cw != fill_ccw || offset_cw != offset_ccw) 
+         {
+            /* Always need the draw module to work out different
+             * front/back fill modes:
+             */
+            rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+         }
+         else {
+            offset = offset_ccw;
+            fill = fill_ccw;
+         }
+         break;
+
+      default:
+         assert(0);
+         break;
+      }
+
+      /* Unfilled primitive modes aren't implemented on all virtual
+       * hardware.  We can do some unfilled processing with index
+       * translation, but otherwise need the draw module:
+       */
+      if (fill != PIPE_POLYGON_MODE_FILL &&
+          (templ->flatshade ||
+           templ->light_twoside ||
+           offset ||
+           templ->cull_mode != PIPE_WINDING_NONE)) 
+      {
+         fill = PIPE_POLYGON_MODE_FILL;
+         rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+      }
+
+      /* If we are decomposing to lines, and lines need the pipeline,
+       * then we also need the pipeline for tris.
+       */
+      if (fill == PIPE_POLYGON_MODE_LINE &&
+          (rast->need_pipeline & SVGA_PIPELINE_FLAG_LINES))
+      {
+         fill = PIPE_POLYGON_MODE_FILL;
+         rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+      }
+
+      /* Similarly for points:
+       */
+      if (fill == PIPE_POLYGON_MODE_POINT &&
+          (rast->need_pipeline & SVGA_PIPELINE_FLAG_POINTS))
+      {
+         fill = PIPE_POLYGON_MODE_FILL;
+         rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+      }
+
+      if (offset) {
+         rast->slopescaledepthbias = templ->offset_scale;
+         rast->depthbias = templ->offset_units;
+      }
+
+      rast->hw_unfilled = fill;
+   }
+
+   if (rast->need_pipeline & SVGA_PIPELINE_FLAG_TRIS) {
+      /* Turn off stuff which will get done in the draw module:
+       */
+      rast->hw_unfilled = PIPE_POLYGON_MODE_FILL;
+      rast->slopescaledepthbias = 0;
+      rast->depthbias = 0;
+   }
+
+   return rast;
+}
+
+static void svga_bind_rasterizer_state( struct pipe_context *pipe,
+                                        void *state )
+{
+   struct svga_rasterizer_state *raster = (struct svga_rasterizer_state *)state;
+   struct svga_context *svga = svga_context(pipe);
+
+   /* Track the binding; the draw module gets the template, or NULL. */
+   svga->curr.rast = raster;
+   draw_set_rasterizer_state(svga->swtnl.draw, raster ? &raster->templ : NULL);
+
+   svga->dirty |= SVGA_NEW_RAST;
+}
+
+static void svga_delete_rasterizer_state(struct pipe_context *pipe,
+                                         void *raster)
+{
+   FREE(raster);   /* allocated with CALLOC_STRUCT by the create hook */
+}
+
+
+void svga_init_rasterizer_functions( struct svga_context *svga )
+{  /* Install the rasterizer create/bind/delete entry points. */
+   svga->pipe.delete_rasterizer_state = svga_delete_rasterizer_state;
+   svga->pipe.bind_rasterizer_state = svga_bind_rasterizer_state;
+   svga->pipe.create_rasterizer_state = svga_create_rasterizer_state;
+}
+
+
+/***********************************************************************
+ * Hardware state update
+ */
+
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
new file mode 100644
index 00000000000..3eeca6b784b
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -0,0 +1,243 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "svga_context.h"
+#include "svga_screen_texture.h"
+#include "svga_state.h"
+
+#include "svga_hw_reg.h"
+
+#include "svga_debug.h"
+
+/* Map a PIPE_TEX_WRAP_* mode onto an SVGA3D_TEX_ADDRESS_* value.  Unknown
+ * modes assert and then fall back to SVGA3D_TEX_ADDRESS_WRAP. */
+static INLINE unsigned
+translate_wrap_mode(unsigned wrap)
+{
+   unsigned address;
+   switch (wrap) {
+   case PIPE_TEX_WRAP_REPEAT:
+      address = SVGA3D_TEX_ADDRESS_WRAP;
+      break;
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      /* SVGA3D_TEX_ADDRESS_EDGE is not respected by hardware, so use CLAMP. */
+      address = SVGA3D_TEX_ADDRESS_CLAMP;
+      break;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      address = SVGA3D_TEX_ADDRESS_BORDER;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      address = SVGA3D_TEX_ADDRESS_MIRROR;
+      break;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      address = SVGA3D_TEX_ADDRESS_MIRRORONCE;
+      break;
+   default:
+      assert(0);
+      address = SVGA3D_TEX_ADDRESS_WRAP;
+   }
+   return address;
+}
+
+/* Map a PIPE_TEX_FILTER_* image filter onto its SVGA3D_TEX_FILTER_* value. */
+static INLINE unsigned translate_img_filter( unsigned filter )
+{
+   if (filter == PIPE_TEX_FILTER_LINEAR)
+      return SVGA3D_TEX_FILTER_LINEAR;
+   if (filter == PIPE_TEX_FILTER_ANISO)
+      return SVGA3D_TEX_FILTER_ANISOTROPIC;
+   if (filter != PIPE_TEX_FILTER_NEAREST)
+      assert(0);  /* unknown filter: still report nearest */
+   return SVGA3D_TEX_FILTER_NEAREST;
+}
+
+/* Map a PIPE_TEX_MIPFILTER_* mode onto its SVGA3D_TEX_FILTER_* value. */
+static INLINE unsigned translate_mip_filter( unsigned filter )
+{
+   if (filter == PIPE_TEX_MIPFILTER_NEAREST)
+      return SVGA3D_TEX_FILTER_NEAREST;
+   if (filter == PIPE_TEX_MIPFILTER_LINEAR)
+      return SVGA3D_TEX_FILTER_LINEAR;
+   if (filter != PIPE_TEX_MIPFILTER_NONE)
+      assert(0);  /* unknown mode: still report no mip filtering */
+   return SVGA3D_TEX_FILTER_NONE;
+}
+
+/* Create a sampler CSO by translating the gallium sampler template into
+ * SVGA3D filter/address/LOD values.  Returns NULL if allocation fails. */
+static void *
+svga_create_sampler_state(struct pipe_context *pipe,
+                          const struct pipe_sampler_state *sampler)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_sampler_state *cso = CALLOC_STRUCT( svga_sampler_state );
+
+   if (!cso)
+      return NULL;
+
+   cso->mipfilter = translate_mip_filter(sampler->min_mip_filter);
+   cso->magfilter = translate_img_filter( sampler->mag_img_filter );
+   cso->minfilter = translate_img_filter( sampler->min_img_filter );
+   cso->aniso_level = MAX2( (unsigned) sampler->max_anisotropy, 1 );
+   cso->lod_bias = sampler->lod_bias;
+   cso->addressu = translate_wrap_mode(sampler->wrap_s);
+   cso->addressv = translate_wrap_mode(sampler->wrap_t);
+   cso->addressw = translate_wrap_mode(sampler->wrap_r);
+   cso->normalized_coords = sampler->normalized_coords;
+   cso->compare_mode = sampler->compare_mode;
+   cso->compare_func = sampler->compare_func;
+
+   {
+      ubyte r = float_to_ubyte(sampler->border_color[0]);
+      ubyte g = float_to_ubyte(sampler->border_color[1]);
+      ubyte b = float_to_ubyte(sampler->border_color[2]);
+      ubyte a = float_to_ubyte(sampler->border_color[3]);
+      util_pack_color_ub( r, g, b, a,
+                          PIPE_FORMAT_B8G8R8A8_UNORM,
+                          &cso->bordercolor );
+   }
+
+   /* No SVGA3D support for min/max LOD clamping. */
+   cso->min_lod = 0;
+   cso->view_min_lod = MAX2(sampler->min_lod, 0);
+   cso->view_max_lod = MAX2(sampler->max_lod, 0);
+
+   /* Use min_mipmap when the view range collapses to a single level. */
+   if (svga->debug.use_min_mipmap && cso->view_min_lod == cso->view_max_lod) {
+      cso->min_lod = cso->view_min_lod;
+      cso->view_min_lod = 0;
+      cso->view_max_lod = 1000; /* Just a high number */
+      cso->mipfilter = SVGA3D_TEX_FILTER_NONE;
+   }
+
+   SVGA_DBG(DEBUG_VIEWS, "min %u, view(min %u, max %u) lod, mipfilter %s\n",
+            cso->min_lod, cso->view_min_lod, cso->view_max_lod,
+            cso->mipfilter == SVGA3D_TEX_FILTER_NONE ? "SVGA3D_TEX_FILTER_NONE" : "SOMETHING");
+
+   return cso;
+}
+
+/* Bind 'num' sampler CSOs and clear any previously bound slots beyond 'num'. */
+static void svga_bind_sampler_states(struct pipe_context *pipe,
+                                     unsigned num, void **sampler)
+{
+   struct svga_context *svga = svga_context(pipe);
+   unsigned i;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Check for no-op (trace stays compiled out; flip the 0 when debugging) */
+   if (num == svga->curr.num_samplers &&
+       !memcmp(svga->curr.sampler, sampler, num * sizeof(void *))) {
+      if (0) debug_printf("sampler noop\n");
+      return;
+   }
+
+   for (i = 0; i < num; i++)
+      svga->curr.sampler[i] = sampler[i];
+   for (i = num; i < svga->curr.num_samplers; i++)
+      svga->curr.sampler[i] = NULL;
+
+   svga->curr.num_samplers = num;
+   svga->dirty |= SVGA_NEW_SAMPLER;
+}
+
+/* Free the memory of a sampler CSO; 'pipe' is not consulted. */
+static void
+svga_delete_sampler_state(struct pipe_context *pipe, void *sampler)
+{
+   FREE(sampler);
+}
+
+/* Bind 'num' textures for sampling via pipe_texture_reference() and rebuild
+ * the per-unit sRGB and 1D bitmasks kept in curr.tex_flags. */
+static void svga_set_sampler_textures(struct pipe_context *pipe,
+                                      unsigned num,
+                                      struct pipe_texture **texture)
+{
+   struct svga_context *svga = svga_context(pipe);
+   unsigned srgb_mask = 0;
+   unsigned oned_mask = 0;
+   uint unit;
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   /* Nothing to do when the incoming bindings match the current ones. */
+   if (num == svga->curr.num_textures &&
+       !memcmp(svga->curr.texture, texture, num * sizeof(struct pipe_texture *))) {
+      if (0) debug_printf("texture noop\n");
+      return;
+   }
+
+   for (unit = 0; unit < num; unit++) {
+      struct pipe_texture *tex = texture[unit];
+
+      pipe_texture_reference(&svga->curr.texture[unit], tex);
+      if (tex) {
+         if (tex->format == PIPE_FORMAT_A8R8G8B8_SRGB)
+            srgb_mask |= 1 << unit;
+         if (tex->target == PIPE_TEXTURE_1D)
+            oned_mask |= 1 << unit;
+      }
+   }
+
+   /* Units at or beyond 'num' that were bound before are reset to NULL. */
+   for (unit = num; unit < svga->curr.num_textures; unit++)
+      pipe_texture_reference(&svga->curr.texture[unit], NULL);
+
+   svga->curr.num_textures = num;
+   svga->dirty |= SVGA_NEW_TEXTURE_BINDING;
+
+   /* The flags get their own dirty bit, raised only when a mask changed. */
+   if (srgb_mask != svga->curr.tex_flags.flag_srgb ||
+       oned_mask != svga->curr.tex_flags.flag_1d) {
+      svga->dirty |= SVGA_NEW_TEXTURE_FLAGS;
+      svga->curr.tex_flags.flag_1d = oned_mask;
+      svga->curr.tex_flags.flag_srgb = srgb_mask;
+   }
+}
+
+
+
+/* Install the sampler-state and sampler-texture hooks on the context. */
+void svga_init_sampler_functions( struct svga_context *svga )
+{
+   svga->pipe.set_sampler_textures = svga_set_sampler_textures;
+   svga->pipe.delete_sampler_state = svga_delete_sampler_state;
+   svga->pipe.bind_sampler_states = svga_bind_sampler_states;
+   svga->pipe.create_sampler_state = svga_create_sampler_state;
+}
+
+
diff --git a/src/gallium/drivers/svga/svga_pipe_vertex.c b/src/gallium/drivers/svga/svga_pipe_vertex.c
new file mode 100644
index 00000000000..28e2787e0d3
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_vertex.c
@@ -0,0 +1,115 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "svga_screen.h"
+#include "svga_screen_buffer.h"
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_winsys.h"
+
+#include "svga_hw_reg.h"
+
+
+/* Replace the bound vertex buffers with buffers[0..count), updating the
+ * buffer references slot by slot, and note whether any is a user buffer. */
+static void svga_set_vertex_buffers(struct pipe_context *pipe,
+                                    unsigned count,
+                                    const struct pipe_vertex_buffer *buffers)
+{
+   struct svga_context *svga = svga_context(pipe);
+   boolean saw_user_buffer = FALSE;
+   unsigned slot;
+
+   /* Nothing changed: same count and byte-identical descriptors. */
+   if (count == svga->curr.num_vertex_buffers &&
+       memcmp(svga->curr.vb, buffers, count * sizeof buffers[0]) == 0)
+      return;
+
+   /* NOTE(review): assumes every buffers[i].buffer is non-NULL -- confirm. */
+   for (slot = 0; slot < count; slot++) {
+      pipe_buffer_reference(&svga->curr.vb[slot].buffer, buffers[slot].buffer);
+      if (svga_buffer(buffers[slot].buffer)->user)
+         saw_user_buffer = TRUE;
+   }
+   for (slot = count; slot < svga->curr.num_vertex_buffers; slot++)
+      pipe_buffer_reference(&svga->curr.vb[slot].buffer, NULL);
+
+   /* Copy the remaining fields of each descriptor. */
+   memcpy(svga->curr.vb, buffers, count * sizeof buffers[0]);
+   svga->curr.num_vertex_buffers = count;
+   svga->curr.any_user_vertex_buffers = saw_user_buffer;
+   svga->dirty |= SVGA_NEW_VBUFFER;
+}
+
+/* Copy 'count' vertex element descriptors into the context; flag them dirty. */
+static void svga_set_vertex_elements(struct pipe_context *pipe,
+                                     unsigned count,
+                                     const struct pipe_vertex_element *elements)
+{
+   struct svga_context *svga = svga_context(pipe);
+   unsigned elem;
+
+   for (elem = 0; elem < count; elem++)
+      svga->curr.ve[elem] = elements[elem];
+   svga->curr.num_vertex_elements = count;
+   svga->dirty |= SVGA_NEW_VELEMENT;
+}
+
+
+/* Record the edge-flag bitfield pointer; a NULL -> NULL update is a no-op. */
+static void svga_set_edgeflags(struct pipe_context *pipe,
+                               const unsigned *bitfield)
+{
+   struct svga_context *svga = svga_context(pipe);
+
+   if (bitfield == NULL && svga->curr.edgeflags == NULL)
+      return;
+   svga->curr.edgeflags = bitfield;
+   svga->dirty |= SVGA_NEW_EDGEFLAGS;
+}
+
+/* Reset the buffer pointer of every currently bound vertex buffer to NULL. */
+void svga_cleanup_vertex_state( struct svga_context *svga )
+{
+   unsigned slot;
+
+   for (slot = 0; slot < svga->curr.num_vertex_buffers; slot++)
+      pipe_buffer_reference(&svga->curr.vb[slot].buffer, NULL);
+}
+
+/* Install the vertex buffer, vertex element and edge-flag hooks. */
+void svga_init_vertex_functions( struct svga_context *svga )
+{
+   svga->pipe.set_edgeflags = svga_set_edgeflags;
+   svga->pipe.set_vertex_elements = svga_set_vertex_elements;
+   svga->pipe.set_vertex_buffers = svga_set_vertex_buffers;
+}
+
diff --git a/src/gallium/drivers/svga/svga_pipe_vs.c b/src/gallium/drivers/svga/svga_pipe_vs.c
new file mode 100644
index 00000000000..e5ffe668c35
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_vs.c
@@ -0,0 +1,189 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_context.h"
+#include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_text.h"
+
+#include "svga_screen.h"
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_tgsi.h"
+#include "svga_hw_reg.h"
+#include "svga_cmd.h"
+#include "svga_debug.h"
+
+
+/* Debug hook: with the #if 0 section disabled it just returns old_tokens. */
+static const struct tgsi_token *substitute_vs( 
+   unsigned shader_id,
+   const struct tgsi_token *old_tokens )
+{
+#if 0
+   if (shader_id == 12) {
+   static struct tgsi_token tokens[300];
+
+   const char *text = 
+      "VERT1.1\n"
+      "DCL IN[0]\n"
+      "DCL IN[1]\n"
+      "DCL IN[2]\n"
+      "DCL OUT[0], POSITION\n"
+      "DCL TEMP[0..4]\n"
+      "IMM FLT32 {     1.0000,     1.0000,     1.0000,     1.0000 }\n"
+      "IMM FLT32 {     0.45,     1.0000,     1.0000,     1.0000 }\n"
+      "IMM FLT32 { 1.297863, 0.039245, 0.035993, 0.035976}\n"
+      "IMM FLT32 { -0.019398, 1.696131, -0.202151, -0.202050  }\n"
+      "IMM FLT32 { 0.051711, -0.348713, -0.979204, -0.978714  }\n"
+      "IMM FLT32 { 0.000000, 0.000003, 139.491577, 141.421356 }\n"
+      "DCL CONST[0..7]\n"
+      "DCL CONST[9..16]\n"
+      "  MOV TEMP[2], IMM[0]\n"
+
+      "  MOV TEMP[2].xyz, IN[2]\n"
+      "  MOV TEMP[2].xyz, IN[0]\n"
+      "  MOV TEMP[2].xyz, IN[1]\n"
+
+      "  MUL TEMP[1], IMM[3], TEMP[2].yyyy\n"
+      "  MAD TEMP[3], IMM[2],  TEMP[2].xxxx, TEMP[1]\n"
+      "  MAD TEMP[1], IMM[4], TEMP[2].zzzz, TEMP[3]\n"
+      "  MAD TEMP[4], IMM[5], TEMP[2].wwww, TEMP[1]\n"
+
+      "  MOV OUT[0], TEMP[4]\n"
+      "  END\n";
+
+   if (!tgsi_text_translate( text,
+                             tokens,
+                             Elements(tokens) ))
+   {
+      assert(0);
+      return NULL;
+   }
+
+   return tokens;
+   }
+#endif
+
+   return old_tokens;
+}
+
+/***********************************************************************
+ * Vertex shaders 
+ */
+
+/* Create a vertex shader CSO: duplicate (or substitute) the TGSI tokens,
+ * scan them, and create the matching draw-module shader.  NULL on failure. */
+static void *
+svga_create_vs_state(struct pipe_context *pipe,
+                     const struct pipe_shader_state *templ)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_screen *svgascreen = svga_screen(pipe->screen);
+   struct svga_vertex_shader *vs = CALLOC_STRUCT(svga_vertex_shader);
+   if (!vs)
+      return NULL;
+
+   /* substitute a debug shader? */
+   vs->base.tokens = tgsi_dup_tokens(substitute_vs(svga->debug.shader_id,
+                                                   templ->tokens));
+   if (!vs->base.tokens) {
+      FREE(vs);
+      return NULL;
+   }
+
+   /* Collect basic info that we'll need later: */
+   tgsi_scan_shader(vs->base.tokens, &vs->base.info);
+
+   {
+      /* Build a new template in case a debug shader was substituted. */
+      struct pipe_shader_state tmp2 = *templ;
+      tmp2.tokens = vs->base.tokens;
+      vs->draw_shader = draw_create_vertex_shader(svga->swtnl.draw, &tmp2);
+   }
+
+   vs->base.id = svga->debug.shader_id++;
+   vs->base.use_sm30 = svgascreen->use_vs30;
+
+   if (SVGA_DEBUG & DEBUG_TGSI || 0) {
+      debug_printf("%s id: %u, inputs: %u, outputs: %u\n",
+                   __FUNCTION__, vs->base.id,
+                   vs->base.info.num_inputs, vs->base.info.num_outputs);
+   }
+   return vs;
+}
+
+/* Record 'shader' as the current vertex shader CSO and set the SVGA_NEW_VS
+ * dirty bit. */
+static void svga_bind_vs_state(struct pipe_context *pipe, void *shader)
+{
+   struct svga_context *svga = svga_context(pipe);
+
+   svga->curr.vs = (struct svga_vertex_shader *)shader;
+   svga->dirty |= SVGA_NEW_VS;
+}
+
+/* Destroy a vertex shader CSO: flush hwtnl, delete the draw shader, issue
+ * SVGA3D_DestroyShader for every entry of the results list (retrying once
+ * after a context flush), then free the tokens and the CSO itself. */
+static void svga_delete_vs_state(struct pipe_context *pipe, void *shader)
+{
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_vertex_shader *vs = (struct svga_vertex_shader *)shader;
+   struct svga_shader_result *result;
+   enum pipe_error ret;
+
+   svga_hwtnl_flush_retry( svga );
+   draw_delete_vertex_shader(svga->swtnl.draw, vs->draw_shader);
+
+   result = vs->base.results;
+   while (result) {
+      /* Grab the successor first: 'result' is destroyed below. */
+      struct svga_shader_result *next = result->next;
+      ret = SVGA3D_DestroyShader(svga->swc, result->id, SVGA3D_SHADERTYPE_VS);
+      if (ret != PIPE_OK) {
+         svga_context_flush(svga, NULL);
+         ret = SVGA3D_DestroyShader(svga->swc, result->id,
+                                    SVGA3D_SHADERTYPE_VS);
+         assert(ret == PIPE_OK);
+      }
+
+      svga_destroy_shader_result( result );
+      result = next;
+   }
+
+   FREE((void *)vs->base.tokens);
+   FREE(vs);
+}
+
+/* Install the vertex shader hooks (create, bind, delete) on the context. */
+void svga_init_vs_functions( struct svga_context *svga )
+{
+   svga->pipe.delete_vs_state = svga_delete_vs_state;
+   svga->pipe.bind_vs_state = svga_bind_vs_state;
+   svga->pipe.create_vs_state = svga_create_vs_state;
+}
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
new file mode 100644
index 00000000000..fc1b3c980ef
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -0,0 +1,440 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "util/u_string.h"
+#include "util/u_math.h"
+
+#include "svga_winsys.h"
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_screen_texture.h"
+#include "svga_screen_buffer.h"
+#include "svga_cmd.h"
+#include "svga_debug.h"
+
+#include "svga_hw_reg.h"
+#include "svga3d_shaderdefs.h"
+
+
+#ifdef DEBUG
+int SVGA_DEBUG = 0;
+/* Names accepted by the SVGA_DEBUG option parsed in svga_screen_create(). */
+static const struct debug_named_value svga_debug_flags[] = {
+   { "dma",      DEBUG_DMA },
+   { "tgsi",     DEBUG_TGSI },
+   { "pipe",     DEBUG_PIPE },
+   { "state",    DEBUG_STATE },
+   { "screen",   DEBUG_SCREEN },
+   { "tex",      DEBUG_TEX },
+   { "swtnl",    DEBUG_SWTNL },
+   { "const",    DEBUG_CONSTS },
+   { "viewport", DEBUG_VIEWPORT },
+   { "views",    DEBUG_VIEWS },
+   { "perf",     DEBUG_PERF },
+   { "flush",    DEBUG_FLUSH },
+   { "sync",     DEBUG_SYNC },
+   { "cache",    DEBUG_CACHE },
+   {NULL, 0}
+};
+#endif
+
+/* Return the constant vendor string; 'pscreen' is not consulted. */
+static const char *svga_get_vendor( struct pipe_screen *pscreen )
+{
+   return "VMware, Inc.";
+}
+
+
+/* Return the constant driver name string; its text depends on DEBUG. */
+static const char *
+svga_get_name( struct pipe_screen *pscreen )
+{
+#ifdef DEBUG
+   /* Only the DEBUG version returns internal details: */
+   return "SVGA3D; build: DEBUG; mutex: " PIPE_ATOMIC;
+#else
+   return "SVGA3D; build: RELEASE; ";
+#endif
+}
+
+
+
+
+/* Report a capability or limit as a float; params not listed here yield 0. */
+static float
+svga_get_paramf(struct pipe_screen *screen, int param)
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   SVGA3dDevCapResult result;
+
+   switch (param) {
+   case PIPE_CAP_MAX_LINE_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_LINE_WIDTH_AA:
+      return 7.0;
+
+   case PIPE_CAP_MAX_POINT_WIDTH:
+      /* fall-through */
+   case PIPE_CAP_MAX_POINT_WIDTH_AA:
+      /* Keep this to a reasonable size to avoid failures in
+       * conform/pntaa.c:
+       */
+      return 80.0;
+
+   case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+      return 4.0;
+
+   case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+      return 16.0;
+
+   case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+      return 16;
+   case PIPE_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+      return 1;
+   case PIPE_CAP_GLSL:
+      return svgascreen->use_ps30 && svgascreen->use_vs30;
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+      return 1;
+   case PIPE_CAP_POINT_SPRITE:
+      return 1;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      if(!sws->get_cap(sws, SVGA3D_DEVCAP_MAX_RENDER_TARGETS, &result))
+         return 1;   /* devcap query failed */
+      if(!result.u)
+         return 1;   /* devcap reported zero */
+      return MIN2(result.u, PIPE_MAX_COLOR_BUFS);
+   case PIPE_CAP_OCCLUSION_QUERY:
+      return 1;
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+      return 1;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+      return SVGA_MAX_TEXTURE_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 8;  /* max 128x128x128 */
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return SVGA_MAX_TEXTURE_LEVELS;
+
+   case PIPE_CAP_TEXTURE_MIRROR_REPEAT: /* req. for GL 1.4 */
+      return 1;
+
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE: /* req. for GL 1.5 */
+      return 1;
+
+   default:
+      return 0;
+   }
+}
+
+/* Integer flavour of svga_get_paramf(): same lookup, result cast to int. */
+static int
+svga_get_param(struct pipe_screen *screen, int param)
+{
+   const float value = svga_get_paramf( screen, param );
+   return (int) value;
+}
+
+
+/* Map a pipe format to the SVGA3D_DEVCAP_SURFACEFMT_* index queried for it;
+ * formats without an entry here yield SVGA3D_DEVCAP_MAX. */
+static INLINE SVGA3dDevCapIndex
+svga_translate_format_cap(enum pipe_format format)
+{
+   switch(format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_A8R8G8B8;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_X8R8G8B8;
+
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_R5G6B5;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_A1R5G5B5;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_A4R4G4B4;
+
+   case PIPE_FORMAT_Z16_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_Z_D16;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_Z_D24S8;
+   case PIPE_FORMAT_Z24X8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_Z_D24X8;
+
+   case PIPE_FORMAT_A8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_ALPHA8;
+   case PIPE_FORMAT_L8_UNORM:
+      return SVGA3D_DEVCAP_SURFACEFMT_LUMINANCE8;
+
+   case PIPE_FORMAT_DXT1_RGB:
+   case PIPE_FORMAT_DXT1_RGBA:
+      return SVGA3D_DEVCAP_SURFACEFMT_DXT1;
+   case PIPE_FORMAT_DXT3_RGBA:
+      return SVGA3D_DEVCAP_SURFACEFMT_DXT3;
+   case PIPE_FORMAT_DXT5_RGBA:
+      return SVGA3D_DEVCAP_SURFACEFMT_DXT5;
+
+   default:
+      return SVGA3D_DEVCAP_MAX;
+   }
+}
+
+/* Report whether 'format' can be used with the requested texture usage.
+ * Order of checks: driver overrides for render targets, then the host's
+ * devcap for the format, then the driver's own format translation functions. */
+static boolean
+svga_is_format_supported( struct pipe_screen *screen,
+                          enum pipe_format format,
+                          enum pipe_texture_target target,
+                          unsigned tex_usage,
+                          unsigned geom_flags )
+{
+   struct svga_winsys_screen *sws = svga_screen(screen)->sws;
+   SVGA3dDevCapResult result;
+   SVGA3dDevCapIndex index;
+
+   assert(tex_usage);
+
+   /* Override host capabilities */
+   if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+      switch(format) {
+      case PIPE_FORMAT_A4R4G4B4_UNORM:
+      case PIPE_FORMAT_A1R5G5B5_UNORM:
+         /* Often unsupported/problematic. This means we end up with the
+          * same visuals for all virtual hardware implementations.
+          */
+         return FALSE;
+
+      case PIPE_FORMAT_DXT1_RGB:
+      case PIPE_FORMAT_DXT1_RGBA:
+      case PIPE_FORMAT_DXT3_RGBA:
+      case PIPE_FORMAT_DXT5_RGBA:
+         /* Simulate ability to render into compressed textures */
+         return TRUE;
+
+      default:
+         break;
+      }
+   }
+
+   /* Try to query the host */
+   index = svga_translate_format_cap(format);
+   if (index < SVGA3D_DEVCAP_MAX &&
+       sws->get_cap(sws, index, &result)) {
+      SVGA3dSurfaceFormatCaps wanted;
+
+      wanted.value = 0;
+      if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET)
+         wanted.offscreenRenderTarget = 1;
+      if (tex_usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL)
+         wanted.zStencil = 1;
+      if (tex_usage & PIPE_TEXTURE_USAGE_SAMPLER)
+         wanted.texture = 1;
+
+      /* Supported only if the host reports every wanted capability bit. */
+      return (result.u & wanted.value) == wanted.value ? TRUE : FALSE;
+   }
+
+   /* Use our translate functions directly rather than relying on a
+    * duplicated list of supported formats which is prone to getting
+    * out of sync:
+    */
+   if (tex_usage & (PIPE_TEXTURE_USAGE_RENDER_TARGET |
+                    PIPE_TEXTURE_USAGE_DEPTH_STENCIL))
+      return svga_translate_format_render(format) != SVGA3D_FORMAT_INVALID;
+
+   return svga_translate_format(format) != SVGA3D_FORMAT_INVALID;
+}
+
+
+/* Forward the fence_reference call to the winsys screen. */
+static void
+svga_fence_reference(struct pipe_screen *screen,
+                     struct pipe_fence_handle **ptr,
+                     struct pipe_fence_handle *fence)
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   svgascreen->sws->fence_reference(svgascreen->sws, ptr, fence);
+}
+
+/* Forward the fence_signalled query to the winsys screen. */
+static int
+svga_fence_signalled(struct pipe_screen *screen,
+                     struct pipe_fence_handle *fence,
+                     unsigned flag)
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   return svgascreen->sws->fence_signalled(svgascreen->sws, fence, flag);
+}
+
+/* Forward fence_finish to the winsys screen after emitting a debug trace. */
+static int
+svga_fence_finish(struct pipe_screen *screen,
+                  struct pipe_fence_handle *fence,
+                  unsigned flag)
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+
+   SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s fence_ptr %p\n",
+            __FUNCTION__, fence);
+
+   return svgascreen->sws->fence_finish(svgascreen->sws, fence, flag);
+}
+
+/* Tear down the screen: clean up the cache, destroy both mutexes, destroy
+ * the winsys context (swc) and the winsys screen (sws), then free it. */
+static void
+svga_destroy_screen( struct pipe_screen *screen )
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   
+   svga_screen_cache_cleanup(svgascreen);
+   pipe_mutex_destroy(svgascreen->swc_mutex);
+   pipe_mutex_destroy(svgascreen->tex_mutex);
+
+   svgascreen->swc->destroy(svgascreen->swc);
+   svgascreen->sws->destroy(svgascreen->sws);
+   
+   FREE(svgascreen);
+}
+
+
+/**
+ * Create a new svga_screen object; NULL on allocation/SM3.0/context failure.
+ */
+struct pipe_screen *
+svga_screen_create(struct svga_winsys_screen *sws)
+{
+   struct svga_screen *svgascreen;
+   struct pipe_screen *screen;
+   SVGA3dDevCapResult result;
+
+#ifdef DEBUG
+   SVGA_DEBUG = debug_get_flags_option("SVGA_DEBUG", svga_debug_flags, 0 );
+#endif
+
+   svgascreen = CALLOC_STRUCT(svga_screen);
+   if (!svgascreen)
+      goto error1;
+
+   svgascreen->debug.force_level_surface_view =
+      debug_get_bool_option("SVGA_FORCE_LEVEL_SURFACE_VIEW", FALSE);
+   svgascreen->debug.force_surface_view =
+      debug_get_bool_option("SVGA_FORCE_SURFACE_VIEW", FALSE);
+   svgascreen->debug.force_sampler_view =
+      debug_get_bool_option("SVGA_FORCE_SAMPLER_VIEW", FALSE);
+   svgascreen->debug.no_surface_view =
+      debug_get_bool_option("SVGA_NO_SURFACE_VIEW", FALSE);
+   svgascreen->debug.no_sampler_view =
+      debug_get_bool_option("SVGA_NO_SAMPLER_VIEW", FALSE);
+
+   screen = &svgascreen->screen;
+
+   screen->destroy = svga_destroy_screen;
+   screen->get_name = svga_get_name;
+   screen->get_vendor = svga_get_vendor;
+   screen->get_param = svga_get_param;
+   screen->get_paramf = svga_get_paramf;
+   screen->is_format_supported = svga_is_format_supported;
+   screen->fence_reference = svga_fence_reference;
+   screen->fence_signalled = svga_fence_signalled;
+   screen->fence_finish = svga_fence_finish;
+   svgascreen->sws = sws;
+
+   svga_screen_init_texture_functions(screen);
+   svga_screen_init_buffer_functions(screen);
+
+   svgascreen->use_ps30 =
+      sws->get_cap(sws, SVGA3D_DEVCAP_FRAGMENT_SHADER_VERSION, &result) &&
+      result.u >= SVGA3DPSVERSION_30 ? TRUE : FALSE;
+
+   svgascreen->use_vs30 =
+      sws->get_cap(sws, SVGA3D_DEVCAP_VERTEX_SHADER_VERSION, &result) &&
+      result.u >= SVGA3DVSVERSION_30 ? TRUE : FALSE;
+
+#if 1
+   /* Shader model 2.0 is unsupported at the moment. */
+   if(!svgascreen->use_ps30 || !svgascreen->use_vs30)
+      goto error2;
+#else
+   if(debug_get_bool_option("SVGA_NO_SM30", FALSE))
+      svgascreen->use_vs30 = svgascreen->use_ps30 = FALSE;
+#endif
+
+   svgascreen->swc = sws->context_create(sws);
+   if(!svgascreen->swc)
+      goto error2;
+
+   pipe_mutex_init(svgascreen->tex_mutex);
+   pipe_mutex_init(svgascreen->swc_mutex);
+
+   LIST_INITHEAD(&svgascreen->cached_buffers);
+   
+   svga_screen_cache_init(svgascreen);
+
+   return screen;
+error2:   /* screen struct allocated, nothing else to undo */
+   FREE(svgascreen);
+error1:
+   return NULL;
+}
+
+/* Flush the screen's winsys context under swc_mutex, pass the resulting
+ * fence to the cache, then store it in *pfence or reset our local handle. */
+void svga_screen_flush( struct svga_screen *svgascreen,
+                        struct pipe_fence_handle **pfence )
+{
+   struct pipe_fence_handle *fence = NULL;
+
+   SVGA_DBG(DEBUG_PERF, "%s\n", __FUNCTION__);
+   pipe_mutex_lock(svgascreen->swc_mutex);
+   svgascreen->swc->flush(svgascreen->swc, &fence);
+   pipe_mutex_unlock(svgascreen->swc_mutex);
+   svga_screen_cache_flush(svgascreen, fence);
+
+   if (!pfence)
+      svgascreen->sws->fence_reference(svgascreen->sws, &fence, NULL);
+   else
+      *pfence = fence;
+}
+
+/* Return the winsys screen (sws) stored in the svga_screen behind 'screen'. */
+struct svga_winsys_screen *svga_winsys_screen(struct pipe_screen *screen)
+{
+   return svga_screen(screen)->sws;
+}
+
+#ifdef DEBUG
+/* Checked downcast: asserts the destroy hook is svga_destroy_screen. */
+struct svga_screen *svga_screen(struct pipe_screen *screen)
+{
+   assert(screen);
+   assert(screen->destroy == svga_destroy_screen);
+   return (struct svga_screen *)screen;
+}
+#endif
diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h
new file mode 100644
index 00000000000..b94ca7fc1ca
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen.h
@@ -0,0 +1,95 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SCREEN_H
+#define SVGA_SCREEN_H
+
+
+#include "pipe/p_screen.h"
+#include "pipe/p_thread.h"
+
+#include "util/u_double_list.h"
+
+#include "svga_screen_cache.h"
+
+
+struct svga_winsys_screen;
+struct svga_winsys_context;
+struct SVGACmdMemory;
+
+#define SVGA_COMBINE_USERBUFFERS 1
+
+/**
+ * Subclass of pipe_screen
+ */
+struct svga_screen
+{
+   struct pipe_screen screen; /* base class; svga_screen() casts a pipe_screen pointer to this struct */
+   struct svga_winsys_screen *sws;
+
+   unsigned use_ps30; /* set when the device reports fragment shader version >= 3.0 */
+   unsigned use_vs30; /* set when the device reports vertex shader version >= 3.0 */
+   
+   struct {
+      boolean force_level_surface_view;
+      boolean force_surface_view;
+      boolean no_surface_view;
+      boolean force_sampler_view;
+      boolean no_sampler_view;
+   } debug;
+
+   /* The screen needs its own context */
+   struct svga_winsys_context *swc;
+   struct SVGACmdMemory *fifo;
+
+   unsigned texture_timestamp;
+   pipe_mutex tex_mutex; 
+   pipe_mutex swc_mutex; /* Protects the use of swc and dirty_buffers */
+   
+   /** 
+    * List of buffers with cached GMR. Ordered from the most recently used to
+    * the least recently used 
+    */
+   struct list_head cached_buffers;
+   
+   struct svga_host_surface_cache cache;
+};
+
+#ifndef DEBUG
+/** Unchecked cast from a pipe_screen to its svga_screen subclass (non-DEBUG builds). */
+static INLINE struct svga_screen *
+svga_screen(struct pipe_screen *pscreen)
+{
+   return (struct svga_screen *) pscreen;
+}
+#else /* DEBUG: only declared here; the checked version is defined out of line */
+struct svga_screen *
+svga_screen(struct pipe_screen *screen);
+#endif
+
+void svga_screen_flush( struct svga_screen *svga_screen, 
+                        struct pipe_fence_handle **pfence );
+
+#endif /* SVGA_SCREEN_H */
diff --git a/src/gallium/drivers/svga/svga_screen_buffer.c b/src/gallium/drivers/svga/svga_screen_buffer.c
new file mode 100644
index 00000000000..1f8a8896723
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen_buffer.c
@@ -0,0 +1,824 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_thread.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_screen_buffer.h"
+#include "svga_winsys.h"
+#include "svga_debug.h"
+
+
+/**
+ * Tell whether a buffer with these PIPE_BUFFER_USAGE_x flags needs a host
+ * surface: vertex and index buffers do, because the SVGA device sees them as
+ * surfaces and the state tracker can create/destroy them without the pipe
+ * driver, so the uploads must be done from the vws. Returns strictly 0 or 1.
+ */
+static INLINE boolean
+svga_buffer_needs_hw_storage(unsigned usage)
+{
+   return (usage & (PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_INDEX)) != 0;
+}
+
+
+static INLINE enum pipe_error
+svga_buffer_create_host_surface(struct svga_screen *ss,
+                                struct svga_buffer *sbuf)
+{
+   /* The buffer keeps at most one host surface; reuse it when present. */
+   if(sbuf->handle)
+      return PIPE_OK;
+
+   /* Start from clean flags and translate the usage bits into surface hints. */
+   sbuf->key.flags = 0;
+   sbuf->key.format = SVGA3D_BUFFER;
+   if(sbuf->base.usage & PIPE_BUFFER_USAGE_VERTEX)
+      sbuf->key.flags |= SVGA3D_SURFACE_HINT_VERTEXBUFFER;
+   if(sbuf->base.usage & PIPE_BUFFER_USAGE_INDEX)
+      sbuf->key.flags |= SVGA3D_SURFACE_HINT_INDEXBUFFER;
+
+   /* A buffer is keyed as a (size x 1 x 1) surface with one face and level. */
+   sbuf->key.size.width = sbuf->base.size;
+   sbuf->key.size.height = 1;
+   sbuf->key.size.depth = 1;
+   sbuf->key.numFaces = 1;
+   sbuf->key.numMipLevels = 1;
+   sbuf->key.cachable = 1;
+
+   SVGA_DBG(DEBUG_DMA, "surface_create for buffer sz %d\n", sbuf->base.size);
+   sbuf->handle = svga_screen_surface_create(ss, &sbuf->key);
+   if(!sbuf->handle)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* svga_screen_surface_create might have passed back a recycled host
+    * buffer, so the first write must always carry the discard flag.
+    */
+   sbuf->hw.flags.discard = TRUE;
+
+   SVGA_DBG(DEBUG_DMA, "   --> got sid %p sz %d (buffer)\n", sbuf->handle, sbuf->base.size);
+
+   return PIPE_OK;
+}
+
+
+static INLINE void
+svga_buffer_destroy_host_surface(struct svga_screen *ss,
+                                 struct svga_buffer *sbuf)
+{
+   if(!sbuf->handle)
+      return; /* no host surface attached, nothing to hand back */
+   SVGA_DBG(DEBUG_DMA, " ungrab sid %p sz %d\n", sbuf->handle, sbuf->base.size);
+   svga_screen_surface_destroy(ss, &sbuf->key, &sbuf->handle);
+}
+
+
+static INLINE void
+svga_buffer_destroy_hw_storage(struct svga_screen *ss, struct svga_buffer *sbuf)
+{
+   struct svga_winsys_screen *sws = ss->sws;
+   /* Release hw.buf and unlink the buffer from the list it is on; must not be mapped. */
+   assert(!sbuf->map.count);
+   assert(sbuf->hw.buf);
+   if(sbuf->hw.buf) {
+      sws->buffer_destroy(sws, sbuf->hw.buf);
+      sbuf->hw.buf = NULL;
+      assert(sbuf->head.prev && sbuf->head.next);
+      LIST_DEL(&sbuf->head);
+#ifdef DEBUG
+      sbuf->head.next = sbuf->head.prev = NULL; /* keeps the "not linked" asserts on head meaningful */
+#endif
+   }
+}
+
+static INLINE enum pipe_error
+svga_buffer_backup(struct svga_screen *ss, struct svga_buffer *sbuf)
+{
+   /* Save hw.buf into swbuf, but only while uploads are still queued. */
+   if (sbuf->hw.buf && sbuf->hw.num_ranges) {
+      void *src = ss->sws->buffer_map(ss->sws, sbuf->hw.buf,
+                                      PIPE_BUFFER_USAGE_CPU_READ);
+      if (!src)
+         return PIPE_ERROR;
+      /* Allocate after mapping so a failed map leaves no unfilled swbuf. */
+      if (!sbuf->swbuf)
+         sbuf->swbuf = align_malloc(sbuf->base.size, sbuf->base.alignment);
+      if (!sbuf->swbuf) {
+         ss->sws->buffer_unmap(ss->sws, sbuf->hw.buf);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+      memcpy(sbuf->swbuf, src, sbuf->base.size);
+      ss->sws->buffer_unmap(ss->sws, sbuf->hw.buf);
+   }
+
+   return PIPE_OK;
+}
+
+/**
+ * Try to make GMR space available by freeing the hardware storage of one
+ * unmapped cached buffer. Returns TRUE if storage was freed, FALSE otherwise.
+ */
+boolean
+svga_buffer_free_cached_hw_storage(struct svga_screen *ss)
+{
+   struct list_head *curr;
+   struct svga_buffer *sbuf;
+   enum pipe_error ret = PIPE_OK;
+
+   curr = ss->cached_buffers.prev;
+   
+   /* free the least recently used buffer's hw storage which is not mapped */
+   do {
+      if(curr == &ss->cached_buffers)
+         return FALSE;
+
+      sbuf = LIST_ENTRY(struct svga_buffer, curr, head);
+      /* Advance now so the loop condition only has to look at sbuf and ret. */
+      curr = curr->prev;
+      if (sbuf->map.count == 0)
+	 ret = svga_buffer_backup(ss, sbuf);
+
+   } while(sbuf->map.count != 0 || ret != PIPE_OK);
+   /* sbuf is unmapped and svga_buffer_backup() succeeded for it. */
+   svga_buffer_destroy_hw_storage(ss, sbuf);
+   
+   return TRUE;
+}
+
+struct svga_winsys_buffer *
+svga_winsys_buffer_create( struct svga_screen *ss,
+                           unsigned alignment, 
+                           unsigned usage,
+                           unsigned size )
+{
+   struct svga_winsys_screen *sws = ss->sws;
+   struct svga_winsys_buffer *buf;
+
+   /* Fast path: just try. */
+   buf = sws->buffer_create(sws, alignment, usage, size);
+   if(!buf) {
+      /* First fallback: flush all pending DMAs through the screen context,
+       * then retry. */
+      SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "flushing screen to find %d bytes GMR\n", 
+               size); 
+      svga_screen_flush(ss, NULL);
+      buf = sws->buffer_create(sws, alignment, usage, size);
+   }
+
+   if(!buf) {
+      /* Last resort: evict cached hw storage one buffer at a time. */
+      SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "evicting buffers to find %d bytes GMR\n", 
+               size);
+      while(!buf && svga_buffer_free_cached_hw_storage(ss))
+         buf = sws->buffer_create(sws, alignment, usage, size);
+   }
+
+   return buf; /* NULL when every attempt failed */
+}
+
+
+/**
+ * Make sure the buffer has DMA'ble storage (hw.buf).
+ *
+ * Called before mapping a buffer. Does nothing when the storage exists.
+ */
+static INLINE enum pipe_error
+svga_buffer_create_hw_storage(struct svga_screen *ss,
+                              struct svga_buffer *sbuf)
+{
+   if(sbuf->hw.buf)
+      return PIPE_OK;
+
+   /* The winsys usage argument is always 0 here. */
+   sbuf->hw.buf = svga_winsys_buffer_create(ss, sbuf->base.alignment, 0,
+                                            sbuf->base.size);
+   if(!sbuf->hw.buf)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* A buffer without hw storage must not be linked into any list yet. */
+   assert(!sbuf->needs_flush);
+   assert(!sbuf->head.prev && !sbuf->head.next);
+   LIST_ADD(&sbuf->head, &ss->cached_buffers);
+
+   return PIPE_OK;
+}
+
+
+/**
+ * Variant of SVGA3D_BufferDMA which reserves the copy boxes but leaves them blank.
+ */
+static enum pipe_error
+svga_buffer_upload_command(struct svga_context *svga,
+                           struct svga_buffer *sbuf)
+{
+   struct svga_winsys_context *swc = svga->swc;
+   struct svga_winsys_buffer *guest = sbuf->hw.buf;
+   struct svga_winsys_surface *host = sbuf->handle;
+   SVGA3dTransferType transfer = SVGA3D_WRITE_HOST_VRAM;
+   SVGA3dSurfaceDMAFlags flags = sbuf->hw.flags;
+   SVGA3dCmdSurfaceDMA *cmd;
+   uint32 numBoxes = sbuf->hw.num_ranges;
+   SVGA3dCopyBox *boxes;
+   SVGA3dCmdSurfaceDMASuffix *pSuffix;
+   unsigned region_flags;
+   unsigned surface_flags;
+   struct pipe_buffer *dummy;
+
+   if(transfer == SVGA3D_WRITE_HOST_VRAM) {
+      region_flags = PIPE_BUFFER_USAGE_GPU_READ;
+      surface_flags = PIPE_BUFFER_USAGE_GPU_WRITE;
+   }
+   else if(transfer == SVGA3D_READ_HOST_VRAM) {
+      region_flags = PIPE_BUFFER_USAGE_GPU_WRITE;
+      surface_flags = PIPE_BUFFER_USAGE_GPU_READ;
+   }
+   else {
+      assert(0);
+      return PIPE_ERROR_BAD_INPUT;
+   }
+
+   assert(numBoxes);
+   /* Reserve the header, one copy box per queued range and the suffix (2 relocations). */
+   cmd = SVGA3D_FIFOReserve(swc,
+                            SVGA_3D_CMD_SURFACE_DMA,
+                            sizeof *cmd + numBoxes * sizeof *boxes + sizeof *pSuffix,
+                            2);
+   if(!cmd)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   swc->region_relocation(swc, &cmd->guest.ptr, guest, 0, region_flags);
+   cmd->guest.pitch = 0;
+
+   swc->surface_relocation(swc, &cmd->host.sid, host, surface_flags);
+   cmd->host.face = 0;
+   cmd->host.mipmap = 0;
+
+   cmd->transfer = transfer;
+   /* Remember where the boxes live; svga_buffer_upload_flush() fills them in. */
+   sbuf->hw.boxes = (SVGA3dCopyBox *)&cmd[1];
+   sbuf->hw.svga = svga;
+
+   /* Increment reference count */
+   dummy = NULL;
+   pipe_buffer_reference(&dummy, &sbuf->base);
+   /* The suffix sits right after the (still blank) boxes. */
+   pSuffix = (SVGA3dCmdSurfaceDMASuffix *)((uint8_t*)cmd + sizeof *cmd + numBoxes * sizeof *boxes);
+   pSuffix->suffixSize = sizeof *pSuffix;
+   pSuffix->maximumOffset = sbuf->base.size;
+   pSuffix->flags = flags;
+
+   swc->commit(swc);
+
+   return PIPE_OK;
+}
+
+
+/**
+ * Patch up the upload DMA command reserved by svga_buffer_upload_command
+ * with the final ranges.
+ */
+static void
+svga_buffer_upload_flush(struct svga_context *svga,
+                         struct svga_buffer *sbuf)
+{
+   struct svga_screen *ss = svga_screen(svga->pipe.screen);
+   SVGA3dCopyBox *boxes;
+   unsigned i;
+
+   assert(sbuf->handle); 
+   assert(sbuf->hw.buf);
+   assert(sbuf->hw.num_ranges);
+   assert(sbuf->hw.svga == svga);
+   assert(sbuf->hw.boxes);
+   
+   /*
+    * Patch the DMA command with the final copy box.
+    */
+
+   SVGA_DBG(DEBUG_DMA, "dma to sid %p\n", sbuf->handle);
+
+   boxes = sbuf->hw.boxes;
+   for(i = 0; i < sbuf->hw.num_ranges; ++i) {
+      SVGA_DBG(DEBUG_DMA, "  bytes %u - %u\n",
+               sbuf->hw.ranges[i].start, sbuf->hw.ranges[i].end);
+      /* One-dimensional box: x and srcx both use the start of the range. */
+      boxes[i].x = sbuf->hw.ranges[i].start;
+      boxes[i].y = 0;
+      boxes[i].z = 0;
+      boxes[i].w = sbuf->hw.ranges[i].end - sbuf->hw.ranges[i].start;
+      boxes[i].h = 1;
+      boxes[i].d = 1;
+      boxes[i].srcx = sbuf->hw.ranges[i].start;
+      boxes[i].srcy = 0;
+      boxes[i].srcz = 0;
+   }
+   /* Everything queued is now in the command; reset the pending state. */
+   sbuf->hw.num_ranges = 0;
+   memset(&sbuf->hw.flags, 0, sizeof sbuf->hw.flags);
+   /* Re-link the buffer from its current list into the screen's cached list. */
+   assert(sbuf->head.prev && sbuf->head.next);
+   LIST_DEL(&sbuf->head);
+   sbuf->needs_flush = FALSE;
+   /* XXX: do we care about cached_buffers any more ?*/
+   LIST_ADD(&sbuf->head, &ss->cached_buffers);
+
+   sbuf->hw.svga = NULL;
+   sbuf->hw.boxes = NULL;
+
+   /* Decrement reference count (taken in svga_buffer_upload_command) */
+   pipe_buffer_reference((struct pipe_buffer **)&sbuf, NULL);
+}
+
+
+/**
+ * Queue a DMA upload of a range of this buffer to the host.
+ *
+ * This function only notes the range down. It doesn't actually emit a DMA
+ * upload command. That only happens when a context tries to refer to this
+ * buffer, and the DMA upload command is added to that context's command buffer.
+ * Touching ranges are merged; once all SVGA_BUFFER_MAX_RANGES slots are taken
+ * the last range is widened instead, which may also upload the bytes between.
+ */
+static void
+svga_buffer_upload_queue(struct svga_buffer *sbuf,
+                         unsigned start,
+                         unsigned end)
+{
+   unsigned i;
+
+   assert(sbuf->hw.buf);
+   assert(end > start);
+
+   /*
+    * Try to grow one of the ranges.
+    *
+    * Note that it is not this function's task to care about overlapping
+    * ranges, as the GMR was already given so it is too late to do anything.
+    * Situations where overlapping ranges may pose a problem should be
+    * detected via pipe_context::is_buffer_referenced and the context that
+    * refers to the buffer should be flushed.
+    */
+   for(i = 0; i < sbuf->hw.num_ranges; ++i) {
+      if(start <= sbuf->hw.ranges[i].end && sbuf->hw.ranges[i].start <= end) {
+         sbuf->hw.ranges[i].start = MIN2(sbuf->hw.ranges[i].start, start);
+         sbuf->hw.ranges[i].end   = MAX2(sbuf->hw.ranges[i].end,    end);
+         return;
+      }
+   }
+
+   /* A pending DMA command can't take new ranges: patch it up, start clean. */
+   if(sbuf->needs_flush)
+      svga_buffer_upload_flush(sbuf->hw.svga, sbuf);
+
+   assert(!sbuf->needs_flush);
+   assert(!sbuf->hw.svga);
+   assert(!sbuf->hw.boxes);
+
+   if(sbuf->hw.num_ranges >= SVGA_BUFFER_MAX_RANGES) {
+      /* Table full: widen the last range, never write past hw.ranges[]. */
+      i = SVGA_BUFFER_MAX_RANGES - 1;
+      sbuf->hw.ranges[i].start = MIN2(sbuf->hw.ranges[i].start, start);
+      sbuf->hw.ranges[i].end   = MAX2(sbuf->hw.ranges[i].end,    end);
+      return;
+   }
+
+   /* Add a new range. */
+   sbuf->hw.ranges[sbuf->hw.num_ranges].start = start;
+   sbuf->hw.ranges[sbuf->hw.num_ranges].end = end;
+   ++sbuf->hw.num_ranges;
+}
+
+
+static void *
+svga_buffer_map_range( struct pipe_screen *screen,
+                       struct pipe_buffer *buf,
+                       unsigned offset, unsigned length,
+                       unsigned usage )
+{
+   struct svga_screen *ss = svga_screen(screen); 
+   struct svga_winsys_screen *sws = ss->sws;
+   struct svga_buffer *sbuf = svga_buffer( buf );
+   void *map;
+   /* Maps the buffer for CPU access; offset and length are not used by this body. */
+   if(sbuf->swbuf) {
+      /* User/malloc buffer */
+      map = sbuf->swbuf;
+   }
+   else {
+      if(!sbuf->hw.buf) {
+         struct svga_winsys_surface *handle = sbuf->handle;
+
+         if(svga_buffer_create_hw_storage(ss, sbuf) != PIPE_OK)
+            return NULL;
+         
+         /* Populate the hardware storage if the host surface pre-existed */
+         if((usage & PIPE_BUFFER_USAGE_CPU_READ) && handle) {
+            SVGA3dSurfaceDMAFlags flags;
+            enum pipe_error ret;
+            struct pipe_fence_handle *fence = NULL;
+            
+            SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "dma from sid %p (buffer), bytes %u - %u\n", 
+                     sbuf->handle, 0, sbuf->base.size);
+
+            memset(&flags, 0, sizeof flags);
+            /* NOTE(review): ss->swc is used below without taking swc_mutex -- confirm */
+            ret = SVGA3D_BufferDMA(ss->swc,
+                                   sbuf->hw.buf,
+                                   sbuf->handle,
+                                   SVGA3D_READ_HOST_VRAM,
+                                   sbuf->base.size,
+                                   0,
+                                   flags);
+            if(ret != PIPE_OK) {
+               ss->swc->flush(ss->swc, NULL);
+               /* Retry once after flushing the screen context */
+               ret = SVGA3D_BufferDMA(ss->swc,
+                                      sbuf->hw.buf,
+                                      sbuf->handle,
+                                      SVGA3D_READ_HOST_VRAM,
+                                      sbuf->base.size,
+                                      0,
+                                      flags);
+               assert(ret == PIPE_OK);
+            }
+            /* Submit the download and call fence_finish before the storage is mapped */
+            ss->swc->flush(ss->swc, &fence);
+            sws->fence_finish(sws, fence, 0);
+            sws->fence_reference(sws, &fence, NULL);
+         }
+      }
+      else {
+         if((usage & PIPE_BUFFER_USAGE_CPU_READ) && !sbuf->needs_flush) {
+            /* We already had the hardware storage but we would have to issue
+             * a download if we hadn't, so move the buffer to the beginning
+             * of the LRU list.
+             */
+            assert(sbuf->head.prev && sbuf->head.next);
+            LIST_DEL(&sbuf->head);
+            LIST_ADD(&sbuf->head, &ss->cached_buffers);
+         }
+      }
+         
+      map = sws->buffer_map(sws, sbuf->hw.buf, usage);
+   }
+   /* Map book-keeping is only updated on success, under swc_mutex. */
+   if(map) {
+      pipe_mutex_lock(ss->swc_mutex);
+
+      ++sbuf->map.count;
+
+      if (usage & PIPE_BUFFER_USAGE_CPU_WRITE) {
+         assert(sbuf->map.count <= 1);
+         sbuf->map.writing = TRUE;
+         if (usage & PIPE_BUFFER_USAGE_FLUSH_EXPLICIT)
+            sbuf->map.flush_explicit = TRUE;
+      }
+      
+      pipe_mutex_unlock(ss->swc_mutex);
+   }
+   
+   return map;
+}
+
+static void 
+svga_buffer_flush_mapped_range( struct pipe_screen *screen,
+                                struct pipe_buffer *buf,
+                                unsigned offset, unsigned length)
+{
+   struct svga_buffer *sbuf = svga_buffer( buf );
+   struct svga_screen *ss = svga_screen(screen);
+   /* Queue [offset, offset + length) for upload; expects a FLUSH_EXPLICIT write map. */
+   pipe_mutex_lock(ss->swc_mutex);
+   assert(sbuf->map.writing);
+   if(sbuf->map.writing) {
+      assert(sbuf->map.flush_explicit);
+      if(sbuf->hw.buf)
+         svga_buffer_upload_queue(sbuf, offset, offset + length);
+   }
+   pipe_mutex_unlock(ss->swc_mutex);
+}
+
+static void 
+svga_buffer_unmap( struct pipe_screen *screen,
+                   struct pipe_buffer *buf)
+{
+   struct svga_screen *ss = svga_screen(screen); 
+   struct svga_winsys_screen *sws = ss->sws;
+   struct svga_buffer *sbuf = svga_buffer( buf );
+   
+   pipe_mutex_lock(ss->swc_mutex);
+   /* Drop one map count; tolerate an unbalanced unmap in non-assert builds. */
+   assert(sbuf->map.count);
+   if(sbuf->map.count)
+      --sbuf->map.count;
+
+   if(sbuf->hw.buf)
+      sws->buffer_unmap(sws, sbuf->hw.buf);
+   /* Write maps made without FLUSH_EXPLICIT dirty the whole buffer on unmap. */
+   if(sbuf->map.writing) {
+      if(!sbuf->map.flush_explicit) {
+         /* No mapped range was flushed -- flush the whole buffer */
+         SVGA_DBG(DEBUG_DMA, "flushing the whole buffer\n");
+   
+         if(sbuf->hw.buf)
+            svga_buffer_upload_queue(sbuf, 0, sbuf->base.size);
+      }
+      
+      sbuf->map.writing = FALSE;
+      sbuf->map.flush_explicit = FALSE;
+   }
+
+   pipe_mutex_unlock(ss->swc_mutex);
+}
+
+static void
+svga_buffer_destroy( struct pipe_buffer *buf )
+{
+   struct svga_screen *ss = svga_screen(buf->screen); 
+   struct svga_buffer *sbuf = svga_buffer( buf );
+
+   assert(!p_atomic_read(&buf->reference.count));
+   /* A pending DMA command holds a reference, so it must have been flushed by now. */
+   assert(!sbuf->needs_flush);
+
+   if(sbuf->handle) {
+      SVGA_DBG(DEBUG_DMA, "release sid %p sz %d\n", sbuf->handle, sbuf->base.size);
+      svga_screen_surface_destroy(ss, &sbuf->key, &sbuf->handle);
+   }
+   
+   if(sbuf->hw.buf)
+      svga_buffer_destroy_hw_storage(ss, sbuf);
+   /* Memory supplied through svga_user_buffer_create is not ours to free. */
+   if(sbuf->swbuf && !sbuf->user)
+      align_free(sbuf->swbuf);
+   
+   FREE(sbuf);
+}
+
+static struct pipe_buffer *
+svga_buffer_create(struct pipe_screen *screen,
+                   unsigned alignment,
+                   unsigned usage,
+                   unsigned size)
+{
+   struct svga_screen *ss = svga_screen(screen);
+   struct svga_buffer *sbuf;
+   
+   sbuf = CALLOC_STRUCT(svga_buffer);
+   if(!sbuf)
+      goto error1;
+      
+   sbuf->magic = SVGA_BUFFER_MAGIC;
+   
+   pipe_reference_init(&sbuf->base.reference, 1);
+   sbuf->base.screen = screen;
+   sbuf->base.alignment = alignment;
+   sbuf->base.usage = usage;
+   sbuf->base.size = size;
+   /* Vertex/index buffers get a host surface now; all others use malloc'ed memory. */
+   if(svga_buffer_needs_hw_storage(usage)) {
+      if(svga_buffer_create_host_surface(ss, sbuf) != PIPE_OK)
+         goto error2;
+   }
+   else {
+      if(alignment < sizeof(void*))
+         alignment = sizeof(void*);
+      /* NOTE(review): usage was already copied into sbuf->base above; this store is not read afterwards */
+      usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;
+      
+      sbuf->swbuf = align_malloc(size, alignment);
+      if(!sbuf->swbuf)
+         goto error2;
+   }
+      
+   return &sbuf->base; 
+
+error2:
+   FREE(sbuf);
+error1:
+   return NULL;
+}
+
+static struct pipe_buffer *
+svga_user_buffer_create(struct pipe_screen *screen,
+                        void *ptr,
+                        unsigned bytes)
+{
+   /*
+    * Wrap caller-owned memory in a pipe buffer without copying it.
+    */
+   struct svga_buffer *wrapper = CALLOC_STRUCT(svga_buffer);
+
+   if(!wrapper)
+      return NULL;
+
+   /* Generic pipe_buffer part: one reference, alignment 1, no usage flags. */
+   pipe_reference_init(&wrapper->base.reference, 1);
+   wrapper->base.screen = screen;
+   wrapper->base.alignment = 1;
+   wrapper->base.usage = 0;
+   wrapper->base.size = bytes;
+
+   /* SVGA part: the storage is the user's pointer, flagged as such. */
+   wrapper->magic = SVGA_BUFFER_MAGIC;
+   wrapper->swbuf = ptr;
+   wrapper->user = TRUE;
+
+   return &wrapper->base;
+}
+
+   
+void
+svga_screen_init_buffer_functions(struct pipe_screen *screen)
+{
+   screen->buffer_destroy = svga_buffer_destroy;              /* lifetime */
+   screen->buffer_create = svga_buffer_create;
+   screen->user_buffer_create = svga_user_buffer_create;
+   screen->buffer_unmap = svga_buffer_unmap;                  /* CPU access */
+   screen->buffer_map_range = svga_buffer_map_range;
+   screen->buffer_flush_mapped_range = svga_buffer_flush_mapped_range;
+}
+
+
+/**
+ * Copy a user buffer / malloc buffer into new hw storage; no-op if hw.buf exists.
+ */
+static INLINE enum pipe_error
+svga_buffer_update_hw(struct svga_screen *ss, struct svga_buffer *sbuf)
+{
+   if(!sbuf->hw.buf) {
+      enum pipe_error ret;
+      void *map;
+      
+      assert(sbuf->swbuf);
+      if(!sbuf->swbuf)
+         return PIPE_ERROR;
+      
+      ret = svga_buffer_create_hw_storage(ss, sbuf);
+      assert(ret == PIPE_OK);
+      if(ret != PIPE_OK)
+         return ret;
+      /* swc_mutex is taken only in this branch; every exit below releases it. */
+      pipe_mutex_lock(ss->swc_mutex);
+      map = ss->sws->buffer_map(ss->sws, sbuf->hw.buf, PIPE_BUFFER_USAGE_CPU_WRITE);
+      assert(map);
+      if(!map) {
+         pipe_mutex_unlock(ss->swc_mutex);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
+      memcpy(map, sbuf->swbuf, sbuf->base.size);
+      ss->sws->buffer_unmap(ss->sws, sbuf->hw.buf);
+
+      /* This user/malloc buffer is now indistinguishable from a gpu buffer */
+      assert(!sbuf->map.count);
+      if(!sbuf->map.count) {
+         if(sbuf->user)
+            sbuf->user = FALSE;
+         else
+            align_free(sbuf->swbuf);
+         sbuf->swbuf = NULL;
+      }
+      
+      svga_buffer_upload_queue(sbuf, 0, sbuf->base.size);
+      pipe_mutex_unlock(ss->swc_mutex);
+   }
+   
+   return PIPE_OK;
+}
+
+
+struct svga_winsys_surface *
+svga_buffer_handle(struct svga_context *svga,
+                   struct pipe_buffer *buf)
+{
+   struct pipe_screen *screen = svga->pipe.screen;
+   struct svga_screen *ss = svga_screen(screen);
+   struct svga_buffer *sbuf;
+   enum pipe_error ret;
+
+   if(!buf)
+      return NULL;
+
+   sbuf = svga_buffer(buf);
+   /* The buffer is expected to be unmapped by the time a context refers to it. */
+   assert(!sbuf->map.count);
+   /* First use: create the host surface and move the sw contents into hw storage. */
+   if(!sbuf->handle) {
+      ret = svga_buffer_create_host_surface(ss, sbuf);
+      if(ret != PIPE_OK)
+	 return NULL;
+
+      ret = svga_buffer_update_hw(ss, sbuf);
+      if(ret != PIPE_OK)
+	 return NULL;
+   }
+   /* Emit one DMA command for the queued ranges unless one is already pending. */
+   if(!sbuf->needs_flush && sbuf->hw.num_ranges) {
+      /* Queue the buffer for flushing */
+      ret = svga_buffer_upload_command(svga, sbuf);
+      if(ret != PIPE_OK)
+         /* XXX: Should probably have a richer return value */
+         return NULL;
+
+      assert(sbuf->hw.svga == svga);
+      /* While the command is pending the buffer sits on the context's dirty list. */
+      sbuf->needs_flush = TRUE;
+      assert(sbuf->head.prev && sbuf->head.next);
+      LIST_DEL(&sbuf->head);
+      LIST_ADDTAIL(&sbuf->head, &svga->dirty_buffers);
+   }
+
+   return sbuf->handle;
+}
+
+struct pipe_buffer *
+svga_screen_buffer_wrap_surface(struct pipe_screen *screen,
+				enum SVGA3dSurfaceFormat format,
+				struct svga_winsys_surface *srf)
+{
+   struct pipe_buffer *buf;
+   struct svga_buffer *sbuf;
+   struct svga_winsys_screen *sws = svga_winsys_screen(screen);
+   /* Create an empty (alignment 0, size 0) buffer to attach the existing surface to. */
+   buf = svga_buffer_create(screen, 0, SVGA_BUFFER_USAGE_WRAPPED, 0);
+   if (!buf)
+      return NULL;
+
+   sbuf = svga_buffer(buf);
+
+   /*
+    * We are not the creator of this surface and therefore we must not
+    * cache it for reuse. Set the cacheable flag to zero in the key to
+    * prevent this.
+    */
+   sbuf->key.format = format;
+   sbuf->key.cachable = 0;
+   sws->surface_reference(sws, &sbuf->handle, srf);
+
+   return buf;
+}
+
+
+struct svga_winsys_surface *
+svga_screen_buffer_get_winsys_surface(struct pipe_buffer *buffer)
+{
+   struct svga_winsys_screen *sws = svga_winsys_screen(buffer->screen);
+   struct svga_winsys_surface *vsurf = NULL;
+   /* Expects a non-cachable buffer; the flag is forced to 0 again for non-assert builds. */
+   assert(svga_buffer(buffer)->key.cachable == 0);
+   svga_buffer(buffer)->key.cachable = 0;
+   sws->surface_reference(sws, &vsurf, svga_buffer(buffer)->handle); /* presumably takes a new reference -- confirm */
+   return vsurf;
+}
+
+void
+svga_context_flush_buffers(struct svga_context *svga)
+{
+   struct list_head *const sentinel = &svga->dirty_buffers;
+   struct list_head *node, *following;
+
+   /*
+    * svga_buffer_upload_flush() unlinks the buffer from this list, so the
+    * successor is always fetched before the current node is processed.
+    */
+   for(node = sentinel->next, following = node->next;
+       node != sentinel;
+       node = following, following = node->next) {
+      struct svga_buffer *sbuf = LIST_ENTRY(struct svga_buffer, node, head);
+
+      assert(p_atomic_read(&sbuf->base.reference.count) != 0);
+      assert(sbuf->needs_flush);
+      svga_buffer_upload_flush(svga, sbuf);
+   }
+}
diff --git a/src/gallium/drivers/svga/svga_screen_buffer.h b/src/gallium/drivers/svga/svga_screen_buffer.h
new file mode 100644
index 00000000000..5d7af5a7c50
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen_buffer.h
@@ -0,0 +1,190 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_BUFFER_H
+#define SVGA_BUFFER_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "util/u_double_list.h"
+
+#include "svga_screen_cache.h"
+
+
+#define SVGA_BUFFER_MAGIC 0x344f9005
+
+/**
+ * Maximum number of discontiguous ranges
+ */
+#define SVGA_BUFFER_MAX_RANGES 32
+
+
+struct svga_screen;
+struct svga_context;
+struct svga_winsys_buffer;
+struct svga_winsys_surface;
+
+
+struct svga_buffer_range
+{
+   unsigned start; /* byte offset of the first byte in the range */
+   unsigned end;   /* byte offset one past the last byte (offset + length) */
+};
+
+
+/**
+ * Describes the guest memory of a buffer and its queued uploads to the host.
+ *
+ * This holds the information to emit a SVGA3dCmdSurfaceDMA.
+ */
+struct svga_buffer_upload
+{
+   /**
+    * Guest memory region.
+    */
+   struct svga_winsys_buffer *buf;
+
+   struct svga_buffer_range ranges[SVGA_BUFFER_MAX_RANGES]; /* queued byte ranges */
+   unsigned num_ranges; /* number of valid entries in ranges[] */
+
+   SVGA3dSurfaceDMAFlags flags;
+
+   /**
+    * Pointer to the DMA copy box *inside* the command buffer.
+    */
+   SVGA3dCopyBox *boxes;
+
+   /**
+    * Context that has the pending DMA to this buffer.
+    */
+   struct svga_context *svga;
+};
+
+
+/**
+ * SVGA pipe buffer.
+ */
+struct svga_buffer 
+{
+   struct pipe_buffer base; /* base class; svga_buffer() casts a pipe_buffer pointer to this struct */
+
+   /** 
+    * Marker to detect bad casts in runtime.
+    */ 
+   uint32_t magic;
+
+   /**
+    * Regular (non DMA'able) memory.
+    * 
+    * Used for user buffers or for buffers which we know before hand that can
+    * never be used by the virtual hardware directly, such as constant buffers.
+    */
+   void *swbuf;
+   
+   /** 
+    * Whether swbuf was created by the user or not.
+    */
+   boolean user;
+   
+   /**
+    * DMA'ble memory.
+    * 
+    * A piece of GMR memory. It is created when mapping the buffer, and will be
+    * used to upload/download vertex data from the host.
+    */
+   struct svga_buffer_upload hw;
+
+   /**
+    * Creation key for the host surface handle.
+    * 
+    * This structure describes all the host surface characteristics so that it 
+    * can be looked up in cache, since creating a host surface is often a slow
+    * operation.
+    */
+   struct svga_host_surface_cache_key key;
+   
+   /**
+    * Host surface handle.
+    * 
+    * This is a platform independent abstraction for host SID. It is created at
+    * buffer creation for vertex/index usage, otherwise when trying to bind.
+    */
+   struct svga_winsys_surface *handle;
+   
+   struct {
+      unsigned count;         /* number of outstanding maps */
+      boolean writing;        /* mapped with PIPE_BUFFER_USAGE_CPU_WRITE */
+      boolean flush_explicit; /* mapped with PIPE_BUFFER_USAGE_FLUSH_EXPLICIT */
+   } map;
+   
+   boolean needs_flush;   /* a DMA upload command is pending in a context */
+   struct list_head head; /* link in the screen's cached_buffers or a context's dirty_buffers */
+};
+
+
+static INLINE struct svga_buffer *
+svga_buffer(struct pipe_buffer *buffer)
+{
+   struct svga_buffer *sbuf = (struct svga_buffer *)buffer;
+
+   /* NULL passes straight through; anything else must carry the magic marker. */
+   assert(!sbuf || sbuf->magic == SVGA_BUFFER_MAGIC);
+   return sbuf;
+}
+
+
+/**
+ * Returns TRUE for user buffers (buffer must not be NULL).  We may
+ * decide to use an alternate upload path for these buffers.
+ */
+static INLINE boolean 
+svga_buffer_is_user_buffer( struct pipe_buffer *buffer )
+{
+   return svga_buffer(buffer)->user;
+}
+
+
+void
+svga_screen_init_buffer_functions(struct pipe_screen *screen);
+
+struct svga_winsys_surface *
+svga_buffer_handle(struct svga_context *svga,
+                   struct pipe_buffer *buf);
+
+void
+svga_context_flush_buffers(struct svga_context *svga);
+
+boolean
+svga_buffer_free_cached_hw_storage(struct svga_screen *ss);
+
+struct svga_winsys_buffer *
+svga_winsys_buffer_create(struct svga_screen *ss,
+                          unsigned alignment, 
+                          unsigned usage,
+                          unsigned size);
+
+#endif /* SVGA_BUFFER_H */
diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c
new file mode 100644
index 00000000000..8a06383f61e
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen_cache.c
@@ -0,0 +1,338 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_memory.h"
+#include "util/u_hash.h"
+
+#include "svga_debug.h"
+#include "svga_winsys.h"
+#include "svga_screen.h"
+#include "svga_screen_cache.h"
+
+
+#define SVGA_SURFACE_CACHE_ENABLED 1
+
+
+/** Bucket index for a key: util_hash_crc32() over the raw key bytes, taken
+ *  modulo the number of buckets. */
+static INLINE unsigned
+svga_screen_cache_bucket(const struct svga_host_surface_cache_key *key)
+{
+   const unsigned crc = util_hash_crc32(key, sizeof(*key));
+   return crc % SVGA_HOST_SURFACE_CACHE_BUCKETS;
+}
+
+/* Look for a reusable cached surface whose key equals 'key'; returns NULL on a miss. */
+static INLINE struct svga_winsys_surface *
+svga_screen_cache_lookup(struct svga_screen *svgascreen,
+                         const struct svga_host_surface_cache_key *key)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_host_surface_cache_entry *entry;
+   struct svga_winsys_surface *handle = NULL;
+   struct list_head *curr, *next;
+   unsigned bucket;
+   unsigned tries = 0;
+
+   assert(key->cachable);
+
+   bucket = svga_screen_cache_bucket(key);
+
+   pipe_mutex_lock(cache->mutex);
+   /* 'next' is read ahead of each iteration so that unlinking 'curr' is safe. */
+   curr = cache->bucket[bucket].next;
+   next = curr->next;
+   while(curr != &cache->bucket[bucket]) {
+      ++tries;
+      
+      entry = LIST_ENTRY(struct svga_host_surface_cache_entry, curr, bucket_head);
+
+      assert(entry->handle);
+      /* Hit = same key bytes + fence_signalled() == 0 (presumably "signalled"; confirm). */
+      if(memcmp(&entry->key, key, sizeof *key) == 0 &&
+         sws->fence_signalled( sws, entry->fence, 0 ) == 0) {
+         assert(sws->surface_is_flushed(sws, entry->handle));
+         
+         handle = entry->handle; /* Reference is transferred here. */
+         entry->handle = NULL;
+         /* Recycle the entry: off its bucket and state list, onto the empty list. */
+         LIST_DEL(&entry->bucket_head);
+
+         LIST_DEL(&entry->head);
+         
+         LIST_ADD(&entry->head, &cache->empty);
+
+         break;
+      }
+
+      curr = next; 
+      next = curr->next;
+   }
+
+   pipe_mutex_unlock(cache->mutex);
+   
+   if (SVGA_DEBUG & DEBUG_DMA)
+      debug_printf("%s: cache %s after %u tries (bucket %d)\n", __FUNCTION__, 
+                   handle ? "hit" : "miss", tries, bucket);
+   
+   return handle;
+}
+
+
+/*
+ * Hand the reference in *p_handle over to the cache (*p_handle is set to NULL).
+ * The entry is put on the validated list; svga_screen_cache_flush() moves it on.
+ */
+static INLINE void
+svga_screen_cache_add(struct svga_screen *svgascreen,
+                      const struct svga_host_surface_cache_key *key, 
+                      struct svga_winsys_surface **p_handle)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_host_surface_cache_entry *entry = NULL;
+   struct svga_winsys_surface *handle = *p_handle;
+   
+   assert(key->cachable);
+
+   assert(handle);
+   if(!handle)
+      return;
+   /* From here on the caller no longer holds the reference. */
+   *p_handle = NULL;
+   pipe_mutex_lock(cache->mutex);
+   
+   if(!LIST_IS_EMPTY(&cache->empty)) {
+      /* use the first empty entry */
+      entry = LIST_ENTRY(struct svga_host_surface_cache_entry, cache->empty.next, head);
+        
+      LIST_DEL(&entry->head);
+   }
+   else if(!LIST_IS_EMPTY(&cache->unused)) {
+      /* evict the entry at the tail of the unused list and reuse it */
+      entry = LIST_ENTRY(struct svga_host_surface_cache_entry, cache->unused.prev, head);
+      SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+               "unref sid %p (make space)\n", entry->handle);
+      sws->surface_reference(sws, &entry->handle, NULL);
+
+      LIST_DEL(&entry->bucket_head);
+
+      LIST_DEL(&entry->head);
+   }
+
+   if(entry) {
+      entry->handle = handle;
+      memcpy(&entry->key, key, sizeof entry->key);
+   
+      SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+               "cache sid %p\n", entry->handle);
+      LIST_ADD(&entry->head, &cache->validated);
+   }
+   else {
+      /* Couldn't cache the buffer -- this really shouldn't happen */
+      SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+               "unref sid %p (couldn't find space)\n", handle);
+      sws->surface_reference(sws, &handle, NULL);
+   }
+   
+   pipe_mutex_unlock(cache->mutex);
+}
+
+
+/**
+ * Screen-flush hook: every entry on the validated list whose surface the
+ * winsys reports as flushed is tagged with 'fence', moved to the unused
+ * list and linked into its hash bucket.
+ */
+void
+svga_screen_cache_flush(struct svga_screen *svgascreen,
+                        struct pipe_fence_handle *fence)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct list_head *curr, *next;
+
+   pipe_mutex_lock(cache->mutex);
+
+   /* 'next' is fetched before each iteration's body, because the body may
+    * unlink 'curr' from the validated list. */
+   for(curr = cache->validated.next, next = curr->next;
+       curr != &cache->validated;
+       curr = next, next = curr->next) {
+      struct svga_host_surface_cache_entry *entry =
+         LIST_ENTRY(struct svga_host_surface_cache_entry, curr, head);
+      unsigned bucket;
+
+      assert(entry->handle);
+
+      if(!sws->surface_is_flushed(sws, entry->handle))
+         continue;
+
+      /* Leave the validated list and remember the fence. */
+      LIST_DEL(&entry->head);
+      sws->fence_reference(sws, &entry->fence, fence);
+
+      LIST_ADD(&entry->head, &cache->unused);
+      bucket = svga_screen_cache_bucket(&entry->key);
+      LIST_ADD(&entry->bucket_head, &cache->bucket[bucket]);
+   }
+
+   pipe_mutex_unlock(cache->mutex);
+}
+
+
+void
+svga_screen_cache_cleanup(struct svga_screen *svgascreen)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   unsigned idx;
+
+   /* Release the surface and fence references still held, then the mutex. */
+   for(idx = 0; idx < SVGA_HOST_SURFACE_CACHE_SIZE; ++idx) {
+      struct svga_host_surface_cache_entry *entry = &cache->entries[idx];
+      if(entry->handle) {
+         SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+                  "unref sid %p (shutdown)\n", entry->handle);
+         sws->surface_reference(sws, &entry->handle, NULL);
+      }
+      if(entry->fence)
+         sws->fence_reference(sws, &entry->fence, NULL);
+   }
+   pipe_mutex_destroy(cache->mutex);
+}
+
+
+enum pipe_error
+svga_screen_cache_init(struct svga_screen *svgascreen)
+{
+   struct svga_host_surface_cache *cache = &svgascreen->cache;
+   unsigned n;
+
+   pipe_mutex_init(cache->mutex);
+
+   /* The three state lists and every hash bucket start out empty... */
+   LIST_INITHEAD(&cache->empty);
+   LIST_INITHEAD(&cache->validated);
+   LIST_INITHEAD(&cache->unused);
+   for(n = 0; n < SVGA_HOST_SURFACE_CACHE_BUCKETS; ++n)
+      LIST_INITHEAD(&cache->bucket[n]);
+
+   /* ...then all entries are queued, in array order, on the empty list. */
+   for(n = 0; n < SVGA_HOST_SURFACE_CACHE_SIZE; ++n)
+      LIST_ADDTAIL(&cache->entries[n].head, &cache->empty);
+
+   return PIPE_OK;
+}
+
+                           
+/**
+ * Return a host surface for 'key': a cached match when the key is cachable,
+ * otherwise whatever sws->surface_create() returns (may be NULL).  For cachable
+ * SVGA3D_BUFFER keys, key->size.width is rounded up in place.
+ */
+struct svga_winsys_surface *
+svga_screen_surface_create(struct svga_screen *svgascreen,
+                           struct svga_host_surface_cache_key *key)
+{
+   struct svga_winsys_screen *sws = svgascreen->sws;
+   struct svga_winsys_surface *handle = NULL;
+   boolean cachable = SVGA_SURFACE_CACHE_ENABLED && key->cachable;
+
+   SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+            "%s sz %dx%dx%d mips %d faces %d cachable %d\n", 
+            __FUNCTION__,
+            key->size.width, key->size.height, key->size.depth,
+            key->numMipLevels, key->numFaces, key->cachable);
+
+   if (cachable) {
+      if (key->format == SVGA3D_BUFFER) {
+         /* For buffers, round the buffer size up to the nearest power
+          * of two to increase the probability of cache hits.  Keep
+          * texture surface dimensions unchanged.
+          *
+          * Stop doubling at 2^31: one more shift would wrap 'size' to 0
+          * and the loop would never terminate.  A width above 2^31 has no
+          * 32-bit power of two to round up to and is left as it is.
+          */
+         uint32_t size = 1;
+         while(size < key->size.width && size < 0x80000000u)
+            size <<= 1;
+         if(size >= key->size.width)
+            key->size.width = size;
+      }
+
+      handle = svga_screen_cache_lookup(svgascreen, key);
+      if (handle) {
+         if (key->format == SVGA3D_BUFFER)
+            SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+                     "reuse sid %p sz %d (buffer)\n", handle, 
+                     key->size.width);
+         else
+            SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+                     "reuse sid %p sz %dx%dx%d mips %d faces %d\n", handle, 
+                     key->size.width, key->size.height, key->size.depth,
+                     key->numMipLevels, key->numFaces);
+      }
+   }
+
+   if (!handle) {
+      handle = sws->surface_create(sws,
+                                   key->flags,
+                                   key->format,
+                                   key->size, 
+                                   key->numFaces, 
+                                   key->numMipLevels);
+      if (handle)
+         SVGA_DBG(DEBUG_CACHE|DEBUG_DMA,
+                  "  CREATE sid %p sz %dx%dx%d\n", handle,
+                  key->size.width, key->size.height, key->size.depth);
+   }
+
+   return handle;
+}
+
+
+void
+svga_screen_surface_destroy(struct svga_screen *svgascreen,
+                            const struct svga_host_surface_cache_key *key,
+                            struct svga_winsys_surface **p_handle)
+{
+   struct svga_winsys_screen *sws = svgascreen->sws;
+
+   /* Cachable surfaces are exclusively ours, so the reference is handed to
+    * the cache rather than dropped.
+    */
+   if(SVGA_SURFACE_CACHE_ENABLED && key->cachable) {
+      svga_screen_cache_add(svgascreen, key, p_handle);
+      return;
+   }
+
+   /* Anything else is simply unreferenced. */
+   SVGA_DBG(DEBUG_DMA,
+            "unref sid %p (uncachable)\n", *p_handle);
+   sws->surface_reference(sws, p_handle, NULL);
+}
diff --git a/src/gallium/drivers/svga/svga_screen_cache.h b/src/gallium/drivers/svga/svga_screen_cache.h
new file mode 100644
index 00000000000..f5aa740d408
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen_cache.h
@@ -0,0 +1,144 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SCREEN_CACHE_H_
+#define SVGA_SCREEN_CACHE_H_
+
+
+#include "svga_types.h"
+#include "svga_reg.h"
+#include "svga3d_reg.h"
+
+#include "pipe/p_thread.h"
+
+#include "util/u_double_list.h"
+
+
+/* Guess the storage size of cached surfaces and try to keep it under
+ * this amount (parenthesized so it expands safely inside any expression):
+ */
+#define SVGA_HOST_SURFACE_CACHE_BYTES (16*1024*1024)
+
+/* Maximum number of discrete surfaces in the cache:
+ */
+#define SVGA_HOST_SURFACE_CACHE_SIZE 1024
+
+/* Number of hash buckets:
+ */
+#define SVGA_HOST_SURFACE_CACHE_BUCKETS 256
+
+
+struct svga_winsys_surface;
+struct svga_screen;
+
+/** Parameters of svga_winsys_screen::surface_create plus a cachable bit.
+ *  The surface cache hashes and memcmp()s whole keys as raw bytes.
+ */
+struct svga_host_surface_cache_key
+{
+   SVGA3dSurfaceFlags flags;
+   SVGA3dSurfaceFormat format;
+   SVGA3dSize size;
+   uint32_t numFaces:24;
+   uint32_t numMipLevels:7;
+   uint32_t cachable:1;         /* False if this is a shared surface */
+};
+
+
+struct svga_host_surface_cache_entry 
+{
+   /** 
+    * Link in one of svga_host_surface_cache::validated, ::unused (the LRU
+    * list) or ::empty
+    */
+   struct list_head head;
+   
+   /** Head for the bucket lists. */
+   struct list_head bucket_head;
+   /** Copy of the key passed in when the surface was added to the cache. */
+   struct svga_host_surface_cache_key key;
+   struct svga_winsys_surface *handle;   /* reset to NULL on a cache hit */
+   /** Fence recorded by svga_screen_cache_flush(). */
+   struct pipe_fence_handle *fence;
+};
+
+
+/**
+ * Cache of the host surfaces.
+ * 
+ * A cache entry can be in the following stages:
+ * 1. empty
+ * 2. holding a buffer in a validate list
+ * 3. holding a flushed buffer (not in any validate list) with an active fence
+ * 4. holding a flushed buffer with an expired fence
+ * 
+ * An entry progresses from 1 -> 2 -> 3 -> 4. When we need an entry to put a 
+ * buffer into we preferentially take from 1, or from the least recently used 
+ * buffer from 3/4.
+ */
+struct svga_host_surface_cache 
+{
+   pipe_mutex mutex;   /* taken by lookup/add/flush around list changes */
+   
+   /* Unused buffers are put in buckets to speed up lookups */
+   struct list_head bucket[SVGA_HOST_SURFACE_CACHE_BUCKETS];
+   
+   /* Entries with unused buffers, ordered from most to least recently used 
+    * (3 and 4) */
+   struct list_head unused;
+   
+   /* Entries with buffers still in validate lists (2) */
+   struct list_head validated;
+   
+   /** Empty entries (1) */
+   struct list_head empty;
+
+   /** The actual storage for the entries */
+   struct svga_host_surface_cache_entry entries[SVGA_HOST_SURFACE_CACHE_SIZE];
+};
+
+
+void
+svga_screen_cache_cleanup(struct svga_screen *svgascreen);
+
+void
+svga_screen_cache_flush(struct svga_screen *svgascreen,
+                        struct pipe_fence_handle *fence);
+
+enum pipe_error
+svga_screen_cache_init(struct svga_screen *svgascreen);
+
+
+struct svga_winsys_surface *
+svga_screen_surface_create(struct svga_screen *svgascreen,
+                           struct svga_host_surface_cache_key *key);
+
+void
+svga_screen_surface_destroy(struct svga_screen *svgascreen,
+                            const struct svga_host_surface_cache_key *key,
+                            struct svga_winsys_surface **handle);
+
+
+#endif /* SVGA_SCREEN_CACHE_H_ */
diff --git a/src/gallium/drivers/svga/svga_screen_texture.c b/src/gallium/drivers/svga/svga_screen_texture.c
new file mode 100644
index 00000000000..e7301aba841
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen_texture.c
@@ -0,0 +1,1088 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "svga_cmd.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_thread.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "svga_screen.h"
+#include "svga_context.h"
+#include "svga_screen_texture.h"
+#include "svga_screen_buffer.h"
+#include "svga_winsys.h"
+#include "svga_debug.h"
+#include "svga_screen_buffer.h"
+
+#include <util/u_string.h>
+
+
+/* XXX: This isn't a real hardware flag, but just a hack for kernel to
+ * know about primary surfaces. Find a better way to accomplish this.
+ */
+#define SVGA3D_SURFACE_HINT_SCANOUT (1 << 9)
+
+
+/*
+ * Helper function and arrays
+ */
+
+SVGA3dSurfaceFormat
+svga_translate_format(enum pipe_format format)
+{
+   /* Gallium pipe format -> SVGA3D surface format; INVALID when unmapped. */
+   switch(format) {
+   /* 32-bit colour.  The sRGB format is required for GL2.1 and is
+    * given the same surface format as its UNORM counterpart.
+    */
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_SRGB:
+      return SVGA3D_A8R8G8B8;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      return SVGA3D_X8R8G8B8;
+
+   /* 16-bit colour. */
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      return SVGA3D_R5G6B5;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      return SVGA3D_A1R5G5B5;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      return SVGA3D_A4R4G4B4;
+
+   /* Depth/stencil.  XXX: PIPE_FORMAT_Z32_UNORM -> SVGA3D_Z_D32 is left
+    * out because it doesn't seem to work properly.
+    */
+   case PIPE_FORMAT_Z16_UNORM:
+      return SVGA3D_Z_D16;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      return SVGA3D_Z_D24S8;
+   case PIPE_FORMAT_Z24X8_UNORM:
+      return SVGA3D_Z_D24X8;
+
+   /* Single channel. */
+   case PIPE_FORMAT_A8_UNORM:
+      return SVGA3D_ALPHA8;
+   case PIPE_FORMAT_L8_UNORM:
+      return SVGA3D_LUMINANCE8;
+
+   /* Compressed; both DXT1 variants share one surface format. */
+   case PIPE_FORMAT_DXT1_RGB:
+   case PIPE_FORMAT_DXT1_RGBA:
+      return SVGA3D_DXT1;
+   case PIPE_FORMAT_DXT3_RGBA:
+      return SVGA3D_DXT3;
+   case PIPE_FORMAT_DXT5_RGBA:
+      return SVGA3D_DXT5;
+
+   default:
+      return SVGA3D_FORMAT_INVALID;
+   }
+}
+
+
+SVGA3dSurfaceFormat
+svga_translate_format_render(enum pipe_format format)
+{
+   switch(format) {
+   /* Colour, luminance and depth/stencil formats take whatever
+    * svga_translate_format() gives for them. */
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_R5G6B5_UNORM:
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+   case PIPE_FORMAT_L8_UNORM:
+   case PIPE_FORMAT_Z16_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z32_UNORM:
+      return svga_translate_format(format);
+#if 1
+   /* For on host conversion */
+   case PIPE_FORMAT_DXT1_RGB:
+      return SVGA3D_X8R8G8B8;
+   case PIPE_FORMAT_DXT3_RGBA:
+   case PIPE_FORMAT_DXT5_RGBA:
+   case PIPE_FORMAT_DXT1_RGBA:
+      return SVGA3D_A8R8G8B8;
+#endif
+   default:
+      return SVGA3D_FORMAT_INVALID;
+   }
+}
+
+/* Issue one SVGA3D_SurfaceDMA for h rows starting at row y; box.srcy is set to srcy. */
+static INLINE void
+svga_transfer_dma_band(struct svga_transfer *st,
+                       SVGA3dTransferType transfer,
+                       unsigned y, unsigned h, unsigned srcy)
+{
+   struct svga_texture *texture = svga_texture(st->base.texture); 
+   struct svga_screen *screen = svga_screen(texture->base.screen);
+   SVGA3dCopyBox box;
+   enum pipe_error ret;
+   
+   SVGA_DBG(DEBUG_DMA, "dma %s sid %p, face %u, (%u, %u, %u) - (%u, %u, %u), %ubpp\n",
+                transfer == SVGA3D_WRITE_HOST_VRAM ? "to" : "from", 
+                texture->handle,
+                st->base.face,
+                st->base.x,
+                y,
+                st->base.zslice,
+                st->base.x + st->base.width,
+                y + h,
+                st->base.zslice + 1,
+                texture->base.block.size*8/(texture->base.block.width*texture->base.block.height));
+   /* One box: the transfer's full width, h rows, a single z slice. */
+   box.x = st->base.x;
+   box.y = y;
+   box.z = st->base.zslice;
+   box.w = st->base.width;
+   box.h = h;
+   box.d = 1;
+   box.srcx = 0;
+   box.srcy = srcy;
+   box.srcz = 0;
+   /* If the command cannot be queued, flush the screen's command buffer and retry once. */
+   pipe_mutex_lock(screen->swc_mutex);
+   ret = SVGA3D_SurfaceDMA(screen->swc, st, transfer, &box, 1);
+   if(ret != PIPE_OK) {
+      screen->swc->flush(screen->swc, NULL);
+      ret = SVGA3D_SurfaceDMA(screen->swc, st, transfer, &box, 1);
+      assert(ret == PIPE_OK);
+   }
+   pipe_mutex_unlock(screen->swc_mutex);
+}
+
+/* Run the DMA for a transfer: in one go when there is no swbuf, otherwise band by band. */
+static INLINE void
+svga_transfer_dma(struct svga_transfer *st,
+                 SVGA3dTransferType transfer)
+{
+   struct svga_texture *texture = svga_texture(st->base.texture); 
+   struct svga_screen *screen = svga_screen(texture->base.screen);
+   struct svga_winsys_screen *sws = screen->sws;
+   struct pipe_fence_handle *fence = NULL;
+   
+   if (transfer == SVGA3D_READ_HOST_VRAM) {
+      SVGA_DBG(DEBUG_PERF, "%s: readback transfer\n", __FUNCTION__);
+   }
+
+   /* NOTE(review): 'fence' is never unreferenced in this function -- possible leak; confirm. */
+   if(!st->swbuf) {
+      /* Do the DMA transfer in a single go */
+      
+      svga_transfer_dma_band(st, transfer, st->base.y, st->base.height, 0);
+
+      if(transfer == SVGA3D_READ_HOST_VRAM) {
+         svga_screen_flush(screen, &fence);
+         sws->fence_finish(sws, fence, 0);
+         /* the release of 'fence' was left disabled here by the author */
+      }
+   }
+   else {
+      unsigned y, h, srcy;
+      h = st->hw_nblocksy * st->base.block.height;
+      srcy = 0;
+      for(y = 0; y < st->base.height; y += h) {
+         unsigned offset, length;
+         void *hw, *sw;
+
+         if (y + h > st->base.height)
+            h = st->base.height - y;
+
+         /* Transfer band must be aligned to pixel block boundaries */
+         assert(y % st->base.block.height == 0);
+         assert(h % st->base.block.height == 0);
+         /* Byte offset and length of this band within swbuf. */
+         offset = y * st->base.stride / st->base.block.height;
+         length = h * st->base.stride / st->base.block.height;
+
+         sw = (uint8_t *)st->swbuf + offset;
+         
+         if(transfer == SVGA3D_WRITE_HOST_VRAM) {
+            /* Wait for the previous DMAs to complete */
+            /* TODO: keep one DMA (at half the size) in the background */
+            if(y) {
+               svga_screen_flush(screen, &fence);
+               sws->fence_finish(sws, fence, 0);
+               /* the release of 'fence' was left disabled here by the author */
+            }
+
+            hw = sws->buffer_map(sws, st->hwbuf, PIPE_BUFFER_USAGE_CPU_WRITE);
+            assert(hw);
+            if(hw) {
+               memcpy(hw, sw, length);
+               sws->buffer_unmap(sws, st->hwbuf);
+            }
+         }
+         /* NOTE(review): y here is relative to 0, not st->base.y as in the single-go path; confirm. */
+         svga_transfer_dma_band(st, transfer, y, h, srcy);
+         
+         if(transfer == SVGA3D_READ_HOST_VRAM) {
+            svga_screen_flush(screen, &fence);
+            sws->fence_finish(sws, fence, 0);
+
+            hw = sws->buffer_map(sws, st->hwbuf, PIPE_BUFFER_USAGE_CPU_READ);
+            assert(hw);
+            if(hw) {
+               memcpy(sw, hw, length);
+               sws->buffer_unmap(sws, st->hwbuf);
+            }
+         }
+      }
+   }
+}
+
+/* Create a texture from templat along with its host surface; NULL on failure. */
+static struct pipe_texture *
+svga_texture_create(struct pipe_screen *screen,
+                    const struct pipe_texture *templat)
+{
+   struct svga_screen *svgascreen = svga_screen(screen);
+   struct svga_texture *tex = CALLOC_STRUCT(svga_texture);
+   unsigned width, height, depth;
+   unsigned level;
+   
+   if (!tex)
+      goto error1;
+
+   tex->base = *templat;
+   pipe_reference_init(&tex->base.reference, 1);
+   tex->base.screen = screen;
+
+   assert(templat->last_level < SVGA_MAX_TEXTURE_LEVELS);
+   if(templat->last_level >= SVGA_MAX_TEXTURE_LEVELS)
+      goto error2;
+   /* Fill in the per-level dimensions and block counts of the mip chain. */
+   width = templat->width[0];
+   height = templat->height[0];
+   depth = templat->depth[0];
+   for(level = 0; level <= templat->last_level; ++level) {
+      tex->base.width[level] = width;
+      tex->base.height[level] = height;
+      tex->base.depth[level] = depth;
+      tex->base.nblocksx[level] = pf_get_nblocksx(&tex->base.block, width);  
+      tex->base.nblocksy[level] = pf_get_nblocksy(&tex->base.block, height);  
+      width  = minify(width);
+      height = minify(height);
+      depth = minify(depth);
+   }
+   /* Describe the host surface to create (or to fetch from the cache). */
+   tex->key.flags = 0;
+   tex->key.size.width = templat->width[0];
+   tex->key.size.height = templat->height[0];
+   tex->key.size.depth = templat->depth[0];
+   
+   if(templat->target == PIPE_TEXTURE_CUBE) {
+      tex->key.flags |= SVGA3D_SURFACE_CUBEMAP;
+      tex->key.numFaces = 6;
+   }
+   else {
+      tex->key.numFaces = 1;
+   }
+
+   if(templat->tex_usage & PIPE_TEXTURE_USAGE_SAMPLER)
+      tex->key.flags |= SVGA3D_SURFACE_HINT_TEXTURE;
+
+   if(templat->tex_usage & PIPE_TEXTURE_USAGE_PRIMARY)
+      tex->key.flags |= SVGA3D_SURFACE_HINT_SCANOUT;
+   
+   /* 
+    * XXX: Never pass the SVGA3D_SURFACE_HINT_RENDERTARGET hint. Mesa cannot
+    * know beforehand whether a texture will be used as a rendertarget or not
+    * and it always requests PIPE_TEXTURE_USAGE_RENDER_TARGET, therefore
+    * passing the SVGA3D_SURFACE_HINT_RENDERTARGET here defeats its purpose.
+    */
+#if 0
+   if((templat->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) &&
+      !pf_is_compressed(templat->format))
+      tex->key.flags |= SVGA3D_SURFACE_HINT_RENDERTARGET;
+#endif
+   
+   if(templat->tex_usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL)
+      tex->key.flags |= SVGA3D_SURFACE_HINT_DEPTHSTENCIL;
+   
+   tex->key.numMipLevels = templat->last_level + 1;
+   
+   tex->key.format = svga_translate_format(templat->format);
+   if(tex->key.format == SVGA3D_FORMAT_INVALID)
+      goto error2;
+   tex->key.cachable = 1;
+   /* A texture without a host surface is unusable, so fail if none can be had. */
+   SVGA_DBG(DEBUG_DMA, "surface_create for texture\n");
+   tex->handle = svga_screen_surface_create(svgascreen, &tex->key);
+   if (!tex->handle)
+      goto error2;
+   SVGA_DBG(DEBUG_DMA, "  --> got sid %p (texture)\n", tex->handle);
+
+   return &tex->base;
+
+error2:
+   FREE(tex);
+error1:
+   return NULL;
+}
+
+/* Wrap a buffer's existing host surface in a 2D, single-level texture; NULL if unsupported. */
+static struct pipe_texture *
+svga_texture_blanket(struct pipe_screen * screen,
+                     const struct pipe_texture *base,
+                     const unsigned *stride,
+                     struct pipe_buffer *buffer)
+{
+   struct svga_texture *tex;
+   struct svga_buffer *sbuf = svga_buffer(buffer);
+   struct svga_winsys_screen *sws;
+
+   /* Check screen before it is used to look up the winsys. */
+   assert(screen);
+   sws = svga_winsys_screen(screen);
+
+   /* Only supports one type */
+   if (base->target != PIPE_TEXTURE_2D ||
+       base->last_level != 0 ||
+       base->depth[0] != 1) {
+      return NULL;
+   }
+
+   /* We currently can't do texture blanket on SVGA3D_BUFFER.
+    * Need to blit to a temporary surface? */
+
+   /* A missing buffer, or one without a host surface, cannot be wrapped. */
+   assert(sbuf && sbuf->handle);
+   if (!sbuf || !sbuf->handle)
+      return NULL;
+
+   if (svga_translate_format(base->format) != sbuf->key.format) {
+      unsigned f1 = svga_translate_format(base->format);
+      unsigned f2 = sbuf->key.format;
+
+      /* It's okay for XRGB and ARGB or depth with/out stencil to get mixed up */
+      if ( !( (f1 == SVGA3D_X8R8G8B8 && f2 == SVGA3D_A8R8G8B8) ||
+              (f1 == SVGA3D_A8R8G8B8 && f2 == SVGA3D_X8R8G8B8) ||
+              (f1 == SVGA3D_Z_D24X8 && f2 == SVGA3D_Z_D24S8) ) ) {
+         debug_printf("%s wrong format %u != %u\n", __FUNCTION__, f1, f2);
+         return NULL;
+      }
+   }
+
+   tex = CALLOC_STRUCT(svga_texture);
+   if (!tex)
+      return NULL;
+
+   tex->base = *base;
+
+   /* Report the buffer's own colour format for the XRGB/ARGB pair. */
+   if (sbuf->key.format == SVGA3D_X8R8G8B8)
+      tex->base.format = PIPE_FORMAT_X8R8G8B8_UNORM;
+   else if (sbuf->key.format == SVGA3D_A8R8G8B8)
+      tex->base.format = PIPE_FORMAT_A8R8G8B8_UNORM;
+
+   pipe_reference_init(&tex->base.reference, 1);
+   tex->base.screen = screen;
+
+   SVGA_DBG(DEBUG_DMA, "blanket sid %p\n", sbuf->handle);
+
+   /* We don't own this storage, so don't try to cache it. */
+   assert(sbuf->key.cachable == 0);
+   tex->key.cachable = 0;
+   sws->surface_reference(sws, &tex->handle, sbuf->handle);
+
+   return &tex->base;
+}
+
+
+static void
+svga_texture_destroy(struct pipe_texture *pt)
+{
+   struct svga_texture *stex = (struct svga_texture *)pt;
+   struct svga_screen *svgascreen = svga_screen(pt->screen);
+
+   /* Teardown order: bump the screen's texture timestamp, release the
+    * cached sampler view, give the host surface back to the screen
+    * (which may cache it), then free the texture itself.
+    */
+   svgascreen->texture_timestamp += 1;
+   svga_sampler_view_reference(&stex->cached_view, NULL);
+
+   SVGA_DBG(DEBUG_DMA, "unref sid %p (texture)\n", stex->handle);
+   svga_screen_surface_destroy(svgascreen, &stex->key, &stex->handle);
+
+   FREE(stex);
+}
+
+/* Queue a one-box SurfaceCopy from src_handle to dst_handle, via svga if given, else via ss. */
+static void
+svga_texture_copy_handle(struct svga_context *svga,
+                         struct svga_screen *ss,
+                         struct svga_winsys_surface *src_handle,
+                         unsigned src_x, unsigned src_y, unsigned src_z,
+                         unsigned src_level, unsigned src_face,
+                         struct svga_winsys_surface *dst_handle,
+                         unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                         unsigned dst_level, unsigned dst_face,
+                         unsigned width, unsigned height, unsigned depth)
+{
+   struct svga_surface dst, src;
+   enum pipe_error ret;
+   SVGA3dCopyBox box, *boxes;
+
+   assert(svga || ss);
+   /* Only the fields assigned below are set on these stack svga_surface objects. */
+   src.handle = src_handle;
+   src.real_level = src_level;
+   src.real_face = src_face;
+   src.real_zslice = 0;
+
+   dst.handle = dst_handle;
+   dst.real_level = dst_level;
+   dst.real_face = dst_face;
+   dst.real_zslice = 0;
+
+   box.x = dst_x;
+   box.y = dst_y;
+   box.z = dst_z;
+   box.w = width;
+   box.h = height;
+   box.d = depth;
+   box.srcx = src_x;
+   box.srcy = src_y;
+   box.srcz = src_z;
+
+/*
+   SVGA_DBG(DEBUG_VIEWS, "mipcopy src: %p %u (%ux%ux%u), dst: %p %u (%ux%ux%u)\n",
+            src_handle, src_level, src_x, src_y, src_z,
+            dst_handle, dst_level, dst_x, dst_y, dst_z);
+*/
+   /* With a context, use its command buffer; otherwise the screen's, under swc_mutex. */
+   if (svga) {
+      ret = SVGA3D_BeginSurfaceCopy(svga->swc,
+                                    &src.base,
+                                    &dst.base,
+                                    &boxes, 1);
+      if(ret != PIPE_OK) {
+         svga_context_flush(svga, NULL);
+         ret = SVGA3D_BeginSurfaceCopy(svga->swc,
+                                       &src.base,
+                                       &dst.base,
+                                       &boxes, 1);
+         assert(ret == PIPE_OK);
+      }
+      *boxes = box;
+      SVGA_FIFOCommitAll(svga->swc);
+   } else {
+      pipe_mutex_lock(ss->swc_mutex);
+      ret = SVGA3D_BeginSurfaceCopy(ss->swc,
+                                    &src.base,
+                                    &dst.base,
+                                    &boxes, 1);
+      if(ret != PIPE_OK) {
+         ss->swc->flush(ss->swc, NULL);
+         ret = SVGA3D_BeginSurfaceCopy(ss->swc,
+                                       &src.base,
+                                       &dst.base,
+                                       &boxes, 1);
+         assert(ret == PIPE_OK);
+      }
+      *boxes = box;
+      SVGA_FIFOCommitAll(ss->swc);
+      pipe_mutex_unlock(ss->swc_mutex);
+   }
+}
+
+/* Create a host surface and fill it with a copy of mip levels start_mip ..
+ * start_mip+num_mip-1 of tex (only those flagged in tex->defined).  Fills in
+ * *key.  Returns NULL, with key->cachable cleared, if no surface was created. */
+static struct svga_winsys_surface *
+svga_texture_view_surface(struct pipe_context *pipe,
+                          struct svga_texture *tex,
+                          SVGA3dSurfaceFormat format,
+                          unsigned start_mip,
+                          unsigned num_mip,
+                          int face_pick,   /* < 0: no single face picked */
+                          int zslice_pick, /* < 0: no single zslice picked */
+                          struct svga_host_surface_cache_key *key) /* OUT */
+{
+   struct svga_screen *ss = svga_screen(tex->base.screen);
+   struct svga_winsys_surface *handle;
+   int i, j;
+   unsigned z_offset = 0;
+
+   SVGA_DBG(DEBUG_PERF, 
+            "svga: Create surface view: face %d zslice %d mips %d..%d\n",
+            face_pick, zslice_pick, start_mip, start_mip+num_mip-1);
+
+   key->flags = 0;
+   key->format = format;
+   key->numMipLevels = num_mip;
+   key->size.width = tex->base.width[start_mip];
+   key->size.height = tex->base.height[start_mip];
+   key->size.depth = zslice_pick < 0 ? tex->base.depth[start_mip] : 1;
+   key->cachable = 1;
+   assert(key->size.depth == 1);
+   if(tex->base.target == PIPE_TEXTURE_CUBE && face_pick < 0) {
+      key->flags |= SVGA3D_SURFACE_CUBEMAP;
+      key->numFaces = 6;
+   } else {
+      key->numFaces = 1;
+   }
+
+   if(key->format == SVGA3D_FORMAT_INVALID) {
+      key->cachable = 0;
+      return NULL;
+   }
+
+   SVGA_DBG(DEBUG_DMA, "surface_create for texture view\n");
+   handle = svga_screen_surface_create(ss, key);
+   if (!handle) {
+      key->cachable = 0;
+      return NULL;
+   }
+   SVGA_DBG(DEBUG_DMA, " --> got sid %p (texture view)\n", handle);
+   if (face_pick < 0)
+      face_pick = 0;
+
+   if (zslice_pick >= 0)
+       z_offset = zslice_pick;
+
+   for (i = 0; i < key->numMipLevels; i++) {
+      for (j = 0; j < key->numFaces; j++) {
+         if(tex->defined[j + face_pick][i + start_mip]) {
+            unsigned depth = zslice_pick < 0 ? tex->base.depth[i + start_mip] : 1;
+            svga_texture_copy_handle(svga_context(pipe),
+                                     ss,
+                                     tex->handle, 
+                                     0, 0, z_offset, 
+                                     i + start_mip, 
+                                     j + face_pick,
+                                     handle, 0, 0, 0, i, j,
+                                     tex->base.width[i + start_mip],
+                                     tex->base.height[i + start_mip],
+                                     depth);
+         }
+      }
+   }
+
+   return handle;
+}
+
+/* Create a pipe_surface for one face/level/zslice of pt.  It either shares the
+ * texture's host surface or, when a view is needed, gets a separate view surface.
+ * Returns NULL if the surface struct or the view's host surface can't be created. */
+static struct pipe_surface *
+svga_get_tex_surface(struct pipe_screen *screen,
+                     struct pipe_texture *pt,
+                     unsigned face, unsigned level, unsigned zslice,
+                     unsigned flags)
+{
+   struct svga_texture *tex = svga_texture(pt);
+   struct svga_surface *s;
+   boolean render = flags & PIPE_BUFFER_USAGE_GPU_WRITE ? TRUE : FALSE;
+   boolean view = FALSE;
+   SVGA3dSurfaceFormat format;
+
+   s = CALLOC_STRUCT(svga_surface);
+   if (!s)
+      return NULL;
+
+   pipe_reference_init(&s->base.reference, 1);
+   pipe_texture_reference(&s->base.texture, pt);
+   s->base.format = pt->format;
+   s->base.width = pt->width[level];
+   s->base.height = pt->height[level];
+   s->base.usage = flags;
+   s->base.level = level;
+   s->base.face = face;
+   s->base.zslice = zslice;
+
+   if (!render)
+      format = svga_translate_format(pt->format);
+   else
+      format = svga_translate_format_render(pt->format);
+
+   assert(format != SVGA3D_FORMAT_INVALID);
+   assert(!(flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE));
+
+   if (svga_screen(screen)->debug.force_surface_view)
+      view = TRUE;
+
+   /* Currently only used for compressed textures */
+   if (render && format != svga_translate_format(pt->format))
+      view = TRUE;
+
+   if (level != 0 && svga_screen(screen)->debug.force_level_surface_view)
+      view = TRUE;
+
+   if (pt->target == PIPE_TEXTURE_3D)
+      view = TRUE;
+
+   if (svga_screen(screen)->debug.no_surface_view)
+      view = FALSE;
+
+   if (view) {
+      SVGA_DBG(DEBUG_VIEWS, "svga: Surface view: yes %p, level %u face %u z %u, %p\n",
+               pt, level, face, zslice, s);
+      s->handle = svga_texture_view_surface(NULL, tex, format, level, 1, face, zslice,
+                                            &s->key);
+      if (!s->handle) {  /* view creation failed: don't hand out a handle-less surface */
+         pipe_texture_reference(&s->base.texture, NULL);
+         FREE(s);
+         return NULL;
+      }
+      /* The view is a fresh one-level, one-face surface, so it starts at 0. */
+      s->real_face = s->real_level = s->real_zslice = 0;
+   } else {
+      struct svga_winsys_screen *sws = svga_winsys_screen(screen);
+
+      SVGA_DBG(DEBUG_VIEWS, "svga: Surface view: no %p, level %u, face %u, z %u, %p\n",
+               pt, level, face, zslice, s);
+      memset(&s->key, 0, sizeof s->key);
+      sws->surface_reference(sws, &s->handle, tex->handle);
+      s->real_face = face;
+      s->real_level = level;
+      s->real_zslice = zslice;
+   }
+
+   return &s->base;
+}
+
+/* Screen tex_surface_destroy hook: release the host surface and texture reference, then free surf. */
+static void
+svga_tex_surface_destroy(struct pipe_surface *surf)
+{
+   struct svga_surface *ssurf = svga_surface(surf);
+   struct svga_screen *svgascreen = svga_screen(surf->texture->screen);
+
+   SVGA_DBG(DEBUG_DMA, "unref sid %p (tex surface)\n", ssurf->handle);
+   assert(ssurf->key.cachable == 0); /* NOTE(review): view keys are created with cachable = 1 - confirm */
+   svga_screen_surface_destroy(svgascreen, &ssurf->key, &ssurf->handle);
+   pipe_texture_reference(&surf->texture, NULL);
+   FREE(surf);
+}
+
+/* Flag one surface as written to; a no-op if it is already dirty. */
+static INLINE void
+svga_mark_surface_dirty(struct pipe_surface *surf)
+{
+   struct svga_surface *ssurf = svga_surface(surf);
+   struct svga_texture *owner;
+
+   if (ssurf->dirty)
+      return;
+
+   ssurf->dirty = TRUE;
+   owner = svga_texture(surf->texture);
+
+   /* A surface sharing the texture's handle marks the level defined now;
+    * for a view svga_propagate_surface does it later. */
+   if (ssurf->handle == owner->handle)
+      owner->defined[surf->face][surf->level] = TRUE;
+}
+
+/* Mark every surface bound in the current framebuffer (color and z/s) dirty. */
+void svga_mark_surfaces_dirty(struct svga_context *svga)
+{
+   unsigned buf;
+
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; ++buf) {
+      struct pipe_surface *color = svga->curr.framebuffer.cbufs[buf];
+      if (color) svga_mark_surface_dirty(color);
+   }
+   if (svga->curr.framebuffer.zsbuf)
+      svga_mark_surface_dirty(svga->curr.framebuffer.zsbuf);
+}
+
+/**
+ * Propagate any pending changes from a dirty surface back to its texture.
+ * pipe is an optional context to inline the blit command in.
+ */
+void
+svga_propagate_surface(struct pipe_context *pipe, struct pipe_surface *surf)
+{
+   struct svga_surface *ssurf = svga_surface(surf);
+   struct svga_texture *owner = svga_texture(surf->texture);
+   struct svga_screen *svgascreen = svga_screen(surf->texture->screen);
+
+   if (!ssurf->dirty)
+      return;
+
+   ssurf->dirty = FALSE;
+   svgascreen->texture_timestamp++;
+   owner->view_age[surf->level] = ++owner->age;
+
+   if (ssurf->handle == owner->handle)
+      return; /* the surface uses the texture's own handle: nothing to copy */
+   SVGA_DBG(DEBUG_VIEWS, "svga: Surface propagate: tex %p, level %u, from %p\n", owner, surf->level, surf);
+   svga_texture_copy_handle(svga_context(pipe), svgascreen,
+                            ssurf->handle, 0, 0, 0, ssurf->real_level, ssurf->real_face,
+                            owner->handle, 0, 0, surf->zslice, surf->level, surf->face,
+                            owner->base.width[surf->level], owner->base.height[surf->level], 1);
+   owner->defined[surf->face][surf->level] = TRUE;
+}
+
+/**
+ * Tell whether svga_propagate_surface has anything to copy for this surface:
+ * true only for a dirty surface whose handle differs from its texture's.
+ */
+extern boolean
+svga_surface_needs_propagation(struct pipe_surface *surf)
+{
+   struct svga_surface *ssurf = svga_surface(surf);
+
+   return ssurf->dirty && ssurf->handle != svga_texture(surf->texture)->handle;
+}
+
+/* Screen get_tex_transfer hook: set up a transfer of the w x h region at (x, y)
+ * of one face/level/zslice, backed by a winsys buffer.  Returns NULL on failure. */
+static struct pipe_transfer *
+svga_get_tex_transfer(struct pipe_screen *screen,
+                     struct pipe_texture *texture,
+                     unsigned face, unsigned level, unsigned zslice,
+                     enum pipe_transfer_usage usage, unsigned x, unsigned y,
+                     unsigned w, unsigned h)
+{
+   struct svga_screen *ss = svga_screen(screen);
+   struct svga_winsys_screen *sws = ss->sws;
+   struct svga_transfer *st;
+
+   /* We can't map texture storage directly */
+   if (usage & PIPE_TRANSFER_MAP_DIRECTLY)
+      return NULL;
+
+   st = CALLOC_STRUCT(svga_transfer);
+   if (!st)
+      return NULL;
+   
+   st->base.format = texture->format;
+   st->base.block = texture->block;
+   st->base.x = x;
+   st->base.y = y;
+   st->base.width = w;
+   st->base.height = h;
+   st->base.nblocksx = pf_get_nblocksx(&texture->block, w);
+   st->base.nblocksy = pf_get_nblocksy(&texture->block, h);
+   st->base.stride = st->base.nblocksx*st->base.block.size;
+   st->base.usage = usage;
+   st->base.face = face;
+   st->base.level = level;
+   st->base.zslice = zslice;
+   st->hw_nblocksy = st->base.nblocksy;
+   /* Try a buffer for the whole region; on failure keep halving its height. */
+   st->hwbuf = svga_winsys_buffer_create(ss, 
+                                         1, 
+                                         0,
+                                         st->hw_nblocksy*st->base.stride);
+   while(!st->hwbuf && (st->hw_nblocksy /= 2)) {
+      st->hwbuf = svga_winsys_buffer_create(ss, 
+                                            1, 
+                                            0,
+                                            st->hw_nblocksy*st->base.stride);
+   }
+
+   if(!st->hwbuf)
+      goto no_hwbuf;
+
+   if(st->hw_nblocksy < st->base.nblocksy) {
+      /* We couldn't allocate a hardware buffer big enough for the transfer, 
+       * so allocate regular malloc memory instead */
+      debug_printf("%s: failed to allocate %u KB of DMA, splitting into %u x %u KB DMA transfers\n",
+                   __FUNCTION__,
+                   (st->base.nblocksy*st->base.stride + 1023)/1024,
+                   (st->base.nblocksy + st->hw_nblocksy - 1)/st->hw_nblocksy,
+                   (st->hw_nblocksy*st->base.stride + 1023)/1024);
+      st->swbuf = MALLOC(st->base.nblocksy*st->base.stride);
+      if(!st->swbuf)
+         goto no_swbuf;
+   }
+   
+   pipe_texture_reference(&st->base.texture, texture);
+
+   if (usage & PIPE_TRANSFER_READ)
+      svga_transfer_dma(st, SVGA3D_READ_HOST_VRAM);
+
+   return &st->base;
+
+no_swbuf:
+   sws->buffer_destroy(sws, st->hwbuf);
+no_hwbuf:
+   FREE(st);
+   return NULL;
+}
+
+/* Return a CPU pointer to the transfer's staging memory. */
+static void *
+svga_transfer_map( struct pipe_screen *screen,
+                   struct pipe_transfer *transfer )
+{
+   struct svga_winsys_screen *winsys = svga_screen(screen)->sws;
+   struct svga_transfer *xfer = svga_transfer(transfer);
+
+   /* The malloc'ed fallback buffer, when present, is handed out directly. */
+   if (xfer->swbuf != NULL)
+      return xfer->swbuf;
+
+   /* The wait for read transfers already happened when svga_transfer_dma
+    * was called. */
+   return winsys->buffer_map(winsys, xfer->hwbuf,
+                             pipe_transfer_buffer_flags(transfer));
+}
+
+/* Undo svga_transfer_map; only the hardware buffer ever gets mapped. */
+static void
+svga_transfer_unmap(struct pipe_screen *screen,
+                    struct pipe_transfer *transfer)
+{
+   struct svga_winsys_screen *winsys = svga_screen(screen)->sws;
+   struct svga_transfer *xfer = svga_transfer(transfer);
+
+   if (xfer->swbuf != NULL)
+      return;
+   winsys->buffer_unmap(winsys, xfer->hwbuf);
+}
+
+/* Finish a transfer: for writes issue the DMA and age the texture, then free everything. */
+static void
+svga_tex_transfer_destroy(struct pipe_transfer *transfer)
+{
+   struct svga_texture *owner = svga_texture(transfer->texture);
+   struct svga_screen *svgascreen = svga_screen(transfer->texture->screen);
+   struct svga_winsys_screen *winsys = svgascreen->sws;
+   struct svga_transfer *xfer = svga_transfer(transfer);
+
+   if (xfer->base.usage & PIPE_TRANSFER_WRITE) {
+      svga_transfer_dma(xfer, SVGA3D_WRITE_HOST_VRAM);
+      svgascreen->texture_timestamp++;
+      owner->view_age[transfer->level] = ++owner->age;
+      owner->defined[transfer->face][transfer->level] = TRUE;
+   }
+
+   pipe_texture_reference(&xfer->base.texture, NULL);
+   FREE(xfer->swbuf);
+   winsys->buffer_destroy(winsys, xfer->hwbuf);
+   FREE(xfer);
+}
+
+/* Install this file's texture, surface and transfer hooks on the screen. */
+void svga_screen_init_texture_functions(struct pipe_screen *screen)
+{
+   screen->texture_create = svga_texture_create;
+   screen->texture_blanket = svga_texture_blanket;
+   screen->texture_destroy = svga_texture_destroy;
+   screen->get_tex_surface = svga_get_tex_surface;
+   screen->tex_surface_destroy = svga_tex_surface_destroy;
+   screen->get_tex_transfer = svga_get_tex_transfer;
+   screen->tex_transfer_destroy = svga_tex_transfer_destroy;
+   screen->transfer_map = svga_transfer_map;
+   screen->transfer_unmap = svga_transfer_unmap;
+}
+
+/*********************************************************************** 
+ */
+
+/* Get a sampler view of mip levels min_lod..max_lod of pt.  The view shares
+ * the texture's host surface when the whole mip range is used, else it is a
+ * copy (cached on the texture).  Returns NULL if the view can't be allocated. */
+struct svga_sampler_view *
+svga_get_tex_sampler_view(struct pipe_context *pipe, struct pipe_texture *pt,
+                          unsigned min_lod, unsigned max_lod)
+{
+   struct svga_screen *ss = svga_screen(pt->screen);
+   struct svga_winsys_screen *sws = ss->sws;
+   struct svga_texture *tex = svga_texture(pt);
+   struct svga_sampler_view *sv = NULL;
+   SVGA3dSurfaceFormat format = svga_translate_format(pt->format);
+   boolean view = TRUE;
+
+   assert(pt);
+   assert(min_lod <= max_lod);
+   assert(max_lod <= pt->last_level);
+
+   /*
+    * Is a view needed?  We can't control max lod.  For first level views and
+    * when we only look at one level we disable mip filtering to achieve the
+    * same results as a view.
+    */
+   if (min_lod == 0 && max_lod >= pt->last_level)
+      view = FALSE;
+
+   if (pf_is_compressed(pt->format) && view)
+      format = svga_translate_format_render(pt->format);
+
+   if (ss->debug.no_sampler_view)
+      view = FALSE;
+
+   if (ss->debug.force_sampler_view)
+      view = TRUE;
+
+   /* First try the cache */
+   if (view) {
+      pipe_mutex_lock(ss->tex_mutex);
+      if (tex->cached_view &&
+          tex->cached_view->min_lod == min_lod &&
+          tex->cached_view->max_lod == max_lod) {
+         svga_sampler_view_reference(&sv, tex->cached_view);
+         pipe_mutex_unlock(ss->tex_mutex);
+         SVGA_DBG(DEBUG_VIEWS, "svga: Sampler view: reuse %p, %u %u, last %u\n",
+                              pt, min_lod, max_lod, pt->last_level);
+         svga_validate_sampler_view(svga_context(pipe), sv);
+         return sv;
+      }
+      pipe_mutex_unlock(ss->tex_mutex);
+   }
+
+   sv = CALLOC_STRUCT(svga_sampler_view);
+   if (!sv)
+      return NULL;
+   pipe_reference_init(&sv->reference, 1);
+   sv->texture = tex;
+   sv->min_lod = min_lod;
+   sv->max_lod = max_lod;
+
+   /* No view needed just use the whole texture */
+   if (!view) {
+      SVGA_DBG(DEBUG_VIEWS,
+               "svga: Sampler view: no %p, mips %u..%u, nr %u, size (%ux%ux%u), last %u\n",
+               pt, min_lod, max_lod,
+               max_lod - min_lod + 1,
+               pt->width[0],
+               pt->height[0],
+               pt->depth[0],
+               pt->last_level);
+      sv->key.cachable = 0;
+      sws->surface_reference(sws, &sv->handle, tex->handle);
+      return sv;
+   }
+
+   SVGA_DBG(DEBUG_VIEWS,
+            "svga: Sampler view: yes %p, mips %u..%u, nr %u, size (%ux%ux%u), last %u\n",
+            pt, min_lod, max_lod,
+            max_lod - min_lod + 1,
+            pt->width[0],
+            pt->height[0],
+            pt->depth[0],
+            pt->last_level);
+
+   sv->age = tex->age;
+   sv->handle = svga_texture_view_surface(pipe, tex, format,
+                                          min_lod,
+                                          max_lod - min_lod + 1,
+                                          -1, -1,
+                                          &sv->key);
+
+   /* Could not create the view surface: fall back to the texture itself. */
+   if (!sv->handle) {
+      assert(0);
+      sv->key.cachable = 0;
+      sws->surface_reference(sws, &sv->handle, tex->handle);
+      return sv;
+   }
+
+   pipe_mutex_lock(ss->tex_mutex);
+   svga_sampler_view_reference(&tex->cached_view, sv);
+   pipe_mutex_unlock(ss->tex_mutex);
+
+   return sv;
+}
+
+/* Bring a sampler view that is a private copy of its texture up to date:
+ * every level whose view_age is newer than the view's age is copied again
+ * (all faces), then the view takes over the texture's current age. */
+void
+svga_validate_sampler_view(struct svga_context *svga, struct svga_sampler_view *v)
+{
+   struct svga_texture *tex = v->texture;
+   const unsigned tex_age = tex->age;
+   const unsigned faces = (tex->base.target == PIPE_TEXTURE_CUBE) ? 6 : 1;
+   int level;
+   int face;
+
+   assert(svga);
+
+   /* A view sharing the texture's handle needs no copying. */
+   if (v->handle == v->texture->handle)
+      return;
+
+   for (level = v->min_lod; level <= v->max_lod; level++) {
+      for (face = 0; face < faces; face++) {
+         if (tex->view_age[level] <= v->age)
+            continue;
+
+         svga_texture_copy_handle(svga, NULL,
+                                  tex->handle, 0, 0, 0, level, face,
+                                  v->handle, 0, 0, 0, level - v->min_lod, face,
+                                  tex->base.width[level],
+                                  tex->base.height[level],
+                                  tex->base.depth[level]);
+      }
+   }
+
+   v->age = tex_age;
+}
+
+/* Free a sampler view and give up its host surface handle. */
+void svga_destroy_sampler_view_priv(struct svga_sampler_view *v)
+{
+   struct svga_texture *owner = v->texture;
+   struct svga_screen *svgascreen = svga_screen(owner->base.screen);
+
+   SVGA_DBG(DEBUG_DMA, "unref sid %p (sampler view)\n", v->handle);
+   svga_screen_surface_destroy(svgascreen, &v->key, &v->handle);
+   FREE(v);
+}
+
+/* Wrap the texture's host surface in a pipe_buffer; also reports level 0's stride. */
+boolean
+svga_screen_buffer_from_texture(struct pipe_texture *texture,
+				struct pipe_buffer **buffer,
+				unsigned *stride)
+{
+   struct svga_winsys_surface *surface = svga_texture(texture)->handle;
+
+   *buffer = svga_screen_buffer_wrap_surface(texture->screen,
+                                             svga_translate_format(texture->format),
+                                             surface);
+
+   *stride = texture->block.size *
+      pf_get_nblocksx(&texture->block, texture->width[0]);
+
+   return *buffer != NULL;
+}
+
+/* Return a new reference to the texture's host surface; its key is forced non-cachable. */
+struct svga_winsys_surface *
+svga_screen_texture_get_winsys_surface(struct pipe_texture *texture)
+{
+   struct svga_winsys_screen *winsys = svga_winsys_screen(texture->screen);
+   struct svga_winsys_surface *ref = NULL;
+
+   assert(svga_texture(texture)->key.cachable == 0);
+   svga_texture(texture)->key.cachable = 0;
+   winsys->surface_reference(winsys, &ref, svga_texture(texture)->handle);
+   return ref;
+}
diff --git a/src/gallium/drivers/svga/svga_screen_texture.h b/src/gallium/drivers/svga/svga_screen_texture.h
new file mode 100644
index 00000000000..1cc4063e653
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_screen_texture.h
@@ -0,0 +1,187 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_TEXTURE_H
+#define SVGA_TEXTURE_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "svga_screen_cache.h"
+
+struct pipe_context;
+struct pipe_screen;
+struct svga_context;
+struct svga_winsys_surface;
+enum SVGA3dSurfaceFormat;
+
+
+#define SVGA_MAX_TEXTURE_LEVELS 12 /* 2048x2048 */
+
+
+/**
+ * A sampler's view into a texture
+ *
+ * We currently cache one sampler view on
+ * the texture, thereby holding a reference
+ * from the texture to the sampler view.
+ *
+ * Because of this we cannot hold a reference to the
+ * texture from the sampler view. So the user
+ * of the sampler views must make sure that the
+ * texture has a reference taken for as long as
+ * the sampler view is referenced.
+ *
+ * Just unreferencing the sampler_view before the
+ * texture is enough.
+ */
+struct svga_sampler_view
+{
+   struct pipe_reference reference;
+
+   struct svga_texture *texture; /* not reference counted, see above */
+
+   int min_lod; /* first mip level of the texture covered by the view */
+   int max_lod; /* last mip level of the texture covered by the view */
+
+   unsigned age; /* texture age when the view was created or last validated */
+
+   struct svga_host_surface_cache_key key;
+   struct svga_winsys_surface *handle; /* equals texture->handle when no copy was made */
+};
+
+
+struct svga_texture 
+{
+   struct pipe_texture base;
+
+   boolean defined[6][PIPE_MAX_TEXTURE_LEVELS]; /* [face][level]: TRUE once set via a dirty surface or a write transfer */
+   
+   struct svga_sampler_view *cached_view; /* the one sampler view cached on this texture, if any */
+   /* NOTE(review): view_age is sized by SVGA_MAX_TEXTURE_LEVELS but indexed by level like defined[] - confirm bounds. */
+   unsigned view_age[SVGA_MAX_TEXTURE_LEVELS]; /* per level: value of age at its last modification */
+   unsigned age; /* incremented whenever a level is modified */
+
+   boolean views_modified;
+
+   /**
+    * Creation key for the host surface handle.
+    * 
+    * This structure describes all the host surface characteristics so that it 
+    * can be looked up in cache, since creating a host surface is often a slow
+    * operation.
+    */
+   struct svga_host_surface_cache_key key;
+   struct svga_winsys_surface *handle;
+};
+
+
+struct svga_surface
+{
+   struct pipe_surface base;
+
+   struct svga_host_surface_cache_key key;
+   struct svga_winsys_surface *handle; /* the texture's own handle, or a separate view surface */
+
+   unsigned real_face;   /* face/level/zslice of this surface within handle; */
+   unsigned real_level;  /* all zero when handle is a separate view surface */
+   unsigned real_zslice;
+
+   boolean dirty; /* set by svga_mark_surface_dirty, cleared by svga_propagate_surface */
+};
+
+
+struct svga_transfer
+{
+   struct pipe_transfer base;
+
+   struct svga_winsys_buffer *hwbuf; /* winsys buffer; may cover fewer rows than the transfer */
+
+   /* Height of the hardware buffer in pixel blocks */
+   unsigned hw_nblocksy;
+
+   /* Temporary malloc buffer when we can't allocate a hardware buffer
+    * big enough */
+   void *swbuf; /* NULL when hwbuf covers the whole transfer */
+};
+
+
+static INLINE struct svga_texture *
+svga_texture(struct pipe_texture *texture)
+{
+   return (struct svga_texture *)texture; /* downcast: pipe_texture is svga_texture's first member */
+}
+
+static INLINE struct svga_surface *
+svga_surface(struct pipe_surface *surface)
+{
+   assert(surface); /* NULL is only caught here when asserts are enabled */
+   return (struct svga_surface *)surface; /* downcast: pipe_surface is svga_surface's first member */
+}
+
+static INLINE struct svga_transfer *
+svga_transfer(struct pipe_transfer *transfer)
+{
+   assert(transfer); /* NULL is only caught here when asserts are enabled */
+   return (struct svga_transfer *)transfer; /* downcast: pipe_transfer is svga_transfer's first member */
+}
+
+extern struct svga_sampler_view *
+svga_get_tex_sampler_view(struct pipe_context *pipe,
+                          struct pipe_texture *pt,
+                          unsigned min_lod, unsigned max_lod);
+
+void
+svga_validate_sampler_view(struct svga_context *svga, struct svga_sampler_view *v);
+
+void
+svga_destroy_sampler_view_priv(struct svga_sampler_view *v);
+
+static INLINE void
+svga_sampler_view_reference(struct svga_sampler_view **ptr, struct svga_sampler_view *v)
+{
+   struct svga_sampler_view *old = *ptr;
+   /* old is saved first: *ptr is handed to pipe_reference(), which presumably repoints it - confirm. */
+   if (pipe_reference((struct pipe_reference **)ptr, &v->reference))
+      svga_destroy_sampler_view_priv(old);
+}
+
+extern void
+svga_propagate_surface(struct pipe_context *pipe, struct pipe_surface *surf);
+
+extern boolean
+svga_surface_needs_propagation(struct pipe_surface *surf);
+
+extern void
+svga_screen_init_texture_functions(struct pipe_screen *screen);
+
+enum SVGA3dSurfaceFormat
+svga_translate_format(enum pipe_format format);
+
+enum SVGA3dSurfaceFormat
+svga_translate_format_render(enum pipe_format format);
+
+
+#endif /* SVGA_TEXTURE_H */
diff --git a/src/gallium/drivers/svga/svga_state.c b/src/gallium/drivers/svga/svga_state.c
new file mode 100644
index 00000000000..1c21d3acfe3
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state.c
@@ -0,0 +1,278 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_debug.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+#include "svga_context.h"
+#include "svga_screen.h"
+#include "svga_state.h"
+#include "svga_draw.h"
+#include "svga_cmd.h"
+#include "svga_hw_reg.h"
+
+/* This is just enough to decide whether we need to use the draw
+ * module (swtnl) or not.  NULL-terminated; first entry of state_levels.
+ */
+static const struct svga_tracked_state *need_swtnl_state[] =
+{
+   &svga_update_need_swvfetch,
+   &svga_update_need_pipeline,
+   &svga_update_need_swtnl,
+   NULL
+};
+
+
+/* Atoms to update hardware state prior to emitting a clear or draw
+ * packet.  NULL-terminated; second entry of state_levels.
+ */
+static const struct svga_tracked_state *hw_clear_state[] =
+{
+   &svga_hw_scissor,
+   &svga_hw_viewport,
+   &svga_hw_framebuffer,
+   NULL
+};
+
+
+/* Atoms to update hardware state prior to emitting a draw packet.
+ * NULL-terminated; update_state() runs them in the order listed. */
+static const struct svga_tracked_state *hw_draw_state[] =
+{
+   &svga_hw_update_zero_stride,
+   &svga_hw_fs,
+   &svga_hw_vs,
+   &svga_hw_rss,
+   &svga_hw_tss,
+   &svga_hw_tss_binding,
+   &svga_hw_clip_planes,
+   &svga_hw_vdecl,
+   &svga_hw_fs_parameters,
+   &svga_hw_vs_parameters,
+   NULL
+};
+
+/* Atoms of the swtnl draw level, the last entry of state_levels.  NULL-terminated. */
+static const struct svga_tracked_state *swtnl_draw_state[] =
+{
+   &svga_update_swtnl_draw,
+   &svga_update_swtnl_vdecl,
+   NULL
+};
+
+/* Flattens the graph of state dependencies; indexed by update level.  Could
+ * swap the positions of hw_clear_state and need_swtnl_state without breaking
+ * anything. */
+static const struct svga_tracked_state **state_levels[] = 
+{
+   need_swtnl_state,
+   hw_clear_state,
+   hw_draw_state,
+   swtnl_draw_state
+};
+
+
+
+/* Return the bits set in both masks (non-zero if they intersect). */
+static unsigned check_state( unsigned a, unsigned b )
+{
+   return b & a;
+}
+
+/* OR the bits of b into the mask pointed to by a. */
+static void accumulate_state( unsigned *a, unsigned b )
+{
+   *a = *a | b;
+}
+
+
+/* Store in *result the bits that differ between masks a and b. */
+static void xor_states( unsigned *result, unsigned a, unsigned b )
+{
+   const unsigned changed = a ^ b;
+   *result = changed;
+}
+
+/* Flush pending hwtnl work, then run each atom of the NULL-terminated list
+ * whose dirty mask intersects *state.  Returns the first non-zero result, else 0. */
+static int update_state( struct svga_context *svga,
+                         const struct svga_tracked_state *atoms[],
+                         unsigned *state )
+{
+   boolean debug = TRUE; /* hard-wired, so the checking variant below always runs */
+   enum pipe_error ret = 0;
+   unsigned i;
+
+   ret = svga_hwtnl_flush( svga->hwtnl );
+   if (ret != 0)
+      return ret;
+
+   if (debug) {
+      /* Debug version which enforces various sanity checks on the
+       * state flags which are generated and checked to help ensure
+       * state atoms are ordered correctly in the list.
+       */
+      unsigned examined, prev;      
+
+      examined = 0;
+      prev = *state;
+
+      for (i = 0; atoms[i] != NULL; i++) {	 
+	 unsigned generated;
+
+	 assert(atoms[i]->dirty); 
+	 assert(atoms[i]->update);
+
+	 if (check_state(*state, atoms[i]->dirty)) {
+	    if (0)
+               debug_printf("update: %s\n", atoms[i]->name);
+	    ret = atoms[i]->update( svga, *state );
+            if (ret != 0)
+               return ret;
+	 }
+
+	 /* generated = (prev ^ state)
+	  * if (examined & generated)
+	  *     fail;
+	  */
+	 xor_states(&generated, prev, *state);
+	 if (check_state(examined, generated)) {
+	    debug_printf("state atom %s generated state already examined\n", 
+                         atoms[i]->name);
+	    assert(0);
+	 }
+			 
+	 prev = *state;
+	 accumulate_state(&examined, atoms[i]->dirty);
+      }
+   }
+   else {
+      for (i = 0; atoms[i] != NULL; i++) {	 
+	 if (check_state(*state, atoms[i]->dirty)) {
+	    ret = atoms[i]->update( svga, *state );
+            if (ret != 0)
+               return ret;
+         }
+      }
+   }
+
+   return 0;
+}
+
+/* Run the update atoms of levels 0..max_level on the accumulated dirty flags.
+ * Returns the first non-zero result from update_state(), else 0. */
+int svga_update_state( struct svga_context *svga,
+                       unsigned max_level )
+{
+   struct svga_screen *screen = svga_screen(svga->pipe.screen);
+   int ret = 0;
+   int i;
+
+   /* Check for updates to bound textures.  This can't be done in an
+    * atom as there is no flag which could provoke this test, and we
+    * cannot create one.
+    */
+   if (svga->state.texture_timestamp != screen->texture_timestamp) {
+      svga->state.texture_timestamp = screen->texture_timestamp;
+      svga->dirty |= SVGA_NEW_TEXTURE;
+   }
+
+   for (i = 0; i <= max_level; i++) {
+      svga->dirty |= svga->state.dirty[i];
+
+      if (svga->dirty) {
+         ret = update_state( svga, 
+                             state_levels[i], 
+                             &svga->dirty );
+         if (ret != 0)
+            return ret;
+
+         svga->state.dirty[i] = 0;
+      }
+   }
+   /* Hand whatever is still dirty on to the levels that were not run. */
+   for (; i < SVGA_STATE_MAX; i++) 
+      svga->state.dirty[i] |= svga->dirty;
+
+   svga->dirty = 0;
+   return 0;
+}
+
+
+/* Like svga_update_state(), but on PIPE_ERROR_OUT_OF_MEMORY flush the context
+ * and try once more.  Any remaining failure is only caught by an assert. */
+void svga_update_state_retry( struct svga_context *svga,
+                              unsigned max_level )
+{
+   int status;
+
+   status = svga_update_state( svga, max_level );
+   if (status == PIPE_ERROR_OUT_OF_MEMORY) {
+      /* presumably the flush frees up space for the retry - confirm */
+      svga_context_flush(svga, NULL);
+      status = svga_update_state( svga, max_level );
+   }
+
+   assert( status == 0 );
+}
+
+/* Append render state (_name, _value) at _rs[_count] and advance _count.
+ * _rs and _count are expanded more than once; pass plain lvalues. */
+#define EMIT_RS(_rs, _count, _name, _value)     \
+do {                                            \
+   _rs[_count].state = _name;                   \
+   _rs[_count].uintValue = _value;              \
+   _count++;                                    \
+} while (0)
+
+
+/* Emit the render states that stay constant for the life of a context.
+ * Returns the error from SVGA3D_BeginSetRenderState, or 0 on success.
+ */
+enum pipe_error svga_emit_initial_state( struct svga_context *svga )
+{
+   const unsigned num_states = 2;   /* must match the EMIT_RS calls below */
+   SVGA3dRenderState *states;
+   unsigned emitted = 0;
+   enum pipe_error status;
+
+   status = SVGA3D_BeginSetRenderState( svga->swc, &states, num_states );
+   if (status != 0)
+      return status;
+
+   /* Always use D3D style coordinate space as this is the only one
+    * which is implemented on all backends.
+    */
+   EMIT_RS(states, emitted, SVGA3D_RS_COORDINATETYPE, SVGA3D_COORDINATE_LEFTHANDED );
+   EMIT_RS(states, emitted, SVGA3D_RS_FRONTWINDING, SVGA3D_FRONTWINDING_CW );
+
+   /* The number emitted must equal the number requested above. */
+   assert( emitted == num_states );
+   SVGA_FIFOCommitAll( svga->swc );
+
+   return 0;
+}
diff --git a/src/gallium/drivers/svga/svga_state.h b/src/gallium/drivers/svga/svga_state.h
new file mode 100644
index 00000000000..22d5a6d552a
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state.h
@@ -0,0 +1,95 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_STATE_H
+#define SVGA_STATE_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+
+struct svga_context;
+
+
+void svga_init_state( struct svga_context *svga );
+void svga_destroy_state( struct svga_context *svga );
+
+
+struct svga_tracked_state {
+   const char *name;   /* human-readable label for this piece of tracked state */
+   unsigned dirty;     /* mask of SVGA_NEW_* flags associated with this state */
+   int (*update)( struct svga_context *svga, unsigned dirty ); /* emit callback; the ones in this driver return 0 on success */
+};
+
+/* NEED_SWTNL
+ */
+extern struct svga_tracked_state svga_update_need_swvfetch;
+extern struct svga_tracked_state svga_update_need_pipeline;
+extern struct svga_tracked_state svga_update_need_swtnl;
+
+/* HW_CLEAR
+ */
+extern struct svga_tracked_state svga_hw_viewport;
+extern struct svga_tracked_state svga_hw_scissor;
+extern struct svga_tracked_state svga_hw_framebuffer;
+
+/* HW_DRAW
+ */
+extern struct svga_tracked_state svga_hw_vs;
+extern struct svga_tracked_state svga_hw_fs;
+extern struct svga_tracked_state svga_hw_rss;
+extern struct svga_tracked_state svga_hw_tss;
+extern struct svga_tracked_state svga_hw_tss_binding;
+extern struct svga_tracked_state svga_hw_clip_planes;
+extern struct svga_tracked_state svga_hw_vdecl;
+extern struct svga_tracked_state svga_hw_fs_parameters;
+extern struct svga_tracked_state svga_hw_vs_parameters;
+extern struct svga_tracked_state svga_hw_update_zero_stride;
+
+/* SWTNL_DRAW
+ */
+extern struct svga_tracked_state svga_update_swtnl_draw;
+extern struct svga_tracked_state svga_update_swtnl_vdecl;
+
+/* Bring the hardware fully up-to-date so that we can emit draw
+ * commands.
+ */
+#define SVGA_STATE_NEED_SWTNL        0
+#define SVGA_STATE_HW_CLEAR          1
+#define SVGA_STATE_HW_DRAW           2
+#define SVGA_STATE_SWTNL_DRAW        3
+#define SVGA_STATE_MAX               4
+
+
+enum pipe_error svga_update_state( struct svga_context *svga,
+                                   unsigned level );
+
+void svga_update_state_retry( struct svga_context *svga,
+                              unsigned level );
+
+
+enum pipe_error svga_emit_initial_state( struct svga_context *svga );
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
new file mode 100644
index 00000000000..18cce7dde1a
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -0,0 +1,239 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+#include "svga_tgsi.h"
+#include "svga_debug.h"
+
+#include "svga_hw_reg.h"
+
+/***********************************************************************
+ * Hardware update 
+ */
+
+/* Convert from PIPE_SHADER_* to SVGA3D_SHADERTYPE_*
+ */
+static int svga_shader_type( int unit )
+{
+   return 1 + unit;   /* the SVGA3D shader type is the PIPE_SHADER_* value plus one */
+}
+
+
+static int emit_const( struct svga_context *svga,
+                       int unit,
+                       int i,
+                       const float *value )
+{
+   const unsigned vec4_bytes = 4 * sizeof(float);
+   int ret;
+
+   /* Skip the upload when the shadow copy already holds this value. */
+   if (memcmp(svga->state.hw_draw.cb[unit][i], value, vec4_bytes) == 0)
+      return PIPE_OK;
+
+   if (SVGA_DEBUG & DEBUG_CONSTS) {
+      debug_printf("%s %s %d: %f %f %f %f\n",
+                   __FUNCTION__,
+                   unit == PIPE_SHADER_VERTEX ? "VERT" : "FRAG",
+                   i,
+                   value[0], value[1], value[2], value[3]);
+   }
+
+   ret = SVGA3D_SetShaderConst( svga->swc,
+                                i,
+                                svga_shader_type(unit),
+                                SVGA3D_CONST_TYPE_FLOAT,
+                                value );
+   if (ret)
+      return ret;
+   /* Update the shadow copy only after the command was accepted. */
+   memcpy(svga->state.hw_draw.cb[unit][i], value, vec4_bytes);
+   return PIPE_OK;
+}
+
+static int emit_consts( struct svga_context *svga,
+                        int offset,
+                        int unit )
+{
+   struct pipe_screen *screen = svga->pipe.screen;
+   const float (*vec4)[4];
+   unsigned nr_vec4;
+   unsigned n;
+   int ret = PIPE_OK;
+
+   /* No constant buffer bound for this shader stage: nothing to emit. */
+   if (svga->curr.cb[unit] == NULL)
+      return PIPE_OK;
+
+   /* The buffer is consumed as an array of four-float vectors. */
+   nr_vec4 = svga->curr.cb[unit]->size / (4 * sizeof(float));
+
+   vec4 = (const float (*)[4])pipe_buffer_map(screen,
+                                              svga->curr.cb[unit],
+                                              PIPE_BUFFER_USAGE_CPU_READ);
+   /* A failed mapping is reported as out-of-memory. */
+   if (vec4 == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* Emit each vector in turn, stopping at the first failure. */
+   for (n = 0; n < nr_vec4; n++) {
+      ret = emit_const( svga, unit, offset + n, vec4[n] );
+      if (ret)
+         break;
+   }
+
+   /* Always release the mapping, whether or not the loop completed. */
+   pipe_buffer_unmap(screen, svga->curr.cb[unit]);
+   return ret;
+}
+   
+static int emit_fs_consts( struct svga_context *svga,
+                           unsigned dirty )
+{
+   const struct svga_shader_result *result = svga->state.hw_draw.fs;
+   const struct svga_fs_compile_key *key;
+   unsigned offset;
+   int ret, i;
+
+   /* Constants from the bound fragment constant buffer, starting at 0. */
+   ret = emit_consts( svga, 0, PIPE_SHADER_FRAGMENT );
+   if (ret)
+      return ret;
+
+   /* The internally generated xor-blend fragment shader has no
+    * 'result' struct, so only touch the key after a NULL check.
+    */
+   if (result == NULL)
+      return 0;
+   key = &result->key.fkey;
+   if (!key->num_unnormalized_coords)
+      return 0;
+
+   /* Extra constants start one past file_max[TGSI_FILE_CONSTANT]. */
+   offset = result->shader->info.file_max[TGSI_FILE_CONSTANT] + 1;
+   for (i = 0; i < key->num_textures; i++) {
+      if (key->tex[i].unnormalized) {
+         struct pipe_texture *tex = svga->curr.texture[i];
+         float data[4];
+
+         /* 1/width[0] and 1/height[0] of the texture bound to unit i. */
+         data[0] = 1.0 / (float)tex->width[0];
+         data[1] = 1.0 / (float)tex->height[0];
+         data[2] = 1.0;
+         data[3] = 1.0;
+         ret = emit_const( svga,
+                           PIPE_SHADER_FRAGMENT,
+                           key->tex[i].width_height_idx + offset,
+                           data );
+         if (ret)
+            return ret;
+      }
+   }
+
+   return 0;
+}
+
+
+struct svga_tracked_state svga_hw_fs_parameters = 
+{
+   "hw fs params",                 /* name */
+   (SVGA_NEW_FS_CONST_BUFFER |     /* dirty: SVGA_NEW_* mask for this entry */
+    SVGA_NEW_FS_RESULT |
+    SVGA_NEW_TEXTURE_BINDING),
+   emit_fs_consts                  /* update callback */
+};
+
+/***********************************************************************
+ */
+
+static int emit_vs_consts( struct svga_context *svga,
+                           unsigned dirty )
+{
+   const struct svga_shader_result *result = svga->state.hw_draw.vs;
+   const struct svga_vs_compile_key *key;
+   int ret = 0;
+   unsigned offset;
+
+   /* SVGA_NEW_VS_RESULT: nothing to emit without a shader result. */
+   if (result == NULL)
+      return 0;
+   key = &result->key.vkey;   /* derived only once result is non-NULL */
+
+   /* SVGA_NEW_VS_CONST_BUFFER
+    */
+   ret = emit_consts( svga, 0, PIPE_SHADER_VERTEX );
+   if (ret)
+      return ret;
+
+   /* Extra constants start one past file_max[TGSI_FILE_CONSTANT]. */
+   offset = result->shader->info.file_max[TGSI_FILE_CONSTANT] + 1;
+
+   /* SVGA_NEW_VS_RESULT: prescale scale, then translate, one constant each. */
+   if (key->need_prescale) {
+      ret = emit_const( svga, PIPE_SHADER_VERTEX, offset++,
+                        svga->state.hw_clear.prescale.scale );
+      if (ret)
+         return ret;
+
+      ret = emit_const( svga, PIPE_SHADER_VERTEX, offset++,
+                        svga->state.hw_clear.prescale.translate );
+      if (ret)
+         return ret;
+   }
+
+   /* SVGA_NEW_ZERO_STRIDE: one constant per bit set in the mask.
+    * The shift is unsigned so that bit 31 can be tested safely. */
+   if (key->zero_stride_vertex_elements) {
+      unsigned i, curr_zero_stride = 0;
+      for (i = 0; i < PIPE_MAX_ATTRIBS; ++i) {
+         if (key->zero_stride_vertex_elements & (1u << i)) {
+            ret = emit_const( svga, PIPE_SHADER_VERTEX, offset++,
+                              svga->curr.zero_stride_constants +
+                              4 * curr_zero_stride );
+            if (ret)
+               return ret;
+            ++curr_zero_stride;
+         }
+      }
+   }
+
+   return 0;
+}
+
+
+struct svga_tracked_state svga_hw_vs_parameters = 
+{
+   "hw vs params",
+   (SVGA_NEW_PRESCALE |    /* emit_vs_consts uploads hw_clear.prescale */
+    SVGA_NEW_VS_CONST_BUFFER |
+    SVGA_NEW_ZERO_STRIDE | SVGA_NEW_VS_RESULT),
+   emit_vs_consts
+};
+
diff --git a/src/gallium/drivers/svga/svga_state_framebuffer.c b/src/gallium/drivers/svga/svga_state_framebuffer.c
new file mode 100644
index 00000000000..cfdcae4ee4a
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_framebuffer.c
@@ -0,0 +1,458 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+#include "svga_debug.h"
+
+#include "svga_hw_reg.h"
+
+
+/***********************************************************************
+ * Hardware state update
+ */
+
+
+static int emit_framebuffer( struct svga_context *svga,
+                             unsigned dirty )
+{
+   const struct pipe_framebuffer_state *curr = &svga->curr.framebuffer;
+   struct pipe_framebuffer_state *hw = &svga->state.hw_clear.framebuffer;
+   struct pipe_surface *stencil = NULL;
+   enum pipe_error ret;
+   unsigned i;
+
+   /* XXX: Need shadow state in svga->hw to eliminate redundant
+    * uploads, especially of NULL buffers.
+    */
+
+   /* Rebind only the colour targets that differ from what was last emitted. */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
+      if (curr->cbufs[i] == hw->cbufs[i])
+         continue;
+
+      /* Every rebind bumps nr_fbs; past 8 the function bails out
+       * with an out-of-memory error rather than emitting. */
+      if (svga->curr.nr_fbs++ > 8)
+         return PIPE_ERROR_OUT_OF_MEMORY;
+
+      ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_COLOR0 + i, curr->cbufs[i]);
+      if (ret != PIPE_OK)
+         return ret;
+
+      pipe_surface_reference(&hw->cbufs[i], curr->cbufs[i]);
+   }
+
+   /* Depth/stencil binding unchanged: done. */
+   if (curr->zsbuf == hw->zsbuf)
+      return 0;
+
+   ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_DEPTH, curr->zsbuf);
+   if (ret != PIPE_OK)
+      return ret;
+
+   /* Only a Z24S8 surface doubles as the stencil target; otherwise unbind it. */
+   if (curr->zsbuf && curr->zsbuf->format == PIPE_FORMAT_Z24S8_UNORM)
+      stencil = curr->zsbuf;
+
+   ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_STENCIL, stencil);
+   if (ret != PIPE_OK)
+      return ret;
+
+   pipe_surface_reference(&hw->zsbuf, curr->zsbuf);
+   return 0;
+}
+
+
+struct svga_tracked_state svga_hw_framebuffer = 
+{
+   "hw framebuffer state",   /* name */
+   SVGA_NEW_FRAME_BUFFER,    /* dirty: SVGA_NEW_* mask for this entry */
+   emit_framebuffer          /* update callback */
+};
+
+
+
+
+/***********************************************************************
+ * Viewport state: device viewport rect, depth range and prescale
+ */
+static int emit_viewport( struct svga_context *svga,
+                          unsigned dirty )
+{
+   const struct pipe_viewport_state *viewport = &svga->curr.viewport;
+   struct svga_prescale prescale;
+   SVGA3dRect rect;
+   /* Not sure if this state is relevant with POSITIONT.  Probably
+    * not, but setting to 0,1 avoids some state pingponging.
+    */
+   float range_min = 0.0;
+   float range_max = 1.0;
+   float flip = -1.0;             /* sign applied to the gallium y scale below */
+   boolean degenerate = FALSE;    /* set when clamping leaves a negative extent */
+   enum pipe_error ret;
+
+   float fb_width = svga->curr.framebuffer.width;
+   float fb_height = svga->curr.framebuffer.height;
+
+   memset( &prescale, 0, sizeof(prescale) ); /* zero first: compared with memcmp() below */
+
+   if (svga->curr.rast->templ.bypass_vs_clip_and_viewport) {
+
+      /* Avoid POSITIONT as it has a non trivial implementation outside the D3D
+       * API. Always generate a vertex shader.
+       */
+      rect.x = 0;
+      rect.y = 0;
+      rect.w = svga->curr.framebuffer.width;
+      rect.h = svga->curr.framebuffer.height;
+
+      prescale.scale[0] = 2.0 / (float)rect.w;
+      prescale.scale[1] = - 2.0 / (float)rect.h;
+      prescale.scale[2] = 1.0;
+      prescale.scale[3] = 1.0;
+      prescale.translate[0] = -1.0f;
+      prescale.translate[1] = 1.0f;
+      prescale.translate[2] = 0;
+      prescale.translate[3] = 0;
+      prescale.enabled = TRUE;
+   } else {
+
+      /* Examine gallium viewport transformation and produce a screen
+       * rectangle and possibly vertex shader pre-transformation to
+       * get the same results.
+       */
+      float fx =        viewport->scale[0] * -1.0 + viewport->translate[0];
+      float fy = flip * viewport->scale[1] * -1.0 + viewport->translate[1];
+      float fw =        viewport->scale[0] * 2; 
+      float fh = flip * viewport->scale[1] * 2; 
+
+      SVGA_DBG(DEBUG_VIEWPORT,
+               "\ninitial %f,%f %fx%f\n",
+               fx,
+               fy,
+               fw,
+               fh);
+
+      prescale.scale[0] = 1.0;
+      prescale.scale[1] = 1.0;
+      prescale.scale[2] = 1.0;
+      prescale.scale[3] = 1.0;
+      prescale.translate[0] = 0;
+      prescale.translate[1] = 0;
+      prescale.translate[2] = 0;
+      prescale.translate[3] = 0;
+      prescale.enabled = TRUE;
+
+      /* Fold negative extents and any part lying outside the framebuffer
+       * into the prescale, keeping the rectangle inside the framebuffer. */
+      if (fw < 0) {
+         prescale.scale[0] *= -1.0;
+         prescale.translate[0] += -fw;
+         fw = -fw;
+         fx =        viewport->scale[0] * 1.0 + viewport->translate[0];
+      }
+
+      if (fh < 0) {
+         prescale.scale[1] *= -1.0;
+         prescale.translate[1] += -fh;
+         fh = -fh;
+         fy = flip * viewport->scale[1] * 1.0 + viewport->translate[1];
+      }
+
+      if (fx < 0) {
+         prescale.translate[0] += fx;
+         prescale.scale[0] *= fw / (fw + fx); 
+         fw += fx;
+         fx = 0;
+      }
+
+      if (fy < 0) {
+         prescale.translate[1] += fy;
+         prescale.scale[1] *= fh / (fh + fy); 
+         fh += fy;
+         fy = 0;
+      }
+
+      if (fx + fw > fb_width) {
+         prescale.scale[0] *= fw / (fb_width - fx); 
+         prescale.translate[0] -= fx * (fw / (fb_width - fx));
+         prescale.translate[0] += fx;
+         fw = fb_width - fx;
+         
+      }
+
+      if (fy + fh > fb_height) {
+         prescale.scale[1] *= fh / (fb_height - fy);
+         prescale.translate[1] -= fy * (fh / (fb_height - fy));
+         prescale.translate[1] += fy;
+         fh = fb_height - fy;
+      }
+
+      if (fw < 0 || fh < 0) {
+         fw = fh = fx = fy = 0;
+         degenerate = TRUE;
+         goto out;
+      }
+
+
+      /* D3D viewport is integer space.  Convert fx,fy,etc. to
+       * integers.
+       *
+       * TODO: adjust pretranslate correct for any subpixel error
+       * introduced converting to integers.
+       */
+      rect.x = fx;
+      rect.y = fy;
+      rect.w = fw;
+      rect.h = fh;
+
+      SVGA_DBG(DEBUG_VIEWPORT,
+               "viewport error %f,%f %fx%f\n",
+               fabs((float)rect.x - fx),
+               fabs((float)rect.y - fy),
+               fabs((float)rect.w - fw),
+               fabs((float)rect.h - fh));
+
+      SVGA_DBG(DEBUG_VIEWPORT,
+               "viewport %d,%d %dx%d\n",
+               rect.x,
+               rect.y,
+               rect.w,
+               rect.h);
+      
+
+      /* Finally, to get GL rasterization rules, need to tweak the
+       * screen-space coordinates slightly relative to D3D which is
+       * what hardware implements natively.
+       */
+      if (svga->curr.rast->templ.gl_rasterization_rules) {
+         float adjust_x = 0.0;
+         float adjust_y = 0.0;
+
+         switch (svga->curr.reduced_prim) {
+         case PIPE_PRIM_LINES:
+            adjust_x = -0.5;
+            adjust_y = 0;
+            break;
+         case PIPE_PRIM_POINTS:
+         case PIPE_PRIM_TRIANGLES:
+            adjust_x = -0.375;
+            adjust_y = -0.5;
+            break;
+         }
+
+         prescale.translate[0] += adjust_x;
+         prescale.translate[1] += adjust_y;
+         prescale.translate[2] = 0.5; /* D3D clip space */
+         prescale.scale[2]     = 0.5; /* D3D clip space */
+      }
+
+
+      range_min = viewport->scale[2] * -1.0 + viewport->translate[2];
+      range_max = viewport->scale[2] *  1.0 + viewport->translate[2];
+
+      /* D3D (and by implication SVGA) doesn't like dealing with zmax
+       * less than zmin.  Detect that case, flip the depth range and
+       * invert our z-scale factor to achieve the same effect.
+       */
+      if (range_min > range_max) {
+         float range_tmp;
+         range_tmp = range_min; 
+         range_min = range_max; 
+         range_max = range_tmp;
+         prescale.scale[2]     = -prescale.scale[2];
+      }
+   }
+
+   if (prescale.enabled) {
+      float H[2];
+      float J[2];
+      int i;
+
+      SVGA_DBG(DEBUG_VIEWPORT,
+               "prescale %f,%f %fx%f\n",
+               prescale.translate[0],
+               prescale.translate[1],
+               prescale.scale[0],
+               prescale.scale[1]);
+
+      H[0] = (float)rect.w / 2.0;
+      H[1] = -(float)rect.h / 2.0;
+      J[0] = (float)rect.x + (float)rect.w / 2.0;
+      J[1] = (float)rect.y + (float)rect.h / 2.0;
+
+      SVGA_DBG(DEBUG_VIEWPORT,
+               "H %f,%f\n"
+               "J %fx%f\n",
+               H[0],
+               H[1],
+               J[0],
+               J[1]);
+
+      /* Adjust prescale to take into account the fact that it is
+       * going to be applied prior to the perspective divide and
+       * viewport transformation.
+       * 
+       * Vwin = H(Vc/Vc.w) + J
+       *
+       * We want to tweak Vwin with scale and translation from above,
+       * as in:
+       *
+       * Vwin' = S Vwin + T
+       *
+       * But we can only modify the values at Vc.  Plugging all the
+       * above together, and rearranging, eventually we get:
+       *
+       *   Vwin' = H(Vc'/Vc'.w) + J
+       * where:
+       *   Vc' = SVc + KVc.w
+       *   K = (T + (S-1)J) / H
+       *
+       * Overwrite prescale.translate with values for K:
+       */
+      for (i = 0; i < 2; i++) {
+         prescale.translate[i] = ((prescale.translate[i] +
+                                   (prescale.scale[i] - 1.0) * J[i]) / H[i]);
+      }
+
+      SVGA_DBG(DEBUG_VIEWPORT,
+               "clipspace %f,%f %fx%f\n",
+               prescale.translate[0],
+               prescale.translate[1],
+               prescale.scale[0],
+               prescale.scale[1]);
+   }
+
+out:
+   if (degenerate) {   /* fall back to a 1x1 rect with prescale disabled */
+      rect.x = 0;
+      rect.y = 0;
+      rect.w = 1;
+      rect.h = 1;
+      prescale.enabled = FALSE;
+   }
+
+   if (memcmp(&rect, &svga->state.hw_clear.viewport, sizeof(rect)) != 0) {
+      ret = SVGA3D_SetViewport(svga->swc, &rect);
+      if(ret != PIPE_OK)
+         return ret;
+
+      memcpy(&svga->state.hw_clear.viewport, &rect, sizeof(rect));
+      assert(sizeof(rect) == sizeof(svga->state.hw_clear.viewport)); /* NOTE(review): checked only after the memcpy it guards */
+   }
+
+   if (svga->state.hw_clear.depthrange.zmin != range_min ||
+       svga->state.hw_clear.depthrange.zmax != range_max) 
+   {
+      ret = SVGA3D_SetZRange(svga->swc, range_min, range_max );
+      if(ret != PIPE_OK)
+         return ret;
+
+      svga->state.hw_clear.depthrange.zmin = range_min;
+      svga->state.hw_clear.depthrange.zmax = range_max;
+   }
+
+   if (memcmp(&prescale, &svga->state.hw_clear.prescale, sizeof prescale) != 0) {
+      svga->dirty |= SVGA_NEW_PRESCALE;   /* presumably consumed by state tracking this flag */
+      svga->state.hw_clear.prescale = prescale;
+   }
+
+   return 0;
+}
+
+
+struct svga_tracked_state svga_hw_viewport = 
+{
+   "hw viewport state",             /* name */
+   ( SVGA_NEW_FRAME_BUFFER |        /* dirty: emit_viewport reads framebuffer size, */
+     SVGA_NEW_VIEWPORT |            /* the viewport, rasterizer flags and */
+     SVGA_NEW_RAST |                /* the reduced primitive type */
+     SVGA_NEW_REDUCED_PRIMITIVE ),
+   emit_viewport                    /* update callback */
+};
+
+
+/***********************************************************************
+ * Scissor state
+ */
+static int emit_scissor_rect( struct svga_context *svga,
+                              unsigned dirty )
+{
+   const struct pipe_scissor_state *sc = &svga->curr.scissor;
+   SVGA3dRect box;
+
+   /* Origin is the min corner; extent is max - min (no +1 applied). */
+   box.x = sc->minx;
+   box.w = sc->maxx - sc->minx;
+   box.y = sc->miny;
+   box.h = sc->maxy - sc->miny;
+   return SVGA3D_SetScissorRect( svga->swc, &box );
+}
+
+
+struct svga_tracked_state svga_hw_scissor = 
+{
+   "hw scissor state",   /* name */
+   SVGA_NEW_SCISSOR,     /* dirty: SVGA_NEW_* mask for this entry */
+   emit_scissor_rect     /* update callback */
+};
+
+
+/***********************************************************************
+ * Userclip state
+ */
+
+static int emit_clip_planes( struct svga_context *svga,
+                             unsigned dirty )
+{
+   enum pipe_error status = PIPE_OK;
+   unsigned plane = 0;
+
+   /* TODO: just emit directly from svga_set_clip_state()?
+    */
+
+   /* Send each of the clip.nr user clip planes; the first failure
+    * stops the walk and its error code becomes the return value.
+    */
+   while (status == PIPE_OK && plane < svga->curr.clip.nr) {
+      status = SVGA3D_SetClipPlane( svga->swc, plane, svga->curr.clip.ucp[plane] );
+      plane++;
+   }
+   return status;
+}
+
+
+struct svga_tracked_state svga_hw_clip_planes = 
+{
+   "hw clip planes",     /* name: label specific to the user clip plane state */
+   SVGA_NEW_CLIP,        /* dirty: SVGA_NEW_* mask for this entry */
+   emit_clip_planes      /* update callback */
+};
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
new file mode 100644
index 00000000000..6ec38ed3e45
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -0,0 +1,282 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+#include "svga_tgsi.h"
+
+#include "svga_hw_reg.h"
+
+
+
+static INLINE int compare_fs_keys( const struct svga_fs_compile_key *a,
+                                   const struct svga_fs_compile_key *b )
+{
+   /* Compare only the number of bytes svga_fs_key_size() reports for 'a'. */
+   return memcmp( a, b, svga_fs_key_size( a ) );
+}
+
+
+static struct svga_shader_result *search_fs_key( struct svga_fragment_shader *fs,
+                                                 const struct svga_fs_compile_key *key )
+{
+   struct svga_shader_result *candidate;
+
+   assert(key);
+
+   /* Walk the list of compiled variants until one with a matching
+    * key is found; falling off the end yields NULL. */
+   candidate = fs->base.results;
+   while (candidate && compare_fs_keys( key, &candidate->key.fkey ) != 0)
+      candidate = candidate->next;
+   return candidate;
+}
+
+
+static enum pipe_error compile_fs( struct svga_context *svga,
+                                   struct svga_fragment_shader *fs,
+                                   const struct svga_fs_compile_key *key,
+                                   struct svga_shader_result **out_result )
+{
+   struct svga_shader_result *variant;
+   enum pipe_error ret;
+
+   /* Translate the shader for this key; a NULL result is reported
+    * as out-of-memory. */
+   variant = svga_translate_fragment_program( fs, key );
+   if (variant == NULL)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   /* Define it under the next free fragment shader id; the counter
+    * only advances once the definition has succeeded. */
+   ret = SVGA3D_DefineShader(svga->swc,
+                             svga->state.next_fs_id,
+                             SVGA3D_SHADERTYPE_PS,
+                             variant->tokens,
+                             variant->nr_tokens * sizeof variant->tokens[0]);
+   if (ret) {
+      svga_destroy_shader_result( variant );
+      return ret;
+   }
+
+   /* Success: hand the variant back and push it on the front of
+    * the shader's variant list. */
+   variant->id = svga->state.next_fs_id++;
+   variant->next = fs->base.results;
+   fs->base.results = variant;
+   *out_result = variant;
+   return PIPE_OK;
+}
+
+/* The blend workaround for simulating logicop xor behaviour requires
+ * that the incoming fragment color be white.  This change achieves
+ * that by hooking up a hard-wired fragment shader that just emits
+ * color 1,1,1,1
+ *   
+ * This is a slightly incomplete solution as it assumes that the
+ * actual bound shader has no other effects beyond generating a
+ * fragment color.  In particular shaders containing TEXKIL and/or
+ * depth-write will not have the correct behaviour, nor will those
+ * expecting to use alphatest.
+ *   
+ * These are avoidable issues, but they are not much worse than the
+ * unavoidable ones associated with this technique, so it's not clear
+ * how much effort should be expended trying to resolve them - the
+ * ultimate result will still not be correct in most cases.
+ *
+ * Shader below was generated with:
+ *   SVGA_DEBUG=tgsi ./mesa/progs/fp/fp-tri white.txt
+ */
+static int emit_white_fs( struct svga_context *svga )
+{
+   /* ps_3_0
+    * def c0, 1.000000, 0.000000, 0.000000, 1.000000
+    * mov oC0, c0.x
+    * end
+    */
+   static const unsigned white_tokens[] = {
+      0xffff0300,
+      0x05000051,
+      0xa00f0000,
+      0x3f800000,
+      0x00000000,
+      0x00000000,
+      0x3f800000,
+      0x02000001,
+      0x800f0800,
+      0xa0000000,
+      0x0000ffff,
+   };
+   int status;
+
+   /* Define the fixed shader under the next free fragment shader id. */
+   status = SVGA3D_DefineShader( svga->swc,
+                                 svga->state.next_fs_id,
+                                 SVGA3D_SHADERTYPE_PS,
+                                 white_tokens,
+                                 sizeof white_tokens );
+   if (status)
+      return status;
+
+   svga->state.white_fs_id = svga->state.next_fs_id++;
+   return 0;
+}
+
+
+/* SVGA_NEW_TEXTURE_BINDING
+ * SVGA_NEW_RAST
+ * SVGA_NEW_NEED_SWTNL
+ * SVGA_NEW_SAMPLER
+ */
+static int make_fs_key( const struct svga_context *svga,
+                        struct svga_fs_compile_key *key )
+{
+   int unit;
+   int next_wh_idx = 0;
+
+   memset(key, 0, sizeof *key);
+
+   /* Only need fragment shader fixup for twoside lighting if doing
+    * hwtnl.  Otherwise the draw module does the whole job for us.
+    *
+    * SVGA_NEW_NEED_SWTNL
+    */
+   if (!svga->state.sw.need_swtnl) {
+      /* SVGA_NEW_RAST */
+      key->front_cw = (svga->curr.rast->templ.front_winding ==
+                       PIPE_WINDING_CW);
+      key->light_twoside = svga->curr.rast->templ.light_twoside;
+   }
+
+   /* XXX: want to limit this to the textures that the shader actually
+    * refers to.
+    *
+    * SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
+    */
+   key->num_textures = svga->curr.num_textures;
+   for (unit = 0; unit < svga->curr.num_textures; unit++) {
+      if (svga->curr.texture[unit] == NULL)
+         continue;
+      assert(svga->curr.sampler[unit]);
+      key->tex[unit].texture_target = svga->curr.texture[unit]->target;
+      /* Units sampled with unnormalized coordinates each take the
+       * next width/height index. */
+      if (!svga->curr.sampler[unit]->normalized_coords) {
+         key->tex[unit].unnormalized = TRUE;
+         key->tex[unit].width_height_idx = next_wh_idx++;
+         ++key->num_unnormalized_coords;
+      }
+   }
+
+   /* Comparison mode and function are recorded for every bound sampler. */
+   for (unit = 0; unit < svga->curr.num_samplers; ++unit) {
+      if (svga->curr.sampler[unit]) {
+         key->tex[unit].compare_mode = svga->curr.sampler[unit]->compare_mode;
+         key->tex[unit].compare_func = svga->curr.sampler[unit]->compare_func;
+      }
+   }
+
+   return 0;
+}
+
+
+
+static int emit_hw_fs( struct svga_context *svga,
+                       unsigned dirty )
+{
+   struct svga_shader_result *result = NULL;
+   unsigned id = SVGA3D_INVALID_ID;
+   int ret = 0;
+
+   /* SVGA_NEW_BLEND */
+   if (!svga->curr.blend->need_white_fragments) {
+      struct svga_fs_compile_key key;
+
+      /* SVGA_NEW_TEXTURE_BINDING
+       * SVGA_NEW_RAST
+       * SVGA_NEW_NEED_SWTNL
+       * SVGA_NEW_SAMPLER
+       */
+      ret = make_fs_key( svga, &key );
+      if (ret)
+         return ret;
+
+      /* Reuse an existing variant for this key, compiling one only
+       * when none has been built yet. */
+      result = search_fs_key( svga->curr.fs, &key );
+      if (result == NULL) {
+         ret = compile_fs( svga, svga->curr.fs, &key, &result );
+         if (ret)
+            return ret;
+      }
+      assert (result);
+      id = result->id;
+   }
+   else {
+      /* The white shader is defined the first time it is needed;
+       * 'result' stays NULL on this path. */
+      if (svga->state.white_fs_id == SVGA3D_INVALID_ID) {
+         ret = emit_white_fs( svga );
+         if (ret)
+            return ret;
+      }
+      id = svga->state.white_fs_id;
+   }
+
+   assert(id != SVGA3D_INVALID_ID);
+
+   /* Already the shader last bound in hardware: nothing more to do. */
+   if (id == svga->state.hw_draw.shader_id[PIPE_SHADER_FRAGMENT])
+      return 0;
+
+   ret = SVGA3D_SetShader( svga->swc, SVGA3D_SHADERTYPE_PS, id );
+   if (ret)
+      return ret;
+
+   svga->state.hw_draw.fs = result;
+   svga->state.hw_draw.shader_id[PIPE_SHADER_FRAGMENT] = id;
+   svga->dirty |= SVGA_NEW_FS_RESULT;
+   return 0;
+}
+
+struct svga_tracked_state svga_hw_fs = 
+{
+   "fragment shader (hwtnl)",    /* name */
+   (SVGA_NEW_FS |                /* dirty: SVGA_NEW_* mask for this entry */
+    SVGA_NEW_TEXTURE_BINDING |   /* read by make_fs_key */
+    SVGA_NEW_NEED_SWTNL |        /* read by make_fs_key */
+    SVGA_NEW_RAST |              /* read by make_fs_key */
+    SVGA_NEW_SAMPLER |           /* read by make_fs_key */
+    SVGA_NEW_BLEND),             /* need_white_fragments in emit_hw_fs */
+   emit_hw_fs                    /* update callback */
+};
+
+
+
diff --git a/src/gallium/drivers/svga/svga_state_need_swtnl.c b/src/gallium/drivers/svga/svga_state_need_swtnl.c
new file mode 100644
index 00000000000..00201b8091d
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_need_swtnl.c
@@ -0,0 +1,200 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_state.h"
+
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_debug.h"
+#include "svga_hw_reg.h"
+
+/**
+ * Translate a gallium vertex format into an SVGA3D declaration type.
+ * Formats without a case below yield SVGA3D_DECLTYPE_MAX, which
+ * update_need_swvfetch() takes to mean software vertex fetch is needed.
+ */
+static INLINE SVGA3dDeclType
+svga_translate_vertex_format(enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_R32_FLOAT:            return SVGA3D_DECLTYPE_FLOAT1;
+   case PIPE_FORMAT_R32G32_FLOAT:         return SVGA3D_DECLTYPE_FLOAT2;
+   case PIPE_FORMAT_R32G32B32_FLOAT:      return SVGA3D_DECLTYPE_FLOAT3;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:   return SVGA3D_DECLTYPE_FLOAT4;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:       return SVGA3D_DECLTYPE_D3DCOLOR;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:     return SVGA3D_DECLTYPE_UBYTE4;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:       return SVGA3D_DECLTYPE_UBYTE4N;
+   case PIPE_FORMAT_R16G16_SSCALED:       return SVGA3D_DECLTYPE_SHORT2;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED: return SVGA3D_DECLTYPE_SHORT4;
+   case PIPE_FORMAT_R16G16_SNORM:         return SVGA3D_DECLTYPE_SHORT2N;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:   return SVGA3D_DECLTYPE_SHORT4N;
+   case PIPE_FORMAT_R16G16_UNORM:         return SVGA3D_DECLTYPE_USHORT2N;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:   return SVGA3D_DECLTYPE_USHORT4N;
+
+   /* These formats don't exist yet:
+    *
+   case PIPE_FORMAT_R10G10B10_USCALED:    return SVGA3D_DECLTYPE_UDEC3;
+   case PIPE_FORMAT_R10G10B10_SNORM:      return SVGA3D_DECLTYPE_DEC3N;
+   case PIPE_FORMAT_R16G16_FLOAT:         return SVGA3D_DECLTYPE_FLOAT16_2;
+   case PIPE_FORMAT_R16G16B16A16_FLOAT:   return SVGA3D_DECLTYPE_FLOAT16_4;
+   */
+
+   default:
+      /* Many formats have no hardware type; this is hit regularly. */
+      return SVGA3D_DECLTYPE_MAX;
+   }
+}
+
+/* Fill state.sw.ve_format[] up to and including the first format without
+ * a hardware type; flag SVGA_NEW_NEED_SWVFETCH if need_swvfetch changes. */
+static int update_need_swvfetch( struct svga_context *svga,
+                                 unsigned dirty )
+{
+   boolean unsupported = FALSE;
+   unsigned idx = 0;
+
+   while (!unsupported && idx < svga->curr.num_vertex_elements) {
+      svga->state.sw.ve_format[idx] =
+         svga_translate_vertex_format(svga->curr.ve[idx].src_format);
+      if (svga->state.sw.ve_format[idx] == SVGA3D_DECLTYPE_MAX)
+         unsupported = TRUE;
+      idx++;
+   }
+
+   if (svga->state.sw.need_swvfetch != unsupported) {
+      svga->state.sw.need_swvfetch = unsupported;
+      svga->dirty |= SVGA_NEW_NEED_SWVFETCH;
+   }
+   return 0;
+}
+
+/* Associates update_need_swvfetch with the SVGA_NEW_VELEMENT flag. */
+struct svga_tracked_state svga_update_need_swvfetch = {
+   "update need_swvfetch",
+   SVGA_NEW_VELEMENT,
+   update_need_swvfetch
+};
+
+
+/**
+ * Work out whether the software pipeline is needed for the current
+ * rasterizer, edgeflag and clip state, and flag SVGA_NEW_NEED_PIPELINE
+ * when the answer differs from the cached one.  Always returns 0.
+ */
+static int update_need_pipeline( struct svga_context *svga,
+                                 unsigned dirty )
+{
+   const struct svga_rasterizer_state *rast = svga->curr.rast;
+   boolean wanted = FALSE;
+
+   /* SVGA_NEW_RAST, SVGA_NEW_REDUCED_PRIMITIVE: need_pipeline is tested
+    * as a bitmask indexed by the reduced primitive.
+    */
+   if (rast->need_pipeline & (1 << svga->curr.reduced_prim)) {
+      SVGA_DBG(DEBUG_SWTNL, "%s: rast need_pipeline (%d) & prim (%x)\n",
+                 __FUNCTION__,
+                 rast->need_pipeline,
+                 (1 << svga->curr.reduced_prim) );
+      wanted = TRUE;
+   }
+
+   /* SVGA_NEW_EDGEFLAGS: edgeflags on triangles that are not filled. */
+   if (svga->curr.edgeflags != NULL &&
+       svga->curr.reduced_prim == PIPE_PRIM_TRIANGLES &&
+       rast->hw_unfilled != PIPE_POLYGON_MODE_FILL) {
+      SVGA_DBG(DEBUG_SWTNL, "%s: edgeflags\n", __FUNCTION__);
+      wanted = TRUE;
+   }
+
+   /* SVGA_NEW_CLIP: clip.nr is non-zero and clipping is not bypassed. */
+   if (svga->curr.clip.nr && !rast->templ.bypass_vs_clip_and_viewport) {
+      SVGA_DBG(DEBUG_SWTNL, "%s: userclip\n", __FUNCTION__);
+      wanted = TRUE;
+   }
+
+   if (svga->state.sw.need_pipeline != wanted) {
+      svga->state.sw.need_pipeline = wanted;
+      svga->dirty |= SVGA_NEW_NEED_PIPELINE;
+   }
+
+   return 0;
+}
+
+/* Associates update_need_pipeline with the flags in the mask below.
+ * NOTE(review): the function also reads svga->curr.edgeflags (marked
+ * SVGA_NEW_EDGEFLAGS there) but no such flag is in this mask - confirm. */
+struct svga_tracked_state svga_update_need_pipeline = {
+   "need pipeline",
+   (SVGA_NEW_REDUCED_PRIMITIVE | SVGA_NEW_CLIP |
+    SVGA_NEW_RAST),
+   update_need_pipeline
+};
+
+
+/**
+ * Derive need_swtnl from need_swvfetch and need_pipeline.  The no_swtnl
+ * debug setting clears both inputs in the cached state; force_swtnl
+ * overrides the result.  On a change, flags SVGA_NEW_NEED_SWTNL and sets
+ * swtnl.new_vdecl.  Always returns 0.
+ */
+static int update_need_swtnl( struct svga_context *svga,
+                              unsigned dirty )
+{
+   boolean use_swtnl;
+
+   if (svga->debug.no_swtnl) {
+      svga->state.sw.need_pipeline = 0;
+      svga->state.sw.need_swvfetch = 0;
+   }
+
+   if (svga->debug.force_swtnl)
+      use_swtnl = 1;
+   else
+      use_swtnl = (svga->state.sw.need_swvfetch ||
+                   svga->state.sw.need_pipeline);
+
+   if (use_swtnl == svga->state.sw.need_swtnl)
+      return 0;
+
+   SVGA_DBG(DEBUG_SWTNL|DEBUG_PERF,
+            "%s need_swvfetch: %s, need_pipeline %s\n",
+            __FUNCTION__,
+            svga->state.sw.need_swvfetch ? "true" : "false",
+            svga->state.sw.need_pipeline ? "true" : "false");
+   svga->state.sw.need_swtnl = use_swtnl;
+   svga->dirty |= SVGA_NEW_NEED_SWTNL;
+   svga->swtnl.new_vdecl = TRUE;
+   return 0;
+}
+
+
+/* Associates update_need_swtnl with the need_swvfetch/need_pipeline flags. */
+struct svga_tracked_state svga_update_need_swtnl = {
+   "need swtnl",
+   (SVGA_NEW_NEED_SWVFETCH |
+    SVGA_NEW_NEED_PIPELINE),
+   update_need_swtnl
+};
diff --git a/src/gallium/drivers/svga/svga_state_rss.c b/src/gallium/drivers/svga/svga_state_rss.c
new file mode 100644
index 00000000000..8b6803a285a
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_rss.c
@@ -0,0 +1,268 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+
+#include "svga_hw_reg.h"
+
+/* Staging buffer for render-state changes gathered by emit_rss before
+ * they are copied into the command buffer in one go. */
+struct rs_queue {
+   unsigned rs_count;                   /* entries of rs[] in use */
+   SVGA3dRenderState rs[SVGA3D_RS_MAX];
+};
+/* If 'value' differs from the cached SVGA3D_RS_<token>, queue it on the
+ * caller's local 'queue' and update the cache.  'fail' is unused. */
+#define EMIT_RS(svga, value, token, fail)                       \
+do {                                                            \
+   if (svga->state.hw_draw.rs[SVGA3D_RS_##token] != value) {    \
+      svga_queue_rs( &queue, SVGA3D_RS_##token, value );        \
+      svga->state.hw_draw.rs[SVGA3D_RS_##token] = value;        \
+   }                                                            \
+} while (0)
+/* As EMIT_RS, for a float that is passed through fui() first. */
+#define EMIT_RS_FLOAT(svga, fvalue, token, fail)                \
+do {                                                            \
+   unsigned value = fui(fvalue);                                \
+   if (svga->state.hw_draw.rs[SVGA3D_RS_##token] != value) {    \
+      svga_queue_rs( &queue, SVGA3D_RS_##token, value );        \
+      svga->state.hw_draw.rs[SVGA3D_RS_##token] = value;        \
+   }                                                            \
+} while (0)
+
+/* Append one (state, value) pair to the queue.  The queue holds at most
+ * SVGA3D_RS_MAX entries; asserted here the same way svga_queue_tss does. */
+static INLINE void
+svga_queue_rs( struct rs_queue *q, unsigned rss, unsigned value )
+{
+   assert(q->rs_count < sizeof(q->rs)/sizeof(q->rs[0]));
+   q->rs[q->rs_count].state = rss;
+   q->rs[q->rs_count].uintValue = value;
+   q->rs_count++;
+}
+
+
+/* Emit render states whose value differs from the cached hardware copy.
+ * Each EMIT_RS* both queues the change and updates the cache, so a failed
+ * command reservation jumps to 'fail', which poisons the whole cache.
+ * Returns 0 on success, PIPE_ERROR_OUT_OF_MEMORY on failure. */
+static int emit_rss( struct svga_context *svga,
+                     unsigned dirty )
+{
+   struct rs_queue queue;
+
+   queue.rs_count = 0;
+
+   if (dirty & SVGA_NEW_BLEND) {
+      const struct svga_blend_state *curr = svga->curr.blend;
+
+      EMIT_RS( svga, curr->rt[0].writemask, COLORWRITEENABLE, fail );
+      EMIT_RS( svga, curr->rt[0].blend_enable, BLENDENABLE, fail );
+
+      if (curr->rt[0].blend_enable) {
+         EMIT_RS( svga, curr->rt[0].srcblend, SRCBLEND, fail );
+         EMIT_RS( svga, curr->rt[0].dstblend, DSTBLEND, fail );
+         EMIT_RS( svga, curr->rt[0].blendeq, BLENDEQUATION, fail );
+
+         EMIT_RS( svga, curr->rt[0].separate_alpha_blend_enable, 
+                  SEPARATEALPHABLENDENABLE, fail );
+
+         if (curr->rt[0].separate_alpha_blend_enable) {
+            EMIT_RS( svga, curr->rt[0].srcblend_alpha, SRCBLENDALPHA, fail );
+            EMIT_RS( svga, curr->rt[0].dstblend_alpha, DSTBLENDALPHA, fail );
+            EMIT_RS( svga, curr->rt[0].blendeq_alpha, BLENDEQUATIONALPHA, fail );
+         }
+      }
+   }
+
+
+   if (dirty & (SVGA_NEW_DEPTH_STENCIL | SVGA_NEW_RAST)) {
+      const struct svga_depth_stencil_state *curr = svga->curr.depth; 
+      const struct svga_rasterizer_state *rast = svga->curr.rast; 
+
+      if (!curr->stencil[0].enabled) 
+      {
+         /* Stencil disabled
+          */
+         EMIT_RS( svga, FALSE, STENCILENABLE, fail );
+         EMIT_RS( svga, FALSE, STENCILENABLE2SIDED, fail );
+      }
+      else if (curr->stencil[0].enabled && !curr->stencil[1].enabled)
+      {
+         /* Regular stencil
+          */
+         EMIT_RS( svga, TRUE, STENCILENABLE, fail );
+         EMIT_RS( svga, FALSE, STENCILENABLE2SIDED, fail );
+
+         EMIT_RS( svga, curr->stencil[0].func,  STENCILFUNC, fail );
+         EMIT_RS( svga, curr->stencil[0].fail,  STENCILFAIL, fail );
+         EMIT_RS( svga, curr->stencil[0].zfail, STENCILZFAIL, fail );
+         EMIT_RS( svga, curr->stencil[0].pass,  STENCILPASS, fail );
+         
+         EMIT_RS( svga, curr->stencil_ref, STENCILREF, fail );
+         EMIT_RS( svga, curr->stencil_mask, STENCILMASK, fail );
+         EMIT_RS( svga, curr->stencil_writemask, STENCILWRITEMASK, fail );
+      }
+      else 
+      {
+         int cw, ccw;
+
+         /* Hardware frontwinding is always CW, so if ours is also CW,
+          * then our definition of front face agrees with hardware.
+          * Otherwise need to flip.
+          */
+         if (rast->templ.front_winding == PIPE_WINDING_CW) {
+            cw = 0;
+            ccw = 1;
+         }
+         else {
+            cw = 1;
+            ccw = 0;
+         }
+
+         /* Twoside stencil
+          */
+         EMIT_RS( svga, TRUE, STENCILENABLE, fail );
+         EMIT_RS( svga, TRUE, STENCILENABLE2SIDED, fail );
+
+         EMIT_RS( svga, curr->stencil[cw].func,  STENCILFUNC, fail );
+         EMIT_RS( svga, curr->stencil[cw].fail,  STENCILFAIL, fail );
+         EMIT_RS( svga, curr->stencil[cw].zfail, STENCILZFAIL, fail );
+         EMIT_RS( svga, curr->stencil[cw].pass,  STENCILPASS, fail );
+
+         EMIT_RS( svga, curr->stencil[ccw].func,  CCWSTENCILFUNC, fail );
+         EMIT_RS( svga, curr->stencil[ccw].fail,  CCWSTENCILFAIL, fail );
+         EMIT_RS( svga, curr->stencil[ccw].zfail, CCWSTENCILZFAIL, fail );
+         EMIT_RS( svga, curr->stencil[ccw].pass,  CCWSTENCILPASS, fail );
+
+         EMIT_RS( svga, curr->stencil_ref, STENCILREF, fail );
+         EMIT_RS( svga, curr->stencil_mask, STENCILMASK, fail );
+         EMIT_RS( svga, curr->stencil_writemask, STENCILWRITEMASK, fail );
+      }
+
+      EMIT_RS( svga, curr->zenable, ZENABLE, fail );
+      if (curr->zenable) {
+         EMIT_RS( svga, curr->zfunc, ZFUNC, fail );
+         EMIT_RS( svga, curr->zwriteenable, ZWRITEENABLE, fail );
+      }
+
+      EMIT_RS( svga, curr->alphatestenable, ALPHATESTENABLE, fail );
+      if (curr->alphatestenable) {
+         EMIT_RS( svga, curr->alphafunc, ALPHAFUNC, fail );
+         EMIT_RS_FLOAT( svga, curr->alpharef, ALPHAREF, fail );
+      }
+   }
+
+
+   if (dirty & SVGA_NEW_RAST)
+   {
+      const struct svga_rasterizer_state *curr = svga->curr.rast; 
+
+      /* Shademode: still need to rearrange index list to move
+       * flat-shading PV first vertex.
+       */
+      EMIT_RS( svga, curr->shademode, SHADEMODE, fail );
+      EMIT_RS( svga, curr->cullmode, CULLMODE, fail );
+      EMIT_RS( svga, curr->scissortestenable, SCISSORTESTENABLE, fail );
+      EMIT_RS( svga, curr->multisampleantialias, MULTISAMPLEANTIALIAS, fail );
+      EMIT_RS( svga, curr->lastpixel, LASTPIXEL, fail );
+      EMIT_RS( svga, curr->linepattern, LINEPATTERN, fail );
+      EMIT_RS_FLOAT( svga, curr->pointsize, POINTSIZE, fail );
+      EMIT_RS_FLOAT( svga, curr->pointsize_min, POINTSIZEMIN, fail );
+      EMIT_RS_FLOAT( svga, curr->pointsize_max, POINTSIZEMAX, fail );
+   }
+
+   if (dirty & (SVGA_NEW_RAST | SVGA_NEW_FRAME_BUFFER | SVGA_NEW_NEED_PIPELINE))
+   {
+      const struct svga_rasterizer_state *curr = svga->curr.rast; 
+      float slope = 0.0;
+      float bias  = 0.0;
+
+      /* Need to modify depth bias according to bound depthbuffer
+       * format.  Don't do hardware depthbias while the software
+       * pipeline is active.
+       */
+      if (!svga->state.sw.need_pipeline &&
+          svga->curr.framebuffer.zsbuf)
+      {
+         slope = curr->slopescaledepthbias;
+         bias  = svga->curr.depthscale * curr->depthbias;
+      }
+
+      EMIT_RS_FLOAT( svga, slope, SLOPESCALEDEPTHBIAS, fail );
+      EMIT_RS_FLOAT( svga, bias, DEPTHBIAS, fail );
+   }
+
+
+   if (queue.rs_count) {
+      SVGA3dRenderState *rs;
+
+      if (SVGA3D_BeginSetRenderState( svga->swc,
+                                      &rs,
+                                      queue.rs_count ) != PIPE_OK)
+         goto fail;
+
+      memcpy( rs,
+              queue.rs,
+              queue.rs_count * sizeof queue.rs[0]);
+      
+      SVGA_FIFOCommitAll( svga->swc );
+   }
+
+   /* NOTE(review): "Also blend color" - nothing is emitted for blend
+    * color here; presumably handled elsewhere or still to do. */
+
+   return 0;
+
+fail:
+   /* XXX: need to poison cached hardware state on failure to ensure
+    * dirty state gets re-emitted.  Fix this by re-instating partial
+    * FIFOCommit command and only updating cached hw state once the
+    * initial allocation has succeeded.
+    */
+   memset(svga->state.hw_draw.rs, 0xcd, sizeof(svga->state.hw_draw.rs));
+
+   return PIPE_ERROR_OUT_OF_MEMORY;
+}
+
+
+/* Associates emit_rss with the SVGA_NEW_* flags it tests in 'dirty'. */
+struct svga_tracked_state svga_hw_rss = {
+   "hw rss state",
+
+   (SVGA_NEW_NEED_PIPELINE |
+    SVGA_NEW_FRAME_BUFFER |
+    SVGA_NEW_RAST |
+    SVGA_NEW_DEPTH_STENCIL |
+    SVGA_NEW_BLEND),
+
+   emit_rss
+};
diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c
new file mode 100644
index 00000000000..b3137945202
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_tss.c
@@ -0,0 +1,279 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+
+#include "svga_screen_texture.h"
+#include "svga_winsys.h"
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+
+#include "svga_hw_reg.h"
+
+/* Drop the sampler-view and texture references held for every texture unit
+ * in use (current or previously emitted) and mark each hw view dirty. */
+void svga_cleanup_tss_binding(struct svga_context *svga)
+{
+   unsigned i;
+   unsigned count = MAX2( svga->curr.num_textures,
+                          svga->state.hw_draw.num_views );
+
+   for (i = 0; i < count; i++) {
+      struct svga_hw_view_state *view = &svga->state.hw_draw.views[i];
+
+      svga_sampler_view_reference(&view->v, NULL);
+      pipe_texture_reference( &svga->curr.texture[i], NULL );
+      pipe_texture_reference( &view->texture, NULL );
+      view->dirty = 1;
+   }
+}
+
+/**
+ * Bring the hardware texture bindings up to date with the current
+ * textures: refresh each unit's sampler view when its texture or LOD
+ * range changed, then emit SVGA3D_TS_BIND_TEXTURE for every dirty view.
+ * \return 0 on success; PIPE_ERROR_OUT_OF_MEMORY when the command could
+ *         not be reserved, in which case the views stay marked dirty
+ */
+static int
+update_tss_binding(struct svga_context *svga, 
+                   unsigned dirty )
+{
+   unsigned i;
+   unsigned count = MAX2( svga->curr.num_textures,
+                          svga->state.hw_draw.num_views );
+   unsigned min_lod;
+   unsigned max_lod;
+   struct {
+      struct {
+         unsigned unit;
+         struct svga_hw_view_state *view;
+      } bind[PIPE_MAX_SAMPLERS];
+
+      unsigned bind_count;
+   } queue;
+
+   queue.bind_count = 0;
+
+   for (i = 0; i < count; i++) {
+      const struct svga_sampler_state *s = svga->curr.sampler[i];
+      struct svga_hw_view_state *view = &svga->state.hw_draw.views[i];
+
+      /* Get min/max lod.  A unit can have a texture but no sampler
+       * (update_tss checks for NULL samplers as well); use level 0
+       * then rather than dereferencing the NULL sampler.
+       */
+      if (svga->curr.texture[i] && s) {
+         min_lod = MAX2(s->view_min_lod, 0);
+         max_lod = MIN2(s->view_max_lod, svga->curr.texture[i]->last_level);
+      } else {
+         min_lod = 0;
+         max_lod = 0;
+      }
+
+      if (view->texture != svga->curr.texture[i] ||
+          view->min_lod != min_lod ||
+          view->max_lod != max_lod) {
+         svga_sampler_view_reference(&view->v, NULL);
+         pipe_texture_reference( &view->texture, svga->curr.texture[i] );
+         view->dirty = TRUE;
+         view->min_lod = min_lod;
+         view->max_lod = max_lod;
+
+         if (svga->curr.texture[i])
+            view->v = svga_get_tex_sampler_view(&svga->pipe,
+                                                svga->curr.texture[i],
+                                                min_lod, max_lod);
+      }
+
+      if (view->dirty) {
+         queue.bind[queue.bind_count].unit = i;
+         queue.bind[queue.bind_count].view = view;
+         queue.bind_count++;
+      }
+      else if (view->v) {
+         svga_validate_sampler_view(svga, view->v);
+      }
+   }
+
+   svga->state.hw_draw.num_views = svga->curr.num_textures;
+
+   if (queue.bind_count) {
+      SVGA3dTextureState *ts;
+
+      if (SVGA3D_BeginSetTextureState( svga->swc, &ts,
+                                       queue.bind_count ) != PIPE_OK)
+         goto fail;
+
+      for (i = 0; i < queue.bind_count; i++) {
+         ts[i].stage = queue.bind[i].unit;
+         ts[i].name = SVGA3D_TS_BIND_TEXTURE;
+         if (queue.bind[i].view->v) {
+            svga->swc->surface_relocation(svga->swc,
+                                          &ts[i].value,
+                                          queue.bind[i].view->v->handle,
+                                          PIPE_BUFFER_USAGE_GPU_READ);
+         }
+         else {
+            ts[i].value = SVGA3D_INVALID_ID;
+         }
+         queue.bind[i].view->dirty = FALSE;
+      }
+
+      SVGA_FIFOCommitAll( svga->swc );
+   }
+
+   return 0;
+
+fail:
+   return PIPE_ERROR_OUT_OF_MEMORY;
+}
+
+/* Associates update_tss_binding with the SVGA_NEW_* flags in its mask. */
+struct svga_tracked_state svga_hw_tss_binding = {
+   "texture binding emit",
+   SVGA_NEW_SAMPLER |
+   SVGA_NEW_TEXTURE_BINDING,
+   update_tss_binding
+};
+
+
+/* Staging buffer for texture-state changes gathered by update_tss; sized
+ * for SVGA3D_TS_MAX states on each of PIPE_MAX_SAMPLERS units. */
+struct ts_queue {
+   unsigned ts_count;       /* entries of ts[] in use */
+   SVGA3dTextureState ts[PIPE_MAX_SAMPLERS*SVGA3D_TS_MAX];
+};
+
+/* If 'val' differs from the cached SVGA3D_TS_<token> of 'unit', queue it on
+ * the caller's local 'queue' and update the cache.  'fail' is unused. */
+#define EMIT_TS(svga, unit, val, token, fail)                           \
+do {                                                                    \
+   if (svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] != val) {        \
+      svga_queue_tss( &queue, unit, SVGA3D_TS_##token, val );           \
+      svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] = val;            \
+   }                                                                    \
+} while (0)
+/* As EMIT_TS, for a float that is passed through fui() first. */
+#define EMIT_TS_FLOAT(svga, unit, fvalue, token, fail)                  \
+do {                                                                    \
+   unsigned val = fui(fvalue);                                          \
+   if (svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] != val) {        \
+      svga_queue_tss( &queue, unit, SVGA3D_TS_##token, val );           \
+      svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] = val;            \
+   }                                                                    \
+} while (0)
+
+/* Append a (unit, state, value) triple to the texture-state queue. */
+static INLINE void
+svga_queue_tss( struct ts_queue *q, unsigned unit,
+                unsigned tss, unsigned value )
+{
+   SVGA3dTextureState *slot;
+   assert(q->ts_count < sizeof(q->ts)/sizeof(q->ts[0]));
+   slot = &q->ts[q->ts_count];
+   slot->stage = unit;
+   slot->name = tss;
+   slot->value = value;
+   q->ts_count++;
+}
+
+/* Queue the changed texture states of every non-NULL sampler, then emit. */
+static int
+update_tss(struct svga_context *svga, 
+           unsigned dirty )
+{
+   unsigned i;
+   struct ts_queue queue;
+
+   queue.ts_count = 0;
+   for (i = 0; i < svga->curr.num_samplers; i++) {
+      if (svga->curr.sampler[i]) {
+         const struct svga_sampler_state *curr = svga->curr.sampler[i];
+
+         EMIT_TS(svga, i, curr->mipfilter, MIPFILTER, fail);
+         EMIT_TS(svga, i, curr->min_lod, TEXTURE_MIPMAP_LEVEL, fail);
+         EMIT_TS(svga, i, curr->magfilter, MAGFILTER, fail);
+         EMIT_TS(svga, i, curr->minfilter, MINFILTER, fail);
+         EMIT_TS(svga, i, curr->aniso_level, TEXTURE_ANISOTROPIC_LEVEL, fail);
+         EMIT_TS_FLOAT(svga, i, curr->lod_bias, TEXTURE_LOD_BIAS, fail);
+         EMIT_TS(svga, i, curr->addressu, ADDRESSU, fail);
+         EMIT_TS(svga, i, curr->addressw, ADDRESSW, fail);
+         EMIT_TS(svga, i, curr->bordercolor, BORDERCOLOR, fail);
+         // TEXCOORDINDEX -- hopefully not needed
+
+         if (svga->curr.tex_flags.flag_1d & (1 << i)) {
+            debug_printf("wrap 1d tex %d\n", i);
+            EMIT_TS(svga, i, SVGA3D_TEX_ADDRESS_WRAP, ADDRESSV, fail);
+         }
+         else
+            EMIT_TS(svga, i, curr->addressv, ADDRESSV, fail);
+
+         if (svga->curr.tex_flags.flag_srgb & (1 << i))
+            EMIT_TS_FLOAT(svga, i, 2.2f, GAMMA, fail);
+         else
+            EMIT_TS_FLOAT(svga, i, 1.0f, GAMMA, fail);
+
+      }
+   }
+ 
+   if (queue.ts_count) {
+      SVGA3dTextureState *ts;
+
+      if (SVGA3D_BeginSetTextureState( svga->swc,
+                                       &ts,
+                                       queue.ts_count ) != PIPE_OK)
+         goto fail;
+
+      memcpy( ts,
+              queue.ts,
+              queue.ts_count * sizeof queue.ts[0]);
+      
+      SVGA_FIFOCommitAll( svga->swc );
+   }
+
+   return 0;
+
+fail:
+   /* XXX: need to poison cached hardware state on failure to ensure
+    * dirty state gets re-emitted.  Fix this by re-instating partial
+    * FIFOCommit command and only updating cached hw state once the
+    * initial allocation has succeeded.
+    */
+   memset(svga->state.hw_draw.ts, 0xcd, sizeof(svga->state.hw_draw.ts));
+
+   return PIPE_ERROR_OUT_OF_MEMORY;
+}
+
+
+/* Associates update_tss with the sampler and texture-flags dirty bits. */
+struct svga_tracked_state svga_hw_tss = {
+   "texture state emit",
+   (SVGA_NEW_TEXTURE_FLAGS | SVGA_NEW_SAMPLER),
+   update_tss
+};
+
diff --git a/src/gallium/drivers/svga/svga_state_vdecl.c b/src/gallium/drivers/svga/svga_state_vdecl.c
new file mode 100644
index 00000000000..c534308f503
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_vdecl.c
@@ -0,0 +1,182 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "util/u_upload_mgr.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_draw.h"
+#include "svga_tgsi.h"
+#include "svga_screen.h"
+#include "svga_screen_buffer.h"
+
+#include "svga_hw_reg.h"
+
+/* Upload every user vertex buffer through svga->upload_vb and point the
+ * vertex buffer slot at the uploaded copy.  Returns PIPE_OK, or the error
+ * returned by u_upload_buffer.
+ */
+static int
+upload_user_buffers( struct svga_context *svga )
+{
+   enum pipe_error ret = PIPE_OK;
+   int i;
+   int nr;
+
+   if (0) 
+      debug_printf("%s: %d\n", __FUNCTION__, svga->curr.num_vertex_buffers);
+
+   nr = svga->curr.num_vertex_buffers;
+
+   for (i = 0; i < nr; i++) 
+   {
+      if (svga_buffer_is_user_buffer(svga->curr.vb[i].buffer))
+      {
+         struct pipe_buffer *upload_buffer = NULL;
+         unsigned size = svga->curr.vb[i].buffer->size;
+         unsigned upload_offset;
+
+         /* The whole user buffer is uploaded, from byte 0. */
+         ret = u_upload_buffer( svga->upload_vb, 0, size,
+                                svga->curr.vb[i].buffer,
+                                &upload_offset, &upload_buffer );
+         if (ret)
+            return ret;
+
+         if (0)
+            debug_printf("%s: %d: orig buf %p upl buf %p ofs %d sz %d\n", 
+                         __FUNCTION__, i, svga->curr.vb[i].buffer,
+                         upload_buffer, upload_offset, size);
+
+         /* Make sure we release the old buffer and end up with the
+          * correct refcount on the uploaded buffer.  The copy starts at
+          * byte 0 of the user buffer, so the slot's own buffer_offset
+          * still applies on top of the copy's position.
+          */
+         pipe_buffer_reference( &svga->curr.vb[i].buffer, NULL );
+         svga->curr.vb[i].buffer = upload_buffer;
+         svga->curr.vb[i].buffer_offset += upload_offset;
+      }
+   }
+
+   if (0)
+      debug_printf("%s: DONE\n", __FUNCTION__);
+
+   return ret;
+}
+
+
+/**
+ * Rebuild the hwtnl vertex declarations: one SVGA3dVertexDecl per vertex
+ * element, using the declaration types cached in state.sw.ve_format[].
+ *
+ * \param dirty  not examined
+ * \return always 0
+ */
+static int emit_hw_vs_vdecl( struct svga_context *svga,
+                             unsigned dirty )
+{
+   SVGA3dVertexDecl decl;
+   unsigned elem;
+
+   assert(svga->curr.num_vertex_elements >=
+          svga->curr.vs->base.info.file_count[TGSI_FILE_INPUT]);
+
+   svga_hwtnl_reset_vdecl( svga->hwtnl,
+                           svga->curr.num_vertex_elements );
+
+   for (elem = 0; elem < svga->curr.num_vertex_elements; elem++) {
+      const struct pipe_vertex_element *ve = &svga->curr.ve[elem];
+      const struct pipe_vertex_buffer *vb;
+      unsigned usage, index;
+
+      vb = &svga->curr.vb[ve->vertex_buffer_index];
+      svga_generate_vdecl_semantics( elem, &usage, &index );
+
+      /* Where the element lives inside its vertex buffer. */
+      decl.array.offset = (vb->buffer_offset + ve->src_offset);
+      decl.array.stride = vb->stride;
+
+      /* SVGA_NEW_VELEMENT */
+      decl.identity.type = svga->state.sw.ve_format[elem];
+      decl.identity.method = SVGA3D_DECLMETHOD_DEFAULT;
+      decl.identity.usage = usage;
+      decl.identity.usageIndex = index;
+
+      svga_hwtnl_vdecl( svga->hwtnl, elem, &decl, vb->buffer );
+   }
+
+   return 0;
+}
+
+
+/**
+ * Emit vertex declarations for hardware TnL.  Does nothing while swtnl is
+ * active.  User vertex buffers are uploaded first when
+ * SVGA_COMBINE_USERBUFFERS is enabled.
+ */
+static int emit_hw_vdecl( struct svga_context *svga,
+                          unsigned dirty )
+{
+   /* SVGA_NEW_NEED_SWTNL: do not emit during swtnl */
+   if (svga->state.sw.need_swtnl)
+      return 0;
+
+   /* We are about to draw, so this is the point to upload user buffers;
+    * userbuffers from several draw calls can then share one host buffer.
+    */
+   if (svga->curr.any_user_vertex_buffers && SVGA_COMBINE_USERBUFFERS) {
+      int err = upload_user_buffers( svga );
+
+      if (err)
+         return err;
+
+      svga->curr.any_user_vertex_buffers = FALSE;
+   }
+
+   return emit_hw_vs_vdecl( svga, dirty );
+}
+
+
+/* Associates emit_hw_vdecl with the SVGA_NEW_* flags in the mask below. */
+struct svga_tracked_state svga_hw_vdecl = {
+   "hw vertex decl state (hwtnl version)",
+   ( SVGA_NEW_VS |
+     SVGA_NEW_FS |
+     SVGA_NEW_RAST |
+     SVGA_NEW_VBUFFER |
+     SVGA_NEW_VELEMENT |
+     SVGA_NEW_NEED_SWTNL ),
+   emit_hw_vdecl
+};
+
+
+
+
+
+
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
new file mode 100644
index 00000000000..a947745732c
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -0,0 +1,239 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_inlines.h"
+#include "pipe/p_defines.h"
+#include "util/u_math.h"
+#include "translate/translate.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_cmd.h"
+#include "svga_tgsi.h"
+
+#include "svga_hw_reg.h"
+
+/***********************************************************************
+ */
+
+/* memcmp() the first svga_vs_key_size(a) bytes of both keys; 0 means equal. */
+static INLINE int compare_vs_keys( const struct svga_vs_compile_key *a,
+                                   const struct svga_vs_compile_key *b )
+{
+   unsigned keysize = svga_vs_key_size( a );
+   return memcmp( a, b, keysize );
+}
+
+/* Walk vs->base.results and return the variant whose key equals 'key', or NULL. */
+static struct svga_shader_result *search_vs_key( struct svga_vertex_shader *vs,
+                                                 const struct svga_vs_compile_key *key )
+{
+   struct svga_shader_result *result = vs->base.results;
+
+   assert(key);
+
+   for ( ; result; result = result->next) {
+      if (compare_vs_keys( key, &result->key.vkey ) == 0)
+         return result;
+   }
+   
+   return NULL;
+}
+
+/* Translate and define a new VS variant for 'key'; on success link it into vs. */
+static enum pipe_error compile_vs( struct svga_context *svga,
+                                   struct svga_vertex_shader *vs,
+                                   const struct svga_vs_compile_key *key,
+                                   struct svga_shader_result **out_result )
+{
+   struct svga_shader_result *result;
+   enum pipe_error ret = PIPE_OK;
+
+   result = svga_translate_vertex_program( vs, key );
+   if (result == NULL) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
+      goto fail;
+   }
+
+   ret = SVGA3D_DefineShader(svga->swc, 
+                             svga->state.next_vs_id,
+                             SVGA3D_SHADERTYPE_VS,
+                             result->tokens, 
+                             result->nr_tokens * sizeof result->tokens[0]);
+   if (ret)
+      goto fail;
+
+   *out_result = result;
+   result->id = svga->state.next_vs_id++;   /* id is consumed only on success */
+   result->next = vs->base.results;         /* push onto the head of the list */
+   vs->base.results = result;
+   return PIPE_OK;
+
+fail:
+   if (result)
+      svga_destroy_shader_result( result );
+   return ret;
+}
+
+/* Build the VS compile key from current context state; always returns 0.
+ * Depends on: SVGA_NEW_PRESCALE, SVGA_NEW_RAST, SVGA_NEW_ZERO_STRIDE */
+static int make_vs_key( struct svga_context *svga,
+                        struct svga_vs_compile_key *key )
+{
+   memset(key, 0, sizeof *key);
+   key->need_prescale = svga->state.hw_clear.prescale.enabled;
+   key->allow_psiz = svga->curr.rast->templ.point_size_per_vertex;
+   key->zero_stride_vertex_elements =
+      svga->curr.zero_stride_vertex_elements;
+   key->num_zero_stride_vertex_elements =
+      svga->curr.num_zero_stride_vertex_elements;
+   return 0;
+}
+
+
+/* Bind the VS variant matching the current key (SVGA3D_INVALID_ID under swtnl). */
+static int emit_hw_vs( struct svga_context *svga,
+                       unsigned dirty )
+{
+   struct svga_shader_result *result = NULL;
+   unsigned id = SVGA3D_INVALID_ID;
+   int ret = 0;
+
+   /* SVGA_NEW_NEED_SWTNL */
+   if (!svga->state.sw.need_swtnl) {
+      struct svga_vertex_shader *vs = svga->curr.vs;
+      struct svga_vs_compile_key key;
+
+      ret = make_vs_key( svga, &key );
+      if (ret)
+         return ret;
+
+      result = search_vs_key( vs, &key );   /* reuse a cached variant if any */
+      if (!result) {
+         ret = compile_vs( svga, vs, &key, &result );
+         if (ret)
+            return ret;
+      }
+
+      assert (result);
+      id = result->id;
+   }
+
+   if (id != svga->state.hw_draw.shader_id[PIPE_SHADER_VERTEX]) {   /* skip redundant binds */
+      ret = SVGA3D_SetShader(svga->swc, 
+                             SVGA3D_SHADERTYPE_VS, 
+                             id );
+      if (ret)
+         return ret;
+
+      svga->dirty |= SVGA_NEW_VS_RESULT;
+      svga->state.hw_draw.shader_id[PIPE_SHADER_VERTEX] = id;
+      svga->state.hw_draw.vs = result;      
+   }
+
+   return 0;
+}
+
+struct svga_tracked_state svga_hw_vs = 
+{
+   "vertex shader (hwtnl)",
+   (SVGA_NEW_VS |
+    SVGA_NEW_PRESCALE |
+    SVGA_NEW_RAST |          /* make_vs_key() reads curr.rast for allow_psiz */
+    SVGA_NEW_NEED_SWTNL | SVGA_NEW_ZERO_STRIDE),
+   emit_hw_vs
+};
+
+
+/***********************************************************************
+ * Record zero-stride vertex elements and convert each to a float[4] constant. */
+static int update_zero_stride( struct svga_context *svga,
+                               unsigned dirty )
+{
+   unsigned i;
+
+   svga->curr.zero_stride_vertex_elements = 0;
+   svga->curr.num_zero_stride_vertex_elements = 0;
+
+   for (i = 0; i < svga->curr.num_vertex_elements; i++) {
+      const struct pipe_vertex_element *vel = &svga->curr.ve[i];
+      const struct pipe_vertex_buffer *vbuffer = &svga->curr.vb[
+         vel->vertex_buffer_index];
+      if (vbuffer->stride == 0) {
+         unsigned const_idx =
+            svga->curr.num_zero_stride_vertex_elements;
+         struct translate *translate;
+         struct translate_key key;
+         void *mapped_buffer;
+
+         svga->curr.zero_stride_vertex_elements |= (1u << i); /* no signed-shift UB */
+         ++svga->curr.num_zero_stride_vertex_elements;
+
+         key.output_stride = 4 * sizeof(float);
+         key.nr_elements = 1;
+         key.element[0].input_format = vel->src_format;
+         key.element[0].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+         key.element[0].input_buffer = vel->vertex_buffer_index;
+         key.element[0].input_offset = vel->src_offset;
+         key.element[0].output_offset = const_idx * 4 * sizeof(float);
+
+         translate_key_sanitize(&key);
+         /* translate_generic_create is technically private but
+          * we don't want to code-generate, just want generic
+          * translation */
+         translate = translate_generic_create(&key);
+
+         assert(vel->src_offset == 0);
+         
+         mapped_buffer = pipe_buffer_map_range(svga->pipe.screen, 
+                                               vbuffer->buffer,
+                                               vel->src_offset,
+                                               pf_get_size(vel->src_format),
+                                               PIPE_BUFFER_USAGE_CPU_READ);
+         translate->set_buffer(translate, vel->vertex_buffer_index,
+                               mapped_buffer,
+                               vbuffer->stride);
+         translate->run(translate, 0, 1,
+                        svga->curr.zero_stride_constants);
+
+         pipe_buffer_unmap(svga->pipe.screen,
+                           vbuffer->buffer);
+         translate->release(translate);
+      }
+   }
+   /* NOTE(review): the flag is not raised when the count drops to zero -- confirm. */
+   if (svga->curr.num_zero_stride_vertex_elements)
+      svga->dirty |= SVGA_NEW_ZERO_STRIDE;
+
+   return 0;
+}
+
+struct svga_tracked_state svga_hw_update_zero_stride =
+{
+   "update zero_stride",
+   ( SVGA_NEW_VELEMENT |
+     SVGA_NEW_VBUFFER ),
+   update_zero_stride
+};
diff --git a/src/gallium/drivers/svga/svga_swtnl.h b/src/gallium/drivers/svga/svga_swtnl.h
new file mode 100644
index 00000000000..4882f26b170
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl.h
@@ -0,0 +1,52 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SWTNL_H
+#define SVGA_SWTNL_H
+
+#include "pipe/p_compiler.h"
+
+struct svga_context;
+struct pipe_context;
+struct pipe_buffer;
+struct vbuf_render;
+
+
+boolean svga_init_swtnl( struct svga_context *svga );
+void svga_destroy_swtnl( struct svga_context *svga );
+
+
+enum pipe_error
+svga_swtnl_draw_range_elements(struct svga_context *svga,
+                               struct pipe_buffer *indexBuffer,
+                               unsigned indexSize,
+                               unsigned min_index,
+                               unsigned max_index,
+                               unsigned prim, 
+                               unsigned start, 
+                               unsigned count);
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_swtnl_backend.c b/src/gallium/drivers/svga/svga_swtnl_backend.c
new file mode 100644
index 00000000000..b4f757a47a9
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl_backend.c
@@ -0,0 +1,349 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_vbuf.h"
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+
+#include "util/u_debug.h"
+#include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_simple_shaders.h"
+
+#include "svga_context.h"
+#include "svga_state.h"
+#include "svga_swtnl.h"
+
+#include "svga_types.h"
+#include "svga_reg.h"
+#include "svga3d_reg.h"
+#include "svga_draw.h"
+#include "svga_swtnl_private.h"
+
+/* vbuf_render get_vertex_info hook: refresh the swtnl vdecl, return vertex_info. */
+static const struct vertex_info *
+svga_vbuf_render_get_vertex_info( struct vbuf_render *render )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+
+   svga_swtnl_update_vdecl(svga);
+
+   return &svga_render->vertex_info;
+}
+
+/* vbuf_render allocate_vertices hook: reserve room for nr_vertices in the vbuf. */
+static boolean
+svga_vbuf_render_allocate_vertices( struct vbuf_render *render,
+                                    ushort vertex_size,
+                                    ushort nr_vertices )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   struct pipe_screen *screen = svga->pipe.screen;
+   size_t size = (size_t)nr_vertices * (size_t)vertex_size;
+   boolean new_vbuf = FALSE;
+   boolean new_ibuf = FALSE;
+
+   if (svga_render->vertex_size != vertex_size)
+      svga->swtnl.new_vdecl = TRUE;
+   svga_render->vertex_size = (size_t)vertex_size;
+
+   if (svga->swtnl.new_vbuf)   /* set by callers after a context flush */
+      new_ibuf = new_vbuf = TRUE;
+   svga->swtnl.new_vbuf = FALSE;
+
+   if (svga_render->vbuf_size < svga_render->vbuf_offset + svga_render->vbuf_used + size)
+      new_vbuf = TRUE;   /* request does not fit behind the data already used */
+
+   if (new_vbuf)
+      pipe_buffer_reference(&svga_render->vbuf, NULL);
+   if (new_ibuf)
+      pipe_buffer_reference(&svga_render->ibuf, NULL);
+
+   if (!svga_render->vbuf) {
+      svga_render->vbuf_size = MAX2(size, svga_render->vbuf_alloc_size);
+      svga_render->vbuf = pipe_buffer_create(screen,
+                                             0,
+                                             PIPE_BUFFER_USAGE_VERTEX,
+                                             svga_render->vbuf_size);
+      if(!svga_render->vbuf) {
+         svga_context_flush(svga, NULL);   /* flush, then retry the allocation once */
+         svga_render->vbuf = pipe_buffer_create(screen,
+                                                0,
+                                                PIPE_BUFFER_USAGE_VERTEX,
+                                                svga_render->vbuf_size);
+         assert(svga_render->vbuf);   /* NOTE(review): no non-debug fallback */
+      }
+
+      svga->swtnl.new_vdecl = TRUE;
+      svga_render->vbuf_offset = 0;
+   } else {
+      svga_render->vbuf_offset += svga_render->vbuf_used;   /* append after used data */
+   }
+
+   svga_render->vbuf_used = 0;
+
+   if (svga->swtnl.new_vdecl)
+      svga_render->vdecl_offset = svga_render->vbuf_offset;
+
+   return TRUE;
+}
+
+static void *
+svga_vbuf_render_map_vertices( struct vbuf_render *render )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   struct pipe_screen *screen = svga->pipe.screen;
+   /* Map the vbuf for CPU writes (explicit flush); return base + vbuf_offset. */
+   char *ptr = (char*)pipe_buffer_map(screen,
+                                      svga_render->vbuf,
+                                      PIPE_BUFFER_USAGE_CPU_WRITE | 
+                                      PIPE_BUFFER_USAGE_FLUSH_EXPLICIT);
+   return ptr + svga_render->vbuf_offset;
+}
+
+static void
+svga_vbuf_render_unmap_vertices( struct vbuf_render *render,
+                                 ushort min_index,
+                                 ushort max_index )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   struct pipe_screen *screen = svga->pipe.screen;
+   unsigned offset, length;
+   size_t used = svga_render->vertex_size * ((size_t)max_index + 1);
+
+   offset = svga_render->vbuf_offset + svga_render->vertex_size * min_index;
+   length = svga_render->vertex_size * (max_index + 1 - min_index);
+   pipe_buffer_flush_mapped_range(screen, svga_render->vbuf, offset, length);
+   pipe_buffer_unmap(screen, svga_render->vbuf);
+   svga_render->min_index = min_index;
+   svga_render->max_index = max_index;
+   svga_render->vbuf_used = MAX2(svga_render->vbuf_used, used);
+}
+
+static boolean
+svga_vbuf_render_set_primitive( struct vbuf_render *render,
+                                unsigned prim )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   svga_render->prim = prim;
+
+   return TRUE;
+}
+
+static void
+svga_vbuf_sumbit_state( struct svga_vbuf_render *svga_render )
+{
+   struct svga_context *svga = svga_render->svga;
+   SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
+   enum pipe_error ret;
+   int i;
+
+   /* if the vdecl or vbuf hasn't changed do nothing */
+   if (!svga->swtnl.new_vdecl)
+      return;
+
+   memcpy(vdecl, svga_render->vdecl, sizeof(vdecl));
+
+   /* flush the hw state */
+   ret = svga_hwtnl_flush(svga->hwtnl);
+   if (ret) {
+      svga_context_flush(svga, NULL);
+      ret = svga_hwtnl_flush(svga->hwtnl);
+      /* if we hit this path we might become synced with hw */
+      svga->swtnl.new_vbuf = TRUE;
+      assert(ret == 0);
+   }
+
+   svga_hwtnl_reset_vdecl(svga->hwtnl, svga_render->vdecl_count);
+
+   for (i = 0; i < svga_render->vdecl_count; i++) {
+      vdecl[i].array.offset += svga_render->vdecl_offset;
+
+      svga_hwtnl_vdecl( svga->hwtnl,
+                        i,
+                        &vdecl[i],
+                        svga_render->vbuf );
+   }
+
+   /* We have already taken care of flatshading, so let the hwtnl
+    * module use whatever is most convenient:
+    */
+   if (svga->state.sw.need_pipeline) {
+      svga_hwtnl_set_flatshade(svga->hwtnl, FALSE, FALSE);
+      svga_hwtnl_set_unfilled(svga->hwtnl, PIPE_POLYGON_MODE_FILL);
+   }
+   else {
+      svga_hwtnl_set_flatshade( svga->hwtnl,
+                                svga->curr.rast->templ.flatshade,
+                                svga->curr.rast->templ.flatshade_first );
+
+      svga_hwtnl_set_unfilled( svga->hwtnl,
+                               svga->curr.rast->hw_unfilled );
+   }
+
+   svga->swtnl.new_vdecl = FALSE;
+}
+
+static void
+svga_vbuf_render_draw_arrays( struct vbuf_render *render,
+                              unsigned start,
+                              uint nr )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   unsigned bias = (svga_render->vbuf_offset - svga_render->vdecl_offset) / svga_render->vertex_size;
+   enum pipe_error ret = 0;
+
+   svga_vbuf_sumbit_state(svga_render);
+
+   /* Need to call update_state() again as the draw module may have
+    * altered some of our state behind our backs.  Testcase:
+    * redbook/polys.c
+    */
+   svga_update_state_retry( svga, SVGA_STATE_HW_DRAW );
+
+   ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim, start + bias, nr);
+   if (ret != PIPE_OK) {
+      svga_context_flush(svga, NULL);
+      ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim, start + bias, nr);
+      svga->swtnl.new_vbuf = TRUE;
+      assert(ret == PIPE_OK);
+   }
+}
+
+/* vbuf_render draw hook: upload the 16-bit indices to the ibuf and draw them. */
+static void
+svga_vbuf_render_draw( struct vbuf_render *render,
+                       const ushort *indices,
+                       uint nr_indices)
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+   struct svga_context *svga = svga_render->svga;
+   struct pipe_screen *screen = svga->pipe.screen;
+   unsigned bias = (svga_render->vbuf_offset - svga_render->vdecl_offset) / svga_render->vertex_size;
+   enum pipe_error ret;   /* holds pipe_error codes compared against PIPE_OK below */
+   size_t size = 2 * nr_indices;
+
+   assert(( svga_render->vbuf_offset - svga_render->vdecl_offset) % svga_render->vertex_size == 0);
+   
+   if (svga_render->ibuf_size < svga_render->ibuf_offset + size)
+      pipe_buffer_reference(&svga_render->ibuf, NULL);
+
+   if (!svga_render->ibuf) {
+      svga_render->ibuf_size = MAX2(size, svga_render->ibuf_alloc_size);
+      svga_render->ibuf = pipe_buffer_create(screen,
+                                             0,
+                                             PIPE_BUFFER_USAGE_VERTEX,
+                                             svga_render->ibuf_size);
+      svga_render->ibuf_offset = 0;
+   }
+
+   pipe_buffer_write(screen, svga_render->ibuf,
+                     svga_render->ibuf_offset, 2 * nr_indices, indices);
+
+
+   /* off to hardware */
+   svga_vbuf_sumbit_state(svga_render);
+
+   /* Need to call update_state() again as the draw module may have
+    * altered some of our state behind our backs.  Testcase:
+    * redbook/polys.c
+    */
+   svga_update_state_retry( svga, SVGA_STATE_HW_DRAW );
+
+   ret = svga_hwtnl_draw_range_elements(svga->hwtnl,
+                                        svga_render->ibuf,
+                                        2,
+                                        svga_render->min_index,
+                                        svga_render->max_index,
+                                        svga_render->prim,
+                                        svga_render->ibuf_offset / 2, nr_indices, bias);
+   if(ret != PIPE_OK) {
+      svga_context_flush(svga, NULL);   /* flush, then retry the draw once */
+      ret = svga_hwtnl_draw_range_elements(svga->hwtnl,
+                                           svga_render->ibuf,
+                                           2,
+                                           svga_render->min_index,
+                                           svga_render->max_index,
+                                           svga_render->prim,
+                                           svga_render->ibuf_offset / 2, nr_indices, bias);
+      svga->swtnl.new_vbuf = TRUE;
+      assert(ret == PIPE_OK);
+   }
+
+   svga_render->ibuf_offset += size;
+}
+
+
+static void
+svga_vbuf_render_release_vertices( struct vbuf_render *render )
+{
+
+}
+
+/* vbuf_render destroy hook: drop the vbuf/ibuf references and free the render. */
+static void
+svga_vbuf_render_destroy( struct vbuf_render *render )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(render);
+
+   pipe_buffer_reference(&svga_render->vbuf, NULL);
+   pipe_buffer_reference(&svga_render->ibuf, NULL);
+   FREE(svga_render);
+}
+
+
+/**
+ * Create a new primitive render (zero-initialised); returns NULL on OOM.
+ */
+struct vbuf_render *
+svga_vbuf_render_create( struct svga_context *svga )
+{
+   struct svga_vbuf_render *svga_render = CALLOC_STRUCT(svga_vbuf_render);
+
+   if (!svga_render)
+      return NULL;
+   svga_render->svga = svga;
+   svga_render->ibuf_alloc_size = 4*1024;
+   svga_render->vbuf_alloc_size = 64*1024;
+   svga_render->base.max_vertex_buffer_bytes = 64*1024/10;
+   svga_render->base.max_indices = 65536;
+   svga_render->base.get_vertex_info = svga_vbuf_render_get_vertex_info;
+   svga_render->base.allocate_vertices = svga_vbuf_render_allocate_vertices;
+   svga_render->base.map_vertices = svga_vbuf_render_map_vertices;
+   svga_render->base.unmap_vertices = svga_vbuf_render_unmap_vertices;
+   svga_render->base.set_primitive = svga_vbuf_render_set_primitive;
+   svga_render->base.draw = svga_vbuf_render_draw;
+   svga_render->base.draw_arrays = svga_vbuf_render_draw_arrays;
+   svga_render->base.release_vertices = svga_vbuf_render_release_vertices;
+   svga_render->base.destroy = svga_vbuf_render_destroy;
+
+   return &svga_render->base;
+}
diff --git a/src/gallium/drivers/svga/svga_swtnl_draw.c b/src/gallium/drivers/svga/svga_swtnl_draw.c
new file mode 100644
index 00000000000..8b14c913f72
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl_draw.c
@@ -0,0 +1,170 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_state.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_swtnl.h"
+#include "svga_state.h"
+#include "svga_swtnl_private.h"
+
+
+/* Run one draw through the software TNL (draw module) path; returns update status. */
+enum pipe_error
+svga_swtnl_draw_range_elements(struct svga_context *svga,
+                               struct pipe_buffer *indexBuffer,
+                               unsigned indexSize,
+                               unsigned min_index,
+                               unsigned max_index,
+                               unsigned prim, unsigned start, unsigned count)
+{
+   struct draw_context *draw = svga->swtnl.draw;
+   unsigned i;
+   const void *map;
+   enum pipe_error ret;
+
+   assert(!svga->dirty);
+   assert(svga->state.sw.need_swtnl);
+   assert(draw);
+
+   ret = svga_update_state(svga, SVGA_STATE_SWTNL_DRAW);
+   if (ret) {
+      svga_context_flush(svga, NULL);   /* flush, then retry the update once */
+      ret = svga_update_state(svga, SVGA_STATE_SWTNL_DRAW);
+      svga->swtnl.new_vbuf = TRUE;
+      assert(ret == PIPE_OK);
+   }
+
+   /*
+    * Map vertex buffers
+    */
+   for (i = 0; i < svga->curr.num_vertex_buffers; i++) {
+      map = pipe_buffer_map(svga->pipe.screen,
+                            svga->curr.vb[i].buffer,
+                            PIPE_BUFFER_USAGE_CPU_READ);
+
+      draw_set_mapped_vertex_buffer(draw, i, map);
+   }
+
+   /* Map index buffer, if present */
+   if (indexBuffer) {
+      map = pipe_buffer_map(svga->pipe.screen, indexBuffer,
+                            PIPE_BUFFER_USAGE_CPU_READ);
+
+      draw_set_mapped_element_buffer_range(draw, 
+                                           indexSize, 
+                                           min_index,
+                                           max_index,
+                                           map);
+   }
+   
+   if (svga->curr.cb[PIPE_SHADER_VERTEX]) {
+      map = pipe_buffer_map(svga->pipe.screen,
+                            svga->curr.cb[PIPE_SHADER_VERTEX],
+                            PIPE_BUFFER_USAGE_CPU_READ);
+      assert(map);
+      draw_set_mapped_constant_buffer(
+         draw, 
+         map,
+         svga->curr.cb[PIPE_SHADER_VERTEX]->size);
+   }
+
+   draw_arrays(svga->swtnl.draw, prim, start, count);
+
+   draw_flush(svga->swtnl.draw);
+
+   /* Ensure the draw module didn't touch this */
+   assert(i == svga->curr.num_vertex_buffers);
+   
+   /*
+    * unmap vertex/index buffers
+    */
+   for (i = 0; i < svga->curr.num_vertex_buffers; i++) {
+      pipe_buffer_unmap(svga->pipe.screen, svga->curr.vb[i].buffer);
+      draw_set_mapped_vertex_buffer(draw, i, NULL);
+   }
+
+   if (indexBuffer) {
+      pipe_buffer_unmap(svga->pipe.screen, indexBuffer);
+      draw_set_mapped_element_buffer(draw, 0, NULL);
+   }
+
+   if (svga->curr.cb[PIPE_SHADER_VERTEX]) {
+      pipe_buffer_unmap(svga->pipe.screen, svga->curr.cb[PIPE_SHADER_VERTEX]);
+      draw_set_mapped_constant_buffer(draw, NULL, 0);   /* drop the stale pointer */
+   }
+
+   return ret;
+}
+
+
+
+/* Create the swtnl backend and draw context and wire them up; FALSE on failure. */
+boolean svga_init_swtnl( struct svga_context *svga )
+{
+   svga->swtnl.backend = svga_vbuf_render_create(svga);
+   if(!svga->swtnl.backend)
+      goto fail;
+
+   /*
+    * Create drawing context and plug our rendering stage into it.
+    */
+   svga->swtnl.draw = draw_create();
+   if (svga->swtnl.draw == NULL)
+      goto fail;
+
+
+   draw_set_rasterize_stage(svga->swtnl.draw, 
+                            draw_vbuf_stage( svga->swtnl.draw, svga->swtnl.backend ));
+
+   draw_set_render(svga->swtnl.draw, svga->swtnl.backend);
+
+   draw_install_aaline_stage(svga->swtnl.draw, &svga->pipe);
+   draw_install_aapoint_stage(svga->swtnl.draw, &svga->pipe);
+   draw_install_pstipple_stage(svga->swtnl.draw, &svga->pipe);
+
+   draw_set_driver_clipping(svga->swtnl.draw, debug_get_bool_option("SVGA_SWTNL_FSE", FALSE));
+
+   return TRUE;
+
+fail:
+   if (svga->swtnl.backend)
+      svga->swtnl.backend->destroy( svga->swtnl.backend );
+
+   if (svga->swtnl.draw)   /* assumes svga->swtnl starts zeroed -- TODO confirm */
+      draw_destroy( svga->swtnl.draw );
+
+   return FALSE;
+}
+
+/* Destroy the draw context created by svga_init_swtnl(). */
+void svga_destroy_swtnl( struct svga_context *svga )
+{
+   draw_destroy( svga->swtnl.draw );
+}
diff --git a/src/gallium/drivers/svga/svga_swtnl_private.h b/src/gallium/drivers/svga/svga_swtnl_private.h
new file mode 100644
index 00000000000..9bbb42910f5
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl_private.h
@@ -0,0 +1,93 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_SWTNL_PRIVATE_H
+#define SVGA_SWTNL_PRIVATE_H
+
+#include "svga_swtnl.h"
+#include "draw/draw_vertex.h"
+
+#include "svga_types.h"
+#include "svga3d_reg.h"
+
+/**
+ * Primitive renderer for svga.
+ */
+struct svga_vbuf_render {
+   struct vbuf_render base;   /* first member: svga_vbuf_render() casts from it */
+
+   struct svga_context *svga;
+   struct vertex_info vertex_info;
+
+   unsigned vertex_size;      /* bytes per vertex, from allocate_vertices() */
+
+   unsigned prim;
+
+   struct pipe_buffer *vbuf;
+   struct pipe_buffer *ibuf;
+
+   /* current size of buffer */
+   size_t vbuf_size;
+   size_t ibuf_size;
+
+   /* minimum size a newly created buffer is allocated with */
+   size_t vbuf_alloc_size;
+   size_t ibuf_alloc_size;
+
+   /* current write place */
+   size_t vbuf_offset;
+   size_t ibuf_offset;
+
+   /* currently used */
+   size_t vbuf_used;
+
+   SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
+   unsigned vdecl_offset;     /* vbuf_offset captured while new_vdecl was set */
+   unsigned vdecl_count;
+
+   ushort min_index;          /* index range recorded by unmap_vertices() */
+   ushort max_index;
+};
+
+/**
+ * Basically a cast wrapper.
+ */
+static INLINE struct svga_vbuf_render *
+svga_vbuf_render( struct vbuf_render *render )
+{
+   assert(render);
+   return (struct svga_vbuf_render *)render;
+}
+
+
+struct vbuf_render *
+svga_vbuf_render_create( struct svga_context *svga );
+
+
+int
+svga_swtnl_update_vdecl( struct svga_context *svga );
+
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
new file mode 100644
index 00000000000..16163121131
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -0,0 +1,242 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
+#include "pipe/p_inlines.h"
+#include "pipe/p_state.h"
+#include "util/u_memory.h"
+
+#include "svga_context.h"
+#include "svga_swtnl.h"
+#include "svga_state.h"
+
+#include "svga_swtnl_private.h"
+
+
+#define SVGA_POINT_ADJ_X -0.375
+#define SVGA_POINT_ADJ_Y -0.5
+
+#define SVGA_LINE_ADJ_X -0.5
+#define SVGA_LINE_ADJ_Y -0.5
+
+#define SVGA_TRIANGLE_ADJ_X -0.375
+#define SVGA_TRIANGLE_ADJ_Y -0.5
+
+
+static void set_draw_viewport( struct svga_context *svga )
+{
+   /* Adjust a private copy; svga->curr.viewport itself is not modified. */
+   struct pipe_viewport_state vp = svga->curr.viewport;
+   float adjx = 0;
+   float adjy = 0;
+
+   if (svga->curr.reduced_prim == PIPE_PRIM_POINTS) {
+      adjx = SVGA_POINT_ADJ_X;
+      adjy = SVGA_POINT_ADJ_Y;
+   }
+   else if (svga->curr.reduced_prim == PIPE_PRIM_LINES) {
+      /* XXX: Wide lines end up being drawn with triangles, so they get
+       * a different nudge -- but not every case where that happens is
+       * caught here.
+       */
+      if (svga->curr.rast->templ.line_width > 1.0) {
+         adjx = SVGA_LINE_ADJ_X + 0.175;
+         adjy = SVGA_LINE_ADJ_Y - 0.175;
+      }
+      else {
+         adjx = SVGA_LINE_ADJ_X;
+         adjy = SVGA_LINE_ADJ_Y;
+      }
+   }
+   else if (svga->curr.reduced_prim == PIPE_PRIM_TRIANGLES) {
+      adjx = SVGA_TRIANGLE_ADJ_X;
+      adjy = SVGA_TRIANGLE_ADJ_Y;
+   }
+   /* any other reduced primitive: no adjustment */
+
+   /* Shift the viewport translation by the per-primitive offset. */
+   vp.translate[0] += adjx;
+   vp.translate[1] += adjy;
+
+   draw_set_viewport_state(svga->swtnl.draw, &vp);
+}
+
+static int update_swtnl_draw( struct svga_context *svga,
+                              unsigned dirty )
+{
+   draw_flush( svga->swtnl.draw );   /* flush first, then change the draw module's state */
+   /* Forward only the state groups flagged in 'dirty' to the draw module. */
+   if (dirty & SVGA_NEW_VS) 
+      draw_bind_vertex_shader(svga->swtnl.draw,
+                              svga->curr.vs->draw_shader);
+
+   if (dirty & SVGA_NEW_VBUFFER)
+      draw_set_vertex_buffers(svga->swtnl.draw, 
+                              svga->curr.num_vertex_buffers, 
+                              svga->curr.vb);
+
+   if (dirty & SVGA_NEW_VELEMENT)
+      draw_set_vertex_elements(svga->swtnl.draw, 
+                               svga->curr.num_vertex_elements, 
+                               svga->curr.ve );
+
+   if (dirty & SVGA_NEW_CLIP)
+      draw_set_clip_state(svga->swtnl.draw, 
+                          &svga->curr.clip);
+   /* set_draw_viewport() reads the viewport, the reduced primitive and the line width. */
+   if (dirty & (SVGA_NEW_VIEWPORT |
+                SVGA_NEW_REDUCED_PRIMITIVE | 
+                SVGA_NEW_RAST))
+      set_draw_viewport( svga );
+
+   if (dirty & SVGA_NEW_RAST)
+      draw_set_rasterizer_state(svga->swtnl.draw,
+                                &svga->curr.rast->templ);
+
+   if (dirty & SVGA_NEW_FRAME_BUFFER)
+      draw_set_mrd(svga->swtnl.draw, 
+                   svga->curr.depthscale);
+
+   if (dirty & SVGA_NEW_EDGEFLAGS)
+      draw_set_edgeflags( svga->swtnl.draw, 
+                          svga->curr.edgeflags );
+   /* No failure path: 0 is returned unconditionally. */
+   return 0;
+}
+
+
+struct svga_tracked_state svga_update_swtnl_draw =
+{
+   "update draw module state",
+   (SVGA_NEW_VS |                  /* the dirty bits that update_swtnl_draw() tests */
+    SVGA_NEW_VBUFFER |
+    SVGA_NEW_VELEMENT |
+    SVGA_NEW_CLIP |
+    SVGA_NEW_VIEWPORT |
+    SVGA_NEW_RAST |
+    SVGA_NEW_FRAME_BUFFER |
+    SVGA_NEW_REDUCED_PRIMITIVE |
+    SVGA_NEW_EDGEFLAGS),
+   update_swtnl_draw
+};
+
+
+int svga_swtnl_update_vdecl( struct svga_context *svga )
+{
+   struct svga_vbuf_render *svga_render = svga_vbuf_render(svga->swtnl.backend);
+   struct draw_context *draw = svga->swtnl.draw;
+   struct vertex_info *vinfo = &svga_render->vertex_info;
+   SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
+   const enum interp_mode colorInterp =
+      svga->curr.rast->templ.flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+   const struct svga_fragment_shader *fs = svga->curr.fs;
+   int offset = 0;      /* running byte offset of the next attribute */
+   int nr_decls = 0;    /* declarations written to vdecl[] so far */
+   int src, i;
+   /* Zero both tables so the memcmp() below sees no stale bytes. */
+   memset(vinfo, 0, sizeof(*vinfo));
+   memset(vdecl, 0, sizeof(vdecl));
+
+   /* always add position */
+   src = draw_find_vs_output(draw, TGSI_SEMANTIC_POSITION, 0);
+   draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
+   vinfo->attrib[0].emit = EMIT_4F;
+   vdecl[0].array.offset = offset;
+   vdecl[0].identity.type = SVGA3D_DECLTYPE_FLOAT4;
+   vdecl[0].identity.usage = SVGA3D_DECLUSAGE_POSITIONT;
+   vdecl[0].identity.usageIndex = 0;
+   offset += 16;
+   nr_decls++;
+   /* Then walk the fragment shader inputs in order. */
+   for (i = 0; i < fs->base.info.num_inputs; i++) {
+      unsigned name = fs->base.info.input_semantic_name[i];
+      unsigned index = fs->base.info.input_semantic_index[i];
+      src = draw_find_vs_output(draw, name, index);
+      vdecl[nr_decls].array.offset = offset;
+      vdecl[nr_decls].identity.usageIndex = fs->base.info.input_semantic_index[i];
+      /* The slot is only kept by the cases that increment nr_decls. */
+      switch (name) {
+      case TGSI_SEMANTIC_COLOR:
+         draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
+         vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_COLOR;
+         vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
+         offset += 16;
+         nr_decls++;
+         break;
+      case TGSI_SEMANTIC_GENERIC:
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
+         vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
+         vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT4;
+         vdecl[nr_decls].identity.usageIndex += 1;   /* TEXCOORD 0 is used by FOG below */
+         offset += 16;
+         nr_decls++;
+         break;
+      case TGSI_SEMANTIC_FOG:
+         draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
+         vdecl[nr_decls].identity.usage = SVGA3D_DECLUSAGE_TEXCOORD;
+         vdecl[nr_decls].identity.type = SVGA3D_DECLTYPE_FLOAT1;
+         assert(vdecl[nr_decls].identity.usageIndex == 0);
+         offset += 4;
+         nr_decls++;
+         break;
+      case TGSI_SEMANTIC_POSITION:
+         /* generated internally, not a vertex shader output */
+         break;
+      default:
+         assert(0);
+      }
+   }
+
+   draw_compute_vertex_size(vinfo);
+   /* Every declaration gets the same stride: the summed attribute sizes. */
+   svga_render->vdecl_count = nr_decls;
+   for (i = 0; i < svga_render->vdecl_count; i++)
+      vdecl[i].array.stride = offset;
+   /* Identical to the stored declarations: new_vdecl is left untouched. */
+   if (memcmp(svga_render->vdecl, vdecl, sizeof(vdecl)) == 0)
+      return 0;
+
+   memcpy(svga_render->vdecl, vdecl, sizeof(vdecl));
+   svga->swtnl.new_vdecl = TRUE;
+
+   return 0;
+}
+
+
+static int update_swtnl_vdecl( struct svga_context *svga, unsigned dirty )
+{
+   /* Tracked-state adapter: 'dirty' is not needed by the vdecl rebuild. */
+   return svga_swtnl_update_vdecl( svga );
+}
+
+
+struct svga_tracked_state svga_update_swtnl_vdecl =
+{
+   "update draw module vdecl",
+   (SVGA_NEW_VS |     /* the vdecl is built from the VS outputs ... */
+    SVGA_NEW_FS),     /* ... matched against the FS inputs */
+   update_swtnl_vdecl /* NOTE(review): also reads rast flatshade, yet SVGA_NEW_RAST is not listed -- confirm */
+};
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
new file mode 100644
index 00000000000..b8ef137c015
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -0,0 +1,280 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_defines.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+#include "util/u_memory.h"
+
+#include "svgadump/svga_shader_dump.h"
+
+#include "svga_context.h"
+#include "svga_tgsi.h"
+#include "svga_tgsi_emit.h"
+#include "svga_debug.h"
+
+#include "svga_hw_reg.h"
+#include "svga3d_shaderdefs.h"
+
+
+/* Sinkhole used only in error conditions: svga_shader_expand() points the
+ * emitter here when the token buffer cannot be grown.  Static: never FREE it. */
+static char err_buf[128];
+
+#if 0
+static void svga_destroy_shader_emitter( struct svga_shader_emitter *emit )
+{
+   if (emit->buf != err_buf)
+      FREE(emit->buf);
+}
+#endif
+
+
+/* Double the token buffer, keeping the write offset; FALSE means sinkhole. */
+static boolean svga_shader_expand( struct svga_shader_emitter *emit )
+{
+   /* Offset taken up front: emit->buf is stale once REALLOC returns. */
+   const size_t used = emit->ptr - emit->buf;
+   const unsigned newsize = emit->size * 2;
+   char *new_buf = NULL;
+
+   if (emit->buf != err_buf)   /* err_buf is static: never REALLOC it */
+      new_buf = REALLOC(emit->buf, emit->size, newsize);
+
+   if (new_buf == NULL) {
+      emit->ptr = err_buf;
+      emit->buf = err_buf;
+      emit->size = sizeof(err_buf);
+      return FALSE;
+   }
+   emit->size = newsize;
+   emit->ptr = new_buf + used;
+   emit->buf = new_buf;
+   return TRUE;
+}
+
+/* Grow until nr_dwords more dwords fit (one doubling may not be enough). */
+static INLINE boolean reserve(  struct svga_shader_emitter *emit,
+                                unsigned nr_dwords )
+{
+   while (emit->ptr - emit->buf + nr_dwords * sizeof(unsigned) >= emit->size) {
+      if (!svga_shader_expand( emit ))
+         return FALSE;
+   }
+   return TRUE;
+}
+
+/* Append one dword to the token stream; FALSE if no room could be made. */
+boolean svga_shader_emit_dword( struct svga_shader_emitter *emit, unsigned dword )
+{
+   if (reserve(emit, 1)) {
+      *(unsigned *)emit->ptr = dword;
+      emit->ptr += sizeof(unsigned);
+      return TRUE;
+   }
+   return FALSE;
+}
+
+/* Append nr dwords to the token stream; FALSE if no room could be made. */
+boolean svga_shader_emit_dwords( struct svga_shader_emitter *emit,
+                                 const unsigned *dwords, unsigned nr )
+{
+   const size_t nbytes = nr * sizeof *dwords;
+   if (!reserve(emit, nr))
+      return FALSE;
+   memcpy( emit->ptr, dwords, nbytes );
+   emit->ptr += nbytes;
+   return TRUE;
+}
+
+boolean svga_shader_emit_opcode( struct svga_shader_emitter *emit,
+                                 unsigned opcode )
+{
+   SVGA3dShaderInstToken *here;
+   /* Start a new instruction token and close off the previous one. */
+   if (!reserve(emit, 1))
+      return FALSE;
+   /* Taken after reserve(), which may have moved the buffer. */
+   here = (SVGA3dShaderInstToken *)emit->ptr;
+   here->value = opcode;
+   /* Back-patch the previous instruction's size: the tokens after its opcode. */
+   if (emit->insn_offset) {
+      SVGA3dShaderInstToken *prev = (SVGA3dShaderInstToken *)(emit->buf + 
+                                                              emit->insn_offset);
+      prev->size = (here - prev) - 1;
+   }
+   /* Remember where this opcode lives so the next call can patch its size. */
+   emit->insn_offset = emit->ptr - emit->buf;
+   emit->ptr += sizeof(unsigned);
+   return TRUE;
+}
+
+#define SVGA3D_PS_2X (SVGA3D_PS_20 | 1)
+#define SVGA3D_VS_2X (SVGA3D_VS_20 | 1)
+
+/* Emit the version token matching emit->unit and emit->use_sm30. */
+static boolean svga_shader_emit_header( struct svga_shader_emitter *emit )
+{
+   SVGA3dShaderVersion header;
+
+   memset( &header, 0, sizeof header );
+
+   if (emit->unit == PIPE_SHADER_FRAGMENT) {
+      header.value = emit->use_sm30 ? SVGA3D_PS_30 : SVGA3D_PS_2X;
+   }
+   else if (emit->unit == PIPE_SHADER_VERTEX) {
+      header.value = emit->use_sm30 ? SVGA3D_VS_30 : SVGA3D_VS_2X;
+   }
+   /* any other unit: header.value stays 0 from the memset */
+
+   return svga_shader_emit_dword( emit, header.value );
+}
+
+
+
+
+
+/* Parse TGSI shader and translate to SVGA/DX9 serialized
+ * representation.  
+ *
+ * In this function SVGA shader is emitted to an in-memory buffer that
+ * can be dynamically grown.  Once we've finished and know how large
+ * it is, it will be copied to a hardware buffer for upload.
+ */
+static struct svga_shader_result *
+svga_tgsi_translate( const struct svga_shader *shader,
+                     union svga_compile_key key,
+                     unsigned unit )
+{
+   struct svga_shader_result *result = NULL;
+   struct svga_shader_emitter emit;
+
+   memset(&emit, 0, sizeof(emit));
+
+   emit.use_sm30 = shader->use_sm30;
+   emit.size = 1024;
+   emit.buf = MALLOC(emit.size);
+   if (emit.buf == NULL) {
+      /* out of memory */
+      goto fail;
+   }
+
+   emit.ptr = emit.buf;
+   emit.unit = unit;
+   emit.key = key;
+
+   tgsi_scan_shader( shader->tokens, &emit.info);
+
+   emit.imm_start = emit.info.file_max[TGSI_FILE_CONSTANT] + 1;
+   /* the key may push the first immediate further up, past extra constants */
+   if (unit == PIPE_SHADER_FRAGMENT)
+      emit.imm_start += key.fkey.num_unnormalized_coords;
+
+   if (unit == PIPE_SHADER_VERTEX) {
+      emit.imm_start += key.vkey.need_prescale ? 2 : 0;
+      emit.imm_start += key.vkey.num_zero_stride_vertex_elements;
+   }
+
+   emit.nr_hw_const = (emit.imm_start + emit.info.file_max[TGSI_FILE_IMMEDIATE] + 1);
+
+   emit.nr_hw_temp = emit.info.file_max[TGSI_FILE_TEMPORARY] + 1;
+   emit.in_main_func = TRUE;
+
+   if (!svga_shader_emit_header( &emit ))
+      goto fail;
+
+   if (!svga_shader_emit_instructions( &emit, shader->tokens ))
+      goto fail;
+   
+   result = CALLOC_STRUCT(svga_shader_result);
+   if (result == NULL)
+      goto fail;
+   /* The result takes over emit.buf; svga_destroy_shader_result() frees it. */
+   result->shader = shader;
+   result->tokens = (const unsigned *)emit.buf;
+   result->nr_tokens = (emit.ptr - emit.buf) / sizeof(unsigned);
+   memcpy(&result->key, &key, sizeof key);
+
+   if (SVGA_DEBUG & DEBUG_TGSI) 
+   {
+      debug_printf( "#####################################\n" );
+      debug_printf( "Shader %u below\n", shader->id );
+      tgsi_dump( shader->tokens, 0 );
+      if (SVGA_DEBUG & DEBUG_TGSI) {
+         debug_printf( "Shader %u compiled below\n", shader->id );
+         svga_shader_dump( result->tokens,
+                           result->nr_tokens ,
+                           FALSE );
+      }
+      debug_printf( "#####################################\n" );
+   }
+
+   return result;
+
+fail:
+   FREE(result);
+   if (emit.buf != err_buf)   /* a failed expand parks the emitter on static err_buf */
+      FREE(emit.buf);
+   return NULL;
+}
+
+
+
+
+struct svga_shader_result *
+svga_translate_fragment_program( const struct svga_fragment_shader *fs,
+                                 const struct svga_fs_compile_key *fkey )
+{
+   union svga_compile_key key;
+   memset(&key, 0, sizeof key);   /* the whole union gets copied into the result */
+   memcpy(&key.fkey, fkey, sizeof *fkey);
+   return svga_tgsi_translate( &fs->base, 
+                               key,
+                               PIPE_SHADER_FRAGMENT );
+}
+
+struct svga_shader_result *
+svga_translate_vertex_program( const struct svga_vertex_shader *vs,
+                               const struct svga_vs_compile_key *vkey )
+{
+   union svga_compile_key key;
+   memset(&key, 0, sizeof key);   /* bytes outside vkey would otherwise be stack garbage */
+   memcpy(&key.vkey, vkey, sizeof *vkey);
+   return svga_tgsi_translate( &vs->base, 
+                               key,
+                               PIPE_SHADER_VERTEX );
+}
+
+
+void svga_destroy_shader_result( struct svga_shader_result *result )
+{
+   FREE((unsigned *)result->tokens);   /* cast drops the const; 'result' must not be NULL */
+   FREE(result);
+}
+
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
new file mode 100644
index 00000000000..896c90a89ae
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -0,0 +1,139 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_TGSI_H
+#define SVGA_TGSI_H
+
+#include "pipe/p_state.h"
+
+#include "svga_hw_reg.h"
+
+struct svga_fragment_shader;
+struct svga_vertex_shader;
+struct svga_shader;
+struct tgsi_shader_info;
+struct tgsi_token;
+
+
+struct svga_vs_compile_key
+{
+   ubyte need_prescale:1;   /* when set, the translator moves imm_start up by 2 */
+   ubyte allow_psiz:1;
+   unsigned zero_stride_vertex_elements;
+   ubyte num_zero_stride_vertex_elements:6;   /* the translator moves imm_start up by this count */
+};
+
+struct svga_fs_compile_key
+{
+   boolean light_twoside:1;
+   boolean front_cw:1;
+   ubyte num_textures;              /* selects how far into tex[] svga_fs_key_size() measures */
+   ubyte num_unnormalized_coords;   /* the translator moves imm_start up by this count */
+   struct {
+      ubyte compare_mode       : 1;
+      ubyte compare_func       : 3;
+      ubyte unnormalized       : 1;
+
+      ubyte width_height_idx   : 7;
+
+      ubyte texture_target;
+   } tex[PIPE_MAX_SAMPLERS];        /* last member: svga_fs_key_size() ends the key inside this array */
+};
+
+union svga_compile_key {
+   struct svga_vs_compile_key vkey;   /* read when translating for PIPE_SHADER_VERTEX */
+   struct svga_fs_compile_key fkey;   /* read when translating for PIPE_SHADER_FRAGMENT */
+};
+
+struct svga_shader_result
+{
+   const struct svga_shader *shader;
+
+   /* Parameters used to generate this compilation result:
+    */
+   union svga_compile_key key;
+
+   /* Compiled shader tokens, freed by svga_destroy_shader_result():
+    */
+   const unsigned *tokens;
+   unsigned nr_tokens;
+
+   /* SVGA Shader ID:
+    */
+   unsigned id;
+   
+   /* Next compilation result:
+    */
+   struct svga_shader_result *next;
+};
+
+
+/* TGSI doesn't provide us with VS input semantics (they're actually
+ * pretty meaningless), so we just generate some plausible ones here.
+ * This is called both from within the TGSI translator and when
+ * building vdecls to ensure they match up.
+ *
+ * The real use of this information is matching vertex elements to
+ * fragment shader inputs in the case where vertex shader is disabled.
+ */
+static INLINE void svga_generate_vdecl_semantics( unsigned idx,
+                                                  unsigned *usage,
+                                                  unsigned *usage_index )
+{
+   /* Input 0 is reported as the position; input N > 0 is reported
+    * as TEXCOORD N-1. */
+   const boolean is_position = (idx == 0);
+
+   *usage = is_position ? SVGA3D_DECLUSAGE_POSITION
+                        : SVGA3D_DECLUSAGE_TEXCOORD;
+   *usage_index = is_position ? 0
+                              : idx - 1;
+}
+
+
+
+static INLINE unsigned svga_vs_key_size( const struct svga_vs_compile_key *key )
+{
+   return sizeof *key;   /* fixed size: the whole struct, independent of its contents */
+}
+
+static INLINE unsigned svga_fs_key_size( const struct svga_fs_compile_key *key )
+{
+   /* Fixed fields plus the num_textures used tex[] entries, nothing past them. */
+   return (const char *)&key->tex[key->num_textures] - (const char *)key;
+}
+
+struct svga_shader_result *
+svga_translate_fragment_program( const struct svga_fragment_shader *fs,
+                                 const struct svga_fs_compile_key *fkey );
+
+struct svga_shader_result *
+svga_translate_vertex_program( const struct svga_vertex_shader *vs,
+                               const struct svga_vs_compile_key *vkey );
+
+/* Frees the result together with its token buffer. */
+void svga_destroy_shader_result( struct svga_shader_result *result );
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c
new file mode 100644
index 00000000000..54457082a06
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c
@@ -0,0 +1,280 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_memory.h"
+
+#include "svga_tgsi_emit.h"
+#include "svga_context.h"
+
+
+
+
+static boolean ps20_input( struct svga_shader_emitter *emit,
+                           struct tgsi_declaration_semantic semantic,
+                           unsigned idx )
+{
+   struct src_register reg;
+   SVGA3DOpDclArgs dcl;
+   SVGA3dShaderInstToken opcode;
+   /* Pick the register for input 'idx', record it in input_map, emit a DCL. */
+   opcode = inst_token( SVGA3DOP_DCL );
+   dcl.values[0] = 0;
+   dcl.values[1] = 0;
+
+   switch (semantic.SemanticName) {
+   case TGSI_SEMANTIC_POSITION:
+      /* Special case: the position is read from a misc register,
+       * not from an input or texture register. */
+      reg = src_register( SVGA3DREG_MISCTYPE, 
+                          SVGA3DMISCREG_POSITION );
+      break;
+   case TGSI_SEMANTIC_COLOR:
+      reg = src_register( SVGA3DREG_INPUT, 
+                          semantic.SemanticIndex );
+      break;
+   case TGSI_SEMANTIC_FOG:
+      assert(semantic.SemanticIndex == 0);
+      reg = src_register( SVGA3DREG_TEXTURE, 0 );
+      break;
+   case TGSI_SEMANTIC_GENERIC:
+      reg = src_register( SVGA3DREG_TEXTURE,
+                          semantic.SemanticIndex + 1 );   /* texture register 0 is FOG */
+      break;
+   default:
+      assert(0);
+      return TRUE;
+   }
+
+   emit->input_map[idx] = reg;
+
+   dcl.dst = dst( reg );
+
+   dcl.usage = 0;
+   dcl.index = 0;
+   /* Unsigned literal: shifting a signed 1 into bit 31 is undefined. */
+   dcl.values[0] |= 1u<<31;
+
+   return  (emit_instruction(emit, opcode) &&
+            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
+}
+
+
+static boolean ps20_output( struct svga_shader_emitter *emit,
+                            struct tgsi_declaration_semantic semantic,
+                            unsigned idx )
+{
+   /* Emits nothing: only fills in the emitter's register maps, and
+    * always returns TRUE.  Unexpected semantics fall back to oC0. */
+   switch (semantic.SemanticName) {
+   case TGSI_SEMANTIC_COLOR:
+      if (semantic.SemanticIndex < PIPE_MAX_COLOR_BUFS) {
+         unsigned cbuf = semantic.SemanticIndex;
+
+         emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                               emit->nr_hw_temp++ );
+         emit->temp_col[cbuf] = emit->output_map[idx];
+         emit->true_col[cbuf] = dst_register( SVGA3DREG_COLOROUT, 
+                                              semantic.SemanticIndex );
+      }
+      else {
+         assert(0);
+         emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 0 );
+      }
+      break;
+   case TGSI_SEMANTIC_POSITION:
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_pos = emit->output_map[idx];
+      emit->true_pos = dst_register( SVGA3DREG_DEPTHOUT, 
+                                     semantic.SemanticIndex );
+      break;
+   default:
+      assert(0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 0 );
+      break;
+   }
+
+   return TRUE;
+}
+
+
+static boolean vs20_input( struct svga_shader_emitter *emit,
+                           struct tgsi_declaration_semantic semantic,
+                           unsigned idx )
+{
+   SVGA3DOpDclArgs dcl;
+   SVGA3dShaderInstToken opcode;
+
+   opcode = inst_token( SVGA3DOP_DCL );
+   dcl.values[0] = 0;
+   dcl.values[1] = 0;
+
+   emit->input_map[idx] = src_register( SVGA3DREG_INPUT, idx );
+   dcl.dst = dst_register( SVGA3DREG_INPUT, idx );
+
+   assert(dcl.dst.reserved0);
+
+   /* Mesa doesn't provide us with VS input semantics (they're
+    * actually pretty meaningless), so we just generate some plausible
+    * ones here.  This has to match what we declare in the vdecl code
+    * in svga_pipe_vertex.c.
+    */
+   if (idx == 0) {
+      dcl.usage = SVGA3D_DECLUSAGE_POSITION;
+      dcl.index = 0;
+   }
+   else {
+      dcl.usage = SVGA3D_DECLUSAGE_TEXCOORD;
+      dcl.index = idx - 1;
+   }
+   /* Unsigned literal: shifting a signed 1 into bit 31 is undefined. */
+   dcl.values[0] |= 1u<<31;
+
+   return  (emit_instruction(emit, opcode) &&
+            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
+}
+
+
+static boolean vs20_output( struct svga_shader_emitter *emit,
+                         struct tgsi_declaration_semantic semantic,
+                         unsigned idx )
+{
+   /* Don't emit dcl instruction for vs20 outputs
+    */
+
+   /* Just build the register map table: 
+    */
+   switch (semantic.SemanticName) {
+   case TGSI_SEMANTIC_POSITION:
+      assert(semantic.SemanticIndex == 0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_pos = emit->output_map[idx];
+      emit->true_pos = dst_register( SVGA3DREG_RASTOUT, 
+                                     SVGA3DRASTOUT_POSITION);
+      break;
+   case TGSI_SEMANTIC_PSIZE:
+      assert(semantic.SemanticIndex == 0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_psiz = emit->output_map[idx];
+      emit->true_psiz = dst_register( SVGA3DREG_RASTOUT, 
+                                      SVGA3DRASTOUT_PSIZE );
+      break;
+   case TGSI_SEMANTIC_FOG:
+      assert(semantic.SemanticIndex == 0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEXCRDOUT, 0 );
+      break;
+   case TGSI_SEMANTIC_COLOR:
+      /* oD0 */
+      emit->output_map[idx] = dst_register( SVGA3DREG_ATTROUT,
+                                            semantic.SemanticIndex );
+      break;
+   case TGSI_SEMANTIC_GENERIC:
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEXCRDOUT,
+                                            semantic.SemanticIndex + 1 );   /* texcoord output 0 is FOG */
+      break;
+   default:
+      assert(0);
+      emit->output_map[idx] = dst_register(  SVGA3DREG_TEMP, 0 );
+      return FALSE;   /* unknown semantic: the only failing path */
+   }
+
+   return TRUE;
+}
+
+static boolean ps20_sampler( struct svga_shader_emitter *emit,
+                          struct tgsi_declaration_semantic semantic,
+                          unsigned idx )
+{
+   SVGA3DOpDclArgs dcl;
+   SVGA3dShaderInstToken opcode;
+   /* Emit a DCL for sampler 'idx'; 'semantic' is not used here. */
+   opcode = inst_token( SVGA3DOP_DCL );
+   dcl.values[0] = 0;
+   dcl.values[1] = 0;
+
+   dcl.dst = dst_register( SVGA3DREG_SAMPLER, idx );
+   dcl.type = svga_tgsi_sampler_type( emit, idx );
+
+   return  (emit_instruction(emit, opcode) &&
+            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
+}
+
+
+boolean svga_translate_decl_sm20( struct svga_shader_emitter *emit,
+                             const struct tgsi_full_declaration *decl )
+{
+   unsigned first = decl->DeclarationRange.First;
+   unsigned last = decl->DeclarationRange.Last;
+   unsigned idx;
+
+   /* Every register in the declared range is handed to the helper for
+    * its file and shader unit; the first helper failure aborts.
+    *
+    * NOTE(review): decl->Semantic is forwarded even when
+    * decl->Declaration.Semantic is unset -- confirm callers tolerate it.
+    */
+
+   for( idx = first; idx <= last; idx++ ) {
+      boolean ok;
+
+      switch (decl->Declaration.File) {
+      case TGSI_FILE_SAMPLER:
+         assert (emit->unit == PIPE_SHADER_FRAGMENT);
+         ok = ps20_sampler( emit, decl->Semantic, idx );
+         break;
+
+      case TGSI_FILE_INPUT:
+         if (emit->unit == PIPE_SHADER_VERTEX)
+            ok = vs20_input( emit, decl->Semantic, idx );
+         else
+            ok = ps20_input( emit, decl->Semantic, idx );
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         if (emit->unit == PIPE_SHADER_VERTEX)
+            ok = vs20_output( emit, decl->Semantic, idx );
+         else
+            ok = ps20_output( emit, decl->Semantic, idx );
+         break;
+
+      default:
+         /* don't need to declare other vars */
+         ok = TRUE;
+      }
+
+      if (!ok)
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
new file mode 100644
index 00000000000..08e7dfb117c
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -0,0 +1,385 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_memory.h"
+
+#include "svga_tgsi_emit.h"
+#include "svga_context.h"
+/* Map a TGSI semantic name/index onto an SVGA3D declaration usage and usage index; FALSE if the semantic is not handled. */
+static boolean translate_vs_ps_semantic( struct tgsi_declaration_semantic semantic,
+                                         unsigned *usage,
+                                         unsigned *idx )
+{
+   switch (semantic.SemanticName) {
+   case TGSI_SEMANTIC_POSITION:  
+      *idx = semantic.SemanticIndex;
+      *usage = SVGA3D_DECLUSAGE_POSITION;
+      break;
+   case TGSI_SEMANTIC_COLOR:     
+      /* COLOR keeps its TGSI index; BCOLOR below is offset past it */
+      *idx = semantic.SemanticIndex;
+      *usage = SVGA3D_DECLUSAGE_COLOR;
+      break;
+   case TGSI_SEMANTIC_BCOLOR:
+      *idx = semantic.SemanticIndex + 2; /* sharing with COLOR */
+      *usage = SVGA3D_DECLUSAGE_COLOR;
+      break;
+   case TGSI_SEMANTIC_FOG:       
+      *idx = 0;                          /* fog always takes texcoord[0] */
+      assert(semantic.SemanticIndex == 0);
+      *usage = SVGA3D_DECLUSAGE_TEXCOORD;
+      break;
+   case TGSI_SEMANTIC_PSIZE:     
+      *idx = semantic.SemanticIndex;
+      *usage = SVGA3D_DECLUSAGE_PSIZE;
+      break;
+   case TGSI_SEMANTIC_GENERIC:   
+      *idx = semantic.SemanticIndex + 1; /* texcoord[0] is reserved for fog */
+      *usage = SVGA3D_DECLUSAGE_TEXCOORD;
+      break;
+   case TGSI_SEMANTIC_NORMAL:    
+      *idx = semantic.SemanticIndex;
+      *usage = SVGA3D_DECLUSAGE_NORMAL;
+      break;
+   default:
+      assert(0);                         /* unhandled semantic */
+      *usage = SVGA3D_DECLUSAGE_TEXCOORD; /* placeholder so the outputs are defined */
+      *idx = 0;
+      return FALSE;
+   }
+   /* every handled semantic breaks out of the switch and ends up here */
+   return TRUE;
+}
+
+
+static boolean emit_decl( struct svga_shader_emitter *emit,
+                          SVGA3dShaderDestToken reg,
+                          unsigned usage, 
+                          unsigned index )
+{
+   /* Emit a DCL instruction declaring `reg` with the given usage and
+    * usage index; FALSE if the opcode or operand dwords fail to emit. */
+   const SVGA3dShaderInstToken opcode = inst_token( SVGA3DOP_DCL );
+   SVGA3DOpDclArgs dcl;
+
+   dcl.values[0] = 0;
+   dcl.values[1] = 0;
+   dcl.dst = reg;
+   dcl.usage = usage;
+   dcl.index = index;
+   dcl.values[0] |= 1u << 31;   /* unsigned: 1<<31 overflows a signed int */
+
+   return  (emit_instruction(emit, opcode) &&
+            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
+}
+
+static boolean emit_vface_decl( struct svga_shader_emitter *emit )
+{
+   SVGA3dShaderDestToken face;
+   /* The face register is declared at most once per emitter. */
+   if (emit->emitted_vface)
+      return TRUE;
+
+   face = dst_register( SVGA3DREG_MISCTYPE, SVGA3DMISCREG_FACE );
+   if (!emit_decl( emit, face, 0, 0 ))
+      return FALSE;
+
+   emit->emitted_vface = TRUE;
+   return TRUE;
+}
+/* Declare fragment shader input `idx` for SM3.0, recording its hardware register in emit->input_map (FACE is tracked separately). */
+static boolean ps30_input( struct svga_shader_emitter *emit,
+                           struct tgsi_declaration_semantic semantic,
+                           unsigned idx )
+{
+   unsigned usage, index;
+   SVGA3dShaderDestToken reg;
+
+   if (semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
+      emit->input_map[idx] = src_register( SVGA3DREG_MISCTYPE,
+                                           SVGA3DMISCREG_POSITION );
+      /* only x and y are declared below, so z and w reads are pointed at y */
+      emit->input_map[idx].base.swizzle = TRANSLATE_SWIZZLE( TGSI_SWIZZLE_X,
+                                                             TGSI_SWIZZLE_Y,
+                                                             TGSI_SWIZZLE_Y,
+                                                             TGSI_SWIZZLE_Y );
+
+      reg = writemask( dst(emit->input_map[idx]),
+                       TGSI_WRITEMASK_XY );
+
+      return emit_decl( emit, reg, 0, 0 );
+   }
+   else if (emit->key.fkey.light_twoside &&
+            (semantic.SemanticName == TGSI_SEMANTIC_COLOR)) {
+      /* two-sided lighting: declare the COLOR input, its BCOLOR twin and the face register */
+      if (!translate_vs_ps_semantic( semantic, &usage, &index ))
+         return FALSE;
+
+      emit->internal_color_idx[emit->internal_color_count] = idx;
+      emit->input_map[idx] = src_register( SVGA3DREG_INPUT, emit->ps30_input_count );
+      emit->ps30_input_count++;
+      emit->internal_color_count++;
+
+      reg = dst( emit->input_map[idx] );
+
+      if (!emit_decl( emit, reg, usage, index ))
+         return FALSE;
+      /* the BCOLOR twin takes the next input register; it is not entered in input_map */
+      semantic.SemanticName = TGSI_SEMANTIC_BCOLOR;
+      if (!translate_vs_ps_semantic( semantic, &usage, &index ))
+         return FALSE;
+
+      reg = dst_register( SVGA3DREG_INPUT, emit->ps30_input_count++ );
+
+      if (!emit_decl( emit, reg, usage, index ))
+         return FALSE;
+
+      if (!emit_vface_decl( emit ))
+         return FALSE;
+
+      return TRUE;
+   }
+   else if (semantic.SemanticName == TGSI_SEMANTIC_FACE) {
+      if (!emit_vface_decl( emit ))
+         return FALSE;
+      emit->emit_frontface = TRUE;          /* no input_map entry is written for FACE */
+      emit->internal_frontface_idx = idx;   /* remember which TGSI input it was */
+      return TRUE;
+   }
+   else {
+      /* every other semantic takes the next free input register */
+      if (!translate_vs_ps_semantic( semantic, &usage, &index ))
+         return FALSE;
+
+      emit->input_map[idx] = src_register( SVGA3DREG_INPUT, emit->ps30_input_count++ );
+      reg = dst( emit->input_map[idx] );
+
+      return emit_decl( emit, reg, usage, index );
+   }
+
+}
+
+
+/* PS output registers are the same as 2.0
+ */
+static boolean ps30_output( struct svga_shader_emitter *emit,
+                            struct tgsi_declaration_semantic semantic,
+                            unsigned idx )
+{
+   /* Records where TGSI output `idx` lives in emit->output_map; no DCL
+    * is emitted here.  Always returns TRUE. */
+   switch (semantic.SemanticName) {
+   case TGSI_SEMANTIC_COLOR:
+      emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 
+                                            semantic.SemanticIndex );
+      break;
+   case TGSI_SEMANTIC_POSITION:
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_pos = emit->output_map[idx];
+      emit->true_pos = dst_register( SVGA3DREG_DEPTHOUT, 
+                                     semantic.SemanticIndex );
+      break;
+   default:
+      assert(0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 0 );
+      break;
+   }
+
+   return TRUE;
+}
+
+
+/* Map VS input `idx`.  Zero-stride attributes are read from a constant
+ * register; all others get a DCL with the same made-up semantics as 2.0. */
+static boolean vs30_input( struct svga_shader_emitter *emit,
+                           struct tgsi_declaration_semantic semantic,
+                           unsigned idx )
+{
+   SVGA3DOpDclArgs dcl;
+   SVGA3dShaderInstToken opcode;
+   unsigned usage, index;
+
+   opcode = inst_token( SVGA3DOP_DCL );
+   dcl.values[0] = 0;
+   dcl.values[1] = 0;
+
+   if (emit->key.vkey.zero_stride_vertex_elements & (1u << idx)) {
+      unsigned i;
+      unsigned offset = 0;
+      unsigned start_idx = emit->info.file_max[TGSI_FILE_CONSTANT] + 1;
+      /* adjust for prescale constants */
+      start_idx += emit->key.vkey.need_prescale ? 2 : 0;
+      /* compute the offset from the start of zero stride constants */
+      for (i = 0; i < PIPE_MAX_ATTRIBS && i < idx; ++i) {
+         if (emit->key.vkey.zero_stride_vertex_elements & (1u << i))
+            ++offset;
+      }
+      emit->input_map[idx] = src_register( SVGA3DREG_CONST,
+                                           start_idx + offset );
+   } else {
+      emit->input_map[idx] = src_register( SVGA3DREG_INPUT, idx );
+      dcl.dst = dst_register( SVGA3DREG_INPUT, idx );
+
+      assert(dcl.dst.reserved0);
+
+      svga_generate_vdecl_semantics( idx, &usage, &index );
+
+      dcl.usage = usage;
+      dcl.index = index;
+      dcl.values[0] |= 1u << 31;   /* unsigned: 1<<31 overflows a signed int */
+
+      return  (emit_instruction(emit, opcode) &&
+               svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
+   }
+   return TRUE;   /* zero-stride inputs need no declaration */
+}
+
+/* VS3.0 outputs have proper declarations and semantic info for
+ * matching against PS inputs.  Position and point size are redirected
+ * to temps; the real registers are kept in true_pos / true_psiz. */
+static boolean vs30_output( struct svga_shader_emitter *emit,
+                         struct tgsi_declaration_semantic semantic,
+                         unsigned idx )
+{
+   SVGA3DOpDclArgs dcl;
+   SVGA3dShaderInstToken opcode;
+   unsigned usage, index;
+
+   opcode = inst_token( SVGA3DOP_DCL );
+   dcl.values[0] = 0;
+   dcl.values[1] = 0;
+
+   if (!translate_vs_ps_semantic( semantic, &usage, &index ))
+      return FALSE;
+
+   dcl.dst = dst_register( SVGA3DREG_OUTPUT, idx );
+   dcl.usage = usage;
+   dcl.index = index;
+   dcl.values[0] |= 1u << 31;   /* unsigned: 1<<31 overflows a signed int */
+
+   if (semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
+      assert(idx == 0);
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_pos = emit->output_map[idx];
+      emit->true_pos = dcl.dst;
+   }
+   else if (semantic.SemanticName == TGSI_SEMANTIC_PSIZE) {
+      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++ );
+      emit->temp_psiz = emit->output_map[idx];
+
+      /* This has the effect of not declaring psiz (below) and not 
+       * emitting the final MOV to true_psiz in the postamble.
+       */
+      if (!emit->key.vkey.allow_psiz)
+         return TRUE;
+
+      emit->true_psiz = dcl.dst;
+   }
+   else {
+      emit->output_map[idx] = dcl.dst;
+   }
+
+   /* the DCL always names the real output register, never the temp */
+   return  (emit_instruction(emit, opcode) &&
+            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
+}
+
+static boolean ps30_sampler( struct svga_shader_emitter *emit,
+                          struct tgsi_declaration_semantic semantic,
+                          unsigned idx )
+{
+   /* Declare sampler `idx` with the sampler type derived from the
+    * fragment compile key.  `semantic` is not used. */
+   const SVGA3dShaderInstToken opcode = inst_token( SVGA3DOP_DCL );
+   SVGA3DOpDclArgs dcl;
+
+   dcl.values[0] = 0;
+   dcl.values[1] = 0;
+   dcl.dst = dst_register( SVGA3DREG_SAMPLER, idx );
+   dcl.type = svga_tgsi_sampler_type( emit, idx );
+   dcl.values[0] |= 1u << 31;   /* unsigned: 1<<31 overflows a signed int */
+
+   return  (emit_instruction(emit, opcode) &&
+            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
+}
+
+
+boolean svga_translate_decl_sm30( struct svga_shader_emitter *emit,
+                             const struct tgsi_full_declaration *decl )
+{
+   /* Translate one TGSI declaration into SM3.0 register declarations.
+    *
+    * Each register index in the declared range [First, Last] is handled
+    * on its own, dispatching on the register file and on the shader
+    * stage in emit->unit.  Returns FALSE as soon as a per-register
+    * helper fails, TRUE once the whole range has been processed.
+    */
+   const unsigned first = decl->DeclarationRange.First;
+   const unsigned last = decl->DeclarationRange.Last;
+   unsigned idx;
+
+   for( idx = first; idx <= last; idx++ ) {
+      boolean ok;
+
+      switch (decl->Declaration.File) {
+      case TGSI_FILE_SAMPLER:
+         assert (emit->unit == PIPE_SHADER_FRAGMENT);
+         ok = ps30_sampler( emit, decl->Semantic, idx );
+         break;
+
+      case TGSI_FILE_INPUT:
+         if (emit->unit == PIPE_SHADER_VERTEX)
+            ok = vs30_input( emit, decl->Semantic, idx );
+         else
+            ok = ps30_input( emit, decl->Semantic, idx );
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         if (emit->unit == PIPE_SHADER_VERTEX)
+            ok = vs30_output( emit, decl->Semantic, idx );
+         else
+            ok = ps30_output( emit, decl->Semantic, idx );
+         break;
+
+      default:
+         /* don't need to declare other vars */
+         ok = TRUE;
+      }
+
+      if (!ok)
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
new file mode 100644
index 00000000000..2557824293e
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -0,0 +1,345 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_TGSI_EMIT_H
+#define SVGA_TGSI_EMIT_H
+
+#include "tgsi/tgsi_scan.h"
+#include "svga_hw_reg.h"
+#include "svga_tgsi.h"
+#include "svga3d_shaderdefs.h"
+
+struct src_register
+{
+   SVGA3dShaderSrcToken base;       /* the register token itself */
+   SVGA3dShaderSrcToken indirect;   /* second token, emitted only when base.relAddr is set */
+};
+
+
+struct svga_arl_consts {
+   int number;    /* subtracted from the register number of an indirect source */
+   int idx;
+   int swizzle;
+   int arl_num;   /* matched against emit->current_arl to find the entry in effect */
+};
+
+/* Internal functions:
+ */
+
+struct svga_shader_emitter
+{
+   boolean use_sm30;
+   
+   unsigned size;
+   char *buf;
+   char *ptr;
+
+   union svga_compile_key key;   /* the vkey / fkey variants are consulted by the decl code */
+   struct tgsi_shader_info info;
+   int unit;                     /* compared against PIPE_SHADER_VERTEX / PIPE_SHADER_FRAGMENT */
+
+   int imm_start;                /* added to a TGSI immediate index to form its constant register number */
+
+   int nr_hw_const;
+   int nr_hw_temp;               /* temps claimed so far; get_temp() numbers internal temps after these */
+   
+   int insn_offset;
+
+   int internal_temp_count;      /* internal temps currently handed out by get_temp() */
+   int internal_imm_count;
+
+   int internal_color_idx[2]; /* diffuse, specular */
+   int internal_color_count;     /* entries used in internal_color_idx */
+
+   boolean emitted_vface;        /* set once the face register has been declared */
+   boolean emit_frontface;       /* set when a TGSI FACE input was declared */
+   int internal_frontface_idx;   /* TGSI input index of that FACE input */
+
+   int ps30_input_count;         /* next free PS3.0 input register number */
+
+   boolean in_main_func;
+
+   boolean created_zero_immediate;
+   int zero_immediate_idx;
+
+   boolean created_loop_const;
+   int loop_const_idx;
+
+   boolean created_sincos_consts;
+   int sincos_consts_idx;
+
+   unsigned label[32];
+   unsigned nr_labels;
+
+   struct src_register input_map[PIPE_MAX_ATTRIBS];       /* TGSI input index -> source register, filled at decl time */
+   SVGA3dShaderDestToken output_map[PIPE_MAX_ATTRIBS];    /* TGSI output index -> destination register, filled at decl time */
+
+   struct src_register imm_0055;
+   SVGA3dShaderDestToken temp_pos;    /* temp that stands in for the position/depth output */
+   SVGA3dShaderDestToken true_pos;    /* the real position/depth output register */
+
+   SVGA3dShaderDestToken temp_col[PIPE_MAX_COLOR_BUFS];
+   SVGA3dShaderDestToken true_col[PIPE_MAX_COLOR_BUFS];
+
+   SVGA3dShaderDestToken temp_psiz;   /* temp that stands in for the point size output */
+   SVGA3dShaderDestToken true_psiz;   /* the real point size output; only set when vkey.allow_psiz */
+
+   struct svga_arl_consts arl_consts[12];
+   int num_arl_consts;                /* entries used in arl_consts */
+   int current_arl;                   /* selects which arl_consts entries apply */
+};
+
+
+boolean svga_shader_emit_dword( struct svga_shader_emitter *emit,
+                                unsigned dword );
+
+boolean svga_shader_emit_dwords( struct svga_shader_emitter *emit,
+                                 const unsigned *dwords,
+                                 unsigned nr );
+
+boolean svga_shader_emit_opcode( struct svga_shader_emitter *emit,
+                                 unsigned opcode );
+
+boolean svga_shader_emit_instructions( struct svga_shader_emitter *emit,
+                                       const struct tgsi_token *tokens );
+/* Emit the SM2.0 register declarations for one TGSI declaration; FALSE on failure. */
+boolean svga_translate_decl_sm20( struct svga_shader_emitter *emit,
+                               const struct tgsi_full_declaration *decl );
+/* SM3.0 counterpart of svga_translate_decl_sm20(). */
+boolean svga_translate_decl_sm30( struct svga_shader_emitter *emit,
+                               const struct tgsi_full_declaration *decl );
+
+/* Emit one destination token; asserts that its reserved0 bit is set. */
+static INLINE boolean emit_dst( struct svga_shader_emitter *emit,
+                         SVGA3dShaderDestToken dest )
+{
+   assert(dest.reserved0);
+   return svga_shader_emit_dword( emit, dest.value );
+}
+
+static INLINE boolean emit_src( struct svga_shader_emitter *emit,
+                         const struct src_register src )
+{
+   /* The base token is always written; the indirect token follows it
+    * only when the source uses relative addressing.
+    */
+   assert(src.base.reserved0);
+   if (!src.base.relAddr)
+      return svga_shader_emit_dword( emit, src.base.value );
+
+   assert(src.indirect.reserved0);
+   return (svga_shader_emit_dword( emit, src.base.value ) &&
+           svga_shader_emit_dword( emit, src.indirect.value ));
+}
+
+/* Emit an instruction's opcode token via svga_shader_emit_opcode(). */
+static INLINE boolean emit_instruction( struct svga_shader_emitter *emit,
+                                 SVGA3dShaderInstToken opcode )
+{
+   return svga_shader_emit_opcode( emit, opcode.value );
+}
+
+/* Emit a one-source instruction: opcode, destination, source.  Stops at the first token that fails. */
+static INLINE boolean emit_op1( struct svga_shader_emitter *emit,
+                         SVGA3dShaderInstToken inst,
+                         SVGA3dShaderDestToken dest,
+                         struct src_register src0 )
+{
+   return (emit_instruction( emit, inst ) &&
+           emit_dst( emit, dest ) &&
+           emit_src( emit, src0 ));
+}
+/* Emit a two-source instruction: opcode, destination, src0, src1.  Stops at the first token that fails. */
+static INLINE boolean emit_op2( struct svga_shader_emitter *emit,
+                     SVGA3dShaderInstToken inst,
+                     SVGA3dShaderDestToken dest,
+                     struct src_register src0,
+                     struct src_register src1 )
+{
+   return (emit_instruction( emit, inst ) &&
+           emit_dst( emit, dest ) &&
+           emit_src( emit, src0 ) &&
+           emit_src( emit, src1 ));
+}
+/* Emit a three-source instruction: opcode, destination, src0, src1, src2.  Stops at the first token that fails. */
+static INLINE boolean emit_op3( struct svga_shader_emitter *emit,
+                         SVGA3dShaderInstToken inst,
+                         SVGA3dShaderDestToken dest,
+                         struct src_register src0,
+                         struct src_register src1,
+                         struct src_register src2 )
+{
+   return (emit_instruction( emit, inst ) &&
+           emit_dst( emit, dest ) &&
+           emit_src( emit, src0 ) &&
+           emit_src( emit, src1 ) &&
+           emit_src( emit, src2 ));
+}
+
+/* Pack four component selectors into one swizzle value: x in the low two bits, then y, z and w each two bits higher. */
+#define TRANSLATE_SWIZZLE(x,y,z,w)  ((x) | ((y) << 2) | ((z) << 4) | ((w) << 6))
+#define SWIZZLE_XYZW  \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_X,TGSI_SWIZZLE_Y,TGSI_SWIZZLE_Z,TGSI_SWIZZLE_W)
+#define SWIZZLE_XXXX  \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_X,TGSI_SWIZZLE_X,TGSI_SWIZZLE_X,TGSI_SWIZZLE_X)
+#define SWIZZLE_YYYY  \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_Y,TGSI_SWIZZLE_Y,TGSI_SWIZZLE_Y,TGSI_SWIZZLE_Y)
+#define SWIZZLE_ZZZZ  \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_Z,TGSI_SWIZZLE_Z,TGSI_SWIZZLE_Z,TGSI_SWIZZLE_Z)
+#define SWIZZLE_WWWW  \
+ TRANSLATE_SWIZZLE(TGSI_SWIZZLE_W,TGSI_SWIZZLE_W,TGSI_SWIZZLE_W,TGSI_SWIZZLE_W)
+
+
+/* Build an instruction token that carries only `opcode`; every other field is zero. */
+static INLINE SVGA3dShaderInstToken
+inst_token( unsigned opcode )
+{
+   SVGA3dShaderInstToken inst;
+
+   inst.value = 0;
+   inst.op = opcode;
+
+   return inst;
+}
+/* Build a destination token for register `number` of type `file`, with mask 0xf and no modifier or shift. */
+static INLINE SVGA3dShaderDestToken 
+dst_register( unsigned file,
+              int number )
+{
+   SVGA3dShaderDestToken dest;
+
+   dest.value = 0;
+   dest.num = number;
+   dest.type_upper = file >> 3;     /* the register type is split: bits above the low three go here */
+   dest.relAddr = 0;
+   dest.reserved1 = 0;
+   dest.mask = 0xf;                 /* all four mask bits set */
+   dest.dstMod = 0;
+   dest.shfScale = 0;
+   dest.type_lower = file & 0x7;    /* low three bits of the register type */
+   dest.reserved0 = 1;          /* is_reg */
+   
+   return dest;
+}
+/* Return dest with its write mask restricted to the bits that are also set in `mask`. */
+static INLINE SVGA3dShaderDestToken
+writemask( SVGA3dShaderDestToken dest,
+           unsigned mask )
+{
+   dest.mask &= mask;
+   return dest;
+}
+
+/* Build a source token for register `number` of type `file`, with the XYZW swizzle and no modifier. */
+static INLINE SVGA3dShaderSrcToken 
+src_token( unsigned file, int number )
+{
+   SVGA3dShaderSrcToken src;
+
+   src.value = 0;
+   src.num = number;
+   src.type_upper = file >> 3;      /* the register type is split: bits above the low three go here */
+   src.relAddr = 0;
+   src.reserved1 = 0;
+   src.swizzle = SWIZZLE_XYZW;
+   src.srcMod = 0;
+   src.type_lower = file & 0x7;     /* low three bits of the register type */
+   src.reserved0 = 1;           /* is_reg */
+
+   return src;
+}
+
+/* Return src with its modifier set to ABS; any previous modifier (a negate included) is overwritten. */
+static INLINE struct src_register 
+absolute( struct src_register src )
+{
+   src.base.srcMod = SVGA3DSRCMOD_ABS;
+
+   return src;
+}
+
+/* Toggle negation in src's modifier: NONE <-> NEG and ABS <-> ABSNEG; any other modifier is returned unchanged. */
+static INLINE struct src_register 
+negate( struct src_register src )
+{
+   switch (src.base.srcMod) {
+   case SVGA3DSRCMOD_ABS:
+      src.base.srcMod = SVGA3DSRCMOD_ABSNEG;
+      break;
+   case SVGA3DSRCMOD_ABSNEG:
+      src.base.srcMod = SVGA3DSRCMOD_ABS;
+      break;
+   case SVGA3DSRCMOD_NEG:
+      src.base.srcMod = SVGA3DSRCMOD_NONE;
+      break;
+   case SVGA3DSRCMOD_NONE:
+      src.base.srcMod = SVGA3DSRCMOD_NEG;
+      break;
+   }
+   return src;
+}
+
+/* Build a src_register whose base token names register `number` of type `file`; the indirect token is zeroed. */
+static INLINE struct src_register 
+src_register( unsigned file, int number )
+{
+   struct src_register src;
+   
+   src.base = src_token( file, number );
+   src.indirect.value = 0;
+
+   return src;
+}
+/* Destination token for the register that src names; src's swizzle and modifier are not carried over. */
+static INLINE SVGA3dShaderDestToken dst( struct src_register src )
+{
+   return dst_register( SVGA3dShaderGetRegType( src.base.value ),
+                        src.base.num );
+}
+/* Source register for the register that dst names; dst's write mask and modifier are not carried over. */
+static INLINE struct src_register src( SVGA3dShaderDestToken dst )
+{
+   return src_register( SVGA3dShaderGetRegType( dst.value ),
+                        dst.num );
+}
+/* Pick the SVGA3D sampler type for sampler `idx` from the texture target stored in the fragment compile key. */
+static INLINE ubyte svga_tgsi_sampler_type( struct svga_shader_emitter *emit,
+                                            int idx )
+{
+   switch (emit->key.fkey.tex[idx].texture_target) {
+   case PIPE_TEXTURE_1D:
+      return SVGA3DSAMP_2D;      /* 1D targets are declared as 2D samplers */
+   case PIPE_TEXTURE_2D:
+      return SVGA3DSAMP_2D;
+   case PIPE_TEXTURE_3D:
+      return SVGA3DSAMP_VOLUME;
+   case PIPE_TEXTURE_CUBE:
+      return SVGA3DSAMP_CUBE;
+   }
+
+   return SVGA3DSAMP_UNKNOWN;    /* any target not listed above */
+}
+
+#endif
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
new file mode 100644
index 00000000000..ea409b7e165
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -0,0 +1,2716 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_memory.h"
+
+#include "svga_tgsi_emit.h"
+#include "svga_context.h"
+
+
+static boolean emit_vs_postamble( struct svga_shader_emitter *emit );
+static boolean emit_ps_postamble( struct svga_shader_emitter *emit );
+
+
+
+/* Map a TGSI opcode to the SVGA3D opcode listed below; any other opcode is logged, asserts and yields SVGA3DOP_LAST_INST. */
+static unsigned
+translate_opcode(
+   uint opcode )
+{
+   switch (opcode) {
+   case TGSI_OPCODE_ABS:        return SVGA3DOP_ABS;
+   case TGSI_OPCODE_ADD:        return SVGA3DOP_ADD;
+   case TGSI_OPCODE_BREAKC:     return SVGA3DOP_BREAKC;
+   case TGSI_OPCODE_DDX:        return SVGA3DOP_DSX;
+   case TGSI_OPCODE_DDY:        return SVGA3DOP_DSY;
+   case TGSI_OPCODE_DP2A:       return SVGA3DOP_DP2ADD;
+   case TGSI_OPCODE_DP3:        return SVGA3DOP_DP3;
+   case TGSI_OPCODE_DP4:        return SVGA3DOP_DP4;
+   case TGSI_OPCODE_ENDFOR:     return SVGA3DOP_ENDLOOP;
+   case TGSI_OPCODE_FRC:        return SVGA3DOP_FRC;
+   case TGSI_OPCODE_BGNFOR:     return SVGA3DOP_LOOP;
+   case TGSI_OPCODE_MAD:        return SVGA3DOP_MAD;
+   case TGSI_OPCODE_MAX:        return SVGA3DOP_MAX;
+   case TGSI_OPCODE_MIN:        return SVGA3DOP_MIN;
+   case TGSI_OPCODE_MOV:        return SVGA3DOP_MOV;
+   case TGSI_OPCODE_MUL:        return SVGA3DOP_MUL;
+   case TGSI_OPCODE_NOP:        return SVGA3DOP_NOP;
+   case TGSI_OPCODE_NRM4:       return SVGA3DOP_NRM;
+   case TGSI_OPCODE_SSG:        return SVGA3DOP_SGN;
+   default:
+      debug_printf("Unkown opcode %u\n", opcode);
+      assert( 0 );
+      return SVGA3DOP_LAST_INST;   /* sentinel returned for an unmapped opcode */
+   }
+}
+
+/* Map a TGSI register file to an SVGA3D register type; an unknown file asserts and falls back to TEMP. */
+static unsigned translate_file( unsigned file )
+{
+   switch (file) {
+   case TGSI_FILE_TEMPORARY: return SVGA3DREG_TEMP;
+   case TGSI_FILE_INPUT:     return SVGA3DREG_INPUT;
+   case TGSI_FILE_OUTPUT:    return SVGA3DREG_OUTPUT; /* VS3.0+ only */
+   case TGSI_FILE_IMMEDIATE: return SVGA3DREG_CONST;  /* immediates share the constant register type */
+   case TGSI_FILE_CONSTANT:  return SVGA3DREG_CONST;
+   case TGSI_FILE_SAMPLER:   return SVGA3DREG_SAMPLER;
+   case TGSI_FILE_ADDRESS:   return SVGA3DREG_ADDR;
+   default:
+      assert( 0 );
+      return SVGA3DREG_TEMP;
+   }
+}
+
+
+
+
+
+/* Build the SVGA3D destination token for destination operand `idx` of a TGSI instruction. */
+static SVGA3dShaderDestToken 
+translate_dst_register( struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn,
+                        unsigned idx )
+{
+   const struct tgsi_full_dst_register *reg = &insn->FullDstRegisters[idx];
+   SVGA3dShaderDestToken dest;
+
+   switch (reg->DstRegister.File) {
+   case TGSI_FILE_OUTPUT:
+      /* Output registers encode semantic information in their name.
+       * Need to lookup a table built at decl time:
+       */
+      dest = emit->output_map[reg->DstRegister.Index];
+      break;
+
+   default:
+      dest = dst_register( translate_file( reg->DstRegister.File ),
+                           reg->DstRegister.Index );
+      break;
+   }
+
+   dest.mask = reg->DstRegister.WriteMask;   /* replaces whatever mask the looked-up token carried */
+
+   if (insn->Instruction.Saturate) 
+      dest.dstMod = SVGA3DDSTMOD_SATURATE;
+
+   return dest;
+}
+
+/* Apply the swizzle (x,y,z,w) on top of src's existing swizzle: each requested component selects through the old one. */
+static struct src_register 
+swizzle( struct src_register src,
+         int x,
+         int y,
+         int z,
+         int w )
+{
+   x = (src.base.swizzle >> (x * 2)) & 0x3;   /* what the old swizzle selects for component x */
+   y = (src.base.swizzle >> (y * 2)) & 0x3;
+   z = (src.base.swizzle >> (z * 2)) & 0x3;
+   w = (src.base.swizzle >> (w * 2)) & 0x3;
+
+   src.base.swizzle = TRANSLATE_SWIZZLE(x,y,z,w);
+
+   return src;
+}
+/* Return src swizzled so that all four channels read component `comp`. */
+static struct src_register
+scalar( struct src_register src,
+        int comp )
+{
+   return swizzle( src, comp, comp, comp, comp );
+}
+
+static INLINE boolean
+svga_arl_needs_adjustment( const struct svga_shader_emitter *emit )
+{
+   int slot = 0;
+   /* scan the recorded ARL constants for one tied to the current ARL */
+   while (slot < emit->num_arl_consts) {
+      if (emit->arl_consts[slot++].arl_num == emit->current_arl)
+         return TRUE;
+   }
+   return FALSE;
+}
+
+static INLINE int
+svga_arl_adjustment( const struct svga_shader_emitter *emit )
+{
+   int slot = 0;
+
+   /* the first entry recorded for the current ARL supplies the offset */
+   for (; slot < emit->num_arl_consts; slot++)
+      if (emit->arl_consts[slot].arl_num == emit->current_arl)
+         return emit->arl_consts[slot].number;
+   return 0;
+}
+/* Build the SVGA3D source register (base token plus optional indirect token) for one TGSI source operand. */
+static struct src_register 
+translate_src_register( const struct svga_shader_emitter *emit,
+                        const struct tgsi_full_src_register *reg )
+{
+   struct src_register src;
+
+   switch (reg->SrcRegister.File) {
+   case TGSI_FILE_INPUT:
+      /* Input registers are referred to by their semantic name rather
+       * than by index.  Use the mapping built up from the decls:
+       */
+      src = emit->input_map[reg->SrcRegister.Index];
+      break;
+       
+   case TGSI_FILE_IMMEDIATE:
+      /* Immediates are appended after TGSI constants in the D3D
+       * constant buffer.
+       */
+      src = src_register( translate_file( reg->SrcRegister.File ),
+                          reg->SrcRegister.Index + 
+                          emit->imm_start );
+      break;
+
+   default:
+      src = src_register( translate_file( reg->SrcRegister.File ),
+                          reg->SrcRegister.Index );
+
+      break;
+   }
+
+   /* Indirect addressing (for constant buffer lookups only)
+    */
+   if (reg->SrcRegister.Indirect)
+   {
+      /* we shift the offset towards the minimum */
+      if (svga_arl_needs_adjustment( emit )) {
+         src.base.num -= svga_arl_adjustment( emit );
+      }
+      src.base.relAddr = 1;
+
+      /* Not really sure what should go in the second token:
+       */
+      src.indirect = src_token( SVGA3DREG_ADDR,
+                                reg->SrcRegisterInd.Index );
+
+      src.indirect.swizzle = SWIZZLE_XXXX;
+   }
+   /* the TGSI swizzle is composed with any swizzle already present (e.g. from input_map) */
+   src = swizzle( src,
+                  reg->SrcRegister.SwizzleX,
+                  reg->SrcRegister.SwizzleY,
+                  reg->SrcRegister.SwizzleZ,
+                  reg->SrcRegister.SwizzleW );
+
+   /* src.mod isn't a bitfield, unfortunately:
+    * See tgsi_util_get_full_src_register_sign_mode for implementation details.
+    */
+   if (reg->SrcRegisterExtMod.Absolute) {
+      if (reg->SrcRegisterExtMod.Negate)
+         src.base.srcMod = SVGA3DSRCMOD_ABSNEG;
+      else
+         src.base.srcMod = SVGA3DSRCMOD_ABS;
+   }
+   else {
+      if (reg->SrcRegister.Negate != reg->SrcRegisterExtMod.Negate)   /* the two negate flags cancel each other */
+         src.base.srcMod = SVGA3DSRCMOD_NEG;
+      else
+         src.base.srcMod = SVGA3DSRCMOD_NONE;
+   }
+
+   return src;
+}
+
+
+/*
+ * Get the next internal temporary register (numbered after the nr_hw_temp temps already claimed).
+ */
+static INLINE SVGA3dShaderDestToken 
+get_temp( struct svga_shader_emitter *emit )
+{
+   int i = emit->nr_hw_temp + emit->internal_temp_count++;
+   /* NOTE(review): there is no exhaustion check here; -1 is never returned. */
+   return dst_register( SVGA3DREG_TEMP, i );
+}
+
+/* Release a single temp.  Currently only effective if it was the last
+ * allocated temp, otherwise release will be delayed until the next
+ * call to reset_temp_regs().
+ */
+static INLINE void 
+release_temp( struct svga_shader_emitter *emit,
+              SVGA3dShaderDestToken temp )
+{
+   if (temp.num == emit->internal_temp_count - 1)   /* NOTE(review): get_temp() numbers temps from nr_hw_temp, which this test omits -- confirm */
+      emit->internal_temp_count--;
+}
+/* Forget every internal temp handed out by get_temp(); numbering restarts right after nr_hw_temp. */
+static void reset_temp_regs( struct svga_shader_emitter *emit )
+{
+   emit->internal_temp_count = 0;
+}
+   
+/* Emit an instruction that has a destination but no source operands. */
+static boolean submit_op0( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest )
+{
+   return (emit_instruction( emit, inst ) && 
+           emit_dst( emit, dest ));
+}
+/* Emit a one-source instruction; no operand fix-up is done here, it simply forwards to emit_op1(). */
+static boolean submit_op1( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest,
+                           struct src_register src0 )
+{
+   return emit_op1( emit, inst, dest, src0 );
+}
+
+
+/* SVGA shaders may not refer to >1 constant register in a single
+ * instruction.  This function checks for that usage and inserts a
+ * move to temporary if detected.
+ *
+ * The same applies to input registers -- at most a single input
+ * register may be read by any instruction.
+ */
+static boolean submit_op2( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest,
+                           struct src_register src0,
+                           struct src_register src1 )
+{
+   SVGA3dShaderDestToken temp;
+   SVGA3dShaderRegType type0, type1;
+   boolean need_temp = FALSE;
+
+   temp.value = 0;
+   type0 = SVGA3dShaderGetRegType( src0.base.value );
+   type1 = SVGA3dShaderGetRegType( src1.base.value );
+   /* both sources are constants with different register numbers */
+   if (type0 == SVGA3DREG_CONST &&
+       type1 == SVGA3DREG_CONST &&
+       src0.base.num != src1.base.num)
+      need_temp = TRUE;
+   /* both sources are inputs with different register numbers */
+   if (type0 == SVGA3DREG_INPUT &&
+       type1 == SVGA3DREG_INPUT &&
+       src0.base.num != src1.base.num)
+      need_temp = TRUE;
+
+   if (need_temp)
+   {
+      temp = get_temp( emit );
+      /* copy src0 (its swizzle and modifier are applied by the MOV) into the temp */
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp, src0 ))
+         return FALSE;
+
+      src0 = src( temp );   /* read the plain temp from here on */
+   }
+
+   if (!emit_op2( emit, inst, dest, src0, src1 ))
+      return FALSE;
+
+   if (need_temp)
+      release_temp( emit, temp );
+
+   return TRUE;
+}
+
+
+/* SVGA shaders may not refer to >1 constant register in a single
+ * instruction, nor to >1 input register.  This function checks the
+ * three sources pairwise for that usage and inserts moves to
+ * temporaries where it is detected: src0 is copied if it conflicts
+ * with src1 or src2, and src1 is copied if it conflicts with src2.
+ *
+ * The constant check is skipped for SVGA3DOP_SINCOS; the input check
+ * always applies.  Returns FALSE as soon as any emit step fails.
+ */
+static boolean submit_op3( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest,
+                           struct src_register src0,
+                           struct src_register src1,
+                           struct src_register src2 )
+{
+   SVGA3dShaderRegType file0, file1, file2;
+   SVGA3dShaderDestToken scratch0, scratch1;
+   boolean spill0 = FALSE, spill1 = FALSE;
+   boolean differ01, differ02, differ12, check_consts;
+
+   scratch0.value = 0;
+   scratch1.value = 0;
+   file0 = SVGA3dShaderGetRegType( src0.base.value );
+   file1 = SVGA3dShaderGetRegType( src1.base.value );
+   file2 = SVGA3dShaderGetRegType( src2.base.value );
+
+   /* pairwise "names a different register number" tests */
+   differ01 = (src0.base.num != src1.base.num);
+   differ02 = (src0.base.num != src2.base.num);
+   differ12 = (src1.base.num != src2.base.num);
+   check_consts = (inst.op != SVGA3DOP_SINCOS);
+
+   if (check_consts && file0 == SVGA3DREG_CONST &&
+       ((file1 == SVGA3DREG_CONST && differ01) ||
+        (file2 == SVGA3DREG_CONST && differ02)))
+      spill0 = TRUE;
+   if (check_consts && file1 == SVGA3DREG_CONST &&
+       file2 == SVGA3DREG_CONST && differ12)
+      spill1 = TRUE;
+
+   if (file0 == SVGA3DREG_INPUT &&
+       ((file1 == SVGA3DREG_INPUT && differ01) ||
+        (file2 == SVGA3DREG_INPUT && differ02)))
+      spill0 = TRUE;
+   if (file1 == SVGA3DREG_INPUT && file2 == SVGA3DREG_INPUT && differ12)
+      spill1 = TRUE;
+
+   if (spill0) {
+      scratch0 = get_temp( emit );
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), scratch0, src0 ))
+         return FALSE;
+      src0 = src( scratch0 );
+   }
+
+   if (spill1) {
+      scratch1 = get_temp( emit );
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), scratch1, src1 ))
+         return FALSE;
+      src1 = src( scratch1 );
+   }
+
+   if (!emit_op3( emit, inst, dest, src0, src1, src2 ))
+      return FALSE;
+
+   /* release in reverse order of allocation */
+   if (spill1)
+      release_temp( emit, scratch1 );
+   if (spill0)
+      release_temp( emit, scratch0 );
+   return TRUE;
+}
+
+
+/* Emit a DEF (float) or DEFI (int) instruction giving constant 'idx'
+ * the values a, b, c, d; for DEFI each value is cast to int.
+ * Returns FALSE if emission fails or 'type' is neither of the two
+ * constant types handled here.
+ */
+static boolean emit_def_const( struct svga_shader_emitter *emit,
+                               SVGA3dShaderConstType type,
+                               unsigned idx,
+                               float a, float b, float c, float d )
+{
+   SVGA3DOpDefArgs def;
+   SVGA3dShaderInstToken opcode;
+
+   switch (type) {
+   case SVGA3D_CONST_TYPE_FLOAT:
+      opcode = inst_token( SVGA3DOP_DEF );
+      def.dst = dst_register( SVGA3DREG_CONST, idx );
+      def.constValues[0] = a;
+      def.constValues[1] = b;
+      def.constValues[2] = c;
+      def.constValues[3] = d;
+      break;
+   case SVGA3D_CONST_TYPE_INT:
+      opcode = inst_token( SVGA3DOP_DEFI );
+      def.dst = dst_register( SVGA3DREG_CONSTINT, idx );
+      def.constIValues[0] = (int)a;
+      def.constIValues[1] = (int)b;
+      def.constIValues[2] = (int)c;
+      def.constIValues[3] = (int)d;
+      break;
+   default:
+      /* opcode and def were never filled in: emit nothing */
+      assert(0);
+      return FALSE;
+   }
+
+   return (emit_instruction(emit, opcode) &&
+           svga_shader_emit_dwords( emit, def.values, Elements(def.values)));
+}
+
+/* Reserve the next hardware constant and define it as {0, 0, 0, 1},
+ * recording its index in the emitter on success. */
+static INLINE boolean
+create_zero_immediate( struct svga_shader_emitter *emit )
+{
+   const unsigned slot = emit->nr_hw_const++;
+
+   if (emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT, slot, 0, 0, 0, 1 )) {
+      emit->zero_immediate_idx = slot;
+      emit->created_zero_immediate = TRUE;
+      return TRUE;
+   }
+   return FALSE;
+}
+
+/* Define the integer loop constant in the next free hardware constant. */
+static INLINE boolean
+create_loop_const( struct svga_shader_emitter *emit )
+{
+   const unsigned slot = emit->nr_hw_const++;
+
+   if (emit_def_const( emit, SVGA3D_CONST_TYPE_INT, slot,
+                       255, /* iteration count */
+                       0, /* initial value */
+                       1, /* step size */
+                       0 /* not used, must be 0 */)) {
+      emit->loop_const_idx = slot;
+      emit->created_loop_const = TRUE;
+      return TRUE;
+   }
+   return FALSE;
+}
+
+/* Define the two float constants passed to the SINCOS instruction.
+ * They occupy consecutive hardware constant slots; only the index of
+ * the first one is stored in the emitter.
+ */
+static INLINE boolean
+create_sincos_consts( struct svga_shader_emitter *emit )
+{
+   unsigned slot;
+
+   slot = emit->nr_hw_const++;
+   if (!emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT, slot,
+                        -1.5500992e-006f, -2.1701389e-005f,
+                        0.0026041667f, 0.00026041668f ))
+      return FALSE;
+
+   emit->sincos_consts_idx = slot;
+
+   slot = emit->nr_hw_const++;
+   if (!emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT, slot,
+                        -0.020833334f, -0.12500000f,
+                        1.0f, 0.50000000f ))
+      return FALSE;
+
+   emit->created_sincos_consts = TRUE;
+   return TRUE;
+}
+
+/* Pack the ARL adjustment numbers, four at a time, into float
+ * constants.  Every arl_consts[] entry records the hardware constant
+ * (idx) and the channel (swizzle) that holds its number; channels
+ * left over in the last constant are filled with zero.
+ * Returns FALSE if a constant definition fails to emit.
+ */
+static INLINE boolean
+create_arl_consts( struct svga_shader_emitter *emit )
+{
+   int i;
+
+   for (i = 0; i < emit->num_arl_consts; i += 4) {
+      int j;
+      unsigned idx = emit->nr_hw_const++;
+      float vals[4];
+
+      for (j = 0; j < 4 && (j + i) < emit->num_arl_consts; ++j) {
+         vals[j] = emit->arl_consts[i + j].number;
+         emit->arl_consts[i + j].idx = idx;
+         /* entry i + j sits in channel j of this constant */
+         switch (j) {
+         case 0: emit->arl_consts[i + j].swizzle = TGSI_SWIZZLE_X; break;
+         case 1: emit->arl_consts[i + j].swizzle = TGSI_SWIZZLE_Y; break;
+         case 2: emit->arl_consts[i + j].swizzle = TGSI_SWIZZLE_Z; break;
+         case 3: emit->arl_consts[i + j].swizzle = TGSI_SWIZZLE_W; break;
+         }
+      }
+      while (j < 4)
+         vals[j++] = 0;
+
+      if (!emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT, idx,
+                           vals[0], vals[1],
+                           vals[2], vals[3]))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/* returns the FACE misc register; vface must already have been emitted */
+static INLINE struct src_register
+get_vface( struct svga_shader_emitter *emit )
+{
+   assert(emit->emitted_vface);
+   return src_register( SVGA3DREG_MISCTYPE, SVGA3DMISCREG_FACE );
+}
+
+/* returns the {0, 0, 0, 1} immediate, which must already have been
+ * created by create_zero_immediate() */
+static INLINE struct src_register
+get_zero_immediate( struct svga_shader_emitter *emit )
+{
+   assert(emit->created_zero_immediate);
+   assert(emit->zero_immediate_idx >= 0);
+   return src_register( SVGA3DREG_CONST, emit->zero_immediate_idx );
+}
+
+/* returns the integer loop constant, which must already have been
+ * created by create_loop_const() */
+static INLINE struct src_register
+get_loop_const( struct svga_shader_emitter *emit )
+{
+   assert(emit->created_loop_const);
+   assert(emit->loop_const_idx >= 0);
+   return src_register( SVGA3DREG_CONSTINT, emit->loop_const_idx );
+}
+
+/* returns sincos const 'index' (0 or 1); the pair must already have
+ * been created by create_sincos_consts() */
+static INLINE struct src_register
+get_sincos_const( struct svga_shader_emitter *emit,
+                  unsigned index )
+{
+   assert(emit->created_sincos_consts);
+   assert(emit->sincos_consts_idx >= 0);
+   assert(index == 0 || index == 1);
+   return src_register( SVGA3DREG_CONST, emit->sincos_consts_idx + index );
+}
+
+/* returns the constant channel recorded for the current ARL as a scalar;
+ * the last match wins, and register 0 / swizzle 0 is used if none matches */
+static INLINE struct src_register
+get_fake_arl_const( struct svga_shader_emitter *emit )
+{
+   int idx = 0, swizzle = 0, i;
+
+   for (i = 0; i < emit->num_arl_consts; ++ i) {
+      if (emit->arl_consts[i].arl_num != emit->current_arl)
+         continue;
+      idx = emit->arl_consts[i].idx;
+      swizzle = emit->arl_consts[i].swizzle;
+   }
+
+   return scalar( src_register( SVGA3DREG_CONST, idx ), swizzle );
+}
+
+/* returns the constant register that holds the width/height of the
+ * texture for 'sampler_num', as located through the shader key's
+ * width_height_idx
+ */
+static INLINE struct src_register
+get_tex_dimensions( struct svga_shader_emitter *emit, int sampler_num )
+{
+   /* the width/height indexes start right after constants */
+   const int first_wh = emit->info.file_max[TGSI_FILE_CONSTANT] + 1;
+   const int wh_slot = emit->key.fkey.tex[sampler_num].width_height_idx;
+
+   return src_register( SVGA3DREG_CONST, wh_slot + first_wh );
+}
+
+/* Emit an ARL whose source needs a constant adjustment:
+ *    MOV  TMP, SRC
+ *    ADD  TMP, TMP, <fake arl const>
+ *    MOVA DST, TMP    (with SRC's original swizzle)
+ */
+static boolean emit_fake_arl(struct svga_shader_emitter *emit,
+                             const struct tgsi_full_instruction *insn)
+{
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   const struct src_register adjust = get_fake_arl_const( emit );
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   SVGA3dShaderDestToken tmp = get_temp( emit );
+   struct src_register addr = src( tmp );
+
+   /* replicate the original swizzle */
+   addr.base.swizzle = src0.base.swizzle;
+
+   return (submit_op1( emit, inst_token( SVGA3DOP_MOV ), tmp, src0 ) &&
+           submit_op2( emit, inst_token( SVGA3DOP_ADD ), tmp, src( tmp ),
+                       adjust ) &&
+           submit_op1( emit, inst_token( SVGA3DOP_MOVA ), dst, addr ));
+}
+
+/* IFC_NE  SRC, ZERO.x -- the condition is "SRC != 0" */
+static boolean emit_if(struct svga_shader_emitter *emit,
+                       const struct tgsi_full_instruction *insn)
+{
+   const struct src_register cond = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   const struct src_register zero_x =
+      scalar( get_zero_immediate( emit ), TGSI_SWIZZLE_X );
+   SVGA3dShaderInstToken ifc = inst_token( SVGA3DOP_IFC );
+
+   ifc.control = SVGA3DOPCOMPC_NE;
+   return (emit_instruction( emit, ifc ) &&
+           emit_src( emit, cond ) &&
+           emit_src( emit, zero_x ));
+}
+
+/* emit a bare ENDIF instruction; insn is not examined */
+static boolean emit_endif(struct svga_shader_emitter *emit,
+                          const struct tgsi_full_instruction *insn)
+{
+   return emit_instruction( emit, inst_token( SVGA3DOP_ENDIF ) );
+}
+
+/* emit a bare ELSE instruction; insn is not examined */
+static boolean emit_else(struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn)
+{
+   return emit_instruction( emit, inst_token( SVGA3DOP_ELSE ) );
+}
+
+/* Translate the following TGSI FLR instruction.
+ *    FLR  DST, SRC
+ * To the following SVGA3D instruction sequence.
+ *    FRC  TMP, SRC
+ *    ADD  DST, SRC, -TMP
+ *
+ * i.e. floor(SRC) is computed as SRC minus its fractional part; the
+ * subtraction is expressed as an ADD with a negated source.
+ * Returns FALSE if either instruction fails to emit.
+ */
+static boolean emit_floor(struct svga_shader_emitter *emit,
+                          const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   SVGA3dShaderDestToken frac = get_temp( emit );
+
+   /* FRC  TMP, SRC
+    * ADD  DST, SRC, -TMP
+    */
+   return (submit_op1( emit, inst_token( SVGA3DOP_FRC ), frac, src0 ) &&
+           submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst, src0,
+                       negate( src( frac ) ) ));
+}
+
+
+/* Translate the following TGSI CMP instruction.
+ *    CMP  DST, SRC0, SRC1, SRC2
+ * To the following SVGA3D instruction sequence.
+ *    CMP  DST, SRC0, SRC2, SRC1
+ *
+ * i.e. the last two source operands trade places.
+ */
+static boolean emit_cmp(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   const struct tgsi_full_src_register *regs = insn->FullSrcRegisters;
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register cond = translate_src_register( emit, &regs[0] );
+   const struct src_register src1 = translate_src_register( emit, &regs[1] );
+   const struct src_register src2 = translate_src_register( emit, &regs[2] );
+
+   /* CMP  DST, SRC0, SRC2, SRC1 */
+   return submit_op3( emit, inst_token( SVGA3DOP_CMP ), dst, cond, src2, src1);
+}
+
+
+
+/* Translate the following TGSI DIV instruction.
+ *    DIV  DST.xy, SRC0, SRC1
+ * To the following SVGA3D instruction sequence.
+ *    RCP  TMP.x, SRC1.xxxx
+ *    RCP  TMP.y, SRC1.yyyy
+ *    MUL  DST.xy, SRC0, TMP
+ */
+static boolean emit_div(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   const struct src_register src1 = translate_src_register(
+      emit, &insn->FullSrcRegisters[1] );
+   SVGA3dShaderDestToken recip = get_temp( emit );
+   int chan;
+
+   /* RCP is scalar in SVGA3D, so issue one RCP for each channel that
+    * is enabled in the destination writemask:
+    *
+    * RCP  TMP.?, SRC1.????
+    */
+   for (chan = 0; chan < 4; chan++) {
+      const unsigned bit = 1 << chan;
+
+      if (!(dst.mask & bit))
+         continue;
+
+      if (!submit_op1( emit, inst_token( SVGA3DOP_RCP ),
+                       writemask(recip, bit),
+                       scalar(src1, chan) ))
+         return FALSE;
+   }
+
+   /* Then multiply them out with a single mul:
+    *
+    * MUL  DST, SRC0, TMP
+    */
+   return submit_op2( emit, inst_token( SVGA3DOP_MUL ), dst, src0,
+                      src( recip ) );
+}
+
+/* Translate the following TGSI DP2 instruction.
+ *    DP2  DST, SRC1, SRC2
+ * To the following SVGA3D instruction sequence.
+ *    MUL  TMP, SRC1, SRC2
+ *    ADD  DST, TMP.xxxx, TMP.yyyy
+ *
+ * Only the x and y products feed the final sum.  Returns FALSE if
+ * either instruction fails to emit.
+ */
+static boolean emit_dp2(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   const struct src_register src1 = translate_src_register(
+      emit, &insn->FullSrcRegisters[1] );
+   SVGA3dShaderDestToken prod = get_temp( emit );
+   struct src_register prod_x, prod_y;
+
+   /* MUL  TMP, SRC1, SRC2 */
+   if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ), prod, src0, src1 ))
+      return FALSE;
+
+   prod_x = scalar(src( prod ), TGSI_SWIZZLE_X);
+   prod_y = scalar(src( prod ), TGSI_SWIZZLE_Y);
+
+   /* ADD  DST, TMP.xxxx, TMP.yyyy */
+   return submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
+                      prod_x, prod_y );
+}
+
+
+/* Translate the following TGSI DPH instruction.
+ *    DPH  DST, SRC1, SRC2
+ * To the following SVGA3D instruction sequence.
+ *    DP3  TMP, SRC1, SRC2
+ *    ADD  DST, TMP, SRC2.wwww
+ *
+ * Returns FALSE if either instruction fails to emit.
+ */
+static boolean emit_dph(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   const struct src_register src1 = translate_src_register(
+      emit, &insn->FullSrcRegisters[1] );
+   SVGA3dShaderDestToken dot3 = get_temp( emit );
+   struct src_register src1_w;
+
+   /* DP3  TMP, SRC1, SRC2 */
+   if (!submit_op2( emit, inst_token( SVGA3DOP_DP3 ), dot3, src0, src1 ))
+      return FALSE;
+
+   src1_w = scalar(src1, TGSI_SWIZZLE_W);
+
+   /* ADD  DST, TMP, SRC2.wwww */
+   return submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
+                      src( dot3 ), src1_w );
+}
+
+/* Translate the following TGSI NRM instruction.
+ *    NRM  DST, SRC
+ * To the following SVGA3D instruction sequence.
+ *    DP3  TMP, SRC, SRC
+ *    RSQ  TMP, TMP
+ *    MUL  DST, SRC, TMP
+ *
+ * i.e. SRC is scaled by the reciprocal square root of its
+ * three-component dot product with itself.
+ * Returns FALSE if any instruction fails to emit.
+ */
+static boolean emit_nrm(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   SVGA3dShaderDestToken scale = get_temp( emit );
+
+   /* DP3  TMP, SRC, SRC */
+   if (!submit_op2( emit, inst_token( SVGA3DOP_DP3 ), scale, src0, src0 ))
+      return FALSE;
+
+   /* RSQ  TMP, TMP */
+   if (!submit_op1( emit, inst_token( SVGA3DOP_RSQ ), scale, src( scale )))
+      return FALSE;
+
+   /* MUL  DST, SRC, TMP */
+   return submit_op2( emit, inst_token( SVGA3DOP_MUL ), dst,
+                      src0, src( scale ));
+}
+
+/* Emit SINCOS of src0.x into dst.  With use_sm30 set the instruction
+ * takes the angle alone; otherwise the two sincos constants are passed
+ * as extra sources. */
+static boolean do_emit_sincos(struct svga_shader_emitter *emit,
+                              SVGA3dShaderDestToken dst,
+                              struct src_register src0)
+{
+   const SVGA3dShaderInstToken sincos = inst_token( SVGA3DOP_SINCOS );
+   const struct src_register angle = scalar(src0, TGSI_SWIZZLE_X);
+
+   if (!emit->use_sm30)
+      return submit_op3( emit, sincos, dst, angle,
+                         get_sincos_const( emit, 0 ),
+                         get_sincos_const( emit, 1 ) );
+
+   return submit_op1( emit, sincos, dst, angle );
+}
+
+/* SCS TMP.xy, SRC
+ * MOV DST, TMP */
+static boolean emit_sincos(struct svga_shader_emitter *emit,
+                           const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   SVGA3dShaderDestToken scs = get_temp( emit );
+
+   /* SCS TMP SRC */
+   if (do_emit_sincos( emit, writemask(scs, TGSI_WRITEMASK_XY), src0 )) {
+      /* MOV DST TMP */
+      return submit_op1( emit, inst_token( SVGA3DOP_MOV ), dst, src( scs ) );
+   }
+
+   return FALSE;
+}
+
+/*
+ * SCS TMP SRC
+ * MOV DST TMP.yyyy
+ *
+ * Only the y channel of the SCS result is requested; it is then
+ * moved to DST through a scalar y swizzle.  Returns FALSE if either
+ * instruction fails to emit.
+ */
+static boolean emit_sin(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   SVGA3dShaderDestToken scs = get_temp( emit );
+   const struct src_register tmp_y = scalar(src( scs ), TGSI_SWIZZLE_Y);
+
+   /* SCS TMP SRC */
+   if (!do_emit_sincos(emit, writemask(scs, TGSI_WRITEMASK_Y), src0))
+      return FALSE;
+
+   /* MOV DST TMP.yyyy */
+   return submit_op1( emit, inst_token( SVGA3DOP_MOV ), dst, tmp_y );
+}
+
+/*
+ * SCS TMP SRC
+ * MOV DST TMP.xxxx
+ *
+ * Only the x channel of the SCS result is requested; it is then
+ * moved to DST through a scalar x swizzle.  Returns FALSE if either
+ * instruction fails to emit.
+ */
+static boolean emit_cos(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   SVGA3dShaderDestToken scs = get_temp( emit );
+   const struct src_register tmp_x = scalar(src( scs ), TGSI_SWIZZLE_X);
+
+   /* SCS TMP SRC */
+   if (!do_emit_sincos( emit, writemask(scs, TGSI_WRITEMASK_X), src0 ))
+      return FALSE;
+
+   /* MOV DST TMP.xxxx */
+   return submit_op1( emit, inst_token( SVGA3DOP_MOV ), dst, tmp_x );
+}
+
+
+/* Translate TGSI SUB as an addition of the negated second operand:
+ *
+ *    ADD DST SRC0, negate(SRC1)
+ *
+ * The negation is applied to the translated source with negate().
+ * Returns FALSE if the instruction fails to emit.
+ */
+static boolean emit_sub(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   const struct src_register src1 = translate_src_register(
+      emit, &insn->FullSrcRegisters[1] );
+
+   /* ADD DST SRC0, -SRC1 */
+   return submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst,
+                      src0, negate( src1 ) );
+}
+
+
+/* Emit TEXKILL on the source operand, copying it to a temporary first
+ * unless it is an unmodified temporary with an identity xyz swizzle. */
+static boolean emit_kil(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderInstToken inst = inst_token( SVGA3DOP_TEXKILL );
+   const struct tgsi_full_src_register *reg = &insn->FullSrcRegisters[0];
+   struct src_register src0 = translate_src_register( emit, reg );
+
+   if (reg->SrcRegisterExtMod.Absolute ||
+       reg->SrcRegister.Negate != reg->SrcRegisterExtMod.Negate ||
+       reg->SrcRegister.Indirect ||
+       reg->SrcRegister.SwizzleX != 0 ||
+       reg->SrcRegister.SwizzleY != 1 ||
+       reg->SrcRegister.SwizzleZ != 2 ||
+       reg->SrcRegister.File != TGSI_FILE_TEMPORARY)
+   {
+      SVGA3dShaderDestToken temp = get_temp( emit );
+
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), temp, src0 ))
+         return FALSE;
+      src0 = src( temp );
+   }
+
+   return submit_op0( emit, inst, dst(src0) );
+}
+
+
+/* mesa state tracker always emits kilp as an unconditional
+ * kil, so the source operand is ignored and TEXKILL is issued on a
+ * temporary holding the negated 1 of the zero immediate. */
+static boolean emit_kilp(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   const struct src_register one =
+      scalar( get_zero_immediate( emit ), TGSI_SWIZZLE_W );
+   const SVGA3dShaderInstToken texkill = inst_token( SVGA3DOP_TEXKILL );
+   SVGA3dShaderDestToken minus_one;
+
+   /* texkill doesn't allow negation on the operand so lets move
+    * negation of {1} to a temp register */
+   minus_one = get_temp( emit );
+
+   if (submit_op1( emit, inst_token( SVGA3DOP_MOV ), minus_one,
+                   negate( one ) ))
+      return submit_op0( emit, texkill, minus_one );
+
+   return FALSE;
+}
+
+/* Implement conditionals by initializing destination reg to 'fail',
+ * then set predicate reg with UFOP_SETP, then move 'pass' to dest
+ * based on predicate reg.
+ *
+ * SETP src0, cmp, src1  -- do this first to avoid aliasing problems.
+ * MOV dst, fail
+ * MOV dst, pass, p0
+ *
+ * PIPE_FUNC_NEVER and PIPE_FUNC_ALWAYS need no predicate: they reduce
+ * to a single MOV of 'fail' or 'pass' respectively.
+ * Returns FALSE as soon as any instruction fails to emit.
+ */
+static boolean
+emit_conditional(struct svga_shader_emitter *emit,
+                 unsigned compare_func,
+                 SVGA3dShaderDestToken dst,
+                 struct src_register src0,
+                 struct src_register src1,
+                 struct src_register pass,
+                 struct src_register fail)
+{
+   SVGA3dShaderDestToken pred_reg = dst_register( SVGA3DREG_PREDICATE, 0 );
+   SVGA3dShaderInstToken setp = inst_token( SVGA3DOP_SETP );
+   SVGA3dShaderInstToken mov = inst_token( SVGA3DOP_MOV );
+   SVGA3dShaderInstToken mov_if_pred = mov;
+
+   /* the two trivial functions need no comparison at all: a plain
+    * MOV of the chosen value is the whole translation */
+   if (compare_func == PIPE_FUNC_NEVER)
+      return submit_op1( emit, mov, dst, fail );
+   if (compare_func == PIPE_FUNC_ALWAYS)
+      return submit_op1( emit, mov, dst, pass );
+
+   switch (compare_func) {
+   case PIPE_FUNC_LESS:
+      setp.control = SVGA3DOPCOMP_LT;
+      break;
+   case PIPE_FUNC_EQUAL:
+      setp.control = SVGA3DOPCOMP_EQ;
+      break;
+   case PIPE_FUNC_LEQUAL:
+      setp.control = SVGA3DOPCOMP_LE;
+      break;
+   case PIPE_FUNC_GREATER:
+      setp.control = SVGA3DOPCOMP_GT;
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      setp.control = SVGA3DOPCOMPC_NE;
+      break;
+   case PIPE_FUNC_GEQUAL:
+      setp.control = SVGA3DOPCOMP_GE;
+      break;
+   default:
+      /* unknown function: control stays as inst_token() set it */
+      break;
+   }
+
+   /* SETP src0, COMPOP, src1 */
+   if (!submit_op2( emit, setp, pred_reg, src0, src1 ))
+      return FALSE;
+
+   /* MOV dst, fail */
+   if (!submit_op1( emit, mov, dst, fail ))
+      return FALSE;
+
+   /* MOV dst, pass (predicated)
+    *
+    * Note that the predicate reg (and possible modifiers) is passed
+    * as the first source argument.
+    */
+   mov_if_pred.predicated = 1;
+   return submit_op2( emit, mov_if_pred, dst,
+                      src( pred_reg ), pass );
+}
+
+
+/* Emit a select of src0 against src1 under compare_func.
+ *
+ * Vertex shaders use SGE/SLT directly for the four ordered compares.
+ * Everything else goes through emit_conditional() with pass = one and
+ * fail = zero (both taken from the zero immediate), so dst receives
+ * one where the comparison holds and zero elsewhere.
+ */
+static boolean
+emit_select(struct svga_shader_emitter *emit,
+            unsigned compare_func,
+            SVGA3dShaderDestToken dst,
+            struct src_register src0,
+            struct src_register src1 )
+{
+   struct src_register imm, one, zero;
+
+   /* There are some SVGA instructions which implement some selects
+    * directly, but they are only available in the vertex shader.
+    */
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      const SVGA3dShaderInstToken sge = inst_token( SVGA3DOP_SGE );
+      const SVGA3dShaderInstToken slt = inst_token( SVGA3DOP_SLT );
+
+      if (compare_func == PIPE_FUNC_GEQUAL)
+         return submit_op2( emit, sge, dst, src0, src1 );
+      if (compare_func == PIPE_FUNC_LEQUAL)
+         return submit_op2( emit, sge, dst, src1, src0 );
+      if (compare_func == PIPE_FUNC_GREATER)
+         return submit_op2( emit, slt, dst, src1, src0 );
+      if (compare_func == PIPE_FUNC_LESS)
+         return submit_op2( emit, slt, dst, src0, src1 );
+   }
+
+   /* Otherwise, need to use the setp approach.
+    * zero immediate is 0,0,0,1
+    */
+   imm  = get_zero_immediate( emit );
+   one  = scalar( imm, TGSI_SWIZZLE_W );
+   zero = scalar( imm, TGSI_SWIZZLE_X );
+
+   return emit_conditional( emit, compare_func, dst,
+                            src0, src1,
+                            one, zero );
+}
+
+
+/* Translate a two-source TGSI compare instruction via emit_select(). */
+static boolean emit_select_op(struct svga_shader_emitter *emit,
+                              unsigned compare,
+                              const struct tgsi_full_instruction *insn)
+{
+   const struct tgsi_full_src_register *regs = insn->FullSrcRegisters;
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register src0 = translate_src_register( emit, &regs[0] );
+   struct src_register src1 = translate_src_register( emit, &regs[1] );
+
+   return emit_select( emit, compare, dst, src0, src1 );
+}
+
+
+/* Translate the two-operand texture instructions (TEX, TXP, TXB) to
+ * SVGA3D representation; TXP and TXB set the control field of the TEX
+ * opcode.  For samplers flagged unnormalized in the shader key the
+ * coordinate is first multiplied by the texture dimensions constant.
+ */
+static boolean emit_tex2(struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn,
+                         SVGA3dShaderDestToken dst )
+{
+   SVGA3dShaderInstToken inst;
+   struct src_register coord;
+   struct src_register sampler;
+
+   inst.value = 0;
+   inst.op = SVGA3DOP_TEX;
+
+   if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) {
+      inst.control = SVGA3DOPCONT_PROJECT;
+   }
+   else if (insn->Instruction.Opcode == TGSI_OPCODE_TXB) {
+      inst.control = SVGA3DOPCONT_BIAS;
+   }
+   else if (insn->Instruction.Opcode != TGSI_OPCODE_TEX) {
+      assert(0);
+      return FALSE;
+   }
+
+   coord = translate_src_register( emit, &insn->FullSrcRegisters[0] );
+   sampler = translate_src_register( emit, &insn->FullSrcRegisters[1] );
+
+   if (emit->key.fkey.tex[sampler.base.num].unnormalized) {
+      struct src_register wh = get_tex_dimensions( emit, sampler.base.num );
+      SVGA3dShaderDestToken scaled = get_temp( emit );
+
+      /* MUL  tmp, SRC0, WH */
+      if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ),
+                       scaled, coord, wh ))
+         return FALSE;
+      coord = src( scaled );
+   }
+
+   return submit_op2( emit, inst, dst, coord, sampler );
+}
+
+
+/* Translate the three-operand texture instructions (TXD, TXL) to
+ * SVGA3D representation; any other opcode is rejected.
+ */
+static boolean emit_tex3(struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn,
+                         SVGA3dShaderDestToken dst )
+{
+   SVGA3dShaderInstToken inst;
+   struct src_register src0, src1, src2;
+
+   inst.value = 0;
+
+   switch (insn->Instruction.Opcode) {
+   case TGSI_OPCODE_TXD:
+      inst.op = SVGA3DOP_TEXLDD;
+      break;
+   case TGSI_OPCODE_TXL:
+      inst.op = SVGA3DOP_TEXLDL;
+      break;
+   default:
+      assert(0);
+      return FALSE;
+   }
+
+   src0 = translate_src_register( emit, &insn->FullSrcRegisters[0] );
+   src1 = translate_src_register( emit, &insn->FullSrcRegisters[1] );
+   src2 = translate_src_register( emit, &insn->FullSrcRegisters[2] );
+
+   return submit_op3( emit, inst, dst, src0, src1, src2 );
+}
+
+
+/* Translate a TGSI texture instruction, including shadow compares.
+ *
+ * The sample goes to a temporary rather than straight to dst when a
+ * shadow compare follows, or when SM 3.0 is not in use and dst has a
+ * partial writemask (a final MOV then copies it to dst).
+ */
+static boolean emit_tex(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register src0 =
+      translate_src_register( emit, &insn->FullSrcRegisters[0] );
+   struct src_register src1 =
+      translate_src_register( emit, &insn->FullSrcRegisters[1] );
+   SVGA3dShaderDestToken tex_result;
+
+   /* check for shadow samplers */
+   boolean compare = (emit->key.fkey.tex[src1.base.num].compare_mode ==
+                      PIPE_TEX_COMPARE_R_TO_TEXTURE);
+
+   /* If doing compare processing, need to put this value into a
+    * temporary so it can be used as a source later on.
+    */
+   if (compare ||
+       (!emit->use_sm30 && dst.mask != TGSI_WRITEMASK_XYZW) ) {
+      tex_result = get_temp( emit );
+   }
+   else {
+      tex_result = dst;
+   }
+
+   switch(insn->Instruction.Opcode) {
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXP:
+      if (!emit_tex2( emit, insn, tex_result ))
+         return FALSE;
+      break;
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXD:
+      if (!emit_tex3( emit, insn, tex_result ))
+         return FALSE;
+      break;
+   default:
+      /* nothing was sampled, so there is no result to post-process */
+      assert(0);
+      return FALSE;
+   }
+
+   if (compare) {
+      SVGA3dShaderDestToken src0_zdivw = get_temp( emit );
+      struct src_register tex_src_x = scalar(src(tex_result), TGSI_SWIZZLE_Y);
+      struct src_register one =
+         scalar( get_zero_immediate( emit ), TGSI_SWIZZLE_W );
+
+      /* Divide texcoord R by Q */
+      if (!submit_op1( emit, inst_token( SVGA3DOP_RCP ),
+                       src0_zdivw,
+                       scalar(src0, TGSI_SWIZZLE_W) ))
+         return FALSE;
+
+      if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ),
+                       src0_zdivw,
+                       scalar(src0, TGSI_SWIZZLE_Z),
+                       src(src0_zdivw) ))
+         return FALSE;
+
+      /* compare R / Q against the y channel of the sampled value */
+      if (!emit_select( emit, emit->key.fkey.tex[src1.base.num].compare_func,
+                        dst, src(src0_zdivw), tex_src_x ))
+         return FALSE;
+
+      return submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                         writemask( dst, TGSI_WRITEMASK_W), one );
+   }
+   else if (!emit->use_sm30 && dst.mask != TGSI_WRITEMASK_XYZW) 
+   {
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), dst, src(tex_result) ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/* LOOP <loop register 0>, <loop const> -- insn is not examined */
+static boolean emit_bgnloop2( struct svga_shader_emitter *emit,
+                              const struct tgsi_full_instruction *insn )
+{
+   const SVGA3dShaderInstToken loop = inst_token( SVGA3DOP_LOOP );
+   const struct src_register counter = src_register( SVGA3DREG_LOOP, 0 );
+   const struct src_register limits = get_loop_const( emit );
+
+   return (emit_instruction( emit, loop ) && emit_src( emit, counter ) &&
+           emit_src( emit, limits ));
+}
+
+/* emit a bare ENDLOOP instruction; insn is not examined */
+static boolean emit_endloop2( struct svga_shader_emitter *emit,
+                              const struct tgsi_full_instruction *insn )
+{
+   return emit_instruction( emit, inst_token( SVGA3DOP_ENDLOOP ) );
+}
+
+/* emit a bare BREAK instruction; insn is not examined */
+static boolean emit_brk( struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn )
+{
+   return emit_instruction( emit, inst_token( SVGA3DOP_BREAK ) );
+}
+
+/* Emit a one-source instruction whose translated source is first
+ * passed through scalar() with TGSI_SWIZZLE_X.
+ */
+static boolean emit_scalar_op1( struct svga_shader_emitter *emit,
+                                unsigned opcode,
+                                const struct tgsi_full_instruction *insn )
+{
+   const SVGA3dShaderInstToken inst = inst_token( opcode );
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src_x = scalar(
+      translate_src_register( emit, &insn->FullSrcRegisters[0] ),
+      TGSI_SWIZZLE_X );
+
+   return submit_op1( emit, inst, dst, src_x );
+}
+
+
+static boolean emit_simple_instruction(struct svga_shader_emitter *emit,
+                                       unsigned opcode,
+                                       const struct tgsi_full_instruction *insn )
+{
+   const struct tgsi_full_src_register *operand = insn->FullSrcRegisters;
+   const unsigned nr_src = insn->Instruction.NumSrcRegs;
+   const SVGA3dShaderInstToken token = inst_token( opcode );
+   const SVGA3dShaderDestToken dest = translate_dst_register( emit, insn, 0 );
+
+   /* Dispatch on operand count; more than three sources is unsupported. */
+   if (nr_src == 0)
+      return submit_op0( emit, token, dest );
+
+   if (nr_src == 1)
+      return submit_op1( emit, token, dest,
+                         translate_src_register( emit, &operand[0] ));
+
+   if (nr_src == 2)
+      return submit_op2( emit, token, dest,
+                         translate_src_register( emit, &operand[0] ),
+                         translate_src_register( emit, &operand[1] ) );
+
+   if (nr_src == 3)
+      return submit_op3( emit, token, dest,
+                         translate_src_register( emit, &operand[0] ),
+                         translate_src_register( emit, &operand[1] ),
+                         translate_src_register( emit, &operand[2] ) );
+
+   assert(0);
+   return FALSE;
+}
+
+static boolean emit_arl(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   /* Bump the ARL counter before testing whether this one needs fixing. */
+   emit->current_arl += 1;
+
+   if (!svga_arl_needs_adjustment( emit ))
+      /* no need to adjust, just emit straight arl */
+      return emit_simple_instruction(emit, SVGA3DOP_MOVA, insn);
+   return emit_fake_arl( emit, insn );
+}
+
+static boolean alias_src_dst( struct src_register src,
+                              SVGA3dShaderDestToken dst )
+{
+   /* Two operands alias when they name the same register: both the
+    * register number and the register type must match.
+    */
+   if (src.base.num == dst.num &&
+       SVGA3dShaderGetRegType(dst.value) ==
+       SVGA3dShaderGetRegType(src.base.value))
+      return TRUE;
+   return FALSE;
+}
+
+static boolean emit_pow(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register base = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   struct src_register exponent = translate_src_register(
+      emit, &insn->FullSrcRegisters[1] );
+   boolean direct;
+
+   /* POW may target dst directly only when dst is a temporary and is
+    * not the same register as the second source operand.
+    */
+   direct = (insn->FullDstRegisters[0].DstRegister.File == TGSI_FILE_TEMPORARY &&
+             !alias_src_dst( exponent, dst ));
+
+   /* it's a scalar op: take the X channel of each source */
+   base = scalar( base, TGSI_SWIZZLE_X );
+   exponent = scalar( exponent, TGSI_SWIZZLE_X );
+
+   if (direct)
+      return submit_op2(emit, inst_token( SVGA3DOP_POW ), dst, base, exponent);
+
+   /* Otherwise compute into the X channel of a scratch temp, then MOV
+    * that channel to the real destination.
+    */
+   {
+      SVGA3dShaderDestToken tmp = writemask(get_temp( emit ), TGSI_WRITEMASK_X );
+
+      if (!submit_op2(emit, inst_token( SVGA3DOP_POW ), tmp, base, exponent))
+         return FALSE;
+      return submit_op1(emit, inst_token( SVGA3DOP_MOV ), dst, scalar(src(tmp), 0) );
+   }
+}
+
+static boolean emit_xpd(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   const struct src_register src1 = translate_src_register(
+      emit, &insn->FullSrcRegisters[1] );
+   boolean need_dst_tmp = FALSE;
+
+   /* XPD is emitted as CRS, which can only output to a temporary */
+   if (SVGA3dShaderGetRegType(dst.value) != SVGA3DREG_TEMP) 
+      need_dst_tmp = TRUE;
+
+   /* The dst reg must not be the same as src0 or src1 */
+   if (alias_src_dst(src0, dst) ||
+       alias_src_dst(src1, dst))
+      need_dst_tmp = TRUE;
+
+   if (need_dst_tmp) {
+      SVGA3dShaderDestToken tmp = get_temp( emit );
+
+      /* Obey DX9 restrictions on mask: the CRS write is limited to the
+       * xyz channels requested by dst. */
+      tmp.mask = dst.mask & TGSI_WRITEMASK_XYZ;
+
+      if (!submit_op2(emit, inst_token( SVGA3DOP_CRS ), tmp, src0, src1))
+         return FALSE;
+
+      if (!submit_op1(emit, inst_token( SVGA3DOP_MOV ), dst, src( tmp )))
+         return FALSE;
+   } 
+   else {
+      if (!submit_op2(emit, inst_token( SVGA3DOP_CRS ), dst, src0, src1))
+         return FALSE;
+   }
+
+   /* If w is being written, MOV the W channel of the "zero" immediate
+    * into it (the intent is to store 1.0 in dst.w). */
+   if (dst.mask & TGSI_WRITEMASK_W) {
+      struct src_register zero = get_zero_immediate( emit );
+
+      if (!submit_op1(emit, 
+                      inst_token( SVGA3DOP_MOV ), 
+                      writemask(dst, TGSI_WRITEMASK_W),
+                      zero))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+static boolean emit_lrp(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   const struct src_register src0 = translate_src_register(
+      emit, &insn->FullSrcRegisters[0] );
+   const struct src_register src1 = translate_src_register(
+      emit, &insn->FullSrcRegisters[1] );
+   const struct src_register src2 = translate_src_register(
+      emit, &insn->FullSrcRegisters[2] );
+   SVGA3dShaderDestToken target = dst;
+   boolean aliased;
+
+   /* LRP must not write a register that it also reads as src0 or
+    * src2 (src1 is not checked).
+    */
+   aliased = (alias_src_dst(src0, dst) ||
+              alias_src_dst(src2, dst));
+
+   /* When dst aliases a source, interpolate into a scratch temp that
+    * carries dst's writemask.
+    */
+   if (aliased) {
+      target = get_temp( emit );
+      target.mask = dst.mask;
+   }
+
+   if (!submit_op3(emit, inst_token( SVGA3DOP_LRP ), target, src0, src1, src2))
+      return FALSE;
+
+   if (aliased &&
+       !submit_op1(emit, inst_token( SVGA3DOP_MOV ), dst, src( target )))
+      return FALSE;
+
+   return TRUE;
+}
+
+
+static boolean emit_dst_insn(struct svga_shader_emitter *emit,
+                             const struct tgsi_full_instruction *insn )
+{
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      /* SVGA/DX9 has a DST instruction, but only for vertex shaders:
+       */
+      return emit_simple_instruction(emit, SVGA3DOP_DST, insn);
+   }
+   else {
+      /* Other units: expand DST by hand, with a = src0 and b = src1:
+       *   result[0] = 1    * 1;
+       *   result[1] = a[1] * b[1];
+       *   result[2] = a[2] * 1;
+       *   result[3] = 1    * b[3];
+       */
+
+      SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+      SVGA3dShaderDestToken tmp;
+      const struct src_register src0 = translate_src_register(
+         emit, &insn->FullSrcRegisters[0] );
+      const struct src_register src1 = translate_src_register(
+         emit, &insn->FullSrcRegisters[1] );
+      struct src_register zero = get_zero_immediate( emit );
+      boolean need_tmp = FALSE;
+
+      if (SVGA3dShaderGetRegType(dst.value) != SVGA3DREG_TEMP ||
+          alias_src_dst(src0, dst) ||
+          alias_src_dst(src1, dst))
+         need_tmp = TRUE;
+
+      if (need_tmp) {
+         tmp = get_temp( emit );
+      }
+      else {
+         tmp = dst;
+      }
+
+      /* tmp.xw = 1.0 (read from the W channel of the zero immediate)
+       */
+      if (tmp.mask & TGSI_WRITEMASK_XW) {
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), 
+                          writemask(tmp, TGSI_WRITEMASK_XW ),
+                          scalar( zero, 3 )))
+            return FALSE;
+      }
+      
+      /* tmp.yz = src0
+       */
+      if (tmp.mask & TGSI_WRITEMASK_YZ) {
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), 
+                          writemask(tmp, TGSI_WRITEMASK_YZ ),
+                          src0))
+            return FALSE;
+      }
+
+      /* tmp.yw = tmp * src1
+       */
+      if (tmp.mask & TGSI_WRITEMASK_YW) {
+         if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ), 
+                          writemask(tmp, TGSI_WRITEMASK_YW ),
+                          src(tmp),
+                          src1))
+            return FALSE;
+      }
+
+      /* dst = tmp (only needed when the result was built in a scratch temp)
+       */
+      if (need_tmp) {
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), 
+                          dst,
+                          src(tmp)))
+            return FALSE;
+      }      
+   }
+   
+   return TRUE;
+}
+
+
+static boolean emit_exp(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register src0 =
+      translate_src_register( emit, &insn->FullSrcRegisters[0] );
+   struct src_register zero = get_zero_immediate( emit );
+   SVGA3dShaderDestToken fraction;
+
+   /* EXP is defined in terms of src0.x only, so replicate that channel:
+    * otherwise the y and z writes below would read src0.y and src0.z. */
+   src0 = scalar( src0, TGSI_SWIZZLE_X );
+
+   if (dst.mask & TGSI_WRITEMASK_Y)
+      fraction = dst;
+   else if (dst.mask & TGSI_WRITEMASK_X)
+      fraction = get_temp( emit );
+
+   /* fraction.y = src0 - floor(src0).  This is the y result, and x needs
+    * it too, so it is computed when either channel is written. */
+   if (dst.mask & TGSI_WRITEMASK_XY) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_FRC ),
+                       writemask( fraction, TGSI_WRITEMASK_Y ),
+                       src0 ))
+         return FALSE;
+   }
+
+   /* If x is being written, fill it with 2 ^ floor(src0). */
+   if (dst.mask & TGSI_WRITEMASK_X) {
+      if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ),
+                       writemask( dst, dst.mask & TGSI_WRITEMASK_X ),
+                       src0,
+                       scalar( negate( src( fraction ) ), TGSI_SWIZZLE_Y ) ) )
+         return FALSE;
+
+      if (!submit_op1( emit, inst_token( SVGA3DOP_EXP ),
+                       writemask( dst, dst.mask & TGSI_WRITEMASK_X ),
+                       scalar( src( dst ), TGSI_SWIZZLE_X ) ) )
+         return FALSE;
+      if (!(dst.mask & TGSI_WRITEMASK_Y))
+         release_temp( emit, fraction );
+   }
+
+   /* If z is being written, fill it with 2 ^ src0 (partial precision). */
+   if (dst.mask & TGSI_WRITEMASK_Z) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_EXPP ),
+                       writemask( dst, dst.mask & TGSI_WRITEMASK_Z ),
+                       src0 ) )
+         return FALSE;
+   }
+
+   /* If w is being written, fill it with one (zero immediate's W). */
+   if (dst.mask & TGSI_WRITEMASK_W) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                       writemask(dst, TGSI_WRITEMASK_W),
+                       scalar( zero, TGSI_SWIZZLE_W ) ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+static boolean emit_lit(struct svga_shader_emitter *emit,
+                             const struct tgsi_full_instruction *insn )
+{
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      /* SVGA/DX9 has a LIT instruction, but only for vertex shaders:
+       */
+      return emit_simple_instruction(emit, SVGA3DOP_LIT, insn);
+   }
+   else {
+
+      /* D3D vs. GL semantics can be fairly easily accommodated by
+       * variations on this sequence.
+       *
+       * GL:
+       *   tmp.y = src.x
+       *   tmp.z = pow(src.y,src.w)
+       *   p0 = src0.xxxx > 0
+       *   result = zero.wxxw
+       *   (p0) result.yz = tmp
+       *
+       * D3D:
+       *   tmp.y = src.x
+       *   tmp.z = pow(src.y,src.w)
+       *   p0 = src0.xxyy > 0
+       *   result = zero.wxxw
+       *   (p0) result.yz = tmp
+       *
+       * Will implement the GL version for now.
+       */
+
+      SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+      SVGA3dShaderDestToken tmp = get_temp( emit );
+      const struct src_register src0 = translate_src_register(
+         emit, &insn->FullSrcRegisters[0] );
+      struct src_register zero = get_zero_immediate( emit );
+
+      /* tmp = pow(src.y, src.w)   (only needed when z is written)
+       */
+      if (dst.mask & TGSI_WRITEMASK_Z) {
+         if (!submit_op2(emit, inst_token( SVGA3DOP_POW ), 
+                         tmp, 
+                         scalar(src0, 1), 
+                         scalar(src0, 3)))
+            return FALSE;
+      }
+
+      /* tmp.y = src.x   (only needed when y is written)
+       */
+      if (dst.mask & TGSI_WRITEMASK_Y) {
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), 
+                          writemask(tmp, TGSI_WRITEMASK_Y ),
+                          scalar(src0, 0)))
+            return FALSE;
+      }
+      
+      /* Can't quite do this with emit conditional due to the extra
+       * writemask on the predicated mov:
+       */
+      {
+         SVGA3dShaderDestToken pred_reg = dst_register( SVGA3DREG_PREDICATE, 0 );
+         SVGA3dShaderInstToken setp_token, mov_token;
+         struct src_register predsrc;
+
+         setp_token = inst_token( SVGA3DOP_SETP );
+         mov_token = inst_token( SVGA3DOP_MOV );
+
+         setp_token.control = SVGA3DOPCOMP_GT;
+
+         /* D3D vs GL semantics (the D3D branch is compiled out):
+          */
+         if (0)
+            predsrc = swizzle(src0, 0, 0, 1, 1); /* D3D */
+         else
+            predsrc = swizzle(src0, 0, 0, 0, 0); /* GL */
+
+         /* SETP predsrc, GT, {0}.x   (predsrc is src0.xxxx for GL) */
+         if (!submit_op2( emit, setp_token, pred_reg,
+                          predsrc, 
+                          swizzle(zero, 0, 0, 0, 0) ))
+            return FALSE;
+         
+         /* MOV dst, zero.wxxw   (the unconditional default result) */
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), dst,
+                          swizzle(zero, 3, 0, 0, 3 )))
+             return FALSE;
+
+         /* MOV dst.yz, tmp (predicated)
+          *
+          * Note that the predicate reg (and possible modifiers) is passed
+          * as the first source argument.
+          */
+         if (dst.mask & TGSI_WRITEMASK_YZ) {
+            mov_token.predicated = 1;
+            if (!submit_op2( emit, mov_token,
+                             writemask(dst, TGSI_WRITEMASK_YZ),
+                             src( pred_reg ), src( tmp ) ))
+               return FALSE;
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+
+
+
+static boolean emit_ex2( struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn )
+{
+   const SVGA3dShaderInstToken exp_token = inst_token( SVGA3DOP_EXP );
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register arg =
+      translate_src_register( emit, &insn->FullSrcRegisters[0] );
+   SVGA3dShaderDestToken tmp;
+
+   /* Scalar op: only the X channel of the source is read. */
+   arg = scalar( arg, TGSI_SWIZZLE_X );
+
+   /* A full xyzw destination takes the EXP result directly. */
+   if (dst.mask == TGSI_WRITEMASK_XYZW)
+      return submit_op1( emit, exp_token, dst, arg );
+
+   /* With a partial writemask, run EXP into a scratch temp and then
+    * MOV the temp's X channel into dst.
+    */
+   tmp = get_temp( emit );
+   if (!submit_op1( emit, exp_token, tmp, arg ))
+      return FALSE;
+   return submit_op1( emit, inst_token( SVGA3DOP_MOV ), dst,
+                      scalar( src( tmp ), TGSI_SWIZZLE_X ) );
+}
+
+
+static boolean emit_log(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register src0 =
+      translate_src_register( emit, &insn->FullSrcRegisters[0] );
+   struct src_register zero = get_zero_immediate( emit );
+   SVGA3dShaderDestToken abs_tmp;
+   struct src_register abs_src0;
+   SVGA3dShaderDestToken log2_abs;
+
+   if (dst.mask & TGSI_WRITEMASK_Z)
+      log2_abs = dst;
+   else if (dst.mask & TGSI_WRITEMASK_XY)
+      log2_abs = get_temp( emit );
+
+   /* log2( abs( src0 ) ) is the z result and also feeds x and y, so it
+    * is computed into log2_abs.z when any of x, y, z is written. */
+   if (dst.mask & TGSI_WRITEMASK_XYZ) {
+      if (!src0.base.srcMod || src0.base.srcMod == SVGA3DSRCMOD_ABS)
+         abs_src0 = src0;
+      else {   /* some other source modifier: copy src0 through a temp first */
+         abs_tmp = get_temp( emit );
+
+         if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                          abs_tmp,
+                          src0 ) )
+            return FALSE;
+
+         abs_src0 = src( abs_tmp );
+      }
+
+      abs_src0 = absolute( scalar( abs_src0, TGSI_SWIZZLE_X ) );
+
+      if (!submit_op1( emit, inst_token( SVGA3DOP_LOG ),
+                       writemask( log2_abs, TGSI_WRITEMASK_Z ),
+                       abs_src0 ) )
+         return FALSE;
+   }
+
+   if (dst.mask & TGSI_WRITEMASK_XY) {
+      SVGA3dShaderDestToken floor_log2;
+
+      if (dst.mask & TGSI_WRITEMASK_X)
+         floor_log2 = dst;
+      else
+         floor_log2 = get_temp( emit );
+
+      /* floor_log2.x = floor( log2( abs( src0 ) ) ), computed as
+       * log2_abs.z - frac( log2_abs.z ); it is dst.x when x is written. */
+      if (!submit_op1( emit, inst_token( SVGA3DOP_FRC ),
+                       writemask( floor_log2, TGSI_WRITEMASK_X ),
+                       scalar( src( log2_abs ), TGSI_SWIZZLE_Z ) ) )
+         return FALSE;
+
+      if (!submit_op2( emit, inst_token( SVGA3DOP_ADD ),
+                       writemask( floor_log2, TGSI_WRITEMASK_X ),
+                       scalar( src( log2_abs ), TGSI_SWIZZLE_Z ),
+                       negate( src( floor_log2 ) ) ) )
+         return FALSE;
+
+      /* If y is being written, fill it with
+       * abs ( src0 ) / ( 2 ^ floor( log2( abs( src0 ) ) ) ).
+       */
+      if (dst.mask & TGSI_WRITEMASK_Y) {
+         if (!submit_op1( emit, inst_token( SVGA3DOP_EXP ),
+                          writemask( dst, TGSI_WRITEMASK_Y ),
+                          negate( scalar( src( floor_log2 ),
+                                          TGSI_SWIZZLE_X ) ) ) )
+            return FALSE;
+
+         if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ),
+                          writemask( dst, TGSI_WRITEMASK_Y ),
+                          src( dst ),
+                          abs_src0 ) )
+            return FALSE;
+      }
+
+      if (!(dst.mask & TGSI_WRITEMASK_X))
+         release_temp( emit, floor_log2 );
+
+      if (!(dst.mask & TGSI_WRITEMASK_Z))
+         release_temp( emit, log2_abs );
+   }
+
+   if (dst.mask & TGSI_WRITEMASK_XYZ && src0.base.srcMod &&
+       src0.base.srcMod != SVGA3DSRCMOD_ABS)
+      release_temp( emit, abs_tmp );
+
+   /* If w is being written, fill it with one.
+    */
+   if (dst.mask & TGSI_WRITEMASK_W) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ),
+                       writemask(dst, TGSI_WRITEMASK_W),
+                       scalar( zero, TGSI_SWIZZLE_W ) ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+static boolean emit_bgnsub( struct svga_shader_emitter *emit,
+                           unsigned position,
+                           const struct tgsi_full_instruction *insn )
+{
+   unsigned slot = 0;
+
+   /* Reaching a BGNSUB means the main function is finished and we are
+    * now emitting subroutines.  This affects how we terminate the
+    * generated shader.
+    */
+   emit->in_main_func = FALSE;
+
+   while (slot < emit->nr_labels && emit->label[slot] != position)
+      slot++;
+   if (slot == emit->nr_labels) {
+      assert(0);
+      return TRUE;
+   }
+   /* Emit RET, then LABEL with the matching label slot as operand. */
+   return (emit_instruction( emit, inst_token( SVGA3DOP_RET ) ) &&
+           emit_instruction( emit, inst_token( SVGA3DOP_LABEL ) ) &&
+           emit_src( emit, src_register( SVGA3DREG_LABEL, slot )));
+}
+
+static boolean emit_call( struct svga_shader_emitter *emit,
+                           const struct tgsi_full_instruction *insn )
+{
+   unsigned position = insn->InstructionExtLabel.Label;
+   unsigned i;
+
+   for (i = 0; i < emit->nr_labels; i++) {
+      if (emit->label[i] == position) 
+         break;
+   }
+
+   /* Unseen target: claim the next label slot.  Only this case can fail
+    * on a full table; already-registered targets stay callable. */
+   if (i == emit->nr_labels) {
+      if (emit->nr_labels == Elements(emit->label))
+         return FALSE;
+      emit->label[emit->nr_labels++] = position;
+   }
+
+   return (emit_instruction( emit, inst_token( SVGA3DOP_CALL ) ) &&
+           emit_src( emit, src_register( SVGA3DREG_LABEL, i )));
+}
+
+
+static boolean emit_end( struct svga_shader_emitter *emit )
+{
+   /* Vertex shaders finish with the VS postamble; any other unit
+    * finishes with the PS postamble. */
+   if (emit->unit != PIPE_SHADER_VERTEX)
+      return emit_ps_postamble( emit );
+
+   return emit_vs_postamble( emit );
+}
+
+
+
+static boolean svga_emit_instruction( struct svga_shader_emitter *emit,
+                                      unsigned position,
+                                      const struct tgsi_full_instruction *insn )
+{
+   switch (insn->Instruction.Opcode) {
+
+   case TGSI_OPCODE_ARL:
+      return emit_arl( emit, insn );
+
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXD:
+      return emit_tex( emit, insn );
+
+   case TGSI_OPCODE_BGNSUB:
+      return emit_bgnsub( emit, position, insn );
+
+   case TGSI_OPCODE_ENDSUB:
+      return TRUE;
+
+   case TGSI_OPCODE_CAL:
+      return emit_call( emit, insn );
+
+   case TGSI_OPCODE_FLR:
+   case TGSI_OPCODE_TRUNC:        /* FIXME: shares the FLR path; should be a real TRUNC */
+      return emit_floor( emit, insn );
+
+   case TGSI_OPCODE_CMP:
+      return emit_cmp( emit, insn );
+
+   case TGSI_OPCODE_DIV:
+      return emit_div( emit, insn );
+
+   case TGSI_OPCODE_DP2:
+      return emit_dp2( emit, insn );
+
+   case TGSI_OPCODE_DPH:
+      return emit_dph( emit, insn );
+
+   case TGSI_OPCODE_NRM:
+      return emit_nrm( emit, insn );
+
+   case TGSI_OPCODE_COS:
+      return emit_cos( emit, insn );
+
+   case TGSI_OPCODE_SIN:
+      return emit_sin( emit, insn );
+
+   case TGSI_OPCODE_SCS:
+      return emit_sincos( emit, insn );
+
+   case TGSI_OPCODE_END:
+      /* TGSI always finishes the main func with an END */
+      return emit_end( emit );
+
+   case TGSI_OPCODE_KIL:
+      return emit_kil( emit, insn );
+
+      /* Selection opcodes.  The underlying language is fairly
+       * non-orthogonal about these.
+       */
+   case TGSI_OPCODE_SEQ:
+      return emit_select_op( emit, PIPE_FUNC_EQUAL, insn );
+
+   case TGSI_OPCODE_SNE:
+      return emit_select_op( emit, PIPE_FUNC_NOTEQUAL, insn );
+
+   case TGSI_OPCODE_SGT:
+      return emit_select_op( emit, PIPE_FUNC_GREATER, insn );
+
+   case TGSI_OPCODE_SGE:
+      return emit_select_op( emit, PIPE_FUNC_GEQUAL, insn );
+
+   case TGSI_OPCODE_SLT:
+      return emit_select_op( emit, PIPE_FUNC_LESS, insn );
+
+   case TGSI_OPCODE_SLE:
+      return emit_select_op( emit, PIPE_FUNC_LEQUAL, insn );
+
+   case TGSI_OPCODE_SUB:
+      return emit_sub( emit, insn );
+
+   case TGSI_OPCODE_POW:
+      return emit_pow( emit, insn );
+
+   case TGSI_OPCODE_EX2:
+      return emit_ex2( emit, insn );
+
+   case TGSI_OPCODE_EXP:
+      return emit_exp( emit, insn );
+
+   case TGSI_OPCODE_LOG:
+      return emit_log( emit, insn );
+
+   case TGSI_OPCODE_LG2:
+      return emit_scalar_op1( emit, SVGA3DOP_LOG, insn );
+
+   case TGSI_OPCODE_RSQ:
+      return emit_scalar_op1( emit, SVGA3DOP_RSQ, insn );
+
+   case TGSI_OPCODE_RCP:
+      return emit_scalar_op1( emit, SVGA3DOP_RCP, insn );
+
+   case TGSI_OPCODE_CONT:
+   case TGSI_OPCODE_RET:
+      /* RET is a noop -- we tell mesa that we can't support RET within
+       * a function (early return), so it is always followed by an ENDSUB.
+       * NOTE(review): CONT is silently dropped here as well -- confirm.
+       */
+      return TRUE;
+
+      /* These aren't actually used by any of the frontends we care
+       * about:
+       */
+   case TGSI_OPCODE_CLAMP:
+   case TGSI_OPCODE_ROUND:
+   case TGSI_OPCODE_AND:
+   case TGSI_OPCODE_OR:
+   case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_NOT:
+   case TGSI_OPCODE_SHL:
+   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_XOR:
+      return FALSE;
+
+   case TGSI_OPCODE_IF:
+      return emit_if( emit, insn );
+   case TGSI_OPCODE_ELSE:
+      return emit_else( emit, insn );
+   case TGSI_OPCODE_ENDIF:
+      return emit_endif( emit, insn );
+
+   case TGSI_OPCODE_BGNLOOP:
+      return emit_bgnloop2( emit, insn );
+   case TGSI_OPCODE_ENDLOOP:
+      return emit_endloop2( emit, insn );
+   case TGSI_OPCODE_BRK:
+      return emit_brk( emit, insn );
+
+   case TGSI_OPCODE_XPD:
+      return emit_xpd( emit, insn );
+
+   case TGSI_OPCODE_KILP:
+      return emit_kilp( emit, insn );
+
+   case TGSI_OPCODE_DST:
+      return emit_dst_insn( emit, insn );
+
+   case TGSI_OPCODE_LIT:
+      return emit_lit( emit, insn );
+
+   case TGSI_OPCODE_LRP:
+      return emit_lrp( emit, insn );
+
+   default: {
+      unsigned opcode = translate_opcode(insn->Instruction.Opcode);
+
+      if (opcode == SVGA3DOP_LAST_INST)
+         return FALSE;
+
+      if (!emit_simple_instruction( emit, opcode, insn ))
+         return FALSE;
+   }
+   }
+
+   return TRUE;
+}
+
+
+static boolean svga_emit_immediate( struct svga_shader_emitter *emit,
+                                    struct tgsi_full_immediate *imm)
+{
+   /* Channels the immediate does not supply keep these defaults. */
+   float value[4] = {0,0,0,1};
+   unsigned nr_values;
+   unsigned i;
+
+   assert(1 <= imm->Immediate.NrTokens && imm->Immediate.NrTokens <= 5);
+   nr_values = imm->Immediate.NrTokens - 1;
+
+   for (i = 0; i < nr_values; i++)
+      value[i] = imm->u[i].Float;
+
+   return emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT,
+                          emit->imm_start + emit->internal_imm_count++,
+                          value[0], value[1], value[2], value[3]);
+}
+
+static boolean make_immediate( struct svga_shader_emitter *emit,
+                               float a,
+                               float b,
+                               float c,
+                               float d,
+                               struct src_register *out )
+{
+   /* Claim the next hardware constant slot, even if the DEF fails. */
+   const unsigned slot = emit->nr_hw_const++;
+
+   if (emit_def_const( emit, SVGA3D_CONST_TYPE_FLOAT, slot, a, b, c, d )) {
+      /* Hand back a source register naming the new constant. */
+      *out = src_register( SVGA3DREG_CONST, slot );
+      return TRUE;
+   }
+   return FALSE;
+}
+
+static boolean emit_vs_preamble( struct svga_shader_emitter *emit )
+{
+   /* With prescale enabled there is nothing to set up here. */
+   if (emit->key.vkey.need_prescale)
+      return TRUE;
+
+   /* Otherwise define the {0, 0, .5, .5} constant kept in imm_0055. */
+   return make_immediate( emit, 0, 0, .5, .5,
+                          &emit->imm_0055);
+}
+
+static boolean emit_ps_preamble( struct svga_shader_emitter *emit )
+{
+   struct src_register zero;
+   unsigned buf;
+
+   /* Only SM20 needs this: the temporaries we're using to hold color
+    * outputs must be initialized to some value, because shaders which
+    * don't set all of them are likely to be rejected by the DX9
+    * runtime.
+    */
+   if (emit->use_sm30)
+      return TRUE;
+
+   zero = get_zero_immediate( emit );
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++) {
+      if (SVGA3dShaderGetRegType(emit->true_col[buf].value) == 0)
+         continue;
+      if (!submit_op1( emit, inst_token(SVGA3DOP_MOV),
+                       emit->temp_col[buf], zero ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+static boolean emit_ps_postamble( struct svga_shader_emitter *emit )
+{
+   unsigned buf;
+
+   /* PS oDepth is incredibly fragile and it's very hard to catch the
+    * types of usage that break it during shader emit.  The main
+    * program is therefore redirected to a temporary, and oDepth
+    * itself is only touched by this hand-crafted MOV of the
+    * temporary's Z channel.
+    */
+   if (SVGA3dShaderGetRegType(emit->true_pos.value) != 0 &&
+       !submit_op1( emit,
+                    inst_token(SVGA3DOP_MOV),
+                    emit->true_pos,
+                    scalar(src(emit->temp_pos), TGSI_SWIZZLE_Z) ))
+      return FALSE;
+
+   /* Similarly for SM20 color outputs...  Luckily SM30 isn't so
+    * fragile.  Each color output with a non-zero register type is
+    * copied from its temporary to the true output register.
+    */
+   for (buf = 0; buf < PIPE_MAX_COLOR_BUFS; buf++) {
+      if (SVGA3dShaderGetRegType(emit->true_col[buf].value) == 0)
+         continue;
+
+      if (!submit_op1( emit,
+                       inst_token(SVGA3DOP_MOV),
+                       emit->true_col[buf],
+                       src(emit->temp_col[buf]) ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+static boolean emit_vs_postamble( struct svga_shader_emitter *emit )
+{
+   /* PSIZ output is incredibly fragile and it's very hard to catch
+    * the types of usage that break it during shader emit.  Easier
+    * just to redirect the main program to a temporary and then only
+    * touch PSIZ with a hand-crafted MOV below.
+    */
+   if (SVGA3dShaderGetRegType(emit->true_psiz.value) != 0) {
+      
+      if (!submit_op1( emit,
+                       inst_token(SVGA3DOP_MOV),
+                       emit->true_psiz,
+                       scalar(src(emit->temp_psiz), TGSI_SWIZZLE_X) ))
+         return FALSE;
+   }
+
+   /* Cope with the different GL and D3D clip spaces.  With prescale, the
+    * scale and translate come from the two constants after file_max.
+    */
+   if (emit->key.vkey.need_prescale) {
+      SVGA3dShaderDestToken temp_pos = emit->temp_pos;
+      SVGA3dShaderDestToken pos = emit->true_pos;
+      unsigned offset = emit->info.file_max[TGSI_FILE_CONSTANT] + 1;
+      struct src_register prescale_scale = src_register( SVGA3DREG_CONST, 
+                                                         offset + 0 ); 
+      struct src_register prescale_trans = src_register( SVGA3DREG_CONST, 
+                                                         offset + 1 ); 
+
+      /* MUL temp_pos.xyz,    temp_pos,      prescale.scale
+       * MAD result.position, temp_pos.wwww, prescale.trans, temp_pos
+       *   --> Note that prescale.trans.w == 0
+       */
+      if (!submit_op2( emit, 
+                       inst_token(SVGA3DOP_MUL), 
+                       writemask(temp_pos, TGSI_WRITEMASK_XYZ), 
+                       src(temp_pos),
+                       prescale_scale ))
+         return FALSE;
+
+      if (!submit_op3( emit, 
+                       inst_token(SVGA3DOP_MAD), 
+                       pos, 
+                       swizzle(src(temp_pos), 3, 3, 3, 3),
+                       prescale_trans,
+                       src(temp_pos)))
+         return FALSE;
+   }
+   else {
+      SVGA3dShaderDestToken temp_pos = emit->temp_pos;
+      SVGA3dShaderDestToken pos = emit->true_pos;
+      struct src_register imm_0055 = emit->imm_0055;
+
+      /* Adjust GL clipping coordinate space to hardware (D3D-style),
+       * i.e. temp_pos.z = .5 * temp_pos.z + .5 * temp_pos.w:
+       * DP4 temp_pos.z, {0,0,.5,.5}, temp_pos
+       * MOV result.position, temp_pos 
+       */
+      if (!submit_op2( emit, 
+                       inst_token(SVGA3DOP_DP4), 
+                       writemask(temp_pos, TGSI_WRITEMASK_Z), 
+                       imm_0055, 
+                       src(temp_pos) ))
+         return FALSE;
+
+      if (!submit_op1( emit,
+                       inst_token(SVGA3DOP_MOV),
+                       pos,
+                       src(temp_pos) ))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/* Two-sided lighting: pick front or back color per face.  Emits:
+  0: IF VFACE :4
+  1:   COLOR = FrontColor;
+  2: ELSE
+  3:   COLOR = BackColor;
+  4: ENDIF
+ */
+static boolean emit_light_twoside( struct svga_shader_emitter *emit )
+{
+   struct src_register vface, zero;
+   struct src_register front[2];
+   struct src_register back[2];
+   SVGA3dShaderDestToken color[2];
+   int count =  emit->internal_color_count;
+   int i;
+   SVGA3dShaderInstToken if_token;
+
+   if (count == 0)
+      return TRUE;
+
+   vface = get_vface( emit );
+   zero = get_zero_immediate( emit );
+
+   /* Can't use get_temp() to allocate the color reg as such
+    * temporaries will be reclaimed after each instruction by the call
+    * to reset_temp_regs().
+    */
+   for (i = 0; i < count; i++) {
+      color[i] = dst_register( SVGA3DREG_TEMP, 
+                               emit->nr_hw_temp++ );
+
+      front[i] = emit->input_map[emit->internal_color_idx[i]];
+
+      /* Back is always the next input:
+       */
+      back[i] = front[i];
+      back[i].base.num = front[i].base.num + 1;
+
+      /* Point input_map at the temp that will hold the selected color:
+       */
+      emit->input_map[emit->internal_color_idx[i]] = src(color[i]);
+   }
+   
+   if_token = inst_token( SVGA3DOP_IFC );
+
+   if (emit->key.fkey.front_cw)
+      if_token.control = SVGA3DOPCOMP_GT;
+   else
+      if_token.control = SVGA3DOPCOMP_LT;
+
+   zero = scalar(zero, TGSI_SWIZZLE_X);
+
+   if (!(emit_instruction( emit, if_token ) &&
+         emit_src( emit, vface ) &&
+         emit_src( emit, zero ) ))
+      return FALSE;
+
+   for (i = 0; i < count; i++) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), color[i], front[i] ))
+         return FALSE;
+   }
+
+   if (!(emit_instruction( emit, inst_token( SVGA3DOP_ELSE))))
+      return FALSE;
+   
+   for (i = 0; i < count; i++) {
+      if (!submit_op1( emit, inst_token( SVGA3DOP_MOV ), color[i], back[i] ))
+         return FALSE;
+   }
+
+   if (!emit_instruction( emit, inst_token( SVGA3DOP_ENDIF ) ))
+      return FALSE;
+
+   return TRUE;
+}
+
+/*
+  0: SETP_GT TEMP, VFACE, 0
+  where TEMP is a fake frontface register
+
+  The frontface input is then redirected to TEMP.
+  Returns FALSE if the conditional cannot be emitted.
+ */
+static boolean emit_frontface( struct svga_shader_emitter *emit )
+{
+   const struct src_register vface = get_vface( emit );
+   const struct src_register zero = get_zero_immediate( emit );
+   const struct src_register zero_x = scalar( zero, TGSI_SWIZZLE_X );
+   const struct src_register zero_w = scalar( zero, TGSI_SWIZZLE_W );
+   SVGA3dShaderDestToken fake_face;
+   struct src_register pass, fail;
+
+   /* The fake frontface register has to survive past this instruction,
+    * but get_temp() temporaries are reclaimed by reset_temp_regs()
+    * after each instruction, so a hardware temp is taken directly.
+    */
+   fake_face = dst_register( SVGA3DREG_TEMP,
+                             emit->nr_hw_temp++ );
+
+   /* The winding order decides which component of the zero immediate
+    * is the "pass" value and which is the "fail" value.
+    */
+   pass = emit->key.fkey.front_cw ? zero_w : zero_x;
+   fail = emit->key.fkey.front_cw ? zero_x : zero_w;
+
+   if (!emit_conditional( emit, PIPE_FUNC_GREATER, fake_face,
+                          vface, zero_x, pass, fail ))
+      return FALSE;
+
+   /* Redirect the frontface input to the fake register written above.
+    */
+   emit->input_map[emit->internal_frontface_idx] = src(fake_face);
+
+   return TRUE;
+}
+
+/* Returns TRUE when svga_shader_emit_helpers() must create the zero immediate. */
+static INLINE boolean
+needs_to_create_zero( struct svga_shader_emitter *emit )
+{
+   int i;
+
+   if (emit->unit == PIPE_SHADER_FRAGMENT) {
+      if (!emit->use_sm30)
+         return TRUE;
+
+      if (emit->key.fkey.light_twoside || emit->emit_frontface)
+         return TRUE;
+
+      if (emit->info.opcode_count[TGSI_OPCODE_DST] >= 1 ||
+          emit->info.opcode_count[TGSI_OPCODE_LIT] >= 1)
+         return TRUE;
+
+      /* NOTE(review): fkey looks fragment-only (cf. light_twoside above),
+       * so its texture compare modes are consulted for fragment units only. */
+      for (i = 0; i < emit->key.fkey.num_textures; i++) {
+         if (emit->key.fkey.tex[i].compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
+            return TRUE;
+      }
+   }
+
+   if (emit->info.opcode_count[TGSI_OPCODE_IF] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SGE] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SGT] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SLE] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SLT] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SNE] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_SEQ] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_EXP] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_LOG] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_XPD] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_KILP] >= 1)
+      return TRUE;
+
+   return FALSE;
+}
+
+/* TRUE when the shader contains at least one BGNLOOP instruction. */
+static INLINE boolean needs_to_create_loop_const( struct svga_shader_emitter *emit )
+{
+   return emit->info.opcode_count[TGSI_OPCODE_BGNLOOP] >= 1;
+}
+
+/* SIN/COS/SCS constants are only wanted when SM3.0 is not in use. */
+static INLINE boolean needs_to_create_sincos_consts( struct svga_shader_emitter *emit )
+{
+   return !(emit->use_sm30 || (emit->info.opcode_count[TGSI_OPCODE_SIN] < 1 &&
+                               emit->info.opcode_count[TGSI_OPCODE_COS] < 1 &&
+                               emit->info.opcode_count[TGSI_OPCODE_SCS] < 1));
+}
+
+/* TRUE when emit->num_arl_consts is positive (see pre_parse_add_indirect). */
+static INLINE boolean needs_to_create_arl_consts( struct svga_shader_emitter *emit )
+{
+   return emit->num_arl_consts > 0;
+}
+
+/* Track the lowest (most negative) offset num seen for each ARL number. */
+static INLINE boolean
+pre_parse_add_indirect( struct svga_shader_emitter *emit,
+                        int num, int current_arl)
+{
+   int i;
+   assert(num < 0);
+
+   for (i = 0; i < emit->num_arl_consts; ++i) {
+      if (emit->arl_consts[i].arl_num == current_arl)
+         break;
+   }
+   if (emit->num_arl_consts == i) {
+      ++emit->num_arl_consts;   /* NOTE(review): never checked against arl_consts capacity */
+      emit->arl_consts[i].number = num;   /* new entry: seed it, don't compare with stale data */
+   } else if (emit->arl_consts[i].number > num) {
+      emit->arl_consts[i].number = num;
+   }
+   emit->arl_consts[i].arl_num = current_arl;
+   return TRUE;
+}
+
+/*
+ * Scan the first three source operands of insn.  Every operand that is
+ * addressed indirectly through the ADDRESS file with a negative index is
+ * reported to pre_parse_add_indirect() together with current_arl.
+ *
+ * Always returns TRUE.
+ */
+static boolean
+pre_parse_instruction( struct svga_shader_emitter *emit,
+                       const struct tgsi_full_instruction *insn,
+                       int current_arl)
+{
+   unsigned slot;
+
+   for (slot = 0; slot < 3; slot++) {
+      const struct tgsi_full_src_register *reg = &insn->FullSrcRegisters[slot];
+
+      if (!reg->SrcRegister.Indirect)
+         continue;
+
+      if (reg->SrcRegisterInd.File != TGSI_FILE_ADDRESS)
+         continue;
+
+      if (reg->SrcRegister.Index >= 0)
+         continue;
+
+      pre_parse_add_indirect(emit, reg->SrcRegister.Index, current_arl);
+   }
+
+   return TRUE;
+}
+
+/* Pre-pass over the tokens: numbers the ARL instructions and hands every
+ * instruction to pre_parse_instruction() with the running ARL count. */
+static boolean
+pre_parse_tokens( struct svga_shader_emitter *emit,
+                  const struct tgsi_token *tokens )
+{
+   struct tgsi_parse_context parse;
+   boolean ret = TRUE;
+   int current_arl = 0;
+
+   tgsi_parse_init( &parse, tokens );
+
+   while (ret && !tgsi_parse_end_of_tokens( &parse )) {
+      tgsi_parse_token( &parse );
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (parse.FullToken.FullInstruction.Instruction.Opcode ==
+             TGSI_OPCODE_ARL) {
+            ++current_arl;
+         }
+         ret = pre_parse_instruction( emit, &parse.FullToken.FullInstruction,
+                                      current_arl );
+         break;
+      default:   /* immediates and declarations need no pre-parsing */
+         break;
+      }
+   }
+   /* Pair tgsi_parse_init() with tgsi_parse_free(), as the main pass does. */
+   tgsi_parse_free( &parse );
+   return ret;
+}
+
+/* Create the helper constants the shader needs and, for fragment shaders,
+ * emit the preamble plus the two-sided-lighting / front-face setup code. */
+static boolean svga_shader_emit_helpers( struct svga_shader_emitter *emit )
+{
+   /* NOTE(review): the results of the create_*() calls are not checked
+    * here -- confirm whether they can fail. */
+   if (needs_to_create_zero( emit ))
+      create_zero_immediate( emit );
+
+   if (needs_to_create_loop_const( emit ))
+      create_loop_const( emit );
+
+   if (needs_to_create_sincos_consts( emit ))
+      create_sincos_consts( emit );
+
+   if (needs_to_create_arl_consts( emit ))
+      create_arl_consts( emit );
+
+   if (emit->unit != PIPE_SHADER_FRAGMENT)
+      return TRUE;
+
+   if (!emit_ps_preamble( emit ))
+      return FALSE;
+
+   if (emit->key.fkey.light_twoside && !emit_light_twoside( emit ))
+      return FALSE;
+
+   if (emit->emit_frontface && !emit_frontface( emit ))
+      return FALSE;
+
+   return TRUE;
+}
+
+/* Translate the TGSI token stream into SVGA3D shader instructions.
+ * Returns FALSE (after asserting) as soon as any emission step fails. */
+boolean svga_shader_emit_instructions( struct svga_shader_emitter *emit,
+                                       const struct tgsi_token *tokens )
+{
+   struct tgsi_parse_context parse;
+   boolean ret = TRUE;
+   boolean helpers_emitted = FALSE;
+   unsigned line_nr = 0;
+
+   tgsi_parse_init( &parse, tokens );
+   emit->internal_imm_count = 0;
+
+   if (emit->unit == PIPE_SHADER_VERTEX) {
+      ret = emit_vs_preamble( emit );
+      if (!ret)
+         goto done;
+   }
+
+   pre_parse_tokens(emit, tokens);
+
+   while (!tgsi_parse_end_of_tokens( &parse )) {
+      tgsi_parse_token( &parse );
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         ret = svga_emit_immediate( emit, &parse.FullToken.FullImmediate );
+         if (!ret)
+            goto done;
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         if (emit->use_sm30)
+            ret = svga_translate_decl_sm30( emit, &parse.FullToken.FullDeclaration );
+         else
+            ret = svga_translate_decl_sm20( emit, &parse.FullToken.FullDeclaration );
+         if (!ret)
+            goto done;
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if (!helpers_emitted) {
+            ret = svga_shader_emit_helpers( emit );   /* keep the failure in ret; it used to stay TRUE */
+            if (!ret)
+               goto done;
+            helpers_emitted = TRUE;
+         }
+         ret = svga_emit_instruction( emit,
+                                      line_nr++,
+                                      &parse.FullToken.FullInstruction );
+         if (!ret)
+            goto done;
+         break;
+      default:
+         break;
+      }
+
+      reset_temp_regs( emit );
+   }
+
+   /* Need to terminate the current subroutine.  Note that the
+    * hardware doesn't tolerate shaders without sub-routines
+    * terminating with RET+END.
+    */
+   if (!emit->in_main_func) {
+      ret = emit_instruction( emit, inst_token( SVGA3DOP_RET ) );
+      if (!ret)
+         goto done;
+   }
+
+   /* Need to terminate the whole shader: */
+   ret = emit_instruction( emit, inst_token( SVGA3DOP_END ) );
+
+done:
+   assert(ret);
+   tgsi_parse_free( &parse );
+   return ret;
+}
+
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
new file mode 100644
index 00000000000..59f299c1858
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -0,0 +1,299 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * VMware SVGA specific winsys interface.
+ * 
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ * 
+ * Documentation taken from the VMware SVGA DDK.
+ */
+
+#ifndef SVGA_WINSYS_H_
+#define SVGA_WINSYS_H_
+
+
+#include "svga_types.h"
+#include "svga_reg.h"
+#include "svga3d_reg.h"
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+
+
+struct svga_winsys_screen;
+struct svga_winsys_buffer;
+struct pipe_screen;
+struct pipe_context;
+struct pipe_fence_handle;
+struct pipe_texture;
+struct svga_region;
+
+
+#define SVGA_BUFFER_USAGE_PINNED  (PIPE_BUFFER_USAGE_CUSTOM << 0)
+#define SVGA_BUFFER_USAGE_WRAPPED (PIPE_BUFFER_USAGE_CUSTOM << 1)
+
+
+/** Opaque surface handle */
+struct svga_winsys_surface;
+
+/** Opaque buffer handle */
+struct svga_winsys_handle;
+
+
+/**
+ * SVGA per-context winsys interface.
+ */
+struct svga_winsys_context
+{
+   void
+   (*destroy)(struct svga_winsys_context *swc); /* NOTE(review): undocumented; presumably releases swc -- confirm */
+
+   void *       
+   (*reserve)(struct svga_winsys_context *swc, 
+	      uint32_t nr_bytes, uint32_t nr_relocs ); /* NOTE(review): presumably reserves command space; failure value undocumented */
+   
+   /**
+    * Emit a relocation for a host surface.
+    * 
+    * @param flags PIPE_BUFFER_USAGE_GPU_READ/WRITE
+    * 
+    * NOTE: Order of this call does matter. It should be the same order
+    * as relocations appear in the command buffer.
+    */
+   void
+   (*surface_relocation)(struct svga_winsys_context *swc, 
+	                 uint32 *sid, 
+	                 struct svga_winsys_surface *surface,
+	                 unsigned flags);
+   
+   /**
+    * Emit a relocation for a guest memory region.
+    * 
+    * @param flags PIPE_BUFFER_USAGE_GPU_READ/WRITE
+    * 
+    * NOTE: Order of this call does matter. It should be the same order
+    * as relocations appear in the command buffer.
+    */
+   void
+   (*region_relocation)(struct svga_winsys_context *swc, 
+	                struct SVGAGuestPtr *ptr, 
+	                struct svga_winsys_buffer *buffer,
+	                uint32 offset,
+                        unsigned flags);
+
+   void
+   (*commit)(struct svga_winsys_context *swc); /* NOTE(review): undocumented; presumably completes a reserve() -- confirm */
+   
+   enum pipe_error
+   (*flush)(struct svga_winsys_context *swc, 
+	    struct pipe_fence_handle **pfence); /* NOTE(review): undocumented; pfence looks like an optional out-parameter -- confirm */
+
+   /** 
+    * Context ID used to fill in the commands
+    * 
+    * Context IDs are arbitrary small non-negative integers,
+    * global to the entire SVGA device.
+    */
+   uint32 cid;
+};
+
+
+/**
+ * SVGA per-screen winsys interface.
+ */
+struct svga_winsys_screen
+{
+   void
+   (*destroy)(struct svga_winsys_screen *sws);
+   
+   boolean
+   (*get_cap)(struct svga_winsys_screen *sws,
+              SVGA3dDevCapIndex index,
+              SVGA3dDevCapResult *result);
+   
+   /**
+    * Create a new context.
+    *
+    * Context objects encapsulate all render state, and shader
+    * objects are per-context.
+    *
+    * Surfaces are not per-context. The same surface can be shared
+    * between multiple contexts, and surface operations can occur
+    * without a context.
+    */
+   struct svga_winsys_context *
+   (*context_create)(struct svga_winsys_screen *sws);
+   
+   
+   /**
+    * This creates a "surface" object in the SVGA3D device,
+    * and returns the surface ID (sid). Surfaces are generic
+    * containers for host VRAM objects like textures, vertex
+    * buffers, and depth/stencil buffers.
+    *
+    * Surfaces are hierarchical:
+    *
+    * - Surface may have multiple faces (for cube maps)
+    *
+    * - Each face has a list of mipmap levels
+    *
+    * - Each mipmap image may have multiple volume
+    *   slices, if the image is three dimensional.
+    *
+    * - Each slice is a 2D array of 'blocks'
+    *
+    * - Each block may be one or more pixels.
+    *   (Usually 1, more for DXT or YUV formats.)
+    *
+    * Surfaces are generic host VRAM objects. The SVGA3D device
+    * may optimize surfaces according to the format they were
+    * created with, but this format does not limit the ways in
+    * which the surface may be used. For example, a depth surface
+    * can be used as a texture, or a floating point image may
+    * be used as a vertex buffer. Some surface usages may be
+    * lower performance, due to software emulation, but any
+    * usage should work with any surface.
+    */
+   struct svga_winsys_surface *
+   (*surface_create)(struct svga_winsys_screen *sws,
+                     SVGA3dSurfaceFlags flags,
+                     SVGA3dSurfaceFormat format,
+                     SVGA3dSize size,
+                     uint32 numFaces,
+                     uint32 numMipLevels);
+
+   /**
+    * Whether this surface is sitting in a validate list
+    */
+   boolean
+   (*surface_is_flushed)(struct svga_winsys_screen *sws,
+                         struct svga_winsys_surface *surface);
+
+   /**
+    * Reference a SVGA3D surface object. This allows sharing of a
+    * surface between different objects.
+    */
+   void 
+   (*surface_reference)(struct svga_winsys_screen *sws,
+			struct svga_winsys_surface **pdst,
+			struct svga_winsys_surface *src);
+
+   /**
+    * Buffer management. Buffer attributes are mostly fixed over its lifetime.
+    *
+    * Remember that gallium gets to choose the interface it needs, and the
+    * window systems must then implement that interface (rather than the
+    * other way around...).
+    *
+    * usage is a bitmask of PIPE_BUFFER_USAGE_PIXEL/VERTEX/INDEX/CONSTANT. This
+    * usage argument is only an optimization hint, not a guarantee, therefore 
+    * proper behavior must be observed in all circumstances.
+    *
+    * alignment indicates the client's alignment requirements, eg for
+    * SSE instructions.
+    */
+   struct svga_winsys_buffer *
+   (*buffer_create)( struct svga_winsys_screen *sws, 
+	             unsigned alignment, 
+	             unsigned usage,
+	             unsigned size );
+
+   /** 
+    * Map the entire data store of a buffer object into the client's address.
+    * usage is a bitmask of:
+    * - PIPE_BUFFER_USAGE_CPU_READ/WRITE
+    * - PIPE_BUFFER_USAGE_DONTBLOCK
+    * - PIPE_BUFFER_USAGE_UNSYNCHRONIZED
+    */
+   void *
+   (*buffer_map)( struct svga_winsys_screen *sws, 
+	          struct svga_winsys_buffer *buf,
+		  unsigned usage );
+   
+   void 
+   (*buffer_unmap)( struct svga_winsys_screen *sws, 
+                    struct svga_winsys_buffer *buf );
+
+   void 
+   (*buffer_destroy)( struct svga_winsys_screen *sws,
+	              struct svga_winsys_buffer *buf );
+
+
+   /**
+    * Reference a fence object.
+    */
+   void
+   (*fence_reference)( struct svga_winsys_screen *sws,
+                       struct pipe_fence_handle **pdst,
+                       struct pipe_fence_handle *src );
+
+   /**
+    * Checks whether the fence has been signalled.
+    * \param flag  driver-specific meaning
+    * \return zero on success.
+    */
+   int (*fence_signalled)( struct svga_winsys_screen *sws,
+                           struct pipe_fence_handle *fence,
+                           unsigned flag );
+
+   /**
+    * Wait for the fence to finish.
+    * \param flag  driver-specific meaning
+    * \return zero on success.
+    */
+   int (*fence_finish)( struct svga_winsys_screen *sws,
+                        struct pipe_fence_handle *fence,
+                        unsigned flag );
+
+};
+
+
+struct pipe_context *
+svga_context_create(struct pipe_screen *screen);
+
+struct pipe_screen *
+svga_screen_create(struct svga_winsys_screen *sws);
+
+struct svga_winsys_screen *
+svga_winsys_screen(struct pipe_screen *screen);
+
+struct pipe_buffer *
+svga_screen_buffer_wrap_surface(struct pipe_screen *screen,
+				enum SVGA3dSurfaceFormat format,
+				struct svga_winsys_surface *srf);
+
+struct svga_winsys_surface *
+svga_screen_texture_get_winsys_surface(struct pipe_texture *texture);
+struct svga_winsys_surface *
+svga_screen_buffer_get_winsys_surface(struct pipe_buffer *buffer);
+
+boolean
+svga_screen_buffer_from_texture(struct pipe_texture *texture,
+				struct pipe_buffer **buffer,
+				unsigned *stride);
+
+#endif /* SVGA_WINSYS_H_ */
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.c b/src/gallium/drivers/svga/svgadump/svga_dump.c
new file mode 100644
index 00000000000..910afa25287
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.c
@@ -0,0 +1,1736 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * Dump SVGA commands.
+ *
+ * Generated automatically from svga3d_reg.h by svga_dump.py.
+ */
+
+#include "svga_types.h"
+#include "svga_shader_dump.h"
+#include "svga3d_reg.h"
+
+#include "util/u_debug.h"
+#include "svga_dump.h"
+
+static void
+dump_SVGA3dVertexDecl(const SVGA3dVertexDecl *cmd) /* prints every field of *cmd with debug_printf, one per line */
+{
+   switch((*cmd).identity.type) { /* known values are printed by their enumerator name */
+   case SVGA3D_DECLTYPE_FLOAT1:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT1\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT2:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT2\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT3:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT3\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT4:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT4\n");
+      break;
+   case SVGA3D_DECLTYPE_D3DCOLOR:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_D3DCOLOR\n");
+      break;
+   case SVGA3D_DECLTYPE_UBYTE4:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UBYTE4\n");
+      break;
+   case SVGA3D_DECLTYPE_SHORT2:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT2\n");
+      break;
+   case SVGA3D_DECLTYPE_SHORT4:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT4\n");
+      break;
+   case SVGA3D_DECLTYPE_UBYTE4N:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UBYTE4N\n");
+      break;
+   case SVGA3D_DECLTYPE_SHORT2N:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT2N\n");
+      break;
+   case SVGA3D_DECLTYPE_SHORT4N:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT4N\n");
+      break;
+   case SVGA3D_DECLTYPE_USHORT2N:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_USHORT2N\n");
+      break;
+   case SVGA3D_DECLTYPE_USHORT4N:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_USHORT4N\n");
+      break;
+   case SVGA3D_DECLTYPE_UDEC3:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UDEC3\n");
+      break;
+   case SVGA3D_DECLTYPE_DEC3N:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_DEC3N\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT16_2:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT16_2\n");
+      break;
+   case SVGA3D_DECLTYPE_FLOAT16_4:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT16_4\n");
+      break;
+   case SVGA3D_DECLTYPE_MAX:
+      debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_MAX\n");
+      break;
+   default: /* unrecognized value: print it numerically */
+      debug_printf("\t\t.identity.type = %i\n", (*cmd).identity.type);
+      break;
+   }
+   switch((*cmd).identity.method) { /* same scheme for the declaration method */
+   case SVGA3D_DECLMETHOD_DEFAULT:
+      debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_DEFAULT\n");
+      break;
+   case SVGA3D_DECLMETHOD_PARTIALU:
+      debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_PARTIALU\n");
+      break;
+   case SVGA3D_DECLMETHOD_PARTIALV:
+      debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_PARTIALV\n");
+      break;
+   case SVGA3D_DECLMETHOD_CROSSUV:
+      debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_CROSSUV\n");
+      break;
+   case SVGA3D_DECLMETHOD_UV:
+      debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_UV\n");
+      break;
+   case SVGA3D_DECLMETHOD_LOOKUP:
+      debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_LOOKUP\n");
+      break;
+   case SVGA3D_DECLMETHOD_LOOKUPPRESAMPLED:
+      debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_LOOKUPPRESAMPLED\n");
+      break;
+   default: /* unrecognized value: print it numerically */
+      debug_printf("\t\t.identity.method = %i\n", (*cmd).identity.method);
+      break;
+   }
+   switch((*cmd).identity.usage) { /* same scheme for the declaration usage */
+   case SVGA3D_DECLUSAGE_POSITION:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_POSITION\n");
+      break;
+   case SVGA3D_DECLUSAGE_BLENDWEIGHT:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BLENDWEIGHT\n");
+      break;
+   case SVGA3D_DECLUSAGE_BLENDINDICES:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BLENDINDICES\n");
+      break;
+   case SVGA3D_DECLUSAGE_NORMAL:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_NORMAL\n");
+      break;
+   case SVGA3D_DECLUSAGE_PSIZE:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_PSIZE\n");
+      break;
+   case SVGA3D_DECLUSAGE_TEXCOORD:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TEXCOORD\n");
+      break;
+   case SVGA3D_DECLUSAGE_TANGENT:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TANGENT\n");
+      break;
+   case SVGA3D_DECLUSAGE_BINORMAL:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BINORMAL\n");
+      break;
+   case SVGA3D_DECLUSAGE_TESSFACTOR:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TESSFACTOR\n");
+      break;
+   case SVGA3D_DECLUSAGE_POSITIONT:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_POSITIONT\n");
+      break;
+   case SVGA3D_DECLUSAGE_COLOR:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_COLOR\n");
+      break;
+   case SVGA3D_DECLUSAGE_FOG:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_FOG\n");
+      break;
+   case SVGA3D_DECLUSAGE_DEPTH:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_DEPTH\n");
+      break;
+   case SVGA3D_DECLUSAGE_SAMPLE:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_SAMPLE\n");
+      break;
+   case SVGA3D_DECLUSAGE_MAX:
+      debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_MAX\n");
+      break;
+   default: /* unrecognized value: print it numerically */
+      debug_printf("\t\t.identity.usage = %i\n", (*cmd).identity.usage);
+      break;
+   }
+   debug_printf("\t\t.identity.usageIndex = %u\n", (*cmd).identity.usageIndex);
+   debug_printf("\t\t.array.surfaceId = %u\n", (*cmd).array.surfaceId);
+   debug_printf("\t\t.array.offset = %u\n", (*cmd).array.offset);
+   debug_printf("\t\t.array.stride = %u\n", (*cmd).array.stride);
+   debug_printf("\t\t.rangeHint.first = %u\n", (*cmd).rangeHint.first);
+   debug_printf("\t\t.rangeHint.last = %u\n", (*cmd).rangeHint.last);
+}
+
+static void
+dump_SVGA3dTextureState(const SVGA3dTextureState *cmd) /* prints stage, name, value and floatValue with debug_printf */
+{
+   debug_printf("\t\t.stage = %u\n", (*cmd).stage);
+   switch((*cmd).name) { /* known state names are printed by their enumerator name */
+   case SVGA3D_TS_INVALID:
+      debug_printf("\t\t.name = SVGA3D_TS_INVALID\n");
+      break;
+   case SVGA3D_TS_BIND_TEXTURE:
+      debug_printf("\t\t.name = SVGA3D_TS_BIND_TEXTURE\n");
+      break;
+   case SVGA3D_TS_COLOROP:
+      debug_printf("\t\t.name = SVGA3D_TS_COLOROP\n");
+      break;
+   case SVGA3D_TS_COLORARG1:
+      debug_printf("\t\t.name = SVGA3D_TS_COLORARG1\n");
+      break;
+   case SVGA3D_TS_COLORARG2:
+      debug_printf("\t\t.name = SVGA3D_TS_COLORARG2\n");
+      break;
+   case SVGA3D_TS_ALPHAOP:
+      debug_printf("\t\t.name = SVGA3D_TS_ALPHAOP\n");
+      break;
+   case SVGA3D_TS_ALPHAARG1:
+      debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG1\n");
+      break;
+   case SVGA3D_TS_ALPHAARG2:
+      debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG2\n");
+      break;
+   case SVGA3D_TS_ADDRESSU:
+      debug_printf("\t\t.name = SVGA3D_TS_ADDRESSU\n");
+      break;
+   case SVGA3D_TS_ADDRESSV:
+      debug_printf("\t\t.name = SVGA3D_TS_ADDRESSV\n");
+      break;
+   case SVGA3D_TS_MIPFILTER:
+      debug_printf("\t\t.name = SVGA3D_TS_MIPFILTER\n");
+      break;
+   case SVGA3D_TS_MAGFILTER:
+      debug_printf("\t\t.name = SVGA3D_TS_MAGFILTER\n");
+      break;
+   case SVGA3D_TS_MINFILTER:
+      debug_printf("\t\t.name = SVGA3D_TS_MINFILTER\n");
+      break;
+   case SVGA3D_TS_BORDERCOLOR:
+      debug_printf("\t\t.name = SVGA3D_TS_BORDERCOLOR\n");
+      break;
+   case SVGA3D_TS_TEXCOORDINDEX:
+      debug_printf("\t\t.name = SVGA3D_TS_TEXCOORDINDEX\n");
+      break;
+   case SVGA3D_TS_TEXTURETRANSFORMFLAGS:
+      debug_printf("\t\t.name = SVGA3D_TS_TEXTURETRANSFORMFLAGS\n");
+      break;
+   case SVGA3D_TS_TEXCOORDGEN:
+      debug_printf("\t\t.name = SVGA3D_TS_TEXCOORDGEN\n");
+      break;
+   case SVGA3D_TS_BUMPENVMAT00:
+      debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT00\n");
+      break;
+   case SVGA3D_TS_BUMPENVMAT01:
+      debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT01\n");
+      break;
+   case SVGA3D_TS_BUMPENVMAT10:
+      debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT10\n");
+      break;
+   case SVGA3D_TS_BUMPENVMAT11:
+      debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT11\n");
+      break;
+   case SVGA3D_TS_TEXTURE_MIPMAP_LEVEL:
+      debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_MIPMAP_LEVEL\n");
+      break;
+   case SVGA3D_TS_TEXTURE_LOD_BIAS:
+      debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_LOD_BIAS\n");
+      break;
+   case SVGA3D_TS_TEXTURE_ANISOTROPIC_LEVEL:
+      debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_ANISOTROPIC_LEVEL\n");
+      break;
+   case SVGA3D_TS_ADDRESSW:
+      debug_printf("\t\t.name = SVGA3D_TS_ADDRESSW\n");
+      break;
+   case SVGA3D_TS_GAMMA:
+      debug_printf("\t\t.name = SVGA3D_TS_GAMMA\n");
+      break;
+   case SVGA3D_TS_BUMPENVLSCALE:
+      debug_printf("\t\t.name = SVGA3D_TS_BUMPENVLSCALE\n");
+      break;
+   case SVGA3D_TS_BUMPENVLOFFSET:
+      debug_printf("\t\t.name = SVGA3D_TS_BUMPENVLOFFSET\n");
+      break;
+   case SVGA3D_TS_COLORARG0:
+      debug_printf("\t\t.name = SVGA3D_TS_COLORARG0\n");
+      break;
+   case SVGA3D_TS_ALPHAARG0:
+      debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG0\n");
+      break;
+   case SVGA3D_TS_MAX:
+      debug_printf("\t\t.name = SVGA3D_TS_MAX\n");
+      break;
+   default: /* unrecognized value: print it numerically */
+      debug_printf("\t\t.name = %i\n", (*cmd).name);
+      break;
+   }
+   debug_printf("\t\t.value = %u\n", (*cmd).value); /* both value and floatValue are always printed */
+   debug_printf("\t\t.floatValue = %f\n", (*cmd).floatValue);
+}
+
+/* Print each field of an SVGA3dCopyBox on its own debug line. */
+static void dump_SVGA3dCopyBox(const SVGA3dCopyBox *cmd)
+{
+   debug_printf("\t\t.x = %u\n", cmd->x);
+   debug_printf("\t\t.y = %u\n", cmd->y);
+   debug_printf("\t\t.z = %u\n", cmd->z);
+   debug_printf("\t\t.w = %u\n", cmd->w);
+   debug_printf("\t\t.h = %u\n", cmd->h);
+   debug_printf("\t\t.d = %u\n", cmd->d);
+   debug_printf("\t\t.srcx = %u\n", cmd->srcx);
+   debug_printf("\t\t.srcy = %u\n", cmd->srcy);
+   debug_printf("\t\t.srcz = %u\n", cmd->srcz);
+}
+
+/* Print the context id, plane index and the four plane coefficients. */
+static void dump_SVGA3dCmdSetClipPlane(const SVGA3dCmdSetClipPlane *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   debug_printf("\t\t.index = %u\n", cmd->index);
+   debug_printf("\t\t.plane[0] = %f\n", cmd->plane[0]);
+   debug_printf("\t\t.plane[1] = %f\n", cmd->plane[1]);
+   debug_printf("\t\t.plane[2] = %f\n", cmd->plane[2]);
+   debug_printf("\t\t.plane[3] = %f\n", cmd->plane[3]);
+}
+
+/* Print a WaitForQuery command; unknown query types are shown numerically. */
+static void dump_SVGA3dCmdWaitForQuery(const SVGA3dCmdWaitForQuery *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_QUERYTYPE_OCCLUSION:
+      debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
+      break;
+   case SVGA3D_QUERYTYPE_MAX:
+      debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+   debug_printf("\t\t.guestResult.gmrId = %u\n", cmd->guestResult.gmrId);
+   debug_printf("\t\t.guestResult.offset = %u\n", cmd->guestResult.offset);
+}
+
+/* Print a SetRenderTarget command; any type other than depth or stencil is shown as COLOR<n>. */
+static void dump_SVGA3dCmdSetRenderTarget(const SVGA3dCmdSetRenderTarget *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_RT_DEPTH:
+      debug_printf("\t\t.type = SVGA3D_RT_DEPTH\n");
+      break;
+   case SVGA3D_RT_STENCIL:
+      debug_printf("\t\t.type = SVGA3D_RT_STENCIL\n");
+      break;
+   default:
+      debug_printf("\t\t.type = SVGA3D_RT_COLOR%u\n", cmd->type - SVGA3D_RT_COLOR0);
+      break;
+   }
+   debug_printf("\t\t.target.sid = %u\n", cmd->target.sid);
+   debug_printf("\t\t.target.face = %u\n", cmd->target.face);
+   debug_printf("\t\t.target.mipmap = %u\n", cmd->target.mipmap);
+}
+
+/* Print the cid member of *cmd via debug_printf(); no other member is read here. */
+static void dump_SVGA3dCmdSetTextureState(const SVGA3dCmdSetTextureState *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+}
+
+/* Print the sid/face/mipmap members of both src and dest via debug_printf(), one per line. */
+static void dump_SVGA3dCmdSurfaceCopy(const SVGA3dCmdSurfaceCopy *cmd)
+{
+   debug_printf("\t\t.src.sid = %u\n", cmd->src.sid);
+   debug_printf("\t\t.src.face = %u\n", cmd->src.face);
+   debug_printf("\t\t.src.mipmap = %u\n", cmd->src.mipmap);
+   debug_printf("\t\t.dest.sid = %u\n", cmd->dest.sid);
+   debug_printf("\t\t.dest.face = %u\n", cmd->dest.face);
+   debug_printf("\t\t.dest.mipmap = %u\n", cmd->dest.mipmap);
+}
+
+/* Print cid, face (by name when recognised, else as an integer) and the material.* members. */
+static void dump_SVGA3dCmdSetMaterial(const SVGA3dCmdSetMaterial *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->face) {
+   case SVGA3D_FACE_INVALID:
+      debug_printf("\t\t.face = SVGA3D_FACE_INVALID\n");
+      break;
+   case SVGA3D_FACE_NONE:
+      debug_printf("\t\t.face = SVGA3D_FACE_NONE\n");
+      break;
+   case SVGA3D_FACE_FRONT:
+      debug_printf("\t\t.face = SVGA3D_FACE_FRONT\n");
+      break;
+   case SVGA3D_FACE_BACK:
+      debug_printf("\t\t.face = SVGA3D_FACE_BACK\n");
+      break;
+   case SVGA3D_FACE_FRONT_BACK:
+      debug_printf("\t\t.face = SVGA3D_FACE_FRONT_BACK\n");
+      break;
+   case SVGA3D_FACE_MAX:
+      debug_printf("\t\t.face = SVGA3D_FACE_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.face = %i\n", cmd->face);
+      break;
+   }
+   debug_printf("\t\t.material.diffuse[0] = %f\n", cmd->material.diffuse[0]);
+   debug_printf("\t\t.material.diffuse[1] = %f\n", cmd->material.diffuse[1]);
+   debug_printf("\t\t.material.diffuse[2] = %f\n", cmd->material.diffuse[2]);
+   debug_printf("\t\t.material.diffuse[3] = %f\n", cmd->material.diffuse[3]);
+   debug_printf("\t\t.material.ambient[0] = %f\n", cmd->material.ambient[0]);
+   debug_printf("\t\t.material.ambient[1] = %f\n", cmd->material.ambient[1]);
+   debug_printf("\t\t.material.ambient[2] = %f\n", cmd->material.ambient[2]);
+   debug_printf("\t\t.material.ambient[3] = %f\n", cmd->material.ambient[3]);
+   debug_printf("\t\t.material.specular[0] = %f\n", cmd->material.specular[0]);
+   debug_printf("\t\t.material.specular[1] = %f\n", cmd->material.specular[1]);
+   debug_printf("\t\t.material.specular[2] = %f\n", cmd->material.specular[2]);
+   debug_printf("\t\t.material.specular[3] = %f\n", cmd->material.specular[3]);
+   debug_printf("\t\t.material.emissive[0] = %f\n", cmd->material.emissive[0]);
+   debug_printf("\t\t.material.emissive[1] = %f\n", cmd->material.emissive[1]);
+   debug_printf("\t\t.material.emissive[2] = %f\n", cmd->material.emissive[2]);
+   debug_printf("\t\t.material.emissive[3] = %f\n", cmd->material.emissive[3]);
+   debug_printf("\t\t.material.shininess = %f\n", cmd->material.shininess);
+}
+
+/* Print cid, index, data.type (by name when recognised, else as an integer) and the other data.* members. */
+static void dump_SVGA3dCmdSetLightData(const SVGA3dCmdSetLightData *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   debug_printf("\t\t.index = %u\n", cmd->index);
+   switch (cmd->data.type) {
+   case SVGA3D_LIGHTTYPE_INVALID:
+      debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_INVALID\n");
+      break;
+   case SVGA3D_LIGHTTYPE_POINT:
+      debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_POINT\n");
+      break;
+   case SVGA3D_LIGHTTYPE_SPOT1:
+      debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_SPOT1\n");
+      break;
+   case SVGA3D_LIGHTTYPE_SPOT2:
+      debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_SPOT2\n");
+      break;
+   case SVGA3D_LIGHTTYPE_DIRECTIONAL:
+      debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_DIRECTIONAL\n");
+      break;
+   case SVGA3D_LIGHTTYPE_MAX:
+      debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.data.type = %i\n", cmd->data.type);
+      break;
+   }
+   debug_printf("\t\t.data.inWorldSpace = %u\n", cmd->data.inWorldSpace);
+   debug_printf("\t\t.data.diffuse[0] = %f\n", cmd->data.diffuse[0]);
+   debug_printf("\t\t.data.diffuse[1] = %f\n", cmd->data.diffuse[1]);
+   debug_printf("\t\t.data.diffuse[2] = %f\n", cmd->data.diffuse[2]);
+   debug_printf("\t\t.data.diffuse[3] = %f\n", cmd->data.diffuse[3]);
+   debug_printf("\t\t.data.specular[0] = %f\n", cmd->data.specular[0]);
+   debug_printf("\t\t.data.specular[1] = %f\n", cmd->data.specular[1]);
+   debug_printf("\t\t.data.specular[2] = %f\n", cmd->data.specular[2]);
+   debug_printf("\t\t.data.specular[3] = %f\n", cmd->data.specular[3]);
+   debug_printf("\t\t.data.ambient[0] = %f\n", cmd->data.ambient[0]);
+   debug_printf("\t\t.data.ambient[1] = %f\n", cmd->data.ambient[1]);
+   debug_printf("\t\t.data.ambient[2] = %f\n", cmd->data.ambient[2]);
+   debug_printf("\t\t.data.ambient[3] = %f\n", cmd->data.ambient[3]);
+   debug_printf("\t\t.data.position[0] = %f\n", cmd->data.position[0]);
+   debug_printf("\t\t.data.position[1] = %f\n", cmd->data.position[1]);
+   debug_printf("\t\t.data.position[2] = %f\n", cmd->data.position[2]);
+   debug_printf("\t\t.data.position[3] = %f\n", cmd->data.position[3]);
+   debug_printf("\t\t.data.direction[0] = %f\n", cmd->data.direction[0]);
+   debug_printf("\t\t.data.direction[1] = %f\n", cmd->data.direction[1]);
+   debug_printf("\t\t.data.direction[2] = %f\n", cmd->data.direction[2]);
+   debug_printf("\t\t.data.direction[3] = %f\n", cmd->data.direction[3]);
+   debug_printf("\t\t.data.range = %f\n", cmd->data.range);
+   debug_printf("\t\t.data.falloff = %f\n", cmd->data.falloff);
+   debug_printf("\t\t.data.attenuation0 = %f\n", cmd->data.attenuation0);
+   debug_printf("\t\t.data.attenuation1 = %f\n", cmd->data.attenuation1);
+   debug_printf("\t\t.data.attenuation2 = %f\n", cmd->data.attenuation2);
+   debug_printf("\t\t.data.theta = %f\n", cmd->data.theta);
+   debug_printf("\t\t.data.phi = %f\n", cmd->data.phi);
+}
+
+/* Print cid and the rect.x/y/w/h members of *cmd via debug_printf(), one per line. */
+static void dump_SVGA3dCmdSetViewport(const SVGA3dCmdSetViewport *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   debug_printf("\t\t.rect.x = %u\n", cmd->rect.x);
+   debug_printf("\t\t.rect.y = %u\n", cmd->rect.y);
+   debug_printf("\t\t.rect.w = %u\n", cmd->rect.w);
+   debug_printf("\t\t.rect.h = %u\n", cmd->rect.h);
+}
+
+/* Print cid and the rect.x/y/w/h members of *cmd via debug_printf(), one per line. */
+static void dump_SVGA3dCmdSetScissorRect(const SVGA3dCmdSetScissorRect *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   debug_printf("\t\t.rect.x = %u\n", cmd->rect.x);
+   debug_printf("\t\t.rect.y = %u\n", cmd->rect.y);
+   debug_printf("\t\t.rect.w = %u\n", cmd->rect.w);
+   debug_printf("\t\t.rect.h = %u\n", cmd->rect.h);
+}
+
+/* Print the x/y, w/h and srcx/srcy members of *cmd via debug_printf(), one per line. */
+static void dump_SVGA3dCopyRect(const SVGA3dCopyRect *cmd)
+{
+   debug_printf("\t\t.x = %u\n", cmd->x);
+   debug_printf("\t\t.y = %u\n", cmd->y);
+   debug_printf("\t\t.w = %u\n", cmd->w);
+   debug_printf("\t\t.h = %u\n", cmd->h);
+   debug_printf("\t\t.srcx = %u\n", cmd->srcx);
+   debug_printf("\t\t.srcy = %u\n", cmd->srcy);
+}
+
+/* Print cid, type (by name when recognised, else as an integer) and shid via debug_printf(). */
+static void dump_SVGA3dCmdSetShader(const SVGA3dCmdSetShader *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_SHADERTYPE_COMPILED_DX8:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+      break;
+   case SVGA3D_SHADERTYPE_VS:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+      break;
+   case SVGA3D_SHADERTYPE_PS:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+      break;
+   case SVGA3D_SHADERTYPE_MAX:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+   debug_printf("\t\t.shid = %u\n", cmd->shid);
+}
+
+/* Print cid, type (by name when recognised, else as an integer) and guestResult.gmrId/offset. */
+static void dump_SVGA3dCmdEndQuery(const SVGA3dCmdEndQuery *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_QUERYTYPE_OCCLUSION:
+      debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
+      break;
+   case SVGA3D_QUERYTYPE_MAX:
+      debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+   debug_printf("\t\t.guestResult.gmrId = %u\n", cmd->guestResult.gmrId);
+   debug_printf("\t\t.guestResult.offset = %u\n", cmd->guestResult.offset);
+}
+
+/* Print the width, height and depth members of *cmd via debug_printf(), one per line. */
+static void dump_SVGA3dSize(const SVGA3dSize *cmd)
+{
+   debug_printf("\t\t.width = %u\n", cmd->width);
+   debug_printf("\t\t.height = %u\n", cmd->height);
+   debug_printf("\t\t.depth = %u\n", cmd->depth);
+}
+
+/* Print the sid member of *cmd via debug_printf(). */
+static void dump_SVGA3dCmdDestroySurface(const SVGA3dCmdDestroySurface *cmd)
+{
+   debug_printf("\t\t.sid = %u\n", cmd->sid);
+}
+
+/* Print the cid member of *cmd via debug_printf(). */
+static void dump_SVGA3dCmdDefineContext(const SVGA3dCmdDefineContext *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+}
+
+/* Print the x, y, w and h members of *cmd via debug_printf(), one per line. */
+static void dump_SVGA3dRect(const SVGA3dRect *cmd)
+{
+   debug_printf("\t\t.x = %u\n", cmd->x);
+   debug_printf("\t\t.y = %u\n", cmd->y);
+   debug_printf("\t\t.w = %u\n", cmd->w);
+   debug_printf("\t\t.h = %u\n", cmd->h);
+}
+
+/* Print cid and type (by name when recognised, else as an integer) via debug_printf(). */
+static void dump_SVGA3dCmdBeginQuery(const SVGA3dCmdBeginQuery *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_QUERYTYPE_OCCLUSION:
+      debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
+      break;
+   case SVGA3D_QUERYTYPE_MAX:
+      debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+}
+
+/* Print state (by name when recognised, else as an integer), then both uintValue and floatValue. */
+static void dump_SVGA3dRenderState(const SVGA3dRenderState *cmd)
+{
+   switch (cmd->state) {
+   case SVGA3D_RS_INVALID:
+      debug_printf("\t\t.state = SVGA3D_RS_INVALID\n");
+      break;
+   case SVGA3D_RS_ZENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_ZENABLE\n");
+      break;
+   case SVGA3D_RS_ZWRITEENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_ZWRITEENABLE\n");
+      break;
+   case SVGA3D_RS_ALPHATESTENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_ALPHATESTENABLE\n");
+      break;
+   case SVGA3D_RS_DITHERENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_DITHERENABLE\n");
+      break;
+   case SVGA3D_RS_BLENDENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_BLENDENABLE\n");
+      break;
+   case SVGA3D_RS_FOGENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_FOGENABLE\n");
+      break;
+   case SVGA3D_RS_SPECULARENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_SPECULARENABLE\n");
+      break;
+   case SVGA3D_RS_STENCILENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_STENCILENABLE\n");
+      break;
+   case SVGA3D_RS_LIGHTINGENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_LIGHTINGENABLE\n");
+      break;
+   case SVGA3D_RS_NORMALIZENORMALS:
+      debug_printf("\t\t.state = SVGA3D_RS_NORMALIZENORMALS\n");
+      break;
+   case SVGA3D_RS_POINTSPRITEENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_POINTSPRITEENABLE\n");
+      break;
+   case SVGA3D_RS_POINTSCALEENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_POINTSCALEENABLE\n");
+      break;
+   case SVGA3D_RS_STENCILREF:
+      debug_printf("\t\t.state = SVGA3D_RS_STENCILREF\n");
+      break;
+   case SVGA3D_RS_STENCILMASK:
+      debug_printf("\t\t.state = SVGA3D_RS_STENCILMASK\n");
+      break;
+   case SVGA3D_RS_STENCILWRITEMASK:
+      debug_printf("\t\t.state = SVGA3D_RS_STENCILWRITEMASK\n");
+      break;
+   case SVGA3D_RS_FOGSTART:
+      debug_printf("\t\t.state = SVGA3D_RS_FOGSTART\n");
+      break;
+   case SVGA3D_RS_FOGEND:
+      debug_printf("\t\t.state = SVGA3D_RS_FOGEND\n");
+      break;
+   case SVGA3D_RS_FOGDENSITY:
+      debug_printf("\t\t.state = SVGA3D_RS_FOGDENSITY\n");
+      break;
+   case SVGA3D_RS_POINTSIZE:
+      debug_printf("\t\t.state = SVGA3D_RS_POINTSIZE\n");
+      break;
+   case SVGA3D_RS_POINTSIZEMIN:
+      debug_printf("\t\t.state = SVGA3D_RS_POINTSIZEMIN\n");
+      break;
+   case SVGA3D_RS_POINTSIZEMAX:
+      debug_printf("\t\t.state = SVGA3D_RS_POINTSIZEMAX\n");
+      break;
+   case SVGA3D_RS_POINTSCALE_A:
+      debug_printf("\t\t.state = SVGA3D_RS_POINTSCALE_A\n");
+      break;
+   case SVGA3D_RS_POINTSCALE_B:
+      debug_printf("\t\t.state = SVGA3D_RS_POINTSCALE_B\n");
+      break;
+   case SVGA3D_RS_POINTSCALE_C:
+      debug_printf("\t\t.state = SVGA3D_RS_POINTSCALE_C\n");
+      break;
+   case SVGA3D_RS_FOGCOLOR:
+      debug_printf("\t\t.state = SVGA3D_RS_FOGCOLOR\n");
+      break;
+   case SVGA3D_RS_AMBIENT:
+      debug_printf("\t\t.state = SVGA3D_RS_AMBIENT\n");
+      break;
+   case SVGA3D_RS_CLIPPLANEENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_CLIPPLANEENABLE\n");
+      break;
+   case SVGA3D_RS_FOGMODE:
+      debug_printf("\t\t.state = SVGA3D_RS_FOGMODE\n");
+      break;
+   case SVGA3D_RS_FILLMODE:
+      debug_printf("\t\t.state = SVGA3D_RS_FILLMODE\n");
+      break;
+   case SVGA3D_RS_SHADEMODE:
+      debug_printf("\t\t.state = SVGA3D_RS_SHADEMODE\n");
+      break;
+   case SVGA3D_RS_LINEPATTERN:
+      debug_printf("\t\t.state = SVGA3D_RS_LINEPATTERN\n");
+      break;
+   case SVGA3D_RS_SRCBLEND:
+      debug_printf("\t\t.state = SVGA3D_RS_SRCBLEND\n");
+      break;
+   case SVGA3D_RS_DSTBLEND:
+      debug_printf("\t\t.state = SVGA3D_RS_DSTBLEND\n");
+      break;
+   case SVGA3D_RS_BLENDEQUATION:
+      debug_printf("\t\t.state = SVGA3D_RS_BLENDEQUATION\n");
+      break;
+   case SVGA3D_RS_CULLMODE:
+      debug_printf("\t\t.state = SVGA3D_RS_CULLMODE\n");
+      break;
+   case SVGA3D_RS_ZFUNC:
+      debug_printf("\t\t.state = SVGA3D_RS_ZFUNC\n");
+      break;
+   case SVGA3D_RS_ALPHAFUNC:
+      debug_printf("\t\t.state = SVGA3D_RS_ALPHAFUNC\n");
+      break;
+   case SVGA3D_RS_STENCILFUNC:
+      debug_printf("\t\t.state = SVGA3D_RS_STENCILFUNC\n");
+      break;
+   case SVGA3D_RS_STENCILFAIL:
+      debug_printf("\t\t.state = SVGA3D_RS_STENCILFAIL\n");
+      break;
+   case SVGA3D_RS_STENCILZFAIL:
+      debug_printf("\t\t.state = SVGA3D_RS_STENCILZFAIL\n");
+      break;
+   case SVGA3D_RS_STENCILPASS:
+      debug_printf("\t\t.state = SVGA3D_RS_STENCILPASS\n");
+      break;
+   case SVGA3D_RS_ALPHAREF:
+      debug_printf("\t\t.state = SVGA3D_RS_ALPHAREF\n");
+      break;
+   case SVGA3D_RS_FRONTWINDING:
+      debug_printf("\t\t.state = SVGA3D_RS_FRONTWINDING\n");
+      break;
+   case SVGA3D_RS_COORDINATETYPE:
+      debug_printf("\t\t.state = SVGA3D_RS_COORDINATETYPE\n");
+      break;
+   case SVGA3D_RS_ZBIAS:
+      debug_printf("\t\t.state = SVGA3D_RS_ZBIAS\n");
+      break;
+   case SVGA3D_RS_RANGEFOGENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_RANGEFOGENABLE\n");
+      break;
+   case SVGA3D_RS_COLORWRITEENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE\n");
+      break;
+   case SVGA3D_RS_VERTEXMATERIALENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_VERTEXMATERIALENABLE\n");
+      break;
+   case SVGA3D_RS_DIFFUSEMATERIALSOURCE:
+      debug_printf("\t\t.state = SVGA3D_RS_DIFFUSEMATERIALSOURCE\n");
+      break;
+   case SVGA3D_RS_SPECULARMATERIALSOURCE:
+      debug_printf("\t\t.state = SVGA3D_RS_SPECULARMATERIALSOURCE\n");
+      break;
+   case SVGA3D_RS_AMBIENTMATERIALSOURCE:
+      debug_printf("\t\t.state = SVGA3D_RS_AMBIENTMATERIALSOURCE\n");
+      break;
+   case SVGA3D_RS_EMISSIVEMATERIALSOURCE:
+      debug_printf("\t\t.state = SVGA3D_RS_EMISSIVEMATERIALSOURCE\n");
+      break;
+   case SVGA3D_RS_TEXTUREFACTOR:
+      debug_printf("\t\t.state = SVGA3D_RS_TEXTUREFACTOR\n");
+      break;
+   case SVGA3D_RS_LOCALVIEWER:
+      debug_printf("\t\t.state = SVGA3D_RS_LOCALVIEWER\n");
+      break;
+   case SVGA3D_RS_SCISSORTESTENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_SCISSORTESTENABLE\n");
+      break;
+   case SVGA3D_RS_BLENDCOLOR:
+      debug_printf("\t\t.state = SVGA3D_RS_BLENDCOLOR\n");
+      break;
+   case SVGA3D_RS_STENCILENABLE2SIDED:
+      debug_printf("\t\t.state = SVGA3D_RS_STENCILENABLE2SIDED\n");
+      break;
+   case SVGA3D_RS_CCWSTENCILFUNC:
+      debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILFUNC\n");
+      break;
+   case SVGA3D_RS_CCWSTENCILFAIL:
+      debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILFAIL\n");
+      break;
+   case SVGA3D_RS_CCWSTENCILZFAIL:
+      debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILZFAIL\n");
+      break;
+   case SVGA3D_RS_CCWSTENCILPASS:
+      debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILPASS\n");
+      break;
+   case SVGA3D_RS_VERTEXBLEND:
+      debug_printf("\t\t.state = SVGA3D_RS_VERTEXBLEND\n");
+      break;
+   case SVGA3D_RS_SLOPESCALEDEPTHBIAS:
+      debug_printf("\t\t.state = SVGA3D_RS_SLOPESCALEDEPTHBIAS\n");
+      break;
+   case SVGA3D_RS_DEPTHBIAS:
+      debug_printf("\t\t.state = SVGA3D_RS_DEPTHBIAS\n");
+      break;
+   case SVGA3D_RS_OUTPUTGAMMA:
+      debug_printf("\t\t.state = SVGA3D_RS_OUTPUTGAMMA\n");
+      break;
+   case SVGA3D_RS_ZVISIBLE:
+      debug_printf("\t\t.state = SVGA3D_RS_ZVISIBLE\n");
+      break;
+   case SVGA3D_RS_LASTPIXEL:
+      debug_printf("\t\t.state = SVGA3D_RS_LASTPIXEL\n");
+      break;
+   case SVGA3D_RS_CLIPPING:
+      debug_printf("\t\t.state = SVGA3D_RS_CLIPPING\n");
+      break;
+   case SVGA3D_RS_WRAP0:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP0\n");
+      break;
+   case SVGA3D_RS_WRAP1:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP1\n");
+      break;
+   case SVGA3D_RS_WRAP2:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP2\n");
+      break;
+   case SVGA3D_RS_WRAP3:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP3\n");
+      break;
+   case SVGA3D_RS_WRAP4:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP4\n");
+      break;
+   case SVGA3D_RS_WRAP5:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP5\n");
+      break;
+   case SVGA3D_RS_WRAP6:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP6\n");
+      break;
+   case SVGA3D_RS_WRAP7:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP7\n");
+      break;
+   case SVGA3D_RS_WRAP8:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP8\n");
+      break;
+   case SVGA3D_RS_WRAP9:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP9\n");
+      break;
+   case SVGA3D_RS_WRAP10:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP10\n");
+      break;
+   case SVGA3D_RS_WRAP11:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP11\n");
+      break;
+   case SVGA3D_RS_WRAP12:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP12\n");
+      break;
+   case SVGA3D_RS_WRAP13:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP13\n");
+      break;
+   case SVGA3D_RS_WRAP14:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP14\n");
+      break;
+   case SVGA3D_RS_WRAP15:
+      debug_printf("\t\t.state = SVGA3D_RS_WRAP15\n");
+      break;
+   case SVGA3D_RS_MULTISAMPLEANTIALIAS:
+      debug_printf("\t\t.state = SVGA3D_RS_MULTISAMPLEANTIALIAS\n");
+      break;
+   case SVGA3D_RS_MULTISAMPLEMASK:
+      debug_printf("\t\t.state = SVGA3D_RS_MULTISAMPLEMASK\n");
+      break;
+   case SVGA3D_RS_INDEXEDVERTEXBLENDENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_INDEXEDVERTEXBLENDENABLE\n");
+      break;
+   case SVGA3D_RS_TWEENFACTOR:
+      debug_printf("\t\t.state = SVGA3D_RS_TWEENFACTOR\n");
+      break;
+   case SVGA3D_RS_ANTIALIASEDLINEENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_ANTIALIASEDLINEENABLE\n");
+      break;
+   case SVGA3D_RS_COLORWRITEENABLE1:
+      debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE1\n");
+      break;
+   case SVGA3D_RS_COLORWRITEENABLE2:
+      debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE2\n");
+      break;
+   case SVGA3D_RS_COLORWRITEENABLE3:
+      debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE3\n");
+      break;
+   case SVGA3D_RS_SEPARATEALPHABLENDENABLE:
+      debug_printf("\t\t.state = SVGA3D_RS_SEPARATEALPHABLENDENABLE\n");
+      break;
+   case SVGA3D_RS_SRCBLENDALPHA:
+      debug_printf("\t\t.state = SVGA3D_RS_SRCBLENDALPHA\n");
+      break;
+   case SVGA3D_RS_DSTBLENDALPHA:
+      debug_printf("\t\t.state = SVGA3D_RS_DSTBLENDALPHA\n");
+      break;
+   case SVGA3D_RS_BLENDEQUATIONALPHA:
+      debug_printf("\t\t.state = SVGA3D_RS_BLENDEQUATIONALPHA\n");
+      break;
+   case SVGA3D_RS_MAX:
+      debug_printf("\t\t.state = SVGA3D_RS_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.state = %i\n", cmd->state);
+      break;
+   }
+   debug_printf("\t\t.uintValue = %u\n", cmd->uintValue);
+   debug_printf("\t\t.floatValue = %f\n", cmd->floatValue);
+}
+
+/* Print the value, count, indexedData and instanceData members of *cmd via debug_printf(). */
+static void dump_SVGA3dVertexDivisor(const SVGA3dVertexDivisor *cmd)
+{
+   debug_printf("\t\t.value = %u\n", cmd->value);
+   debug_printf("\t\t.count = %u\n", cmd->count);
+   debug_printf("\t\t.indexedData = %u\n", cmd->indexedData);
+   debug_printf("\t\t.instanceData = %u\n", cmd->instanceData);
+}
+
+/* Print cid, shid and type (by name when recognised, else as an integer) via debug_printf(). */
+static void dump_SVGA3dCmdDefineShader(const SVGA3dCmdDefineShader *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   debug_printf("\t\t.shid = %u\n", cmd->shid);
+   switch (cmd->type) {
+   case SVGA3D_SHADERTYPE_COMPILED_DX8:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+      break;
+   case SVGA3D_SHADERTYPE_VS:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+      break;
+   case SVGA3D_SHADERTYPE_PS:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+      break;
+   case SVGA3D_SHADERTYPE_MAX:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+}
+
+/* Print cid, reg, type, ctype and values[0..3]; FLOAT constants are read through a float pointer, all others as %u. */
+static void dump_SVGA3dCmdSetShaderConst(const SVGA3dCmdSetShaderConst *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   debug_printf("\t\t.reg = %u\n", cmd->reg);
+   switch (cmd->type) {
+   case SVGA3D_SHADERTYPE_COMPILED_DX8:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+      break;
+   case SVGA3D_SHADERTYPE_VS:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+      break;
+   case SVGA3D_SHADERTYPE_PS:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+      break;
+   case SVGA3D_SHADERTYPE_MAX:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+   switch (cmd->ctype) {
+   case SVGA3D_CONST_TYPE_FLOAT:
+      debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_FLOAT\n");
+      debug_printf("\t\t.values[0] = %f\n", *(const float *)&cmd->values[0]);
+      debug_printf("\t\t.values[1] = %f\n", *(const float *)&cmd->values[1]);
+      debug_printf("\t\t.values[2] = %f\n", *(const float *)&cmd->values[2]);
+      debug_printf("\t\t.values[3] = %f\n", *(const float *)&cmd->values[3]);
+      break;
+   case SVGA3D_CONST_TYPE_INT:
+      debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_INT\n");
+      debug_printf("\t\t.values[0] = %u\n", cmd->values[0]);
+      debug_printf("\t\t.values[1] = %u\n", cmd->values[1]);
+      debug_printf("\t\t.values[2] = %u\n", cmd->values[2]);
+      debug_printf("\t\t.values[3] = %u\n", cmd->values[3]);
+      break;
+   case SVGA3D_CONST_TYPE_BOOL:
+      debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_BOOL\n");
+      debug_printf("\t\t.values[0] = %u\n", cmd->values[0]);
+      debug_printf("\t\t.values[1] = %u\n", cmd->values[1]);
+      debug_printf("\t\t.values[2] = %u\n", cmd->values[2]);
+      debug_printf("\t\t.values[3] = %u\n", cmd->values[3]);
+      break;
+   default:
+      debug_printf("\t\t.ctype = %i\n", cmd->ctype);
+      debug_printf("\t\t.values[0] = %u\n", cmd->values[0]);
+      debug_printf("\t\t.values[1] = %u\n", cmd->values[1]);
+      debug_printf("\t\t.values[2] = %u\n", cmd->values[2]);
+      debug_printf("\t\t.values[3] = %u\n", cmd->values[3]);
+      break;
+   }
+}
+
+/* Print cid and the zRange.min/zRange.max members of *cmd via debug_printf(). */
+static void dump_SVGA3dCmdSetZRange(const SVGA3dCmdSetZRange *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   debug_printf("\t\t.zRange.min = %f\n", cmd->zRange.min);
+   debug_printf("\t\t.zRange.max = %f\n", cmd->zRange.max);
+}
+
+/* Print the cid, numVertexDecls and numRanges members of *cmd via debug_printf(). */
+static void dump_SVGA3dCmdDrawPrimitives(const SVGA3dCmdDrawPrimitives *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   debug_printf("\t\t.numVertexDecls = %u\n", cmd->numVertexDecls);
+   debug_printf("\t\t.numRanges = %u\n", cmd->numRanges);
+}
+
+/* Print the cid, index and enabled members of *cmd via debug_printf(). */
+static void dump_SVGA3dCmdSetLightEnabled(const SVGA3dCmdSetLightEnabled *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   debug_printf("\t\t.index = %u\n", cmd->index);
+   debug_printf("\t\t.enabled = %u\n", cmd->enabled);
+}
+
+/* Print primType (by name when recognised, else as an integer), primitiveCount, indexArray.*, indexWidth, indexBias. */
+static void dump_SVGA3dPrimitiveRange(const SVGA3dPrimitiveRange *cmd)
+{
+   switch (cmd->primType) {
+   case SVGA3D_PRIMITIVE_INVALID:
+      debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_INVALID\n");
+      break;
+   case SVGA3D_PRIMITIVE_TRIANGLELIST:
+      debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLELIST\n");
+      break;
+   case SVGA3D_PRIMITIVE_POINTLIST:
+      debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_POINTLIST\n");
+      break;
+   case SVGA3D_PRIMITIVE_LINELIST:
+      debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_LINELIST\n");
+      break;
+   case SVGA3D_PRIMITIVE_LINESTRIP:
+      debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_LINESTRIP\n");
+      break;
+   case SVGA3D_PRIMITIVE_TRIANGLESTRIP:
+      debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLESTRIP\n");
+      break;
+   case SVGA3D_PRIMITIVE_TRIANGLEFAN:
+      debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLEFAN\n");
+      break;
+   case SVGA3D_PRIMITIVE_MAX:
+      debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.primType = %i\n", cmd->primType);
+      break;
+   }
+   debug_printf("\t\t.primitiveCount = %u\n", cmd->primitiveCount);
+   debug_printf("\t\t.indexArray.surfaceId = %u\n", cmd->indexArray.surfaceId);
+   debug_printf("\t\t.indexArray.offset = %u\n", cmd->indexArray.offset);
+   debug_printf("\t\t.indexArray.stride = %u\n", cmd->indexArray.stride);
+   debug_printf("\t\t.indexWidth = %u\n", cmd->indexWidth);
+   debug_printf("\t\t.indexBias = %i\n", cmd->indexBias);
+}
+
+/* Print the sid member of *cmd via debug_printf(); no other member is read here. */
+static void dump_SVGA3dCmdPresent(const SVGA3dCmdPresent *cmd)
+{
+   debug_printf("\t\t.sid = %u\n", cmd->sid);
+}
+
+/* Print the cid member of *cmd via debug_printf(); no other member is read here. */
+static void dump_SVGA3dCmdSetRenderState(const SVGA3dCmdSetRenderState *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+}
+
+/* Print src/dest sid/face/mipmap, boxSrc/boxDest x/y/z/w/h/d and mode (by name when recognised, else as an integer). */
+static void dump_SVGA3dCmdSurfaceStretchBlt(const SVGA3dCmdSurfaceStretchBlt *cmd)
+{
+   debug_printf("\t\t.src.sid = %u\n", cmd->src.sid);
+   debug_printf("\t\t.src.face = %u\n", cmd->src.face);
+   debug_printf("\t\t.src.mipmap = %u\n", cmd->src.mipmap);
+   debug_printf("\t\t.dest.sid = %u\n", cmd->dest.sid);
+   debug_printf("\t\t.dest.face = %u\n", cmd->dest.face);
+   debug_printf("\t\t.dest.mipmap = %u\n", cmd->dest.mipmap);
+   debug_printf("\t\t.boxSrc.x = %u\n", cmd->boxSrc.x);
+   debug_printf("\t\t.boxSrc.y = %u\n", cmd->boxSrc.y);
+   debug_printf("\t\t.boxSrc.z = %u\n", cmd->boxSrc.z);
+   debug_printf("\t\t.boxSrc.w = %u\n", cmd->boxSrc.w);
+   debug_printf("\t\t.boxSrc.h = %u\n", cmd->boxSrc.h);
+   debug_printf("\t\t.boxSrc.d = %u\n", cmd->boxSrc.d);
+   debug_printf("\t\t.boxDest.x = %u\n", cmd->boxDest.x);
+   debug_printf("\t\t.boxDest.y = %u\n", cmd->boxDest.y);
+   debug_printf("\t\t.boxDest.z = %u\n", cmd->boxDest.z);
+   debug_printf("\t\t.boxDest.w = %u\n", cmd->boxDest.w);
+   debug_printf("\t\t.boxDest.h = %u\n", cmd->boxDest.h);
+   debug_printf("\t\t.boxDest.d = %u\n", cmd->boxDest.d);
+   switch (cmd->mode) {
+   case SVGA3D_STRETCH_BLT_POINT:
+      debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_POINT\n");
+      break;
+   case SVGA3D_STRETCH_BLT_LINEAR:
+      debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_LINEAR\n");
+      break;
+   case SVGA3D_STRETCH_BLT_MAX:
+      debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.mode = %i\n", cmd->mode);
+      break;
+   }
+}
+
+/* Print guest.ptr.gmrId/offset, guest.pitch, host.sid/face/mipmap and transfer (by name when recognised). */
+static void dump_SVGA3dCmdSurfaceDMA(const SVGA3dCmdSurfaceDMA *cmd)
+{
+   debug_printf("\t\t.guest.ptr.gmrId = %u\n", cmd->guest.ptr.gmrId);
+   debug_printf("\t\t.guest.ptr.offset = %u\n", cmd->guest.ptr.offset);
+   debug_printf("\t\t.guest.pitch = %u\n", cmd->guest.pitch);
+   debug_printf("\t\t.host.sid = %u\n", cmd->host.sid);
+   debug_printf("\t\t.host.face = %u\n", cmd->host.face);
+   debug_printf("\t\t.host.mipmap = %u\n", cmd->host.mipmap);
+   switch (cmd->transfer) {
+   case SVGA3D_WRITE_HOST_VRAM:
+      debug_printf("\t\t.transfer = SVGA3D_WRITE_HOST_VRAM\n");
+      break;
+   case SVGA3D_READ_HOST_VRAM:
+      debug_printf("\t\t.transfer = SVGA3D_READ_HOST_VRAM\n");
+      break;
+   default:
+      debug_printf("\t\t.transfer = %i\n", cmd->transfer);
+      break;
+   }
+}
+
+/* Print suffixSize, maximumOffset and the flags.discard/flags.unsynchronized members via debug_printf(). */
+static void dump_SVGA3dCmdSurfaceDMASuffix(const SVGA3dCmdSurfaceDMASuffix *cmd)
+{
+   debug_printf("\t\t.suffixSize = %u\n", cmd->suffixSize);
+   debug_printf("\t\t.maximumOffset = %u\n", cmd->maximumOffset);
+   debug_printf("\t\t.flags.discard = %u\n", cmd->flags.discard);
+   debug_printf("\t\t.flags.unsynchronized = %u\n", cmd->flags.unsynchronized);
+}
+
+/* Print cid, type (by name when recognised, else as an integer) and matrix[0..15], one element per line. */
+static void dump_SVGA3dCmdSetTransform(const SVGA3dCmdSetTransform *cmd)
+{
+   debug_printf("\t\t.cid = %u\n", cmd->cid);
+   switch (cmd->type) {
+   case SVGA3D_TRANSFORM_INVALID:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_INVALID\n");
+      break;
+   case SVGA3D_TRANSFORM_WORLD:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD\n");
+      break;
+   case SVGA3D_TRANSFORM_VIEW:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_VIEW\n");
+      break;
+   case SVGA3D_TRANSFORM_PROJECTION:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_PROJECTION\n");
+      break;
+   case SVGA3D_TRANSFORM_TEXTURE0:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE0\n");
+      break;
+   case SVGA3D_TRANSFORM_TEXTURE1:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE1\n");
+      break;
+   case SVGA3D_TRANSFORM_TEXTURE2:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE2\n");
+      break;
+   case SVGA3D_TRANSFORM_TEXTURE3:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE3\n");
+      break;
+   case SVGA3D_TRANSFORM_TEXTURE4:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE4\n");
+      break;
+   case SVGA3D_TRANSFORM_TEXTURE5:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE5\n");
+      break;
+   case SVGA3D_TRANSFORM_TEXTURE6:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE6\n");
+      break;
+   case SVGA3D_TRANSFORM_TEXTURE7:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE7\n");
+      break;
+   case SVGA3D_TRANSFORM_WORLD1:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD1\n");
+      break;
+   case SVGA3D_TRANSFORM_WORLD2:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD2\n");
+      break;
+   case SVGA3D_TRANSFORM_WORLD3:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD3\n");
+      break;
+   case SVGA3D_TRANSFORM_MAX:
+      debug_printf("\t\t.type = SVGA3D_TRANSFORM_MAX\n");
+      break;
+   default:
+      debug_printf("\t\t.type = %i\n", cmd->type);
+      break;
+   }
+   debug_printf("\t\t.matrix[0] = %f\n", cmd->matrix[0]);
+   debug_printf("\t\t.matrix[1] = %f\n", cmd->matrix[1]);
+   debug_printf("\t\t.matrix[2] = %f\n", cmd->matrix[2]);
+   debug_printf("\t\t.matrix[3] = %f\n", cmd->matrix[3]);
+   debug_printf("\t\t.matrix[4] = %f\n", cmd->matrix[4]);
+   debug_printf("\t\t.matrix[5] = %f\n", cmd->matrix[5]);
+   debug_printf("\t\t.matrix[6] = %f\n", cmd->matrix[6]);
+   debug_printf("\t\t.matrix[7] = %f\n", cmd->matrix[7]);
+   debug_printf("\t\t.matrix[8] = %f\n", cmd->matrix[8]);
+   debug_printf("\t\t.matrix[9] = %f\n", cmd->matrix[9]);
+   debug_printf("\t\t.matrix[10] = %f\n", cmd->matrix[10]);
+   debug_printf("\t\t.matrix[11] = %f\n", cmd->matrix[11]);
+   debug_printf("\t\t.matrix[12] = %f\n", cmd->matrix[12]);
+   debug_printf("\t\t.matrix[13] = %f\n", cmd->matrix[13]);
+   debug_printf("\t\t.matrix[14] = %f\n", cmd->matrix[14]);
+   debug_printf("\t\t.matrix[15] = %f\n", cmd->matrix[15]);
+}
+
+static void
+dump_SVGA3dCmdDestroyShader(const SVGA3dCmdDestroyShader *cmd)
+{  /* generated by svga_dump.py: one debug_printf per field of *cmd */
+   debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+   debug_printf("\t\t.shid = %u\n", (*cmd).shid);
+   switch((*cmd).type) {  /* values with a case below are printed by name */
+   case SVGA3D_SHADERTYPE_COMPILED_DX8:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+      break;
+   case SVGA3D_SHADERTYPE_VS:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+      break;
+   case SVGA3D_SHADERTYPE_PS:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+      break;
+   case SVGA3D_SHADERTYPE_MAX:
+      debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+      break;
+   default:  /* no case matched: print the raw integer instead */
+      debug_printf("\t\t.type = %i\n", (*cmd).type);
+      break;
+   }
+}
+
+static void
+dump_SVGA3dCmdDestroyContext(const SVGA3dCmdDestroyContext *cmd)
+{  /* generated by svga_dump.py: prints the only field dumped here, .cid */
+   debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+}
+
+static void
+dump_SVGA3dCmdClear(const SVGA3dCmdClear *cmd)
+{  /* generated by svga_dump.py: one debug_printf per field of *cmd */
+   debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+   switch((*cmd).clearFlag) {  /* only an exact match with one case is printed by name */
+   case SVGA3D_CLEAR_COLOR:
+      debug_printf("\t\t.clearFlag = SVGA3D_CLEAR_COLOR\n");
+      break;
+   case SVGA3D_CLEAR_DEPTH:
+      debug_printf("\t\t.clearFlag = SVGA3D_CLEAR_DEPTH\n");
+      break;
+   case SVGA3D_CLEAR_STENCIL:
+      debug_printf("\t\t.clearFlag = SVGA3D_CLEAR_STENCIL\n");
+      break;
+   default:  /* NOTE(review): if clearFlag is a bitmask, OR-ed flags land here as a number */
+      debug_printf("\t\t.clearFlag = %i\n", (*cmd).clearFlag);
+      break;
+   }
+   debug_printf("\t\t.color = %u\n", (*cmd).color);
+   debug_printf("\t\t.depth = %f\n", (*cmd).depth);
+   debug_printf("\t\t.stencil = %u\n", (*cmd).stencil);
+}
+
+static void
+dump_SVGA3dCmdDefineSurface(const SVGA3dCmdDefineSurface *cmd)
+{  /* generated by svga_dump.py: one debug_printf per field of *cmd */
+   debug_printf("\t\t.sid = %u\n", (*cmd).sid);
+   switch((*cmd).surfaceFlags) {  /* only an exact match with one case is printed by name */
+   case SVGA3D_SURFACE_CUBEMAP:
+      debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_CUBEMAP\n");
+      break;
+   case SVGA3D_SURFACE_HINT_STATIC:
+      debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_STATIC\n");
+      break;
+   case SVGA3D_SURFACE_HINT_DYNAMIC:
+      debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_DYNAMIC\n");
+      break;
+   case SVGA3D_SURFACE_HINT_INDEXBUFFER:
+      debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_INDEXBUFFER\n");
+      break;
+   case SVGA3D_SURFACE_HINT_VERTEXBUFFER:
+      debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_VERTEXBUFFER\n");
+      break;
+   default:  /* NOTE(review): if surfaceFlags is a bitmask, OR-ed flags land here as a number */
+      debug_printf("\t\t.surfaceFlags = %i\n", (*cmd).surfaceFlags);
+      break;
+   }
+   switch((*cmd).format) {  /* values with a case below are printed by name */
+   case SVGA3D_FORMAT_INVALID:
+      debug_printf("\t\t.format = SVGA3D_FORMAT_INVALID\n");
+      break;
+   case SVGA3D_X8R8G8B8:
+      debug_printf("\t\t.format = SVGA3D_X8R8G8B8\n");
+      break;
+   case SVGA3D_A8R8G8B8:
+      debug_printf("\t\t.format = SVGA3D_A8R8G8B8\n");
+      break;
+   case SVGA3D_R5G6B5:
+      debug_printf("\t\t.format = SVGA3D_R5G6B5\n");
+      break;
+   case SVGA3D_X1R5G5B5:
+      debug_printf("\t\t.format = SVGA3D_X1R5G5B5\n");
+      break;
+   case SVGA3D_A1R5G5B5:
+      debug_printf("\t\t.format = SVGA3D_A1R5G5B5\n");
+      break;
+   case SVGA3D_A4R4G4B4:
+      debug_printf("\t\t.format = SVGA3D_A4R4G4B4\n");
+      break;
+   case SVGA3D_Z_D32:
+      debug_printf("\t\t.format = SVGA3D_Z_D32\n");
+      break;
+   case SVGA3D_Z_D16:
+      debug_printf("\t\t.format = SVGA3D_Z_D16\n");
+      break;
+   case SVGA3D_Z_D24S8:
+      debug_printf("\t\t.format = SVGA3D_Z_D24S8\n");
+      break;
+   case SVGA3D_Z_D15S1:
+      debug_printf("\t\t.format = SVGA3D_Z_D15S1\n");
+      break;
+   case SVGA3D_LUMINANCE8:
+      debug_printf("\t\t.format = SVGA3D_LUMINANCE8\n");
+      break;
+   case SVGA3D_LUMINANCE4_ALPHA4:
+      debug_printf("\t\t.format = SVGA3D_LUMINANCE4_ALPHA4\n");
+      break;
+   case SVGA3D_LUMINANCE16:
+      debug_printf("\t\t.format = SVGA3D_LUMINANCE16\n");
+      break;
+   case SVGA3D_LUMINANCE8_ALPHA8:
+      debug_printf("\t\t.format = SVGA3D_LUMINANCE8_ALPHA8\n");
+      break;
+   case SVGA3D_DXT1:
+      debug_printf("\t\t.format = SVGA3D_DXT1\n");
+      break;
+   case SVGA3D_DXT2:
+      debug_printf("\t\t.format = SVGA3D_DXT2\n");
+      break;
+   case SVGA3D_DXT3:
+      debug_printf("\t\t.format = SVGA3D_DXT3\n");
+      break;
+   case SVGA3D_DXT4:
+      debug_printf("\t\t.format = SVGA3D_DXT4\n");
+      break;
+   case SVGA3D_DXT5:
+      debug_printf("\t\t.format = SVGA3D_DXT5\n");
+      break;
+   case SVGA3D_BUMPU8V8:
+      debug_printf("\t\t.format = SVGA3D_BUMPU8V8\n");
+      break;
+   case SVGA3D_BUMPL6V5U5:
+      debug_printf("\t\t.format = SVGA3D_BUMPL6V5U5\n");
+      break;
+   case SVGA3D_BUMPX8L8V8U8:
+      debug_printf("\t\t.format = SVGA3D_BUMPX8L8V8U8\n");
+      break;
+   case SVGA3D_BUMPL8V8U8:
+      debug_printf("\t\t.format = SVGA3D_BUMPL8V8U8\n");
+      break;
+   case SVGA3D_ARGB_S10E5:
+      debug_printf("\t\t.format = SVGA3D_ARGB_S10E5\n");
+      break;
+   case SVGA3D_ARGB_S23E8:
+      debug_printf("\t\t.format = SVGA3D_ARGB_S23E8\n");
+      break;
+   case SVGA3D_A2R10G10B10:
+      debug_printf("\t\t.format = SVGA3D_A2R10G10B10\n");
+      break;
+   case SVGA3D_V8U8:
+      debug_printf("\t\t.format = SVGA3D_V8U8\n");
+      break;
+   case SVGA3D_Q8W8V8U8:
+      debug_printf("\t\t.format = SVGA3D_Q8W8V8U8\n");
+      break;
+   case SVGA3D_CxV8U8:
+      debug_printf("\t\t.format = SVGA3D_CxV8U8\n");
+      break;
+   case SVGA3D_X8L8V8U8:
+      debug_printf("\t\t.format = SVGA3D_X8L8V8U8\n");
+      break;
+   case SVGA3D_A2W10V10U10:
+      debug_printf("\t\t.format = SVGA3D_A2W10V10U10\n");
+      break;
+   case SVGA3D_ALPHA8:
+      debug_printf("\t\t.format = SVGA3D_ALPHA8\n");
+      break;
+   case SVGA3D_R_S10E5:
+      debug_printf("\t\t.format = SVGA3D_R_S10E5\n");
+      break;
+   case SVGA3D_R_S23E8:
+      debug_printf("\t\t.format = SVGA3D_R_S23E8\n");
+      break;
+   case SVGA3D_RG_S10E5:
+      debug_printf("\t\t.format = SVGA3D_RG_S10E5\n");
+      break;
+   case SVGA3D_RG_S23E8:
+      debug_printf("\t\t.format = SVGA3D_RG_S23E8\n");
+      break;
+   case SVGA3D_BUFFER:
+      debug_printf("\t\t.format = SVGA3D_BUFFER\n");
+      break;
+   case SVGA3D_Z_D24X8:
+      debug_printf("\t\t.format = SVGA3D_Z_D24X8\n");
+      break;
+   case SVGA3D_FORMAT_MAX:
+      debug_printf("\t\t.format = SVGA3D_FORMAT_MAX\n");
+      break;
+   default:  /* no case matched: print the raw integer instead */
+      debug_printf("\t\t.format = %i\n", (*cmd).format);
+      break;
+   }
+   debug_printf("\t\t.face[0].numMipLevels = %u\n", (*cmd).face[0].numMipLevels);
+   debug_printf("\t\t.face[1].numMipLevels = %u\n", (*cmd).face[1].numMipLevels);
+   debug_printf("\t\t.face[2].numMipLevels = %u\n", (*cmd).face[2].numMipLevels);
+   debug_printf("\t\t.face[3].numMipLevels = %u\n", (*cmd).face[3].numMipLevels);
+   debug_printf("\t\t.face[4].numMipLevels = %u\n", (*cmd).face[4].numMipLevels);
+   debug_printf("\t\t.face[5].numMipLevels = %u\n", (*cmd).face[5].numMipLevels);
+}
+
+
+void            
+svga_dump_commands(const void *commands, uint32_t size)
+{
+   const uint8_t *next = commands;
+   const uint8_t *last = next + size;
+   /* Print each command in the size-byte stream at commands via debug_printf. */
+   assert(size % sizeof(uint32_t) == 0);
+   /* Each pass decodes the command starting at next and advances past it. */
+   while(next < last) {
+      const uint32_t cmd_id = *(const uint32_t *)next;
+
+      if(SVGA_3D_CMD_BASE <= cmd_id && cmd_id < SVGA_3D_CMD_MAX) {
+         const SVGA3dCmdHeader *header = (const SVGA3dCmdHeader *)next;
+         const uint8_t *body = (const uint8_t *)&header[1];
+         /* A truncated header or oversized body ends the dump; never read past last. */
+         if((uint32_t)(last - next) < sizeof *header || header->size > (uint32_t)(last - body))
+            break;
+         next = body + header->size;
+
+         switch(cmd_id) {
+         case SVGA_3D_CMD_SURFACE_DEFINE:
+            debug_printf("\tSVGA_3D_CMD_SURFACE_DEFINE\n");
+            {
+               const SVGA3dCmdDefineSurface *cmd = (const SVGA3dCmdDefineSurface *)body;
+               dump_SVGA3dCmdDefineSurface(cmd);
+               body = (const uint8_t *)&cmd[1];
+               while(body + sizeof(SVGA3dSize) <= next) {
+                  dump_SVGA3dSize((const SVGA3dSize *)body);
+                  body += sizeof(SVGA3dSize);
+               }
+            }
+            break;
+         case SVGA_3D_CMD_SURFACE_DESTROY:
+            debug_printf("\tSVGA_3D_CMD_SURFACE_DESTROY\n");
+            {
+               const SVGA3dCmdDestroySurface *cmd = (const SVGA3dCmdDestroySurface *)body;
+               dump_SVGA3dCmdDestroySurface(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SURFACE_COPY:
+            debug_printf("\tSVGA_3D_CMD_SURFACE_COPY\n");
+            {
+               const SVGA3dCmdSurfaceCopy *cmd = (const SVGA3dCmdSurfaceCopy *)body;
+               dump_SVGA3dCmdSurfaceCopy(cmd);
+               body = (const uint8_t *)&cmd[1];
+               while(body + sizeof(SVGA3dCopyBox) <= next) {
+                  dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
+                  body += sizeof(SVGA3dCopyBox);
+               }
+            }
+            break;
+         case SVGA_3D_CMD_SURFACE_STRETCHBLT:
+            debug_printf("\tSVGA_3D_CMD_SURFACE_STRETCHBLT\n");
+            {
+               const SVGA3dCmdSurfaceStretchBlt *cmd = (const SVGA3dCmdSurfaceStretchBlt *)body;
+               dump_SVGA3dCmdSurfaceStretchBlt(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SURFACE_DMA:
+            debug_printf("\tSVGA_3D_CMD_SURFACE_DMA\n");
+            {
+               const SVGA3dCmdSurfaceDMA *cmd = (const SVGA3dCmdSurfaceDMA *)body;
+               dump_SVGA3dCmdSurfaceDMA(cmd);
+               body = (const uint8_t *)&cmd[1];
+               while(body + sizeof(SVGA3dCopyBox) <= next) {
+                  dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
+                  body += sizeof(SVGA3dCopyBox);
+               }
+               while(body + sizeof(SVGA3dCmdSurfaceDMASuffix) <= next) {
+                  dump_SVGA3dCmdSurfaceDMASuffix((const SVGA3dCmdSurfaceDMASuffix *)body);
+                  body += sizeof(SVGA3dCmdSurfaceDMASuffix);
+               }
+            }
+            break;
+         case SVGA_3D_CMD_CONTEXT_DEFINE:
+            debug_printf("\tSVGA_3D_CMD_CONTEXT_DEFINE\n");
+            {
+               const SVGA3dCmdDefineContext *cmd = (const SVGA3dCmdDefineContext *)body;
+               dump_SVGA3dCmdDefineContext(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_CONTEXT_DESTROY:
+            debug_printf("\tSVGA_3D_CMD_CONTEXT_DESTROY\n");
+            {
+               const SVGA3dCmdDestroyContext *cmd = (const SVGA3dCmdDestroyContext *)body;
+               dump_SVGA3dCmdDestroyContext(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SETTRANSFORM:
+            debug_printf("\tSVGA_3D_CMD_SETTRANSFORM\n");
+            {
+               const SVGA3dCmdSetTransform *cmd = (const SVGA3dCmdSetTransform *)body;
+               dump_SVGA3dCmdSetTransform(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SETZRANGE:
+            debug_printf("\tSVGA_3D_CMD_SETZRANGE\n");
+            {
+               const SVGA3dCmdSetZRange *cmd = (const SVGA3dCmdSetZRange *)body;
+               dump_SVGA3dCmdSetZRange(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SETRENDERSTATE:
+            debug_printf("\tSVGA_3D_CMD_SETRENDERSTATE\n");
+            {
+               const SVGA3dCmdSetRenderState *cmd = (const SVGA3dCmdSetRenderState *)body;
+               dump_SVGA3dCmdSetRenderState(cmd);
+               body = (const uint8_t *)&cmd[1];
+               while(body + sizeof(SVGA3dRenderState) <= next) {
+                  dump_SVGA3dRenderState((const SVGA3dRenderState *)body);
+                  body += sizeof(SVGA3dRenderState);
+               }
+            }
+            break;
+         case SVGA_3D_CMD_SETRENDERTARGET:
+            debug_printf("\tSVGA_3D_CMD_SETRENDERTARGET\n");
+            {
+               const SVGA3dCmdSetRenderTarget *cmd = (const SVGA3dCmdSetRenderTarget *)body;
+               dump_SVGA3dCmdSetRenderTarget(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SETTEXTURESTATE:
+            debug_printf("\tSVGA_3D_CMD_SETTEXTURESTATE\n");
+            {
+               const SVGA3dCmdSetTextureState *cmd = (const SVGA3dCmdSetTextureState *)body;
+               dump_SVGA3dCmdSetTextureState(cmd);
+               body = (const uint8_t *)&cmd[1];
+               while(body + sizeof(SVGA3dTextureState) <= next) {
+                  dump_SVGA3dTextureState((const SVGA3dTextureState *)body);
+                  body += sizeof(SVGA3dTextureState);
+               }
+            }
+            break;
+         case SVGA_3D_CMD_SETMATERIAL:
+            debug_printf("\tSVGA_3D_CMD_SETMATERIAL\n");
+            {
+               const SVGA3dCmdSetMaterial *cmd = (const SVGA3dCmdSetMaterial *)body;
+               dump_SVGA3dCmdSetMaterial(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SETLIGHTDATA:
+            debug_printf("\tSVGA_3D_CMD_SETLIGHTDATA\n");
+            {
+               const SVGA3dCmdSetLightData *cmd = (const SVGA3dCmdSetLightData *)body;
+               dump_SVGA3dCmdSetLightData(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SETLIGHTENABLED:
+            debug_printf("\tSVGA_3D_CMD_SETLIGHTENABLED\n");
+            {
+               const SVGA3dCmdSetLightEnabled *cmd = (const SVGA3dCmdSetLightEnabled *)body;
+               dump_SVGA3dCmdSetLightEnabled(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SETVIEWPORT:
+            debug_printf("\tSVGA_3D_CMD_SETVIEWPORT\n");
+            {
+               const SVGA3dCmdSetViewport *cmd = (const SVGA3dCmdSetViewport *)body;
+               dump_SVGA3dCmdSetViewport(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SETCLIPPLANE:
+            debug_printf("\tSVGA_3D_CMD_SETCLIPPLANE\n");
+            {
+               const SVGA3dCmdSetClipPlane *cmd = (const SVGA3dCmdSetClipPlane *)body;
+               dump_SVGA3dCmdSetClipPlane(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_CLEAR:
+            debug_printf("\tSVGA_3D_CMD_CLEAR\n");
+            {
+               const SVGA3dCmdClear *cmd = (const SVGA3dCmdClear *)body;
+               dump_SVGA3dCmdClear(cmd);
+               body = (const uint8_t *)&cmd[1];
+               while(body + sizeof(SVGA3dRect) <= next) {
+                  dump_SVGA3dRect((const SVGA3dRect *)body);
+                  body += sizeof(SVGA3dRect);
+               }
+            }
+            break;
+         case SVGA_3D_CMD_PRESENT:
+            debug_printf("\tSVGA_3D_CMD_PRESENT\n");
+            {
+               const SVGA3dCmdPresent *cmd = (const SVGA3dCmdPresent *)body;
+               dump_SVGA3dCmdPresent(cmd);
+               body = (const uint8_t *)&cmd[1];
+               while(body + sizeof(SVGA3dCopyRect) <= next) {
+                  dump_SVGA3dCopyRect((const SVGA3dCopyRect *)body);
+                  body += sizeof(SVGA3dCopyRect);
+               }
+            }
+            break;
+         case SVGA_3D_CMD_SHADER_DEFINE:
+            debug_printf("\tSVGA_3D_CMD_SHADER_DEFINE\n");
+            {
+               const SVGA3dCmdDefineShader *cmd = (const SVGA3dCmdDefineShader *)body;
+               dump_SVGA3dCmdDefineShader(cmd);
+               body = (const uint8_t *)&cmd[1];
+               svga_shader_dump((const uint32_t *)body, 
+                            (unsigned)(next - body)/sizeof(uint32_t),
+                            FALSE );
+               body = next;
+            }
+            break;
+         case SVGA_3D_CMD_SHADER_DESTROY:
+            debug_printf("\tSVGA_3D_CMD_SHADER_DESTROY\n");
+            {
+               const SVGA3dCmdDestroyShader *cmd = (const SVGA3dCmdDestroyShader *)body;
+               dump_SVGA3dCmdDestroyShader(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SET_SHADER:
+            debug_printf("\tSVGA_3D_CMD_SET_SHADER\n");
+            {
+               const SVGA3dCmdSetShader *cmd = (const SVGA3dCmdSetShader *)body;
+               dump_SVGA3dCmdSetShader(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_SET_SHADER_CONST:
+            debug_printf("\tSVGA_3D_CMD_SET_SHADER_CONST\n");
+            {
+               const SVGA3dCmdSetShaderConst *cmd = (const SVGA3dCmdSetShaderConst *)body;
+               dump_SVGA3dCmdSetShaderConst(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_DRAW_PRIMITIVES:
+            debug_printf("\tSVGA_3D_CMD_DRAW_PRIMITIVES\n");
+            {
+               const SVGA3dCmdDrawPrimitives *cmd = (const SVGA3dCmdDrawPrimitives *)body;
+               unsigned i, j;
+               dump_SVGA3dCmdDrawPrimitives(cmd);
+               body = (const uint8_t *)&cmd[1];
+               for(i = 0; i < cmd->numVertexDecls && body + sizeof(SVGA3dVertexDecl) <= next; ++i) {
+                  dump_SVGA3dVertexDecl((const SVGA3dVertexDecl *)body);
+                  body += sizeof(SVGA3dVertexDecl);
+               }
+               for(j = 0; j < cmd->numRanges && body + sizeof(SVGA3dPrimitiveRange) <= next; ++j) {
+                  dump_SVGA3dPrimitiveRange((const SVGA3dPrimitiveRange *)body);
+                  body += sizeof(SVGA3dPrimitiveRange);
+               }
+               while(body + sizeof(SVGA3dVertexDivisor) <= next) {
+                  dump_SVGA3dVertexDivisor((const SVGA3dVertexDivisor *)body);
+                  body += sizeof(SVGA3dVertexDivisor);
+               }
+            }
+            break;
+         case SVGA_3D_CMD_SETSCISSORRECT:
+            debug_printf("\tSVGA_3D_CMD_SETSCISSORRECT\n");
+            {
+               const SVGA3dCmdSetScissorRect *cmd = (const SVGA3dCmdSetScissorRect *)body;
+               dump_SVGA3dCmdSetScissorRect(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_BEGIN_QUERY:
+            debug_printf("\tSVGA_3D_CMD_BEGIN_QUERY\n");
+            {
+               const SVGA3dCmdBeginQuery *cmd = (const SVGA3dCmdBeginQuery *)body;
+               dump_SVGA3dCmdBeginQuery(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_END_QUERY:
+            debug_printf("\tSVGA_3D_CMD_END_QUERY\n");
+            {
+               const SVGA3dCmdEndQuery *cmd = (const SVGA3dCmdEndQuery *)body;
+               dump_SVGA3dCmdEndQuery(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         case SVGA_3D_CMD_WAIT_FOR_QUERY:
+            debug_printf("\tSVGA_3D_CMD_WAIT_FOR_QUERY\n");
+            {
+               const SVGA3dCmdWaitForQuery *cmd = (const SVGA3dCmdWaitForQuery *)body;
+               dump_SVGA3dCmdWaitForQuery(cmd);
+               body = (const uint8_t *)&cmd[1];
+            }
+            break;
+         default:
+            debug_printf("\t0x%08x\n", cmd_id);
+            break;
+         }
+         /* Whatever the decoder above left unconsumed is shown raw: words, then bytes. */
+         while(body + sizeof(uint32_t) <= next) {
+            debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
+            body += sizeof(uint32_t);
+         }
+         while(body < next)
+            debug_printf("\t\t0x%02x\n", *body++);
+      }
+      else if(cmd_id == SVGA_CMD_FENCE && (uint32_t)(last - next) >= 2*sizeof(uint32_t)) {
+         debug_printf("\tSVGA_CMD_FENCE\n");
+         debug_printf("\t\t0x%08x\n", ((const uint32_t *)next)[1]);
+         next += 2*sizeof(uint32_t);
+      }
+      else { /* unknown id, or a fence with no room left for its payload word */
+         debug_printf("\t0x%08x\n", cmd_id);
+         next += sizeof(uint32_t);
+      }
+   }
+}
+
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.h b/src/gallium/drivers/svga/svgadump/svga_dump.h
new file mode 100644
index 00000000000..69a87020875
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.h
@@ -0,0 +1,34 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef SVGA_DUMP_H_
+#define SVGA_DUMP_H_
+
+#include "pipe/p_compiler.h"
+/** Dump the SVGA command stream of size bytes at commands through debug_printf. */
+void
+svga_dump_commands(const void *commands, uint32_t size);
+
+#endif /* SVGA_DUMP_H_ */
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.py b/src/gallium/drivers/svga/svgadump/svga_dump.py
new file mode 100755
index 00000000000..288e753296e
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.py
@@ -0,0 +1,329 @@
+#!/usr/bin/env python
+'''
+Generates dumper for the SVGA 3D command stream using pygccxml.
+
+Jose Fonseca <jfonseca@vmware.com>
+'''
+
+copyright = '''
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+ '''
+
+import os
+import sys
+
+from pygccxml import parser
+from pygccxml import declarations
+
+from pygccxml.declarations import algorithm
+from pygccxml.declarations import decl_visitor
+from pygccxml.declarations import type_traits
+from pygccxml.declarations import type_visitor
+
+
+enums = True
+
+
+class decl_dumper_t(decl_visitor.decl_visitor_t):
+    '''Declaration visitor that prints C dump code for one struct/union or enum.'''
+    def __init__(self, instance = '', decl = None):
+        decl_visitor.decl_visitor_t.__init__(self)
+        self._instance = instance
+        self.decl = decl
+
+    def clone(self):
+        return decl_dumper_t(self._instance, self.decl)
+
+    def visit_class(self):
+        class_ = self.decl
+        assert self.decl.class_type in ('struct', 'union')
+
+        for variable in class_.variables():
+            if variable.name != '':
+                # Named member: dump it under the member path extended by ".name".
+                dump_type(self._instance + '.' + variable.name, variable.type)
+
+    def visit_enumeration(self):
+        if enums:  # symbolic output: one case per enumerator, raw %i in the default
+            print '   switch(%s) {' % ("(*cmd)" + self._instance,)
+            for name, value in self.decl.values:
+                print '   case %s:' % (name,)
+                print '      debug_printf("\\t\\t%s = %s\\n");' % (self._instance, name)
+                print '      break;'
+            print '   default:'
+            print '      debug_printf("\\t\\t%s = %%i\\n", %s);' % (self._instance, "(*cmd)" + self._instance)
+            print '      break;'
+            print '   }'
+        else:  # enums disabled: always print the integer value
+            print '   debug_printf("\\t\\t%s = %%i\\n", %s);' % (self._instance, "(*cmd)" + self._instance)
+
+
+def dump_decl(instance, decl):  # print dump code for decl; instance is the member path so far
+    dumper = decl_dumper_t(instance, decl)
+    algorithm.apply_visitor(dumper, decl)
+
+
+class type_dumper_t(type_visitor.type_visitor_t):
+    '''Type visitor that prints a debug_printf() call suited to the visited type.'''
+    def __init__(self, instance, type_):
+        type_visitor.type_visitor_t.__init__(self)
+        self.instance = instance
+        self.type = type_
+
+    def clone(self):
+        return type_dumper_t(self.instance, self.type)
+
+    def visit_char(self):
+        self.print_instance('%i')
+        
+    def visit_unsigned_char(self):
+        self.print_instance('%u')
+
+    def visit_signed_char(self):
+        self.print_instance('%i')
+    
+    def visit_wchar(self):
+        self.print_instance('%i')
+        
+    def visit_short_int(self):
+        self.print_instance('%i')
+        
+    def visit_short_unsigned_int(self):
+        self.print_instance('%u')
+        
+    def visit_bool(self):
+        self.print_instance('%i')
+        
+    def visit_int(self):
+        self.print_instance('%i')
+        
+    def visit_unsigned_int(self):
+        self.print_instance('%u')
+        
+    def visit_long_int(self):
+        self.print_instance('%li')
+        
+    def visit_long_unsigned_int(self):
+        self.print_instance('%lu')
+        
+    def visit_long_long_int(self):
+        self.print_instance('%lli')
+        
+    def visit_long_long_unsigned_int(self):
+        self.print_instance('%llu')
+        
+    def visit_float(self):
+        self.print_instance('%f')
+        
+    def visit_double(self):
+        self.print_instance('%f')
+        
+    def visit_array(self):
+        for i in range(type_traits.array_size(self.type)):  # unrolled: one dump per index
+            dump_type(self.instance + '[%i]' % i, type_traits.base_type(self.type))
+
+    def visit_pointer(self):
+        self.print_instance('%p')
+
+    def visit_declarated(self):
+        # Hand the underlying declaration (struct, union or enum) to the decl dumper.
+        decl = type_traits.remove_declarated(self.type)
+        dump_decl(self.instance, decl)
+
+    def print_instance(self, format):  # emits: debug_printf("\t\t<path> = <format>\n", (*cmd)<path>);
+        print '   debug_printf("\\t\\t%s = %s\\n", %s);' % (self.instance, format, "(*cmd)" + self.instance)
+
+
+def dump_type(instance, type_):  # print dump code for a value of type_ reached via instance
+    type_ = type_traits.remove_alias(type_)  # presumably strips typedefs first; see pygccxml docs
+    visitor = type_dumper_t(instance, type_)
+    algorithm.apply_visitor(visitor, type_)
+
+
+def dump_struct(decls, class_):  # emits a static dump_<name>() function; decls is unused here
+    print 'static void'
+    print 'dump_%s(const %s *cmd)' % (class_.name, class_.name)
+    print '{'
+    dump_decl('', class_)  # empty path: members are addressed as (*cmd).<member>
+    print '}'
+    print ''
+
+
+cmds = [  # (command id, header struct, ((counted struct, count field), ...), repeated trailing struct or None)
+    ('SVGA_3D_CMD_SURFACE_DEFINE', 'SVGA3dCmdDefineSurface', (), 'SVGA3dSize'),
+    ('SVGA_3D_CMD_SURFACE_DESTROY', 'SVGA3dCmdDestroySurface', (), None),
+    ('SVGA_3D_CMD_SURFACE_COPY', 'SVGA3dCmdSurfaceCopy', (), 'SVGA3dCopyBox'),
+    ('SVGA_3D_CMD_SURFACE_STRETCHBLT', 'SVGA3dCmdSurfaceStretchBlt', (), None),
+    ('SVGA_3D_CMD_SURFACE_DMA', 'SVGA3dCmdSurfaceDMA', (), 'SVGA3dCopyBox'),  # NOTE(review): checked-in svga_dump.c also dumps SVGA3dCmdSurfaceDMASuffix; not expressible here
+    ('SVGA_3D_CMD_CONTEXT_DEFINE', 'SVGA3dCmdDefineContext', (), None),
+    ('SVGA_3D_CMD_CONTEXT_DESTROY', 'SVGA3dCmdDestroyContext', (), None),
+    ('SVGA_3D_CMD_SETTRANSFORM', 'SVGA3dCmdSetTransform', (), None),
+    ('SVGA_3D_CMD_SETZRANGE', 'SVGA3dCmdSetZRange', (), None),
+    ('SVGA_3D_CMD_SETRENDERSTATE', 'SVGA3dCmdSetRenderState', (), 'SVGA3dRenderState'),
+    ('SVGA_3D_CMD_SETRENDERTARGET', 'SVGA3dCmdSetRenderTarget', (), None),
+    ('SVGA_3D_CMD_SETTEXTURESTATE', 'SVGA3dCmdSetTextureState', (), 'SVGA3dTextureState'),
+    ('SVGA_3D_CMD_SETMATERIAL', 'SVGA3dCmdSetMaterial', (), None),
+    ('SVGA_3D_CMD_SETLIGHTDATA', 'SVGA3dCmdSetLightData', (), None),
+    ('SVGA_3D_CMD_SETLIGHTENABLED', 'SVGA3dCmdSetLightEnabled', (), None),
+    ('SVGA_3D_CMD_SETVIEWPORT', 'SVGA3dCmdSetViewport', (), None),
+    ('SVGA_3D_CMD_SETCLIPPLANE', 'SVGA3dCmdSetClipPlane', (), None),
+    ('SVGA_3D_CMD_CLEAR', 'SVGA3dCmdClear', (), 'SVGA3dRect'),
+    ('SVGA_3D_CMD_PRESENT', 'SVGA3dCmdPresent', (), 'SVGA3dCopyRect'),
+    ('SVGA_3D_CMD_SHADER_DEFINE', 'SVGA3dCmdDefineShader', (), None),
+    ('SVGA_3D_CMD_SHADER_DESTROY', 'SVGA3dCmdDestroyShader', (), None),
+    ('SVGA_3D_CMD_SET_SHADER', 'SVGA3dCmdSetShader', (), None),
+    ('SVGA_3D_CMD_SET_SHADER_CONST', 'SVGA3dCmdSetShaderConst', (), None),
+    ('SVGA_3D_CMD_DRAW_PRIMITIVES', 'SVGA3dCmdDrawPrimitives', (('SVGA3dVertexDecl', 'numVertexDecls'), ('SVGA3dPrimitiveRange', 'numRanges')), 'SVGA3dVertexDivisor'),
+    ('SVGA_3D_CMD_SETSCISSORRECT', 'SVGA3dCmdSetScissorRect', (), None),
+    ('SVGA_3D_CMD_BEGIN_QUERY', 'SVGA3dCmdBeginQuery', (), None),
+    ('SVGA_3D_CMD_END_QUERY', 'SVGA3dCmdEndQuery', (), None),
+    ('SVGA_3D_CMD_WAIT_FOR_QUERY', 'SVGA3dCmdWaitForQuery', (), None),
+    #('SVGA_3D_CMD_PRESENT_READBACK', None, (), None),
+]
+
+def dump_cmds():
+    '''Print the C source of svga_dump_commands(), one switch case per cmds entry.'''
+    print r'''
+void            
+svga_dump_commands(const void *commands, uint32_t size)
+{
+   const uint8_t *next = commands;
+   const uint8_t *last = next + size;
+   /* Print each command in the size-byte stream at commands via debug_printf. */
+   assert(size % sizeof(uint32_t) == 0);
+   /* Each pass decodes the command starting at next and advances past it. */
+   while(next < last) {
+      const uint32_t cmd_id = *(const uint32_t *)next;
+
+      if(SVGA_3D_CMD_BASE <= cmd_id && cmd_id < SVGA_3D_CMD_MAX) {
+         const SVGA3dCmdHeader *header = (const SVGA3dCmdHeader *)next;
+         const uint8_t *body = (const uint8_t *)&header[1];
+         /* A truncated header or oversized body ends the dump; never read past last. */
+         if((uint32_t)(last - next) < sizeof *header || header->size > (uint32_t)(last - body))
+            break;
+         next = body + header->size;
+'''
+
+    print '         switch(cmd_id) {'
+    indexes = 'ijklmn'
+    for id, header, body, footer in cmds:
+        print '         case %s:' % id
+        print '            debug_printf("\\t%s\\n");' % id
+        print '            {'
+        print '               const %s *cmd = (const %s *)body;' % (header, header)
+        if len(body):
+            print '               unsigned ' + ', '.join(indexes[:len(body)]) + ';'
+        print '               dump_%s(cmd);' % header
+        print '               body = (const uint8_t *)&cmd[1];'
+        for idx, (struct, count) in zip(indexes, body):
+            # Counted arrays: the emitted loop also stops if the count overruns the command.
+            print '               for(%s = 0; %s < cmd->%s && body + sizeof(%s) <= next; ++%s) {' % (idx, idx, count, struct, idx)
+            print '                  dump_%s((const %s *)body);' % (struct, struct)
+            print '                  body += sizeof(%s);' % struct
+            print '               }'
+        if footer is not None:
+            print '               while(body + sizeof(%s) <= next) {' % footer
+            print '                  dump_%s((const %s *)body);' % (footer, footer)
+            print '                  body += sizeof(%s);' % footer
+            print '               }'
+        if id == 'SVGA_3D_CMD_SHADER_DEFINE':
+            print '               svga_shader_dump((const uint32_t *)body, (unsigned)(next - body)/sizeof(uint32_t), FALSE);'
+            print '               body = next;'
+        print '            }'
+        print '            break;'
+    print '         default:'
+    print '            debug_printf("\\t0x%08x\\n", cmd_id);'
+    print '            break;'
+    print '         }'
+            
+    print r'''         /* Whatever the decoder above left unconsumed is shown raw: words, then bytes. */
+         while(body + sizeof(uint32_t) <= next) {
+            debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
+            body += sizeof(uint32_t);
+         }
+         while(body < next)
+            debug_printf("\t\t0x%02x\n", *body++);
+      }
+      else if(cmd_id == SVGA_CMD_FENCE && (uint32_t)(last - next) >= 2*sizeof(uint32_t)) {
+         debug_printf("\tSVGA_CMD_FENCE\n");
+         debug_printf("\t\t0x%08x\n", ((const uint32_t *)next)[1]);
+         next += 2*sizeof(uint32_t);
+      }
+      else { /* unknown id, or a fence with no room left for its payload word */
+         debug_printf("\t0x%08x\n", cmd_id);
+         next += sizeof(uint32_t);
+      }
+   }
+}
+'''
+
+def main():  # writes the complete generated C file to stdout
+    print copyright.strip()
+    print
+    print '/**'
+    print ' * @file'
+    print ' * Dump SVGA commands.'
+    print ' *'
+    print ' * Generated automatically from svga3d_reg.h by svga_dump.py.'
+    print ' */'
+    print
+    print '#include "svga_types.h"'
+    print '#include "svga_shader_dump.h"'
+    print '#include "svga3d_reg.h"'
+    print
+    print '#include "pipe/p_debug.h"'
+    print '#include "svga_dump.h"'
+    print
+
+    config = parser.config_t(
+        include_paths = ['include'],
+        compiler = 'gcc',
+    )
+
+    headers = [
+        'include/svga_types.h', 
+        'include/svga3d_reg.h', 
+    ]
+
+    decls = parser.parse(headers, config, parser.COMPILATION_MODE.ALL_AT_ONCE)
+    global_ns = declarations.get_global_namespace(decls)
+    # Collect every struct name the command table refers to.
+    names = set()
+    for id, header, body, footer in cmds:
+        names.add(header)
+        for struct, count in body:
+            names.add(struct)
+        if footer is not None:
+            names.add(footer)
+    # One dump_<struct>() per collected name, then the command walker itself.
+    for class_ in global_ns.classes(lambda decl: decl.name in names):
+        dump_struct(decls, class_)
+
+    dump_cmds()
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader.h b/src/gallium/drivers/svga/svgadump/svga_shader.h
new file mode 100644
index 00000000000..9217af2dd99
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader.h
@@ -0,0 +1,220 @@
+/**********************************************************
+ * Copyright 2007-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Token Definitions
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#ifndef ST_SHADER_SVGA_H
+#define ST_SHADER_SVGA_H
+
+#include "pipe/p_compiler.h"
+
+struct sh_op   /* instruction token; the bit-fields add up to 32 bits */
+{
+   unsigned opcode:16;    /* SVGA3DOP_* value the dumper switches on */
+   unsigned control:8;    /* read as SVGA3DOPCONT_* or SVGA3DOPCOMP_* by the dumper */
+   unsigned length:4;
+   unsigned predicated:1; /* the dumper reads one extra source operand when set */
+   unsigned unused:1;
+   unsigned coissue:1;    /* dumped as a leading "+" */
+   unsigned is_reg:1;     /* the dumper asserts 0 for instruction tokens */
+};
+
+struct sh_reg   /* fields common to register tokens; 32 bits in total */
+{
+   unsigned number:11;   /* register index */
+   unsigned type_hi:2;   /* upper two bits of the register type, see sh_reg_type() */
+   unsigned relative:1;  /* non-zero when the register is relatively addressed */
+   unsigned unused:14;   /* occupied by masks/swizzles/modifiers in sh_dstreg and sh_srcreg */
+   unsigned type_lo:3;   /* lower three bits of the register type */
+   unsigned is_reg:1;    /* the dumper asserts 1 for register tokens */
+};
+
+/* Rebuild the 5-bit register type: type_hi supplies bits 3-4, type_lo bits 0-2. */
+static INLINE unsigned sh_reg_type( struct sh_reg reg )
+{
+   return (reg.type_hi << 3) | reg.type_lo;
+}
+
+struct sh_cdata   /* payload of a float constant definition */
+{
+   float xyzw[4];   /* four components, dumped in index order 0..3 */
+};
+
+struct sh_def   /* token layout the dumper reads for SVGA3DOP_DEF */
+{
+   struct sh_op op;         /* instruction token */
+   struct sh_reg reg;       /* register being defined */
+   struct sh_cdata cdata;   /* four float values */
+};
+
+struct sh_defb   /* token layout the dumper reads for SVGA3DOP_DEFB */
+{
+   struct sh_op op;     /* instruction token */
+   struct sh_reg reg;   /* register being defined */
+   uint data;           /* dumped as TRUE when non-zero, FALSE otherwise */
+};
+
+struct sh_idata   /* payload of an integer constant definition */
+{
+   int xyzw[4];   /* four components, dumped in index order 0..3 */
+};
+
+struct sh_defi   /* token layout the dumper reads for SVGA3DOP_DEFI */
+{
+   struct sh_op op;         /* instruction token */
+   struct sh_reg reg;       /* register being defined */
+   struct sh_idata idata;   /* four integer values */
+};
+
+#define PS_TEXTURETYPE_UNKNOWN   SVGA3DSAMP_UNKNOWN
+#define PS_TEXTURETYPE_2D        SVGA3DSAMP_2D
+#define PS_TEXTURETYPE_CUBE      SVGA3DSAMP_CUBE
+#define PS_TEXTURETYPE_VOLUME    SVGA3DSAMP_VOLUME
+
+struct ps_sampleinfo   /* DCL parameter token as read for sampler registers; 32 bits */
+{
+   unsigned unused:27;
+   unsigned texture_type:4;   /* compared against SVGA3DSAMP_* by the dumper */
+   unsigned is_reg:1;
+};
+
+struct vs_semantic   /* DCL parameter token as read for non-sampler registers; 32 bits */
+{
+   unsigned usage:5;         /* compared against SVGA3D_DECLUSAGE_* by the dumper */
+   unsigned unused1:11;
+   unsigned usage_index:4;   /* appended to the usage name when non-zero */
+   unsigned unused2:12;
+};
+
+struct sh_dstreg   /* destination register token; 32 bits, overlays struct sh_reg */
+{
+   unsigned number:11;      /* register index */
+   unsigned type_hi:2;      /* upper two bits of the register type */
+   unsigned relative:1;
+   unsigned unused:2;
+   unsigned write_mask:4;   /* tested against SVGA3DWRITEMASK_0..3 (x, y, z, w) */
+   unsigned modifier:4;     /* SVGA3DDSTMOD_* bits (saturate, partial precision) */
+   unsigned shift_scale:4;  /* dumped as _x2/_x4/_x8 (1..3) or _d8/_d4/_d2 (13..15) */
+   unsigned type_lo:3;      /* lower three bits of the register type */
+   unsigned is_reg:1;
+};
+
+/* Rebuild the 5-bit register type of a destination token (type_hi above type_lo). */
+static INLINE unsigned sh_dstreg_type( struct sh_dstreg reg )
+{
+   return (reg.type_hi << 3) | reg.type_lo;
+}
+
+struct sh_dcl   /* token layout the dumper reads for SVGA3DOP_DCL */
+{
+   struct sh_op op;   /* instruction token */
+   union {            /* one parameter token, interpreted according to the register type */
+      struct {
+         struct ps_sampleinfo sampleinfo;   /* chosen when reg is of type SVGA3DREG_SAMPLER */
+      } ps;
+      struct {
+         struct vs_semantic semantic;       /* chosen for every other register type */
+      } vs;
+   } u;
+   struct sh_dstreg reg;   /* register being declared */
+};
+
+
+struct sh_srcreg   /* source register token; 32 bits, overlays struct sh_reg */
+{
+   unsigned number:11;     /* register index */
+   unsigned type_hi:2;     /* upper two bits of the register type */
+   unsigned relative:1;    /* non-zero when relatively addressed */
+   unsigned unused:2;
+   unsigned swizzle_x:2;   /* each swizzle field is an index into "xyzw" */
+   unsigned swizzle_y:2;
+   unsigned swizzle_z:2;
+   unsigned swizzle_w:2;
+   unsigned modifier:4;    /* SVGA3DSRCMOD_* value */
+   unsigned type_lo:3;     /* lower three bits of the register type */
+   unsigned is_reg:1;
+};
+
+/* Rebuild the 5-bit register type of a source token (type_hi above type_lo). */
+static INLINE unsigned sh_srcreg_type( struct sh_srcreg reg )
+{
+   return (reg.type_hi << 3) | reg.type_lo;
+}
+
+struct sh_dstop   /* instruction token followed by one destination register */
+{
+   struct sh_op op;
+   struct sh_dstreg dst;
+};
+
+struct sh_srcop   /* instruction token followed by one source register */
+{
+   struct sh_op op;
+   struct sh_srcreg src;
+};
+
+struct sh_src2op   /* instruction token followed by two source registers */
+{
+   struct sh_op op;
+   struct sh_srcreg src0;
+   struct sh_srcreg src1;
+};
+
+struct sh_unaryop   /* instruction token, one destination, one source */
+{
+   struct sh_op op;
+   struct sh_dstreg dst;
+   struct sh_srcreg src;
+};
+
+struct sh_binaryop   /* instruction token, one destination, two sources */
+{
+   struct sh_op op;
+   struct sh_dstreg dst;
+   struct sh_srcreg src0;
+   struct sh_srcreg src1;
+};
+
+struct sh_trinaryop   /* instruction token, one destination, three sources */
+{
+   struct sh_op op;
+   struct sh_dstreg dst;
+   struct sh_srcreg src0;
+   struct sh_srcreg src1;
+   struct sh_srcreg src2;
+};
+
+struct sh_comment   /* comment token; 32 bits */
+{
+   unsigned opcode:16;   /* same position as sh_op.opcode */
+   unsigned size:16;     /* number of dwords after this token that the dumper skips */
+};
+
+#endif /* ST_SHADER_SVGA_H */
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_dump.c b/src/gallium/drivers/svga/svgadump/svga_shader_dump.c
new file mode 100644
index 00000000000..b0e7fdf378a
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_dump.c
@@ -0,0 +1,654 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Dump Facilities
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#include "svga_shader.h"
+#include "svga_shader_dump.h"
+#include "svga_shader_op.h"
+#include "util/u_debug.h"
+
+#include "../svga_hw_reg.h"
+#include "svga3d_shaderdefs.h"
+
+struct dump_info   /* per-shader state shared by the dump helpers */
+{
+   SVGA3dShaderVersion version;   /* taken from the first dword of the token stream */
+   boolean is_ps;                 /* TRUE when version.type == SVGA3D_PS_TYPE */
+};
+
+static void dump_op( struct sh_op op, const char *mnemonic )
+{
+   assert( op.predicated == 0 );
+   assert( op.is_reg == 0 );
+   /* Prints "[+]<mnemonic>[p|b]": "+" for a co-issued op, then a control suffix. */
+   if (op.coissue)
+      debug_printf( "+" );
+   debug_printf( "%s", mnemonic );
+   switch (op.control) {
+   case 0:                       /* no suffix */
+      break;
+   case SVGA3DOPCONT_PROJECT:
+      debug_printf( "p" );
+      break;
+   case SVGA3DOPCONT_BIAS:
+      debug_printf( "b" );
+      break;
+   default:                      /* any other control value is unexpected here */
+      assert( 0 );
+   }
+}
+
+
+static void dump_comp_op( struct sh_op op, const char *mnemonic )
+{
+   assert( op.is_reg == 0 );
+   /* Prints "[+]<mnemonic>[_cmp]", reading op.control as an SVGA3DOPCOMP_* comparison. */
+   if (op.coissue)
+      debug_printf( "+" );
+   debug_printf( "%s", mnemonic );
+   switch (op.control) {
+   case SVGA3DOPCOMP_RESERVED0:   /* no comparison suffix */
+      break;
+   case SVGA3DOPCOMP_GT:
+      debug_printf("_gt");
+      break;
+   case SVGA3DOPCOMP_EQ:
+      debug_printf("_eq");
+      break;
+   case SVGA3DOPCOMP_GE:
+      debug_printf("_ge");
+      break;
+   case SVGA3DOPCOMP_LT:
+      debug_printf("_lt");
+      break;
+   case SVGA3DOPCOMPC_NE:
+      debug_printf("_ne");
+      break;
+   case SVGA3DOPCOMP_LE:
+      debug_printf("_le");
+      break;
+   case SVGA3DOPCOMP_RESERVED1:   /* treated like any unknown value */
+   default:
+      assert( 0 );
+   }
+}
+
+
+static void dump_reg( struct sh_reg reg, struct sh_srcreg *indreg, const struct dump_info *di )
+{
+   assert( sh_reg_type( reg ) == SVGA3DREG_CONST || reg.relative == 0 );
+   assert( reg.is_reg == 1 );
+   /* Print the register name selected by its type; indreg is only read for relative constants. */
+   switch (sh_reg_type( reg )) {
+   case SVGA3DREG_TEMP:
+      debug_printf( "r%u", reg.number );
+      break;
+
+   case SVGA3DREG_INPUT:
+      debug_printf( "v%u", reg.number );
+      break;
+
+   case SVGA3DREG_CONST:
+      if (reg.relative) {   /* NOTE(review): indreg is dereferenced below, yet some callers pass NULL */
+         if (sh_srcreg_type( *indreg ) == SVGA3DREG_LOOP)
+            debug_printf( "c[aL+%u]", reg.number );
+         else
+            debug_printf( "c[a%u.x+%u]", indreg->number, reg.number );
+      }
+      else
+         debug_printf( "c%u", reg.number );
+      break;
+
+   case SVGA3DREG_ADDR:    /* VS */
+   /* SVGA3DREG_TEXTURE */ /* PS */
+      if (di->is_ps)
+         debug_printf( "t%u", reg.number );
+      else
+         debug_printf( "a%u", reg.number );
+      break;
+
+   case SVGA3DREG_RASTOUT:
+      switch (reg.number) {
+      case 0 /*POSITION*/:
+         debug_printf( "oPos" );
+         break;
+      case 1 /*FOG*/:
+         debug_printf( "oFog" );
+         break;
+      case 2 /*POINT_SIZE*/:
+         debug_printf( "oPts" );
+         break;
+      default:
+         assert( 0 );
+         debug_printf( "???" );
+      }
+      break;
+
+   case SVGA3DREG_ATTROUT:
+      assert( reg.number < 2 );
+      debug_printf( "oD%u", reg.number );
+      break;
+
+   case SVGA3DREG_TEXCRDOUT:
+   /* SVGA3DREG_OUTPUT */
+      debug_printf( "oT%u", reg.number );
+      break;
+
+   case SVGA3DREG_COLOROUT:
+      debug_printf( "oC%u", reg.number );
+      break;
+
+   case SVGA3DREG_DEPTHOUT:
+      debug_printf( "oD%u", reg.number );   /* NOTE(review): same "oD" prefix as SVGA3DREG_ATTROUT -- confirm intended */
+      break;
+
+   case SVGA3DREG_SAMPLER:
+      debug_printf( "s%u", reg.number );
+      break;
+
+   case SVGA3DREG_CONSTBOOL:
+      assert( !reg.relative );
+      debug_printf( "b%u", reg.number );
+      break;
+
+   case SVGA3DREG_CONSTINT:
+      assert( !reg.relative );
+      debug_printf( "i%u", reg.number );
+      break;
+
+   case SVGA3DREG_LOOP:
+      assert( reg.number == 0 );
+      debug_printf( "aL" );
+      break;
+
+   case SVGA3DREG_MISCTYPE:
+      switch (reg.number) {
+      case SVGA3DMISCREG_POSITION:
+         debug_printf( "vPos" );
+         break;
+      case SVGA3DMISCREG_FACE:
+         debug_printf( "vFace" );
+         break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+
+   case SVGA3DREG_LABEL:
+      debug_printf( "l%u", reg.number );
+      break;
+
+   case SVGA3DREG_PREDICATE:
+      debug_printf( "p%u", reg.number );
+      break;
+
+
+   default:
+      assert( 0 );
+      debug_printf( "???" );
+   }
+}
+
+static void dump_cdata( struct sh_cdata cdata )
+{  /* Print the four float components separated by ", ". */
+   debug_printf( "%f, %f, %f, %f", cdata.xyzw[0], cdata.xyzw[1], cdata.xyzw[2], cdata.xyzw[3] );
+}
+
+static void dump_idata( struct sh_idata idata )
+{  /* Print the four integer components separated by ", ". */
+   debug_printf( "%d, %d, %d, %d", idata.xyzw[0], idata.xyzw[1], idata.xyzw[2], idata.xyzw[3] );
+}
+
+static void dump_bdata( boolean bdata )
+{  /* Print TRUE or FALSE; the word is passed as data, never as the format string. */
+   debug_printf( "%s", bdata ? "TRUE" : "FALSE" );
+}
+
+static void dump_sampleinfo( struct ps_sampleinfo sampleinfo )
+{  /* Print the texture-type suffix of a sampler declaration. */
+   switch (sampleinfo.texture_type) {
+   case SVGA3DSAMP_2D:
+      debug_printf( "_2d" );
+      break;
+   case SVGA3DSAMP_CUBE:
+      debug_printf( "_cube" );
+      break;
+   case SVGA3DSAMP_VOLUME:
+      debug_printf( "_volume" );
+      break;
+   default:   /* every other texture type asserts and prints nothing */
+      assert( 0 );
+   }
+}
+
+
+static void dump_usageinfo( struct vs_semantic semantic )
+{  /* Print "_<usage>" followed by the usage index when that index is non-zero. */
+   switch (semantic.usage) {
+   case SVGA3D_DECLUSAGE_POSITION:
+      debug_printf("_position" );
+      break;
+   case SVGA3D_DECLUSAGE_BLENDWEIGHT:
+      debug_printf("_blendweight" );
+      break;
+   case SVGA3D_DECLUSAGE_BLENDINDICES:
+      debug_printf("_blendindices" );
+      break;
+   case SVGA3D_DECLUSAGE_NORMAL:
+      debug_printf("_normal" );
+      break;
+   case SVGA3D_DECLUSAGE_PSIZE:
+      debug_printf("_psize" );
+      break;
+   case SVGA3D_DECLUSAGE_TEXCOORD:
+      debug_printf("_texcoord");
+      break;
+   case SVGA3D_DECLUSAGE_TANGENT:
+      debug_printf("_tangent" );
+      break;
+   case SVGA3D_DECLUSAGE_BINORMAL:
+      debug_printf("_binormal" );
+      break;
+   case SVGA3D_DECLUSAGE_TESSFACTOR:
+      debug_printf("_tessfactor" );
+      break;
+   case SVGA3D_DECLUSAGE_POSITIONT:
+      debug_printf("_positiont" );
+      break;
+   case SVGA3D_DECLUSAGE_COLOR:
+      debug_printf("_color" );
+      break;
+   case SVGA3D_DECLUSAGE_FOG:
+      debug_printf("_fog" );
+      break;
+   case SVGA3D_DECLUSAGE_DEPTH:
+      debug_printf("_depth" );
+      break;
+   case SVGA3D_DECLUSAGE_SAMPLE:
+      debug_printf("_sample");
+      break;
+   default:
+      assert( 0 );
+      return;   /* unknown usage: nothing is printed, not even the index */
+   }
+
+   if (semantic.usage_index != 0) {
+      debug_printf("%d", semantic.usage_index );
+   }
+}
+
+static void dump_dstreg( struct sh_dstreg dstreg, const struct dump_info *di )
+{
+   union {   /* lets the destination token be handed to dump_reg() as a plain sh_reg */
+      struct sh_reg reg;
+      struct sh_dstreg dstreg;
+   } u;
+   /* Output order: modifier suffixes, shift suffix, a space, the register, then the write mask. */
+   assert( (dstreg.modifier & (SVGA3DDSTMOD_SATURATE | SVGA3DDSTMOD_PARTIALPRECISION)) == dstreg.modifier );
+
+   if (dstreg.modifier & SVGA3DDSTMOD_SATURATE)
+      debug_printf( "_sat" );
+   if (dstreg.modifier & SVGA3DDSTMOD_PARTIALPRECISION)
+      debug_printf( "_pp" );
+   switch (dstreg.shift_scale) {
+   case 0:
+      break;
+   case 1:
+      debug_printf( "_x2" );
+      break;
+   case 2:
+      debug_printf( "_x4" );
+      break;
+   case 3:
+      debug_printf( "_x8" );
+      break;
+   case 13:
+      debug_printf( "_d8" );
+      break;
+   case 14:
+      debug_printf( "_d4" );
+      break;
+   case 15:
+      debug_printf( "_d2" );
+      break;
+   default:
+      assert( 0 );
+   }
+   debug_printf( " " );
+
+   u.dstreg = dstreg;
+   dump_reg( u.reg, NULL, di );   /* no index register is supplied for destinations */
+   if (dstreg.write_mask != SVGA3DWRITEMASK_ALL) {   /* a full mask is left implicit */
+      debug_printf( "." );
+      if (dstreg.write_mask & SVGA3DWRITEMASK_0)
+         debug_printf( "x" );
+      if (dstreg.write_mask & SVGA3DWRITEMASK_1)
+         debug_printf( "y" );
+      if (dstreg.write_mask & SVGA3DWRITEMASK_2)
+         debug_printf( "z" );
+      if (dstreg.write_mask & SVGA3DWRITEMASK_3)
+         debug_printf( "w" );
+   }
+}
+
+static void dump_srcreg( struct sh_srcreg srcreg, struct sh_srcreg *indreg, const struct dump_info *di )
+{
+   union {   /* lets the source token be handed to dump_reg() as a plain sh_reg */
+      struct sh_reg reg;
+      struct sh_srcreg srcreg;
+   } u;
+   /* First switch: modifier text that goes in front of the register name. */
+   switch (srcreg.modifier) {
+   case SVGA3DSRCMOD_NEG:
+   case SVGA3DSRCMOD_BIASNEG:
+   case SVGA3DSRCMOD_SIGNNEG:
+   case SVGA3DSRCMOD_X2NEG:
+      debug_printf( "-" );
+      break;
+   case SVGA3DSRCMOD_ABS:
+      debug_printf( "|" );
+      break;
+   case SVGA3DSRCMOD_ABSNEG:
+      debug_printf( "-|" );
+      break;
+   case SVGA3DSRCMOD_COMP:
+      debug_printf( "1-" );
+      break;
+   case SVGA3DSRCMOD_NOT:
+      debug_printf( "!" );
+   }
+   /* Second switch (after the register): closing "|" or the modifier suffix. */
+   u.srcreg = srcreg;
+   dump_reg( u.reg, indreg, di );
+   switch (srcreg.modifier) {
+   case SVGA3DSRCMOD_NONE:
+   case SVGA3DSRCMOD_NEG:
+   case SVGA3DSRCMOD_COMP:
+   case SVGA3DSRCMOD_NOT:
+      break;
+   case SVGA3DSRCMOD_ABS:
+   case SVGA3DSRCMOD_ABSNEG:
+      debug_printf( "|" );
+      break;
+   case SVGA3DSRCMOD_BIAS:
+   case SVGA3DSRCMOD_BIASNEG:
+      debug_printf( "_bias" );
+      break;
+   case SVGA3DSRCMOD_SIGN:
+   case SVGA3DSRCMOD_SIGNNEG:
+      debug_printf( "_bx2" );
+      break;
+   case SVGA3DSRCMOD_X2:
+   case SVGA3DSRCMOD_X2NEG:
+      debug_printf( "_x2" );
+      break;
+   case SVGA3DSRCMOD_DZ:
+      debug_printf( "_dz" );
+      break;
+   case SVGA3DSRCMOD_DW:
+      debug_printf( "_dw" );
+      break;
+   default:
+      assert( 0 );
+   }
+   if (srcreg.swizzle_x != 0 || srcreg.swizzle_y != 1 || srcreg.swizzle_z != 2 || srcreg.swizzle_w != 3) {   /* the identity swizzle is left implicit */
+      debug_printf( "." );
+      if (srcreg.swizzle_x == srcreg.swizzle_y && srcreg.swizzle_y == srcreg.swizzle_z && srcreg.swizzle_z == srcreg.swizzle_w) {
+         debug_printf( "%c", "xyzw"[srcreg.swizzle_x] );   /* all four equal: print a single letter */
+      }
+      else {
+         debug_printf( "%c", "xyzw"[srcreg.swizzle_x] );
+         debug_printf( "%c", "xyzw"[srcreg.swizzle_y] );
+         debug_printf( "%c", "xyzw"[srcreg.swizzle_z] );
+         debug_printf( "%c", "xyzw"[srcreg.swizzle_w] );
+      }
+   }
+}
+
+/**
+ * Disassemble an SVGA3D shader token stream (version token first, @p dwords
+ * dwords long) and print it with debug_printf.  When @p do_binary is non-zero
+ * the raw dwords are printed in hex beforehand.
+ */
+void
+svga_shader_dump(
+   const unsigned *assem,
+   unsigned dwords,
+   unsigned do_binary )
+{
+   const unsigned *start = assem;
+   boolean finished = FALSE;
+   struct dump_info di;
+   unsigned i;
+
+   if (do_binary) {
+      for (i = 0; i < dwords; i++) 
+         debug_printf("  0x%08x,\n", assem[i]);
+      
+      debug_printf("\n\n");
+   }
+
+   /* Without at least the version token there is nothing to decode. */
+   if (dwords == 0)
+      return;
+
+   di.version.value = *assem++;
+   di.is_ps = (di.version.type == SVGA3D_PS_TYPE);
+
+   debug_printf(
+      "%s_%u_%u\n",
+      di.is_ps ? "ps" : "vs",
+      di.version.major,
+      di.version.minor );
+
+   while (!finished) {
+      struct sh_op op;
+
+      /* Bounds-check before touching the token, not after. */
+      if ((unsigned) (assem - start) >= dwords) {
+         debug_printf("... ran off end of buffer\n");
+         assert(0);
+         return;
+      }
+
+      op = *(const struct sh_op *) assem;
+
+      switch (op.opcode) {
+      case SVGA3DOP_DCL:
+         {
+            struct sh_dcl dcl = *(const struct sh_dcl *) assem;
+
+            debug_printf( "dcl" );
+            if (sh_dstreg_type( dcl.reg ) == SVGA3DREG_SAMPLER)
+               dump_sampleinfo( dcl.u.ps.sampleinfo );
+            else if (di.is_ps) {
+               if (di.version.major == 3 && 
+                   sh_dstreg_type( dcl.reg ) != SVGA3DREG_MISCTYPE)
+                  dump_usageinfo( dcl.u.vs.semantic );
+            }
+            else
+               dump_usageinfo( dcl.u.vs.semantic );
+            dump_dstreg( dcl.reg, &di );
+            debug_printf( "\n" );
+            assem += sizeof( struct sh_dcl ) / sizeof( unsigned );
+         }
+         break;
+
+      case SVGA3DOP_DEFB:
+         {
+            struct sh_defb defb = *(const struct sh_defb *) assem;
+
+            debug_printf( "defb " );
+            dump_reg( defb.reg, NULL, &di );
+            debug_printf( ", " );
+            dump_bdata( defb.data );
+            debug_printf( "\n" );
+            assem += sizeof( struct sh_defb ) / sizeof( unsigned );
+         }
+         break;
+
+      case SVGA3DOP_DEFI:
+         {
+            struct sh_defi defi = *(const struct sh_defi *) assem;
+
+            debug_printf( "defi " );
+            dump_reg( defi.reg, NULL, &di );
+            debug_printf( ", " );
+            dump_idata( defi.idata );
+            debug_printf( "\n" );
+            assem += sizeof( struct sh_defi ) / sizeof( unsigned );
+         }
+         break;
+
+      case SVGA3DOP_TEXCOORD:
+         assert( di.is_ps );
+         dump_op( op, "texcoord" );
+         {
+            /* Decoded as dst + one src; the dst-only decode was dead "if (0)" code. */
+            struct sh_unaryop unaryop = *(const struct sh_unaryop *) assem;
+
+            dump_dstreg( unaryop.dst, &di );
+            debug_printf( ", " );
+            dump_srcreg( unaryop.src, NULL, &di );
+            assem += sizeof( struct sh_unaryop ) / sizeof( unsigned );
+         }
+         debug_printf( "\n" );
+         break;
+
+      case SVGA3DOP_TEX:
+         assert( di.is_ps );
+         {
+            /* Decoded as "texld dst, src0, src1"; the "tex" decodes were dead "if (0)" code. */
+            struct sh_binaryop binaryop = *(const struct sh_binaryop *) assem;
+
+            dump_op( op, "texld" );
+            dump_dstreg( binaryop.dst, &di );
+            debug_printf( ", " );
+            dump_srcreg( binaryop.src0, NULL, &di );
+            debug_printf( ", " );
+            dump_srcreg( binaryop.src1, NULL, &di );
+            assem += sizeof( struct sh_binaryop ) / sizeof( unsigned );
+         }
+         debug_printf( "\n" );
+         break;
+
+      case SVGA3DOP_DEF:
+         {
+            struct sh_def def = *(const struct sh_def *) assem;
+
+            debug_printf( "def " );
+            dump_reg( def.reg, NULL, &di );
+            debug_printf( ", " );
+            dump_cdata( def.cdata );
+            debug_printf( "\n" );
+            assem += sizeof( struct sh_def ) / sizeof( unsigned );
+         }
+         break;
+
+      case SVGA3DOP_PHASE:
+         debug_printf( "phase\n" );
+         assem += sizeof( struct sh_op ) / sizeof( unsigned );
+         break;
+
+      case SVGA3DOP_COMMENT:
+         {
+            struct sh_comment comment = *(const struct sh_comment *) assem;
+
+            /* Ignore comment contents. */
+            assem += sizeof(struct sh_comment) / sizeof(unsigned) + comment.size;
+         }
+         break;
+
+      case SVGA3DOP_RET:
+         debug_printf( "ret\n" );
+         assem += sizeof( struct sh_op ) / sizeof( unsigned );
+         break;
+
+      case SVGA3DOP_END:
+         debug_printf( "end\n" );
+         finished = TRUE;
+         break;
+
+      default:
+         {
+            const struct sh_opcode_info *info = svga_opcode_info( op.opcode );
+            uint num_src;
+            boolean not_first_arg = FALSE;
+
+            /* svga_opcode_info() yields NULL for opcodes it cannot describe;
+             * the operand count is then unknown, so decoding must stop. */
+            if (!info) {
+               debug_printf( "??? (opcode %u)\n", op.opcode );
+               return;
+            }
+
+            assert( info->num_dst <= 1 );
+
+            num_src = info->num_src + op.predicated;
+            if (op.opcode == SVGA3DOP_SINCOS && di.version.major < 3)
+               num_src += 2;
+
+            dump_comp_op( op, info->mnemonic );
+            assem += sizeof( struct sh_op ) / sizeof( unsigned );
+
+            if (info->num_dst > 0) {
+               struct sh_dstreg dstreg = *(const struct sh_dstreg *) assem;
+
+               dump_dstreg( dstreg, &di );
+               assem += sizeof( struct sh_dstreg ) / sizeof( unsigned );
+               not_first_arg = TRUE;
+            }
+
+            for (i = 0; i < num_src; i++) {
+               struct sh_srcreg srcreg;
+               struct sh_srcreg indreg = { 0 };   /* stays zeroed when no index token follows */
+
+               srcreg = *(const struct sh_srcreg *) assem;
+               assem += sizeof( struct sh_srcreg ) / sizeof( unsigned );
+               if (srcreg.relative && !di.is_ps && di.version.major >= 2) {
+                  indreg = *(const struct sh_srcreg *) assem;
+                  assem += sizeof( struct sh_srcreg ) / sizeof( unsigned );
+               }
+
+               if (not_first_arg)
+                  debug_printf( ", " );
+               else
+                  debug_printf( " " );
+               dump_srcreg( srcreg, &indreg, &di );
+               not_first_arg = TRUE;
+            }
+
+            debug_printf( "\n" );
+         }
+      }
+   }
+}
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_dump.h b/src/gallium/drivers/svga/svgadump/svga_shader_dump.h
new file mode 100644
index 00000000000..a2657acb2f1
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_dump.h
@@ -0,0 +1,42 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Dump Facilities
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#ifndef SVGA_SHADER_DUMP_H
+#define SVGA_SHADER_DUMP_H
+
+void
+svga_shader_dump(
+   const unsigned *assem,
+   unsigned dwords,
+   unsigned do_binary );
+
+#endif /* SVGA_SHADER_DUMP_H */
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_op.c b/src/gallium/drivers/svga/svgadump/svga_shader_op.c
new file mode 100644
index 00000000000..8343bfdaab4
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_op.c
@@ -0,0 +1,168 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Token Opcode Info
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#include "util/u_debug.h"
+#include "svga_shader_op.h"
+
+#include "../svga_hw_reg.h"
+#include "svga3d_shaderdefs.h"
+
+#define SVGA3DOP_INVALID SVGA3DOP_END
+#define TGSI_OPCODE_INVALID TGSI_OPCODE_LAST
+
+static struct sh_opcode_info opcode_info[] =
+{  /* { mnemonic, num_dst, num_src, svga_opcode }; row N is looked up for opcode N */
+   { "nop",          0, 0, SVGA3DOP_NOP          },
+   { "mov",          1, 1, SVGA3DOP_MOV,         },
+   { "add",          1, 2, SVGA3DOP_ADD,         },
+   { "sub",          1, 2, SVGA3DOP_SUB,         },
+   { "mad",          1, 3, SVGA3DOP_MAD,         },
+   { "mul",          1, 2, SVGA3DOP_MUL,         },
+   { "rcp",          1, 1, SVGA3DOP_RCP,         },
+   { "rsq",          1, 1, SVGA3DOP_RSQ,         },
+   { "dp3",          1, 2, SVGA3DOP_DP3,         },
+   { "dp4",          1, 2, SVGA3DOP_DP4,         },
+   { "min",          1, 2, SVGA3DOP_MIN,         },
+   { "max",          1, 2, SVGA3DOP_MAX,         },
+   { "slt",          1, 2, SVGA3DOP_SLT,         },
+   { "sge",          1, 2, SVGA3DOP_SGE,         },
+   { "exp",          1, 1, SVGA3DOP_EXP,         },
+   { "log",          1, 1, SVGA3DOP_LOG,         },
+   { "lit",          1, 1, SVGA3DOP_LIT,         },
+   { "dst",          1, 2, SVGA3DOP_DST,         },
+   { "lrp",          1, 3, SVGA3DOP_LRP,         },
+   { "frc",          1, 1, SVGA3DOP_FRC,         },
+   { "m4x4",         1, 2, SVGA3DOP_M4x4,        },
+   { "m4x3",         1, 2, SVGA3DOP_M4x3,        },
+   { "m3x4",         1, 2, SVGA3DOP_M3x4,        },
+   { "m3x3",         1, 2, SVGA3DOP_M3x3,        },
+   { "m3x2",         1, 2, SVGA3DOP_M3x2,        },
+   { "call",         0, 1, SVGA3DOP_CALL,        },
+   { "callnz",       0, 2, SVGA3DOP_CALLNZ,      },
+   { "loop",         0, 2, SVGA3DOP_LOOP,        },
+   { "ret",          0, 0, SVGA3DOP_RET,         },
+   { "endloop",      0, 0, SVGA3DOP_ENDLOOP,     },
+   { "label",        0, 1, SVGA3DOP_LABEL,       },
+   { "dcl",          0, 0, SVGA3DOP_DCL,         },
+   { "pow",          1, 2, SVGA3DOP_POW,         },
+   { "crs",          1, 2, SVGA3DOP_CRS,         },
+   { "sgn",          1, 3, SVGA3DOP_SGN,         },
+   { "abs",          1, 1, SVGA3DOP_ABS,         },
+   { "nrm",          1, 1, SVGA3DOP_NRM,         }, /* 3-component normalization */
+   { "sincos",       1, 1, SVGA3DOP_SINCOS,      },
+   { "rep",          0, 1, SVGA3DOP_REP,         },
+   { "endrep",       0, 0, SVGA3DOP_ENDREP,      },
+   { "if",           0, 1, SVGA3DOP_IF,          },
+   { "ifc",          0, 2, SVGA3DOP_IFC,         },
+   { "else",         0, 0, SVGA3DOP_ELSE,        },
+   { "endif",        0, 0, SVGA3DOP_ENDIF,       },
+   { "break",        0, 0, SVGA3DOP_BREAK,       },
+   { "breakc",       0, 2, SVGA3DOP_BREAKC,      }, /* compares two sources, like ifc */
+   { "mova",         1, 1, SVGA3DOP_MOVA,        },
+   { "defb",         0, 0, SVGA3DOP_DEFB,        },
+   { "defi",         0, 0, SVGA3DOP_DEFI,        },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "???",          0, 0, SVGA3DOP_INVALID,     },
+   { "texcoord",     0, 0, SVGA3DOP_TEXCOORD,    },
+   { "texkill",      1, 0, SVGA3DOP_TEXKILL,     },
+   { "tex",          0, 0, SVGA3DOP_TEX,         },
+   { "texbem",       1, 1, SVGA3DOP_TEXBEM,      },
+   { "texbeml",      1, 1, SVGA3DOP_TEXBEML,     },
+   { "texreg2ar",    1, 1, SVGA3DOP_TEXREG2AR,   },
+   { "texreg2gb",    1, 1, SVGA3DOP_TEXREG2GB,   },
+   { "texm3x2pad",   1, 1, SVGA3DOP_TEXM3x2PAD,  },
+   { "texm3x2tex",   1, 1, SVGA3DOP_TEXM3x2TEX,  },
+   { "texm3x3pad",   1, 1, SVGA3DOP_TEXM3x3PAD,  },
+   { "texm3x3tex",   1, 1, SVGA3DOP_TEXM3x3TEX,  },
+   { "reserved0",    0, 0, SVGA3DOP_RESERVED0,   },
+   { "texm3x3spec",  1, 2, SVGA3DOP_TEXM3x3SPEC, },
+   { "texm3x3vspec", 1, 1, SVGA3DOP_TEXM3x3VSPEC,},
+   { "expp",         1, 1, SVGA3DOP_EXPP,        },
+   { "logp",         1, 1, SVGA3DOP_LOGP,        },
+   { "cnd",          1, 3, SVGA3DOP_CND,         },
+   { "def",          0, 0, SVGA3DOP_DEF,         },
+   { "texreg2rgb",   1, 1, SVGA3DOP_TEXREG2RGB,  },
+   { "texdp3tex",    1, 1, SVGA3DOP_TEXDP3TEX,   },
+   { "texm3x2depth", 1, 1, SVGA3DOP_TEXM3x2DEPTH,},
+   { "texdp3",       1, 1, SVGA3DOP_TEXDP3,      },
+   { "texm3x3",      1, 1, SVGA3DOP_TEXM3x3,     },
+   { "texdepth",     1, 0, SVGA3DOP_TEXDEPTH,    },
+   { "cmp",          1, 3, SVGA3DOP_CMP,         },
+   { "bem",          1, 2, SVGA3DOP_BEM,         },
+   { "dp2add",       1, 3, SVGA3DOP_DP2ADD,      },
+   { "dsx",          1, 1, SVGA3DOP_INVALID,     },
+   { "dsy",          1, 1, SVGA3DOP_INVALID,     },
+   { "texldd",       1, 1, SVGA3DOP_INVALID,     },
+   { "setp",         1, 2, SVGA3DOP_SETP,        },
+   { "texldl",       1, 1, SVGA3DOP_INVALID,     },
+   { "breakp",       1, 1, SVGA3DOP_INVALID,     },
+};
+
+const struct sh_opcode_info *svga_opcode_info( uint op )
+{
+   const uint num_entries = sizeof( opcode_info ) / sizeof( opcode_info[0] );
+   const struct sh_opcode_info *entry;
+
+   /* PHASE, COMMENT, END and anything else beyond the table have no row
+    * to hand back.
+    */
+   if (op >= num_entries) {
+      assert( 0 );
+      return NULL;
+   }
+
+   entry = &opcode_info[op];
+
+   /* Placeholder rows carry no usable dst/src register counts. */
+   if (entry->svga_opcode == SVGA3DOP_INVALID) {
+      assert( 0 );
+      return NULL;
+   }
+
+   /* The table must be laid out so that row N describes opcode N. */
+   assert( op == entry->svga_opcode );
+
+   return entry;
+}
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_op.h b/src/gallium/drivers/svga/svgadump/svga_shader_op.h
new file mode 100644
index 00000000000..e558de02c53
--- /dev/null
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_op.h
@@ -0,0 +1,46 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA Shader Token Opcode Info
+ * 
+ * @author Michal Krol <michal@vmware.com>
+ */
+
+#ifndef SVGA_SHADER_OP_H
+#define SVGA_SHADER_OP_H
+
+struct sh_opcode_info
+{  /* Per-opcode description handed out by svga_opcode_info(). */
+   const char *mnemonic;      /**< opcode name as text, e.g. "cmp" */
+   unsigned num_dst:8;        /**< number of destination registers */
+   unsigned num_src:8;        /**< number of source registers */
+   unsigned svga_opcode:16;   /**< SVGA3DOP_x value; SVGA3DOP_INVALID marks an entry with no valid info */
+};
+
+const struct sh_opcode_info *svga_opcode_info( unsigned op );
+
+#endif /* SVGA_SHADER_OP_H */
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index ae0af4d0557..bf470b46ae1 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -125,11 +125,11 @@ trace_context_draw_block(struct trace_context *tr_ctx, int flag)
    } else if ((tr_ctx->draw_rule.blocker & flag) &&
               (tr_ctx->draw_blocker & 4)) {
       boolean block = FALSE;
-      debug_printf("%s (%lu %lu) (%lu %lu) (%lu %u) (%lu %u)\n", __FUNCTION__,
-					tr_ctx->draw_rule.fs, tr_ctx->curr.fs,
-					tr_ctx->draw_rule.vs, tr_ctx->curr.vs,
-					tr_ctx->draw_rule.surf, 0,
-					tr_ctx->draw_rule.tex, 0);
+      debug_printf("%s (%p %p) (%p %p) (%p %u) (%p %u)\n", __FUNCTION__,
+                   (void *) tr_ctx->draw_rule.fs, (void *) tr_ctx->curr.fs,
+                   (void *) tr_ctx->draw_rule.vs, (void *) tr_ctx->curr.vs,
+                   (void *) tr_ctx->draw_rule.surf, 0,
+                   (void *) tr_ctx->draw_rule.tex, 0);
       if (tr_ctx->draw_rule.fs &&
           tr_ctx->draw_rule.fs == tr_ctx->curr.fs)
          block = TRUE;
diff --git a/src/gallium/drivers/trace/tr_drm.c b/src/gallium/drivers/trace/tr_drm.c
index 781ca5d3bc0..48d1c4051cc 100644
--- a/src/gallium/drivers/trace/tr_drm.c
+++ b/src/gallium/drivers/trace/tr_drm.c
@@ -150,7 +150,9 @@ trace_drm_destroy(struct drm_api *_api)
 {
    struct trace_drm_api *tr_api = trace_drm_api(_api);
    struct drm_api *api = tr_api->api;
-   api->destroy(api);
+
+   if (api->destroy)
+      api->destroy(api);
 
    free(tr_api);
 }
diff --git a/src/gallium/drivers/trace/tr_rbug.c b/src/gallium/drivers/trace/tr_rbug.c
index e85ac15edca..81e0a6f3b00 100644
--- a/src/gallium/drivers/trace/tr_rbug.c
+++ b/src/gallium/drivers/trace/tr_rbug.c
@@ -44,7 +44,7 @@
 
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  define sleep Sleep
-#elif defined(PIPE_OS_LINUX)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD)
 void usleep(int);
 #  define sleep usleep
 #else
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index 26f1c04594f..7da9bd3866b 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -366,7 +366,8 @@ trace_screen_get_tex_transfer(struct pipe_screen *_screen,
 
    trace_dump_call_end();
 
-   result = trace_transfer_create(tr_tex, result);
+   if (result)
+      result = trace_transfer_create(tr_tex, result);
 
    return result;
 }
@@ -403,7 +404,7 @@ trace_screen_transfer_map(struct pipe_screen *_screen,
 
    map = screen->transfer_map(screen, transfer);
    if(map) {
-      if(transfer->usage != PIPE_TRANSFER_READ) {
+      if(transfer->usage & PIPE_TRANSFER_WRITE) {
          assert(!tr_trans->map);
          tr_trans->map = map;
       }
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index c13cffceb0a..c36286f9bee 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -167,11 +167,17 @@ typedef unsigned char boolean;
 #define ALIGN16_ASSIGN(NAME) NAME##___aligned
 #define ALIGN16_ATTRIB  __attribute__(( aligned( 16 ) ))
 #define ALIGN8_ATTRIB  __attribute__(( aligned( 8 ) ))
+#if __GNUC__ > 4 || (__GNUC__ == 4 &&__GNUC_MINOR__>1)
+#define ALIGN_STACK __attribute__((force_align_arg_pointer))
+#else
+#define ALIGN_STACK
+#endif
 #else
 #define ALIGN16_DECL(TYPE, NAME, SIZE)  TYPE NAME##___unaligned[SIZE + 1]
 #define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned)
 #define ALIGN16_ATTRIB
 #define ALIGN8_ATTRIB
+#define ALIGN_STACK
 #endif
 
 
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index de99957d9d0..f6feea5f74d 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -122,18 +122,35 @@
 
 #if defined(__linux__)
 #define PIPE_OS_LINUX
+#define PIPE_OS_UNIX
 #endif
 
 #if defined(__FreeBSD__)
+#define PIPE_OS_FREEBSD
 #define PIPE_OS_BSD
+#define PIPE_OS_UNIX
+#endif
+
+#if defined(__OpenBSD__)
+#define PIPE_OS_OPENBSD
+#define PIPE_OS_BSD
+#define PIPE_OS_UNIX
+#endif
+
+#if defined(__NetBSD__)
+#define PIPE_OS_NETBSD
+#define PIPE_OS_BSD
+#define PIPE_OS_UNIX
 #endif
 
 #if defined(__sun)
 #define PIPE_OS_SOLARIS
+#define PIPE_OS_UNIX
 #endif
 
 #if defined(__APPLE__)
 #define PIPE_OS_APPLE
+#define PIPE_OS_UNIX
 #endif
 
 #if defined(_WIN32) || defined(WIN32)
@@ -142,6 +159,7 @@
 
 #if defined(__HAIKU__)
 #define PIPE_OS_HAIKU
+#define PIPE_OS_UNIX
 #endif
 
 /*
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 39620a71980..5569001e601 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -189,6 +189,9 @@ struct pipe_context {
 
    /**
     * Surface functions
+    *
+    * The pipe driver is allowed to set these functions to NULL, and in that
+    * case, they will not be available.
     */
    /*@{*/
 
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index b01ab6d137c..fd14dc8e92d 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -34,6 +34,23 @@
 extern "C" {
 #endif
 
+/**
+ * Gallium error codes.
+ *
+ * - A zero value always means success.
+ * - A negative value always means failure.
+ * - The meaning of a positive value is function dependent.
+ */
+enum pipe_error {
+   PIPE_OK = 0,
+   PIPE_ERROR = -1,    /**< Generic error */
+   PIPE_ERROR_BAD_INPUT = -2,
+   PIPE_ERROR_OUT_OF_MEMORY = -3,
+   PIPE_ERROR_RETRY = -4
+   /* TODO */
+};
+
+
 #define PIPE_BLENDFACTOR_ONE                 0x1
 #define PIPE_BLENDFACTOR_SRC_COLOR           0x2
 #define PIPE_BLENDFACTOR_SRC_ALPHA           0x3
@@ -193,13 +210,25 @@ enum pipe_texture_target {
 enum pipe_transfer_usage {
    PIPE_TRANSFER_READ = (1 << 0),
    PIPE_TRANSFER_WRITE = (1 << 1),
-   PIPE_TRANSFER_READ_WRITE = PIPE_TRANSFER_READ | PIPE_TRANSFER_WRITE /**< Read/modify/write */
+   /** Read/modify/write */
+   PIPE_TRANSFER_READ_WRITE = PIPE_TRANSFER_READ | PIPE_TRANSFER_WRITE,
+   /** 
+    * The transfer should map the texture storage directly. The driver may
+    * return NULL if that isn't possible, and the state tracker needs to cope
+    * with that and use an alternative path without this flag.
+    *
+    * E.g. the state tracker could have a simpler path which maps textures and
+    * does read/modify/write cycles on them directly, and a more complicated
+    * path which uses minimal read and write transfers.
+    */
+   PIPE_TRANSFER_MAP_DIRECTLY = (1 << 2)
 };
 
 
-/**
+/*
  * Buffer usage flags
  */
+
 #define PIPE_BUFFER_USAGE_CPU_READ  (1 << 0)
 #define PIPE_BUFFER_USAGE_CPU_WRITE (1 << 1)
 #define PIPE_BUFFER_USAGE_GPU_READ  (1 << 2)
@@ -208,9 +237,63 @@ enum pipe_transfer_usage {
 #define PIPE_BUFFER_USAGE_VERTEX    (1 << 5)
 #define PIPE_BUFFER_USAGE_INDEX     (1 << 6)
 #define PIPE_BUFFER_USAGE_CONSTANT  (1 << 7)
+
+/*
+ * CPU access flags.
+ *
+ * These flags should only be used for texture transfers or when mapping
+ * buffers.
+ *
+ * Note that the PIPE_BUFFER_USAGE_CPU_xxx flags above are also used for
+ * mapping. Either PIPE_BUFFER_USAGE_CPU_READ or PIPE_BUFFER_USAGE_CPU_WRITE
+ * must be set.
+ */
+
+/**
+ * Discards the memory within the mapped region.
+ *
+ * It should not be used with PIPE_BUFFER_USAGE_CPU_READ.
+ *
+ * See also:
+ * - OpenGL's ARB_map_buffer_range extension, MAP_INVALIDATE_RANGE_BIT flag.
+ * - Direct3D's D3DLOCK_DISCARD flag.
+ */
 #define PIPE_BUFFER_USAGE_DISCARD   (1 << 8)
+
+/**
+ * Fail if the resource cannot be mapped immediately.
+ *
+ * See also:
+ * - Direct3D's D3DLOCK_DONOTWAIT flag.
+ * - Mesa3D's MESA_MAP_NOWAIT_BIT flag.
+ * - WDDM's D3DDDICB_LOCKFLAGS.DonotWait flag.
+ */
 #define PIPE_BUFFER_USAGE_DONTBLOCK (1 << 9)
-#define PIPE_BUFFER_USAGE_FLUSH_EXPLICIT (1 << 10) /**< See pipe_screen::buffer_flush_mapped_range */
+
+/**
+ * Do not attempt to synchronize pending operations on the resource when mapping.
+ *
+ * It should not be used with PIPE_BUFFER_USAGE_CPU_READ.
+ *
+ * See also:
+ * - OpenGL's ARB_map_buffer_range extension, MAP_UNSYNCHRONIZED_BIT flag.
+ * - Direct3D's D3DLOCK_NOOVERWRITE flag.
+ * - WDDM's D3DDDICB_LOCKFLAGS.IgnoreSync flag.
+ */
+#define PIPE_BUFFER_USAGE_UNSYNCHRONIZED (1 << 10)
+
+/**
+ * Written ranges will be notified later with
+ * pipe_screen::buffer_flush_mapped_range.
+ *
+ * It should not be used with PIPE_BUFFER_USAGE_CPU_READ.
+ *
+ * See also:
+ * - pipe_screen::buffer_flush_mapped_range
+ * - OpenGL's ARB_map_buffer_range extension, MAP_FLUSH_EXPLICIT_BIT flag.
+ */
+#define PIPE_BUFFER_USAGE_FLUSH_EXPLICIT (1 << 11)
+
 /** Pipe driver custom usage flags should be greater or equal to this value */
 #define PIPE_BUFFER_USAGE_CUSTOM    (1 << 16)
 
@@ -281,7 +364,7 @@ enum pipe_transfer_usage {
 #define PIPE_CAP_NPOT_TEXTURES           2
 #define PIPE_CAP_TWO_SIDED_STENCIL       3
 #define PIPE_CAP_GLSL                    4  /* XXX need something better */
-#define PIPE_CAP_S3TC                    5
+#define PIPE_CAP_S3TC                    5  /* XXX: deprecated; cap determined via supported sampler formats */
 #define PIPE_CAP_ANISOTROPIC_FILTER      6
 #define PIPE_CAP_POINT_SPRITE            7
 #define PIPE_CAP_MAX_RENDER_TARGETS      8
@@ -305,6 +388,8 @@ enum pipe_transfer_usage {
 #define PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS 26
 #define PIPE_CAP_TGSI_CONT_SUPPORTED     27
 #define PIPE_CAP_BLEND_EQUATION_SEPARATE 28
+#define PIPE_CAP_SM3                     29  /**< Shader Model 3 supported */
+#define PIPE_CAP_MAX_PREDICATE_REGISTERS 30
 
 
 /**
@@ -315,6 +400,31 @@ enum pipe_transfer_usage {
 #define PIPE_REFERENCED_FOR_READ  (1 << 0)
 #define PIPE_REFERENCED_FOR_WRITE (1 << 1)
 
+
+enum pipe_video_codec
+{
+   PIPE_VIDEO_CODEC_UNKNOWN = 0,
+   PIPE_VIDEO_CODEC_MPEG12,   /**< MPEG1, MPEG2 */
+   PIPE_VIDEO_CODEC_MPEG4,    /**< DIVX, XVID */
+   PIPE_VIDEO_CODEC_VC1,      /**< WMV */
+   PIPE_VIDEO_CODEC_MPEG4_AVC /**< H.264 */
+};
+
+enum pipe_video_profile
+{
+   PIPE_VIDEO_PROFILE_MPEG1,
+   PIPE_VIDEO_PROFILE_MPEG2_SIMPLE,
+   PIPE_VIDEO_PROFILE_MPEG2_MAIN,
+   PIPE_VIDEO_PROFILE_MPEG4_SIMPLE,
+   PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE,
+   PIPE_VIDEO_PROFILE_VC1_SIMPLE,
+   PIPE_VIDEO_PROFILE_VC1_MAIN,
+   PIPE_VIDEO_PROFILE_VC1_ADVANCED,
+   PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE,
+   PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN,
+   PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h
index c4469d4a9e9..af230809201 100644
--- a/src/gallium/include/pipe/p_format.h
+++ b/src/gallium/include/pipe/p_format.h
@@ -613,6 +613,24 @@ pf_has_alpha( enum pipe_format format )
    }
 }
 
+enum pipe_video_chroma_format
+{
+   PIPE_VIDEO_CHROMA_FORMAT_420,
+   PIPE_VIDEO_CHROMA_FORMAT_422,
+   PIPE_VIDEO_CHROMA_FORMAT_444
+};
+
+#if 0
+enum pipe_video_surface_format
+{
+   PIPE_VIDEO_SURFACE_FORMAT_NV12,  /**< Planar; Y plane, UV plane */
+   PIPE_VIDEO_SURFACE_FORMAT_YV12,  /**< Planar; Y plane, U plane, V plane */
+   PIPE_VIDEO_SURFACE_FORMAT_YUYV,  /**< Interleaved; Y,U,Y,V,Y,U,Y,V */
+   PIPE_VIDEO_SURFACE_FORMAT_UYVY,  /**< Interleaved; U,Y,V,Y,U,Y,V,Y */
+   PIPE_VIDEO_SURFACE_FORMAT_VUYA   /**< Packed; A31-24|Y23-16|U15-8|V7-0 */
+};
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/include/pipe/p_inlines.h b/src/gallium/include/pipe/p_inlines.h
index a5c1e8270a3..5fbd62a03d2 100644
--- a/src/gallium/include/pipe/p_inlines.h
+++ b/src/gallium/include/pipe/p_inlines.h
@@ -118,7 +118,7 @@ pipe_buffer_write(struct pipe_screen *screen,
                   unsigned offset, unsigned size,
                   const void *data)
 {
-   uint8_t *map;
+   void *map;
    
    assert(offset < buf->size);
    assert(offset + size <= buf->size);
@@ -129,7 +129,7 @@ pipe_buffer_write(struct pipe_screen *screen,
                                PIPE_BUFFER_USAGE_FLUSH_EXPLICIT);
    assert(map);
    if(map) {
-      memcpy(map + offset, data, size);
+      memcpy((uint8_t *)map + offset, data, size);
       pipe_buffer_flush_mapped_range(screen, buf, offset, size);
       pipe_buffer_unmap(screen, buf);
    }
@@ -141,7 +141,7 @@ pipe_buffer_read(struct pipe_screen *screen,
                  unsigned offset, unsigned size,
                  void *data)
 {
-   uint8_t *map;
+   void *map;
    
    assert(offset < buf->size);
    assert(offset + size <= buf->size);
@@ -150,11 +150,47 @@ pipe_buffer_read(struct pipe_screen *screen,
    map = pipe_buffer_map_range(screen, buf, offset, size, PIPE_BUFFER_USAGE_CPU_READ);
    assert(map);
    if(map) {
-      memcpy(data, map + offset, size);
+      memcpy(data, (const uint8_t *)map + offset, size);
       pipe_buffer_unmap(screen, buf);
    }
 }
 
+static INLINE void *
+pipe_transfer_map( struct pipe_transfer *transf )
+{  /* Map transf through the screen recorded in its texture; returns whatever transfer_map returns. */
+   struct pipe_screen *screen = transf->texture->screen;
+   return screen->transfer_map(screen, transf);
+}
+
+static INLINE void
+pipe_transfer_unmap( struct pipe_transfer *transf )
+{  /* Unmap transf through the screen recorded in its texture. */
+   struct pipe_screen *screen = transf->texture->screen;
+   screen->transfer_unmap(screen, transf);
+}
+
+static INLINE void
+pipe_transfer_destroy( struct pipe_transfer *transf )
+{  /* Destroy transf via its texture's screen; note the hook takes only the transfer, not the screen. */
+   struct pipe_screen *screen = transf->texture->screen;
+   screen->tex_transfer_destroy(transf);
+}
+
+static INLINE unsigned
+pipe_transfer_buffer_flags( struct pipe_transfer *transf )
+{  /* Translate the transfer's READ/WRITE usage into PIPE_BUFFER_USAGE_CPU_* flags. */
+   switch (transf->usage & PIPE_TRANSFER_READ_WRITE) { /* mask drops other bits, e.g. MAP_DIRECTLY */
+   case PIPE_TRANSFER_READ_WRITE:
+      return PIPE_BUFFER_USAGE_CPU_READ | PIPE_BUFFER_USAGE_CPU_WRITE;
+   case PIPE_TRANSFER_READ:
+      return PIPE_BUFFER_USAGE_CPU_READ;
+   case PIPE_TRANSFER_WRITE:
+      return PIPE_BUFFER_USAGE_CPU_WRITE;
+   default:  /* neither READ nor WRITE was set */
+      debug_assert(0);
+      return 0;
+   }
+}
 
 #ifdef __cplusplus
 }
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 3f30c52a169..f0a4de5df33 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -53,7 +53,10 @@ extern "C" {
 struct pipe_fence_handle;
 struct pipe_winsys;
 struct pipe_buffer;
-
+struct pipe_texture;
+struct pipe_surface;
+struct pipe_video_surface;
+struct pipe_transfer;
 
 
 /**
@@ -252,6 +255,17 @@ struct pipe_screen {
 
    void (*buffer_destroy)( struct pipe_buffer *buf );
 
+   /**
+    * Create a video surface suitable for use as a decoding target by the
+    * driver's pipe_video_context.
+    */
+   struct pipe_video_surface*
+   (*video_surface_create)( struct pipe_screen *screen,
+                            enum pipe_video_chroma_format chroma_format,
+                            unsigned width, unsigned height );
+
+   void (*video_surface_destroy)( struct pipe_video_surface *vsfc );
+
 
    /**
     * Do any special operations to ensure frontbuffer contents are
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 5fa6c9af30b..d4c8aadaf92 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -1,6 +1,7 @@
 /**************************************************************************
  * 
  * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 VMware, Inc.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,8 +26,8 @@
  * 
  **************************************************************************/
 
-#ifndef TGSI_TOKEN_H
-#define TGSI_TOKEN_H
+#ifndef P_SHADER_TOKENS_H
+#define P_SHADER_TOKENS_H
 
 #ifdef __cplusplus
 extern "C" {
@@ -79,6 +80,7 @@ enum tgsi_file_type {
    TGSI_FILE_ADDRESS     =6,
    TGSI_FILE_IMMEDIATE   =7,
    TGSI_FILE_LOOP        =8,
+   TGSI_FILE_PREDICATE   =9,
    TGSI_FILE_COUNT      /**< how many TGSI_FILE_ types */
 };
 
@@ -266,10 +268,7 @@ union tgsi_immediate_data
 #define TGSI_OPCODE_BGNSUB              100
 #define TGSI_OPCODE_ENDLOOP             101
 #define TGSI_OPCODE_ENDSUB              102
-#define TGSI_OPCODE_NOISE1              103
-#define TGSI_OPCODE_NOISE2              104
-#define TGSI_OPCODE_NOISE3              105
-#define TGSI_OPCODE_NOISE4              106
+                                /* gap */
 #define TGSI_OPCODE_NOP                 107
                                 /* gap */
 #define TGSI_OPCODE_NRM4                112
@@ -278,7 +277,7 @@ union tgsi_immediate_data
 #define TGSI_OPCODE_BREAKC              115
 #define TGSI_OPCODE_KIL                 116  /* conditional kill */
 #define TGSI_OPCODE_END                 117  /* aka HALT */
-#define TGSI_OPCODE_SWZ                 118
+                                /* gap */
 #define TGSI_OPCODE_LAST                119
 
 #define TGSI_SAT_NONE            0  /* do not saturate */
@@ -322,7 +321,6 @@ struct tgsi_instruction
  * instruction, including the instruction word.
  */
 
-#define TGSI_INSTRUCTION_EXT_TYPE_NV        0
 #define TGSI_INSTRUCTION_EXT_TYPE_LABEL     1
 #define TGSI_INSTRUCTION_EXT_TYPE_TEXTURE   2
 #define TGSI_INSTRUCTION_EXT_TYPE_PREDICATE 3
@@ -335,9 +333,6 @@ struct tgsi_instruction_ext
 };
 
 /*
- * If tgsi_instruction_ext::Type is TGSI_INSTRUCTION_EXT_TYPE_NV, it should
- * be cast to tgsi_instruction_ext_nv.
- * 
  * If tgsi_instruction_ext::Type is TGSI_INSTRUCTION_EXT_TYPE_LABEL, it
  * should be cast to tgsi_instruction_ext_label.
  * 
@@ -351,56 +346,11 @@ struct tgsi_instruction_ext
  * follows.
  */
 
-#define TGSI_PRECISION_DEFAULT      0
-#define TGSI_PRECISION_FLOAT32      1
-#define TGSI_PRECISION_FLOAT16      2
-#define TGSI_PRECISION_FIXED12      3
-
-#define TGSI_CC_GT      0
-#define TGSI_CC_EQ      1
-#define TGSI_CC_LT      2
-#define TGSI_CC_GE      3
-#define TGSI_CC_LE      4
-#define TGSI_CC_NE      5
-#define TGSI_CC_TR      6
-#define TGSI_CC_FL      7
-
 #define TGSI_SWIZZLE_X      0
 #define TGSI_SWIZZLE_Y      1
 #define TGSI_SWIZZLE_Z      2
 #define TGSI_SWIZZLE_W      3
 
-/**
- * Precision controls the precision at which the operation should be executed.
- *
- * CondDstUpdate enables condition code register writes. When this field is
- * TRUE, CondDstIndex specifies the index of the condition code register to
- * update.
- *
- * CondFlowEnable enables conditional execution of the operation. When this
- * field is TRUE, CondFlowIndex specifies the index of the condition code
- * register to test against CondMask with component swizzle controled by
- * CondSwizzleX, CondSwizzleY, CondSwizzleZ and CondSwizzleW. If the test fails,
- * the operation is not executed.
- */
-
-struct tgsi_instruction_ext_nv
-{
-   unsigned Type             : 4;    /* TGSI_INSTRUCTION_EXT_TYPE_NV */
-   unsigned Precision        : 4;    /* TGSI_PRECISION_ */
-   unsigned CondDstIndex     : 4;    /* UINT */
-   unsigned CondFlowIndex    : 4;    /* UINT */
-   unsigned CondMask         : 4;    /* TGSI_CC_ */
-   unsigned CondSwizzleX     : 2;    /* TGSI_SWIZZLE_ */
-   unsigned CondSwizzleY     : 2;    /* TGSI_SWIZZLE_ */
-   unsigned CondSwizzleZ     : 2;    /* TGSI_SWIZZLE_ */
-   unsigned CondSwizzleW     : 2;    /* TGSI_SWIZZLE_ */
-   unsigned CondDstUpdate    : 1;    /* BOOL */
-   unsigned CondFlowEnable   : 1;    /* BOOL */
-   unsigned Padding          : 1;
-   unsigned Extended         : 1;    /* BOOL */
-};
-
 struct tgsi_instruction_ext_label
 {
    unsigned Type     : 4;    /* TGSI_INSTRUCTION_EXT_TYPE_LABEL */
@@ -428,13 +378,21 @@ struct tgsi_instruction_ext_texture
    unsigned Extended : 1;    /* BOOL */
 };
 
+/*
+ * For SM3, the following constraint applies.
+ *   - Swizzle is either set to identity or replicate.
+ */
 struct tgsi_instruction_ext_predicate
 {
-   unsigned Type             : 4;    /* TGSI_INSTRUCTION_EXT_TYPE_PREDICATE */
-   unsigned PredDstIndex     : 4;    /* UINT */
-   unsigned PredWriteMask    : 4;    /* TGSI_WRITEMASK_ */
-   unsigned Padding          : 19;
-   unsigned Extended         : 1;    /* BOOL */
+   unsigned Type     : 4;  /* TGSI_INSTRUCTION_EXT_TYPE_PREDICATE */
+   unsigned SwizzleX : 2;  /* TGSI_SWIZZLE_x */
+   unsigned SwizzleY : 2;  /* TGSI_SWIZZLE_x */
+   unsigned SwizzleZ : 2;  /* TGSI_SWIZZLE_x */
+   unsigned SwizzleW : 2;  /* TGSI_SWIZZLE_x */
+   unsigned Negate   : 1;  /* BOOL */
+   unsigned SrcIndex : 8;  /* UINT */
+   unsigned Padding  : 10;
+   unsigned Extended : 1;  /* BOOL */
 };
 
 /**
@@ -478,7 +436,6 @@ struct tgsi_src_register
  * Then, if tgsi_src_register::Dimension is TRUE, tgsi_dimension follows.
  */
 
-#define TGSI_SRC_REGISTER_EXT_TYPE_SWZ      0
 #define TGSI_SRC_REGISTER_EXT_TYPE_MOD      1
 
 struct tgsi_src_register_ext
@@ -489,9 +446,6 @@ struct tgsi_src_register_ext
 };
 
 /**
- * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_SWZ,
- * it should be cast to tgsi_src_register_ext_swz.
- * 
  * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_MOD,
  * it should be cast to tgsi_src_register_ext_mod.
  * 
@@ -499,39 +453,6 @@ struct tgsi_src_register_ext
  * follows.
  */
 
-#define TGSI_EXTSWIZZLE_X       TGSI_SWIZZLE_X
-#define TGSI_EXTSWIZZLE_Y       TGSI_SWIZZLE_Y
-#define TGSI_EXTSWIZZLE_Z       TGSI_SWIZZLE_Z
-#define TGSI_EXTSWIZZLE_W       TGSI_SWIZZLE_W
-#define TGSI_EXTSWIZZLE_ZERO    4
-#define TGSI_EXTSWIZZLE_ONE     5
-
-/**
- * ExtSwizzleX, ExtSwizzleY, ExtSwizzleZ and ExtSwizzleW swizzle the source
- * register in an extended manner.
- *
- * NegateX, NegateY, NegateZ and NegateW negate individual components of the
- * source register.
- *
- * NOTE: To simplify matter, if this token is present, the corresponding Swizzle
- *       and Negate fields in tgsi_src_register should be set to X,Y,Z,W
- *       and FALSE, respectively.
- */
-
-struct tgsi_src_register_ext_swz
-{
-   unsigned Type         : 4;    /* TGSI_SRC_REGISTER_EXT_TYPE_SWZ */
-   unsigned ExtSwizzleX  : 4;    /* TGSI_EXTSWIZZLE_ */
-   unsigned ExtSwizzleY  : 4;    /* TGSI_EXTSWIZZLE_ */
-   unsigned ExtSwizzleZ  : 4;    /* TGSI_EXTSWIZZLE_ */
-   unsigned ExtSwizzleW  : 4;    /* TGSI_EXTSWIZZLE_ */
-   unsigned NegateX      : 1;    /* BOOL */
-   unsigned NegateY      : 1;    /* BOOL */
-   unsigned NegateZ      : 1;    /* BOOL */
-   unsigned NegateW      : 1;    /* BOOL */
-   unsigned Padding      : 7;
-   unsigned Extended     : 1;    /* BOOL */
-};
 
 /**
  * Extra src register modifiers
@@ -586,9 +507,7 @@ struct tgsi_dst_register
  * Then, if tgsi_dst_register::Indirect is TRUE, tgsi_src_register follows.
  */
 
-#define TGSI_DST_REGISTER_EXT_TYPE_CONDCODE     0
 #define TGSI_DST_REGISTER_EXT_TYPE_MODULATE     1
-#define TGSI_DST_REGISTER_EXT_TYPE_PREDICATE    2
 
 struct tgsi_dst_register_ext
 {
@@ -600,30 +519,12 @@ struct tgsi_dst_register_ext
 /**
  * Extra destination register modifiers
  *
- * If tgsi_dst_register_ext::Type is TGSI_DST_REGISTER_EXT_TYPE_CONDCODE,
- * it should be cast to tgsi_dst_register_ext_condcode.
- * 
  * If tgsi_dst_register_ext::Type is TGSI_DST_REGISTER_EXT_TYPE_MODULATE,
  * it should be cast to tgsi_dst_register_ext_modulate.
  * 
- * If tgsi_dst_register_ext::Type is TGSI_DST_REGISTER_EXT_TYPE_PREDICATE,
- * it should be cast to tgsi_dst_register_ext_predicate.
- * 
  * If tgsi_dst_register_ext::Extended is TRUE, another tgsi_dst_register_ext
  * follows.
  */
-struct tgsi_dst_register_ext_concode
-{
-   unsigned Type         : 4;    /* TGSI_DST_REGISTER_EXT_TYPE_CONDCODE */
-   unsigned CondMask     : 4;    /* TGSI_CC_ */
-   unsigned CondSwizzleX : 2;    /* TGSI_SWIZZLE_ */
-   unsigned CondSwizzleY : 2;    /* TGSI_SWIZZLE_ */
-   unsigned CondSwizzleZ : 2;    /* TGSI_SWIZZLE_ */
-   unsigned CondSwizzleW : 2;    /* TGSI_SWIZZLE_ */
-   unsigned CondSrcIndex : 4;    /* UINT */
-   unsigned Padding      : 11;
-   unsigned Extended     : 1;    /* BOOL */
-};
 
 #define TGSI_MODULATE_1X        0
 #define TGSI_MODULATE_2X        1
@@ -642,30 +543,8 @@ struct tgsi_dst_register_ext_modulate
    unsigned Extended : 1;    /* BOOL */
 };
 
-/*
- * Currently, the following constraints apply.
- *
- * - PredSwizzleXYZW is either set to identity or replicate.
- * - PredSrcIndex is 0.
- */
-
-struct tgsi_dst_register_ext_predicate
-{
-   unsigned Type         : 4;    /* TGSI_DST_REGISTER_EXT_TYPE_PREDICATE */
-   unsigned PredSwizzleX : 2;    /* TGSI_SWIZZLE_ */
-   unsigned PredSwizzleY : 2;    /* TGSI_SWIZZLE_ */
-   unsigned PredSwizzleZ : 2;    /* TGSI_SWIZZLE_ */
-   unsigned PredSwizzleW : 2;    /* TGSI_SWIZZLE_ */
-   unsigned PredSrcIndex : 4;    /* UINT */
-   unsigned Negate       : 1;    /* BOOL */
-   unsigned Padding      : 14;
-   unsigned Extended     : 1;    /* BOOL */
-};
-
-
 #ifdef __cplusplus
 }
 #endif
 
-#endif /* TGSI_TOKEN_H */
-
+#endif /* P_SHADER_TOKENS_H */
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 626bedb35a8..287b424e4ac 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -114,11 +114,29 @@ struct pipe_rasterizer_state
     * the vertex shader, clipping and viewport processing.  Note that
     * a vertex shader is still needed though, to indicate the mapping
     * from vertex elements to fragment shader input semantics.
+    *
+    * XXX: considered for removal.
     */
    unsigned bypass_vs_clip_and_viewport:1;
 
-   unsigned flatshade_first:1;   /**< take color attribute from the first vertex of a primitive */
-   unsigned gl_rasterization_rules:1; /**< enable tweaks for GL rasterization?  */
+   /** 
+    * Use the first vertex of a primitive as the provoking vertex for
+    * flat shading.
+    */
+   unsigned flatshade_first:1;   
+
+   /** 
+    * When true, triangle rasterization uses (0.5, 0.5) pixel centers
+    * for determining pixel ownership.
+    *
+    * When false, triangle rasterization uses (0,0) pixel centers for
+    * determining pixel ownership.
+    *
+    * Triangle rasterization always uses a 'top,left' rule for pixel
+    * ownership, this just alters which point we consider the pixel
+    * center for that test.
+    */
+   unsigned gl_rasterization_rules:1;
 
    float line_width;
    float point_size;           /**< used when no per-vertex size */
@@ -179,7 +197,6 @@ struct pipe_depth_state
    unsigned enabled:1;         /**< depth test enabled? */
    unsigned writemask:1;       /**< allow depth buffer writes? */
    unsigned func:3;            /**< depth test func (PIPE_FUNC_x) */
-   unsigned occlusion_count:1; /**< do occlusion counting? */
 };
 
 
@@ -307,7 +324,7 @@ struct pipe_transfer
    unsigned nblocksx;            /**< allocated width in blocks */
    unsigned nblocksy;            /**< allocated height in blocks */
    unsigned stride;              /**< stride in bytes between rows of blocks */
-   unsigned usage;               /**< PIPE_TRANSFER_*  */
+   enum pipe_transfer_usage usage; /**< PIPE_TRANSFER_*  */
 
    struct pipe_texture *texture; /**< texture to transfer to/from  */
    unsigned face;
diff --git a/src/gallium/include/pipe/p_video_context.h b/src/gallium/include/pipe/p_video_context.h
new file mode 100644
index 00000000000..6ae31418fa8
--- /dev/null
+++ b/src/gallium/include/pipe/p_video_context.h
@@ -0,0 +1,121 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef PIPE_VIDEO_CONTEXT_H
+#define PIPE_VIDEO_CONTEXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <pipe/p_video_state.h>
+
+struct pipe_screen;
+struct pipe_buffer;
+struct pipe_surface;
+struct pipe_video_surface;
+struct pipe_macroblock;
+struct pipe_picture_desc;
+struct pipe_fence_handle;
+
+/**
+ * Gallium video rendering context
+ */
+struct pipe_video_context
+{
+   struct pipe_screen *screen;
+   enum pipe_video_profile profile;
+   enum pipe_video_chroma_format chroma_format;
+   unsigned width;
+   unsigned height;
+
+   void *priv; /**< context private data (for DRI for example) */
+
+   void (*destroy)(struct pipe_video_context *vpipe);
+
+   /**
+    * Picture decoding and displaying
+    */
+   /*@{*/
+   void (*decode_bitstream)(struct pipe_video_context *vpipe,
+                            unsigned num_bufs,
+                            struct pipe_buffer **bitstream_buf);
+
+   void (*decode_macroblocks)(struct pipe_video_context *vpipe,
+                              struct pipe_video_surface *past,
+                              struct pipe_video_surface *future,
+                              unsigned num_macroblocks,
+                              struct pipe_macroblock *macroblocks,
+                              struct pipe_fence_handle **fence);
+
+   void (*clear_surface)(struct pipe_video_context *vpipe,
+                         unsigned x, unsigned y,
+                         unsigned width, unsigned height,
+                         unsigned value,
+                         struct pipe_surface *surface);
+
+   void (*render_picture)(struct pipe_video_context     *vpipe,
+                          /*struct pipe_surface         *background,
+                          struct pipe_video_rect        *background_area,*/
+                          struct pipe_video_surface     *src_surface,
+                          enum pipe_mpeg12_picture_type picture_type,
+                          /*unsigned                    num_past_surfaces,
+                          struct pipe_video_surface     *past_surfaces,
+                          unsigned                      num_future_surfaces,
+                          struct pipe_video_surface     *future_surfaces,*/
+                          struct pipe_video_rect        *src_area,
+                          struct pipe_surface           *dst_surface,
+                          struct pipe_video_rect        *dst_area,
+                          /*unsigned                      num_layers,
+                          struct pipe_texture           *layers,
+                          struct pipe_video_rect        *layer_src_areas,
+                          struct pipe_video_rect        *layer_dst_areas,*/
+                          struct pipe_fence_handle      **fence);
+   /*@}*/
+
+   /**
+    * Parameter-like states (or properties)
+    */
+   /*@{*/
+   void (*set_picture_desc)(struct pipe_video_context *vpipe,
+                            const struct pipe_picture_desc *desc);
+
+   void (*set_decode_target)(struct pipe_video_context *vpipe,
+                             struct pipe_video_surface *dt);
+
+   void (*set_csc_matrix)(struct pipe_video_context *vpipe, const float *mat);
+
+   /* TODO: Interface for scaling modes, post-processing, etc. */
+   /*@}*/
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PIPE_VIDEO_CONTEXT_H */
diff --git a/src/gallium/include/pipe/p_video_state.h b/src/gallium/include/pipe/p_video_state.h
new file mode 100644
index 00000000000..4da26d608cf
--- /dev/null
+++ b/src/gallium/include/pipe/p_video_state.h
@@ -0,0 +1,184 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef PIPE_VIDEO_STATE_H
+#define PIPE_VIDEO_STATE_H
+
+/* u_reduce_video_profile() needs these */
+#include <pipe/p_compiler.h>
+#include <util/u_debug.h>
+
+#include <pipe/p_defines.h>
+#include <pipe/p_format.h>
+#include <pipe/p_refcnt.h>
+#include <pipe/p_screen.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct pipe_video_surface
+{
+   struct pipe_reference reference; /* keep first: pipe_video_surface_reference() casts the surface pointer to a pipe_reference pointer */
+   struct pipe_screen *screen; /* supplies video_surface_destroy(), which pipe_video_surface_reference() calls on this surface */
+   enum pipe_video_chroma_format chroma_format;
+   /*enum pipe_video_surface_format surface_format;*/
+   unsigned width; /* presumably in pixels -- confirm */
+   unsigned height; /* presumably in pixels -- confirm */
+};
+
+static INLINE void /* re-points *ptr at surf via pipe_reference(); destroys the old surface when that returns true */
+pipe_video_surface_reference(struct pipe_video_surface **ptr, struct pipe_video_surface *surf)
+{
+   struct pipe_video_surface *old_surf = *ptr; /* saved first: pipe_reference() is handed ptr itself */
+   /* The sibling pipe_*_reference() helpers are called with NULL to release; never form &surf->reference from a null surf. */
+   if (pipe_reference((struct pipe_reference **)ptr, surf ? &surf->reference : NULL))
+      old_surf->screen->video_surface_destroy(old_surf);
+}
+
+struct pipe_video_rect
+{
+   unsigned x, y, w, h; /* presumably origin (x, y) and extent (w, h) of a rectangle -- confirm against users */
+};
+
+static INLINE enum pipe_video_codec /* codec that the given video profile is grouped under */
+u_reduce_video_profile(enum pipe_video_profile profile)
+{
+   enum pipe_video_codec codec = PIPE_VIDEO_CODEC_UNKNOWN; /* result for profiles not listed below */
+   switch (profile) {
+   case PIPE_VIDEO_PROFILE_MPEG1:
+   case PIPE_VIDEO_PROFILE_MPEG2_SIMPLE:
+   case PIPE_VIDEO_PROFILE_MPEG2_MAIN:
+      codec = PIPE_VIDEO_CODEC_MPEG12;
+      break;
+   case PIPE_VIDEO_PROFILE_MPEG4_SIMPLE:
+   case PIPE_VIDEO_PROFILE_MPEG4_ADVANCED_SIMPLE:
+      codec = PIPE_VIDEO_CODEC_MPEG4;
+      break;
+   case PIPE_VIDEO_PROFILE_VC1_SIMPLE:
+   case PIPE_VIDEO_PROFILE_VC1_MAIN:
+   case PIPE_VIDEO_PROFILE_VC1_ADVANCED:
+      codec = PIPE_VIDEO_CODEC_VC1;
+      break;
+   case PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE:
+   case PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN:
+   case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
+      codec = PIPE_VIDEO_CODEC_MPEG4_AVC;
+      break;
+   default:
+      assert(0); /* profile missing from this table; UNKNOWN is returned if execution continues */
+   }
+   return codec;
+}
+
+enum pipe_mpeg12_picture_type /* type of the picture_type argument of the render_picture() hook */
+{
+   PIPE_MPEG12_PICTURE_TYPE_FIELD_TOP,
+   PIPE_MPEG12_PICTURE_TYPE_FIELD_BOTTOM,
+   PIPE_MPEG12_PICTURE_TYPE_FRAME
+};
+
+enum pipe_mpeg12_macroblock_type /* value type of pipe_mpeg12_macroblock::mb_type */
+{
+   PIPE_MPEG12_MACROBLOCK_TYPE_INTRA,
+   PIPE_MPEG12_MACROBLOCK_TYPE_FWD,
+   PIPE_MPEG12_MACROBLOCK_TYPE_BKWD,
+   PIPE_MPEG12_MACROBLOCK_TYPE_BI,
+   /* Not a type: as the last enumerator it equals the number of types listed above. */
+   PIPE_MPEG12_MACROBLOCK_NUM_TYPES
+};
+
+enum pipe_mpeg12_motion_type /* value type of pipe_mpeg12_macroblock::mo_type */
+{
+   PIPE_MPEG12_MOTION_TYPE_FIELD,
+   PIPE_MPEG12_MOTION_TYPE_FRAME,
+   PIPE_MPEG12_MOTION_TYPE_DUALPRIME,
+   PIPE_MPEG12_MOTION_TYPE_16x8
+};
+
+enum pipe_mpeg12_dct_type /* value type of pipe_mpeg12_macroblock::dct_type */
+{
+   PIPE_MPEG12_DCT_TYPE_FIELD,
+   PIPE_MPEG12_DCT_TYPE_FRAME
+};
+
+struct pipe_macroblock /* common part; pipe_mpeg12_macroblock embeds it as its first member, "base" */
+{
+   enum pipe_video_codec codec; /* presumably identifies which codec-specific struct wraps this one -- confirm */
+};
+
+struct pipe_mpeg12_macroblock
+{
+   struct pipe_macroblock base; /* first member, so a pointer to this struct also addresses a pipe_macroblock */
+
+   unsigned mbx; /* presumably the macroblock's column index -- confirm */
+   unsigned mby; /* presumably the macroblock's row index -- confirm */
+   enum pipe_mpeg12_macroblock_type mb_type;
+   enum pipe_mpeg12_motion_type mo_type;
+   enum pipe_mpeg12_dct_type dct_type;
+   signed pmv[2][2][2]; /* presumably motion vectors; the meaning of the three indices is not visible here */
+   unsigned cbp; /* presumably the coded block pattern -- confirm */
+   void *blocks; /* untyped block data; layout and ownership are not visible here */
+};
+
+#if 0 /* compiled out: set_picture_desc() still names struct pipe_picture_desc, which this header therefore does not define */
+struct pipe_picture_desc
+{
+   enum pipe_video_format format;
+};
+
+struct pipe_mpeg12_picture_desc
+{
+   struct pipe_picture_desc base;
+
+   /* TODO: Use bitfields where possible? */
+   struct pipe_surface *forward_reference;
+   struct pipe_surface *backward_reference;
+   unsigned picture_coding_type;
+   unsigned fcode;
+   unsigned intra_dc_precision;
+   unsigned picture_structure;
+   unsigned top_field_first;
+   unsigned frame_pred_frame_dct;
+   unsigned concealment_motion_vectors;
+   unsigned q_scale_type;
+   unsigned intra_vlc_format;
+   unsigned alternate_scan;
+   unsigned full_pel_forward_vector;
+   unsigned full_pel_backward_vector;
+   struct pipe_buffer *intra_quantizer_matrix;
+   struct pipe_buffer *non_intra_quantizer_matrix;
+   struct pipe_buffer *chroma_intra_quantizer_matrix;
+   struct pipe_buffer *chroma_non_intra_quantizer_matrix;
+};
+#endif /* 0 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PIPE_VIDEO_STATE_H */
diff --git a/src/gallium/state_trackers/dri/dri_drawable.c b/src/gallium/state_trackers/dri/dri_drawable.c
index 5cec9e329d9..5625ff53cfd 100644
--- a/src/gallium/state_trackers/dri/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/dri_drawable.c
@@ -45,6 +45,7 @@
 #include "state_tracker/st_cb_fbo.h"
 
 #include "util/u_memory.h"
+#include "util/u_rect.h"
 
 static struct pipe_surface *
 dri_surface_from_handle(struct drm_api *api,
@@ -179,7 +180,6 @@ dri_get_buffers(__DRIdrawablePrivate * dPriv)
 
       switch (buffers[i].attachment) {
       case __DRI_BUFFER_FRONT_LEFT:
-	 continue;
       case __DRI_BUFFER_FAKE_FRONT_LEFT:
 	 index = ST_SURFACE_FRONT_LEFT;
 	 format = drawable->color_format;
@@ -214,6 +214,7 @@ dri_get_buffers(__DRIdrawablePrivate * dPriv)
 					dri_drawable->h, buffers[i].pitch);
 
       switch (buffers[i].attachment) {
+      case __DRI_BUFFER_FRONT_LEFT:
       case __DRI_BUFFER_FAKE_FRONT_LEFT:
       case __DRI_BUFFER_BACK_LEFT:
 	 drawable->color_format = surface->format;
@@ -223,6 +224,9 @@ dri_get_buffers(__DRIdrawablePrivate * dPriv)
       case __DRI_BUFFER_STENCIL:
 	 drawable->depth_stencil_format = surface->format;
 	 break;
+      case __DRI_BUFFER_ACCUM:
+      default:
+	 assert(0);
       }
 
       st_set_framebuffer_surface(drawable->stfb, index, surface);
@@ -250,6 +254,9 @@ void dri2_set_tex_buffer2(__DRIcontext *pDRICtx, GLint target,
    dri_get_buffers(drawable->dPriv);
    st_get_framebuffer_surface(drawable->stfb, ST_SURFACE_FRONT_LEFT, &ps);
 
+   if (!ps)
+      return;
+
    st_bind_texture_surface(ps, target == GL_TEXTURE_2D ? ST_TEXTURE_2D :
                            ST_TEXTURE_RECT, 0, drawable->color_format);
 }
@@ -360,8 +367,6 @@ dri_create_buffer(__DRIscreenPrivate * sPriv,
 
    if (visual->doubleBufferMode)
       drawable->attachments[i++] = __DRI_BUFFER_BACK_LEFT;
-   else
-      drawable->attachments[i++] = __DRI_BUFFER_FAKE_FRONT_LEFT;
    if (visual->depthBits && visual->stencilBits)
       drawable->attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
    else if (visual->depthBits)
@@ -537,12 +542,21 @@ dri1_swap_copy(struct dri_context *ctx,
    cur = dPriv->pClipRects;
 
    for (i = 0; i < dPriv->numClipRects; ++i) {
-      if (dri1_intersect_src_bbox(&clip, dPriv->x, dPriv->y, cur++, bbox))
-	 pipe->surface_copy(pipe, dst, clip.x1, clip.y1,
-			    src,
-			    (int)clip.x1 - dPriv->x,
-			    (int)clip.y1 - dPriv->y,
-			    clip.x2 - clip.x1, clip.y2 - clip.y1);
+      if (dri1_intersect_src_bbox(&clip, dPriv->x, dPriv->y, cur++, bbox)) {
+         if (pipe->surface_copy) {
+            pipe->surface_copy(pipe, dst, clip.x1, clip.y1,
+                               src,
+                               (int)clip.x1 - dPriv->x,
+                               (int)clip.y1 - dPriv->y,
+                               clip.x2 - clip.x1, clip.y2 - clip.y1);
+         } else {
+            util_surface_copy(pipe, FALSE, dst, clip.x1, clip.y1,
+                              src,
+                              (int)clip.x1 - dPriv->x,
+                              (int)clip.y1 - dPriv->y,
+                              clip.x2 - clip.x1, clip.y2 - clip.y1);
+         }
+      }
    }
 }
 
diff --git a/src/gallium/state_trackers/dri/dri_extensions.c b/src/gallium/state_trackers/dri/dri_extensions.c
index 4349a4d1d2f..8b014a2a8b8 100644
--- a/src/gallium/state_trackers/dri/dri_extensions.c
+++ b/src/gallium/state_trackers/dri/dri_extensions.c
@@ -37,8 +37,10 @@
 #define need_GL_ARB_multisample
 #define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
+#define need_GL_ARB_provoking_vertex
 #define need_GL_ARB_shader_objects
 #define need_GL_ARB_texture_compression
+#define need_GL_ARB_vertex_array_object
 #define need_GL_ARB_vertex_buffer_object
 #define need_GL_ARB_vertex_program
 #define need_GL_ARB_vertex_shader
@@ -51,22 +53,27 @@
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_framebuffer_object
 #define need_GL_EXT_multi_draw_arrays
+#define need_GL_EXT_provoking_vertex
 #define need_GL_EXT_secondary_color
+#define need_GL_EXT_stencil_two_side
+#define need_GL_APPLE_vertex_array_object
 #define need_GL_NV_vertex_program
 #define need_GL_VERSION_2_0
 #define need_GL_VERSION_2_1
-#include "extension_helper.h"
+#include "main/remap_helper.h"
+#include "utils.h"
 
 /**
  * Extension strings exported by the driver.
  */
-const struct dri_extension card_extensions[] = {
+static const struct dri_extension card_extensions[] = {
    {"GL_ARB_fragment_shader", NULL},
    {"GL_ARB_map_buffer_range", GL_ARB_map_buffer_range_functions},
    {"GL_ARB_multisample", GL_ARB_multisample_functions},
    {"GL_ARB_multitexture", NULL},
    {"GL_ARB_occlusion_query", GL_ARB_occlusion_query_functions},
    {"GL_ARB_pixel_buffer_object", NULL},
+   {"GL_ARB_provoking_vertex", GL_ARB_provoking_vertex_functions},
    {"GL_ARB_point_parameters", GL_ARB_point_parameters_functions},
    {"GL_ARB_shading_language_100", GL_VERSION_2_0_functions },
    {"GL_ARB_shading_language_120", GL_VERSION_2_1_functions },
@@ -78,7 +85,9 @@ const struct dri_extension card_extensions[] = {
    {"GL_ARB_texture_env_combine", NULL},
    {"GL_ARB_texture_env_dot3", NULL},
    {"GL_ARB_texture_mirrored_repeat", NULL},
+   {"GL_ARB_texture_non_power_of_two", NULL},
    {"GL_ARB_texture_rectangle", NULL},
+   {"GL_ARB_vertex_array_object", GL_ARB_vertex_array_object_functions},
    {"GL_ARB_vertex_buffer_object", GL_ARB_vertex_buffer_object_functions},
    {"GL_ARB_vertex_shader", GL_ARB_vertex_shader_functions},
    {"GL_ARB_vertex_program", GL_ARB_vertex_program_functions},
@@ -94,7 +103,9 @@ const struct dri_extension card_extensions[] = {
    {"GL_EXT_multi_draw_arrays", GL_EXT_multi_draw_arrays_functions},
    {"GL_EXT_packed_depth_stencil", NULL},
    {"GL_EXT_pixel_buffer_object", NULL},
+   {"GL_EXT_provoking_vertex", GL_EXT_provoking_vertex_functions},
    {"GL_EXT_secondary_color", GL_EXT_secondary_color_functions},
+   {"GL_EXT_stencil_two_side", GL_EXT_stencil_two_side_functions},
    {"GL_EXT_stencil_wrap", NULL},
    {"GL_EXT_texture_edge_clamp", NULL},
    {"GL_EXT_texture_env_combine", NULL},
@@ -103,6 +114,7 @@ const struct dri_extension card_extensions[] = {
    {"GL_EXT_texture_lod_bias", NULL},
    {"GL_3DFX_texture_compression_FXT1", NULL},
    {"GL_APPLE_client_storage", NULL},
+   {"GL_APPLE_vertex_array_object", GL_APPLE_vertex_array_object_functions},
    {"GL_MESA_pack_invert", NULL},
    {"GL_MESA_ycbcr_texture", NULL},
    {"GL_NV_blend_square", NULL},
@@ -119,10 +131,7 @@ dri_init_extensions(struct dri_context *ctx)
     * capabilities of the pipe_screen. This is actually something
     * that can/should be done inside st_create_context().
     */
-   if (ctx)
-      driInitExtensions(ctx->st->ctx, card_extensions, GL_TRUE);
-   else
-      driInitExtensions(NULL, card_extensions, GL_FALSE);
+   driInitExtensions(ctx->st->ctx, card_extensions, GL_TRUE);
 }
 
 /* vim: set sw=3 ts=8 sts=3 expandtab: */
diff --git a/src/gallium/state_trackers/dri/dri_screen.c b/src/gallium/state_trackers/dri/dri_screen.c
index 884b6d50111..cb864d45d51 100644
--- a/src/gallium/state_trackers/dri/dri_screen.c
+++ b/src/gallium/state_trackers/dri/dri_screen.c
@@ -226,8 +226,6 @@ dri_init_screen(__DRIscreenPrivate * sPriv)
    const __DRIconfig **configs;
    struct dri1_create_screen_arg arg;
 
-   dri_init_extensions(NULL);
-
    screen = CALLOC_STRUCT(dri_screen);
    if (!screen)
       return NULL;
@@ -292,9 +290,6 @@ dri_init_screen2(__DRIscreenPrivate * sPriv)
    struct dri_screen *screen;
    struct drm_create_screen_arg arg;
 
-   /* Set up dispatch table to cope with all known extensions */
-   dri_init_extensions(NULL);
-
    screen = CALLOC_STRUCT(dri_screen);
    if (!screen)
       goto fail;
diff --git a/src/gallium/state_trackers/egl/egl_context.c b/src/gallium/state_trackers/egl/egl_context.c
index c4f7361ca0b..fee186c6010 100644
--- a/src/gallium/state_trackers/egl/egl_context.c
+++ b/src/gallium/state_trackers/egl/egl_context.c
@@ -16,73 +16,6 @@
 
 #include "GL/internal/glcore.h"
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_point_parameters
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
-#define need_GL_ARB_vertex_program
-#define need_GL_ARB_window_pos
-#define need_GL_EXT_blend_color
-#define need_GL_EXT_blend_equation_separate
-#define need_GL_EXT_blend_func_separate
-#define need_GL_EXT_blend_minmax
-#define need_GL_EXT_cull_vertex
-#define need_GL_EXT_fog_coord
-#define need_GL_EXT_framebuffer_object
-#define need_GL_EXT_multi_draw_arrays
-#define need_GL_EXT_secondary_color
-#define need_GL_NV_vertex_program
-#include "extension_helper.h"
-
-/**
- * TODO HACK! FUGLY!
- * Copied for intel extentions.
- */
-const struct dri_extension card_extensions[] = {
-	{"GL_ARB_multisample", GL_ARB_multisample_functions},
-	{"GL_ARB_multitexture", NULL},
-	{"GL_ARB_point_parameters", GL_ARB_point_parameters_functions},
-	{"GL_ARB_texture_border_clamp", NULL},
-	{"GL_ARB_texture_compression", GL_ARB_texture_compression_functions},
-	{"GL_ARB_texture_cube_map", NULL},
-	{"GL_ARB_texture_env_add", NULL},
-	{"GL_ARB_texture_env_combine", NULL},
-	{"GL_ARB_texture_env_dot3", NULL},
-	{"GL_ARB_texture_mirrored_repeat", NULL},
-	{"GL_ARB_texture_rectangle", NULL},
-	{"GL_ARB_vertex_buffer_object", GL_ARB_vertex_buffer_object_functions},
-	{"GL_ARB_pixel_buffer_object", NULL},
-	{"GL_ARB_vertex_program", GL_ARB_vertex_program_functions},
-	{"GL_ARB_window_pos", GL_ARB_window_pos_functions},
-	{"GL_EXT_blend_color", GL_EXT_blend_color_functions},
-	{"GL_EXT_blend_equation_separate", GL_EXT_blend_equation_separate_functions},
-	{"GL_EXT_blend_func_separate", GL_EXT_blend_func_separate_functions},
-	{"GL_EXT_blend_minmax", GL_EXT_blend_minmax_functions},
-	{"GL_EXT_blend_subtract", NULL},
-	{"GL_EXT_cull_vertex", GL_EXT_cull_vertex_functions},
-	{"GL_EXT_fog_coord", GL_EXT_fog_coord_functions},
-	{"GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions},
-	{"GL_EXT_multi_draw_arrays", GL_EXT_multi_draw_arrays_functions},
-	{"GL_EXT_packed_depth_stencil", NULL},
-	{"GL_EXT_pixel_buffer_object", NULL},
-	{"GL_EXT_secondary_color", GL_EXT_secondary_color_functions},
-	{"GL_EXT_stencil_wrap", NULL},
-	{"GL_EXT_texture_edge_clamp", NULL},
-	{"GL_EXT_texture_env_combine", NULL},
-	{"GL_EXT_texture_env_dot3", NULL},
-	{"GL_EXT_texture_filter_anisotropic", NULL},
-	{"GL_EXT_texture_lod_bias", NULL},
-	{"GL_3DFX_texture_compression_FXT1", NULL},
-	{"GL_APPLE_client_storage", NULL},
-	{"GL_MESA_pack_invert", NULL},
-	{"GL_MESA_ycbcr_texture", NULL},
-	{"GL_NV_blend_square", NULL},
-	{"GL_NV_vertex_program", GL_NV_vertex_program_functions},
-	{"GL_NV_vertex_program1_1", NULL},
-	{"GL_SGIS_generate_mipmap", NULL },
-	{NULL, NULL}
-};
-
 _EGLContext *
 drm_create_context(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf, _EGLContext *share_list, const EGLint *attrib_list)
 {
@@ -138,7 +71,6 @@ drm_destroy_context(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *context)
 	struct drm_context *c = lookup_drm_context(context);
 	if (!_eglIsContextBound(&c->base)) {
 		st_destroy_context(c->st);
-		c->pipe->destroy(c->pipe);
 		free(c);
 	}
 	return EGL_TRUE;
@@ -160,18 +92,12 @@ drm_make_current(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *draw, _EGLSurfa
 		if (!drawSurf || !readSurf)
 			return EGL_FALSE;
 
-		drawSurf->user = ctx;
-		readSurf->user = ctx;
-
 		st_make_current(ctx->st, drawSurf->stfb, readSurf->stfb);
 
 		/* st_resize_framebuffer needs a bound context to work */
 		st_resize_framebuffer(drawSurf->stfb, drawSurf->w, drawSurf->h);
 		st_resize_framebuffer(readSurf->stfb, readSurf->w, readSurf->h);
 	} else {
-		drawSurf->user = NULL;
-		readSurf->user = NULL;
-
 		st_make_current(NULL, NULL, NULL);
 	}
 
diff --git a/src/gallium/state_trackers/egl/egl_surface.c b/src/gallium/state_trackers/egl/egl_surface.c
index 69e2d6b7081..91615abebee 100644
--- a/src/gallium/state_trackers/egl/egl_surface.c
+++ b/src/gallium/state_trackers/egl/egl_surface.c
@@ -12,6 +12,8 @@
 
 #include "state_tracker/drm_api.h"
 
+#include "util/u_rect.h"
+
 /*
  * Util functions
  */
@@ -33,34 +35,62 @@ drm_find_mode(drmModeConnectorPtr connector, _EGLMode *mode)
 }
 
 static struct st_framebuffer *
-drm_create_framebuffer(const __GLcontextModes *visual,
+drm_create_framebuffer(struct pipe_screen *screen,
+                       const __GLcontextModes *visual,
                        unsigned width,
                        unsigned height,
                        void *priv)
 {
-	enum pipe_format colorFormat, depthFormat, stencilFormat;
-
-	if (visual->redBits == 5)
-		colorFormat = PIPE_FORMAT_R5G6B5_UNORM;
-	else
-		colorFormat = PIPE_FORMAT_A8R8G8B8_UNORM;
-
-	if (visual->depthBits == 16)
-		depthFormat = PIPE_FORMAT_Z16_UNORM;
-	else if (visual->depthBits == 24)
-		depthFormat = PIPE_FORMAT_S8Z24_UNORM;
-	else
-		depthFormat = PIPE_FORMAT_NONE;
+	enum pipe_format color_format, depth_stencil_format;
+	boolean d_depth_bits_last;
+	boolean ds_depth_bits_last;
+
+	d_depth_bits_last =
+		screen->is_format_supported(screen, PIPE_FORMAT_X8Z24_UNORM,
+		                            PIPE_TEXTURE_2D,
+		                            PIPE_TEXTURE_USAGE_DEPTH_STENCIL, 0);
+	ds_depth_bits_last =
+		screen->is_format_supported(screen, PIPE_FORMAT_S8Z24_UNORM,
+		                            PIPE_TEXTURE_2D,
+		                            PIPE_TEXTURE_USAGE_DEPTH_STENCIL, 0);
+
+	if (visual->redBits == 8) {
+		if (visual->alphaBits == 8)
+			color_format = PIPE_FORMAT_A8R8G8B8_UNORM;
+		else
+			color_format = PIPE_FORMAT_X8R8G8B8_UNORM;
+	} else {
+		color_format = PIPE_FORMAT_R5G6B5_UNORM;
+	}
 
-	if (visual->stencilBits == 8)
-		stencilFormat = PIPE_FORMAT_S8Z24_UNORM;
-	else
-		stencilFormat = PIPE_FORMAT_NONE;
+	switch(visual->depthBits) {
+		default:
+		case 0:
+			depth_stencil_format = PIPE_FORMAT_NONE;
+			break;
+		case 16:
+			depth_stencil_format = PIPE_FORMAT_Z16_UNORM;
+			break;
+		case 24:
+			if (visual->stencilBits == 0) {
+				depth_stencil_format = (d_depth_bits_last) ?
+					PIPE_FORMAT_X8Z24_UNORM:
+					PIPE_FORMAT_Z24X8_UNORM;
+			} else {
+				depth_stencil_format = (ds_depth_bits_last) ?
+					PIPE_FORMAT_S8Z24_UNORM:
+					PIPE_FORMAT_Z24S8_UNORM;
+			}
+			break;
+		case 32:
+			depth_stencil_format = PIPE_FORMAT_Z32_UNORM;
+			break;
+	}
 
 	return st_create_framebuffer(visual,
-	                             colorFormat,
-	                             depthFormat,
-	                             stencilFormat,
+	                             color_format,
+	                             depth_stencil_format,
+	                             depth_stencil_format,
 	                             width,
 	                             height,
 	                             priv);
@@ -152,11 +182,13 @@ drm_takedown_shown_screen(_EGLDisplay *dpy, struct drm_screen *screen)
 
 	pipe_surface_reference(&screen->surface, NULL);
 	pipe_texture_reference(&screen->tex, NULL);
-	pipe_buffer_reference(&screen->buffer, NULL);
 
 	screen->shown = 0;
 }
 
+/**
+ * Called by libEGL's eglCreateWindowSurface().
+ */
 _EGLSurface *
 drm_create_window_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf, NativeWindowType window, const EGLint *attrib_list)
 {
@@ -164,6 +196,9 @@ drm_create_window_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf, N
 }
 
 
+/**
+ * Called by libEGL's eglCreatePixmapSurface().
+ */
 _EGLSurface *
 drm_create_pixmap_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf, NativePixmapType pixmap, const EGLint *attrib_list)
 {
@@ -171,10 +206,14 @@ drm_create_pixmap_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf, N
 }
 
 
+/**
+ * Called by libEGL's eglCreatePbufferSurface().
+ */
 _EGLSurface *
 drm_create_pbuffer_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf,
                            const EGLint *attrib_list)
 {
+	struct drm_device *dev = lookup_drm_device(dpy);
 	int i;
 	int width = -1;
 	int height = -1;
@@ -211,9 +250,8 @@ drm_create_pbuffer_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf,
 	surf->h = height;
 
 	visual = drm_visual_from_config(conf);
-	surf->stfb = drm_create_framebuffer(visual,
-	                                    width,
-	                                    height,
+	surf->stfb = drm_create_framebuffer(dev->screen, visual,
+	                                    width, height,
 	                                    (void*)surf);
 	drm_visual_modes_destroy(visual);
 
@@ -225,6 +263,9 @@ err:
 	return NULL;
 }
 
+/**
+ * Called by libEGL's eglCreateScreenSurfaceMESA().
+ */
 _EGLSurface *
 drm_create_screen_surface_mesa(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *cfg,
                                const EGLint *attrib_list)
@@ -234,6 +275,9 @@ drm_create_screen_surface_mesa(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *cf
 	return surf;
 }
 
+/**
+ * Called by libEGL's eglShowScreenSurfaceMESA().
+ */
 EGLBoolean
 drm_show_screen_surface_mesa(_EGLDriver *drv, _EGLDisplay *dpy,
                              _EGLScreen *screen,
@@ -250,8 +294,8 @@ drm_show_screen_surface_mesa(_EGLDriver *drv, _EGLDisplay *dpy,
 
 
 	drm_create_texture(dpy, scrn, mode->Width, mode->Height);
-	if (!scrn->buffer)
-		return EGL_FALSE;
+	if (!scrn->tex)
+		goto err_tex;
 
 	ret = drmModeAddFB(dev->drmFD,
 	                   scrn->front.width, scrn->front.height,
@@ -325,11 +369,14 @@ err_fb:
 err_bo:
 	pipe_surface_reference(&scrn->surface, NULL);
 	pipe_texture_reference(&scrn->tex, NULL);
-	pipe_buffer_reference(&scrn->buffer, NULL);
 
+err_tex:
 	return EGL_FALSE;
 }
 
+/**
+ * Called by libEGL's eglDestroySurface().
+ */
 EGLBoolean
 drm_destroy_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface)
 {
@@ -343,6 +390,9 @@ drm_destroy_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface)
 	return EGL_TRUE;
 }
 
+/**
+ * Called by libEGL's eglSwapBuffers().
+ */
 EGLBoolean
 drm_swap_buffers(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *draw)
 {
@@ -353,24 +403,30 @@ drm_swap_buffers(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *draw)
 	if (!surf)
 		return EGL_FALSE;
 
-	/* error checking */
-	if (!_eglSwapBuffers(drv, dpy, draw))
-		return EGL_FALSE;
-
 	st_get_framebuffer_surface(surf->stfb, ST_SURFACE_BACK_LEFT, &back_surf);
 
 	if (back_surf) {
+		struct drm_context *ctx = lookup_drm_context(draw->Binding);
 
 		st_notify_swapbuffers(surf->stfb);
 
-		if (surf->screen) {
-			surf->user->pipe->surface_copy(surf->user->pipe,
-				surf->screen->surface,
-				0, 0,
-				back_surf,
-				0, 0,
-				surf->w, surf->h);
-			surf->user->pipe->flush(surf->user->pipe, PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_TEXTURE_CACHE, NULL);
+		if (ctx && surf->screen) {
+            if (ctx->pipe->surface_copy) {
+                ctx->pipe->surface_copy(ctx->pipe,
+                    surf->screen->surface,
+                    0, 0,
+                    back_surf,
+                    0, 0,
+                    surf->w, surf->h);
+            } else {
+                util_surface_copy(ctx->pipe, FALSE,
+                    surf->screen->surface,
+                    0, 0,
+                    back_surf,
+                    0, 0,
+                    surf->w, surf->h);
+            }
+			ctx->pipe->flush(ctx->pipe, PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_TEXTURE_CACHE, NULL);
 
 #ifdef DRM_MODE_FEATURE_DIRTYFB
 			/* TODO query connector property to see if this is needed */
diff --git a/src/gallium/state_trackers/egl/egl_tracker.c b/src/gallium/state_trackers/egl/egl_tracker.c
index 4548b4fd27c..745803c7eb0 100644
--- a/src/gallium/state_trackers/egl/egl_tracker.c
+++ b/src/gallium/state_trackers/egl/egl_tracker.c
@@ -16,13 +16,14 @@
 
 /** HACK */
 void* driDriverAPI;
-extern const struct dri_extension card_extensions[];
 
 
 /*
  * Exported functions
  */
 
+/** Called by libEGL just prior to unloading/closing the driver.
+ */
 static void
 drm_unload(_EGLDriver *drv)
 {
@@ -32,6 +33,8 @@ drm_unload(_EGLDriver *drv)
 /**
  * The bootstrap function.  Return a new drm_driver object and
  * plug in API functions.
+ * libEGL finds this function with dlopen()/dlsym() and calls it from
+ * "load driver" function.
  */
 _EGLDriver *
 _eglMain(const char *args)
@@ -168,8 +171,7 @@ drm_initialize(_EGLDriver *drv, _EGLDisplay *disp, EGLint *major, EGLint *minor)
 		goto err_screen;
 	dev->winsys = dev->screen->winsys;
 
-	/* TODO HACK */
-	driInitExtensions(NULL, card_extensions, GL_FALSE);
+	driInitExtensions(NULL, NULL, GL_FALSE);
 
 	drm_update_res(dev);
 	res = dev->res;
diff --git a/src/gallium/state_trackers/egl/egl_tracker.h b/src/gallium/state_trackers/egl/egl_tracker.h
index dd4730f9579..73eb1a1226e 100644
--- a/src/gallium/state_trackers/egl/egl_tracker.h
+++ b/src/gallium/state_trackers/egl/egl_tracker.h
@@ -69,7 +69,6 @@ struct drm_surface
 	 * drm
 	 */
 
-	struct drm_context *user;
 	struct drm_screen *screen;
 
 	int w;
@@ -94,7 +93,6 @@ struct drm_screen
 	 * pipe
 	 */
 
-	struct pipe_buffer *buffer;
 	struct pipe_texture *tex;
 	struct pipe_surface *surface;
 
diff --git a/src/gallium/state_trackers/g3dvl/Makefile b/src/gallium/state_trackers/g3dvl/Makefile
deleted file mode 100644
index f9f4d6be3c3..00000000000
--- a/src/gallium/state_trackers/g3dvl/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-TARGET		= libg3dvl.a
-OBJECTS		= vl_display.o vl_screen.o vl_context.o vl_surface.o vl_shader_build.o vl_util.o vl_basic_csc.o	\
-		  vl_r16snorm_mc_buf.o
-GALLIUMDIR	= ../..
-
-CFLAGS		+= -g -Wall -Werror-implicit-function-declaration -fPIC	\
-		   -I${GALLIUMDIR}/include				\
-		   -I${GALLIUMDIR}/auxiliary				\
-		   -I${GALLIUMDIR}/winsys/g3dvl				\
-
-#############################################
-
-.PHONY	= all clean
-
-all: ${TARGET}
-
-${TARGET}: ${OBJECTS}
-	ar rcs $@ $^
-
-clean:
-	rm -rf ${OBJECTS} ${TARGET}
diff --git a/src/gallium/state_trackers/g3dvl/vl_basic_csc.c b/src/gallium/state_trackers/g3dvl/vl_basic_csc.c
deleted file mode 100644
index 20d682de3fb..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_basic_csc.c
+++ /dev/null
@@ -1,714 +0,0 @@
-#define VL_INTERNAL
-#include "vl_basic_csc.h"
-#include <assert.h>
-#include <pipe/p_context.h>
-#include <pipe/p_state.h>
-#include <pipe/p_inlines.h>
-#include <tgsi/tgsi_parse.h>
-#include <tgsi/tgsi_build.h>
-#include <util/u_memory.h>
-#include "vl_csc.h"
-#include "vl_surface.h"
-#include "vl_shader_build.h"
-#include "vl_types.h"
-
-struct vlVertexShaderConsts
-{
-	struct vlVertex4f	dst_scale;
-	struct vlVertex4f	dst_trans;
-	struct vlVertex4f	src_scale;
-	struct vlVertex4f	src_trans;
-};
-
-struct vlFragmentShaderConsts
-{
-	struct vlVertex4f	bias;
-	float			matrix[16];
-};
-
-struct vlBasicCSC
-{
-	struct vlCSC				base;
-
-	struct pipe_context			*pipe;
-	struct pipe_viewport_state		viewport;
-	struct pipe_framebuffer_state		framebuffer;
-	struct pipe_texture			*framebuffer_tex;
-	void					*sampler;
-	void					*vertex_shader, *fragment_shader;
-	struct pipe_vertex_buffer 		vertex_bufs[2];
-	struct pipe_vertex_element		vertex_elems[2];
-	struct pipe_constant_buffer		vs_const_buf, fs_const_buf;
-};
-
-static int vlResizeFrameBuffer
-(
-	struct vlCSC *csc,
-	unsigned int width,
-	unsigned int height
-)
-{
-	struct vlBasicCSC	*basic_csc;
-	struct pipe_context	*pipe;
-	struct pipe_texture	template;
-
-	assert(csc);
-
-	basic_csc = (struct vlBasicCSC*)csc;
-	pipe = basic_csc->pipe;
-
-	if (basic_csc->framebuffer.width == width && basic_csc->framebuffer.height == height)
-		return 0;
-
-	basic_csc->viewport.scale[0] = width;
-	basic_csc->viewport.scale[1] = height;
-	basic_csc->viewport.scale[2] = 1;
-	basic_csc->viewport.scale[3] = 1;
-	basic_csc->viewport.translate[0] = 0;
-	basic_csc->viewport.translate[1] = 0;
-	basic_csc->viewport.translate[2] = 0;
-	basic_csc->viewport.translate[3] = 0;
-	
-	if (basic_csc->framebuffer_tex)
-	{
-		pipe_surface_reference(&basic_csc->framebuffer.cbufs[0], NULL);
-		pipe_texture_reference(&basic_csc->framebuffer_tex, NULL);
-	}
-	
-	memset(&template, 0, sizeof(struct pipe_texture));
-	template.target = PIPE_TEXTURE_2D;
-	template.format = PIPE_FORMAT_A8R8G8B8_UNORM;
-	template.last_level = 0;
-	template.width[0] = width;
-	template.height[0] = height;
-	template.depth[0] = 1;
-	pf_get_block(template.format, &template.block);
-	template.tex_usage = PIPE_TEXTURE_USAGE_DISPLAY_TARGET;
-
-	basic_csc->framebuffer_tex = pipe->screen->texture_create(pipe->screen, &template);
-
-	basic_csc->framebuffer.width = width;
-	basic_csc->framebuffer.height = height;
-	basic_csc->framebuffer.cbufs[0] = pipe->screen->get_tex_surface
-	(
-		pipe->screen,
-		basic_csc->framebuffer_tex,
-		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
-	);
-
-	/* Clear to black, in case video doesn't fill the entire window */
-	pipe->set_framebuffer_state(pipe, &basic_csc->framebuffer);
-	pipe->clear(pipe, PIPE_CLEAR_COLOR, 0, 0.0f, 0);
-
-	return 0;
-}
-
-static int vlBegin
-(
-	struct vlCSC *csc
-)
-{
-	struct vlBasicCSC	*basic_csc;
-	struct pipe_context	*pipe;
-
-	assert(csc);
-
-	basic_csc = (struct vlBasicCSC*)csc;
-	pipe = basic_csc->pipe;
-
-	pipe->set_framebuffer_state(pipe, &basic_csc->framebuffer);
-	pipe->set_viewport_state(pipe, &basic_csc->viewport);
-	pipe->bind_sampler_states(pipe, 1, (void**)&basic_csc->sampler);
-	/* Source texture set in vlPutPictureCSC() */
-	pipe->bind_vs_state(pipe, basic_csc->vertex_shader);
-	pipe->bind_fs_state(pipe, basic_csc->fragment_shader);
-	pipe->set_vertex_buffers(pipe, 2, basic_csc->vertex_bufs);
-	pipe->set_vertex_elements(pipe, 2, basic_csc->vertex_elems);
-	pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &basic_csc->vs_const_buf);
-	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &basic_csc->fs_const_buf);
-
-	return 0;
-}
-
-static int vlPutPictureCSC
-(
-	struct vlCSC *csc,
-	struct vlSurface *surface,
-	int srcx,
-	int srcy,
-	int srcw,
-	int srch,
-	int destx,
-	int desty,
-	int destw,
-	int desth,
-	enum vlPictureType picture_type
-)
-{
-	struct vlBasicCSC		*basic_csc;
-	struct pipe_context		*pipe;
-	struct vlVertexShaderConsts	*vs_consts;
-
-	assert(csc);
-	assert(surface);
-
-	basic_csc = (struct vlBasicCSC*)csc;
-	pipe = basic_csc->pipe;
-
-	vs_consts = pipe_buffer_map
-	(
-		pipe->screen,
-		basic_csc->vs_const_buf.buffer,
-		PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
-	);
-
-	vs_consts->dst_scale.x = destw / (float)basic_csc->framebuffer.cbufs[0]->width;
-	vs_consts->dst_scale.y = desth / (float)basic_csc->framebuffer.cbufs[0]->height;
-	vs_consts->dst_scale.z = 1;
-	vs_consts->dst_scale.w = 1;
-	vs_consts->dst_trans.x = destx / (float)basic_csc->framebuffer.cbufs[0]->width;
-	vs_consts->dst_trans.y = desty / (float)basic_csc->framebuffer.cbufs[0]->height;
-	vs_consts->dst_trans.z = 0;
-	vs_consts->dst_trans.w = 0;
-
-	vs_consts->src_scale.x = srcw / (float)surface->texture->width[0];
-	vs_consts->src_scale.y = srch / (float)surface->texture->height[0];
-	vs_consts->src_scale.z = 1;
-	vs_consts->src_scale.w = 1;
-	vs_consts->src_trans.x = srcx / (float)surface->texture->width[0];
-	vs_consts->src_trans.y = srcy / (float)surface->texture->height[0];
-	vs_consts->src_trans.z = 0;
-	vs_consts->src_trans.w = 0;
-
-	pipe_buffer_unmap(pipe->screen, basic_csc->vs_const_buf.buffer);
-
-	pipe->set_sampler_textures(pipe, 1, &surface->texture);
-	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4);
-
-	return 0;
-}
-
-static int vlEnd
-(
-	struct vlCSC *csc
-)
-{
-	assert(csc);
-
-	return 0;
-}
-
-static struct pipe_surface* vlGetFrameBuffer
-(
-	struct vlCSC *csc
-)
-{
-	struct vlBasicCSC	*basic_csc;
-
-	assert(csc);
-
-	basic_csc = (struct vlBasicCSC*)csc;
-
-	return basic_csc->framebuffer.cbufs[0];
-}
-
-static int vlDestroy
-(
-	struct vlCSC *csc
-)
-{
-	struct vlBasicCSC	*basic_csc;
-	struct pipe_context	*pipe;
-	unsigned int		i;
-
-	assert(csc);
-
-	basic_csc = (struct vlBasicCSC*)csc;
-	pipe = basic_csc->pipe;
-
-	if (basic_csc->framebuffer_tex)
-	{
-		pipe_surface_reference(&basic_csc->framebuffer.cbufs[0], NULL);
-		pipe_texture_reference(&basic_csc->framebuffer_tex, NULL);
-	}
-
-	pipe->delete_sampler_state(pipe, basic_csc->sampler);
-	pipe->delete_vs_state(pipe, basic_csc->vertex_shader);
-	pipe->delete_fs_state(pipe, basic_csc->fragment_shader);
-
-	for (i = 0; i < 2; ++i)
-		pipe_buffer_reference(&basic_csc->vertex_bufs[i].buffer, NULL);
-
-	pipe_buffer_reference(&basic_csc->vs_const_buf.buffer, NULL);
-	pipe_buffer_reference(&basic_csc->fs_const_buf.buffer, NULL);
-
-	FREE(basic_csc);
-
-	return 0;
-}
-
-/*
- * Represents 2 triangles in a strip in normalized coords.
- * Used to render the surface onto the frame buffer.
- */
-static const struct vlVertex2f surface_verts[4] =
-{
-	{0.0f, 0.0f},
-	{0.0f, 1.0f},
-	{1.0f, 0.0f},
-	{1.0f, 1.0f}
-};
-
-/*
- * Represents texcoords for the above. We can use the position values directly.
- * TODO: Duplicate these in the shader, no need to create a buffer.
- */
-static const struct vlVertex2f *surface_texcoords = surface_verts;
-
-/*
- * Identity color conversion constants, for debugging
- */
-static const struct vlFragmentShaderConsts identity =
-{
-	{
-		0.0f, 0.0f, 0.0f, 0.0f
-	},
-	{
-		1.0f, 0.0f, 0.0f, 0.0f,
-		0.0f, 1.0f, 0.0f, 0.0f,
-		0.0f, 0.0f, 1.0f, 0.0f,
-		0.0f, 0.0f, 0.0f, 1.0f
-	}
-};
-
-/*
- * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [16,235]
- */
-static const struct vlFragmentShaderConsts bt_601 =
-{
-	{
-		0.0f,		0.501960784f,	0.501960784f,	0.0f
-	},
-	{
-		1.0f,		0.0f,		1.371f,		0.0f,
-		1.0f,		-0.336f,	-0.698f,	0.0f,
-		1.0f,		1.732f,		0.0f,		0.0f,
-		0.0f,		0.0f,		0.0f,		1.0f
-	}
-};
-
-/*
- * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [0,255]
- */
-static const struct vlFragmentShaderConsts bt_601_full =
-{
-	{
-		0.062745098f,	0.501960784f,	0.501960784f,	0.0f
-	},
-	{
-		1.164f,		0.0f,		1.596f,		0.0f,
-		1.164f,		-0.391f,	-0.813f,	0.0f,
-		1.164f,		2.018f,		0.0f,		0.0f,
-		0.0f,		0.0f,		0.0f,		1.0f
-	}
-};
-
-/*
- * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [16,235]
- */
-static const struct vlFragmentShaderConsts bt_709 =
-{
-	{
-		0.0f,		0.501960784f,	0.501960784f,	0.0f
-	},
-	{
-		1.0f,		0.0f,		1.540f,		0.0f,
-		1.0f,		-0.183f,	-0.459f,	0.0f,
-		1.0f,		1.816f,		0.0f,		0.0f,
-		0.0f,		0.0f,		0.0f,		1.0f
-	}
-};
-
-/*
- * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where:
- * Y is in [16,235], Cb and Cr are in [16,240]
- * R, G, and B are in [0,255]
- */
-const struct vlFragmentShaderConsts bt_709_full =
-{
-	{
-		0.062745098f,	0.501960784f,	0.501960784f,	0.0f
-	},
-	{
-		1.164f,		0.0f,		1.793f,		0.0f,
-		1.164f,		-0.213f,	-0.534f,	0.0f,
-		1.164f,		2.115f,		0.0f,		0.0f,
-		0.0f,		0.0f,		0.0f,		1.0f
-	}
-};
-
-static int vlCreateVertexShader
-(
-	struct vlBasicCSC *csc
-)
-{
-	const unsigned int		max_tokens = 50;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(csc);
-
-	pipe = csc->pipe;
-	tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Vertex texcoords
-	 */
-	for (i = 0; i < 2; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * decl c0		; Scaling vector to scale vertex pos rect to destination size
-	 * decl c1		; Translation vector to move vertex pos rect into position
-	 * decl c2		; Scaling vector to scale texcoord rect to source size
-	 * decl c3		; Translation vector to move texcoord rect into position
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Vertex texcoords
-	 */
-	for (i = 0; i < 2; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* decl t0, t1 */
-	decl = vl_decl_temps(0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * madd o0, i0, c0, c1	; Scale and translate unit output rect to destination size and pos
-	 * madd o1, i1, c2, c3	; Scale and translate unit texcoord rect to source size and pos
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_inst4(TGSI_OPCODE_MADD, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i, TGSI_FILE_CONSTANT, i * 2, TGSI_FILE_CONSTANT, i * 2 + 1);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	vs.tokens = tokens;
-	csc->vertex_shader = pipe->create_vs_state(pipe, &vs);
-	FREE(tokens);
-
-	return 0;
-}
-
-static int vlCreateFragmentShader
-(
-	struct vlBasicCSC *csc
-)
-{
-	const unsigned int		max_tokens = 50;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(csc);
-
-	pipe = csc->pipe;
-	tokens = (struct tgsi_token*)MALLOC(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/* decl i0		; Texcoords for s0 */
-	decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl c0		; Bias vector for CSC
-	 * decl c1-c4		; CSC matrix c1-c4
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0		; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl t0 */
-	decl = vl_decl_temps(0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl s0		; Sampler for tex containing picture to display */
-	decl = vl_decl_samplers(0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* tex2d t0, i0, s0	; Read src pixel */
-	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* sub t0, t0, c0	; Subtract bias vector from pixel */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * dp4 o0.x, t0, c1	; Multiply pixel by the color conversion matrix
-	 * dp4 o0.y, t0, c2
-	 * dp4 o0.z, t0, c3
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 1);
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	csc->fragment_shader = pipe->create_fs_state(pipe, &fs);
-	FREE(tokens);
-
-	return 0;
-}
-
-static int vlCreateDataBufs
-(
-	struct vlBasicCSC *csc
-)
-{
-	struct pipe_context *pipe;
-
-	assert(csc);
-
-	pipe = csc->pipe;
-
-	/*
-	 * Create our vertex buffer and vertex buffer element
-	 * VB contains 4 vertices that render a quad covering the entire window
-	 * to display a rendered surface
-	 * Quad is rendered as a tri strip
-	 */
-	csc->vertex_bufs[0].stride = sizeof(struct vlVertex2f);
-	csc->vertex_bufs[0].max_index = 3;
-	csc->vertex_bufs[0].buffer_offset = 0;
-	csc->vertex_bufs[0].buffer = pipe_buffer_create
-	(
-		pipe->screen,
-		1,
-		PIPE_BUFFER_USAGE_VERTEX,
-		sizeof(struct vlVertex2f) * 4
-	);
-
-	memcpy
-	(
-		pipe_buffer_map(pipe->screen, csc->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		surface_verts,
-		sizeof(struct vlVertex2f) * 4
-	);
-
-	pipe_buffer_unmap(pipe->screen, csc->vertex_bufs[0].buffer);
-
-	csc->vertex_elems[0].src_offset = 0;
-	csc->vertex_elems[0].vertex_buffer_index = 0;
-	csc->vertex_elems[0].nr_components = 2;
-	csc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/*
-	 * Create our texcoord buffer and texcoord buffer element
-	 * Texcoord buffer contains the TCs for mapping the rendered surface to the 4 vertices
-	 */
-	csc->vertex_bufs[1].stride = sizeof(struct vlVertex2f);
-	csc->vertex_bufs[1].max_index = 3;
-	csc->vertex_bufs[1].buffer_offset = 0;
-	csc->vertex_bufs[1].buffer = pipe_buffer_create
-	(
-		pipe->screen,
-		1,
-		PIPE_BUFFER_USAGE_VERTEX,
-		sizeof(struct vlVertex2f) * 4
-	);
-
-	memcpy
-	(
-		pipe_buffer_map(pipe->screen, csc->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		surface_texcoords,
-		sizeof(struct vlVertex2f) * 4
-	);
-
-	pipe_buffer_unmap(pipe->screen, csc->vertex_bufs[1].buffer);
-
-	csc->vertex_elems[1].src_offset = 0;
-	csc->vertex_elems[1].vertex_buffer_index = 1;
-	csc->vertex_elems[1].nr_components = 2;
-	csc->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/*
-	 * Create our vertex shader's constant buffer
-	 * Const buffer contains scaling and translation vectors
-	 */
-	csc->vs_const_buf.buffer = pipe_buffer_create
-	(
-		pipe->screen,
-		1,
-		PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
-		sizeof(struct vlVertexShaderConsts)
-	);
-
-	/*
-	 * Create our fragment shader's constant buffer
-	 * Const buffer contains the color conversion matrix and bias vectors
-	 */
-	csc->fs_const_buf.buffer = pipe_buffer_create
-	(
-		pipe->screen,
-		1,
-		PIPE_BUFFER_USAGE_CONSTANT,
-		sizeof(struct vlFragmentShaderConsts)
-	);
-
-	/*
-	 * TODO: Refactor this into a seperate function,
-	 * allow changing the CSC matrix at runtime to switch between regular & full versions
-	 */
-	memcpy
-	(
-		pipe_buffer_map(pipe->screen, csc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		&bt_601_full,
-		sizeof(struct vlFragmentShaderConsts)
-	);
-
-	pipe_buffer_unmap(pipe->screen, csc->fs_const_buf.buffer);
-
-	return 0;
-}
-
-static int vlInit
-(
-	struct vlBasicCSC *csc
-)
-{
-	struct pipe_context		*pipe;
-	struct pipe_sampler_state	sampler;
-
-	assert(csc);
-
-	pipe = csc->pipe;
-
-	/* Delay creating the FB until vlPutPictureCSC() so we know window size */
-	csc->framebuffer_tex = NULL;
-	csc->framebuffer.width = 0;
-	csc->framebuffer.height = 0;
-	csc->framebuffer.nr_cbufs = 1;
-	csc->framebuffer.cbufs[0] = NULL;
-	csc->framebuffer.zsbuf = NULL;
-
-	sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-	sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-	sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-	sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
-	sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-	sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
-	sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
-	sampler.compare_func = PIPE_FUNC_ALWAYS;
-	sampler.normalized_coords = 1;
-	/*sampler.prefilter = ;*/
-	/*sampler.lod_bias = ;*/
-	/*sampler.min_lod = ;*/
-	/*sampler.max_lod = ;*/
-	/*sampler.border_color[i] = ;*/
-	/*sampler.max_anisotropy = ;*/
-	csc->sampler = pipe->create_sampler_state(pipe, &sampler);
-
-	vlCreateVertexShader(csc);
-	vlCreateFragmentShader(csc);
-	vlCreateDataBufs(csc);
-
-	return 0;
-}
-
-int vlCreateBasicCSC
-(
-	struct pipe_context *pipe,
-	struct vlCSC **csc
-)
-{
-	struct vlBasicCSC *basic_csc;
-
-	assert(pipe);
-	assert(csc);
-
-	basic_csc = CALLOC_STRUCT(vlBasicCSC);
-
-	if (!basic_csc)
-		return 1;
-
-	basic_csc->base.vlResizeFrameBuffer = &vlResizeFrameBuffer;
-	basic_csc->base.vlBegin = &vlBegin;
-	basic_csc->base.vlPutPicture = &vlPutPictureCSC;
-	basic_csc->base.vlEnd = &vlEnd;
-	basic_csc->base.vlGetFrameBuffer = &vlGetFrameBuffer;
-	basic_csc->base.vlDestroy = &vlDestroy;
-	basic_csc->pipe = pipe;
-
-	vlInit(basic_csc);
-
-	*csc = &basic_csc->base;
-
-	return 0;
-}
diff --git a/src/gallium/state_trackers/g3dvl/vl_basic_csc.h b/src/gallium/state_trackers/g3dvl/vl_basic_csc.h
deleted file mode 100644
index 2e17f1d814a..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_basic_csc.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef vl_basic_csc_h
-#define vl_basic_csc_h
-
-struct pipe_context;
-struct vlCSC;
-
-int vlCreateBasicCSC
-(
-	struct pipe_context *pipe,
-	struct vlCSC **csc
-);
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_context.c b/src/gallium/state_trackers/g3dvl/vl_context.c
deleted file mode 100644
index 5cfd233c4c1..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_context.c
+++ /dev/null
@@ -1,205 +0,0 @@
-#define VL_INTERNAL
-#include "vl_context.h"
-#include <assert.h>
-#include <pipe/p_context.h>
-#include <pipe/p_state.h>
-#include <util/u_memory.h>
-#include "vl_render.h"
-#include "vl_r16snorm_mc_buf.h"
-#include "vl_csc.h"
-#include "vl_basic_csc.h"
-
-static int vlInitCommon(struct vlContext *context)
-{
-	struct pipe_context			*pipe;
-	struct pipe_rasterizer_state		rast;
-	struct pipe_blend_state			blend;
-	struct pipe_depth_stencil_alpha_state	dsa;
-	unsigned int				i;
-
-	assert(context);
-
-	pipe = context->pipe;
-
-	rast.flatshade = 1;
-	rast.flatshade_first = 0;
-	rast.light_twoside = 0;
-	rast.front_winding = PIPE_WINDING_CCW;
-	rast.cull_mode = PIPE_WINDING_CW;
-	rast.fill_cw = PIPE_POLYGON_MODE_FILL;
-	rast.fill_ccw = PIPE_POLYGON_MODE_FILL;
-	rast.offset_cw = 0;
-	rast.offset_ccw = 0;
-	rast.scissor = 0;
-	rast.poly_smooth = 0;
-	rast.poly_stipple_enable = 0;
-	rast.point_sprite = 0;
-	rast.point_size_per_vertex = 0;
-	rast.multisample = 0;
-	rast.line_smooth = 0;
-	rast.line_stipple_enable = 0;
-	rast.line_stipple_factor = 0;
-	rast.line_stipple_pattern = 0;
-	rast.line_last_pixel = 0;
-	rast.bypass_vs_clip_and_viewport = 0;
-	rast.line_width = 1;
-	rast.point_smooth = 0;
-	rast.point_size = 1;
-	rast.offset_units = 1;
-	rast.offset_scale = 1;
-	/*rast.sprite_coord_mode[i] = ;*/
-	context->raster = pipe->create_rasterizer_state(pipe, &rast);
-	pipe->bind_rasterizer_state(pipe, context->raster);
-
-	blend.blend_enable = 0;
-	blend.rgb_func = PIPE_BLEND_ADD;
-	blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE;
-	blend.rgb_dst_factor = PIPE_BLENDFACTOR_ONE;
-	blend.alpha_func = PIPE_BLEND_ADD;
-	blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE;
-	blend.alpha_dst_factor = PIPE_BLENDFACTOR_ONE;
-	blend.logicop_enable = 0;
-	blend.logicop_func = PIPE_LOGICOP_CLEAR;
-	/* Needed to allow color writes to FB, even if blending disabled */
-	blend.colormask = PIPE_MASK_RGBA;
-	blend.dither = 0;
-	context->blend = pipe->create_blend_state(pipe, &blend);
-	pipe->bind_blend_state(pipe, context->blend);
-
-	dsa.depth.enabled = 0;
-	dsa.depth.writemask = 0;
-	dsa.depth.func = PIPE_FUNC_ALWAYS;
-	dsa.depth.occlusion_count = 0;
-	for (i = 0; i < 2; ++i)
-	{
-		dsa.stencil[i].enabled = 0;
-		dsa.stencil[i].func = PIPE_FUNC_ALWAYS;
-		dsa.stencil[i].fail_op = PIPE_STENCIL_OP_KEEP;
-		dsa.stencil[i].zpass_op = PIPE_STENCIL_OP_KEEP;
-		dsa.stencil[i].zfail_op = PIPE_STENCIL_OP_KEEP;
-		dsa.stencil[i].ref_value = 0;
-		dsa.stencil[i].valuemask = 0;
-		dsa.stencil[i].writemask = 0;
-	}
-	dsa.alpha.enabled = 0;
-	dsa.alpha.func = PIPE_FUNC_ALWAYS;
-	dsa.alpha.ref_value = 0;
-	context->dsa = pipe->create_depth_stencil_alpha_state(pipe, &dsa);
-	pipe->bind_depth_stencil_alpha_state(pipe, context->dsa);
-
-	return 0;
-}
-
-int vlCreateContext
-(
-	struct vlScreen *screen,
-	struct pipe_context *pipe,
-	unsigned int picture_width,
-	unsigned int picture_height,
-	enum vlFormat picture_format,
-	enum vlProfile profile,
-	enum vlEntryPoint entry_point,
-	struct vlContext **context
-)
-{
-	struct vlContext *ctx;
-
-	assert(screen);
-	assert(context);
-	assert(pipe);
-
-	ctx = CALLOC_STRUCT(vlContext);
-
-	if (!ctx)
-		return 1;
-
-	ctx->screen = screen;
-	ctx->pipe = pipe;
-	ctx->picture_width = picture_width;
-	ctx->picture_height = picture_height;
-	ctx->picture_format = picture_format;
-	ctx->profile = profile;
-	ctx->entry_point = entry_point;
-
-	vlInitCommon(ctx);
-
-	vlCreateR16SNormBufferedMC(pipe, picture_width, picture_height, picture_format, &ctx->render);
-	vlCreateBasicCSC(pipe, &ctx->csc);
-
-	*context = ctx;
-
-	return 0;
-}
-
-int vlDestroyContext
-(
-	struct vlContext *context
-)
-{
-	assert(context);
-
-	/* XXX: Must unbind shaders before we can delete them for some reason */
-	context->pipe->bind_vs_state(context->pipe, NULL);
-	context->pipe->bind_fs_state(context->pipe, NULL);
-
-	context->render->vlDestroy(context->render);
-	context->csc->vlDestroy(context->csc);
-
-	context->pipe->delete_blend_state(context->pipe, context->blend);
-	context->pipe->delete_rasterizer_state(context->pipe, context->raster);
-	context->pipe->delete_depth_stencil_alpha_state(context->pipe, context->dsa);
-
-	FREE(context);
-
-	return 0;
-}
-
-struct vlScreen* vlContextGetScreen
-(
-	struct vlContext *context
-)
-{
-	assert(context);
-
-	return context->screen;
-}
-
-struct pipe_context* vlGetPipeContext
-(
-	struct vlContext *context
-)
-{
-	assert(context);
-
-	return context->pipe;
-}
-
-unsigned int vlGetPictureWidth
-(
-	struct vlContext *context
-)
-{
-	assert(context);
-
-	return context->picture_width;
-}
-
-unsigned int vlGetPictureHeight
-(
-	struct vlContext *context
-)
-{
-	assert(context);
-
-	return context->picture_height;
-}
-
-enum vlFormat vlGetPictureFormat
-(
-	struct vlContext *context
-)
-{
-	assert(context);
-
-	return context->picture_format;
-}
diff --git a/src/gallium/state_trackers/g3dvl/vl_context.h b/src/gallium/state_trackers/g3dvl/vl_context.h
deleted file mode 100644
index 3d14634c44e..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_context.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef vl_context_h
-#define vl_context_h
-
-#include "vl_types.h"
-
-struct pipe_context;
-
-#ifdef VL_INTERNAL
-struct vlRender;
-struct vlCSC;
-
-struct vlContext
-{
-	struct vlScreen		*screen;
-	struct pipe_context	*pipe;
-	unsigned int		picture_width;
-	unsigned int		picture_height;
-	enum vlFormat		picture_format;
-	enum vlProfile		profile;
-	enum vlEntryPoint	entry_point;
-
-	void			*raster;
-	void			*dsa;
-	void			*blend;
-
-	struct vlRender		*render;
-	struct vlCSC		*csc;
-};
-#endif
-
-int vlCreateContext
-(
-	struct vlScreen *screen,
-	struct pipe_context *pipe,
-	unsigned int picture_width,
-	unsigned int picture_height,
-	enum vlFormat picture_format,
-	enum vlProfile profile,
-	enum vlEntryPoint entry_point,
-	struct vlContext **context
-);
-
-int vlDestroyContext
-(
-	struct vlContext *context
-);
-
-struct vlScreen* vlContextGetScreen
-(
-	struct vlContext *context
-);
-
-struct pipe_context* vlGetPipeContext
-(
-	struct vlContext *context
-);
-
-unsigned int vlGetPictureWidth
-(
-	struct vlContext *context
-);
-
-unsigned int vlGetPictureHeight
-(
-	struct vlContext *context
-);
-
-enum vlFormat vlGetPictureFormat
-(
-	struct vlContext *context
-);
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_csc.h b/src/gallium/state_trackers/g3dvl/vl_csc.h
deleted file mode 100644
index 36417a27929..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_csc.h
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef vl_csc_h
-#define vl_csc_h
-
-#include "vl_types.h"
-
-struct pipe_surface;
-
-struct vlCSC
-{
-	int (*vlResizeFrameBuffer)
-	(
-		struct vlCSC *csc,
-		unsigned int width,
-		unsigned int height
-	);
-
-	int (*vlBegin)
-	(
-		struct vlCSC *csc
-	);
-
-	int (*vlPutPicture)
-	(
-		struct vlCSC *csc,
-		struct vlSurface *surface,
-		int srcx,
-		int srcy,
-		int srcw,
-		int srch,
-		int destx,
-		int desty,
-		int destw,
-		int desth,
-		enum vlPictureType picture_type
-	);
-
-	int (*vlEnd)
-	(
-		struct vlCSC *csc
-	);
-
-	struct pipe_surface* (*vlGetFrameBuffer)
-	(
-		struct vlCSC *csc
-	);
-
-	int (*vlDestroy)
-	(
-		struct vlCSC *csc
-	);
-};
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_defs.h b/src/gallium/state_trackers/g3dvl/vl_defs.h
deleted file mode 100644
index d612d02502f..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_defs.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef vl_defs_h
-#define vl_defs_h
-
-#define VL_BLOCK_WIDTH		8
-#define VL_BLOCK_HEIGHT		8
-#define VL_BLOCK_SIZE		(VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT)
-#define VL_MACROBLOCK_WIDTH	16
-#define VL_MACROBLOCK_HEIGHT	16
-#define VL_MACROBLOCK_SIZE	(VL_MACROBLOCK_WIDTH * VL_MACROBLOCK_HEIGHT)
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_display.c b/src/gallium/state_trackers/g3dvl/vl_display.c
deleted file mode 100644
index dce06de7583..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_display.c
+++ /dev/null
@@ -1,48 +0,0 @@
-#define VL_INTERNAL
-#include "vl_display.h"
-#include <assert.h>
-#include <util/u_memory.h>
-
-int vlCreateDisplay
-(
-	vlNativeDisplay native_display,
-	struct vlDisplay **display
-)
-{
-	struct vlDisplay *dpy;
-
-	assert(native_display);
-	assert(display);
-
-	dpy = CALLOC_STRUCT(vlDisplay);
-
-	if (!dpy)
-		return 1;
-
-	dpy->native = native_display;
-	*display = dpy;
-
-	return 0;
-}
-
-int vlDestroyDisplay
-(
-	struct vlDisplay *display
-)
-{
-	assert(display);
-
-	FREE(display);
-
-	return 0;
-}
-
-vlNativeDisplay vlGetNativeDisplay
-(
-	struct vlDisplay *display
-)
-{
-	assert(display);
-
-	return display->native;
-}
diff --git a/src/gallium/state_trackers/g3dvl/vl_display.h b/src/gallium/state_trackers/g3dvl/vl_display.h
deleted file mode 100644
index e11fd407993..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_display.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef vl_display_h
-#define vl_display_h
-
-#include "vl_types.h"
-
-#ifdef VL_INTERNAL
-struct vlDisplay
-{
-	vlNativeDisplay native;
-};
-#endif
-
-int vlCreateDisplay
-(
-	vlNativeDisplay native_display,
-	struct vlDisplay **display
-);
-
-int vlDestroyDisplay
-(
-	struct vlDisplay *display
-);
-
-vlNativeDisplay vlGetNativeDisplay
-(
-	struct vlDisplay *display
-);
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c
deleted file mode 100644
index 23631adb693..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c
+++ /dev/null
@@ -1,1155 +0,0 @@
-#define VL_INTERNAL
-#include "vl_r16snorm_mc_buf.h"
-#include <assert.h>
-#include <pipe/p_context.h>
-#include <pipe/p_screen.h>
-#include <pipe/p_state.h>
-#include <pipe/p_inlines.h>
-#include <tgsi/tgsi_parse.h>
-#include <tgsi/tgsi_build.h>
-#include <util/u_math.h>
-#include <util/u_memory.h>
-#include "vl_render.h"
-#include "vl_shader_build.h"
-#include "vl_surface.h"
-#include "vl_util.h"
-#include "vl_types.h"
-#include "vl_defs.h"
-
-const unsigned int DEFAULT_BUF_ALIGNMENT = 1;
-
-enum vlMacroBlockTypeEx
-{
-	vlMacroBlockExTypeIntra,
-	vlMacroBlockExTypeFwdPredictedFrame,
-	vlMacroBlockExTypeFwdPredictedField,
-	vlMacroBlockExTypeBkwdPredictedFrame,
-	vlMacroBlockExTypeBkwdPredictedField,
-	vlMacroBlockExTypeBiPredictedFrame,
-	vlMacroBlockExTypeBiPredictedField,
-
-	vlNumMacroBlockExTypes
-};
-
-struct vlVertexShaderConsts
-{
-	struct vlVertex4f denorm;
-};
-
-struct vlFragmentShaderConsts
-{
-	struct vlVertex4f multiplier;
-	struct vlVertex4f div;
-};
-
-struct vlMacroBlockVertexStream0
-{
-	struct vlVertex2f pos;
-	struct vlVertex2f luma_tc;
-	struct vlVertex2f cb_tc;
-	struct vlVertex2f cr_tc;
-};
-
-struct vlR16SnormBufferedMC
-{
-	struct vlRender				base;
-
-	unsigned int				picture_width;
-	unsigned int				picture_height;
-	enum vlFormat				picture_format;
-	unsigned int				macroblocks_per_picture;
-
-	struct vlSurface			*buffered_surface;
-	struct vlSurface			*past_surface;
-	struct vlSurface			*future_surface;
-	struct vlVertex2f			surface_tex_inv_size;
-	struct vlVertex2f			zero_block[3];
-	unsigned int				num_macroblocks;
-	struct vlMpeg2MacroBlock		*macroblocks;
-	struct pipe_transfer			*tex_transfer[3];
-	short					*texels[3];
-
-	struct pipe_context			*pipe;
-	struct pipe_viewport_state		viewport;
-	struct pipe_framebuffer_state		render_target;
-
-	union
-	{
-		void					*all[5];
-		struct
-		{
-			void				*y;
-			void				*cb;
-			void				*cr;
-			void				*ref[2];
-		};
-	} samplers;
-
-	union
-	{
-		struct pipe_texture			*all[5];
-		struct
-		{
-			struct pipe_texture		*y;
-			struct pipe_texture		*cb;
-			struct pipe_texture		*cr;
-			struct pipe_texture		*ref[2];
-		};
-	} textures;
-
-	union
-	{
-		struct pipe_vertex_buffer 		all[3];
-		struct
-		{
-			struct pipe_vertex_buffer	ycbcr;
-			struct pipe_vertex_buffer	ref[2];
-		};
-	} vertex_bufs;
-
-	void					*i_vs, *p_vs[2], *b_vs[2];
-	void					*i_fs, *p_fs[2], *b_fs[2];
-	struct pipe_vertex_element		vertex_elems[8];
-	struct pipe_constant_buffer		vs_const_buf;
-	struct pipe_constant_buffer		fs_const_buf;
-};
-
-static inline int vlBegin
-(
-	struct vlRender *render
-)
-{
-	assert(render);
-
-	return 0;
-}
-
-static inline int vlGrabFrameCodedBlock(short *src, short *dst, unsigned int dst_pitch)
-{
-	unsigned int y;
-
-	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
-		memcpy
-		(
-			dst + y * dst_pitch,
-			src + y * VL_BLOCK_WIDTH,
-			VL_BLOCK_WIDTH * 2
-		);
-
-	return 0;
-}
-
-static inline int vlGrabFieldCodedBlock(short *src, short *dst, unsigned int dst_pitch)
-{
-	unsigned int y;
-
-	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
-		memcpy
-		(
-			dst + y * dst_pitch * 2,
-			src + y * VL_BLOCK_WIDTH,
-			VL_BLOCK_WIDTH * 2
-		);
-
-	return 0;
-}
-
-static inline int vlGrabNoBlock(short *dst, unsigned int dst_pitch)
-{
-	unsigned int y;
-
-	for (y = 0; y < VL_BLOCK_HEIGHT; ++y)
-		memset
-		(
-			dst + y * dst_pitch,
-			0,
-			VL_BLOCK_WIDTH * 2
-		);
-
-	return 0;
-}
-
-static inline int vlGrabBlocks
-(
-	struct vlR16SnormBufferedMC *mc,
-	unsigned int mbx,
-	unsigned int mby,
-	enum vlDCTType dct_type,
-	unsigned int coded_block_pattern,
-	short *blocks
-)
-{
-	short			*texels;
-	unsigned int		tex_pitch;
-	unsigned int		x, y, tb = 0, sb = 0;
-	unsigned int		mbpx = mbx * VL_MACROBLOCK_WIDTH, mbpy = mby * VL_MACROBLOCK_HEIGHT;
-
-	assert(mc);
-	assert(blocks);
-
-	tex_pitch = mc->tex_transfer[0]->stride / mc->tex_transfer[0]->block.size;
-	texels = mc->texels[0] + mbpy * tex_pitch + mbpx;
-
-	for (y = 0; y < 2; ++y)
-	{
-		for (x = 0; x < 2; ++x, ++tb)
-		{
-			if ((coded_block_pattern >> (5 - tb)) & 1)
-			{
-				short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
-
-				if (dct_type == vlDCTTypeFrameCoded)
-				{
-					vlGrabFrameCodedBlock
-					(
-						cur_block,
-						texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH,
-						tex_pitch
-					);
-				}
-				else
-				{
-					vlGrabFieldCodedBlock
-					(
-						cur_block,
-						texels + y * tex_pitch + x * VL_BLOCK_WIDTH,
-						tex_pitch
-					);
-				}
-
-				++sb;
-			}
-			else if (mc->zero_block[0].x < 0.0f)
-			{
-				vlGrabNoBlock(texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH, tex_pitch);
-
-				mc->zero_block[0].x = (mbpx + x * 8) * mc->surface_tex_inv_size.x;
-				mc->zero_block[0].y = (mbpy + y * 8) * mc->surface_tex_inv_size.y;
-			}
-		}
-	}
-
-	/* TODO: Implement 422, 444 */
-	mbpx >>= 1;
-	mbpy >>= 1;
-
-	for (tb = 0; tb < 2; ++tb)
-	{
-		tex_pitch = mc->tex_transfer[tb + 1]->stride / mc->tex_transfer[tb + 1]->block.size;
-		texels = mc->texels[tb + 1] + mbpy * tex_pitch + mbpx;
-
-		if ((coded_block_pattern >> (1 - tb)) & 1)
-		{
-			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
-
-			vlGrabFrameCodedBlock
-			(
-				cur_block,
-				texels,
-				tex_pitch
-			);
-
-			++sb;
-		}
-		else if (mc->zero_block[tb + 1].x < 0.0f)
-		{
-			vlGrabNoBlock(texels, tex_pitch);
-
-			mc->zero_block[tb + 1].x = (mbpx << 1) * mc->surface_tex_inv_size.x;
-			mc->zero_block[tb + 1].y = (mbpy << 1) * mc->surface_tex_inv_size.y;
-		}
-	}
-
-	return 0;
-}
-
-static inline enum vlMacroBlockTypeEx vlGetMacroBlockTypeEx(struct vlMpeg2MacroBlock *mb)
-{
-	assert(mb);
-
-	switch (mb->mb_type)
-	{
-		case vlMacroBlockTypeIntra:
-			return vlMacroBlockExTypeIntra;
-		case vlMacroBlockTypeFwdPredicted:
-			return mb->mo_type == vlMotionTypeFrame ?
-				vlMacroBlockExTypeFwdPredictedFrame : vlMacroBlockExTypeFwdPredictedField;
-		case vlMacroBlockTypeBkwdPredicted:
-			return mb->mo_type == vlMotionTypeFrame ?
-				vlMacroBlockExTypeBkwdPredictedFrame : vlMacroBlockExTypeBkwdPredictedField;
-		case vlMacroBlockTypeBiPredicted:
-			return mb->mo_type == vlMotionTypeFrame ?
-				vlMacroBlockExTypeBiPredictedFrame : vlMacroBlockExTypeBiPredictedField;
-		default:
-			assert(0);
-	}
-
-	/* Unreachable */
-	return -1;
-}
-
-static inline int vlGrabMacroBlock
-(
-	struct vlR16SnormBufferedMC *mc,
-	struct vlMpeg2MacroBlock *macroblock
-)
-{
-	assert(mc);
-	assert(macroblock);
-	assert(mc->num_macroblocks < mc->macroblocks_per_picture);
-
-	mc->macroblocks[mc->num_macroblocks].mbx = macroblock->mbx;
-	mc->macroblocks[mc->num_macroblocks].mby = macroblock->mby;
-	mc->macroblocks[mc->num_macroblocks].mb_type = macroblock->mb_type;
-	mc->macroblocks[mc->num_macroblocks].mo_type = macroblock->mo_type;
-	mc->macroblocks[mc->num_macroblocks].dct_type = macroblock->dct_type;
-	mc->macroblocks[mc->num_macroblocks].PMV[0][0][0] = macroblock->PMV[0][0][0];
-	mc->macroblocks[mc->num_macroblocks].PMV[0][0][1] = macroblock->PMV[0][0][1];
-	mc->macroblocks[mc->num_macroblocks].PMV[0][1][0] = macroblock->PMV[0][1][0];
-	mc->macroblocks[mc->num_macroblocks].PMV[0][1][1] = macroblock->PMV[0][1][1];
-	mc->macroblocks[mc->num_macroblocks].PMV[1][0][0] = macroblock->PMV[1][0][0];
-	mc->macroblocks[mc->num_macroblocks].PMV[1][0][1] = macroblock->PMV[1][0][1];
-	mc->macroblocks[mc->num_macroblocks].PMV[1][1][0] = macroblock->PMV[1][1][0];
-	mc->macroblocks[mc->num_macroblocks].PMV[1][1][1] = macroblock->PMV[1][1][1];
-	mc->macroblocks[mc->num_macroblocks].cbp = macroblock->cbp;
-	mc->macroblocks[mc->num_macroblocks].blocks = macroblock->blocks;
-
-	vlGrabBlocks
-	(
-		mc,
-		macroblock->mbx,
-		macroblock->mby,
-		macroblock->dct_type,
-		macroblock->cbp,
-		macroblock->blocks
-	);
-
-	mc->num_macroblocks++;
-
-	return 0;
-}
-
-#define SET_BLOCK(vb, cbp, mbx, mby, unitx, unity, ofsx, ofsy, hx, hy, lm, cbm, crm, zb)					\
-	do {															\
-	(vb)[0].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[0].pos.y = (mby) * (unity) + (ofsy);			\
-	(vb)[1].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[1].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
-	(vb)[2].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].pos.y = (mby) * (unity) + (ofsy);			\
-	(vb)[3].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].pos.y = (mby) * (unity) + (ofsy);			\
-	(vb)[4].pos.x = (mbx) * (unitx) + (ofsx);		(vb)[4].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
-	(vb)[5].pos.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].pos.y = (mby) * (unity) + (ofsy) + (hy);		\
-																\
-	if ((cbp) & (lm))													\
-	{															\
-		(vb)[0].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].luma_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[1].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[2].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].luma_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[3].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].luma_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[4].luma_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[5].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].luma_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-	}															\
-	else															\
-	{															\
-		(vb)[0].luma_tc.x = (zb)[0].x;		(vb)[0].luma_tc.y = (zb)[0].y;						\
-		(vb)[1].luma_tc.x = (zb)[0].x;		(vb)[1].luma_tc.y = (zb)[0].y + (hy);					\
-		(vb)[2].luma_tc.x = (zb)[0].x + (hx);	(vb)[2].luma_tc.y = (zb)[0].y;						\
-		(vb)[3].luma_tc.x = (zb)[0].x + (hx);	(vb)[3].luma_tc.y = (zb)[0].y;						\
-		(vb)[4].luma_tc.x = (zb)[0].x;		(vb)[4].luma_tc.y = (zb)[0].y + (hy);					\
-		(vb)[5].luma_tc.x = (zb)[0].x + (hx);	(vb)[5].luma_tc.y = (zb)[0].y + (hy);					\
-	}															\
-																\
-	if ((cbp) & (cbm))													\
-	{															\
-		(vb)[0].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cb_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[1].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[2].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cb_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[3].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cb_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[4].cb_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[5].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cb_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-	}															\
-	else															\
-	{															\
-		(vb)[0].cb_tc.x = (zb)[1].x;		(vb)[0].cb_tc.y = (zb)[1].y;						\
-		(vb)[1].cb_tc.x = (zb)[1].x;		(vb)[1].cb_tc.y = (zb)[1].y + (hy);					\
-		(vb)[2].cb_tc.x = (zb)[1].x + (hx);	(vb)[2].cb_tc.y = (zb)[1].y;						\
-		(vb)[3].cb_tc.x = (zb)[1].x + (hx);	(vb)[3].cb_tc.y = (zb)[1].y;						\
-		(vb)[4].cb_tc.x = (zb)[1].x;		(vb)[4].cb_tc.y = (zb)[1].y + (hy);					\
-		(vb)[5].cb_tc.x = (zb)[1].x + (hx);	(vb)[5].cb_tc.y = (zb)[1].y + (hy);					\
-	}															\
-																\
-	if ((cbp) & (crm))													\
-	{															\
-		(vb)[0].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[0].cr_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[1].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[1].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[2].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[2].cr_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[3].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[3].cr_tc.y = (mby) * (unity) + (ofsy);		\
-		(vb)[4].cr_tc.x = (mbx) * (unitx) + (ofsx);		(vb)[4].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-		(vb)[5].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx);	(vb)[5].cr_tc.y = (mby) * (unity) + (ofsy) + (hy);	\
-	}															\
-	else															\
-	{															\
-		(vb)[0].cr_tc.x = (zb)[2].x;		(vb)[0].cr_tc.y = (zb)[2].y;						\
-		(vb)[1].cr_tc.x = (zb)[2].x;		(vb)[1].cr_tc.y = (zb)[2].y + (hy);					\
-		(vb)[2].cr_tc.x = (zb)[2].x + (hx);	(vb)[2].cr_tc.y = (zb)[2].y;						\
-		(vb)[3].cr_tc.x = (zb)[2].x + (hx);	(vb)[3].cr_tc.y = (zb)[2].y;						\
-		(vb)[4].cr_tc.x = (zb)[2].x;		(vb)[4].cr_tc.y = (zb)[2].y + (hy);					\
-		(vb)[5].cr_tc.x = (zb)[2].x + (hx);	(vb)[5].cr_tc.y = (zb)[2].y + (hy);					\
-	}															\
-	} while (0)
-
-static inline int vlGenMacroblockVerts
-(
-	struct vlR16SnormBufferedMC *mc,
-	struct vlMpeg2MacroBlock *macroblock,
-	unsigned int pos,
-	struct vlMacroBlockVertexStream0 *ycbcr_vb,
-	struct vlVertex2f **ref_vb
-)
-{
-	struct vlVertex2f	mo_vec[2];
-	unsigned int		i;
-
-	assert(mc);
-	assert(macroblock);
-	assert(ycbcr_vb);
-	assert(pos < mc->macroblocks_per_picture);
-
-	switch (macroblock->mb_type)
-	{
-		case vlMacroBlockTypeBiPredicted:
-		{
-			struct vlVertex2f *vb;
-
-			assert(ref_vb && ref_vb[1]);
-
-			vb = ref_vb[1] + pos * 2 * 24;
-
-			mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x;
-			mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y;
-
-			if (macroblock->mo_type == vlMotionTypeFrame)
-			{
-				for (i = 0; i < 24 * 2; i += 2)
-				{
-					vb[i].x = mo_vec[0].x;
-					vb[i].y = mo_vec[0].y;
-				}
-			}
-			else
-			{
-				mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x;
-				mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y;
-
-				for (i = 0; i < 24 * 2; i += 2)
-				{
-					vb[i].x = mo_vec[0].x;
-					vb[i].y = mo_vec[0].y;
-					vb[i + 1].x = mo_vec[1].x;
-					vb[i + 1].y = mo_vec[1].y;
-				}
-			}
-
-			/* fall-through */
-		}
-		case vlMacroBlockTypeFwdPredicted:
-		case vlMacroBlockTypeBkwdPredicted:
-		{
-			struct vlVertex2f *vb;
-
-			assert(ref_vb && ref_vb[0]);
-
-			vb = ref_vb[0] + pos * 2 * 24;
-
-			if (macroblock->mb_type == vlMacroBlockTypeBkwdPredicted)
-			{
-				mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x;
-				mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y;
-
-				if (macroblock->mo_type == vlMotionTypeField)
-				{
-					mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x;
-					mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y;
-				}
-			}
-			else
-			{
-				mo_vec[0].x = macroblock->PMV[0][0][0] * 0.5f * mc->surface_tex_inv_size.x;
-				mo_vec[0].y = macroblock->PMV[0][0][1] * 0.5f * mc->surface_tex_inv_size.y;
-
-				if (macroblock->mo_type == vlMotionTypeField)
-				{
-					mo_vec[1].x = macroblock->PMV[1][0][0] * 0.5f * mc->surface_tex_inv_size.x;
-					mo_vec[1].y = macroblock->PMV[1][0][1] * 0.5f * mc->surface_tex_inv_size.y;
-				}
-			}
-
-			if (macroblock->mo_type == vlMotionTypeFrame)
-			{
-				for (i = 0; i < 24 * 2; i += 2)
-				{
-					vb[i].x = mo_vec[0].x;
-					vb[i].y = mo_vec[0].y;
-				}
-			}
-			else
-			{
-				for (i = 0; i < 24 * 2; i += 2)
-				{
-					vb[i].x = mo_vec[0].x;
-					vb[i].y = mo_vec[0].y;
-					vb[i + 1].x = mo_vec[1].x;
-					vb[i + 1].y = mo_vec[1].y;
-				}
-			}
-
-			/* fall-through */
-		}
-		case vlMacroBlockTypeIntra:
-		{
-			const struct vlVertex2f	unit =
-			{
-				mc->surface_tex_inv_size.x * VL_MACROBLOCK_WIDTH,
-				mc->surface_tex_inv_size.y * VL_MACROBLOCK_HEIGHT
-			};
-			const struct vlVertex2f half =
-			{
-				mc->surface_tex_inv_size.x * (VL_MACROBLOCK_WIDTH / 2),
-				mc->surface_tex_inv_size.y * (VL_MACROBLOCK_HEIGHT / 2)
-			};
-
-			struct vlMacroBlockVertexStream0 *vb;
-
-			vb = ycbcr_vb + pos * 24;
-
-			SET_BLOCK
-			(
-				vb,
-				macroblock->cbp, macroblock->mbx, macroblock->mby,
-				unit.x, unit.y, 0, 0, half.x, half.y,
-				32, 2, 1, mc->zero_block
-			);
-
-			SET_BLOCK
-			(
-				vb + 6,
-				macroblock->cbp, macroblock->mbx, macroblock->mby,
-				unit.x, unit.y, half.x, 0, half.x, half.y,
-				16, 2, 1, mc->zero_block
-			);
-
-			SET_BLOCK
-			(
-				vb + 12,
-				macroblock->cbp, macroblock->mbx, macroblock->mby,
-				unit.x, unit.y, 0, half.y, half.x, half.y,
-				8, 2, 1, mc->zero_block
-			);
-
-			SET_BLOCK
-			(
-				vb + 18,
-				macroblock->cbp, macroblock->mbx, macroblock->mby,
-				unit.x, unit.y, half.x, half.y, half.x, half.y,
-				4, 2, 1, mc->zero_block
-			);
-
-			break;
-		}
-		default:
-			assert(0);
-	}
-
-	return 0;
-}
-
-static int vlFlush
-(
-	struct vlRender *render
-)
-{
-	struct vlR16SnormBufferedMC	*mc;
-	struct pipe_context		*pipe;
-	struct vlVertexShaderConsts	*vs_consts;
-	unsigned int			num_macroblocks[vlNumMacroBlockExTypes] = {0};
-	unsigned int			offset[vlNumMacroBlockExTypes];
-	unsigned int			vb_start = 0;
-	unsigned int			i;
-
-	assert(render);
-
-	mc = (struct vlR16SnormBufferedMC*)render;
-
-	if (!mc->buffered_surface)
-		return 0;
-
-	if (mc->num_macroblocks < mc->macroblocks_per_picture)
-		return 0;
-
-	assert(mc->num_macroblocks <= mc->macroblocks_per_picture);
-
-	pipe = mc->pipe;
-
-	for (i = 0; i < mc->num_macroblocks; ++i)
-	{
-		enum vlMacroBlockTypeEx mb_type_ex = vlGetMacroBlockTypeEx(&mc->macroblocks[i]);
-
-		num_macroblocks[mb_type_ex]++;
-	}
-
-	offset[0] = 0;
-
-	for (i = 1; i < vlNumMacroBlockExTypes; ++i)
-		offset[i] = offset[i - 1] + num_macroblocks[i - 1];
-
-	{
-		struct vlMacroBlockVertexStream0	*ycbcr_vb;
-		struct vlVertex2f			*ref_vb[2];
-
-		ycbcr_vb = (struct vlMacroBlockVertexStream0*)pipe_buffer_map
-		(
-			pipe->screen,
-			mc->vertex_bufs.ycbcr.buffer,
-			PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
-		);
-
-		for (i = 0; i < 2; ++i)
-			ref_vb[i] = (struct vlVertex2f*)pipe_buffer_map
-			(
-				pipe->screen,
-				mc->vertex_bufs.ref[i].buffer,
-				PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
-			);
-
-		for (i = 0; i < mc->num_macroblocks; ++i)
-		{
-			enum vlMacroBlockTypeEx mb_type_ex = vlGetMacroBlockTypeEx(&mc->macroblocks[i]);
-
-			vlGenMacroblockVerts(mc, &mc->macroblocks[i], offset[mb_type_ex], ycbcr_vb, ref_vb);
-
-			offset[mb_type_ex]++;
-		}
-
-		pipe_buffer_unmap(pipe->screen, mc->vertex_bufs.ycbcr.buffer);
-		for (i = 0; i < 2; ++i)
-			pipe_buffer_unmap(pipe->screen, mc->vertex_bufs.ref[i].buffer);
-	}
-
-	for (i = 0; i < 3; ++i)
-	{
-		pipe->screen->transfer_unmap(pipe->screen, mc->tex_transfer[i]);
-		pipe->screen->tex_transfer_destroy(mc->tex_transfer[i]);
-	}
-
-	mc->render_target.cbufs[0] = pipe->screen->get_tex_surface
-	(
-		pipe->screen,
-		mc->buffered_surface->texture,
-		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE
-	);
-
-	pipe->set_framebuffer_state(pipe, &mc->render_target);
-	pipe->set_viewport_state(pipe, &mc->viewport);
-	vs_consts = pipe_buffer_map
-	(
-		pipe->screen,
-		mc->vs_const_buf.buffer,
-		PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
-	);
-
-	vs_consts->denorm.x = mc->buffered_surface->texture->width[0];
-	vs_consts->denorm.y = mc->buffered_surface->texture->height[0];
-
-	pipe_buffer_unmap(pipe->screen, mc->vs_const_buf.buffer);
-	pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &mc->vs_const_buf);
-	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &mc->fs_const_buf);
-
-	if (num_macroblocks[vlMacroBlockExTypeIntra] > 0)
-	{
-		pipe->set_vertex_buffers(pipe, 1, mc->vertex_bufs.all);
-		pipe->set_vertex_elements(pipe, 4, mc->vertex_elems);
-		pipe->set_sampler_textures(pipe, 3, mc->textures.all);
-		pipe->bind_sampler_states(pipe, 3, mc->samplers.all);
-		pipe->bind_vs_state(pipe, mc->i_vs);
-		pipe->bind_fs_state(pipe, mc->i_fs);
-
-		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeIntra] * 24);
-		vb_start += num_macroblocks[vlMacroBlockExTypeIntra] * 24;
-	}
-
-	if (num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] > 0)
-	{
-		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs.all);
-		pipe->set_vertex_elements(pipe, 6, mc->vertex_elems);
-		mc->textures.ref[0] = mc->past_surface->texture;
-		pipe->set_sampler_textures(pipe, 4, mc->textures.all);
-		pipe->bind_sampler_states(pipe, 4, mc->samplers.all);
-		pipe->bind_vs_state(pipe, mc->p_vs[0]);
-		pipe->bind_fs_state(pipe, mc->p_fs[0]);
-
-		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] * 24);
-		vb_start += num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] * 24;
-	}
-
-	if (num_macroblocks[vlMacroBlockExTypeFwdPredictedField] > 0)
-	{
-		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs.all);
-		pipe->set_vertex_elements(pipe, 6, mc->vertex_elems);
-		mc->textures.ref[0] = mc->past_surface->texture;
-		pipe->set_sampler_textures(pipe, 4, mc->textures.all);
-		pipe->bind_sampler_states(pipe, 4, mc->samplers.all);
-		pipe->bind_vs_state(pipe, mc->p_vs[1]);
-		pipe->bind_fs_state(pipe, mc->p_fs[1]);
-
-		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeFwdPredictedField] * 24);
-		vb_start += num_macroblocks[vlMacroBlockExTypeFwdPredictedField] * 24;
-	}
-
-	if (num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] > 0)
-	{
-		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs.all);
-		pipe->set_vertex_elements(pipe, 6, mc->vertex_elems);
-		mc->textures.ref[0] = mc->future_surface->texture;
-		pipe->set_sampler_textures(pipe, 4, mc->textures.all);
-		pipe->bind_sampler_states(pipe, 4, mc->samplers.all);
-		pipe->bind_vs_state(pipe, mc->p_vs[0]);
-		pipe->bind_fs_state(pipe, mc->p_fs[0]);
-
-		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] * 24);
-		vb_start += num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] * 24;
-	}
-
-	if (num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] > 0)
-	{
-		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs.all);
-		pipe->set_vertex_elements(pipe, 6, mc->vertex_elems);
-		mc->textures.ref[0] = mc->future_surface->texture;
-		pipe->set_sampler_textures(pipe, 4, mc->textures.all);
-		pipe->bind_sampler_states(pipe, 4, mc->samplers.all);
-		pipe->bind_vs_state(pipe, mc->p_vs[1]);
-		pipe->bind_fs_state(pipe, mc->p_fs[1]);
-
-		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] * 24);
-		vb_start += num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] * 24;
-	}
-
-	if (num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] > 0)
-	{
-		pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs.all);
-		pipe->set_vertex_elements(pipe, 8, mc->vertex_elems);
-		mc->textures.ref[0] = mc->past_surface->texture;
-		mc->textures.ref[1] = mc->future_surface->texture;
-		pipe->set_sampler_textures(pipe, 5, mc->textures.all);
-		pipe->bind_sampler_states(pipe, 5, mc->samplers.all);
-		pipe->bind_vs_state(pipe, mc->b_vs[0]);
-		pipe->bind_fs_state(pipe, mc->b_fs[0]);
-
-		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] * 24);
-		vb_start += num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] * 24;
-	}
-
-	if (num_macroblocks[vlMacroBlockExTypeBiPredictedField] > 0)
-	{
-		pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs.all);
-		pipe->set_vertex_elements(pipe, 8, mc->vertex_elems);
-		mc->textures.ref[0] = mc->past_surface->texture;
-		mc->textures.ref[1] = mc->future_surface->texture;
-		pipe->set_sampler_textures(pipe, 5, mc->textures.all);
-		pipe->bind_sampler_states(pipe, 5, mc->samplers.all);
-		pipe->bind_vs_state(pipe, mc->b_vs[1]);
-		pipe->bind_fs_state(pipe, mc->b_fs[1]);
-
-		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBiPredictedField] * 24);
-		vb_start += num_macroblocks[vlMacroBlockExTypeBiPredictedField] * 24;
-	}
-
-	pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, &mc->buffered_surface->render_fence);
-	pipe_surface_reference(&mc->render_target.cbufs[0], NULL);
-
-	for (i = 0; i < 3; ++i)
-		mc->zero_block[i].x = -1.0f;
-
-	mc->buffered_surface = NULL;
-	mc->num_macroblocks = 0;
-
-	return 0;
-}
-
-static int vlRenderMacroBlocksMpeg2R16SnormBuffered
-(
-	struct vlRender *render,
-	struct vlMpeg2MacroBlockBatch *batch,
-	struct vlSurface *surface
-)
-{
-	struct vlR16SnormBufferedMC	*mc;
-	bool				new_surface = false;
-	unsigned int			i;
-
-	assert(render);
-
-	mc = (struct vlR16SnormBufferedMC*)render;
-
-	if (mc->buffered_surface)
-	{
-		if (mc->buffered_surface != surface)
-		{
-			vlFlush(&mc->base);
-			new_surface = true;
-		}
-	}
-	else
-		new_surface = true;
-
-	if (new_surface)
-	{
-		mc->buffered_surface = surface;
-		mc->past_surface = batch->past_surface;
-		mc->future_surface = batch->future_surface;
-		mc->surface_tex_inv_size.x = 1.0f / surface->texture->width[0];
-		mc->surface_tex_inv_size.y = 1.0f / surface->texture->height[0];
-
-		for (i = 0; i < 3; ++i)
-		{
-			mc->tex_transfer[i] = mc->pipe->screen->get_tex_transfer
-			(
-				mc->pipe->screen,
-				mc->textures.all[i],
-				0, 0, 0, PIPE_TRANSFER_WRITE, 0, 0,
-				surface->texture->width[0],
-				surface->texture->height[0]
-			);
-
-			mc->texels[i] = mc->pipe->screen->transfer_map(mc->pipe->screen, mc->tex_transfer[i]);
-		}
-	}
-
-	for (i = 0; i < batch->num_macroblocks; ++i)
-		vlGrabMacroBlock(mc, &batch->macroblocks[i]);
-
-	return 0;
-}
-
-static inline int vlEnd
-(
-	struct vlRender *render
-)
-{
-	assert(render);
-
-	return 0;
-}
-
-static int vlDestroy
-(
-	struct vlRender *render
-)
-{
-	struct vlR16SnormBufferedMC	*mc;
-	struct pipe_context		*pipe;
-	unsigned int			i;
-
-	assert(render);
-
-	mc = (struct vlR16SnormBufferedMC*)render;
-	pipe = mc->pipe;
-
-	for (i = 0; i < 5; ++i)
-		pipe->delete_sampler_state(pipe, mc->samplers.all[i]);
-
-	for (i = 0; i < 3; ++i)
-		pipe_buffer_reference(&mc->vertex_bufs.all[i].buffer, NULL);
-
-	/* Textures 3 & 4 are not created directly, no need to release them here */
-	for (i = 0; i < 3; ++i)
-		pipe_texture_reference(&mc->textures.all[i], NULL);
-
-	pipe->delete_vs_state(pipe, mc->i_vs);
-	pipe->delete_fs_state(pipe, mc->i_fs);
-
-	for (i = 0; i < 2; ++i)
-	{
-		pipe->delete_vs_state(pipe, mc->p_vs[i]);
-		pipe->delete_fs_state(pipe, mc->p_fs[i]);
-		pipe->delete_vs_state(pipe, mc->b_vs[i]);
-		pipe->delete_fs_state(pipe, mc->b_fs[i]);
-	}
-
-	pipe_buffer_reference(&mc->vs_const_buf.buffer, NULL);
-	pipe_buffer_reference(&mc->fs_const_buf.buffer, NULL);
-
-	FREE(mc->macroblocks);
-	FREE(mc);
-
-	return 0;
-}
-
-/*
- * Muliplier renormalizes block samples from 16 bits to 12 bits.
- * Divider is used when calculating Y % 2 for choosing top or bottom
- * field for P or B macroblocks.
- * TODO: Use immediates.
- */
-static const struct vlFragmentShaderConsts fs_consts =
-{
-	{32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},
-	{0.5f, 2.0f, 0.0f, 0.0f}
-};
-
-#include "vl_r16snorm_mc_buf_shaders.inc"
-
-static int vlCreateDataBufs
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int	mbw = align(mc->picture_width, VL_MACROBLOCK_WIDTH) / VL_MACROBLOCK_WIDTH;
-	const unsigned int	mbh = align(mc->picture_height, VL_MACROBLOCK_HEIGHT) / VL_MACROBLOCK_HEIGHT;
-
-	struct pipe_context	*pipe;
-	unsigned int		i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	mc->macroblocks_per_picture = mbw * mbh;
-
-	/* Create our vertex buffers */
-	mc->vertex_bufs.ycbcr.stride = sizeof(struct vlVertex2f) * 4;
-	mc->vertex_bufs.ycbcr.max_index = 24 * mc->macroblocks_per_picture - 1;
-	mc->vertex_bufs.ycbcr.buffer_offset = 0;
-	mc->vertex_bufs.ycbcr.buffer = pipe_buffer_create
-	(
-		pipe->screen,
-		DEFAULT_BUF_ALIGNMENT,
-		PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
-		sizeof(struct vlVertex2f) * 4 * 24 * mc->macroblocks_per_picture
-	);
-
-	for (i = 1; i < 3; ++i)
-	{
-		mc->vertex_bufs.all[i].stride = sizeof(struct vlVertex2f) * 2;
-		mc->vertex_bufs.all[i].max_index = 24 * mc->macroblocks_per_picture - 1;
-		mc->vertex_bufs.all[i].buffer_offset = 0;
-		mc->vertex_bufs.all[i].buffer = pipe_buffer_create
-		(
-			pipe->screen,
-			DEFAULT_BUF_ALIGNMENT,
-			PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_DISCARD,
-			sizeof(struct vlVertex2f) * 2 * 24 * mc->macroblocks_per_picture
-		);
-	}
-
-	/* Position element */
-	mc->vertex_elems[0].src_offset = 0;
-	mc->vertex_elems[0].vertex_buffer_index = 0;
-	mc->vertex_elems[0].nr_components = 2;
-	mc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/* Luma, texcoord element */
-	mc->vertex_elems[1].src_offset = sizeof(struct vlVertex2f);
-	mc->vertex_elems[1].vertex_buffer_index = 0;
-	mc->vertex_elems[1].nr_components = 2;
-	mc->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/* Chroma Cr texcoord element */
-	mc->vertex_elems[2].src_offset = sizeof(struct vlVertex2f) * 2;
-	mc->vertex_elems[2].vertex_buffer_index = 0;
-	mc->vertex_elems[2].nr_components = 2;
-	mc->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/* Chroma Cb texcoord element */
-	mc->vertex_elems[3].src_offset = sizeof(struct vlVertex2f) * 3;
-	mc->vertex_elems[3].vertex_buffer_index = 0;
-	mc->vertex_elems[3].nr_components = 2;
-	mc->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/* First ref surface top field texcoord element */
-	mc->vertex_elems[4].src_offset = 0;
-	mc->vertex_elems[4].vertex_buffer_index = 1;
-	mc->vertex_elems[4].nr_components = 2;
-	mc->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/* First ref surface bottom field texcoord element */
-	mc->vertex_elems[5].src_offset = sizeof(struct vlVertex2f);
-	mc->vertex_elems[5].vertex_buffer_index = 1;
-	mc->vertex_elems[5].nr_components = 2;
-	mc->vertex_elems[5].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/* Second ref surface top field texcoord element */
-	mc->vertex_elems[6].src_offset = 0;
-	mc->vertex_elems[6].vertex_buffer_index = 2;
-	mc->vertex_elems[6].nr_components = 2;
-	mc->vertex_elems[6].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/* Second ref surface bottom field texcoord element */
-	mc->vertex_elems[7].src_offset = sizeof(struct vlVertex2f);
-	mc->vertex_elems[7].vertex_buffer_index = 2;
-	mc->vertex_elems[7].nr_components = 2;
-	mc->vertex_elems[7].src_format = PIPE_FORMAT_R32G32_FLOAT;
-
-	/* Create our constant buffer */
-	mc->vs_const_buf.buffer = pipe_buffer_create
-	(
-		pipe->screen,
-		DEFAULT_BUF_ALIGNMENT,
-		PIPE_BUFFER_USAGE_CONSTANT | PIPE_BUFFER_USAGE_DISCARD,
-		sizeof(struct vlVertexShaderConsts)
-	);
-
-	mc->fs_const_buf.buffer = pipe_buffer_create
-	(
-		pipe->screen,
-		DEFAULT_BUF_ALIGNMENT,
-		PIPE_BUFFER_USAGE_CONSTANT,
-		sizeof(struct vlFragmentShaderConsts)
-	);
-
-	memcpy
-	(
-		pipe_buffer_map(pipe->screen, mc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE),
-		&fs_consts,
-		sizeof(struct vlFragmentShaderConsts)
-	);
-
-	pipe_buffer_unmap(pipe->screen, mc->fs_const_buf.buffer);
-
-	mc->macroblocks = MALLOC(sizeof(struct vlMpeg2MacroBlock) * mc->macroblocks_per_picture);
-
-	return 0;
-}
-
-static int vlInit
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	struct pipe_context		*pipe;
-	struct pipe_sampler_state	sampler;
-	struct pipe_texture		template;
-	unsigned int			filters[5];
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-
-	mc->buffered_surface = NULL;
-	mc->past_surface = NULL;
-	mc->future_surface = NULL;
-	for (i = 0; i < 3; ++i)
-		mc->zero_block[i].x = -1.0f;
-	mc->num_macroblocks = 0;
-
-	/* For MC we render to textures, which are rounded up to nearest POT */
-	mc->viewport.scale[0] = vlRoundUpPOT(mc->picture_width);
-	mc->viewport.scale[1] = vlRoundUpPOT(mc->picture_height);
-	mc->viewport.scale[2] = 1;
-	mc->viewport.scale[3] = 1;
-	mc->viewport.translate[0] = 0;
-	mc->viewport.translate[1] = 0;
-	mc->viewport.translate[2] = 0;
-	mc->viewport.translate[3] = 0;
-
-	mc->render_target.width = vlRoundUpPOT(mc->picture_width);
-	mc->render_target.height = vlRoundUpPOT(mc->picture_height);
-	mc->render_target.nr_cbufs = 1;
-	/* FB for MC stage is a vlSurface created by the user, set at render time */
-	mc->render_target.zsbuf = NULL;
-
-	filters[0] = PIPE_TEX_FILTER_NEAREST;
-	/* FIXME: Linear causes discoloration around block edges */
-	filters[1] = /*mc->picture_format == vlFormatYCbCr444 ?*/ PIPE_TEX_FILTER_NEAREST /*: PIPE_TEX_FILTER_LINEAR*/;
-	filters[2] = /*mc->picture_format == vlFormatYCbCr444 ?*/ PIPE_TEX_FILTER_NEAREST /*: PIPE_TEX_FILTER_LINEAR*/;
-	filters[3] = PIPE_TEX_FILTER_LINEAR;
-	filters[4] = PIPE_TEX_FILTER_LINEAR;
-
-	for (i = 0; i < 5; ++i)
-	{
-		sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-		sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-		sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-		sampler.min_img_filter = filters[i];
-		sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-		sampler.mag_img_filter = filters[i];
-		sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
-		sampler.compare_func = PIPE_FUNC_ALWAYS;
-		sampler.normalized_coords = 1;
-		/*sampler.prefilter = ;*/
-		/*sampler.lod_bias = ;*/
-		sampler.min_lod = 0;
-		/*sampler.max_lod = ;*/
-		/*sampler.border_color[i] = ;*/
-		/*sampler.max_anisotropy = ;*/
-		mc->samplers.all[i] = pipe->create_sampler_state(pipe, &sampler);
-	}
-
-	memset(&template, 0, sizeof(struct pipe_texture));
-	template.target = PIPE_TEXTURE_2D;
-	template.format = PIPE_FORMAT_R16_SNORM;
-	template.last_level = 0;
-	template.width[0] = vlRoundUpPOT(mc->picture_width);
-	template.height[0] = vlRoundUpPOT(mc->picture_height);
-	template.depth[0] = 1;
-	pf_get_block(template.format, &template.block);
-	template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_DYNAMIC;
-
-	mc->textures.y = pipe->screen->texture_create(pipe->screen, &template);
-
-	if (mc->picture_format == vlFormatYCbCr420)
-	{
-		template.width[0] = vlRoundUpPOT(mc->picture_width / 2);
-		template.height[0] = vlRoundUpPOT(mc->picture_height / 2);
-	}
-	else if (mc->picture_format == vlFormatYCbCr422)
-		template.height[0] = vlRoundUpPOT(mc->picture_height / 2);
-
-	mc->textures.cb = pipe->screen->texture_create(pipe->screen, &template);
-	mc->textures.cr = pipe->screen->texture_create(pipe->screen, &template);
-
-	/* textures.all[3] & textures.all[4] are assigned from vlSurfaces for P and B macroblocks at render time */
-
-	vlCreateVertexShaderIMB(mc);
-	vlCreateFragmentShaderIMB(mc);
-	vlCreateVertexShaderFramePMB(mc);
-	vlCreateVertexShaderFieldPMB(mc);
-	vlCreateFragmentShaderFramePMB(mc);
-	vlCreateFragmentShaderFieldPMB(mc);
-	vlCreateVertexShaderFrameBMB(mc);
-	vlCreateVertexShaderFieldBMB(mc);
-	vlCreateFragmentShaderFrameBMB(mc);
-	vlCreateFragmentShaderFieldBMB(mc);
-	vlCreateDataBufs(mc);
-
-	return 0;
-}
-
-int vlCreateR16SNormBufferedMC
-(
-	struct pipe_context *pipe,
-	unsigned int picture_width,
-	unsigned int picture_height,
-	enum vlFormat picture_format,
-	struct vlRender **render
-)
-{
-	struct vlR16SnormBufferedMC *mc;
-
-	assert(pipe);
-	assert(render);
-
-	mc = CALLOC_STRUCT(vlR16SnormBufferedMC);
-
-	mc->base.vlBegin = &vlBegin;
-	mc->base.vlRenderMacroBlocksMpeg2 = &vlRenderMacroBlocksMpeg2R16SnormBuffered;
-	mc->base.vlEnd = &vlEnd;
-	mc->base.vlFlush = &vlFlush;
-	mc->base.vlDestroy = &vlDestroy;
-	mc->pipe = pipe;
-	mc->picture_width = picture_width;
-	mc->picture_height = picture_height;
-
-	vlInit(mc);
-
-	*render = &mc->base;
-
-	return 0;
-}
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h
deleted file mode 100644
index 27177d64cad..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef vl_r16snorm_mc_buf_h
-#define vl_r16snorm_mc_buf_h
-
-#include "vl_types.h"
-
-struct pipe_context;
-struct vlRender;
-
-int vlCreateR16SNormBufferedMC
-(
-	struct pipe_context *pipe,
-	unsigned int picture_width,
-	unsigned int picture_height,
-	enum vlFormat picture_format,
-	struct vlRender **render
-);
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc
deleted file mode 100644
index ef4a4b2add9..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc
+++ /dev/null
@@ -1,1185 +0,0 @@
-static int vlCreateVertexShaderIMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 50;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma Cb texcoords
-	 * decl i3		; Chroma Cr texcoords
-	 */
-	for (i = 0; i < 4; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma Cb texcoords
-	 * decl o3		; Chroma Cr texcoords
-	 */
-	for (i = 0; i < 4; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * mov o0, i0		; Move input vertex pos to output
-	 * mov o1, i1		; Move input luma texcoords to output
-	 * mov o2, i2		; Move input chroma Cb texcoords to output
-	 * mov o3, i3		; Move input chroma Cr texcoords to output
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	vs.tokens = tokens;
-	mc->i_vs = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-
-	return 0;
-}
-
-static int vlCreateFragmentShaderIMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 100;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Luma texcoords
-	 * decl i1			; Chroma Cb texcoords
-	 * decl i2			; Chroma Cr texcoords
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl t0, t1 */
-	decl = vl_decl_temps(0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header,max_tokens - ti);
-	}
-
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* mul o0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	mc->i_fs = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-
-	return 0;
-}
-
-static int vlCreateVertexShaderFramePMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 100;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma Cb texcoords
-	 * decl i3		; Chroma Cr texcoords
-	 * decl i4		; Ref surface top field texcoords
-	 * decl i5		; Ref surface bottom field texcoords (unused, packed in the same stream)
-	 */
-	for (i = 0; i < 6; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma Cb texcoords
-	 * decl o3		; Chroma Cr texcoords
-	 * decl o4		; Ref macroblock texcoords
-	 */
-	for (i = 0; i < 5; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * mov o0, i0		; Move input vertex pos to output
-	 * mov o1, i1		; Move input luma texcoords to output
-	 * mov o2, i2		; Move input chroma Cb texcoords to output
-	 * mov o3, i3		; Move input chroma Cr texcoords to output
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* add o4, i0, i4	; Translate vertex pos by motion vec to form ref macroblock texcoords */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, 4);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	vs.tokens = tokens;
-	mc->p_vs[0] = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-
-	return 0;
-}
-
-static int vlCreateVertexShaderFieldPMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 100;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma Cb texcoords
-	 * decl i3		; Chroma Cr texcoords
-	 * decl i4              ; Ref macroblock top field texcoords
-	 * decl i5              ; Ref macroblock bottom field texcoords
-	 */
-	for (i = 0; i < 6; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* decl c0		; Render target dimensions */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma Cb texcoords
-	 * decl o3		; Chroma Cr texcoords
-	 * decl o4		; Ref macroblock top field texcoords
-	 * decl o5		; Ref macroblock bottom field texcoords
-	 * decl o6		; Denormalized vertex pos
-	 */
-	for (i = 0; i < 7; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * mov o0, i0		; Move input vertex pos to output
-	 * mov o1, i1		; Move input luma texcoords to output
-	 * mov o2, i2		; Move input chroma Cb texcoords to output
-	 * mov o3, i3		; Move input chroma Cr texcoords to output
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * add o4, i0, i4	; Translate vertex pos by motion vec to form top field macroblock texcoords
-	 * add o5, i0, i5	; Translate vertex pos by motion vec to form bottom field macroblock texcoords
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 4);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* mul o6, i0, c0	; Denorm vertex pos */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 6, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	vs.tokens = tokens;
-	mc->p_vs[1] = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-
-	return 0;
-}
-
-static int vlCreateFragmentShaderFramePMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 100;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Luma texcoords
-	 * decl i1			; Chroma Cb texcoords
-	 * decl i2			; Chroma Cr texcoords
-	 * decl i3			; Ref macroblock texcoords
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl t0, t1 */
-	decl = vl_decl_temps(0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 * decl s3			; Sampler for ref surface texture
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* mul t0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* tex2d t1, i3, s3		; Read texel from ref macroblock */
-	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 3, TGSI_FILE_SAMPLER, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add o0, t0, t1		; Add ref and differential to form final output */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	mc->p_fs[0] = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-
-	return 0;
-}
-
-static int vlCreateFragmentShaderFieldPMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 200;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Luma texcoords
-	 * decl i1			; Chroma Cb texcoords
-	 * decl i2			; Chroma Cr texcoords
-	 * decl i3			; Ref macroblock top field texcoords
-	 * decl i4			; Ref macroblock bottom field texcoords
-	 * decl i5			; Denormalized vertex pos
-	 */
-	for (i = 0; i < 6; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
-	 * decl c1			; Constants 1/2 & 2 in .x, .y channels for Y-mod-2 top/bottom field selection
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl t0-t4 */
-	decl = vl_decl_temps(0, 4);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 * decl s3			; Sampler for ref surface texture
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* mul t0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * tex2d t1, i3, s3		; Read texel from ref macroblock top field
-	 * tex2d t2, i4, s3		; Read texel from ref macroblock bottom field
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, 3);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* XXX: Pos values off by 0.5? */
-	/* sub t4, i5.y, c1.x		; Sub 0.5 from denormalized pos */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 5, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* floor t3, t3			; Get rid of fractional part */
-	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* mul t3, t3, c1.y		; Multiply by 2 */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
-	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add o0, t0, t1		; Add ref and differential to form final output */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	mc->p_fs[1] = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-
-	return 0;
-}
-
-static int vlCreateVertexShaderFrameBMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 100;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma Cb texcoords
-	 * decl i3		; Chroma Cr texcoords
-	 * decl i4              ; First ref macroblock top field texcoords
-	 * decl i5              ; First ref macroblock bottom field texcoords (unused, packed in the same stream)
-	 * decl i6		; Second ref macroblock top field texcoords
-	 * decl i7		; Second ref macroblock bottom field texcoords (unused, packed in the same stream)
-	 */
-	for (i = 0; i < 8; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma Cb texcoords
-	 * decl o3		; Chroma Cr texcoords
-	 * decl o4		; First ref macroblock texcoords
-	 * decl o5		; Second ref macroblock texcoords
-	 */
-	for (i = 0; i < 6; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * mov o0, i0		; Move input vertex pos to output
-	 * mov o1, i1		; Move input luma texcoords to output
-	 * mov o2, i2		; Move input chroma Cb texcoords to output
-	 * mov o3, i3		; Move input chroma Cr texcoords to output
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * add o4, i0, i4	; Translate vertex pos by motion vec to form first ref macroblock texcoords
-	 * add o5, i0, i6	; Translate vertex pos by motion vec to form second ref macroblock texcoords
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, (i + 2) * 2);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	vs.tokens = tokens;
-	mc->b_vs[0] = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-
-	return 0;
-}
-
-static int vlCreateVertexShaderFieldBMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 100;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	vs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0		; Vertex pos
-	 * decl i1		; Luma texcoords
-	 * decl i2		; Chroma Cb texcoords
-	 * decl i3		; Chroma Cr texcoords
-	 * decl i4              ; First ref macroblock top field texcoords
-	 * decl i5              ; First ref macroblock bottom field texcoords
-	 * decl i6              ; Second ref macroblock top field texcoords
-	 * decl i7              ; Second ref macroblock bottom field texcoords
-	 */
-	for (i = 0; i < 8; i++)
-	{
-		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* decl c0		; Render target dimensions */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl o0		; Vertex pos
-	 * decl o1		; Luma texcoords
-	 * decl o2		; Chroma Cb texcoords
-	 * decl o3		; Chroma Cr texcoords
-	 * decl o4		; First ref macroblock top field texcoords
-	 * decl o5		; First ref macroblock Bottom field texcoords
-	 * decl o6		; Second ref macroblock top field texcoords
-	 * decl o7		; Second ref macroblock Bottom field texcoords
-	 * decl o8		; Denormalized vertex pos
-	 */
-	for (i = 0; i < 9; i++)
-	{
-		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* decl t0, t1 */
-	decl = vl_decl_temps(0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * mov o0, i0		; Move input vertex pos to output
-	 * mov o1, i1		; Move input luma texcoords to output
-	 * mov o2, i2		; Move input chroma Cb texcoords to output
-	 * mov o3, i3		; Move input chroma Cr texcoords to output
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * add o4, i0, i4	; Translate vertex pos by motion vec to form first top field macroblock texcoords
-	 * add o5, i0, i5	; Translate vertex pos by motion vec to form first bottom field macroblock texcoords
-	 * add o6, i0, i6	; Translate vertex pos by motion vec to form second top field macroblock texcoords
-	 * add o7, i0, i7	; Translate vertex pos by motion vec to form second bottom field macroblock texcoords
-	 */
-	for (i = 0; i < 4; ++i)
-	{
-		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 4);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* mul o8, i0, c0	; Denorm vertex pos */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 8, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	vs.tokens = tokens;
-	mc->b_vs[1] = pipe->create_vs_state(pipe, &vs);
-	free(tokens);
-
-	return 0;
-}
-
-static int vlCreateFragmentShaderFrameBMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 100;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Luma texcoords
-	 * decl i1			; Chroma Cb texcoords
-	 * decl i2			; Chroma Cr texcoords
-	 * decl i3			; First ref macroblock texcoords
-	 * decl i4			; Second ref macroblock texcoords
-	 */
-	for (i = 0; i < 5; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
-	 * decl c1			; Constant 1/2 in .x channel to use as weight to blend past and future texels
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl t0-t2 */
-	decl = vl_decl_temps(0, 2);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 * decl s3			; Sampler for first ref surface texture
-	 * decl s4			; Sampler for second ref surface texture
-	 */
-	for (i = 0; i < 5; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* mul t0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * tex2d t1, i3, s3		; Read texel from first ref macroblock
-	 * tex2d t2, i4, s4		; Read texel from second ref macroblock
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, i + 3);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	mc->b_fs[0] = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-
-	return 0;
-}
-
-static int vlCreateFragmentShaderFieldBMB
-(
-	struct vlR16SnormBufferedMC *mc
-)
-{
-	const unsigned int		max_tokens = 200;
-
-	struct pipe_context		*pipe;
-	struct pipe_shader_state	fs;
-	struct tgsi_token		*tokens;
-	struct tgsi_header		*header;
-
-	struct tgsi_full_declaration	decl;
-	struct tgsi_full_instruction	inst;
-
-	unsigned int			ti;
-	unsigned int			i;
-
-	assert(mc);
-
-	pipe = mc->pipe;
-	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token));
-
-	/* Version */
-	*(struct tgsi_version*)&tokens[0] = tgsi_build_version();
-	/* Header */
-	header = (struct tgsi_header*)&tokens[1];
-	*header = tgsi_build_header();
-	/* Processor */
-	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header);
-
-	ti = 3;
-
-	/*
-	 * decl i0			; Luma texcoords
-	 * decl i1			; Chroma Cb texcoords
-	 * decl i2			; Chroma Cr texcoords
-	 * decl i3			; First ref macroblock top field texcoords
-	 * decl i4			; First ref macroblock bottom field texcoords
-	 * decl i5			; Second ref macroblock top field texcoords
-	 * decl i6			; Second ref macroblock bottom field texcoords
-	 * decl i7			; Denormalized vertex pos
-	 */
-	for (i = 0; i < 8; ++i)
-	{
-		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm
-	 * decl c1			; Constants 1/2 & 2 in .x, .y channels to use as weight to blend past and future texels
-	 *				; and for Y-mod-2 top/bottom field selection
-	 */
-	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl o0			; Fragment color */
-	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/* decl t0-t5 */
-	decl = vl_decl_temps(0, 5);
-	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * decl s0			; Sampler for luma texture
-	 * decl s1			; Sampler for chroma Cb texture
-	 * decl s2			; Sampler for chroma Cr texture
-	 * decl s3			; Sampler for first ref surface texture
-	 * decl s4			; Sampler for second ref surface texture
-	 */
-	for (i = 0; i < 5; ++i)
-	{
-		decl = vl_decl_samplers(i, i);
-		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/*
-	 * tex2d t1, i0, s0		; Read texel from luma texture
-	 * mov t0.x, t1.x		; Move luma sample into .x component
-	 * tex2d t1, i1, s1		; Read texel from chroma Cb texture
-	 * mov t0.y, t1.x		; Move Cb sample into .y component
-	 * tex2d t1, i2, s2		; Read texel from chroma Cr texture
-	 * mov t0.z, t1.x		; Move Cr sample into .z component
-	 */
-	for (i = 0; i < 3; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i;
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* mul t0, t0, c0		; Rescale texel to correct range */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* XXX: Pos values off by 0.5? */
-	/* sub t4, i7.y, c1.x		; Sub 0.5 from denormalized pos */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 7, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */
-	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* floor t3, t3			; Get rid of fractional part */
-	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* mul t3, t3, c1.y		; Multiply by 2 */
-	inst = vl_inst3( TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1);
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y;
-	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */
-	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * tex2d t1, i3, s3		; Read texel from past ref macroblock top field
-	 * tex2d t2, i4, s3		; Read texel from past ref macroblock bottom field
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, 3);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
-	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/*
-	 * tex2d t4, i5, s4		; Read texel from future ref macroblock top field
-	 * tex2d t5, i6, s4		; Read texel from future ref macroblock bottom field
-	 */
-	for (i = 0; i < 2; ++i)
-	{
-		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 4, TGSI_FILE_INPUT, i + 5, TGSI_FILE_SAMPLER, 4);
-		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-	}
-
-	/* TODO: Move to conditional tex fetch on t3 instead of lerp */
-	/* lerp t2, t3, t4, t5		; Choose between top and bottom fields based on Y % 2 */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */
-	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2);
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X;
-	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X;
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* add o0, t0, t1		; Add past/future ref and differential to form final output */
-	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1);
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	/* end */
-	inst = vl_end();
-	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti);
-
-	fs.tokens = tokens;
-	mc->b_fs[1] = pipe->create_fs_state(pipe, &fs);
-	free(tokens);
-
-	return 0;
-}
diff --git a/src/gallium/state_trackers/g3dvl/vl_render.h b/src/gallium/state_trackers/g3dvl/vl_render.h
deleted file mode 100644
index 166030b4988..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_render.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef vl_render_h
-#define vl_render_h
-
-#include "vl_types.h"
-
-struct pipe_surface;
-
-struct vlRender
-{
-	int (*vlBegin)
-	(
-		struct vlRender *render
-	);
-
-	int (*vlRenderMacroBlocksMpeg2)
-	(
-		struct vlRender *render,
-		struct vlMpeg2MacroBlockBatch *batch,
-		struct vlSurface *surface
-	);
-
-	int (*vlEnd)
-	(
-		struct vlRender *render
-	);
-
-	int (*vlFlush)
-	(
-		struct vlRender *render
-	);
-
-	int (*vlDestroy)
-	(
-		struct vlRender *render
-	);
-};
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_screen.c b/src/gallium/state_trackers/g3dvl/vl_screen.c
deleted file mode 100644
index ade8643a66a..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_screen.c
+++ /dev/null
@@ -1,115 +0,0 @@
-#define VL_INTERNAL
-#include "vl_screen.h"
-#include <assert.h>
-#include <util/u_memory.h>
-
-int vlCreateScreen
-(
-	struct vlDisplay *display,
-	int screen,
-	struct pipe_screen *pscreen,
-	struct vlScreen **vl_screen
-)
-{
-	struct vlScreen *scrn;
-
-	assert(display);
-	assert(pscreen);
-	assert(vl_screen);
-
-	scrn = CALLOC_STRUCT(vlScreen);
-
-	if (!scrn)
-		return 1;
-
-	scrn->display = display;
-	scrn->ordinal = screen;
-	scrn->pscreen = pscreen;
-	*vl_screen = scrn;
-
-	return 0;
-}
-
-int vlDestroyScreen
-(
-	struct vlScreen *screen
-)
-{
-	assert(screen);
-
-	FREE(screen);
-
-	return 0;
-}
-
-struct vlDisplay* vlGetDisplay
-(
-	struct vlScreen *screen
-)
-{
-	assert(screen);
-
-	return screen->display;
-}
-
-struct pipe_screen* vlGetPipeScreen
-(
-	struct vlScreen *screen
-)
-{
-	assert(screen);
-
-	return screen->pscreen;
-}
-
-unsigned int vlGetMaxProfiles
-(
-	struct vlScreen *screen
-)
-{
-	assert(screen);
-
-	return vlProfileCount;
-}
-
-int vlQueryProfiles
-(
-	struct vlScreen *screen,
-	enum vlProfile *profiles
-)
-{
-	assert(screen);
-	assert(profiles);
-
-	profiles[0] = vlProfileMpeg2Simple;
-	profiles[1] = vlProfileMpeg2Main;
-
-	return 0;
-}
-
-unsigned int vlGetMaxEntryPoints
-(
-	struct vlScreen *screen
-)
-{
-	assert(screen);
-
-	return vlEntryPointCount;
-}
-
-int vlQueryEntryPoints
-(
-	struct vlScreen *screen,
-	enum vlProfile profile,
-	enum vlEntryPoint *entry_points
-)
-{
-	assert(screen);
-	assert(entry_points);
-
-	entry_points[0] = vlEntryPointIDCT;
-	entry_points[1] = vlEntryPointMC;
-	entry_points[2] = vlEntryPointCSC;
-
-	return 0;
-}
diff --git a/src/gallium/state_trackers/g3dvl/vl_screen.h b/src/gallium/state_trackers/g3dvl/vl_screen.h
deleted file mode 100644
index 98f3d429b61..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_screen.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef vl_screen_h
-#define vl_screen_h
-
-#include "vl_types.h"
-
-struct pipe_screen;
-
-#ifdef VL_INTERNAL
-struct vlScreen
-{
-	struct vlDisplay	*display;
-	unsigned int		ordinal;
-	struct pipe_screen	*pscreen;
-};
-#endif
-
-int vlCreateScreen
-(
-	struct vlDisplay *display,
-	int screen,
-	struct pipe_screen *pscreen,
-	struct vlScreen **vl_screen
-);
-
-int vlDestroyScreen
-(
-	struct vlScreen *screen
-);
-
-struct vlDisplay* vlGetDisplay
-(
-	struct vlScreen *screen
-);
-
-struct pipe_screen* vlGetPipeScreen
-(
-	struct vlScreen *screen
-);
-
-unsigned int vlGetMaxProfiles
-(
-	struct vlScreen *screen
-);
-
-int vlQueryProfiles
-(
-	struct vlScreen *screen,
-	enum vlProfile *profiles
-);
-
-unsigned int vlGetMaxEntryPoints
-(
-	struct vlScreen *screen
-);
-
-int vlQueryEntryPoints
-(
-	struct vlScreen *screen,
-	enum vlProfile profile,
-	enum vlEntryPoint *entry_points
-);
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_shader_build.c b/src/gallium/state_trackers/g3dvl/vl_shader_build.c
deleted file mode 100644
index 51f1721a332..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_shader_build.c
+++ /dev/null
@@ -1,215 +0,0 @@
-#include "vl_shader_build.h"
-#include <assert.h>
-#include <tgsi/tgsi_parse.h>
-#include <tgsi/tgsi_build.h>
-
-struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
-{
-	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-	decl.Declaration.File = TGSI_FILE_INPUT;
-	decl.Declaration.Semantic = 1;
-	decl.Semantic.SemanticName = name;
-	decl.Semantic.SemanticIndex = index;
-	decl.DeclarationRange.First = first;
-	decl.DeclarationRange.Last = last;
-
-	return decl;
-}
-
-struct tgsi_full_declaration vl_decl_interpolated_input
-(
-	unsigned int name,
-	unsigned int index,
-	unsigned int first,
-	unsigned int last,
-	int interpolation
-)
-{
-	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-	assert
-	(
-		interpolation == TGSI_INTERPOLATE_CONSTANT ||
-		interpolation == TGSI_INTERPOLATE_LINEAR ||
-		interpolation == TGSI_INTERPOLATE_PERSPECTIVE
-	);
-
-	decl.Declaration.File = TGSI_FILE_INPUT;
-	decl.Declaration.Semantic = 1;
-	decl.Semantic.SemanticName = name;
-	decl.Semantic.SemanticIndex = index;
-	decl.Declaration.Interpolate = interpolation;;
-	decl.DeclarationRange.First = first;
-	decl.DeclarationRange.Last = last;
-
-	return decl;
-}
-
-struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
-{
-	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-	decl.Declaration.File = TGSI_FILE_CONSTANT;
-	decl.Declaration.Semantic = 1;
-	decl.Semantic.SemanticName = name;
-	decl.Semantic.SemanticIndex = index;
-	decl.DeclarationRange.First = first;
-	decl.DeclarationRange.Last = last;
-
-	return decl;
-}
-
-struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last)
-{
-	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-	decl.Declaration.File = TGSI_FILE_OUTPUT;
-	decl.Declaration.Semantic = 1;
-	decl.Semantic.SemanticName = name;
-	decl.Semantic.SemanticIndex = index;
-	decl.DeclarationRange.First = first;
-	decl.DeclarationRange.Last = last;
-
-	return decl;
-}
-
-struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last)
-{
-	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-	decl = tgsi_default_full_declaration();
-	decl.Declaration.File = TGSI_FILE_TEMPORARY;
-	decl.DeclarationRange.First = first;
-	decl.DeclarationRange.Last = last;
-
-	return decl;
-}
-
-struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last)
-{
-	struct tgsi_full_declaration decl = tgsi_default_full_declaration();
-
-	decl = tgsi_default_full_declaration();
-	decl.Declaration.File = TGSI_FILE_SAMPLER;
-	decl.DeclarationRange.First = first;
-	decl.DeclarationRange.Last = last;
-
-	return decl;
-}
-
-struct tgsi_full_instruction vl_inst2
-(
-	int opcode,
-	enum tgsi_file_type dst_file,
-	unsigned int dst_index,
-	enum tgsi_file_type src_file,
-	unsigned int src_index
-)
-{
-	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-	inst.Instruction.Opcode = opcode;
-	inst.Instruction.NumDstRegs = 1;
-	inst.FullDstRegisters[0].DstRegister.File = dst_file;
-	inst.FullDstRegisters[0].DstRegister.Index = dst_index;
-	inst.Instruction.NumSrcRegs = 1;
-	inst.FullSrcRegisters[0].SrcRegister.File = src_file;
-	inst.FullSrcRegisters[0].SrcRegister.Index = src_index;
-
-	return inst;
-}
-
-struct tgsi_full_instruction vl_inst3
-(
-	int opcode,
-	enum tgsi_file_type dst_file,
-	unsigned int dst_index,
-	enum tgsi_file_type src1_file,
-	unsigned int src1_index,
-	enum tgsi_file_type src2_file,
-	unsigned int src2_index
-)
-{
-	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-	inst.Instruction.Opcode = opcode;
-	inst.Instruction.NumDstRegs = 1;
-	inst.FullDstRegisters[0].DstRegister.File = dst_file;
-	inst.FullDstRegisters[0].DstRegister.Index = dst_index;
-	inst.Instruction.NumSrcRegs = 2;
-	inst.FullSrcRegisters[0].SrcRegister.File = src1_file;
-	inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
-	inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
-	inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
-
-	return inst;
-}
-
-struct tgsi_full_instruction vl_tex
-(
-	int tex,
-	enum tgsi_file_type dst_file,
-	unsigned int dst_index,
-	enum tgsi_file_type src1_file,
-	unsigned int src1_index,
-	enum tgsi_file_type src2_file,
-	unsigned int src2_index
-)
-{
-	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-	inst.Instruction.Opcode = TGSI_OPCODE_TEX;
-	inst.Instruction.NumDstRegs = 1;
-	inst.FullDstRegisters[0].DstRegister.File = dst_file;
-	inst.FullDstRegisters[0].DstRegister.Index = dst_index;
-	inst.Instruction.NumSrcRegs = 2;
-	inst.InstructionExtTexture.Texture = tex;
-	inst.FullSrcRegisters[0].SrcRegister.File = src1_file;
-	inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
-	inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
-	inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
-
-	return inst;
-}
-
-struct tgsi_full_instruction vl_inst4
-(
-	int opcode,
-	enum tgsi_file_type dst_file,
-	unsigned int dst_index,
-	enum tgsi_file_type src1_file,
-	unsigned int src1_index,
-	enum tgsi_file_type src2_file,
-	unsigned int src2_index,
-	enum tgsi_file_type src3_file,
-	unsigned int src3_index
-)
-{
-	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-	inst.Instruction.Opcode = opcode;
-	inst.Instruction.NumDstRegs = 1;
-	inst.FullDstRegisters[0].DstRegister.File = dst_file;
-	inst.FullDstRegisters[0].DstRegister.Index = dst_index;
-	inst.Instruction.NumSrcRegs = 3;
-	inst.FullSrcRegisters[0].SrcRegister.File = src1_file;
-	inst.FullSrcRegisters[0].SrcRegister.Index = src1_index;
-	inst.FullSrcRegisters[1].SrcRegister.File = src2_file;
-	inst.FullSrcRegisters[1].SrcRegister.Index = src2_index;
-	inst.FullSrcRegisters[2].SrcRegister.File = src3_file;
-	inst.FullSrcRegisters[2].SrcRegister.Index = src3_index;
-
-	return inst;
-}
-
-struct tgsi_full_instruction vl_end(void)
-{
-	struct tgsi_full_instruction inst = tgsi_default_full_instruction();
-
-	inst.Instruction.Opcode = TGSI_OPCODE_END;
-	inst.Instruction.NumDstRegs = 0;
-	inst.Instruction.NumSrcRegs = 0;
-
-	return inst;
-}
diff --git a/src/gallium/state_trackers/g3dvl/vl_shader_build.h b/src/gallium/state_trackers/g3dvl/vl_shader_build.h
deleted file mode 100644
index dc615cb1566..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_shader_build.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef vl_shader_build_h
-#define vl_shader_build_h
-
-#include <pipe/p_shader_tokens.h>
-
-struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
-struct tgsi_full_declaration vl_decl_interpolated_input
-(
-	unsigned int name,
-	unsigned int index,
-	unsigned int first,
-	unsigned int last,
-	int interpolation
-);
-struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
-struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last);
-struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last);
-struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last);
-struct tgsi_full_instruction vl_inst2
-(
-	int opcode,
-	enum tgsi_file_type dst_file,
-	unsigned int dst_index,
-	enum tgsi_file_type src_file,
-	unsigned int src_index
-);
-struct tgsi_full_instruction vl_inst3
-(
-	int opcode,
-	enum tgsi_file_type dst_file,
-	unsigned int dst_index,
-	enum tgsi_file_type src1_file,
-	unsigned int src1_index,
-	enum tgsi_file_type src2_file,
-	unsigned int src2_index
-);
-struct tgsi_full_instruction vl_tex
-(
-	int tex,
-	enum tgsi_file_type dst_file,
-	unsigned int dst_index,
-	enum tgsi_file_type src1_file,
-	unsigned int src1_index,
-	enum tgsi_file_type src2_file,
-	unsigned int src2_index
-);
-struct tgsi_full_instruction vl_inst4
-(
-	int opcode,
-	enum tgsi_file_type dst_file,
-	unsigned int dst_index,
-	enum tgsi_file_type src1_file,
-	unsigned int src1_index,
-	enum tgsi_file_type src2_file,
-	unsigned int src2_index,
-	enum tgsi_file_type src3_file,
-	unsigned int src3_index
-);
-struct tgsi_full_instruction vl_end(void);
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_surface.c b/src/gallium/state_trackers/g3dvl/vl_surface.c
deleted file mode 100644
index 7f60852cae8..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_surface.c
+++ /dev/null
@@ -1,242 +0,0 @@
-#define VL_INTERNAL
-#include "vl_surface.h"
-#include <assert.h>
-#include <string.h>
-#include <pipe/p_screen.h>
-#include <pipe/p_state.h>
-#include <pipe/p_inlines.h>
-#include <util/u_memory.h>
-#include <vl_winsys.h>
-#include "vl_screen.h"
-#include "vl_context.h"
-#include "vl_render.h"
-#include "vl_csc.h"
-#include "vl_util.h"
-
-int vlCreateSurface
-(
-	struct vlScreen *screen,
-	unsigned int width,
-	unsigned int height,
-	enum vlFormat format,
-	struct vlSurface **surface
-)
-{
-	struct vlSurface	*sfc;
-	struct pipe_texture	template;
-
-	assert(screen);
-	assert(surface);
-
-	sfc = CALLOC_STRUCT(vlSurface);
-
-	if (!sfc)
-		return 1;
-
-	sfc->screen = screen;
-	sfc->width = width;
-	sfc->height = height;
-	sfc->format = format;
-
-	memset(&template, 0, sizeof(struct pipe_texture));
-	template.target = PIPE_TEXTURE_2D;
-	template.format = PIPE_FORMAT_A8R8G8B8_UNORM;
-	template.last_level = 0;
-	template.width[0] = vlRoundUpPOT(sfc->width);
-	template.height[0] = vlRoundUpPOT(sfc->height);
-	template.depth[0] = 1;
-	pf_get_block(template.format, &template.block);
-	template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_RENDER_TARGET;
-
-	sfc->texture = vlGetPipeScreen(screen)->texture_create(vlGetPipeScreen(screen), &template);
-
-	if (!sfc->texture)
-	{
-		FREE(sfc);
-		return 1;
-	}
-
-	*surface = sfc;
-
-	return 0;
-}
-
-int vlDestroySurface
-(
-	struct vlSurface *surface
-)
-{
-	assert(surface);
-
-	pipe_texture_reference(&surface->texture, NULL);
-	FREE(surface);
-
-	return 0;
-}
-
-int vlRenderMacroBlocksMpeg2
-(
-	struct vlMpeg2MacroBlockBatch *batch,
-	struct vlSurface *surface
-)
-{
-	assert(batch);
-	assert(surface);
-	assert(surface->context);
-
-	surface->context->render->vlBegin(surface->context->render);
-
-	surface->context->render->vlRenderMacroBlocksMpeg2
-	(
-		surface->context->render,
-		batch,
-		surface
-	);
-
-	surface->context->render->vlEnd(surface->context->render);
-
-	return 0;
-}
-
-int vlPutPicture
-(
-	struct vlSurface *surface,
-	vlNativeDrawable drawable,
-	int srcx,
-	int srcy,
-	int srcw,
-	int srch,
-	int destx,
-	int desty,
-	int destw,
-	int desth,
-	int drawable_w,
-	int drawable_h,
-	enum vlPictureType picture_type
-)
-{
-	struct vlCSC		*csc;
-	struct pipe_context	*pipe;
-
-	assert(surface);
-	assert(surface->context);
-
-	surface->context->render->vlFlush(surface->context->render);
-
-	csc = surface->context->csc;
-	pipe = surface->context->pipe;
-
-	csc->vlResizeFrameBuffer(csc, drawable_w, drawable_h);
-
-	csc->vlBegin(csc);
-
-	csc->vlPutPicture
-	(
-		csc,
-		surface,
-		srcx,
-		srcy,
-		srcw,
-		srch,
-		destx,
-		desty,
-		destw,
-		desth,
-		picture_type
-	);
-
-	csc->vlEnd(csc);
-
-	pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, &surface->disp_fence);
-
-	bind_pipe_drawable(pipe, drawable);
-
-	pipe->screen->flush_frontbuffer
-	(
-		pipe->screen,
-		csc->vlGetFrameBuffer(csc),
-		pipe->priv
-	);
-
-	return 0;
-}
-
-int vlSurfaceGetStatus
-(
-	struct vlSurface *surface,
-	enum vlResourceStatus *status
-)
-{
-	assert(surface);
-	assert(surface->context);
-	assert(status);
-
-	if (surface->render_fence && !surface->context->pipe->screen->fence_signalled(surface->context->pipe->screen, surface->render_fence, 0))
-	{
-		*status = vlResourceStatusRendering;
-		return 0;
-	}
-
-	if (surface->disp_fence && !surface->context->pipe->screen->fence_signalled(surface->context->pipe->screen, surface->disp_fence, 0))
-	{
-		*status = vlResourceStatusDisplaying;
-		return 0;
-	}
-
-	*status = vlResourceStatusFree;
-
-	return 0;
-}
-
-int vlSurfaceFlush
-(
-	struct vlSurface *surface
-)
-{
-	assert(surface);
-	assert(surface->context);
-
-	surface->context->render->vlFlush(surface->context->render);
-
-	return 0;
-}
-
-int vlSurfaceSync
-(
-	struct vlSurface *surface
-)
-{
-	assert(surface);
-	assert(surface->context);
-	assert(surface->render_fence);
-
-	surface->context->pipe->screen->fence_finish(surface->context->pipe->screen, surface->render_fence, 0);
-
-	return 0;
-}
-
-struct vlScreen* vlSurfaceGetScreen
-(
-	struct vlSurface *surface
-)
-{
-	assert(surface);
-
-	return surface->screen;
-}
-
-struct vlContext* vlBindToContext
-(
-	struct vlSurface *surface,
-	struct vlContext *context
-)
-{
-	struct vlContext *old;
-
-	assert(surface);
-
-	old = surface->context;
-	surface->context = context;
-
-	return old;
-}
diff --git a/src/gallium/state_trackers/g3dvl/vl_surface.h b/src/gallium/state_trackers/g3dvl/vl_surface.h
deleted file mode 100644
index 133e1515ef3..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_surface.h
+++ /dev/null
@@ -1,86 +0,0 @@
-#ifndef vl_surface_h
-#define vl_surface_h
-
-#include "vl_types.h"
-
-#ifdef VL_INTERNAL
-struct pipe_texture;
-
-struct vlSurface
-{
-	struct vlScreen			*screen;
-	struct vlContext		*context;
-	unsigned int			width;
-	unsigned int			height;
-	enum vlFormat			format;
-	struct pipe_texture		*texture;
-	struct pipe_fence_handle	*render_fence;
-	struct pipe_fence_handle	*disp_fence;
-};
-#endif
-
-int vlCreateSurface
-(
-	struct vlScreen *screen,
-	unsigned int width,
-	unsigned int height,
-	enum vlFormat format,
-	struct vlSurface **surface
-);
-
-int vlDestroySurface
-(
-	struct vlSurface *surface
-);
-
-int vlRenderMacroBlocksMpeg2
-(
-	struct vlMpeg2MacroBlockBatch *batch,
-	struct vlSurface *surface
-);
-
-int vlPutPicture
-(
-	struct vlSurface *surface,
-	vlNativeDrawable drawable,
-	int srcx,
-	int srcy,
-	int srcw,
-	int srch,
-	int destx,
-	int desty,
-	int destw,
-	int desth,
-	int drawable_w,
-	int drawable_h,
-	enum vlPictureType picture_type
-);
-
-int vlSurfaceGetStatus
-(
-	struct vlSurface *surface,
-	enum vlResourceStatus *status
-);
-
-int vlSurfaceFlush
-(
-	struct vlSurface *surface
-);
-
-int vlSurfaceSync
-(
-	struct vlSurface *surface
-);
-
-struct vlScreen* vlSurfaceGetScreen
-(
-	struct vlSurface *surface
-);
-
-struct vlContext* vlBindToContext
-(
-	struct vlSurface *surface,
-	struct vlContext *context
-);
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_types.h b/src/gallium/state_trackers/g3dvl/vl_types.h
deleted file mode 100644
index 274e1f74377..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_types.h
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef vl_types_h
-#define vl_types_h
-
-#if 1 /*#ifdef X11*/
-#include <X11/Xlib.h>
-
-typedef Display* vlNativeDisplay;
-typedef Drawable vlNativeDrawable;
-#endif
-
-struct vlDisplay;
-struct vlScreen;
-struct vlContext;
-struct vlSurface;
-
-enum vlResourceStatus
-{
-	vlResourceStatusFree,
-	vlResourceStatusRendering,
-	vlResourceStatusDisplaying
-};
-
-enum vlProfile
-{
-	vlProfileMpeg2Simple,
-	vlProfileMpeg2Main,
-
-	vlProfileCount
-};
-
-enum vlEntryPoint
-{
-	vlEntryPointIDCT,
-	vlEntryPointMC,
-	vlEntryPointCSC,
-
-	vlEntryPointCount
-};
-
-enum vlFormat
-{
-	vlFormatYCbCr420,
-	vlFormatYCbCr422,
-	vlFormatYCbCr444
-};
-
-enum vlPictureType
-{
-	vlPictureTypeTopField,
-	vlPictureTypeBottomField,
-	vlPictureTypeFrame
-};
-
-enum vlMotionType
-{
-	vlMotionTypeField,
-	vlMotionTypeFrame,
-	vlMotionTypeDualPrime,
-	vlMotionType16x8
-};
-
-enum vlFieldOrder
-{
-	vlFieldOrderFirst,
-	vlFieldOrderSecond
-};
-
-enum vlDCTType
-{
-	vlDCTTypeFrameCoded,
-	vlDCTTypeFieldCoded
-};
-
-struct vlVertex2f
-{
-	float x, y;
-};
-
-struct vlVertex4f
-{
-	float x, y, z, w;
-};
-
-enum vlMacroBlockType
-{
-	vlMacroBlockTypeIntra,
-	vlMacroBlockTypeFwdPredicted,
-	vlMacroBlockTypeBkwdPredicted,
-	vlMacroBlockTypeBiPredicted,
-
-	vlNumMacroBlockTypes
-};
-
-struct vlMpeg2MacroBlock
-{
-	unsigned int		mbx, mby;
-	enum vlMacroBlockType	mb_type;
-	enum vlMotionType	mo_type;
-	enum vlDCTType		dct_type;
-	int			PMV[2][2][2];
-	unsigned int		cbp;
-	short			*blocks;
-};
-
-struct vlMpeg2MacroBlockBatch
-{
-	struct vlSurface		*past_surface;
-	struct vlSurface		*future_surface;
-	enum vlPictureType		picture_type;
-	enum vlFieldOrder		field_order;
-	unsigned int			num_macroblocks;
-	struct vlMpeg2MacroBlock	*macroblocks;
-};
-
-#endif
diff --git a/src/gallium/state_trackers/g3dvl/vl_util.c b/src/gallium/state_trackers/g3dvl/vl_util.c
deleted file mode 100644
index 50aa9af66f2..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_util.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "vl_util.h"
-#include <assert.h>
-
-unsigned int vlRoundUpPOT(unsigned int x)
-{
-	unsigned int i;
-
-	assert(x > 0);
-
-	--x;
-
-	for (i = 1; i < sizeof(unsigned int) * 8; i <<= 1)
-		x |= x >> i;
-
-	return x + 1;
-}
diff --git a/src/gallium/state_trackers/g3dvl/vl_util.h b/src/gallium/state_trackers/g3dvl/vl_util.h
deleted file mode 100644
index bc98e79df47..00000000000
--- a/src/gallium/state_trackers/g3dvl/vl_util.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef vl_util_h
-#define vl_util_h
-
-unsigned int vlRoundUpPOT(unsigned int x);
-
-#endif
diff --git a/src/gallium/state_trackers/python/retrace/interpreter.py b/src/gallium/state_trackers/python/retrace/interpreter.py
index 6f0bd6ae52a..348f2e43683 100755
--- a/src/gallium/state_trackers/python/retrace/interpreter.py
+++ b/src/gallium/state_trackers/python/retrace/interpreter.py
@@ -314,7 +314,7 @@ class Screen(Object):
         if texture is None:
             return None
         transfer = Transfer(texture.get_surface(face, level, zslice), x, y, w, h)
-        if transfer and usage != gallium.PIPE_TRANSFER_WRITE:
+        if transfer and usage & gallium.PIPE_TRANSFER_READ:
             if self.interpreter.options.all:
                 self.interpreter.present(transfer.surface, 'transf_read', x, y, w, h)
         return transfer
@@ -459,7 +459,7 @@ class Context(Object):
         sys.stdout.flush()
 
     def set_constant_buffer(self, shader, index, buffer):
-        if buffer is not None:
+        if buffer is not None and buffer.buffer is not None:
             self.real.set_constant_buffer(shader, index, buffer.buffer)
 
             self.dump_constant_buffer(buffer.buffer)
diff --git a/src/gallium/state_trackers/python/st_softpipe_winsys.c b/src/gallium/state_trackers/python/st_softpipe_winsys.c
index f0a4826a001..f0abd12e3dc 100644
--- a/src/gallium/state_trackers/python/st_softpipe_winsys.c
+++ b/src/gallium/state_trackers/python/st_softpipe_winsys.c
@@ -172,6 +172,7 @@ st_softpipe_surface_buffer_create(struct pipe_winsys *winsys,
                                   unsigned width, unsigned height,
                                   enum pipe_format format,
                                   unsigned usage,
+                                  unsigned tex_usage,
                                   unsigned *stride)
 {
    const unsigned alignment = 64;
diff --git a/src/gallium/state_trackers/vega/arc.c b/src/gallium/state_trackers/vega/arc.c
index e74c7f03345..8b04d21ea76 100644
--- a/src/gallium/state_trackers/vega/arc.c
+++ b/src/gallium/state_trackers/vega/arc.c
@@ -528,6 +528,7 @@ static INLINE int num_beziers_needed(struct arc *arc)
    double threshold = 0.05;
    VGboolean found = VG_FALSE;
    int n = 1;
+   int i;
    double min_eta, max_eta;
 
    min_eta = MIN2(arc->eta1, arc->eta2);
@@ -538,7 +539,7 @@ static INLINE int num_beziers_needed(struct arc *arc)
       if (d_eta <= 0.5 * M_PI) {
          double eta_b = min_eta;
          found = VG_TRUE;
-         for (int i = 0; found && (i < n); ++i) {
+         for (i = 0; found && (i < n); ++i) {
             double etaA = eta_b;
             eta_b += d_eta;
             found = (estimate_error(arc, etaA, eta_b) <= threshold);
@@ -554,6 +555,7 @@ static void arc_to_beziers(struct arc *arc,
                            struct arc_cb cb,
                            struct matrix *matrix)
 {
+   int i;
    int n = 1;
    double d_eta, eta_b, cos_eta_b,
       sin_eta_b, a_cos_eta_b, b_sin_eta_b, a_sin_eta_b,
@@ -607,7 +609,7 @@ static void arc_to_beziers(struct arc *arc,
    t     = tan(0.5 * d_eta);
    alpha = sin(d_eta) * (sqrt(4 + 3 * t * t) - 1) / 3;
 
-   for (int i = 0; i < n; ++i) {
+   for (i = 0; i < n; ++i) {
       struct bezier bezier;
       double xA    = x_b;
       double yA    = y_b;
diff --git a/src/gallium/state_trackers/vega/bezier.c b/src/gallium/state_trackers/vega/bezier.c
index 39a7ade0161..0d5504004cc 100644
--- a/src/gallium/state_trackers/vega/bezier.c
+++ b/src/gallium/state_trackers/vega/bezier.c
@@ -255,7 +255,9 @@ static enum shift_result good_offset(const struct bezier *b1,
    const float max_dist_line = threshold*offset*offset;
    const float max_dist_normal = threshold*offset;
    const float spacing = 0.25;
-   for (float i = spacing; i < 0.99; i += spacing) {
+   float i;
+
+   for (i = spacing; i < 0.99; i += spacing) {
       float p1[2],p2[2], d, l;
       float normal[2];
       bezier_point_at(b1, i, p1);
@@ -330,6 +332,7 @@ static enum shift_result shift(const struct bezier *orig,
                                struct bezier *shifted,
                                float offset, float threshold)
 {
+   int i;
    int map[4];
    VGboolean p1_p2_equal = (orig->x1 == orig->x2 && orig->y1 == orig->y2);
    VGboolean p2_p3_equal = (orig->x2 == orig->x3 && orig->y2 == orig->y3);
@@ -404,7 +407,7 @@ static enum shift_result shift(const struct bezier *orig,
    points_shifted[0][0] = points[0][0] + offset * prev_normal[0];
    points_shifted[0][1] = points[0][1] + offset * prev_normal[1];
 
-   for (int i = 1; i < np - 1; ++i) {
+   for (i = 1; i < np - 1; ++i) {
       float normal_sum[2], r;
       float next_normal[2];
       compute_pt_normal(points[i], points[i + 1], next_normal);
diff --git a/src/gallium/state_trackers/vega/renderer.c b/src/gallium/state_trackers/vega/renderer.c
index f7c5f2f0cdf..396c88aa3d2 100644
--- a/src/gallium/state_trackers/vega/renderer.c
+++ b/src/gallium/state_trackers/vega/renderer.c
@@ -37,6 +37,7 @@
 #include "util/u_draw_quad.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_memory.h"
+#include "util/u_rect.h"
 
 #include "cso_cache/cso_context.h"
 
@@ -457,10 +458,17 @@ void renderer_copy_surface(struct renderer *ctx,
                                      PIPE_BUFFER_USAGE_GPU_WRITE);
 
    /* load temp texture */
-   pipe->surface_copy(pipe,
-                      texSurf, 0, 0,   /* dest */
-                      src, srcLeft, srcTop, /* src */
-                      srcW, srcH);     /* size */
+   if (pipe->surface_copy) {
+      pipe->surface_copy(pipe,
+                         texSurf, 0, 0,   /* dest */
+                         src, srcLeft, srcTop, /* src */
+                         srcW, srcH);     /* size */
+   } else {
+      util_surface_copy(pipe, FALSE,
+                        texSurf, 0, 0,   /* dest */
+                        src, srcLeft, srcTop, /* src */
+                        srcW, srcH);     /* size */
+   }
 
    /* free the surface, update the texture if necessary.*/
    screen->tex_surface_destroy(texSurf);
diff --git a/src/gallium/state_trackers/vega/st_inlines.h b/src/gallium/state_trackers/vega/st_inlines.h
index 1f331dfcdb7..610755e0636 100644
--- a/src/gallium/state_trackers/vega/st_inlines.h
+++ b/src/gallium/state_trackers/vega/st_inlines.h
@@ -57,8 +57,7 @@ st_cond_flush_get_tex_transfer(struct vg_context *st,
       pipe->is_texture_referenced(pipe, pt, face, level);
 
    if (referenced && ((referenced & PIPE_REFERENCED_FOR_WRITE) ||
-		      usage == PIPE_TRANSFER_WRITE ||
-		      usage == PIPE_TRANSFER_READ_WRITE))
+		      (usage & PIPE_TRANSFER_WRITE)))
       vgFlush();
 
    return screen->get_tex_transfer(screen, pt, face, level, zslice, usage,
diff --git a/src/gallium/state_trackers/vega/vg_context.c b/src/gallium/state_trackers/vega/vg_context.c
index e0ff02f3a99..00d23f5c227 100644
--- a/src/gallium/state_trackers/vega/vg_context.c
+++ b/src/gallium/state_trackers/vega/vg_context.c
@@ -231,6 +231,8 @@ static void update_clip_state(struct vg_context *ctx)
    if (state->scissoring) {
       struct pipe_blend_state *blend = &ctx->state.g3d.blend;
       struct pipe_framebuffer_state *fb = &ctx->state.g3d.fb;
+      int i;
+
       dsa->depth.writemask = 1;/*glDepthMask(TRUE);*/
       dsa->depth.func = PIPE_FUNC_ALWAYS;
       dsa->depth.enabled = 1;
@@ -254,7 +256,7 @@ static void update_clip_state(struct vg_context *ctx)
       cso_set_blend(ctx->cso_context, blend);
 
       /* enable scissoring */
-      for (int i = 0; i < state->scissor_rects_num; ++i) {
+      for (i = 0; i < state->scissor_rects_num; ++i) {
          const float x      = state->scissor_rects[i * 4 + 0].f;
          const float y      = state->scissor_rects[i * 4 + 1].f;
          const float width  = state->scissor_rects[i * 4 + 2].f;
diff --git a/src/gallium/state_trackers/vega/vg_tracker.c b/src/gallium/state_trackers/vega/vg_tracker.c
index 56cc60aebe1..c4da01e52cc 100644
--- a/src/gallium/state_trackers/vega/vg_tracker.c
+++ b/src/gallium/state_trackers/vega/vg_tracker.c
@@ -235,13 +235,23 @@ static void setup_new_alpha_mask(struct vg_context *ctx,
          old_texture,
          0, 0, 0,
          PIPE_BUFFER_USAGE_GPU_READ);
-      pipe->surface_copy(pipe,
-                         surface,
-                         0, 0,
-                         old_surface,
-                         0, 0,
-                         MIN2(old_surface->width, width),
-                         MIN2(old_surface->height, height));
+      if (pipe->surface_copy) {
+         pipe->surface_copy(pipe,
+                            surface,
+                            0, 0,
+                            old_surface,
+                            0, 0,
+                            MIN2(old_surface->width, width),
+                            MIN2(old_surface->height, height));
+      } else {
+         util_surface_copy(pipe, FALSE,
+                           surface,
+                           0, 0,
+                           old_surface,
+                           0, 0,
+                           MIN2(old_surface->width, width),
+                           MIN2(old_surface->height, height));
+      }
       if (surface)
          pipe_surface_reference(&surface, NULL);
       if (old_surface)
diff --git a/src/gallium/state_trackers/wgl/SConscript b/src/gallium/state_trackers/wgl/SConscript
index 69b88618ecb..b05944a33b3 100644
--- a/src/gallium/state_trackers/wgl/SConscript
+++ b/src/gallium/state_trackers/wgl/SConscript
@@ -18,20 +18,17 @@ if env['platform'] in ['windows']:
     ])
      
     sources = [
-        'icd/stw_icd.c',
-
-        'wgl/stw_wgl.c',
-
-        'shared/stw_context.c',
-        'shared/stw_device.c',
-        'shared/stw_framebuffer.c',
-        'shared/stw_pixelformat.c',
-        'shared/stw_extensionsstring.c',
-        'shared/stw_extswapinterval.c',
-        'shared/stw_getprocaddress.c',
-        'shared/stw_extgallium.c',
-        'shared/stw_arbpixelformat.c',
-        'shared/stw_tls.c',
+        'stw_context.c',
+        'stw_device.c',
+        'stw_ext_extensionsstring.c',
+        'stw_ext_gallium.c',
+        'stw_ext_pixelformat.c',
+        'stw_ext_swapinterval.c',
+        'stw_framebuffer.c',
+        'stw_getprocaddress.c',
+        'stw_pixelformat.c',
+        'stw_tls.c',
+        'stw_wgl.c',
     ]
 
     wgl = env.ConvenienceLibrary(
diff --git a/src/gallium/state_trackers/wgl/opengl32.def b/src/gallium/state_trackers/wgl/opengl32.def
index 596417ed844..5daa6ddd413 100644
--- a/src/gallium/state_trackers/wgl/opengl32.def
+++ b/src/gallium/state_trackers/wgl/opengl32.def
@@ -376,6 +376,7 @@ EXPORTS
 	DrvDescribePixelFormat
 	DrvGetLayerPaletteEntries
 	DrvGetProcAddress
+	DrvPresentBuffers
 	DrvRealizeLayerPalette
 	DrvReleaseContext
 	DrvSetCallbackProcs
diff --git a/src/gallium/state_trackers/wgl/opengl32.mingw.def b/src/gallium/state_trackers/wgl/opengl32.mingw.def
index 1f03ea3b375..6ebb31a6f1b 100644
--- a/src/gallium/state_trackers/wgl/opengl32.mingw.def
+++ b/src/gallium/state_trackers/wgl/opengl32.mingw.def
@@ -375,6 +375,7 @@ EXPORTS
 	DrvDescribePixelFormat = DrvDescribePixelFormat@16
 	DrvGetLayerPaletteEntries = DrvGetLayerPaletteEntries@20
 	DrvGetProcAddress = DrvGetProcAddress@4
+	DrvPresentBuffers = DrvPresentBuffers@8
 	DrvRealizeLayerPalette = DrvRealizeLayerPalette@12
 	DrvReleaseContext = DrvReleaseContext@4
 	DrvSetCallbackProcs = DrvSetCallbackProcs@8
diff --git a/src/gallium/state_trackers/wgl/shared/stw_context.c b/src/gallium/state_trackers/wgl/shared/stw_context.c
deleted file mode 100644
index 4968ecc692d..00000000000
--- a/src/gallium/state_trackers/wgl/shared/stw_context.c
+++ /dev/null
@@ -1,382 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include <windows.h>
-
-#include "main/mtypes.h"
-#include "main/context.h"
-#include "pipe/p_compiler.h"
-#include "pipe/p_context.h"
-#include "state_tracker/st_context.h"
-#include "state_tracker/st_public.h"
-
-#ifdef DEBUG
-#include "trace/tr_screen.h"
-#include "trace/tr_context.h"
-#endif
-
-#include "shared/stw_device.h"
-#include "shared/stw_winsys.h"
-#include "shared/stw_framebuffer.h"
-#include "shared/stw_pixelformat.h"
-#include "stw_public.h"
-#include "stw_context.h"
-#include "stw_tls.h"
-
-
-static INLINE struct stw_context *
-stw_context(GLcontext *glctx)
-{
-   if(!glctx)
-      return NULL;
-   assert(glctx->DriverCtx);
-   return (struct stw_context *)glctx->DriverCtx;
-}
-
-static INLINE struct stw_context *
-stw_current_context(void)
-{
-   /* We must check if multiple threads are being used or GET_CURRENT_CONTEXT 
-    * might return the current context of the thread first seen. */
-   _glapi_check_multithread();
-
-   {
-      GET_CURRENT_CONTEXT( glctx );
-      return stw_context(glctx);
-   }
-}
-
-BOOL
-stw_copy_context(
-   UINT_PTR hglrcSrc,
-   UINT_PTR hglrcDst,
-   UINT mask )
-{
-   struct stw_context *src;
-   struct stw_context *dst;
-   BOOL ret = FALSE;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   
-   src = stw_lookup_context_locked( hglrcSrc );
-   dst = stw_lookup_context_locked( hglrcDst );
-
-   if (src && dst) { 
-      /* FIXME */
-      assert(0);
-      (void) src;
-      (void) dst;
-      (void) mask;
-   }
-
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   
-   return ret;
-}
-
-BOOL
-stw_share_lists(
-   UINT_PTR hglrc1, 
-   UINT_PTR hglrc2 )
-{
-   struct stw_context *ctx1;
-   struct stw_context *ctx2;
-   BOOL ret = FALSE;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   
-   ctx1 = stw_lookup_context_locked( hglrc1 );
-   ctx2 = stw_lookup_context_locked( hglrc2 );
-
-   if (ctx1 && ctx2 &&
-       ctx1->iPixelFormat == ctx2->iPixelFormat) { 
-      ret = _mesa_share_state(ctx2->st->ctx, ctx1->st->ctx);
-   }
-
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   
-   return ret;
-}
-
-static void
-stw_viewport(GLcontext * glctx, GLint x, GLint y,
-             GLsizei width, GLsizei height)
-{
-   struct stw_context *ctx = (struct stw_context *)glctx->DriverCtx;
-   struct stw_framebuffer *fb;
-   
-   fb = stw_framebuffer_from_hdc( ctx->hdc );
-   if(fb) {
-      stw_framebuffer_update(fb);
-      stw_framebuffer_release(fb);
-   }
-}
-
-UINT_PTR
-stw_create_layer_context(
-   HDC hdc,
-   int iLayerPlane )
-{
-   int iPixelFormat;
-   const struct stw_pixelformat_info *pfi;
-   GLvisual visual;
-   struct stw_context *ctx = NULL;
-   struct pipe_screen *screen = NULL;
-   struct pipe_context *pipe = NULL;
-   
-   if(!stw_dev)
-      return 0;
-   
-   if (iLayerPlane != 0)
-      return 0;
-
-   iPixelFormat = GetPixelFormat(hdc);
-   if(!iPixelFormat)
-      return 0;
-   
-   pfi = stw_pixelformat_get_info( iPixelFormat - 1 );
-   stw_pixelformat_visual(&visual, pfi);
-   
-   ctx = CALLOC_STRUCT( stw_context );
-   if (ctx == NULL)
-      goto no_ctx;
-
-   ctx->hdc = hdc;
-   ctx->iPixelFormat = iPixelFormat;
-
-   screen = stw_dev->screen;
-
-#ifdef DEBUG
-   /* Unwrap screen */
-   if(stw_dev->trace_running)
-      screen = trace_screen(screen)->screen;
-#endif
-
-   pipe = stw_dev->stw_winsys->create_context( screen );
-   if (pipe == NULL) 
-      goto no_pipe;
-
-#ifdef DEBUG
-   /* Wrap context */
-   if(stw_dev->trace_running)
-      pipe = trace_context_create(stw_dev->screen, pipe);
-#endif
-
-   /* pass to stw_flush_frontbuffer as context_private */
-   assert(!pipe->priv);
-   pipe->priv = hdc;
-
-   ctx->st = st_create_context( pipe, &visual, NULL );
-   if (ctx->st == NULL) 
-      goto no_st_ctx;
-
-   ctx->st->ctx->DriverCtx = ctx;
-   ctx->st->ctx->Driver.Viewport = stw_viewport;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   ctx->hglrc = handle_table_add(stw_dev->ctx_table, ctx);
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   if (!ctx->hglrc)
-      goto no_hglrc;
-
-   return ctx->hglrc;
-
-no_hglrc:
-   st_destroy_context(ctx->st);
-   goto no_pipe; /* st_context_destroy already destroys pipe */
-no_st_ctx:
-   pipe->destroy( pipe );
-no_pipe:
-   FREE(ctx);
-no_ctx:
-   return 0;
-}
-
-BOOL
-stw_delete_context(
-   UINT_PTR hglrc )
-{
-   struct stw_context *ctx ;
-   BOOL ret = FALSE;
-   
-   if (!stw_dev)
-      return FALSE;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   ctx = stw_lookup_context_locked(hglrc);
-   handle_table_remove(stw_dev->ctx_table, hglrc);
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-
-   if (ctx) {
-      struct stw_context *curctx = stw_current_context();
-      
-      /* Unbind current if deleting current context. */
-      if (curctx == ctx)
-         st_make_current( NULL, NULL, NULL );
-
-      st_destroy_context(ctx->st);
-      FREE(ctx);
-
-      ret = TRUE;
-   }
-
-   return ret;
-}
-
-BOOL
-stw_release_context(
-   UINT_PTR hglrc )
-{
-   struct stw_context *ctx;
-
-   if (!stw_dev)
-      return FALSE;
-
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   ctx = stw_lookup_context_locked( hglrc );
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-
-   if (!ctx)
-      return FALSE;
-   
-   /* The expectation is that ctx is the same context which is
-    * current for this thread.  We should check that and return False
-    * if not the case.
-    */
-   if (ctx != stw_current_context())
-      return FALSE;
-
-   if (stw_make_current( NULL, 0 ) == FALSE)
-      return FALSE;
-
-   return TRUE;
-}
-
-
-UINT_PTR
-stw_get_current_context( void )
-{
-   struct stw_context *ctx;
-
-   ctx = stw_current_context();
-   if(!ctx)
-      return 0;
-   
-   return ctx->hglrc;
-}
-
-HDC
-stw_get_current_dc( void )
-{
-   struct stw_context *ctx;
-
-   ctx = stw_current_context();
-   if(!ctx)
-      return NULL;
-   
-   return ctx->hdc;
-}
-
-BOOL
-stw_make_current(
-   HDC hdc,
-   UINT_PTR hglrc )
-{
-   struct stw_context *curctx = NULL;
-   struct stw_context *ctx = NULL;
-   struct stw_framebuffer *fb = NULL;
-
-   if (!stw_dev)
-      goto fail;
-
-   curctx = stw_current_context();
-   if (curctx != NULL) {
-      if (curctx->hglrc != hglrc)
-	 st_flush(curctx->st, PIPE_FLUSH_RENDER_CACHE, NULL);
-      
-      /* Return if already current. */
-      if (curctx->hglrc == hglrc && curctx->hdc == hdc) {
-         ctx = curctx;
-         fb = stw_framebuffer_from_hdc( hdc );
-         goto success;
-      }
-   }
-
-   if (hdc == NULL || hglrc == 0) {
-      return st_make_current( NULL, NULL, NULL );
-   }
-
-   pipe_mutex_lock( stw_dev->ctx_mutex ); 
-   ctx = stw_lookup_context_locked( hglrc );
-   pipe_mutex_unlock( stw_dev->ctx_mutex ); 
-   if(!ctx)
-      goto fail;
-
-   fb = stw_framebuffer_from_hdc( hdc );
-   if(!fb) { 
-      /* Applications should call SetPixelFormat before creating a context,
-       * but not all do, and the opengl32 runtime seems to use a default pixel
-       * format in some cases, so we must create a framebuffer for those here
-       */
-      int iPixelFormat = GetPixelFormat(hdc);
-      if(iPixelFormat)
-         fb = stw_framebuffer_create( hdc, iPixelFormat );
-      if(!fb) 
-         goto fail;
-   }
-   
-   if(fb->iPixelFormat != ctx->iPixelFormat)
-      goto fail;
-
-   /* Lazy allocation of the frame buffer */
-   if(!stw_framebuffer_allocate(fb))
-      goto fail;
-
-   /* Bind the new framebuffer */
-   ctx->hdc = hdc;
-   
-   /* pass to stw_flush_frontbuffer as context_private */
-   ctx->st->pipe->priv = hdc;
-   
-   if(!st_make_current( ctx->st, fb->stfb, fb->stfb ))
-      goto fail;
-
-success:
-   assert(fb);
-   if(fb) {
-      stw_framebuffer_update(fb);
-      stw_framebuffer_release(fb);
-   }
-   
-   return TRUE;
-
-fail:
-   if(fb)
-      stw_framebuffer_release(fb);
-   st_make_current( NULL, NULL, NULL );
-   return FALSE;
-}
diff --git a/src/gallium/state_trackers/wgl/icd/stw_icd.c b/src/gallium/state_trackers/wgl/stw_context.c
index 347f40aa06b..f2f0264844a 100644
--- a/src/gallium/state_trackers/wgl/icd/stw_icd.c
+++ b/src/gallium/state_trackers/wgl/stw_context.c
@@ -26,18 +26,49 @@
  **************************************************************************/
 
 #include <windows.h>
-#include <stdio.h>
 
-#include "GL/gl.h"
-
-#include "util/u_debug.h"
-#include "pipe/p_thread.h"
-
-#include "shared/stw_public.h"
-#include "icd/stw_icd.h"
+#include "main/mtypes.h"
+#include "main/context.h"
+#include "pipe/p_compiler.h"
+#include "pipe/p_context.h"
+#include "state_tracker/st_context.h"
+#include "state_tracker/st_public.h"
+
+#ifdef DEBUG
+#include "trace/tr_screen.h"
+#include "trace/tr_context.h"
+#endif
+
+#include "stw_icd.h"
+#include "stw_device.h"
+#include "stw_winsys.h"
+#include "stw_framebuffer.h"
+#include "stw_pixelformat.h"
+#include "stw_context.h"
+#include "stw_tls.h"
+
+
+static INLINE struct stw_context *
+stw_context(GLcontext *glctx)
+{
+   if(!glctx)
+      return NULL;
+   assert(glctx->DriverCtx);
+   return (struct stw_context *)glctx->DriverCtx;
+}
 
-#define DBG 0
+static INLINE struct stw_context *
+stw_current_context(void)
+{
+   /* We must check if multiple threads are being used or GET_CURRENT_CONTEXT 
+    * might return the current context of the thread first seen. */
+   _glapi_check_multithread();
 
+   {
+      GET_CURRENT_CONTEXT( glctx );
+      return stw_context(glctx);
+   }
+}
 
 BOOL APIENTRY
 DrvCopyContext(
@@ -45,24 +76,64 @@ DrvCopyContext(
    DHGLRC dhrcDest,
    UINT fuMask )
 {
-   return stw_copy_context(dhrcSource, dhrcDest, fuMask);
-}
+   struct stw_context *src;
+   struct stw_context *dst;
+   BOOL ret = FALSE;
 
+   pipe_mutex_lock( stw_dev->ctx_mutex );
+   
+   src = stw_lookup_context_locked( dhrcSource );
+   dst = stw_lookup_context_locked( dhrcDest );
+
+   if (src && dst) { 
+      /* FIXME */
+      assert(0);
+      (void) src;
+      (void) dst;
+      (void) fuMask;
+   }
 
-DHGLRC APIENTRY
-DrvCreateLayerContext(
-   HDC hdc,
-   INT iLayerPlane )
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   
+   return ret;
+}
+
+BOOL APIENTRY
+DrvShareLists(
+   DHGLRC dhglrc1,
+   DHGLRC dhglrc2 )
 {
-   DHGLRC r;
+   struct stw_context *ctx1;
+   struct stw_context *ctx2;
+   BOOL ret = FALSE;
+
+   pipe_mutex_lock( stw_dev->ctx_mutex );
    
-   r = stw_create_layer_context( hdc, iLayerPlane );
+   ctx1 = stw_lookup_context_locked( dhglrc1 );
+   ctx2 = stw_lookup_context_locked( dhglrc2 );
+
+   if (ctx1 && ctx2 &&
+       ctx1->iPixelFormat == ctx2->iPixelFormat) { 
+      ret = _mesa_share_state(ctx2->st->ctx, ctx1->st->ctx);
+   }
+
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
    
-   if (DBG)
-      debug_printf( "%s( %p, %i ) = %u\n",
-                    __FUNCTION__, hdc, iLayerPlane, r );
+   return ret;
+}
+
+static void
+stw_viewport(GLcontext * glctx, GLint x, GLint y,
+             GLsizei width, GLsizei height)
+{
+   struct stw_context *ctx = (struct stw_context *)glctx->DriverCtx;
+   struct stw_framebuffer *fb;
    
-   return r;
+   fb = stw_framebuffer_from_hdc( ctx->hdc );
+   if(fb) {
+      stw_framebuffer_update(fb);
+      stw_framebuffer_release(fb);
+   }
 }
 
 DHGLRC APIENTRY
@@ -72,114 +143,253 @@ DrvCreateContext(
    return DrvCreateLayerContext( hdc, 0 );
 }
 
-BOOL APIENTRY
-DrvDeleteContext(
-   DHGLRC dhglrc )
+DHGLRC APIENTRY
+DrvCreateLayerContext(
+   HDC hdc,
+   INT iLayerPlane )
 {
-   BOOL r;
+   int iPixelFormat;
+   const struct stw_pixelformat_info *pfi;
+   GLvisual visual;
+   struct stw_context *ctx = NULL;
+   struct pipe_screen *screen = NULL;
+   struct pipe_context *pipe = NULL;
    
-   r = stw_delete_context( dhglrc );
+   if(!stw_dev)
+      return 0;
    
-   if (DBG)
-      debug_printf( "%s( %u ) = %u\n",
-                    __FUNCTION__, dhglrc, r );
+   if (iLayerPlane != 0)
+      return 0;
+
+   iPixelFormat = GetPixelFormat(hdc);
+   if(!iPixelFormat)
+      return 0;
    
-   return r;
+   pfi = stw_pixelformat_get_info( iPixelFormat - 1 );
+   stw_pixelformat_visual(&visual, pfi);
+   
+   ctx = CALLOC_STRUCT( stw_context );
+   if (ctx == NULL)
+      goto no_ctx;
+
+   ctx->hdc = hdc;
+   ctx->iPixelFormat = iPixelFormat;
+
+   screen = stw_dev->screen;
+
+#ifdef DEBUG
+   /* Unwrap screen */
+   if(stw_dev->trace_running)
+      screen = trace_screen(screen)->screen;
+#endif
+
+   pipe = stw_dev->stw_winsys->create_context( screen );
+   if (pipe == NULL) 
+      goto no_pipe;
+
+#ifdef DEBUG
+   /* Wrap context */
+   if(stw_dev->trace_running)
+      pipe = trace_context_create(stw_dev->screen, pipe);
+#endif
+
+   /* pass to stw_flush_frontbuffer as context_private */
+   assert(!pipe->priv);
+   pipe->priv = hdc;
+
+   ctx->st = st_create_context( pipe, &visual, NULL );
+   if (ctx->st == NULL) 
+      goto no_st_ctx;
+
+   ctx->st->ctx->DriverCtx = ctx;
+   ctx->st->ctx->Driver.Viewport = stw_viewport;
+
+   pipe_mutex_lock( stw_dev->ctx_mutex );
+   ctx->dhglrc = handle_table_add(stw_dev->ctx_table, ctx);
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   if (!ctx->dhglrc)
+      goto no_hglrc;
+
+   return ctx->dhglrc;
+
+no_hglrc:
+   st_destroy_context(ctx->st);
+   goto no_pipe; /* st_context_destroy already destroys pipe */
+no_st_ctx:
+   pipe->destroy( pipe );
+no_pipe:
+   FREE(ctx);
+no_ctx:
+   return 0;
 }
 
 BOOL APIENTRY
-DrvDescribeLayerPlane(
-   HDC hdc,
-   INT iPixelFormat,
-   INT iLayerPlane,
-   UINT nBytes,
-   LPLAYERPLANEDESCRIPTOR plpd )
+DrvDeleteContext(
+   DHGLRC dhglrc )
 {
-   if (DBG) 
-      debug_printf( "%s\n", __FUNCTION__ );
+   struct stw_context *ctx ;
+   BOOL ret = FALSE;
+   
+   if (!stw_dev)
+      return FALSE;
 
-   return FALSE;
-}
+   pipe_mutex_lock( stw_dev->ctx_mutex );
+   ctx = stw_lookup_context_locked(dhglrc);
+   handle_table_remove(stw_dev->ctx_table, dhglrc);
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
 
-LONG APIENTRY
-DrvDescribePixelFormat(
-   HDC hdc,
-   INT iPixelFormat,
-   ULONG cjpfd,
-   PIXELFORMATDESCRIPTOR *ppfd )
-{
-   LONG r;
+   if (ctx) {
+      struct stw_context *curctx = stw_current_context();
+      
+      /* Unbind current if deleting current context. */
+      if (curctx == ctx)
+         st_make_current( NULL, NULL, NULL );
 
-   r = stw_pixelformat_describe( hdc, iPixelFormat, cjpfd, ppfd );
+      st_destroy_context(ctx->st);
+      FREE(ctx);
 
-   if (DBG)
-      debug_printf( "%s( %p, %d, %u, %p ) = %d\n",
-                    __FUNCTION__, hdc, iPixelFormat, cjpfd, ppfd, r );
+      ret = TRUE;
+   }
 
-   return r;
+   return ret;
 }
 
-int APIENTRY
-DrvGetLayerPaletteEntries(
-   HDC hdc,
-   INT iLayerPlane,
-   INT iStart,
-   INT cEntries,
-   COLORREF *pcr )
+BOOL APIENTRY
+DrvReleaseContext(
+   DHGLRC dhglrc )
 {
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
+   struct stw_context *ctx;
 
-   return 0;
-}
+   if (!stw_dev)
+      return FALSE;
 
-PROC APIENTRY
-DrvGetProcAddress(
-   LPCSTR lpszProc )
-{
-   PROC r;
+   pipe_mutex_lock( stw_dev->ctx_mutex );
+   ctx = stw_lookup_context_locked( dhglrc );
+   pipe_mutex_unlock( stw_dev->ctx_mutex );
 
-   r = stw_get_proc_address( lpszProc );
+   if (!ctx)
+      return FALSE;
+   
+   /* The expectation is that ctx is the same context which is
+    * current for this thread.  We should check that and return False
+    * if not the case.
+    */
+   if (ctx != stw_current_context())
+      return FALSE;
 
-   if (DBG)
-      debug_printf( "%s( \"%s\" ) = %p\n", __FUNCTION__, lpszProc, r );
+   if (stw_make_current( NULL, 0 ) == FALSE)
+      return FALSE;
 
-   return r;
+   return TRUE;
 }
 
-BOOL APIENTRY
-DrvRealizeLayerPalette(
-   HDC hdc,
-   INT iLayerPlane,
-   BOOL bRealize )
+
+DHGLRC
+stw_get_current_context( void )
 {
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
+   struct stw_context *ctx;
 
-   return FALSE;
+   ctx = stw_current_context();
+   if(!ctx)
+      return 0;
+   
+   return ctx->dhglrc;
 }
 
-BOOL APIENTRY
-DrvReleaseContext(
-   DHGLRC dhglrc )
+HDC
+stw_get_current_dc( void )
 {
-   return stw_release_context(dhglrc);
+   struct stw_context *ctx;
+
+   ctx = stw_current_context();
+   if(!ctx)
+      return NULL;
+   
+   return ctx->hdc;
 }
 
-void APIENTRY
-DrvSetCallbackProcs(
-   INT nProcs,
-   PROC *pProcs )
+BOOL
+stw_make_current(
+   HDC hdc,
+   DHGLRC dhglrc )
 {
-   if (DBG)
-      debug_printf( "%s( %d, %p )\n", __FUNCTION__, nProcs, pProcs );
+   struct stw_context *curctx = NULL;
+   struct stw_context *ctx = NULL;
+   struct stw_framebuffer *fb = NULL;
 
-   return;
-}
+   if (!stw_dev)
+      goto fail;
+
+   curctx = stw_current_context();
+   if (curctx != NULL) {
+      if (curctx->dhglrc != dhglrc)
+	 st_flush(curctx->st, PIPE_FLUSH_RENDER_CACHE, NULL);
+      
+      /* Return if already current. */
+      if (curctx->dhglrc == dhglrc && curctx->hdc == hdc) {
+         ctx = curctx;
+         fb = stw_framebuffer_from_hdc( hdc );
+         goto success;
+      }
+   }
+
+   if (hdc == NULL || dhglrc == 0) {
+      return st_make_current( NULL, NULL, NULL );
+   }
+
+   pipe_mutex_lock( stw_dev->ctx_mutex ); 
+   ctx = stw_lookup_context_locked( dhglrc );
+   pipe_mutex_unlock( stw_dev->ctx_mutex ); 
+   if(!ctx)
+      goto fail;
+
+   fb = stw_framebuffer_from_hdc( hdc );
+   if(!fb) { 
+      /* Applications should call SetPixelFormat before creating a context,
+       * but not all do, and the opengl32 runtime seems to use a default pixel
+       * format in some cases, so we must create a framebuffer for those here
+       */
+      int iPixelFormat = GetPixelFormat(hdc);
+      if(iPixelFormat)
+         fb = stw_framebuffer_create( hdc, iPixelFormat );
+      if(!fb) 
+         goto fail;
+   }
+   
+   if(fb->iPixelFormat != ctx->iPixelFormat)
+      goto fail;
+
+   /* Lazy allocation of the frame buffer */
+   if(!stw_framebuffer_allocate(fb))
+      goto fail;
+
+   /* Bind the new framebuffer */
+   ctx->hdc = hdc;
+   
+   /* pass to stw_flush_frontbuffer as context_private */
+   ctx->st->pipe->priv = hdc;
+   
+   if(!st_make_current( ctx->st, fb->stfb, fb->stfb ))
+      goto fail;
+
+success:
+   assert(fb);
+   if(fb) {
+      stw_framebuffer_update(fb);
+      stw_framebuffer_release(fb);
+   }
+   
+   return TRUE;
 
+fail:
+   if(fb)
+      stw_framebuffer_release(fb);
+   st_make_current( NULL, NULL, NULL );
+   return FALSE;
+}
 
 /**
- * Although WGL allows different dispatch entrypoints per context 
+ * Although WGL allows different dispatch entrypoints per context
  */
 static const GLCLTPROCTABLE cpt =
 {
@@ -524,7 +734,6 @@ static const GLCLTPROCTABLE cpt =
    }
 };
 
-
 PGLCLTPROCTABLE APIENTRY
 DrvSetContext(
    HDC hdc,
@@ -532,86 +741,9 @@ DrvSetContext(
    PFN_SETPROCTABLE pfnSetProcTable )
 {
    PGLCLTPROCTABLE r = (PGLCLTPROCTABLE)&cpt;
-   
+
    if (!stw_make_current( hdc, dhglrc ))
       r = NULL;
-      
-   if (DBG)
-      debug_printf( "%s( 0x%p, %u, 0x%p ) = %p\n", 
-                    __FUNCTION__, hdc, dhglrc, pfnSetProcTable, r );
-
-   return r;
-}
-
-int APIENTRY
-DrvSetLayerPaletteEntries(
-   HDC hdc,
-   INT iLayerPlane,
-   INT iStart,
-   INT cEntries,
-   CONST COLORREF *pcr )
-{
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
-
-   return 0;
-}
-
-BOOL APIENTRY
-DrvSetPixelFormat(
-   HDC hdc,
-   LONG iPixelFormat )
-{
-   BOOL r;
-
-   r = stw_pixelformat_set( hdc, iPixelFormat );
-
-   if (DBG)
-      debug_printf( "%s( %p, %d ) = %s\n", __FUNCTION__, hdc, iPixelFormat, r ? "TRUE" : "FALSE" );
 
    return r;
 }
-
-BOOL APIENTRY
-DrvShareLists(
-   DHGLRC dhglrc1,
-   DHGLRC dhglrc2 )
-{
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
-
-   return stw_share_lists(dhglrc1, dhglrc2);
-}
-
-BOOL APIENTRY
-DrvSwapBuffers(
-   HDC hdc )
-{
-   if (DBG)
-      debug_printf( "%s( %p )\n", __FUNCTION__, hdc );
-
-   return stw_swap_buffers( hdc );
-}
-
-BOOL APIENTRY
-DrvSwapLayerBuffers(
-   HDC hdc,
-   UINT fuPlanes )
-{
-   if (DBG)
-      debug_printf( "%s\n", __FUNCTION__ );
-
-   return stw_swap_layer_buffers( hdc, fuPlanes );
-}
-
-BOOL APIENTRY
-DrvValidateVersion(
-   ULONG ulVersion )
-{
-   if (DBG)
-      debug_printf( "%s( %u )\n", __FUNCTION__, ulVersion );
-
-   /* TODO: get the expected version from the winsys */
-   
-   return ulVersion == 1;
-}
diff --git a/src/gallium/state_trackers/wgl/shared/stw_context.h b/src/gallium/state_trackers/wgl/stw_context.h
index 166471de5eb..256c27e21ef 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_context.h
+++ b/src/gallium/state_trackers/wgl/stw_context.h
@@ -35,9 +35,15 @@ struct st_context;
 struct stw_context
 {
    struct st_context *st;
-   UINT_PTR hglrc;
+   DHGLRC dhglrc;
    int iPixelFormat;
    HDC hdc;
 };
 
+DHGLRC stw_get_current_context( void );
+
+HDC stw_get_current_dc( void );
+
+BOOL stw_make_current( HDC hdc, DHGLRC dhglrc );
+
 #endif /* STW_CONTEXT_H */
diff --git a/src/gallium/state_trackers/wgl/shared/stw_device.c b/src/gallium/state_trackers/wgl/stw_device.c
index 0b6954915a6..7785aba4677 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_device.c
+++ b/src/gallium/state_trackers/wgl/stw_device.c
@@ -29,6 +29,7 @@
 
 #include "glapi/glthread.h"
 #include "util/u_debug.h"
+#include "util/u_math.h"
 #include "pipe/p_screen.h"
 #include "state_tracker/st_public.h"
 
@@ -37,12 +38,12 @@
 #include "trace/tr_texture.h"
 #endif
 
-#include "shared/stw_device.h"
-#include "shared/stw_winsys.h"
-#include "shared/stw_pixelformat.h"
-#include "shared/stw_public.h"
-#include "shared/stw_tls.h"
-#include "shared/stw_framebuffer.h"
+#include "stw_device.h"
+#include "stw_winsys.h"
+#include "stw_pixelformat.h"
+#include "stw_icd.h"
+#include "stw_tls.h"
+#include "stw_framebuffer.h"
 
 #ifdef WIN32_THREADS
 extern _glthread_Mutex OneTimeLock;
@@ -62,38 +63,16 @@ stw_flush_frontbuffer(struct pipe_screen *screen,
                      struct pipe_surface *surface,
                      void *context_private )
 {
-   const struct stw_winsys *stw_winsys = stw_dev->stw_winsys;
    HDC hdc = (HDC)context_private;
    struct stw_framebuffer *fb;
    
    fb = stw_framebuffer_from_hdc( hdc );
-   /* fb can be NULL if window was destroyed already */
-   if (fb) {
-#if DEBUG
-      {
-         struct pipe_surface *surface2;
-   
-         if(!st_get_framebuffer_surface( fb->stfb, ST_SURFACE_FRONT_LEFT, &surface2 ))
-            assert(0);
-         else
-            assert(surface2 == surface);
-      }
-#endif
-
-#ifdef DEBUG
-      if(stw_dev->trace_running) {
-         screen = trace_screen(screen)->screen;
-         surface = trace_surface(surface)->surface;
-      }
-#endif
-   }
-   
-   stw_winsys->flush_frontbuffer(screen, surface, hdc);
-   
-   if(fb) {
-      stw_framebuffer_update(fb);
-      stw_framebuffer_release(fb);
+   if (!fb) {
+      /* fb can be NULL if window was destroyed already */
+      return;
    }
+
+   stw_framebuffer_present_locked(hdc, fb, surface);
 }
 
 
@@ -126,6 +105,9 @@ stw_init(const struct stw_winsys *stw_winsys)
    if(!screen)
       goto error1;
 
+   if(stw_winsys->get_adapter_luid)
+      stw_winsys->get_adapter_luid(screen, &stw_dev->AdapterLuid);
+
 #ifdef DEBUG
    stw_dev->screen = trace_screen_create(screen);
    stw_dev->trace_running = stw_dev->screen != screen ? TRUE : FALSE;
@@ -182,7 +164,7 @@ stw_cleanup(void)
       /* Ensure all contexts are destroyed */
       i = handle_table_get_first_handle(stw_dev->ctx_table);
       while (i) {
-         stw_delete_context(i);
+         DrvDeleteContext(i);
          i = handle_table_get_next_handle(stw_dev->ctx_table, i);
       }
       handle_table_destroy(stw_dev->ctx_table);
@@ -212,7 +194,7 @@ stw_cleanup(void)
 
 
 struct stw_context *
-stw_lookup_context_locked( UINT_PTR dhglrc )
+stw_lookup_context_locked( DHGLRC dhglrc )
 {
    if (dhglrc == 0)
       return NULL;
@@ -223,3 +205,28 @@ stw_lookup_context_locked( UINT_PTR dhglrc )
    return (struct stw_context *) handle_table_get(stw_dev->ctx_table, dhglrc);
 }
 
+
+void APIENTRY
+DrvSetCallbackProcs(
+   INT nProcs,
+   PROC *pProcs )
+{
+   size_t size;
+
+   if (stw_dev == NULL)
+      return;
+
+   size = MIN2(nProcs * sizeof *pProcs, sizeof stw_dev->callbacks);
+   memcpy(&stw_dev->callbacks, pProcs, size);
+
+   return;
+}
+
+
+BOOL APIENTRY
+DrvValidateVersion(
+   ULONG ulVersion )
+{
+   /* TODO: get the expected version from the winsys */
+   return ulVersion == 1;
+}
diff --git a/src/gallium/state_trackers/wgl/shared/stw_device.h b/src/gallium/state_trackers/wgl/stw_device.h
index e1bb9518dd1..0bf3b0da825 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_device.h
+++ b/src/gallium/state_trackers/wgl/stw_device.h
@@ -29,11 +29,10 @@
 #define STW_DEVICE_H_
 
 
-#include <windows.h>
-
 #include "pipe/p_compiler.h"
 #include "pipe/p_thread.h"
 #include "util/u_handle_table.h"
+#include "stw_icd.h"
 #include "stw_pixelformat.h"
 
 
@@ -53,10 +52,14 @@ struct stw_device
    boolean trace_running;
 #endif
 
+   LUID AdapterLuid;
+
    struct stw_pixelformat_info pixelformats[STW_MAX_PIXELFORMATS];
    unsigned pixelformat_count;
    unsigned pixelformat_extended_count;
 
+   GLCALLBACKTABLE callbacks;
+
    pipe_mutex ctx_mutex;
    struct handle_table *ctx_table;
    
@@ -69,7 +72,7 @@ struct stw_device
 };
 
 struct stw_context *
-stw_lookup_context_locked( UINT_PTR hglrc );
+stw_lookup_context_locked( DHGLRC hglrc );
 
 extern struct stw_device *stw_dev;
 
diff --git a/src/gallium/state_trackers/wgl/shared/stw_extensionsstring.c b/src/gallium/state_trackers/wgl/stw_ext_extensionsstring.c
index 62c859e1f92..62c859e1f92 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_extensionsstring.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_extensionsstring.c
diff --git a/src/gallium/state_trackers/wgl/shared/stw_extgallium.c b/src/gallium/state_trackers/wgl/stw_ext_gallium.c
index fc22737d7e3..fb30ec5dba9 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_extgallium.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_gallium.c
@@ -27,9 +27,9 @@
 
 
 #include "pipe/p_screen.h"
-#include "stw_public.h"
 #include "stw_device.h"
 #include "stw_winsys.h"
+#include "stw_ext_gallium.h"
 
 #ifdef DEBUG
 #include "trace/tr_screen.h"
diff --git a/src/gallium/state_trackers/wgl/shared/stw_extgallium.h b/src/gallium/state_trackers/wgl/stw_ext_gallium.h
index cc35f2bb7fe..cc35f2bb7fe 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_extgallium.h
+++ b/src/gallium/state_trackers/wgl/stw_ext_gallium.h
diff --git a/src/gallium/state_trackers/wgl/shared/stw_arbpixelformat.c b/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
index 0e2d4076993..8a9995aba8e 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_arbpixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
@@ -43,7 +43,6 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
-#include "stw_public.h"
 #include "stw_pixelformat.h"
 
 
diff --git a/src/gallium/state_trackers/wgl/shared/stw_extswapinterval.c b/src/gallium/state_trackers/wgl/stw_ext_swapinterval.c
index 9eac6a1d09d..9eac6a1d09d 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_extswapinterval.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_swapinterval.c
diff --git a/src/gallium/state_trackers/wgl/shared/stw_framebuffer.c b/src/gallium/state_trackers/wgl/stw_framebuffer.c
index b8956bb5509..6d095019815 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_framebuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ *
+ * Copyright 2008-2009 Vmware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,19 +10,19 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #include <windows.h>
@@ -38,9 +38,9 @@
 #include "trace/tr_texture.h"
 #endif
 
+#include "stw_icd.h"
 #include "stw_framebuffer.h"
 #include "stw_device.h"
-#include "stw_public.h"
 #include "stw_winsys.h"
 #include "stw_tls.h"
 
@@ -83,6 +83,9 @@ stw_framebuffer_destroy_locked(
    *link = fb->next;
    fb->next = NULL;
 
+   if(fb->shared_surface)
+      stw_dev->stw_winsys->shared_surface_close(stw_dev->screen, fb->shared_surface);
+
    st_unreference_framebuffer(fb->stfb);
    
    pipe_mutex_unlock( fb->mutex );
@@ -106,13 +109,18 @@ static INLINE void
 stw_framebuffer_get_size( struct stw_framebuffer *fb )
 {
    unsigned width, height;
-   RECT rect;
+   RECT client_rect;
+   RECT window_rect;
+   POINT client_pos;
 
    assert(fb->hWnd);
    
-   GetClientRect( fb->hWnd, &rect );
-   width = rect.right - rect.left;
-   height = rect.bottom - rect.top;
+   /* Get the client area size. */
+   GetClientRect( fb->hWnd, &client_rect );
+   assert(client_rect.left == 0);
+   assert(client_rect.top == 0);
+   width = client_rect.right - client_rect.left;
+   height = client_rect.bottom - client_rect.top;
 
    if(width < 1)
       width = 1;
@@ -124,6 +132,31 @@ stw_framebuffer_get_size( struct stw_framebuffer *fb )
       fb->width = width; 
       fb->height = height; 
    }
+
+   client_pos.x = 0;
+   client_pos.y = 0;
+   ClientToScreen(fb->hWnd, &client_pos);
+
+   GetWindowRect(fb->hWnd, &window_rect);
+
+   fb->client_rect.left = client_pos.x - window_rect.left;
+   fb->client_rect.top =  client_pos.y - window_rect.top;
+   fb->client_rect.right = fb->client_rect.left + fb->width;
+   fb->client_rect.bottom = fb->client_rect.top + fb->height;
+
+#if 0
+   debug_printf("\n");
+   debug_printf("%s: client_position = (%i, %i)\n",
+                __FUNCTION__, client_pos.x, client_pos.y);
+   debug_printf("%s: window_rect = (%i, %i) - (%i, %i)\n",
+                __FUNCTION__,
+                window_rect.left, window_rect.top,
+                window_rect.right, window_rect.bottom);
+   debug_printf("%s: client_rect = (%i, %i) - (%i, %i)\n",
+                __FUNCTION__,
+                fb->client_rect.left, fb->client_rect.top,
+                fb->client_rect.right, fb->client_rect.bottom);
+#endif
 }
 
 
@@ -155,6 +188,7 @@ stw_call_window_proc(
        * can be masked out by the application. */
       LPWINDOWPOS lpWindowPos = (LPWINDOWPOS)pParams->lParam;
       if((lpWindowPos->flags & SWP_SHOWWINDOW) || 
+         !(lpWindowPos->flags & SWP_NOMOVE) ||
          !(lpWindowPos->flags & SWP_NOSIZE)) {
          fb = stw_framebuffer_from_hwnd( pParams->hwnd );
          if(fb) {
@@ -379,10 +413,10 @@ stw_framebuffer_from_hwnd(
 }
 
 
-BOOL
-stw_pixelformat_set(
+BOOL APIENTRY
+DrvSetPixelFormat(
    HDC hdc,
-   int iPixelFormat )
+   LONG iPixelFormat )
 {
    uint count;
    uint index;
@@ -435,9 +469,8 @@ stw_pixelformat_get(
 }
 
 
-BOOL
-stw_swap_buffers(
-   HDC hdc )
+BOOL APIENTRY
+DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data)
 {
    struct stw_framebuffer *fb;
    struct pipe_screen *screen;
@@ -447,23 +480,9 @@ stw_swap_buffers(
    if (fb == NULL)
       return FALSE;
 
-   if (!(fb->pfi->pfd.dwFlags & PFD_DOUBLEBUFFER)) {
-      stw_framebuffer_release(fb);
-      return TRUE;
-   }
-
-   /* If we're swapping the buffer associated with the current context
-    * we have to flush any pending rendering commands first.
-    */
-   st_notify_swapbuffers( fb->stfb );
-
    screen = stw_dev->screen;
-   
-   if(!st_get_framebuffer_surface( fb->stfb, ST_SURFACE_BACK_LEFT, &surface )) {
-      /* FIXME: this shouldn't happen, but does on glean */
-      stw_framebuffer_release(fb);
-      return FALSE;
-   }
+
+   surface = (struct pipe_surface *)data->pPrivateData;
 
 #ifdef DEBUG
    if(stw_dev->trace_running) {
@@ -472,22 +491,114 @@ stw_swap_buffers(
    }
 #endif
 
-   stw_dev->stw_winsys->flush_frontbuffer( screen, surface, hdc );
-   
+   if(data->hSharedSurface != fb->hSharedSurface) {
+      if(fb->shared_surface) {
+         stw_dev->stw_winsys->shared_surface_close(screen, fb->shared_surface);
+         fb->shared_surface = NULL;
+      }
+
+      fb->hSharedSurface = data->hSharedSurface;
+
+      if(data->hSharedSurface &&
+         stw_dev->stw_winsys->shared_surface_open) {
+         fb->shared_surface = stw_dev->stw_winsys->shared_surface_open(screen, fb->hSharedSurface);
+      }
+   }
+
+   if(fb->shared_surface) {
+      stw_dev->stw_winsys->compose(screen,
+                                   surface,
+                                   fb->shared_surface,
+                                   &fb->client_rect,
+                                   data->PresentHistoryToken);
+   }
+   else {
+      stw_dev->stw_winsys->present( screen, surface, hdc );
+   }
+
    stw_framebuffer_update(fb);
+
    stw_framebuffer_release(fb);
-   
+
    return TRUE;
 }
 
 
+/**
+ * Queue a composition.
+ *
+ * The framebuffer is released (its lock dropped) on every return path.
+ */
 BOOL
-stw_swap_layer_buffers(
+stw_framebuffer_present_locked(HDC hdc,
+                               struct stw_framebuffer *fb,
+                               struct pipe_surface *surface)
+{
+   if(stw_dev->callbacks.wglCbPresentBuffers &&
+      stw_dev->stw_winsys->compose) {
+      GLCBPRESENTBUFFERSDATA data;
+
+      memset(&data, 0, sizeof data);
+      data.magic1 = 2;
+      data.magic2 = 0;
+      data.AdapterLuid = stw_dev->AdapterLuid;
+      data.rect = fb->client_rect;
+      data.pPrivateData = (void *)surface;
+
+      stw_framebuffer_release(fb);
+
+      return stw_dev->callbacks.wglCbPresentBuffers(hdc, &data);
+   }
+   else {
+      struct pipe_screen *screen = stw_dev->screen;
+
+#ifdef DEBUG
+      if(stw_dev->trace_running) {
+         screen = trace_screen(screen)->screen;
+         surface = trace_surface(surface)->surface;
+      }
+#endif
+
+      stw_dev->stw_winsys->present( screen, surface, hdc );
+
+      stw_framebuffer_update(fb);
+
+      stw_framebuffer_release(fb);
+
+      return TRUE;
+   }
+}
+
+
+BOOL APIENTRY
+DrvSwapBuffers(
+   HDC hdc )
+{
+   struct stw_framebuffer *fb;
+   struct pipe_surface *surface = NULL;
+
+   fb = stw_framebuffer_from_hdc( hdc );
+   if (fb == NULL)
+      return FALSE;
+
+   if (!(fb->pfi->pfd.dwFlags & PFD_DOUBLEBUFFER)) {
+      stw_framebuffer_release(fb);
+      return TRUE;
+   }
+
+   st_swapbuffers(fb->stfb, &surface, NULL);
+
+   return stw_framebuffer_present_locked(hdc, fb, surface);
+}
+
+
+BOOL APIENTRY
+DrvSwapLayerBuffers(
    HDC hdc,
    UINT fuPlanes )
 {
    if(fuPlanes & WGL_SWAP_MAIN_PLANE)
-      return stw_swap_buffers(hdc);
+      return DrvSwapBuffers(hdc);
 
    return FALSE;
 }
diff --git a/src/gallium/state_trackers/wgl/shared/stw_framebuffer.h b/src/gallium/state_trackers/wgl/stw_framebuffer.h
index 13d29f37e48..b80d168a7ce 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_framebuffer.h
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.h
@@ -34,6 +34,7 @@
 
 #include "pipe/p_thread.h"
 
+struct pipe_surface;
 struct stw_pixelformat_info;
 
 /**
@@ -73,9 +74,20 @@ struct stw_framebuffer
    
    /* FIXME: Make this work for multiple contexts bound to the same framebuffer */
    boolean must_resize;
+
    unsigned width;
    unsigned height;
    
+   /**
+    * Client area rectangle, relative to the window upper-left corner.
+    *
+    * @sa GLCBPRESENTBUFFERSDATA::rect.
+    */
+   RECT client_rect;
+
+   HANDLE hSharedSurface;
+   struct stw_shared_surface *shared_surface;
+
    /** 
     * This is protected by stw_device::fb_mutex, not the mutex above.
     * 
@@ -126,6 +138,11 @@ BOOL
 stw_framebuffer_allocate(
    struct stw_framebuffer *fb );
 
+BOOL
+stw_framebuffer_present_locked(HDC hdc,
+                               struct stw_framebuffer *fb,
+                               struct pipe_surface *surface);
+
 void
 stw_framebuffer_update(
    struct stw_framebuffer *fb);
diff --git a/src/gallium/state_trackers/wgl/shared/stw_getprocaddress.c b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
index 879ced925a5..8875dc22f3d 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_getprocaddress.c
+++ b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
@@ -33,8 +33,7 @@
 #include <GL/wglext.h>
 
 #include "glapi/glapi.h"
-#include "stw_public.h"
-#include "stw_extgallium.h"
+#include "stw_ext_gallium.h"
 
 struct stw_extension_entry
 {
@@ -68,8 +67,8 @@ static const struct stw_extension_entry stw_extension_entries[] = {
    { NULL, NULL }
 };
 
-PROC
-stw_get_proc_address(
+PROC APIENTRY
+DrvGetProcAddress(
    LPCSTR lpszProc )
 {
    const struct stw_extension_entry *entry;
diff --git a/src/gallium/state_trackers/wgl/icd/stw_icd.h b/src/gallium/state_trackers/wgl/stw_icd.h
index cbc1a665481..02eb543fef0 100644
--- a/src/gallium/state_trackers/wgl/icd/stw_icd.h
+++ b/src/gallium/state_trackers/wgl/stw_icd.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2008-2009 Vmware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -388,6 +388,113 @@ typedef struct _GLCLTPROCTABLE
 
 typedef VOID (APIENTRY * PFN_SETPROCTABLE)(PGLCLTPROCTABLE);
 
+/**
+ * Presentation data passed to opengl32!wglCbPresentBuffers.
+ *
+ * Pure software drivers don't need to worry about this -- if they stick to the
+ * GDI API they will integrate with the Desktop Window Manager (DWM) without
+ * problems. Hardware drivers, however, cannot present directly to the primary
+ * surface while the DWM is active, as DWM gets exclusive access to the primary
+ * surface.
+ *
+ * Proper DWM integration requires:
+ * - advertise the PFD_SUPPORT_COMPOSITION flag
+ * - redirect glFlush/glFinish/wglSwapBuffers into a surface shared with the
+ * DWM process.
+ *
+ * @sa http://www.opengl.org/pipeline/article/vol003_7/
+ * @sa http://blogs.msdn.com/greg_schechter/archive/2006/05/02/588934.aspx
+ */
+typedef struct _GLCBPRESENTBUFFERSDATA
+{
+   /**
+    * wglCbPresentBuffers enforces this to be 2.
+    */
+   DWORD magic1;
+
+   /**
+    * wglCbPresentBuffers enforces this to be 0 or 1, but it is most commonly
+    * set to 0.
+    */
+   DWORD magic2;
+
+   /**
+    * Locally unique identifier (LUID) of the graphics adapter.
+    *
+    * This should contain the value returned by D3DKMTOpenAdapterFromHdc. It
+    * is passed to dwmapi!DwmpDxGetWindowSharedSurface in order to obtain
+    * the shared surface handle for the bound drawable (window).
+    *
+    * @sa http://msdn.microsoft.com/en-us/library/ms799177.aspx
+    */
+   LUID AdapterLuid;
+
+   /**
+    * This is passed unmodified to DrvPresentBuffers
+    */
+   LPVOID pPrivateData;
+
+   /**
+    * Client area rectangle to update, relative to the window upper-left corner.
+    */
+   RECT rect;
+} GLCBPRESENTBUFFERSDATA, *PGLCBPRESENTBUFFERSDATA;
+
+/**
+ * Callbacks supplied to DrvSetCallbackProcs by the OpenGL runtime.
+ *
+ * Pointers to several callback functions in opengl32.dll.
+ */
+typedef struct _GLCALLBACKTABLE
+{
+   /** Unused */
+   PROC wglCbSetCurrentValue;
+
+   /** Unused */
+   PROC wglCbGetCurrentValue;
+
+   /** Unused */
+   PROC wglCbGetDhglrc;
+
+   /** Unused */
+   PROC wglCbGetDdHandle;
+
+   /**
+    * Queue a present composition.
+    *
+    * Makes the runtime call DrvPresentBuffers with the composition information.
+    */
+   BOOL (APIENTRY *wglCbPresentBuffers)(HDC hdc, PGLCBPRESENTBUFFERSDATA data);
+
+} GLCALLBACKTABLE;
+
+typedef struct _GLPRESENTBUFFERSDATA
+{
+   /**
+    * The shared surface handle.
+    *
+    * Returned by dwmapi!DwmpDxGetWindowSharedSurface.
+    *
+    * @sa http://channel9.msdn.com/forums/TechOff/251261-Help-Getting-the-shared-window-texture-out-of-DWM-/
+    */
+   HANDLE hSharedSurface;
+
+   LUID AdapterLuid;
+
+   /**
+    * Present history token.
+    *
+    * This is returned by dwmapi!DwmpDxGetWindowSharedSurface and
+    * should be passed to D3DKMTRender in D3DKMT_RENDER::PresentHistoryToken.
+    *
+    * @sa http://msdn.microsoft.com/en-us/library/ms799176.aspx
+    */
+   ULONGLONG PresentHistoryToken;
+
+   /** Same as GLCBPRESENTBUFFERSDATA::pPrivateData */
+   LPVOID pPrivateData;
+} GLPRESENTBUFFERSDATA, *PGLPRESENTBUFFERSDATA;
+
 BOOL APIENTRY
 DrvCopyContext(
    DHGLRC dhrcSource,
@@ -435,6 +542,9 @@ DrvGetProcAddress(
    LPCSTR lpszProc );
 
 BOOL APIENTRY
+DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data);
+
+BOOL APIENTRY
 DrvRealizeLayerPalette(
    HDC hdc,
    INT iLayerPlane,
diff --git a/src/gallium/state_trackers/wgl/shared/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c
index c296744838b..7abe5d9f7fa 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c
@@ -34,9 +34,9 @@
 
 #include "util/u_debug.h"
 
+#include "stw_icd.h"
 #include "stw_device.h"
 #include "stw_pixelformat.h"
-#include "stw_public.h"
 #include "stw_tls.h"
 
 
@@ -154,8 +154,11 @@ stw_pixelformat_add(
    pfi->pfd.dwFlags = PFD_SUPPORT_OPENGL;
    
    /* TODO: also support non-native pixel formats */
-   pfi->pfd.dwFlags |= PFD_DRAW_TO_WINDOW ;
-   
+   pfi->pfd.dwFlags |= PFD_DRAW_TO_WINDOW;
+
+   /* See http://www.opengl.org/pipeline/article/vol003_7/ */
+   pfi->pfd.dwFlags |= PFD_SUPPORT_COMPOSITION;
+
    if (doublebuffer)
       pfi->pfd.dwFlags |= PFD_DOUBLEBUFFER | PFD_SWAP_COPY;
    
@@ -288,12 +291,12 @@ stw_pixelformat_visual(GLvisual *visual,
 }
 
 
-int
-stw_pixelformat_describe(
+LONG APIENTRY
+DrvDescribePixelFormat(
    HDC hdc,
-   int iPixelFormat,
-   UINT nBytes,
-   LPPIXELFORMATDESCRIPTOR ppfd )
+   INT iPixelFormat,
+   ULONG cjpfd,
+   PIXELFORMATDESCRIPTOR *ppfd )
 {
    uint count;
    uint index;
@@ -306,7 +309,7 @@ stw_pixelformat_describe(
 
    if (ppfd == NULL)
       return count;
-   if (index >= count || nBytes != sizeof( PIXELFORMATDESCRIPTOR ))
+   if (index >= count || cjpfd != sizeof( PIXELFORMATDESCRIPTOR ))
       return 0;
 
    pfi = stw_pixelformat_get_info( index );
@@ -316,6 +319,52 @@ stw_pixelformat_describe(
    return count;
 }
 
+BOOL APIENTRY
+DrvDescribeLayerPlane(
+   HDC hdc,
+   INT iPixelFormat,
+   INT iLayerPlane,
+   UINT nBytes,
+   LPLAYERPLANEDESCRIPTOR plpd )
+{
+   assert(0);
+   return FALSE;
+}
+
+int APIENTRY
+DrvGetLayerPaletteEntries(
+   HDC hdc,
+   INT iLayerPlane,
+   INT iStart,
+   INT cEntries,
+   COLORREF *pcr )
+{
+   assert(0);
+   return 0;
+}
+
+int APIENTRY
+DrvSetLayerPaletteEntries(
+   HDC hdc,
+   INT iLayerPlane,
+   INT iStart,
+   INT cEntries,
+   CONST COLORREF *pcr )
+{
+   assert(0);
+   return 0;
+}
+
+BOOL APIENTRY
+DrvRealizeLayerPalette(
+   HDC hdc,
+   INT iLayerPlane,
+   BOOL bRealize )
+{
+   assert(0);
+   return FALSE;
+}
+
 /* Only used by the wgl code, but have it here to avoid exporting the
  * pixelformat.h functionality.
  */
diff --git a/src/gallium/state_trackers/wgl/shared/stw_pixelformat.h b/src/gallium/state_trackers/wgl/stw_pixelformat.h
index bec429231b2..3a690b35bad 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_pixelformat.h
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.h
@@ -30,6 +30,10 @@
 
 #include <windows.h>
 
+#ifndef PFD_SUPPORT_COMPOSITION
+#define PFD_SUPPORT_COMPOSITION 0x00008000
+#endif
+
 #include "main/mtypes.h"
 
 #include "pipe/p_compiler.h"
@@ -62,4 +66,11 @@ void
 stw_pixelformat_visual(GLvisual *visual, 
                        const struct stw_pixelformat_info *pfi );
 
+int
+stw_pixelformat_choose( HDC hdc,
+                        CONST PIXELFORMATDESCRIPTOR *ppfd );
+
+int
+stw_pixelformat_get(HDC hdc);
+
 #endif /* STW_PIXELFORMAT_H */
diff --git a/src/gallium/state_trackers/wgl/shared/stw_tls.c b/src/gallium/state_trackers/wgl/stw_tls.c
index 4bd6a9289c9..4bd6a9289c9 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_tls.c
+++ b/src/gallium/state_trackers/wgl/stw_tls.c
diff --git a/src/gallium/state_trackers/wgl/shared/stw_tls.h b/src/gallium/state_trackers/wgl/stw_tls.h
index fbf8b1cbee4..fbf8b1cbee4 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_tls.h
+++ b/src/gallium/state_trackers/wgl/stw_tls.h
diff --git a/src/gallium/state_trackers/wgl/wgl/stw_wgl.c b/src/gallium/state_trackers/wgl/stw_wgl.c
index a131292f7ae..bb199fdd252 100644
--- a/src/gallium/state_trackers/wgl/wgl/stw_wgl.c
+++ b/src/gallium/state_trackers/wgl/stw_wgl.c
@@ -28,7 +28,9 @@
 #include <windows.h>
 
 #include "util/u_debug.h"
-#include "shared/stw_public.h"
+#include "stw_icd.h"
+#include "stw_context.h"
+#include "stw_pixelformat.h"
 #include "stw_wgl.h"
 
 
@@ -38,16 +40,16 @@ wglCopyContext(
    HGLRC hglrcDst,
    UINT mask )
 {
-   return stw_copy_context( (UINT_PTR)hglrcSrc, 
-                            (UINT_PTR)hglrcDst, 
-                            mask );
+   return DrvCopyContext( (DHGLRC)(UINT_PTR)hglrcSrc,
+                          (DHGLRC)(UINT_PTR)hglrcDst,
+                          mask );
 }
 
 WINGDIAPI HGLRC APIENTRY
 wglCreateContext(
    HDC hdc )
 {
-   return wglCreateLayerContext(hdc, 0);
+   return (HGLRC) DrvCreateContext(hdc);
 }
 
 WINGDIAPI HGLRC APIENTRY
@@ -55,21 +57,21 @@ wglCreateLayerContext(
    HDC hdc,
    int iLayerPlane )
 {
-   return (HGLRC) stw_create_layer_context( hdc, iLayerPlane );
+   return (HGLRC) DrvCreateLayerContext( hdc, iLayerPlane );
 }
 
 WINGDIAPI BOOL APIENTRY
 wglDeleteContext(
    HGLRC hglrc )
 {
-   return stw_delete_context( (UINT_PTR)hglrc );
+   return DrvDeleteContext((DHGLRC)(UINT_PTR)hglrc );
 }
 
 
 WINGDIAPI HGLRC APIENTRY
 wglGetCurrentContext( VOID )
 {
-   return (HGLRC)stw_get_current_context();
+   return (HGLRC)(UINT_PTR)stw_get_current_context();
 }
 
 WINGDIAPI HDC APIENTRY
@@ -83,7 +85,7 @@ wglMakeCurrent(
    HDC hdc,
    HGLRC hglrc )
 {
-   return stw_make_current( hdc, (UINT_PTR)hglrc );
+   return DrvSetContext( hdc, (DHGLRC)(UINT_PTR)hglrc, NULL ) ? TRUE : FALSE;
 }
 
 
@@ -91,7 +93,7 @@ WINGDIAPI BOOL APIENTRY
 wglSwapBuffers(
    HDC hdc )
 {
-   return stw_swap_buffers( hdc );
+   return DrvSwapBuffers( hdc );
 }
 
 
@@ -100,14 +102,14 @@ wglSwapLayerBuffers(
    HDC hdc,
    UINT fuPlanes )
 {
-   return stw_swap_layer_buffers( hdc, fuPlanes );
+   return DrvSwapLayerBuffers( hdc, fuPlanes );
 }
 
 WINGDIAPI PROC APIENTRY
 wglGetProcAddress(
     LPCSTR lpszProc )
 {
-   return stw_get_proc_address( lpszProc );
+   return DrvGetProcAddress( lpszProc );
 }
 
 
@@ -141,7 +143,7 @@ wglDescribePixelFormat(
    UINT nBytes,
    LPPIXELFORMATDESCRIPTOR ppfd )
 {
-   return stw_pixelformat_describe( hdc, iPixelFormat, nBytes, ppfd );
+   return DrvDescribePixelFormat( hdc, iPixelFormat, nBytes, ppfd );
 }
 
 WINGDIAPI int APIENTRY
@@ -160,7 +162,7 @@ wglSetPixelFormat(
    if (ppfd->nSize != sizeof( PIXELFORMATDESCRIPTOR ))
       return FALSE;
 
-   return stw_pixelformat_set( hdc, iPixelFormat );
+   return DrvSetPixelFormat( hdc, iPixelFormat );
 }
 
 
@@ -186,7 +188,8 @@ wglShareLists(
    HGLRC hglrc1,
    HGLRC hglrc2 )
 {
-   return stw_share_lists( (UINT_PTR)hglrc1, (UINT_PTR)hglrc2);;
+   return DrvShareLists((DHGLRC)(UINT_PTR)hglrc1,
+                        (DHGLRC)(UINT_PTR)hglrc2);
 }
 
 WINGDIAPI BOOL APIENTRY
@@ -264,15 +267,7 @@ wglDescribeLayerPlane(
    UINT nBytes,
    LPLAYERPLANEDESCRIPTOR plpd )
 {
-   (void) hdc;
-   (void) iPixelFormat;
-   (void) iLayerPlane;
-   (void) nBytes;
-   (void) plpd;
-
-   assert( 0 );
-
-   return FALSE;
+   return DrvDescribeLayerPlane(hdc, iPixelFormat, iLayerPlane, nBytes, plpd);
 }
 
 WINGDIAPI int APIENTRY
@@ -283,15 +278,7 @@ wglSetLayerPaletteEntries(
    int cEntries,
    CONST COLORREF *pcr )
 {
-   (void) hdc;
-   (void) iLayerPlane;
-   (void) iStart;
-   (void) cEntries;
-   (void) pcr;
-
-   assert( 0 );
-
-   return 0;
+   return DrvSetLayerPaletteEntries(hdc, iLayerPlane, iStart, cEntries, pcr);
 }
 
 WINGDIAPI int APIENTRY
@@ -302,15 +289,7 @@ wglGetLayerPaletteEntries(
    int cEntries,
    COLORREF *pcr )
 {
-   (void) hdc;
-   (void) iLayerPlane;
-   (void) iStart;
-   (void) cEntries;
-   (void) pcr;
-
-   assert( 0 );
-
-   return 0;
+   return DrvGetLayerPaletteEntries(hdc, iLayerPlane, iStart, cEntries, pcr);
 }
 
 WINGDIAPI BOOL APIENTRY
diff --git a/src/gallium/state_trackers/wgl/wgl/stw_wgl.h b/src/gallium/state_trackers/wgl/stw_wgl.h
index a98179944aa..a98179944aa 100644
--- a/src/gallium/state_trackers/wgl/wgl/stw_wgl.h
+++ b/src/gallium/state_trackers/wgl/stw_wgl.h
diff --git a/src/gallium/state_trackers/wgl/stw_winsys.h b/src/gallium/state_trackers/wgl/stw_winsys.h
new file mode 100644
index 00000000000..1de6e906d0d
--- /dev/null
+++ b/src/gallium/state_trackers/wgl/stw_winsys.h
@@ -0,0 +1,109 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2009 Vmware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef STW_WINSYS_H
+#define STW_WINSYS_H
+
+#include <windows.h> /* for HDC */
+
+#include "pipe/p_compiler.h"
+
+struct pipe_screen;
+struct pipe_context;
+struct pipe_surface;
+
+struct stw_shared_surface;
+
+struct stw_winsys
+{
+   struct pipe_screen *
+   (*create_screen)( void );
+
+   struct pipe_context *
+   (*create_context)( struct pipe_screen *screen );
+
+   /**
+    * Present the color buffer to the window associated with the device context.
+    */
+   void
+   (*present)( struct pipe_screen *screen,
+               struct pipe_surface *surf,
+               HDC hDC );
+
+   /**
+    * Locally unique identifier (LUID) of the graphics adapter.
+    *
+    * @sa GLCBPRESENTBUFFERSDATA::AdapterLuid;
+    */
+   boolean
+   (*get_adapter_luid)( struct pipe_screen *screen,
+                        LUID *pAdapterLuid );
+
+   /**
+    * Open a shared surface (optional).
+    *
+    * @sa GLCBPRESENTBUFFERSDATA::hSharedSurface;
+    */
+   struct stw_shared_surface *
+   (*shared_surface_open)(struct pipe_screen *screen,
+                          HANDLE hSharedSurface);
+
+   /**
+    * Close a shared surface (optional).
+    */
+   void
+   (*shared_surface_close)(struct pipe_screen *screen,
+                           struct stw_shared_surface *surface);
+
+   /**
+    * Compose into a shared surface (optional).
+    *
+    * Blit the color buffer into a shared surface.
+    *
+    * @sa GLPRESENTBUFFERSDATA::PresentHistoryToken.
+    */
+   void
+   (*compose)( struct pipe_screen *screen,
+               struct pipe_surface *src,
+               struct stw_shared_surface *dest,
+               LPCRECT pRect,
+               ULONGLONG PresentHistoryToken );
+};
+
+boolean
+stw_init(const struct stw_winsys *stw_winsys);
+
+boolean
+stw_init_thread(void);
+
+void
+stw_cleanup_thread(void);
+
+void
+stw_cleanup(void);
+
+#endif /* STW_WINSYS_H */
diff --git a/src/gallium/state_trackers/xorg/Makefile b/src/gallium/state_trackers/xorg/Makefile
index 27a1990724d..cb2c3aea410 100644
--- a/src/gallium/state_trackers/xorg/Makefile
+++ b/src/gallium/state_trackers/xorg/Makefile
@@ -5,6 +5,11 @@ LIBNAME = xorgtracker
 
 LIBRARY_INCLUDES = \
 	-DHAVE_CONFIG_H \
+	$(shell pkg-config xextproto --atleast-version=7.0.99.1 \
+				&& echo "-DHAVE_XEXTPROTO_71") \
+	$(shell pkg-config libkms --atleast-version=1.0 \
+				&& echo "-DHAVE_LIBKMS") \
+	$(shell pkg-config libkms --silence-errors --cflags-only-I) \
 	$(shell pkg-config --cflags-only-I pixman-1 xorg-server libdrm xproto) \
 	-I$(TOP)/src/gallium/include \
 	-I$(TOP)/src/gallium/auxiliary \
diff --git a/src/gallium/state_trackers/xorg/SConscript b/src/gallium/state_trackers/xorg/SConscript
index 65f55ea378a..5d0b6613ac7 100644
--- a/src/gallium/state_trackers/xorg/SConscript
+++ b/src/gallium/state_trackers/xorg/SConscript
@@ -13,6 +13,13 @@ if 'xorg' in env['statetrackers']:
 
     env.ParseConfig('pkg-config --cflags --libs xorg-server')
 
+    conf = env.Configure()
+
+    if conf.CheckHeader('X11/extensions/dpmsconst.h'):
+        env.Append(CPPDEFINES = [('HAVE_XEXTPROTO_71', '1')])
+
+    env = conf.Finish()
+
     st_xorg = env.ConvenienceLibrary(
 	target = 'st_xorg',
 	source = [ 'xorg_composite.c',
@@ -22,6 +29,8 @@ if 'xorg' in env['statetrackers']:
 		'xorg_exa.c',
 		'xorg_exa_tgsi.c',
 		'xorg_output.c',
+		'xorg_renderer.c',
+		'xorg_xv.c',
 		]
     )
     Export('st_xorg')
diff --git a/src/gallium/state_trackers/xorg/xorg_composite.c b/src/gallium/state_trackers/xorg/xorg_composite.c
index c708ac31702..a5975aad515 100644
--- a/src/gallium/state_trackers/xorg/xorg_composite.c
+++ b/src/gallium/state_trackers/xorg/xorg_composite.c
@@ -1,97 +1,116 @@
 #include "xorg_composite.h"
 
+#include "xorg_renderer.h"
 #include "xorg_exa_tgsi.h"
 
 #include "cso_cache/cso_context.h"
 #include "util/u_draw_quad.h"
+#include "util/u_math.h"
 
 #include "pipe/p_inlines.h"
 
+/* XXX: also in Xrender.h, but including it here breaks compilation */
+#define XFixedToDouble(f)    (((double) (f)) / 65536.)
+
 struct xorg_composite_blend {
-   int op:8;
+   int op : 8;
 
-   unsigned rgb_src_factor:5;    /**< PIPE_BLENDFACTOR_x */
-   unsigned rgb_dst_factor:5;    /**< PIPE_BLENDFACTOR_x */
+   unsigned alpha_dst : 4;
+   unsigned alpha_src : 4;
 
-   unsigned alpha_src_factor:5;  /**< PIPE_BLENDFACTOR_x */
-   unsigned alpha_dst_factor:5;  /**< PIPE_BLENDFACTOR_x */
+   unsigned rgb_src : 8;    /**< PIPE_BLENDFACTOR_x */
+   unsigned rgb_dst : 8;    /**< PIPE_BLENDFACTOR_x */
 };
 
 #define BLEND_OP_OVER 3
 static const struct xorg_composite_blend xorg_blends[] = {
    { PictOpClear,
-     PIPE_BLENDFACTOR_CONST_COLOR, PIPE_BLENDFACTOR_CONST_ALPHA,
-     PIPE_BLENDFACTOR_ZERO, PIPE_BLENDFACTOR_ZERO },
-
+     0, 0, PIPE_BLENDFACTOR_ZERO, PIPE_BLENDFACTOR_ZERO},
    { PictOpSrc,
-     PIPE_BLENDFACTOR_ONE, PIPE_BLENDFACTOR_ONE,
-     PIPE_BLENDFACTOR_ZERO, PIPE_BLENDFACTOR_ZERO },
-
+     0, 0, PIPE_BLENDFACTOR_ONE, PIPE_BLENDFACTOR_ZERO},
    { PictOpDst,
-     PIPE_BLENDFACTOR_ZERO, PIPE_BLENDFACTOR_ZERO,
-     PIPE_BLENDFACTOR_ONE, PIPE_BLENDFACTOR_ONE },
-
+     0, 0, PIPE_BLENDFACTOR_ZERO, PIPE_BLENDFACTOR_ONE},
    { PictOpOver,
-     PIPE_BLENDFACTOR_SRC_ALPHA, PIPE_BLENDFACTOR_ONE,
-     PIPE_BLENDFACTOR_INV_SRC_ALPHA, PIPE_BLENDFACTOR_INV_SRC_ALPHA },
-
+     0, 1, PIPE_BLENDFACTOR_ONE, PIPE_BLENDFACTOR_INV_SRC_ALPHA},
    { PictOpOverReverse,
-     PIPE_BLENDFACTOR_SRC_ALPHA, PIPE_BLENDFACTOR_ONE,
-     PIPE_BLENDFACTOR_INV_SRC_ALPHA, PIPE_BLENDFACTOR_INV_SRC_ALPHA },
+     1, 0, PIPE_BLENDFACTOR_INV_DST_ALPHA, PIPE_BLENDFACTOR_ONE},
+   { PictOpIn,
+     1, 0, PIPE_BLENDFACTOR_DST_ALPHA, PIPE_BLENDFACTOR_ZERO},
+   { PictOpInReverse,
+     0, 1, PIPE_BLENDFACTOR_ZERO, PIPE_BLENDFACTOR_SRC_ALPHA},
+   { PictOpOut,
+     1, 0, PIPE_BLENDFACTOR_INV_DST_ALPHA, PIPE_BLENDFACTOR_ZERO},
+   { PictOpOutReverse,
+     0, 1, PIPE_BLENDFACTOR_ZERO, PIPE_BLENDFACTOR_INV_SRC_ALPHA},
+   { PictOpAtop,
+     1, 1, PIPE_BLENDFACTOR_DST_ALPHA, PIPE_BLENDFACTOR_INV_SRC_ALPHA},
+   { PictOpAtopReverse,
+     1, 1, PIPE_BLENDFACTOR_INV_DST_ALPHA, PIPE_BLENDFACTOR_SRC_ALPHA},
+   { PictOpXor,
+     1, 1, PIPE_BLENDFACTOR_INV_DST_ALPHA, PIPE_BLENDFACTOR_INV_SRC_ALPHA},
+   { PictOpAdd,
+     0, 0, PIPE_BLENDFACTOR_ONE, PIPE_BLENDFACTOR_ONE},
 };
 
+
 static INLINE void
-pixel_to_float4(PictFormatPtr format,
-                CARD32 pixel, float *color)
+pixel_to_float4(Pixel pixel, float *color)
 {
    CARD32	    r, g, b, a;
 
-   debug_assert(format->type == PictTypeDirect);
-
-   r = (pixel >> format->direct.red) & format->direct.redMask;
-   g = (pixel >> format->direct.green) & format->direct.greenMask;
-   b = (pixel >> format->direct.blue) & format->direct.blueMask;
-   a = (pixel >> format->direct.alpha) & format->direct.alphaMask;
-   color[0] = ((float)r) / ((float)format->direct.redMask);
-   color[1] = ((float)g) / ((float)format->direct.greenMask);
-   color[2] = ((float)b) / ((float)format->direct.blueMask);
-   color[3] = ((float)a) / ((float)format->direct.alphaMask);
+   a = (pixel >> 24) & 0xff;
+   r = (pixel >> 16) & 0xff;
+   g = (pixel >>  8) & 0xff;
+   b = (pixel >>  0) & 0xff;
+   color[0] = ((float)r) / 255.;
+   color[1] = ((float)g) / 255.;
+   color[2] = ((float)b) / 255.;
+   color[3] = ((float)a) / 255.;
 }
 
-struct acceleration_info {
-   int op : 16;
-   int with_mask : 1;
-   int component_alpha : 1;
-};
-static const struct acceleration_info accelerated_ops[] = {
-   {PictOpClear,       1, 0},
-   {PictOpSrc,         1, 0},
-   {PictOpDst,         1, 0},
-   {PictOpOver,        1, 0},
-   {PictOpOverReverse, 1, 0},
-   {PictOpIn,          1, 0},
-   {PictOpInReverse,   1, 0},
-   {PictOpOut,         1, 0},
-   {PictOpOutReverse,  1, 0},
-   {PictOpAtop,        1, 0},
-   {PictOpAtopReverse, 1, 0},
-   {PictOpXor,         1, 0},
-   {PictOpAdd,         1, 0},
-   {PictOpSaturate,    1, 0},
-};
-
-static struct xorg_composite_blend
-blend_for_op(int op)
+static boolean
+blend_for_op(struct xorg_composite_blend *blend,
+             int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
+             PicturePtr pDstPicture)
 {
    const int num_blends =
       sizeof(xorg_blends)/sizeof(struct xorg_composite_blend);
    int i;
+   boolean supported = FALSE;
+
+   /* our default in case something goes wrong */
+   *blend = xorg_blends[BLEND_OP_OVER];
 
    for (i = 0; i < num_blends; ++i) {
-      if (xorg_blends[i].op == op)
-         return xorg_blends[i];
+      if (xorg_blends[i].op == op) {
+         *blend = xorg_blends[i];
+         supported = TRUE;
+      }
+   }
+
+   /* If there's no dst alpha channel, adjust the blend op so that we'll treat
+    * it as always 1. */
+   if (pDstPicture &&
+       PICT_FORMAT_A(pDstPicture->format) == 0 && blend->alpha_dst) {
+      if (blend->rgb_src == PIPE_BLENDFACTOR_DST_ALPHA)
+         blend->rgb_src = PIPE_BLENDFACTOR_ONE;
+      else if (blend->rgb_src == PIPE_BLENDFACTOR_INV_DST_ALPHA)
+         blend->rgb_src = PIPE_BLENDFACTOR_ZERO;
+   }
+
+   /* If the source alpha is being used, then we should only be in a case where
+    * the source blend factor is 0, and the source blend value is the mask
+    * channels multiplied by the source picture's alpha. */
+   if (pMaskPicture && pMaskPicture->componentAlpha &&
+       PICT_FORMAT_RGB(pMaskPicture->format) && blend->alpha_src) {
+      if (blend->rgb_dst == PIPE_BLENDFACTOR_SRC_ALPHA) {
+         blend->rgb_dst = PIPE_BLENDFACTOR_SRC_COLOR;
+      } else if (blend->rgb_dst == PIPE_BLENDFACTOR_INV_SRC_ALPHA) {
+         blend->rgb_dst = PIPE_BLENDFACTOR_INV_SRC_COLOR;
+      }
    }
-   return xorg_blends[BLEND_OP_OVER];
+
+   return supported;
 }
 
 static INLINE int
@@ -99,7 +118,7 @@ render_repeat_to_gallium(int mode)
 {
    switch(mode) {
    case RepeatNone:
-      return PIPE_TEX_WRAP_CLAMP;
+      return PIPE_TEX_WRAP_CLAMP_TO_BORDER;
    case RepeatNormal:
       return PIPE_TEX_WRAP_REPEAT;
    case RepeatReflect:
@@ -112,150 +131,44 @@ render_repeat_to_gallium(int mode)
    return PIPE_TEX_WRAP_REPEAT;
 }
 
-
-static INLINE void
-setup_vertex0(float vertex[2][4], float x, float y,
-              float color[4])
-{
-   vertex[0][0] = x;
-   vertex[0][1] = y;
-   vertex[0][2] = 0.f; /*z*/
-   vertex[0][3] = 1.f; /*w*/
-
-   vertex[1][0] = color[0]; /*r*/
-   vertex[1][1] = color[1]; /*g*/
-   vertex[1][2] = color[2]; /*b*/
-   vertex[1][3] = color[3]; /*a*/
-}
-
-static struct pipe_buffer *
-setup_vertex_data0(struct exa_context *ctx,
-                   int srcX, int srcY, int maskX, int maskY,
-                   int dstX, int dstY, int width, int height)
-{
-   float vertices[4][2][4];
-
-   /* 1st vertex */
-   setup_vertex0(vertices[0], dstX, dstY,
-                 ctx->solid_color);
-   /* 2nd vertex */
-   setup_vertex0(vertices[1], dstX + width, dstY,
-                 ctx->solid_color);
-   /* 3rd vertex */
-   setup_vertex0(vertices[2], dstX + width, dstY + height,
-                 ctx->solid_color);
-   /* 4th vertex */
-   setup_vertex0(vertices[3], dstX, dstY + height,
-                 ctx->solid_color);
-
-   return pipe_user_buffer_create(ctx->ctx->screen,
-                                  vertices,
-                                  sizeof(vertices));
-}
-
-static INLINE void
-setup_vertex1(float vertex[2][4], float x, float y, float s, float t)
-{
-   vertex[0][0] = x;
-   vertex[0][1] = y;
-   vertex[0][2] = 0.f; /*z*/
-   vertex[0][3] = 1.f; /*w*/
-
-   vertex[1][0] = s;   /*s*/
-   vertex[1][1] = t;   /*t*/
-   vertex[1][2] = 0.f; /*r*/
-   vertex[1][3] = 1.f; /*q*/
-}
-
-static struct pipe_buffer *
-setup_vertex_data1(struct exa_context *ctx,
-                   int srcX, int srcY, int maskX, int maskY,
-                   int dstX, int dstY, int width, int height)
+static INLINE boolean
+render_filter_to_gallium(int xrender_filter, int *out_filter)
 {
-   float vertices[4][2][4];
-   float s0, t0, s1, t1;
-   struct pipe_texture *src = ctx->bound_textures[0];
-
-   s0 = srcX / src->width[0];
-   s1 = srcX + width / src->width[0];
-   t0 = srcY / src->height[0];
-   t1 = srcY + height / src->height[0];
-
-   /* 1st vertex */
-   setup_vertex1(vertices[0], dstX, dstY,
-                 s0, t0);
-   /* 2nd vertex */
-   setup_vertex1(vertices[1], dstX + width, dstY,
-                 s1, t0);
-   /* 3rd vertex */
-   setup_vertex1(vertices[2], dstX + width, dstY + height,
-                 s1, t1);
-   /* 4th vertex */
-   setup_vertex1(vertices[3], dstX, dstY + height,
-                 s0, t1);
-
-   return pipe_user_buffer_create(ctx->ctx->screen,
-                                  vertices,
-                                  sizeof(vertices));
-}
 
+   switch (xrender_filter) {
+   case PictFilterNearest:
+      *out_filter = PIPE_TEX_FILTER_NEAREST;
+      break;
+   case PictFilterBilinear:
+      *out_filter = PIPE_TEX_FILTER_LINEAR;
+      break;
+   case PictFilterFast:
+      *out_filter = PIPE_TEX_FILTER_NEAREST;
+      break;
+   case PictFilterGood:
+      *out_filter = PIPE_TEX_FILTER_LINEAR;
+      break;
+   case PictFilterBest:
+      *out_filter = PIPE_TEX_FILTER_LINEAR;
+      break;
+   case PictFilterConvolution:
+      *out_filter = PIPE_TEX_FILTER_NEAREST;
+      return FALSE;
+   default:
+      debug_printf("Unknown xrender filter\n");
+      *out_filter = PIPE_TEX_FILTER_NEAREST;
+      return FALSE;
+   }
 
-static INLINE void
-setup_vertex2(float vertex[3][4], float x, float y,
-              float s0, float t0, float s1, float t1)
-{
-   vertex[0][0] = x;
-   vertex[0][1] = y;
-   vertex[0][2] = 0.f; /*z*/
-   vertex[0][3] = 1.f; /*w*/
-
-   vertex[1][0] = s0;  /*s*/
-   vertex[1][1] = t0;  /*t*/
-   vertex[1][2] = 0.f; /*r*/
-   vertex[1][3] = 1.f; /*q*/
-
-   vertex[2][0] = s1;  /*s*/
-   vertex[2][1] = t1;  /*t*/
-   vertex[2][2] = 0.f; /*r*/
-   vertex[2][3] = 1.f; /*q*/
+   return TRUE;
 }
 
-static struct pipe_buffer *
-setup_vertex_data2(struct exa_context *ctx,
-                   int srcX, int srcY, int maskX, int maskY,
-                   int dstX, int dstY, int width, int height)
+static boolean is_filter_accelerated(PicturePtr pic)
 {
-   float vertices[4][3][4];
-   float st0[4], st1[4];
-   struct pipe_texture *src = ctx->bound_textures[0];
-   struct pipe_texture *mask = ctx->bound_textures[0];
-
-   st0[0] = srcX / src->width[0];
-   st0[1] = srcY / src->height[0];
-   st0[2] = srcX + width / src->width[0];
-   st0[3] = srcY + height / src->height[0];
-
-   st1[0] = maskX / mask->width[0];
-   st1[1] = maskY / mask->height[0];
-   st1[2] = maskX + width / mask->width[0];
-   st1[3] = maskY + height / mask->height[0];
-
-   /* 1st vertex */
-   setup_vertex2(vertices[0], dstX, dstY,
-                 st0[0], st0[1], st1[0], st1[1]);
-   /* 2nd vertex */
-   setup_vertex2(vertices[1], dstX + width, dstY,
-                 st0[2], st0[1], st1[2], st1[1]);
-   /* 3rd vertex */
-   setup_vertex2(vertices[2], dstX + width, dstY + height,
-                 st0[2], st0[3], st1[2], st1[3]);
-   /* 4th vertex */
-   setup_vertex2(vertices[3], dstX, dstY + height,
-                 st0[0], st0[3], st1[0], st1[3]);
-
-   return pipe_user_buffer_create(ctx->ctx->screen,
-                                  vertices,
-                                  sizeof(vertices));
+   int filter;
+   if (pic && !render_filter_to_gallium(pic->filter, &filter))
+       return FALSE;
+   return TRUE;
 }
 
 boolean xorg_composite_accelerated(int op,
@@ -263,141 +176,145 @@ boolean xorg_composite_accelerated(int op,
                                    PicturePtr pMaskPicture,
                                    PicturePtr pDstPicture)
 {
-   unsigned i;
-   unsigned accel_ops_count =
-      sizeof(accelerated_ops)/sizeof(struct acceleration_info);
-
-
-   /*FIXME: currently accel is disabled */
-   return FALSE;
-
-   if (pSrcPicture) {
-      /* component alpha not supported */
-      if (pSrcPicture->componentAlpha)
-         return FALSE;
-      /* fills not supported */
-      if (pSrcPicture->pSourcePict)
-         return FALSE;
+   ScreenPtr pScreen = pDstPicture->pDrawable->pScreen;
+   ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
+   modesettingPtr ms = modesettingPTR(pScrn);
+   struct xorg_composite_blend blend;
+
+   if (!is_filter_accelerated(pSrcPicture) ||
+       !is_filter_accelerated(pMaskPicture)) {
+      XORG_FALLBACK("Unsupported Xrender filter");
    }
 
-   for (i = 0; i < accel_ops_count; ++i) {
-      if (op == accelerated_ops[i].op) {
-         if (pMaskPicture && !accelerated_ops[i].with_mask)
-            return FALSE;
-         return TRUE;
-      }
+   if (pSrcPicture->pSourcePict) {
+      if (pSrcPicture->pSourcePict->type != SourcePictTypeSolidFill)
+         XORG_FALLBACK("Gradients not enabled (haven't been well tested)");
    }
-   return FALSE;
-}
-
-static void
-bind_framebuffer_state(struct exa_context *exa, PicturePtr pDstPicture,
-                       struct exa_pixmap_priv *pDst)
-{
-   unsigned i;
-   struct pipe_framebuffer_state state;
-   struct pipe_surface *surface = exa_gpu_surface(exa, pDst);
-   memset(&state, 0, sizeof(struct pipe_framebuffer_state));
-
-   state.width  = pDstPicture->pDrawable->width;
-   state.height = pDstPicture->pDrawable->height;
-
-   state.nr_cbufs = 1;
-   state.cbufs[0] = surface;
-   for (i = 1; i < PIPE_MAX_COLOR_BUFS; ++i)
-      state.cbufs[i] = 0;
-
-   /* currently we don't use depth/stencil */
-   state.zsbuf = 0;
-
-   cso_set_framebuffer(exa->cso, &state);
-}
-
-enum AxisOrientation {
-   Y0_BOTTOM,
-   Y0_TOP
-};
-
-static void
-set_viewport(struct exa_context *exa, int width, int height,
-             enum AxisOrientation orientation)
-{
-   struct pipe_viewport_state viewport;
-   float y_scale = (orientation == Y0_BOTTOM) ? -2.f : 2.f;
-
-   viewport.scale[0] =  width / 2.f;
-   viewport.scale[1] =  height / y_scale;
-   viewport.scale[2] =  1.0;
-   viewport.scale[3] =  1.0;
-   viewport.translate[0] = width / 2.f;
-   viewport.translate[1] = height / 2.f;
-   viewport.translate[2] = 0.0;
-   viewport.translate[3] = 0.0;
-
-   cso_set_viewport(exa->cso, &viewport);
-}
 
-static void
-bind_viewport_state(struct exa_context *exa, PicturePtr pDstPicture)
-{
-   int width = pDstPicture->pDrawable->width;
-   int height = pDstPicture->pDrawable->height;
+   if (blend_for_op(&blend, op,
+                    pSrcPicture, pMaskPicture, pDstPicture)) {
+      /* Check for component alpha */
+      if (pMaskPicture && pMaskPicture->componentAlpha &&
+          PICT_FORMAT_RGB(pMaskPicture->format)) {
+         if (blend.alpha_src && blend.rgb_src != PIPE_BLENDFACTOR_ZERO) {
+            XORG_FALLBACK("Component alpha not supported with source "
+                          "alpha and source value blending. (op=%d)",
+                          op);
+         }
+      }
 
-   set_viewport(exa, width, height, Y0_TOP);
+      return TRUE;
+   }
+   XORG_FALLBACK("Unsupported composition operation = %d", op);
 }
 
 static void
 bind_blend_state(struct exa_context *exa, int op,
-                 PicturePtr pSrcPicture, PicturePtr pMaskPicture)
+                 PicturePtr pSrcPicture,
+                 PicturePtr pMaskPicture,
+                 PicturePtr pDstPicture)
 {
-   boolean component_alpha = pSrcPicture->componentAlpha;
    struct xorg_composite_blend blend_opt;
    struct pipe_blend_state blend;
 
-   if (component_alpha) {
-      op = PictOpOver;
-   }
-   blend_opt = blend_for_op(op);
+   blend_for_op(&blend_opt, op, pSrcPicture, pMaskPicture, pDstPicture);
 
    memset(&blend, 0, sizeof(struct pipe_blend_state));
    blend.blend_enable = 1;
-   blend.colormask |= PIPE_MASK_R;
-   blend.colormask |= PIPE_MASK_G;
-   blend.colormask |= PIPE_MASK_B;
-   blend.colormask |= PIPE_MASK_A;
+   blend.colormask |= PIPE_MASK_RGBA;
 
-   blend.rgb_src_factor   = blend_opt.rgb_src_factor;
-   blend.alpha_src_factor = blend_opt.alpha_src_factor;
-   blend.rgb_dst_factor   = blend_opt.rgb_dst_factor;
-   blend.alpha_dst_factor = blend_opt.alpha_dst_factor;
+   blend.rgb_src_factor   = blend_opt.rgb_src;
+   blend.alpha_src_factor = blend_opt.rgb_src;
+   blend.rgb_dst_factor   = blend_opt.rgb_dst;
+   blend.alpha_dst_factor = blend_opt.rgb_dst;
 
-   cso_set_blend(exa->cso, &blend);
+   cso_set_blend(exa->renderer->cso, &blend);
 }
 
-static void
-bind_rasterizer_state(struct exa_context *exa)
+static unsigned
+picture_format_fixups(struct exa_pixmap_priv *pSrc, PicturePtr pSrcPicture, boolean mask,
+                      PicturePtr pDstPicture)
 {
-   struct pipe_rasterizer_state raster;
-   memset(&raster, 0, sizeof(struct pipe_rasterizer_state));
-   raster.gl_rasterization_rules = 1;
-   cso_set_rasterizer(exa->cso, &raster);
+   boolean set_alpha = FALSE;
+   boolean swizzle = FALSE;
+   unsigned ret = 0;
+
+   if (pSrc->picture_format == pSrcPicture->format) {
+      if (pSrc->picture_format == PICT_a8) {
+         if (mask)
+            return FS_MASK_LUMINANCE;
+         else if (pDstPicture->format != PICT_a8) {
+            /* if both dst and src are luminance then
+             * we don't want to swizzle the alpha (X) of the
+             * source into W component of the dst because
+             * it will break our destination */
+            return FS_SRC_LUMINANCE;
+         }
+      }
+      return 0;
+   }
+
+   if (pSrc->picture_format != PICT_a8r8g8b8) {
+      assert(!"can not handle formats");
+      return 0;
+   }
+
+   /* pSrc->picture_format == PICT_a8r8g8b8 */
+   switch (pSrcPicture->format) {
+   case PICT_x8b8g8r8:
+   case PICT_b8g8r8:
+      set_alpha = TRUE; /* fall through */
+   case PICT_a8b8g8r8:
+      swizzle = TRUE;
+      break;
+   case PICT_x8r8g8b8:
+   case PICT_r8g8b8:
+      set_alpha = TRUE; /* fall through */
+   case PICT_a8r8g8b8:
+      break;
+#ifdef PICT_TYPE_BGRA
+   case PICT_b8g8r8a8:
+   case PICT_b8g8r8x8:
+   case PICT_a2r10g10b10:
+   case PICT_x2r10g10b10:
+   case PICT_a2b10g10r10:
+   case PICT_x2b10g10r10:
+#endif
+   default:
+      assert(!"can not handle formats");
+      return 0;
+   }
+
+   if (set_alpha)
+      ret |= mask ? FS_MASK_SET_ALPHA : FS_SRC_SET_ALPHA;
+   if (swizzle)
+      ret |= mask ? FS_MASK_SWIZZLE_RGB : FS_SRC_SWIZZLE_RGB;
+
+   return ret;
 }
 
 static void
 bind_shaders(struct exa_context *exa, int op,
-             PicturePtr pSrcPicture, PicturePtr pMaskPicture)
+             PicturePtr pSrcPicture, PicturePtr pMaskPicture, PicturePtr pDstPicture,
+             struct exa_pixmap_priv *pSrc, struct exa_pixmap_priv *pMask)
 {
    unsigned vs_traits = 0, fs_traits = 0;
    struct xorg_shader shader;
 
+   exa->has_solid_color = FALSE;
+
    if (pSrcPicture) {
+      if (pSrcPicture->repeatType == RepeatNone && pSrcPicture->transform)
+         fs_traits |= FS_SRC_REPEAT_NONE;
+
       if (pSrcPicture->pSourcePict) {
          if (pSrcPicture->pSourcePict->type == SourcePictTypeSolidFill) {
             fs_traits |= FS_SOLID_FILL;
             vs_traits |= VS_SOLID_FILL;
-            pixel_to_float4(pSrcPicture->pFormat,
-                            pSrcPicture->pSourcePict->solidFill.color,
+            debug_assert(pSrcPicture->format == PICT_a8r8g8b8);
+            pixel_to_float4(pSrcPicture->pSourcePict->solidFill.color,
                             exa->solid_color);
+            exa->has_solid_color = TRUE;
          } else {
             debug_assert("!gradients not supported");
          }
@@ -405,19 +322,33 @@ bind_shaders(struct exa_context *exa, int op,
          fs_traits |= FS_COMPOSITE;
          vs_traits |= VS_COMPOSITE;
       }
+
+      fs_traits |= picture_format_fixups(pSrc, pSrcPicture, FALSE, pDstPicture);
    }
 
    if (pMaskPicture) {
       vs_traits |= VS_MASK;
       fs_traits |= FS_MASK;
+      if (pMaskPicture->repeatType == RepeatNone && pMaskPicture->transform)
+         fs_traits |= FS_MASK_REPEAT_NONE;
+      if (pMaskPicture->componentAlpha) {
+         struct xorg_composite_blend blend;
+         blend_for_op(&blend, op,
+                      pSrcPicture, pMaskPicture, NULL);
+         if (blend.alpha_src) {
+            fs_traits |= FS_CA_SRCALPHA;
+         } else
+            fs_traits |= FS_CA_FULL;
+      }
+
+      fs_traits |= picture_format_fixups(pMask, pMaskPicture, TRUE, pDstPicture);
    }
 
-   shader = xorg_shaders_get(exa->shaders, vs_traits, fs_traits);
-   cso_set_vertex_shader_handle(exa->cso, shader.vs);
-   cso_set_fragment_shader_handle(exa->cso, shader.fs);
+   shader = xorg_shaders_get(exa->renderer->shaders, vs_traits, fs_traits);
+   cso_set_vertex_shader_handle(exa->renderer->cso, shader.vs);
+   cso_set_fragment_shader_handle(exa->renderer->cso, shader.fs);
 }
 
-
 static void
 bind_samplers(struct exa_context *exa, int op,
               PicturePtr pSrcPicture, PicturePtr pMaskPicture,
@@ -431,94 +362,105 @@ bind_samplers(struct exa_context *exa, int op,
 
    exa->num_bound_samplers = 0;
 
+#if 0
+   if ((pSrc && (exa->pipe->is_texture_referenced(exa->pipe, pSrc->tex, 0, 0) &
+                 PIPE_REFERENCED_FOR_WRITE)) ||
+       (pMask && (exa->pipe->is_texture_referenced(exa->pipe, pMask->tex, 0, 0) &
+        PIPE_REFERENCED_FOR_WRITE)))
+      xorg_exa_flush(exa, PIPE_FLUSH_RENDER_CACHE, NULL);
+#endif
+
    memset(&src_sampler, 0, sizeof(struct pipe_sampler_state));
    memset(&mask_sampler, 0, sizeof(struct pipe_sampler_state));
 
    if (pSrcPicture && pSrc) {
-      unsigned src_wrap = render_repeat_to_gallium(
-         pSrcPicture->repeatType);
-      src_sampler.wrap_s = src_wrap;
-      src_sampler.wrap_t = src_wrap;
-      src_sampler.min_img_filter = PIPE_TEX_MIPFILTER_NEAREST;
-      src_sampler.mag_img_filter = PIPE_TEX_MIPFILTER_NEAREST;
-      src_sampler.normalized_coords = 1;
-      samplers[0] = &src_sampler;
-      exa->bound_textures[0] = pSrc->tex;
-      ++exa->num_bound_samplers;
+      if (exa->has_solid_color) {
+         debug_assert(!"solid color with textures");
+         samplers[0] = NULL;
+         exa->bound_textures[0] = NULL;
+      } else {
+         unsigned src_wrap = render_repeat_to_gallium(
+            pSrcPicture->repeatType);
+         int filter;
+
+         render_filter_to_gallium(pSrcPicture->filter, &filter);
+
+         src_sampler.wrap_s = src_wrap;
+         src_sampler.wrap_t = src_wrap;
+         src_sampler.min_img_filter = filter;
+         src_sampler.mag_img_filter = filter;
+         src_sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST;
+         src_sampler.normalized_coords = 1;
+         samplers[0] = &src_sampler;
+         exa->bound_textures[0] = pSrc->tex;
+         exa->num_bound_samplers = 1;
+      }
    }
 
    if (pMaskPicture && pMask) {
       unsigned mask_wrap = render_repeat_to_gallium(
          pMaskPicture->repeatType);
+      int filter;
+      /* translate the mask's Render filter; the boolean result is ignored here */
+      render_filter_to_gallium(pMaskPicture->filter, &filter);
+
       mask_sampler.wrap_s = mask_wrap;
       mask_sampler.wrap_t = mask_wrap;
-      mask_sampler.min_img_filter = PIPE_TEX_MIPFILTER_NEAREST;
-      mask_sampler.mag_img_filter = PIPE_TEX_MIPFILTER_NEAREST;
+      mask_sampler.min_img_filter = filter;
+      mask_sampler.mag_img_filter = filter;
+      mask_sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST;
       mask_sampler.normalized_coords = 1;
       samplers[1] = &mask_sampler;
       exa->bound_textures[1] = pMask->tex;
-      ++exa->num_bound_samplers;
+      exa->num_bound_samplers = 2;
    }
 
-   cso_set_samplers(exa->cso, exa->num_bound_samplers,
+   cso_set_samplers(exa->renderer->cso, exa->num_bound_samplers,
                     (const struct pipe_sampler_state **)samplers);
-   cso_set_sampler_textures(exa->cso, exa->num_bound_samplers,
+   cso_set_sampler_textures(exa->renderer->cso, exa->num_bound_samplers,
                             exa->bound_textures);
 }
 
-static void
-setup_vs_constant_buffer(struct exa_context *exa,
-                         int width, int height)
-{
-   const int param_bytes = 8 * sizeof(float);
-   float vs_consts[8] = {
-      2.f/width, 2.f/height, 1, 1,
-      -1, -1, 0, 0
-   };
-   struct pipe_constant_buffer *cbuf = &exa->vs_const_buffer;
-
-   pipe_buffer_reference(&cbuf->buffer, NULL);
-   cbuf->buffer = pipe_buffer_create(exa->ctx->screen, 16,
-                                     PIPE_BUFFER_USAGE_CONSTANT,
-                                     param_bytes);
-
-   if (cbuf->buffer) {
-      pipe_buffer_write(exa->ctx->screen, cbuf->buffer,
-                        0, param_bytes, vs_consts);
-   }
-   exa->ctx->set_constant_buffer(exa->ctx, PIPE_SHADER_VERTEX, 0, cbuf);
-}
 
 
-static void
-setup_fs_constant_buffer(struct exa_context *exa)
+
+
+static INLINE boolean matrix_from_pict_transform(PictTransform *trans, float *matrix)
 {
-   const int param_bytes = 4 * sizeof(float);
-   float fs_consts[8] = {
-      0, 0, 0, 1,
-   };
-   struct pipe_constant_buffer *cbuf = &exa->fs_const_buffer;
-
-   pipe_buffer_reference(&cbuf->buffer, NULL);
-   cbuf->buffer = pipe_buffer_create(exa->ctx->screen, 16,
-                                     PIPE_BUFFER_USAGE_CONSTANT,
-                                     param_bytes);
-
-   if (cbuf->buffer) {
-      pipe_buffer_write(exa->ctx->screen, cbuf->buffer,
-                        0, param_bytes, fs_consts);
-   }
-   exa->ctx->set_constant_buffer(exa->ctx, PIPE_SHADER_FRAGMENT, 0, cbuf);
+   if (!trans)
+      return FALSE;
+
+   matrix[0] = XFixedToDouble(trans->matrix[0][0]);
+   matrix[3] = XFixedToDouble(trans->matrix[0][1]);
+   matrix[6] = XFixedToDouble(trans->matrix[0][2]);
+
+   matrix[1] = XFixedToDouble(trans->matrix[1][0]);
+   matrix[4] = XFixedToDouble(trans->matrix[1][1]);
+   matrix[7] = XFixedToDouble(trans->matrix[1][2]);
+
+   matrix[2] = XFixedToDouble(trans->matrix[2][0]);
+   matrix[5] = XFixedToDouble(trans->matrix[2][1]);
+   matrix[8] = XFixedToDouble(trans->matrix[2][2]);
+
+   return TRUE;
 }
 
 static void
-setup_constant_buffers(struct exa_context *exa, PicturePtr pDstPicture)
+setup_transforms(struct  exa_context *exa,
+                 PicturePtr pSrcPicture, PicturePtr pMaskPicture)
 {
-   int width = pDstPicture->pDrawable->width;
-   int height = pDstPicture->pDrawable->height;
-
-   setup_vs_constant_buffer(exa, width, height);
-   setup_fs_constant_buffer(exa);
+   PictTransform *src_t = NULL;
+   PictTransform *mask_t = NULL;
+
+   if (pSrcPicture)
+      src_t = pSrcPicture->transform;
+   if (pMaskPicture)
+      mask_t = pMaskPicture->transform;
+
+   exa->transform.has_src  =
+      matrix_from_pict_transform(src_t, exa->transform.src);
+   exa->transform.has_mask =
+      matrix_from_pict_transform(mask_t, exa->transform.mask);
 }
 
 boolean xorg_composite_bind_state(struct exa_context *exa,
@@ -530,17 +472,30 @@ boolean xorg_composite_bind_state(struct exa_context *exa,
                                   struct exa_pixmap_priv *pMask,
                                   struct exa_pixmap_priv *pDst)
 {
-   bind_framebuffer_state(exa, pDstPicture, pDst);
-   bind_viewport_state(exa, pDstPicture);
-   bind_blend_state(exa, op, pSrcPicture, pMaskPicture);
-   bind_rasterizer_state(exa);
-   bind_shaders(exa, op, pSrcPicture, pMaskPicture);
+   struct pipe_surface *dst_surf = xorg_gpu_surface(exa->scrn, pDst);
+
+   renderer_bind_destination(exa->renderer, dst_surf,
+                             pDst->width,
+                             pDst->height);
+
+   bind_blend_state(exa, op, pSrcPicture, pMaskPicture, pDstPicture);
+   bind_shaders(exa, op, pSrcPicture, pMaskPicture, pDstPicture, pSrc, pMask);
    bind_samplers(exa, op, pSrcPicture, pMaskPicture,
                  pDstPicture, pSrc, pMask, pDst);
 
-   setup_constant_buffers(exa, pDstPicture);
+   setup_transforms(exa, pSrcPicture, pMaskPicture);
+
+   if (exa->num_bound_samplers == 0 ) { /* solid fill */
+      renderer_begin_solid(exa->renderer);
+   } else {
+      renderer_begin_textures(exa->renderer,
+                              exa->bound_textures,
+                              exa->num_bound_samplers);
+   }
 
-   return FALSE;
+
+   pipe_surface_reference(&dst_surf, NULL);
+   return TRUE;
 }
 
 void xorg_composite(struct exa_context *exa,
@@ -548,37 +503,81 @@ void xorg_composite(struct exa_context *exa,
                     int srcX, int srcY, int maskX, int maskY,
                     int dstX, int dstY, int width, int height)
 {
-   struct pipe_context *pipe = exa->ctx;
-   struct pipe_buffer *buf = 0;
-
    if (exa->num_bound_samplers == 0 ) { /* solid fill */
-      buf = setup_vertex_data0(exa,
-                               srcX, srcY, maskX, maskY,
-                               dstX, dstY, width, height);
-   } else if (exa->num_bound_samplers == 1 ) /* src */
-      buf = setup_vertex_data1(exa,
-                               srcX, srcY, maskX, maskY,
-                               dstX, dstY, width, height);
-   else if (exa->num_bound_samplers == 2) /* src + mask */
-      buf = setup_vertex_data2(exa,
-                               srcX, srcY, maskX, maskY,
-                               dstX, dstY, width, height);
-   else if (exa->num_bound_samplers == 3) { /* src + mask + dst */
-      debug_assert(!"src/mask/dst not handled right now");
+      renderer_solid(exa->renderer,
+                     dstX, dstY, dstX + width, dstY + height,
+                     exa->solid_color);
+   } else {
+      int pos[6] = {srcX, srcY, maskX, maskY, dstX, dstY};
+      float *src_matrix = NULL;
+      float *mask_matrix = NULL;
+
+      if (exa->transform.has_src)
+         src_matrix = exa->transform.src;
+      if (exa->transform.has_mask)
+         mask_matrix = exa->transform.mask;
+
+      renderer_texture(exa->renderer,
+                       pos, width, height,
+                       exa->bound_textures,
+                       exa->num_bound_samplers,
+                       src_matrix, mask_matrix);
+   }
+}
+
+boolean xorg_solid_bind_state(struct exa_context *exa,
+                              struct exa_pixmap_priv *pixmap,
+                              Pixel fg)
+{
+   struct pipe_surface *dst_surf = xorg_gpu_surface(exa->scrn, pixmap);
+   unsigned vs_traits, fs_traits;
+   struct xorg_shader shader;
+
+   pixel_to_float4(fg, exa->solid_color);
+   exa->has_solid_color = TRUE;
+
 #if 0
-      buf = setup_vertex_data2(exa,
-                               srcX, srcY, maskX, maskY,
-                               dstX, dstY, width, height);
+   debug_printf("Color Pixel=(%d, %d, %d, %d), RGBA=(%f, %f, %f, %f)\n",
+                (fg >> 24) & 0xff, (fg >> 16) & 0xff,
+                (fg >> 8) & 0xff,  (fg >> 0) & 0xff,
+                exa->solid_color[0], exa->solid_color[1],
+                exa->solid_color[2], exa->solid_color[3]);
 #endif
-   }
 
-   if (buf) {
-      util_draw_vertex_buffer(pipe, buf, 0,
-                              PIPE_PRIM_TRIANGLE_FAN,
-                              4,  /* verts */
-                              1 + exa->num_bound_samplers); /* attribs/vert */
+   vs_traits = VS_SOLID_FILL;
+   fs_traits = FS_SOLID_FILL;
 
-      pipe_buffer_reference(&buf, NULL);
-   }
+   renderer_bind_destination(exa->renderer, dst_surf, 
+                             pixmap->width, pixmap->height);
+   bind_blend_state(exa, PictOpSrc, NULL, NULL, NULL);
+   cso_set_samplers(exa->renderer->cso, 0, NULL);
+   cso_set_sampler_textures(exa->renderer->cso, 0, NULL);
+
+   shader = xorg_shaders_get(exa->renderer->shaders, vs_traits, fs_traits);
+   cso_set_vertex_shader_handle(exa->renderer->cso, shader.vs);
+   cso_set_fragment_shader_handle(exa->renderer->cso, shader.fs);
+
+   renderer_begin_solid(exa->renderer);
+
+   pipe_surface_reference(&dst_surf, NULL);
+   return TRUE;
 }
 
+void xorg_solid(struct exa_context *exa,
+                struct exa_pixmap_priv *pixmap,
+                int x0, int y0, int x1, int y1)
+{
+   renderer_solid(exa->renderer,
+                  x0, y0, x1, y1, exa->solid_color);
+}
+
+void
+xorg_composite_done(struct exa_context *exa)
+{
+   renderer_draw_flush(exa->renderer);
+   /* Reset the per-operation flags and sampler count kept on the exa context. */
+   exa->transform.has_src = FALSE;
+   exa->transform.has_mask = FALSE;
+   exa->has_solid_color = FALSE;
+   exa->num_bound_samplers = 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xorg_composite.h b/src/gallium/state_trackers/xorg/xorg_composite.h
index 17dfcb199ea..ec71ebfe0dc 100644
--- a/src/gallium/state_trackers/xorg/xorg_composite.h
+++ b/src/gallium/state_trackers/xorg/xorg_composite.h
@@ -22,4 +22,15 @@ void xorg_composite(struct exa_context *exa,
                     int srcX, int srcY, int maskX, int maskY,
                     int dstX, int dstY, int width, int height);
 
+boolean xorg_solid_bind_state(struct exa_context *exa,
+                              struct exa_pixmap_priv *pixmap,
+                              Pixel fg);
+void xorg_solid(struct exa_context *exa,
+                struct exa_pixmap_priv *pixmap,
+                int x0, int y0, int x1, int y1);
+
+
+void
+xorg_composite_done(struct exa_context *exa);
+
 #endif
diff --git a/src/gallium/state_trackers/xorg/xorg_crtc.c b/src/gallium/state_trackers/xorg/xorg_crtc.c
index 67fe29a69da..ddcaedde37e 100644
--- a/src/gallium/state_trackers/xorg/xorg_crtc.c
+++ b/src/gallium/state_trackers/xorg/xorg_crtc.c
@@ -52,20 +52,24 @@
 #include "pipe/p_inlines.h"
 #include "util/u_rect.h"
 
+#ifdef HAVE_LIBKMS
+#include "libkms.h"
+#endif
+
 struct crtc_private
 {
     drmModeCrtcPtr drm_crtc;
 
     /* hwcursor */
     struct pipe_texture *cursor_tex;
+    struct kms_bo *cursor_bo;
+
     unsigned cursor_handle;
 };
 
 static void
 crtc_dpms(xf86CrtcPtr crtc, int mode)
 {
-    //ScrnInfoPtr pScrn = crtc->scrn;
-
     switch (mode) {
     case DPMSModeOn:
     case DPMSModeStandby:
@@ -77,44 +81,29 @@ crtc_dpms(xf86CrtcPtr crtc, int mode)
 }
 
 static Bool
-crtc_lock(xf86CrtcPtr crtc)
-{
-    return FALSE;
-}
-
-static void
-crtc_unlock(xf86CrtcPtr crtc)
-{
-}
-
-static void
-crtc_prepare(xf86CrtcPtr crtc)
-{
-}
-
-static void
-crtc_commit(xf86CrtcPtr crtc)
-{
-}
-
-static Bool
-crtc_mode_fixup(xf86CrtcPtr crtc, DisplayModePtr mode,
-		DisplayModePtr adjusted_mode)
-{
-    return TRUE;
-}
-
-static void
-crtc_mode_set(xf86CrtcPtr crtc, DisplayModePtr mode,
-	      DisplayModePtr adjusted_mode, int x, int y)
+crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
+		    Rotation rotation, int x, int y)
 {
     xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(crtc->scrn);
     modesettingPtr ms = modesettingPTR(crtc->scrn);
-    xf86OutputPtr output = config->output[config->compat_output];
-    drmModeConnectorPtr drm_connector = output->driver_private;
+    xf86OutputPtr output = NULL;
+    drmModeConnectorPtr drm_connector;
     struct crtc_private *crtcp = crtc->driver_private;
     drmModeCrtcPtr drm_crtc = crtcp->drm_crtc;
     drmModeModeInfo drm_mode;
+    int i, ret;
+
+    for (i = 0; i < config->num_output; output = NULL, i++) {
+	output = config->output[i];
+
+	if (output->crtc == crtc)
+	    break;
+    }
+
+    if (!output)
+	return FALSE;
+
+    drm_connector = output->driver_private;
 
     drm_mode.clock = mode->Clock;
     drm_mode.hdisplay = mode->HDisplay;
@@ -133,60 +122,65 @@ crtc_mode_set(xf86CrtcPtr crtc, DisplayModePtr mode,
 	xf86SetModeDefaultName(mode);
     strncpy(drm_mode.name, mode->name, DRM_DISPLAY_MODE_LEN);
 
-    drmModeSetCrtc(ms->fd, drm_crtc->crtc_id, ms->fb_id, x, y,
-		   &drm_connector->connector_id, 1, &drm_mode);
-}
+    ret = drmModeSetCrtc(ms->fd, drm_crtc->crtc_id, ms->fb_id, x, y,
+			 &drm_connector->connector_id, 1, &drm_mode);
 
-#if 0
-static void
-crtc_load_lut(xf86CrtcPtr crtc)
-{
-    //ScrnInfoPtr pScrn = crtc->scrn;
+    if (ret)
+	return FALSE;
+
+    crtc->x = x;
+    crtc->y = y;
+    crtc->mode = *mode;
+    crtc->rotation = rotation;
+
+    return TRUE;
 }
-#endif
 
 static void
 crtc_gamma_set(xf86CrtcPtr crtc, CARD16 * red, CARD16 * green, CARD16 * blue,
 	       int size)
 {
+    /* XXX: hookup */
 }
 
 static void *
 crtc_shadow_allocate(xf86CrtcPtr crtc, int width, int height)
 {
-    //ScrnInfoPtr pScrn = crtc->scrn;
-
     return NULL;
 }
 
 static PixmapPtr
 crtc_shadow_create(xf86CrtcPtr crtc, void *data, int width, int height)
 {
-    //ScrnInfoPtr pScrn = crtc->scrn;
-
     return NULL;
 }
 
 static void
 crtc_shadow_destroy(xf86CrtcPtr crtc, PixmapPtr rotate_pixmap, void *data)
 {
-    //ScrnInfoPtr pScrn = crtc->scrn;
 }
 
+/*
+ * Cursor functions
+ */
+
 static void
-crtc_destroy(xf86CrtcPtr crtc)
+crtc_set_cursor_colors(xf86CrtcPtr crtc, int bg, int fg)
 {
-    struct crtc_private *crtcp = crtc->driver_private;
+    /* XXX: See if this one is needed, as we only support ARGB cursors */
+}
 
-    if (crtcp->cursor_tex)
-	pipe_texture_reference(&crtcp->cursor_tex, NULL);
+static void
+crtc_set_cursor_position(xf86CrtcPtr crtc, int x, int y)
+{
+    modesettingPtr ms = modesettingPTR(crtc->scrn);
+    struct crtc_private *crtcp = crtc->driver_private;
 
-    drmModeFreeCrtc(crtcp->drm_crtc);
-    xfree(crtcp);
+    drmModeMoveCursor(ms->fd, crtcp->drm_crtc->crtc_id, x, y);
 }
 
 static void
-crtc_load_cursor_argb(xf86CrtcPtr crtc, CARD32 * image)
+crtc_load_cursor_argb_ga3d(xf86CrtcPtr crtc, CARD32 * image)
 {
     unsigned char *ptr;
     modesettingPtr ms = modesettingPTR(crtc->scrn);
@@ -229,13 +223,54 @@ crtc_load_cursor_argb(xf86CrtcPtr crtc, CARD32 * image)
     ms->screen->tex_transfer_destroy(transfer);
 }
 
+#ifdef HAVE_LIBKMS
+/* Copy a 64x64 ARGB cursor image into the crtc's kms cursor bo (made on first use). */
 static void
-crtc_set_cursor_position(xf86CrtcPtr crtc, int x, int y)
+crtc_load_cursor_argb_kms(xf86CrtcPtr crtc, CARD32 * image)
 {
     modesettingPtr ms = modesettingPTR(crtc->scrn);
     struct crtc_private *crtcp = crtc->driver_private;
+    void *ptr = NULL;
 
-    drmModeMoveCursor(ms->fd, crtcp->drm_crtc->crtc_id, x, y);
+    if (!crtcp->cursor_bo) {
+	unsigned attr[8];
+	attr[0] = KMS_BO_TYPE;
+	attr[1] = KMS_BO_TYPE_CURSOR;
+	attr[2] = KMS_WIDTH;
+	attr[3] = 64;
+	attr[4] = KMS_HEIGHT;
+	attr[5] = 64;
+	attr[6] = 0;
+
+	if (kms_bo_create(ms->kms, attr, &crtcp->cursor_bo))
+	    return;
+	if (kms_bo_get_prop(crtcp->cursor_bo, KMS_HANDLE,
+			    &crtcp->cursor_handle))
+	    goto err_bo_destroy;
+    }
+
+    if (kms_bo_map(crtcp->cursor_bo, &ptr))
+	return; /* no mapping, so there is nothing valid to copy into */
+    memcpy(ptr, image, 64*64*4);
+    kms_bo_unmap(crtcp->cursor_bo);
+    return;
+
+err_bo_destroy:
+    kms_bo_destroy(crtcp->cursor_bo);
+    crtcp->cursor_bo = NULL; /* never reuse the destroyed bo on the next call */
+}
+#endif
+
+static void
+crtc_load_cursor_argb(xf86CrtcPtr crtc, CARD32 * image)
+{
+    modesettingPtr ms = modesettingPTR(crtc->scrn);
+    if (ms->screen)
+	crtc_load_cursor_argb_ga3d(crtc, image);
+#ifdef HAVE_LIBKMS
+    else if (ms->kms)
+	crtc_load_cursor_argb_kms(crtc, image);
+#endif
 }
 
 static void
@@ -244,7 +279,7 @@ crtc_show_cursor(xf86CrtcPtr crtc)
     modesettingPtr ms = modesettingPTR(crtc->scrn);
     struct crtc_private *crtcp = crtc->driver_private;
 
-    if (crtcp->cursor_tex)
+    if (crtcp->cursor_tex || crtcp->cursor_bo)
 	drmModeSetCursor(ms->fd, crtcp->drm_crtc->crtc_id,
 			 crtcp->cursor_handle, 64, 64);
 }
@@ -258,41 +293,60 @@ crtc_hide_cursor(xf86CrtcPtr crtc)
     drmModeSetCursor(ms->fd, crtcp->drm_crtc->crtc_id, 0, 0, 0);
 }
 
+/**
+ * Called at vt leave (and from crtc_destroy): releases cursor storage only.
+ * The crtc_private stays allocated; crtc_destroy() is the one place it is freed.
+ */
+void
+xorg_crtc_cursor_destroy(xf86CrtcPtr crtc)
+{
+    struct crtc_private *crtcp = crtc->driver_private;
+
+    if (crtcp->cursor_tex)
+	pipe_texture_reference(&crtcp->cursor_tex, NULL);
+#ifdef HAVE_LIBKMS
+    if (crtcp->cursor_bo) {
+	kms_bo_destroy(crtcp->cursor_bo);
+	crtcp->cursor_bo = NULL; /* so a later load_cursor_argb recreates it */
+    }
+#endif
+}
+
+/*
+ * Misc functions
+ */
+
+/* Final teardown: drop cursor storage, free the DRM crtc, then the private. */
+static void
+crtc_destroy(xf86CrtcPtr crtc)
+{
+    struct crtc_private *crtcp = crtc->driver_private;
+
+    xorg_crtc_cursor_destroy(crtc);
+    drmModeFreeCrtc(crtcp->drm_crtc);
+    xfree(crtcp);
+}
+
 static const xf86CrtcFuncsRec crtc_funcs = {
     .dpms = crtc_dpms,
-    .save = NULL,
-    .restore = NULL,
-    .lock = crtc_lock,
-    .unlock = crtc_unlock,
-    .mode_fixup = crtc_mode_fixup,
-    .prepare = crtc_prepare,
-    .mode_set = crtc_mode_set,
-    .commit = crtc_commit,
-    .gamma_set = crtc_gamma_set,
-    .shadow_create = crtc_shadow_create,
-    .shadow_allocate = crtc_shadow_allocate,
-    .shadow_destroy = crtc_shadow_destroy,
+    .set_mode_major = crtc_set_mode_major,
+
+    .set_cursor_colors = crtc_set_cursor_colors,
     .set_cursor_position = crtc_set_cursor_position,
     .show_cursor = crtc_show_cursor,
     .hide_cursor = crtc_hide_cursor,
-    .load_cursor_image = NULL,	       /* lets convert to argb only */
-    .set_cursor_colors = NULL,	       /* using argb only */
     .load_cursor_argb = crtc_load_cursor_argb,
-    .destroy = crtc_destroy,
-};
 
-void
-crtc_cursor_destroy(xf86CrtcPtr crtc)
-{
-    struct crtc_private *crtcp = crtc->driver_private;
+    .shadow_create = crtc_shadow_create,
+    .shadow_allocate = crtc_shadow_allocate,
+    .shadow_destroy = crtc_shadow_destroy,
 
-    if (crtcp->cursor_tex) {
-	pipe_texture_reference(&crtcp->cursor_tex, NULL);
-    }
-}
+    .gamma_set = crtc_gamma_set,
+    .destroy = crtc_destroy,
+};
 
 void
-crtc_init(ScrnInfoPtr pScrn)
+xorg_crtc_init(ScrnInfoPtr pScrn)
 {
     modesettingPtr ms = modesettingPTR(pScrn);
     xf86CrtcPtr crtc;
@@ -309,6 +363,7 @@ crtc_init(ScrnInfoPtr pScrn)
 
     for (c = 0; c < res->count_crtcs; c++) {
 	drm_crtc = drmModeGetCrtc(ms->fd, res->crtcs[c]);
+
 	if (!drm_crtc)
 	    continue;
 
@@ -325,7 +380,6 @@ crtc_init(ScrnInfoPtr pScrn)
 	crtcp->drm_crtc = drm_crtc;
 
 	crtc->driver_private = crtcp;
-
     }
 
   out:
diff --git a/src/gallium/state_trackers/xorg/xorg_dri2.c b/src/gallium/state_trackers/xorg/xorg_dri2.c
index 6431a0fe254..4fa47548a43 100644
--- a/src/gallium/state_trackers/xorg/xorg_dri2.c
+++ b/src/gallium/state_trackers/xorg/xorg_dri2.c
@@ -42,6 +42,12 @@
 
 #include "util/u_rect.h"
 
+/* Make all the #if cases in the code easier to read */
+/* XXX can it be set to 1? */
+#ifndef DRI2INFOREC_VERSION
+#define DRI2INFOREC_VERSION 0
+#endif
+
 typedef struct {
     PixmapPtr pPixmap;
     struct pipe_texture *tex;
@@ -49,7 +55,7 @@ typedef struct {
 } *BufferPrivatePtr;
 
 static Bool
-driDoCreateBuffer(DrawablePtr pDraw, DRI2BufferPtr buffer, unsigned int format)
+dri2_do_create_buffer(DrawablePtr pDraw, DRI2BufferPtr buffer, unsigned int format)
 {
     struct pipe_texture *tex = NULL;
     ScreenPtr pScreen = pDraw->pScreen;
@@ -79,13 +85,16 @@ driDoCreateBuffer(DrawablePtr pDraw, DRI2BufferPtr buffer, unsigned int format)
     case DRI2BufferFrontLeft:
 	break;
     case DRI2BufferStencil:
-#if defined(DRI2INFOREC_VERSION) && DRI2INFOREC_VERSION > 2
+#if DRI2INFOREC_VERSION >= 3
     case DRI2BufferDepthStencil:
+#else
+    /* Works on old X servers because sanity checking is for the weak */
+    case 9:
+#endif
 	if (exa_priv->depth_stencil_tex &&
 	    !pf_is_depth_stencil(exa_priv->depth_stencil_tex->format))
 	    exa_priv->depth_stencil_tex = NULL;
         /* Fall through */
-#endif
     case DRI2BufferDepth:
 	if (exa_priv->depth_stencil_tex)
 	    pipe_texture_reference(&tex, exa_priv->depth_stencil_tex);
@@ -118,8 +127,12 @@ driDoCreateBuffer(DrawablePtr pDraw, DRI2BufferPtr buffer, unsigned int format)
     }
 
     if (!tex) {
+	/* First call to make sure we have a pixmap private */
+	exaMoveInPixmap(private->pPixmap);
 	xorg_exa_set_shared_usage(private->pPixmap);
 	pScreen->ModifyPixmapHeader(private->pPixmap, 0, 0, 0, 0, 0, NULL);
+	/* Second call to make sure texture has valid contents */
+	exaMoveInPixmap(private->pPixmap);
 	tex = xorg_exa_get_texture(private->pPixmap);
     }
 
@@ -133,13 +146,18 @@ driDoCreateBuffer(DrawablePtr pDraw, DRI2BufferPtr buffer, unsigned int format)
     buffer->cpp = 4;
     buffer->driverPrivate = private;
     buffer->flags = 0; /* not tiled */
+#if DRI2INFOREC_VERSION == 2
+    ((DRI2Buffer2Ptr)buffer)->format = 0;
+#elif DRI2INFOREC_VERSION >= 3
+    buffer->format = 0;
+#endif
     private->tex = tex;
 
     return TRUE;
 }
 
 static void
-driDoDestroyBuffer(DrawablePtr pDraw, DRI2BufferPtr buffer)
+dri2_do_destroy_buffer(DrawablePtr pDraw, DRI2BufferPtr buffer)
 {
     ScreenPtr pScreen = pDraw->pScreen;
     ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
@@ -153,12 +171,12 @@ driDoDestroyBuffer(DrawablePtr pDraw, DRI2BufferPtr buffer)
     (*pScreen->DestroyPixmap)(private->pPixmap);
 }
 
-#if defined(DRI2INFOREC_VERSION) && DRI2INFOREC_VERSION > 2
+#if DRI2INFOREC_VERSION >= 2
 
-static DRI2BufferPtr
-driCreateBuffer(DrawablePtr pDraw, unsigned int attachment, unsigned int format)
+static DRI2Buffer2Ptr
+dri2_create_buffer(DrawablePtr pDraw, unsigned int attachment, unsigned int format)
 {
-    DRI2BufferPtr buffer;
+    DRI2Buffer2Ptr buffer;
     BufferPrivatePtr private;
 
     buffer = xcalloc(1, sizeof *buffer);
@@ -173,7 +191,8 @@ driCreateBuffer(DrawablePtr pDraw, unsigned int attachment, unsigned int format)
     buffer->attachment = attachment;
     buffer->driverPrivate = private;
 
-    if (driDoCreateBuffer(pDraw, buffer, format))
+    /* So far it is safe to downcast a DRI2Buffer2Ptr to DRI2BufferPtr */
+    if (dri2_do_create_buffer(pDraw, (DRI2BufferPtr)buffer, format))
 	return buffer;
 
     xfree(private);
@@ -183,18 +202,19 @@ fail:
 }
 
 static void
-driDestroyBuffer(DrawablePtr pDraw, DRI2BufferPtr buffer)
+dri2_destroy_buffer(DrawablePtr pDraw, DRI2Buffer2Ptr buffer)
 {
-    driDoDestroyBuffer(pDraw, buffer);
+    /* So far it is safe to downcast a DRI2Buffer2Ptr to DRI2BufferPtr */
+    dri2_do_destroy_buffer(pDraw, (DRI2BufferPtr)buffer);
 
     xfree(buffer->driverPrivate);
     xfree(buffer);
 }
 
-#else /* DRI2INFOREC_VERSION <= 2 */
+#else /* DRI2INFOREC_VERSION < 2 */
 
 static DRI2BufferPtr
-driCreateBuffers(DrawablePtr pDraw, unsigned int *attachments, int count)
+dri2_create_buffers(DrawablePtr pDraw, unsigned int *attachments, int count)
 {
     BufferPrivatePtr privates;
     DRI2BufferPtr buffers;
@@ -212,7 +232,7 @@ driCreateBuffers(DrawablePtr pDraw, unsigned int *attachments, int count)
 	buffers[i].attachment = attachments[i];
 	buffers[i].driverPrivate = &privates[i];
 
-	if (!driDoCreateBuffer(pDraw, &buffers[i], 0))
+	if (!dri2_do_create_buffer(pDraw, &buffers[i], 0))
 	    goto fail;
     }
 
@@ -227,12 +247,12 @@ fail_buffers:
 }
 
 static void
-driDestroyBuffers(DrawablePtr pDraw, DRI2BufferPtr buffers, int count)
+dri2_destroy_buffers(DrawablePtr pDraw, DRI2BufferPtr buffers, int count)
 {
     int i;
 
     for (i = 0; i < count; i++) {
-	driDoDestroyBuffer(pDraw, &buffers[i]);
+	dri2_do_destroy_buffer(pDraw, &buffers[i]);
     }
 
     if (buffers) {
@@ -241,21 +261,22 @@ driDestroyBuffers(DrawablePtr pDraw, DRI2BufferPtr buffers, int count)
     }
 }
 
-#endif /* DRI2INFOREC_VERSION */
+#endif /* DRI2INFOREC_VERSION >= 2 */
 
 static void
-driCopyRegion(DrawablePtr pDraw, RegionPtr pRegion,
-              DRI2BufferPtr pDestBuffer, DRI2BufferPtr pSrcBuffer)
+dri2_copy_region(DrawablePtr pDraw, RegionPtr pRegion,
+                 DRI2BufferPtr pDestBuffer, DRI2BufferPtr pSrcBuffer)
 {
     ScreenPtr pScreen = pDraw->pScreen;
     ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
     modesettingPtr ms = modesettingPTR(pScrn);
     BufferPrivatePtr dst_priv = pDestBuffer->driverPrivate;
     BufferPrivatePtr src_priv = pSrcBuffer->driverPrivate;
-    PixmapPtr src_pixmap;
-    PixmapPtr dst_pixmap;
+    DrawablePtr src_draw;
+    DrawablePtr dst_draw;
     GCPtr gc;
     RegionPtr copy_clip;
+    Bool save_accel;
 
     /*
      * In driCreateBuffers we dewrap windows into the
@@ -263,12 +284,10 @@ driCopyRegion(DrawablePtr pDraw, RegionPtr pRegion,
      * We need to use the real drawable in CopyArea
      * so that cliprects and offsets are correct.
      */
-    src_pixmap = src_priv->pPixmap;
-    dst_pixmap = dst_priv->pPixmap;
-    if (pSrcBuffer->attachment == DRI2BufferFrontLeft)
-	src_pixmap = (PixmapPtr)pDraw;
-    if (pDestBuffer->attachment == DRI2BufferFrontLeft)
-	dst_pixmap = (PixmapPtr)pDraw;
+    src_draw = (pSrcBuffer->attachment == DRI2BufferFrontLeft) ? pDraw :
+       &src_priv->pPixmap->drawable;
+    dst_draw = (pDestBuffer->attachment == DRI2BufferFrontLeft) ? pDraw :
+       &dst_priv->pPixmap->drawable;
 
     /*
      * The clients implements glXWaitX with a copy front to fake and then
@@ -287,7 +306,7 @@ driCopyRegion(DrawablePtr pDraw, RegionPtr pRegion,
      * must in the glXWaitGL case but we don't know if this is a glXWaitGL
      * or a glFlush/glFinish call.
      */
-    if (dst_pixmap == src_pixmap) {
+    if (dst_priv->pPixmap == src_priv->pPixmap) {
 	/* pixmap glXWaitX */
 	if (pSrcBuffer->attachment == DRI2BufferFrontLeft &&
 	    pDestBuffer->attachment == DRI2BufferFakeFrontLeft) {
@@ -308,7 +327,7 @@ driCopyRegion(DrawablePtr pDraw, RegionPtr pRegion,
     copy_clip = REGION_CREATE(pScreen, NULL, 0);
     REGION_COPY(pScreen, copy_clip, pRegion);
     (*gc->funcs->ChangeClip) (gc, CT_REGION, copy_clip, 0);
-    ValidateGC(&dst_pixmap->drawable, gc);
+    ValidateGC(dst_draw, gc);
 
     /* If this is a full buffer swap, throttle on the previous one */
     if (dst_priv->fence && REGION_NUM_RECTS(pRegion) == 1) {
@@ -321,8 +340,21 @@ driCopyRegion(DrawablePtr pDraw, RegionPtr pRegion,
 	}
     }
 
-    (*gc->ops->CopyArea)(&src_pixmap->drawable, &dst_pixmap->drawable, gc,
+    /* Try to make sure the blit will be accelerated */
+    save_accel = ms->exa->accel;
+    ms->exa->accel = TRUE;
+
+    /* In case it won't be though, make sure the GPU copy contents of the
+     * source pixmap will be used for the software fallback - presumably the
+     * client modified them before calling in here.
+     */
+    exaMoveInPixmap(src_priv->pPixmap);
+    DamageRegionAppend(src_draw, pRegion);
+    DamageRegionProcessPending(src_draw);
+
+    (*gc->ops->CopyArea)(src_draw, dst_draw, gc,
 			 0, 0, pDraw->width, pDraw->height, 0, 0);
+    ms->exa->accel = save_accel;
 
     FreeScratchGC(gc);
 
@@ -332,13 +364,13 @@ driCopyRegion(DrawablePtr pDraw, RegionPtr pRegion,
 }
 
 Bool
-driScreenInit(ScreenPtr pScreen)
+xorg_dri2_init(ScreenPtr pScreen)
 {
     ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
     modesettingPtr ms = modesettingPTR(pScrn);
     DRI2InfoRec dri2info;
 
-#if defined(DRI2INFOREC_VERSION)
+#if DRI2INFOREC_VERSION >= 2
     dri2info.version = DRI2INFOREC_VERSION;
 #else
     dri2info.version = 1;
@@ -348,14 +380,14 @@ driScreenInit(ScreenPtr pScreen)
     dri2info.driverName = pScrn->driverName;
     dri2info.deviceName = "/dev/dri/card0"; /* FIXME */
 
-#if defined(DRI2INFOREC_VERSION) && DRI2INFOREC_VERSION > 2
-    dri2info.CreateBuffer = driCreateBuffer;
-    dri2info.DestroyBuffer = driDestroyBuffer;
+#if DRI2INFOREC_VERSION >= 2
+    dri2info.CreateBuffer = dri2_create_buffer;
+    dri2info.DestroyBuffer = dri2_destroy_buffer;
 #else
-    dri2info.CreateBuffers = driCreateBuffers;
-    dri2info.DestroyBuffers = driDestroyBuffers;
+    dri2info.CreateBuffers = dri2_create_buffers;
+    dri2info.DestroyBuffers = dri2_destroy_buffers;
 #endif
-    dri2info.CopyRegion = driCopyRegion;
+    dri2info.CopyRegion = dri2_copy_region;
     dri2info.Wait = NULL;
 
     ms->d_depth_bits_last =
@@ -371,7 +403,7 @@ driScreenInit(ScreenPtr pScreen)
 }
 
 void
-driCloseScreen(ScreenPtr pScreen)
+xorg_dri2_close(ScreenPtr pScreen)
 {
     DRI2CloseScreen(pScreen);
 }
diff --git a/src/gallium/state_trackers/xorg/xorg_driver.c b/src/gallium/state_trackers/xorg/xorg_driver.c
index 643b6b3b9e4..da86295c316 100644
--- a/src/gallium/state_trackers/xorg/xorg_driver.c
+++ b/src/gallium/state_trackers/xorg/xorg_driver.c
@@ -56,32 +56,38 @@
 #include "xorg_tracker.h"
 #include "xorg_winsys.h"
 
-static void AdjustFrame(int scrnIndex, int x, int y, int flags);
-static Bool CloseScreen(int scrnIndex, ScreenPtr pScreen);
-static Bool EnterVT(int scrnIndex, int flags);
-static Bool SaveHWState(ScrnInfoPtr pScrn);
-static Bool RestoreHWState(ScrnInfoPtr pScrn);
-
-
-static ModeStatus ValidMode(int scrnIndex, DisplayModePtr mode, Bool verbose,
-			    int flags);
-static void FreeScreen(int scrnIndex, int flags);
-static void LeaveVT(int scrnIndex, int flags);
-static Bool SwitchMode(int scrnIndex, DisplayModePtr mode, int flags);
-static Bool ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc,
-		       char **argv);
-static Bool PreInit(ScrnInfoPtr pScrn, int flags);
+#ifdef HAVE_LIBKMS
+#include "libkms.h"
+#endif
+
+/*
+ * Functions and symbols exported to Xorg via pointers.
+ */
+
+static Bool drv_pre_init(ScrnInfoPtr pScrn, int flags);
+static Bool drv_screen_init(int scrnIndex, ScreenPtr pScreen, int argc,
+			    char **argv);
+static Bool drv_switch_mode(int scrnIndex, DisplayModePtr mode, int flags);
+static void drv_adjust_frame(int scrnIndex, int x, int y, int flags);
+static Bool drv_enter_vt(int scrnIndex, int flags);
+static void drv_leave_vt(int scrnIndex, int flags);
+static void drv_free_screen(int scrnIndex, int flags);
+static ModeStatus drv_valid_mode(int scrnIndex, DisplayModePtr mode, Bool verbose,
+			         int flags);
 
 typedef enum
 {
     OPTION_SW_CURSOR,
-} modesettingOpts;
+    OPTION_2D_ACCEL,
+} drv_option_enums;
 
-static const OptionInfoRec Options[] = {
+static const OptionInfoRec drv_options[] = {
     {OPTION_SW_CURSOR, "SWcursor", OPTV_BOOLEAN, {0}, FALSE},
+    {OPTION_2D_ACCEL, "2DAccel", OPTV_BOOLEAN, {0}, FALSE},
     {-1, NULL, OPTV_NONE, {0}, FALSE}
 };
 
+
 /*
  * Exported Xorg driver functions to winsys
  */
@@ -89,28 +95,39 @@ static const OptionInfoRec Options[] = {
 const OptionInfoRec *
 xorg_tracker_available_options(int chipid, int busid)
 {
-    return Options;
+    return drv_options;
 }
 
 void
 xorg_tracker_set_functions(ScrnInfoPtr scrn)
 {
-    scrn->PreInit = PreInit;
-    scrn->ScreenInit = ScreenInit;
-    scrn->SwitchMode = SwitchMode;
-    scrn->AdjustFrame = AdjustFrame;
-    scrn->EnterVT = EnterVT;
-    scrn->LeaveVT = LeaveVT;
-    scrn->FreeScreen = FreeScreen;
-    scrn->ValidMode = ValidMode;
+    scrn->PreInit = drv_pre_init;
+    scrn->ScreenInit = drv_screen_init;
+    scrn->SwitchMode = drv_switch_mode;
+    scrn->AdjustFrame = drv_adjust_frame;
+    scrn->EnterVT = drv_enter_vt;
+    scrn->LeaveVT = drv_leave_vt;
+    scrn->FreeScreen = drv_free_screen;
+    scrn->ValidMode = drv_valid_mode;
 }
 
+
 /*
- * Static Xorg funtctions
+ * Internal function definitions
+ */
+
+static Bool drv_init_front_buffer_functions(ScrnInfoPtr pScrn);
+static Bool drv_close_screen(int scrnIndex, ScreenPtr pScreen);
+static Bool drv_save_hw_state(ScrnInfoPtr pScrn);
+static Bool drv_restore_hw_state(ScrnInfoPtr pScrn);
+
+
+/*
+ * Internal functions
  */
 
 static Bool
-GetRec(ScrnInfoPtr pScrn)
+drv_get_rec(ScrnInfoPtr pScrn)
 {
     if (pScrn->driverPrivate)
 	return TRUE;
@@ -121,7 +138,7 @@ GetRec(ScrnInfoPtr pScrn)
 }
 
 static void
-FreeRec(ScrnInfoPtr pScrn)
+drv_free_rec(ScrnInfoPtr pScrn)
 {
     if (!pScrn)
 	return;
@@ -135,85 +152,129 @@ FreeRec(ScrnInfoPtr pScrn)
 }
 
 static void
-ProbeDDC(ScrnInfoPtr pScrn, int index)
+drv_probe_ddc(ScrnInfoPtr pScrn, int index)
 {
     ConfiguredMonitor = NULL;
 }
 
 static Bool
-CreateFrontBuffer(ScrnInfoPtr pScrn)
+drv_crtc_resize(ScrnInfoPtr pScrn, int width, int height)
 {
     modesettingPtr ms = modesettingPTR(pScrn);
+    PixmapPtr rootPixmap;
     ScreenPtr pScreen = pScrn->pScreen;
-    PixmapPtr rootPixmap = pScreen->GetScreenPixmap(pScreen);
-    unsigned handle, stride;
 
-    ms->noEvict = TRUE;
-    xorg_exa_set_displayed_usage(rootPixmap);
-    pScreen->ModifyPixmapHeader(rootPixmap,
-				pScrn->virtualX, pScrn->virtualY,
-				pScrn->depth, pScrn->bitsPerPixel,
-				pScrn->displayWidth * pScrn->bitsPerPixel / 8,
-				NULL);
-    ms->noEvict = FALSE;
+    if (width == pScrn->virtualX && height == pScrn->virtualY)
+	return TRUE;
 
-    handle = xorg_exa_get_pixmap_handle(rootPixmap, &stride);
+    pScrn->virtualX = width;
+    pScrn->virtualY = height;
 
-    drmModeAddFB(ms->fd,
-		 pScrn->virtualX,
-		 pScrn->virtualY,
-		 pScrn->depth,
-		 pScrn->bitsPerPixel,
-		 stride,
-		 handle,
-		 &ms->fb_id);
+    /*
+     * Remove the old framebuffer & texture.
+     */
+    drmModeRmFB(ms->fd, ms->fb_id);
+    if (!ms->destroy_front_buffer(pScrn))
+	FatalError("failed to destroy front buffer\n");
 
-    pScrn->frameX0 = 0;
-    pScrn->frameY0 = 0;
-    AdjustFrame(pScrn->scrnIndex, pScrn->frameX0, pScrn->frameY0, 0);
+    rootPixmap = pScreen->GetScreenPixmap(pScreen);
+    if (!pScreen->ModifyPixmapHeader(rootPixmap, width, height, -1, -1, -1, NULL))
+	return FALSE;
+
+    /* HW dependent - FIXME */
+    pScrn->displayWidth = pScrn->virtualX;
+
+    /* now create new frontbuffer */
+    return ms->create_front_buffer(pScrn) && ms->bind_front_buffer(pScrn);
+}
+
+static const xf86CrtcConfigFuncsRec crtc_config_funcs = {
+    .resize = drv_crtc_resize
+};
+
+/* Open the DRM device for this PCI entity unless ms->fd is already open. */
+static Bool
+drv_init_drm(ScrnInfoPtr pScrn)
+{
+    modesettingPtr ms = modesettingPTR(pScrn);
+
+    /* deal with server regeneration */
+    if (ms->fd < 0) {
+	char BusID[64];
+
+	/* Bounded write into a stack buffer: nothing to leak or overflow. */
+	snprintf(BusID, sizeof(BusID), "PCI:%d:%d:%d",
+		 ((ms->PciInfo->domain << 8) | ms->PciInfo->bus),
+		 ms->PciInfo->dev, ms->PciInfo->func);
+
+	ms->fd = drmOpen(NULL, BusID);
+
+	if (ms->fd < 0)
+	    return FALSE;
+    }
 
     return TRUE;
 }
 
 static Bool
-crtc_resize(ScrnInfoPtr pScrn, int width, int height)
+drv_init_resource_management(ScrnInfoPtr pScrn)
 {
     modesettingPtr ms = modesettingPTR(pScrn);
-    //ScreenPtr pScreen = pScrn->pScreen;
-    //PixmapPtr rootPixmap = pScreen->GetScreenPixmap(pScreen);
-    //Bool fbAccessDisabled;
-    //CARD8 *fbstart;
 
-    if (width == pScrn->virtualX && height == pScrn->virtualY)
+    if (ms->screen || ms->kms)
 	return TRUE;
 
-    ErrorF("RESIZING TO %dx%d\n", width, height);
+    ms->api = drm_api_create();
+    if (ms->api) {
+	ms->screen = ms->api->create_screen(ms->api, ms->fd, NULL);
 
-    pScrn->virtualX = width;
-    pScrn->virtualY = height;
+	if (ms->screen)
+	    return TRUE;
 
-    /* HW dependent - FIXME */
-    pScrn->displayWidth = pScrn->virtualX;
+	if (ms->api->destroy)
+	    ms->api->destroy(ms->api);
 
-    drmModeRmFB(ms->fd, ms->fb_id);
+	ms->api = NULL;
+    }
 
-    /* now create new frontbuffer */
-    return CreateFrontBuffer(pScrn);
+#ifdef HAVE_LIBKMS
+    if (!kms_create(ms->fd, &ms->kms))
+	return TRUE;
+#endif
+
+    return FALSE;
 }
 
-static const xf86CrtcConfigFuncsRec crtc_config_funcs = {
-    crtc_resize
-};
+static Bool
+drv_close_resource_management(ScrnInfoPtr pScrn)
+{
+    modesettingPtr ms = modesettingPTR(pScrn);
+    /* Destroy whichever of screen/api/kms exists, NULL each pointer; always TRUE. */
+    if (ms->screen)
+	ms->screen->destroy(ms->screen);
+    ms->screen = NULL;
+
+    if (ms->api && ms->api->destroy)
+	ms->api->destroy(ms->api);
+    ms->api = NULL;
+
+#ifdef HAVE_LIBKMS
+    if (ms->kms)
+	kms_destroy(ms->kms);
+    ms->kms = NULL;
+#endif
+
+    return TRUE;
+}
 
 static Bool
-PreInit(ScrnInfoPtr pScrn, int flags)
+drv_pre_init(ScrnInfoPtr pScrn, int flags)
 {
     xf86CrtcConfigPtr xf86_config;
     modesettingPtr ms;
     rgb defaultWeight = { 0, 0, 0 };
     EntityInfoPtr pEnt;
     EntPtr msEnt = NULL;
-    char *BusID;
     int max_width, max_height;
 
     if (pScrn->numEntities != 1)
@@ -222,12 +283,12 @@ PreInit(ScrnInfoPtr pScrn, int flags)
     pEnt = xf86GetEntityInfo(pScrn->entityList[0]);
 
     if (flags & PROBE_DETECT) {
-	ProbeDDC(pScrn, pEnt->index);
+	drv_probe_ddc(pScrn, pEnt->index);
 	return TRUE;
     }
 
     /* Allocate driverPrivate */
-    if (!GetRec(pScrn))
+    if (!drv_get_rec(pScrn))
 	return FALSE;
 
     ms = modesettingPTR(pScrn);
@@ -262,16 +323,9 @@ PreInit(ScrnInfoPtr pScrn, int flags)
 	}
     }
 
-    BusID = xalloc(64);
-    sprintf(BusID, "PCI:%d:%d:%d",
-	    ((ms->PciInfo->domain << 8) | ms->PciInfo->bus),
-	    ms->PciInfo->dev, ms->PciInfo->func
-	);
-
-    ms->api = drm_api_create();
-    ms->fd = drmOpen(NULL, BusID);
-
-    if (ms->fd < 0)
+    ms->fd = -1;
+    ms->api = NULL;
+    if (!drv_init_drm(pScrn))
 	return FALSE;
 
     pScrn->monitor = pScrn->confScreen->monitor;
@@ -303,9 +357,9 @@ PreInit(ScrnInfoPtr pScrn, int flags)
 
     /* Process the options */
     xf86CollectOptions(pScrn, NULL);
-    if (!(ms->Options = xalloc(sizeof(Options))))
+    if (!(ms->Options = xalloc(sizeof(drv_options))))
 	return FALSE;
-    memcpy(ms->Options, Options, sizeof(Options));
+    memcpy(ms->Options, drv_options, sizeof(drv_options));
     xf86ProcessOptions(pScrn->scrnIndex, pScrn->options, ms->Options);
 
     /* Allocate an xf86CrtcConfig */
@@ -320,18 +374,18 @@ PreInit(ScrnInfoPtr pScrn, int flags)
 	ms->SWCursor = TRUE;
     }
 
-    SaveHWState(pScrn);
+    drv_save_hw_state(pScrn);
 
-    crtc_init(pScrn);
-    output_init(pScrn);
+    xorg_crtc_init(pScrn);
+    xorg_output_init(pScrn);
 
     if (!xf86InitialConfiguration(pScrn, TRUE)) {
 	xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "No valid modes.\n");
-	RestoreHWState(pScrn);
+	drv_restore_hw_state(pScrn);
 	return FALSE;
     }
 
-    RestoreHWState(pScrn);
+    drv_restore_hw_state(pScrn);
 
     /*
      * If the driver can do gamma correction, it should call xf86SetGamma() here.
@@ -355,21 +409,23 @@ PreInit(ScrnInfoPtr pScrn, int flags)
     xf86SetDpi(pScrn, 0, 0);
 
     /* Load the required sub modules */
-    if (!xf86LoadSubModule(pScrn, "fb")) {
+    if (!xf86LoadSubModule(pScrn, "fb"))
 	return FALSE;
-    }
 
-    xf86LoadSubModule(pScrn, "exa");
+    /* XXX: these aren't needed when we are using libkms */
+    if (!xf86LoadSubModule(pScrn, "exa"))
+	return FALSE;
 
 #ifdef DRI2
-    xf86LoadSubModule(pScrn, "dri2");
+    if (!xf86LoadSubModule(pScrn, "dri2"))
+	return FALSE;
 #endif
 
     return TRUE;
 }
 
 static Bool
-SaveHWState(ScrnInfoPtr pScrn)
+drv_save_hw_state(ScrnInfoPtr pScrn)
 {
     /*xf86CrtcConfigPtr xf86_config = XF86_CRTC_CONFIG_PTR(pScrn);*/
 
@@ -377,24 +433,45 @@ SaveHWState(ScrnInfoPtr pScrn)
 }
 
 static Bool
-RestoreHWState(ScrnInfoPtr pScrn)
+drv_restore_hw_state(ScrnInfoPtr pScrn)
 {
     /*xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(pScrn);*/
 
     return TRUE;
 }
 
-static void xorgBlockHandler(int i, pointer blockData, pointer pTimeout,
-                             pointer pReadmask)
+static void drv_block_handler(int i, pointer blockData, pointer pTimeout,
+                              pointer pReadmask)
 {
     ScreenPtr pScreen = screenInfo.screens[i];
     modesettingPtr ms = modesettingPTR(xf86Screens[pScreen->myNum]);
 
     pScreen->BlockHandler = ms->blockHandler;
     pScreen->BlockHandler(i, blockData, pTimeout, pReadmask);
-    pScreen->BlockHandler = xorgBlockHandler;
-
-    ms->ctx->flush(ms->ctx, PIPE_FLUSH_RENDER_CACHE, NULL);
+    pScreen->BlockHandler = drv_block_handler;
+
+    if (ms->ctx) {
+       int j;
+
+       /* Flush and store the resulting fence in the newest (last) slot. */
+       ms->ctx->flush(ms->ctx, PIPE_FLUSH_RENDER_CACHE, &ms->fence[XORG_NR_FENCES-1]);
+
+       /* Throttle: wait on the oldest fence still held, if any. */
+       if (ms->fence[0])
+          ms->ctx->screen->fence_finish(ms->ctx->screen, ms->fence[0], 0);
+
+       /* The amount of rendering generated by a block handler can be
+        * quite small.  Let us get a fair way ahead of hardware before
+        * throttling.  Shift each fence one slot towards the front; stop one
+        * short so fence[j+1] never goes past slot XORG_NR_FENCES-1. */
+       for (j = 0; j < XORG_NR_FENCES - 1; j++)
+          ms->screen->fence_reference(ms->screen, &ms->fence[j], ms->fence[j+1]);
+
+       /* Clear the last slot so the next flush can fill it again. */
+       ms->screen->fence_reference(ms->screen,
+                                   &ms->fence[XORG_NR_FENCES-1], NULL);
+    }
+        
 
 #ifdef DRM_MODE_FEATURE_DIRTYFB
     {
@@ -406,11 +483,12 @@ static void xorgBlockHandler(int i, pointer blockData, pointer pTimeout,
 	    BoxPtr rect = REGION_RECTS(dirty);
 	    int i;
 
+	    /* XXX no need for copy? */
 	    for (i = 0; i < num_cliprects; i++, rect++) {
-		clip[i].x = rect->x1;
-		clip[i].y = rect->y1;
-		clip[i].width = rect->x2 - rect->x1;
-		clip[i].height = rect->y2 - rect->y1;
+		clip[i].x1 = rect->x1;
+		clip[i].y1 = rect->y1;
+		clip[i].x2 = rect->x2;
+		clip[i].y2 = rect->y2;
 	    }
 
 	    /* TODO query connector property to see if this is needed */
@@ -423,43 +501,27 @@ static void xorgBlockHandler(int i, pointer blockData, pointer pTimeout,
 }
 
 static Bool
-CreateScreenResources(ScreenPtr pScreen)
+drv_create_screen_resources(ScreenPtr pScreen)
 {
     ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
     modesettingPtr ms = modesettingPTR(pScrn);
     PixmapPtr rootPixmap;
     Bool ret;
-    unsigned handle, stride;
 
     ms->noEvict = TRUE;
 
     pScreen->CreateScreenResources = ms->createScreenResources;
     ret = pScreen->CreateScreenResources(pScreen);
-    pScreen->CreateScreenResources = CreateScreenResources;
+    pScreen->CreateScreenResources = drv_create_screen_resources;
 
-    rootPixmap = pScreen->GetScreenPixmap(pScreen);
-
-    xorg_exa_set_displayed_usage(rootPixmap);
-    xorg_exa_set_shared_usage(rootPixmap);
-    if (!pScreen->ModifyPixmapHeader(rootPixmap, -1, -1, -1, -1, -1, NULL))
-	FatalError("Couldn't adjust screen pixmap\n");
+    ms->bind_front_buffer(pScrn);
 
     ms->noEvict = FALSE;
 
-    handle = xorg_exa_get_pixmap_handle(rootPixmap, &stride);
-
-    drmModeAddFB(ms->fd,
-		 pScrn->virtualX,
-		 pScrn->virtualY,
-		 pScrn->depth,
-		 pScrn->bitsPerPixel,
-		 stride,
-		 handle,
-                 &ms->fb_id);
-
-    AdjustFrame(pScrn->scrnIndex, pScrn->frameX0, pScrn->frameY0, 0);
+    drv_adjust_frame(pScrn->scrnIndex, pScrn->frameX0, pScrn->frameY0, 0);
 
 #ifdef DRM_MODE_FEATURE_DIRTYFB
+    rootPixmap = pScreen->GetScreenPixmap(pScreen);
     ms->damage = DamageCreate(NULL, NULL, DamageReportNone, TRUE,
                               pScreen, rootPixmap);
 
@@ -472,41 +534,33 @@ CreateScreenResources(ScreenPtr pScreen)
                   "Failed to create screen damage record\n");
        return FALSE;
     }
+#else
+    (void)rootPixmap;
 #endif
 
     return ret;
 }
 
 static Bool
-ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
+drv_screen_init(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
 {
     ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
     modesettingPtr ms = modesettingPTR(pScrn);
     VisualPtr visual;
 
-    /* deal with server regeneration */
-    if (ms->fd < 0) {
-	char *BusID;
-
-	BusID = xalloc(64);
-	sprintf(BusID, "PCI:%d:%d:%d",
-		((ms->PciInfo->domain << 8) | ms->PciInfo->bus),
-		ms->PciInfo->dev, ms->PciInfo->func
-	    );
-
-	ms->fd = drmOpen(NULL, BusID);
-
-	if (ms->fd < 0)
-	    return FALSE;
+    if (!drv_init_drm(pScrn)) {
+	FatalError("Could not init DRM");
+	return FALSE;
     }
 
-    if (!ms->screen) {
-	ms->screen = ms->api->create_screen(ms->api, ms->fd, NULL);
+    if (!drv_init_resource_management(pScrn)) {
+	FatalError("Could not init resource management (!pipe_screen && !libkms)");
+	return FALSE;
+    }
 
-	if (!ms->screen) {
-	    FatalError("Could not init pipe_screen\n");
-	    return FALSE;
-	}
+    if (!drv_init_front_buffer_functions(pScrn)) {
+	FatalError("Could not init front buffer manager");
+	return FALSE;
     }
 
     pScrn->pScreen = pScreen;
@@ -551,13 +605,22 @@ ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
     fbPictureInit(pScreen, NULL, 0);
 
     ms->blockHandler = pScreen->BlockHandler;
-    pScreen->BlockHandler = xorgBlockHandler;
+    pScreen->BlockHandler = drv_block_handler;
     ms->createScreenResources = pScreen->CreateScreenResources;
-    pScreen->CreateScreenResources = CreateScreenResources;
+    pScreen->CreateScreenResources = drv_create_screen_resources;
 
     xf86SetBlackWhitePixels(pScreen);
 
-    ms->exa = xorg_exa_init(pScrn);
+    if (ms->screen) {
+	ms->exa = xorg_exa_init(pScrn, xf86ReturnOptValBool(ms->Options,
+							    OPTION_2D_ACCEL, TRUE));
+	ms->debug_fallback = debug_get_bool_option("XORG_DEBUG_FALLBACK", TRUE);
+
+	xorg_xv_init(pScreen);
+#ifdef DRI2
+	xorg_dri2_init(pScreen);
+#endif
+    }
 
     miInitializeBackingStore(pScreen);
     xf86SetBackingStore(pScreen);
@@ -576,7 +639,7 @@ ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
 
     pScreen->SaveScreen = xf86SaveScreen;
     ms->CloseScreen = pScreen->CloseScreen;
-    pScreen->CloseScreen = CloseScreen;
+    pScreen->CloseScreen = drv_close_screen;
 
     if (!xf86CrtcScreenInit(pScreen))
 	return FALSE;
@@ -589,17 +652,14 @@ ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
     if (serverGeneration == 1)
 	xf86ShowUnusedOptions(pScrn->scrnIndex, pScrn->options);
 
-#if 1
-#ifdef DRI2
-    driScreenInit(pScreen);
-#endif
-#endif
+    if (ms->winsys_screen_init)
+	ms->winsys_screen_init(pScrn);
 
-    return EnterVT(scrnIndex, 1);
+    return drv_enter_vt(scrnIndex, 1);
 }
 
 static void
-AdjustFrame(int scrnIndex, int x, int y, int flags)
+drv_adjust_frame(int scrnIndex, int x, int y, int flags)
 {
     ScrnInfoPtr pScrn = xf86Screens[scrnIndex];
     xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(pScrn);
@@ -607,21 +667,21 @@ AdjustFrame(int scrnIndex, int x, int y, int flags)
     xf86CrtcPtr crtc = output->crtc;
 
     if (crtc && crtc->enabled) {
-	crtc->funcs->mode_set(crtc, pScrn->currentMode, pScrn->currentMode, x,
-			      y);
+	crtc->funcs->set_mode_major(crtc, pScrn->currentMode,
+				    RR_Rotate_0, x, y);
 	crtc->x = output->initial_x + x;
 	crtc->y = output->initial_y + y;
     }
 }
 
 static void
-FreeScreen(int scrnIndex, int flags)
+drv_free_screen(int scrnIndex, int flags)
 {
-    FreeRec(xf86Screens[scrnIndex]);
+    drv_free_rec(xf86Screens[scrnIndex]);
 }
 
 static void
-LeaveVT(int scrnIndex, int flags)
+drv_leave_vt(int scrnIndex, int flags)
 {
     ScrnInfoPtr pScrn = xf86Screens[scrnIndex];
     modesettingPtr ms = modesettingPTR(pScrn);
@@ -631,7 +691,7 @@ LeaveVT(int scrnIndex, int flags)
     for (o = 0; o < config->num_crtc; o++) {
 	xf86CrtcPtr crtc = config->crtc[o];
 
-	crtc_cursor_destroy(crtc);
+	xorg_crtc_cursor_destroy(crtc);
 
 	if (crtc->rotatedPixmap || crtc->rotatedData) {
 	    crtc->funcs->shadow_destroy(crtc, crtc->rotatedPixmap,
@@ -643,7 +703,7 @@ LeaveVT(int scrnIndex, int flags)
 
     drmModeRmFB(ms->fd, ms->fb_id);
 
-    RestoreHWState(pScrn);
+    drv_restore_hw_state(pScrn);
 
     if (drmDropMaster(ms->fd))
 	xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
@@ -656,7 +716,7 @@ LeaveVT(int scrnIndex, int flags)
  * This gets called when gaining control of the VT, and from ScreenInit().
  */
 static Bool
-EnterVT(int scrnIndex, int flags)
+drv_enter_vt(int scrnIndex, int flags)
 {
     ScrnInfoPtr pScrn = xf86Screens[scrnIndex];
     modesettingPtr ms = modesettingPTR(pScrn);
@@ -678,11 +738,14 @@ EnterVT(int scrnIndex, int flags)
      */
     if (ms->SaveGeneration != serverGeneration) {
 	ms->SaveGeneration = serverGeneration;
-	SaveHWState(pScrn);
+	drv_save_hw_state(pScrn);
     }
 
-    if (!flags)			       /* signals startup as we'll do this in CreateScreenResources */
-	CreateFrontBuffer(pScrn);
+    if (!ms->create_front_buffer(pScrn))
+	return FALSE;
+
+    if (!flags && !ms->bind_front_buffer(pScrn))
+	return FALSE;
 
     if (!xf86SetDesiredModes(pScrn))
 	return FALSE;
@@ -691,7 +754,7 @@ EnterVT(int scrnIndex, int flags)
 }
 
 static Bool
-SwitchMode(int scrnIndex, DisplayModePtr mode, int flags)
+drv_switch_mode(int scrnIndex, DisplayModePtr mode, int flags)
 {
     ScrnInfoPtr pScrn = xf86Screens[scrnIndex];
 
@@ -699,16 +762,21 @@ SwitchMode(int scrnIndex, DisplayModePtr mode, int flags)
 }
 
 static Bool
-CloseScreen(int scrnIndex, ScreenPtr pScreen)
+drv_close_screen(int scrnIndex, ScreenPtr pScreen)
 {
     ScrnInfoPtr pScrn = xf86Screens[scrnIndex];
     modesettingPtr ms = modesettingPTR(pScrn);
 
     if (pScrn->vtSema) {
-	LeaveVT(scrnIndex, 0);
+	drv_leave_vt(scrnIndex, 0);
     }
+
+    if (ms->winsys_screen_close)
+	ms->winsys_screen_close(pScrn);
+
 #ifdef DRI2
-    driCloseScreen(pScreen);
+    if (ms->screen)
+	xorg_dri2_close(pScreen);
 #endif
 
     pScreen->BlockHandler = ms->blockHandler;
@@ -722,11 +790,15 @@ CloseScreen(int scrnIndex, ScreenPtr pScreen)
     }
 #endif
 
+    drmModeRmFB(ms->fd, ms->fb_id);
+    ms->destroy_front_buffer(pScrn);
+
     if (ms->exa)
 	xorg_exa_close(pScrn);
+    ms->exa = NULL;
+
+    drv_close_resource_management(pScrn);
 
-	ms->api->destroy(ms->api);
-	ms->api = NULL;
     drmClose(ms->fd);
     ms->fd = -1;
 
@@ -736,9 +808,190 @@ CloseScreen(int scrnIndex, ScreenPtr pScreen)
 }
 
 static ModeStatus
-ValidMode(int scrnIndex, DisplayModePtr mode, Bool verbose, int flags)
+drv_valid_mode(int scrnIndex, DisplayModePtr mode, Bool verbose, int flags)
 {
     return MODE_OK;
 }
 
+
+/*
+ * Front buffer backing store functions.
+ */
+
+/* Clear ms->root_texture via pipe_texture_reference().  Always TRUE. */
+static Bool drv_destroy_front_buffer_ga3d(ScrnInfoPtr pScrn)
+{
+    modesettingPtr priv = modesettingPTR(pScrn);
+    pipe_texture_reference(&priv->root_texture, NULL);
+    return TRUE;
+}
+
+/*
+ * Create the root texture, register it with the kernel as framebuffer
+ * ms->fb_id and keep it in ms->root_texture.  Returns FALSE, releasing the
+ * new texture, if any step fails.
+ */
+static Bool
+drv_create_front_buffer_ga3d(ScrnInfoPtr pScrn)
+{
+    modesettingPtr ms = modesettingPTR(pScrn);
+    unsigned handle, stride;
+    struct pipe_texture *tex;
+
+    ms->noEvict = TRUE;
+
+    tex = xorg_exa_create_root_texture(pScrn, pScrn->virtualX, pScrn->virtualY,
+				       pScrn->depth, pScrn->bitsPerPixel);
+    if (!tex)
+	return FALSE;
+
+    if (!ms->api->local_handle_from_texture(ms->api, ms->screen, tex,
+					    &stride, &handle))
+	goto err_destroy;
+
+    /* drmModeAddFB() returns non-zero on failure; fb_id is then invalid. */
+    if (drmModeAddFB(ms->fd, pScrn->virtualX, pScrn->virtualY, pScrn->depth,
+		     pScrn->bitsPerPixel, stride, handle, &ms->fb_id))
+	goto err_destroy;
+
+    pScrn->frameX0 = 0;
+    pScrn->frameY0 = 0;
+    drv_adjust_frame(pScrn->scrnIndex, pScrn->frameX0, pScrn->frameY0, 0);
+
+    pipe_texture_reference(&ms->root_texture, tex);
+    pipe_texture_reference(&tex, NULL);
+    return TRUE;
+err_destroy:
+    pipe_texture_reference(&tex, NULL);
+    return FALSE;
+}
+
+/* Attach ms->root_texture to the screen pixmap and check that the pixmap
+ * still reports that texture; a mismatch is fatal.  Returns TRUE. */
+static Bool
+drv_bind_front_buffer_ga3d(ScrnInfoPtr pScrn)
+{
+    modesettingPtr priv = modesettingPTR(pScrn);
+    ScreenPtr screen = pScrn->pScreen;
+    PixmapPtr root = screen->GetScreenPixmap(screen);
+    struct pipe_texture *bound;
+
+    xorg_exa_set_displayed_usage(root);
+    xorg_exa_set_shared_usage(root);
+    xorg_exa_set_texture(root, priv->root_texture);
+    if (!screen->ModifyPixmapHeader(root, -1, -1, -1, -1, -1, NULL))
+	FatalError("Couldn't adjust screen pixmap\n");
+    bound = xorg_exa_get_texture(root);
+    if (bound != priv->root_texture)
+	FatalError("Created new root texture\n");
+    pipe_texture_reference(&bound, NULL);
+    return TRUE;
+}
+
+#ifdef HAVE_LIBKMS
+/* Unmap and destroy the KMS root bo, if any.  Always returns TRUE. */
+static Bool
+drv_destroy_front_buffer_kms(ScrnInfoPtr pScrn)
+{
+    modesettingPtr priv = modesettingPTR(pScrn);
+
+    if (priv->root_bo != NULL) {
+	kms_bo_unmap(priv->root_bo);
+	kms_bo_destroy(priv->root_bo);
+	priv->root_bo = NULL;
+    }
+    return TRUE;
+}
+
+/*
+ * Allocate a virtualX x virtualY scanout buffer object through libkms,
+ * register it as framebuffer ms->fb_id and remember it in ms->root_bo.
+ * Returns FALSE, destroying the new bo, if a step after creation fails.
+ */
+static Bool
+drv_create_front_buffer_kms(ScrnInfoPtr pScrn)
+{
+    modesettingPtr ms = modesettingPTR(pScrn);
+    unsigned handle, stride;
+    struct kms_bo *bo;
+    unsigned attr[8];
+
+    /* Key/value pairs followed by a 0 entry. */
+    attr[0] = KMS_BO_TYPE;
+    attr[1] = KMS_BO_TYPE_SCANOUT;
+    attr[2] = KMS_WIDTH;
+    attr[3] = pScrn->virtualX;
+    attr[4] = KMS_HEIGHT;
+    attr[5] = pScrn->virtualY;
+    attr[6] = 0;
+
+    if (kms_bo_create(ms->kms, attr, &bo))
+	return FALSE;
+
+    if (kms_bo_get_prop(bo, KMS_PITCH, &stride) ||
+	kms_bo_get_prop(bo, KMS_HANDLE, &handle))
+	goto err_destroy;
+
+    /* drmModeAddFB() returns non-zero on failure; fb_id is then invalid. */
+    if (drmModeAddFB(ms->fd, pScrn->virtualX, pScrn->virtualY, pScrn->depth,
+		     pScrn->bitsPerPixel, stride, handle, &ms->fb_id))
+	goto err_destroy;
+
+    pScrn->frameX0 = 0;
+    pScrn->frameY0 = 0;
+    drv_adjust_frame(pScrn->scrnIndex, pScrn->frameX0, pScrn->frameY0, 0);
+    ms->root_bo = bo;
+
+    return TRUE;
+
+err_destroy:
+    kms_bo_destroy(bo);
+    return FALSE;
+}
+
+/* Map the KMS root bo and point the screen pixmap at the mapping.
+ * Returns FALSE if the pitch query or the mapping fails. */
+static Bool
+drv_bind_front_buffer_kms(ScrnInfoPtr pScrn)
+{
+    modesettingPtr priv = modesettingPTR(pScrn);
+    ScreenPtr screen = pScrn->pScreen;
+    PixmapPtr root = screen->GetScreenPixmap(screen);
+    unsigned pitch;
+    void *map;
+
+    if (kms_bo_get_prop(priv->root_bo, KMS_PITCH, &pitch) != 0)
+	return FALSE;
+
+    if (kms_bo_map(priv->root_bo, &map) != 0)
+	return FALSE;
+
+    /* The result of ModifyPixmapHeader() is not checked here. */
+    screen->ModifyPixmapHeader(root, screen->width, screen->height,
+			       screen->rootDepth, pScrn->bitsPerPixel,
+			       pitch, map);
+
+    return TRUE;
+}
+#endif /* HAVE_LIBKMS */
+
+/* Pick the gallium or libkms front buffer hooks; FALSE if neither exists. */
+static Bool drv_init_front_buffer_functions(ScrnInfoPtr pScrn)
+{
+    modesettingPtr priv = modesettingPTR(pScrn);
+    if (priv->screen) {
+	priv->create_front_buffer = drv_create_front_buffer_ga3d;
+	priv->bind_front_buffer = drv_bind_front_buffer_ga3d;
+	priv->destroy_front_buffer = drv_destroy_front_buffer_ga3d;
+#ifdef HAVE_LIBKMS
+    } else if (priv->kms) {
+	priv->create_front_buffer = drv_create_front_buffer_kms;
+	priv->bind_front_buffer = drv_bind_front_buffer_kms;
+	priv->destroy_front_buffer = drv_destroy_front_buffer_kms;
+#endif
+    } else
+	return FALSE;
+    return TRUE;
+}
+
 /* vim: set sw=4 ts=8 sts=4: */
diff --git a/src/gallium/state_trackers/xorg/xorg_exa.c b/src/gallium/state_trackers/xorg/xorg_exa.c
index a17a71f23a8..32485add94e 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa.c
+++ b/src/gallium/state_trackers/xorg/xorg_exa.c
@@ -43,35 +43,105 @@
 #include "pipe/p_state.h"
 #include "pipe/p_inlines.h"
 
-#include "cso_cache/cso_context.h"
-
 #include "util/u_rect.h"
+#include "util/u_math.h"
+#include "util/u_debug.h"
+
+#define DEBUG_PRINT 0
+#define ROUND_UP_TEXTURES 1
 
 /*
  * Helper functions
  */
+struct render_format_str {   /* one PICT_* format code and its printable name */
+   int format;               /* compared against the argument of render_format_name() */
+   const char *name;         /* string returned by render_format_name() on a match */
+};
+static const struct render_format_str formats_info[] = /* PICT_* code -> name, scanned by render_format_name() */
+{
+   {PICT_a8r8g8b8, "PICT_a8r8g8b8"},
+   {PICT_x8r8g8b8, "PICT_x8r8g8b8"},
+   {PICT_a8b8g8r8, "PICT_a8b8g8r8"},
+   {PICT_x8b8g8r8, "PICT_x8b8g8r8"},
+#ifdef PICT_TYPE_BGRA /* the six entries below are listed only when this macro is defined */
+   {PICT_b8g8r8a8, "PICT_b8g8r8a8"},
+   {PICT_b8g8r8x8, "PICT_b8g8r8x8"},
+   {PICT_a2r10g10b10, "PICT_a2r10g10b10"},
+   {PICT_x2r10g10b10, "PICT_x2r10g10b10"},
+   {PICT_a2b10g10r10, "PICT_a2b10g10r10"},
+   {PICT_x2b10g10r10, "PICT_x2b10g10r10"},
+#endif /* PICT_TYPE_BGRA */
+   {PICT_r8g8b8, "PICT_r8g8b8"},
+   {PICT_b8g8r8, "PICT_b8g8r8"},
+   {PICT_r5g6b5, "PICT_r5g6b5"},
+   {PICT_b5g6r5, "PICT_b5g6r5"},
+   {PICT_a1r5g5b5, "PICT_a1r5g5b5"},
+   {PICT_x1r5g5b5, "PICT_x1r5g5b5"},
+   {PICT_a1b5g5r5, "PICT_a1b5g5r5"},
+   {PICT_x1b5g5r5, "PICT_x1b5g5r5"},
+   {PICT_a4r4g4b4, "PICT_a4r4g4b4"},
+   {PICT_x4r4g4b4, "PICT_x4r4g4b4"},
+   {PICT_a4b4g4r4, "PICT_a4b4g4r4"},
+   {PICT_x4b4g4r4, "PICT_x4b4g4r4"},
+   {PICT_a8, "PICT_a8"},
+   {PICT_r3g3b2, "PICT_r3g3b2"},
+   {PICT_b2g3r3, "PICT_b2g3r3"},
+   {PICT_a2r2g2b2, "PICT_a2r2g2b2"},
+   {PICT_a2b2g2r2, "PICT_a2b2g2r2"},
+   {PICT_c8, "PICT_c8"},
+   {PICT_g8, "PICT_g8"},
+   {PICT_x4a4, "PICT_x4a4"},
+   {PICT_x4c4, "PICT_x4c4"},
+   {PICT_x4g4, "PICT_x4g4"},
+   {PICT_a4, "PICT_a4"},
+   {PICT_r1g2b1, "PICT_r1g2b1"},
+   {PICT_b1g2r1, "PICT_b1g2r1"},
+   {PICT_a1r1g1b1, "PICT_a1r1g1b1"},
+   {PICT_a1b1g1r1, "PICT_a1b1g1r1"},
+   {PICT_c4, "PICT_c4"},
+   {PICT_g4, "PICT_g4"},
+   {PICT_a1, "PICT_a1"},
+   {PICT_g1, "PICT_g1"}
+};
+/* Return the formats_info name for a PICT_* code, or NULL if not listed. */
+static const char *render_format_name(int format)
+{
+   const int count = sizeof(formats_info)/sizeof(formats_info[0]);
+   int idx;
+   for (idx = 0; idx < count && formats_info[idx].format != format; ++idx)
+      ;
+   return idx < count ? formats_info[idx].name : NULL;
+}
 
 static void
-exa_get_pipe_format(int depth, enum pipe_format *format, int *bbp)
+exa_get_pipe_format(int depth, enum pipe_format *format, int *bbp, int *picture_format)
 {
     switch (depth) {
     case 32:
 	*format = PIPE_FORMAT_A8R8G8B8_UNORM;
+	*picture_format = PICT_a8r8g8b8;
 	assert(*bbp == 32);
 	break;
     case 24:
 	*format = PIPE_FORMAT_X8R8G8B8_UNORM;
+	*picture_format = PICT_x8r8g8b8;
 	assert(*bbp == 32);
 	break;
     case 16:
 	*format = PIPE_FORMAT_R5G6B5_UNORM;
+	*picture_format = PICT_r5g6b5;
 	assert(*bbp == 16);
 	break;
     case 15:
 	*format = PIPE_FORMAT_A1R5G5B5_UNORM;
+	*picture_format = PICT_x1r5g5b5;
 	assert(*bbp == 16);
 	break;
     case 8:
+	*format = PIPE_FORMAT_L8_UNORM;
+	*picture_format = PICT_a8;
+	assert(*bbp == 8);
+	break;
     case 4:
     case 1:
 	*format = PIPE_FORMAT_A8R8G8B8_UNORM; /* bad bad bad */
@@ -82,6 +152,7 @@ exa_get_pipe_format(int depth, enum pipe_format *format, int *bbp)
     }
 }
 
+
 /*
  * Static exported EXA functions
  */
@@ -89,14 +160,20 @@ exa_get_pipe_format(int depth, enum pipe_format *format, int *bbp)
 static void
 ExaWaitMarker(ScreenPtr pScreen, int marker)
 {
+   /* Nothing to do, handled in the PrepareAccess hook */
 }
 
 static int
 ExaMarkSync(ScreenPtr pScreen)
 {
-    return 1;
+   return 1;
 }
 
+
+/***********************************************************************
+ * Screen upload/download
+ */
+
 static Bool
 ExaDownloadFromScreen(PixmapPtr pPix, int x,  int y, int w,  int h, char *dst,
 		      int dst_pitch)
@@ -111,15 +188,20 @@ ExaDownloadFromScreen(PixmapPtr pPix, int x,  int y, int w,  int h, char *dst,
     if (!priv || !priv->tex)
 	return FALSE;
 
-    if (exa->ctx->is_texture_referenced(exa->ctx, priv->tex, 0, 0) &
+    if (exa->pipe->is_texture_referenced(exa->pipe, priv->tex, 0, 0) &
 	PIPE_REFERENCED_FOR_WRITE)
-	exa->ctx->flush(exa->ctx, 0, NULL);
+	exa->pipe->flush(exa->pipe, 0, NULL);
 
     transfer = exa->scrn->get_tex_transfer(exa->scrn, priv->tex, 0, 0, 0,
 					   PIPE_TRANSFER_READ, x, y, w, h);
     if (!transfer)
 	return FALSE;
 
+#if DEBUG_PRINT
+    debug_printf("------ ExaDownloadFromScreen(%d, %d, %d, %d, %d)\n",
+                 x, y, w, h, dst_pitch);
+#endif
+
     util_copy_rect((unsigned char*)dst, &priv->tex->block, dst_pitch, 0, 0,
 		   w, h, exa->scrn->transfer_map(exa->scrn, transfer),
 		   transfer->stride, 0, 0);
@@ -144,11 +226,21 @@ ExaUploadToScreen(PixmapPtr pPix, int x, int y, int w, int h, char *src,
     if (!priv || !priv->tex)
 	return FALSE;
 
+    /* make sure that any pending operations are flushed to hardware */
+    if (exa->pipe->is_texture_referenced(exa->pipe, priv->tex, 0, 0) &
+	(PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE))
+	xorg_exa_flush(exa, 0, NULL);
+
     transfer = exa->scrn->get_tex_transfer(exa->scrn, priv->tex, 0, 0, 0,
 					   PIPE_TRANSFER_WRITE, x, y, w, h);
     if (!transfer)
 	return FALSE;
 
+#if DEBUG_PRINT
+    debug_printf("++++++ ExaUploadToScreen(%d, %d, %d, %d, %d)\n",
+                 x, y, w, h, src_pitch);
+#endif
+
     util_copy_rect(exa->scrn->transfer_map(exa->scrn, transfer),
 		   &priv->tex->block, transfer->stride, 0, 0, w, h,
 		   (unsigned char*)src, src_pitch, 0, 0);
@@ -176,22 +268,38 @@ ExaPrepareAccess(PixmapPtr pPix, int index)
     if (!priv->tex)
 	return FALSE;
 
-    if (priv->map_count++ == 0)
+    if (priv->map_count == 0)
     {
-	if (exa->ctx->is_texture_referenced(exa->ctx, priv->tex, 0, 0) &
+	if (exa->pipe->is_texture_referenced(exa->pipe, priv->tex, 0, 0) &
 	    PIPE_REFERENCED_FOR_WRITE)
-	    exa->ctx->flush(exa->ctx, 0, NULL);
+	    exa->pipe->flush(exa->pipe, 0, NULL);
+
+        assert(pPix->drawable.width <= priv->tex->width[0]);
+        assert(pPix->drawable.height <= priv->tex->height[0]);
 
 	priv->map_transfer =
 	    exa->scrn->get_tex_transfer(exa->scrn, priv->tex, 0, 0, 0,
+#ifdef EXA_MIXED_PIXMAPS
+					PIPE_TRANSFER_MAP_DIRECTLY |
+#endif
 					PIPE_TRANSFER_READ_WRITE,
-					0, 0, priv->tex->width[0], priv->tex->height[0]);
+					0, 0, 
+                                        pPix->drawable.width,
+                                        pPix->drawable.height );
+	if (!priv->map_transfer)
+#ifdef EXA_MIXED_PIXMAPS
+	    return FALSE;
+#else
+	    FatalError("failed to create transfer\n");
+#endif
 
 	pPix->devPrivate.ptr =
 	    exa->scrn->transfer_map(exa->scrn, priv->map_transfer);
 	pPix->devKind = priv->map_transfer->stride;
     }
 
+    priv->map_count++;
+
     return TRUE;
 }
 
@@ -220,27 +328,9 @@ ExaFinishAccess(PixmapPtr pPix, int index)
     }
 }
 
-static void
-ExaDone(PixmapPtr pPixmap)
-{
-    ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
-    modesettingPtr ms = modesettingPTR(pScrn);
-    struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pPixmap);
-    struct exa_context *exa = ms->exa;
-
-    if (!priv)
-	return;
-
-    if (priv->src_surf)
-	exa->scrn->tex_surface_destroy(priv->src_surf);
-    priv->src_surf = NULL;
-}
-
-static void
-ExaDoneComposite(PixmapPtr pPixmap)
-{
-
-}
+/***********************************************************************
+ * Solid Fills
+ */
 
 static Bool
 ExaPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planeMask, Pixel fg)
@@ -250,24 +340,28 @@ ExaPrepareSolid(PixmapPtr pPixmap, int alu, Pixel planeMask, Pixel fg)
     struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pPixmap);
     struct exa_context *exa = ms->exa;
 
-    if (pPixmap->drawable.depth < 15)
-	return FALSE;
-
-    if (!EXA_PM_IS_SOLID(&pPixmap->drawable, planeMask))
-	return FALSE;
+#if DEBUG_PRINT
+    debug_printf("ExaPrepareSolid(0x%x)\n", fg);
+#endif
+    if (!exa->pipe)
+	XORG_FALLBACK("accle not enabled");
 
     if (!priv || !priv->tex)
-	return FALSE;
+	XORG_FALLBACK("%s", !priv ? "!priv" : "!priv->tex");
 
-    if (alu != GXcopy)
-	return FALSE;
+    if (!EXA_PM_IS_SOLID(&pPixmap->drawable, planeMask))
+	XORG_FALLBACK("planeMask is not solid");
 
-    if (!exa->ctx || !exa->ctx->surface_fill)
-	return FALSE;
+    if (alu != GXcopy)
+	XORG_FALLBACK("not GXcopy");
 
-    priv->color = fg;
+    if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                        priv->tex->target,
+                                        PIPE_TEXTURE_USAGE_RENDER_TARGET, 0)) {
+	XORG_FALLBACK("format %s", pf_name(priv->tex->format));
+    }
 
-    return TRUE;
+    return exa->accel && xorg_solid_bind_state(exa, priv, fg);
 }
 
 static void
@@ -277,14 +371,39 @@ ExaSolid(PixmapPtr pPixmap, int x0, int y0, int x1, int y1)
     modesettingPtr ms = modesettingPTR(pScrn);
     struct exa_context *exa = ms->exa;
     struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pPixmap);
-    struct pipe_surface *surf = exa_gpu_surface(exa, priv);
 
-    exa->ctx->surface_fill(exa->ctx, surf, x0, y0, x1 - x0, y1 - y0,
-			   priv->color);
+#if DEBUG_PRINT
+    debug_printf("\tExaSolid(%d, %d, %d, %d)\n", x0, y0, x1, y1);
+#endif
 
-    exa->scrn->tex_surface_destroy(surf);
+    if (x0 == 0 && y0 == 0 &&
+        x1 == pPixmap->drawable.width && y1 == pPixmap->drawable.height) {
+       exa->pipe->clear(exa->pipe, PIPE_CLEAR_COLOR, exa->solid_color, 0.0, 0);
+       return;
+    }
+
+    xorg_solid(exa, priv, x0, y0, x1, y1) ;
 }
 
+
+/*
+ * Finish a solid fill: call xorg_composite_done() on the screen's EXA
+ * context, unless the pixmap has no driver private.
+ */
+static void
+ExaDoneSolid(PixmapPtr pPixmap)
+{
+    ScrnInfoPtr scrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
+    modesettingPtr ms = modesettingPTR(scrn);
+
+    if (exaGetPixmapDriverPrivate(pPixmap) != NULL)
+	xorg_composite_done(ms->exa);
+}
+
+/***********************************************************************
+ * Copy Blits
+ */
+
 static Bool
 ExaPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap, int xdir,
 	       int ydir, int alu, Pixel planeMask)
@@ -295,44 +414,202 @@ ExaPrepareCopy(PixmapPtr pSrcPixmap, PixmapPtr pDstPixmap, int xdir,
     struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pDstPixmap);
     struct exa_pixmap_priv *src_priv = exaGetPixmapDriverPrivate(pSrcPixmap);
 
-    if (alu != GXcopy)
-	return FALSE;
+#if DEBUG_PRINT
+    debug_printf("ExaPrepareCopy\n");
+#endif
+    if (!exa->pipe || !exa->accel) /* bail before any surface/texture refs are taken */
+	XORG_FALLBACK("accel not enabled");
 
-    if (pSrcPixmap->drawable.depth < 15 || pDstPixmap->drawable.depth < 15)
-	return FALSE;
+    if (!priv || !priv->tex)
+	XORG_FALLBACK("pDst %s", !priv ? "!priv" : "!priv->tex");
+
+    if (!src_priv || !src_priv->tex)
+	XORG_FALLBACK("pSrc %s", !src_priv ? "!priv" : "!priv->tex");
 
     if (!EXA_PM_IS_SOLID(&pSrcPixmap->drawable, planeMask))
-	return FALSE;
+	XORG_FALLBACK("planeMask is not solid");
 
-    if (!priv || !src_priv)
-	return FALSE;
+    if (alu != GXcopy)
+	XORG_FALLBACK("alu not GXcopy");
+
+    if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                        priv->tex->target,
+                                        PIPE_TEXTURE_USAGE_RENDER_TARGET, 0))
+	XORG_FALLBACK("pDst format %s", pf_name(priv->tex->format));
+
+    if (!exa->scrn->is_format_supported(exa->scrn, src_priv->tex->format,
+                                        src_priv->tex->target,
+                                        PIPE_TEXTURE_USAGE_SAMPLER, 0))
+	XORG_FALLBACK("pSrc format %s", pf_name(src_priv->tex->format));
+
+    exa->copy.src = src_priv;
+    exa->copy.dst = priv;
+
+    /* For same-surface copies, the pipe->surface_copy path is clearly
+     * superior, providing it is implemented.  In other cases it's not
+     * clear what the better path would be, and eventually we'd
+     * probably want to gather timings and choose dynamically.
+     */
+    if (exa->pipe->surface_copy &&
+        exa->copy.src == exa->copy.dst) {
+
+       exa->copy.use_surface_copy = TRUE;
+       
+       exa->copy.src_surface =
+          exa->scrn->get_tex_surface( exa->scrn,
+                                      exa->copy.src->tex,
+                                      0, 0, 0,
+                                      PIPE_BUFFER_USAGE_GPU_READ);
+
+       exa->copy.dst_surface =
+          exa->scrn->get_tex_surface( exa->scrn, 
+                                      exa->copy.dst->tex,
+                                      0, 0, 0, 
+                                      PIPE_BUFFER_USAGE_GPU_WRITE );
+    }
+    else {
+       exa->copy.use_surface_copy = FALSE;
+
+       if (exa->copy.dst == exa->copy.src)
+          exa->copy.src_texture = renderer_clone_texture( exa->renderer,
+                                                          exa->copy.src->tex );
+       else
+          pipe_texture_reference(&exa->copy.src_texture,
+                                 exa->copy.src->tex);
+
+       exa->copy.dst_surface =
+          exa->scrn->get_tex_surface(exa->scrn,
+                                     exa->copy.dst->tex,
+                                     0, 0, 0,
+                                     PIPE_BUFFER_USAGE_GPU_WRITE);
 
-    if (!priv->tex || !src_priv->tex)
-	return FALSE;
 
-    if (!exa->ctx || !exa->ctx->surface_copy)
-	return FALSE;
+       renderer_copy_prepare(exa->renderer, 
+                             exa->copy.dst_surface,
+                             exa->copy.src_texture );
+    }
 
-    priv->src_surf = exa_gpu_surface(exa, src_priv);
 
-    return TRUE;
+    return exa->accel;
 }
 
 static void
 ExaCopy(PixmapPtr pDstPixmap, int srcX, int srcY, int dstX, int dstY,
 	int width, int height)
 {
-    ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
+   ScrnInfoPtr pScrn = xf86Screens[pDstPixmap->drawable.pScreen->myNum];
+   modesettingPtr ms = modesettingPTR(pScrn);
+   struct exa_context *exa = ms->exa;
+   struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pDstPixmap);
+
+#if DEBUG_PRINT
+   debug_printf("\tExaCopy(srcx=%d, srcy=%d, dstX=%d, dstY=%d, w=%d, h=%d)\n",
+                srcX, srcY, dstX, dstY, width, height);
+#endif
+
+   debug_assert(priv == exa->copy.dst);
+
+   if (exa->copy.use_surface_copy) {
+      /* XXX: consider exposing >1 box in surface_copy interface.
+       */
+      exa->pipe->surface_copy( exa->pipe,
+                             exa->copy.dst_surface,
+                             dstX, dstY,
+                             exa->copy.src_surface,
+                             srcX, srcY,
+                             width, height );
+   }
+   else {
+      renderer_copy_pixmap(exa->renderer, 
+                           dstX, dstY,
+                           srcX, srcY,
+                           width, height,
+                           exa->copy.src_texture->width[0],
+                           exa->copy.src_texture->height[0]);
+   }
+}
+
+static void
+ExaDoneCopy(PixmapPtr pPixmap)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
     modesettingPtr ms = modesettingPTR(pScrn);
+    struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pPixmap);
     struct exa_context *exa = ms->exa;
-    struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pDstPixmap);
-    struct pipe_surface *surf = exa_gpu_surface(exa, priv);
 
-    exa->ctx->surface_copy(exa->ctx, surf, dstX, dstY, priv->src_surf,
-			   srcX, srcY, width, height);
-    exa->scrn->tex_surface_destroy(surf);
+    if (!priv)
+	return;
+
+   renderer_draw_flush(exa->renderer);
+
+   exa->copy.src = NULL;
+   exa->copy.dst = NULL;
+   pipe_surface_reference(&exa->copy.src_surface, NULL);
+   pipe_surface_reference(&exa->copy.dst_surface, NULL);
+   pipe_texture_reference(&exa->copy.src_texture, NULL);
 }
 
+
+
+static Bool
+picture_check_formats(struct exa_pixmap_priv *pSrc, PicturePtr pSrcPicture)
+{
+   if (pSrc->picture_format == pSrcPicture->format)
+      return TRUE;
+
+   if (pSrc->picture_format != PICT_a8r8g8b8)
+      return FALSE;
+
+   /* pSrc->picture_format == PICT_a8r8g8b8 */
+   switch (pSrcPicture->format) {
+   case PICT_a8r8g8b8:
+   case PICT_x8r8g8b8:
+   case PICT_a8b8g8r8:
+   case PICT_x8b8g8r8:
+   /* just treat these two as x8... */
+   case PICT_r8g8b8:
+   case PICT_b8g8r8:
+      return TRUE;
+#ifdef PICT_TYPE_BGRA
+   case PICT_b8g8r8a8:
+   case PICT_b8g8r8x8:
+      return FALSE; /* does not support swizzling the alpha channel yet */
+   case PICT_a2r10g10b10:
+   case PICT_x2r10g10b10:
+   case PICT_a2b10g10r10:
+   case PICT_x2b10g10r10:
+      return FALSE;
+#endif
+   default:
+      return FALSE;
+   }
+   return FALSE;
+}
+
+/***********************************************************************
+ * Composite entrypoints
+ */
+
+static Bool
+ExaCheckComposite(int op,
+		  PicturePtr pSrcPicture, PicturePtr pMaskPicture,
+		  PicturePtr pDstPicture)
+{
+   ScrnInfoPtr pScrn = xf86Screens[pDstPicture->pDrawable->pScreen->myNum];
+   modesettingPtr ms = modesettingPTR(pScrn);
+   struct exa_context *exa = ms->exa;
+   boolean accelerated = xorg_composite_accelerated(op,
+                                                    pSrcPicture,
+                                                    pMaskPicture,
+                                                    pDstPicture);
+#if DEBUG_PRINT
+   debug_printf("ExaCheckComposite(%d, %p, %p, %p) = %d\n",
+                op, pSrcPicture, pMaskPicture, pDstPicture, accelerated);
+#endif
+   return exa->accel && accelerated;
+}
+
+
 static Bool
 ExaPrepareComposite(int op, PicturePtr pSrcPicture,
 		    PicturePtr pMaskPicture, PicturePtr pDstPicture,
@@ -341,11 +618,71 @@ ExaPrepareComposite(int op, PicturePtr pSrcPicture,
    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
    modesettingPtr ms = modesettingPTR(pScrn);
    struct exa_context *exa = ms->exa;
+   struct exa_pixmap_priv *priv;
+
+#if DEBUG_PRINT
+   debug_printf("ExaPrepareComposite(%d, src=0x%p, mask=0x%p, dst=0x%p)\n",
+                op, pSrcPicture, pMaskPicture, pDstPicture);
+   debug_printf("\tFormats: src(%s), mask(%s), dst(%s)\n",
+                pSrcPicture ? render_format_name(pSrcPicture->format) : "none",
+                pMaskPicture ? render_format_name(pMaskPicture->format) : "none",
+                pDstPicture ? render_format_name(pDstPicture->format) : "none");
+#endif
+   if (!exa->pipe)
+      XORG_FALLBACK("accel not enabled");
+
+   priv = exaGetPixmapDriverPrivate(pDst);
+   if (!priv || !priv->tex)
+      XORG_FALLBACK("pDst %s", !priv ? "!priv" : "!priv->tex");
+
+   if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                       priv->tex->target,
+                                       PIPE_TEXTURE_USAGE_RENDER_TARGET, 0))
+      XORG_FALLBACK("pDst format: %s", pf_name(priv->tex->format));
+
+   if (priv->picture_format != pDstPicture->format)
+      XORG_FALLBACK("pDst pic_format: %s != %s",
+                    render_format_name(priv->picture_format),
+                    render_format_name(pDstPicture->format));
+
+   if (pSrc) {
+      priv = exaGetPixmapDriverPrivate(pSrc);
+      if (!priv || !priv->tex)
+         XORG_FALLBACK("pSrc %s", !priv ? "!priv" : "!priv->tex");
+
+      if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                          priv->tex->target,
+                                          PIPE_TEXTURE_USAGE_SAMPLER, 0))
+         XORG_FALLBACK("pSrc format: %s", pf_name(priv->tex->format));
+
+      if (!picture_check_formats(priv, pSrcPicture))
+         XORG_FALLBACK("pSrc pic_format: %s != %s",
+                       render_format_name(priv->picture_format),
+                       render_format_name(pSrcPicture->format));
 
-   return xorg_composite_bind_state(exa, op, pSrcPicture, pMaskPicture,
+   }
+
+   if (pMask) {
+      priv = exaGetPixmapDriverPrivate(pMask);
+      if (!priv || !priv->tex)
+         XORG_FALLBACK("pMask %s", !priv ? "!priv" : "!priv->tex");
+
+      if (!exa->scrn->is_format_supported(exa->scrn, priv->tex->format,
+                                          priv->tex->target,
+                                          PIPE_TEXTURE_USAGE_SAMPLER, 0))
+         XORG_FALLBACK("pMask format: %s", pf_name(priv->tex->format));
+
+      if (!picture_check_formats(priv, pMaskPicture))
+         XORG_FALLBACK("pMask pic_format: %s != %s",
+                       render_format_name(priv->picture_format),
+                       render_format_name(pMaskPicture->format));
+   }
+
+   return exa->accel &&
+          xorg_composite_bind_state(exa, op, pSrcPicture, pMaskPicture,
                                     pDstPicture,
-                                    exaGetPixmapDriverPrivate(pSrc),
-                                    exaGetPixmapDriverPrivate(pMask),
+                                    pSrc ? exaGetPixmapDriverPrivate(pSrc) : NULL,
+                                    pMask ? exaGetPixmapDriverPrivate(pMask) : NULL,
                                     exaGetPixmapDriverPrivate(pDst));
 }
 
@@ -358,21 +695,34 @@ ExaComposite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
    struct exa_context *exa = ms->exa;
    struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pDst);
 
+#if DEBUG_PRINT
+   debug_printf("\tExaComposite(src[%d,%d], mask=[%d, %d], dst=[%d, %d], dim=[%d, %d])\n",
+                srcX, srcY, maskX, maskY, dstX, dstY, width, height);
+   debug_printf("\t   Num bound samplers = %d\n",
+                exa->num_bound_samplers);
+#endif
+
    xorg_composite(exa, priv, srcX, srcY, maskX, maskY,
                   dstX, dstY, width, height);
 }
 
-static Bool
-ExaCheckComposite(int op,
-		  PicturePtr pSrcPicture, PicturePtr pMaskPicture,
-		  PicturePtr pDstPicture)
+
+
+static void
+ExaDoneComposite(PixmapPtr pPixmap)
 {
-   return xorg_composite_accelerated(op,
-                                     pSrcPicture,
-                                     pMaskPicture,
-                                     pDstPicture);
+   ScrnInfoPtr pScrn = xf86Screens[pPixmap->drawable.pScreen->myNum];
+   modesettingPtr ms = modesettingPTR(pScrn);
+   struct exa_context *exa = ms->exa;
+
+   xorg_composite_done(exa);
 }
 
+
+/***********************************************************************
+ * Pixmaps
+ */
+
 static void *
 ExaCreatePixmap(ScreenPtr pScreen, int size, int align)
 {
@@ -389,14 +739,11 @@ static void
 ExaDestroyPixmap(ScreenPtr pScreen, void *dPriv)
 {
     struct exa_pixmap_priv *priv = (struct exa_pixmap_priv *)dPriv;
-    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
-    modesettingPtr ms = modesettingPTR(pScrn);
 
     if (!priv)
 	return;
 
-    if (priv->tex)
-	ms->screen->texture_destroy(priv->tex);
+    pipe_texture_reference(&priv->tex, NULL);
 
     xfree(priv);
 }
@@ -479,6 +826,22 @@ xorg_exa_get_pixmap_handle(PixmapPtr pPixmap, unsigned *stride_out)
 }
 
 static Bool
+size_match( int width, int tex_width )
+{
+#if ROUND_UP_TEXTURES
+   if (width > tex_width)
+      return FALSE;
+
+   if (width * 2 < tex_width)
+      return FALSE;
+
+   return TRUE;
+#else
+   return width == tex_width;
+#endif
+}
+
+static Bool
 ExaModifyPixmapHeader(PixmapPtr pPixmap, int width, int height,
 		      int depth, int bitsPerPixel, int devKind,
 		      pointer pPixData)
@@ -492,6 +855,17 @@ ExaModifyPixmapHeader(PixmapPtr pPixmap, int width, int height,
     if (!priv || pPixData)
 	return FALSE;
 
+    if (0) {
+       debug_printf("%s pixmap %p sz %dx%dx%d devKind %d\n",
+                    __FUNCTION__, pPixmap, width, height, bitsPerPixel, devKind);
+       
+       if (priv->tex)
+          debug_printf("  ==> old texture %dx%d\n",
+                       priv->tex->width[0], 
+                       priv->tex->height[0]);
+    }
+
+
     if (depth <= 0)
 	depth = pPixmap->drawable.depth;
 
@@ -510,70 +884,60 @@ ExaModifyPixmapHeader(PixmapPtr pPixmap, int width, int height,
     miModifyPixmapHeader(pPixmap, width, height, depth,
 			     bitsPerPixel, devKind, NULL);
 
+    priv->width = width;
+    priv->height = height;
+
     /* Deal with screen resize */
-    if (!priv->tex ||
-        (priv->tex->width[0] != width ||
-         priv->tex->height[0] != height ||
+    if ((exa->accel || priv->flags) &&
+        (!priv->tex ||
+         !size_match(width, priv->tex->width[0]) ||
+         !size_match(height, priv->tex->height[0]) ||
          priv->tex_flags != priv->flags)) {
 	struct pipe_texture *texture = NULL;
-
-#ifdef DRM_MODE_FEATURE_DIRTYFB
-	if (priv->flags)
-#endif
-	{
-	    struct pipe_texture template;
-
-	    memset(&template, 0, sizeof(template));
-	    template.target = PIPE_TEXTURE_2D;
-	    exa_get_pipe_format(depth, &template.format, &bitsPerPixel);
-	    pf_get_block(template.format, &template.block);
-	    template.width[0] = width;
-	    template.height[0] = height;
-	    template.depth[0] = 1;
-	    template.last_level = 0;
-	    template.tex_usage = PIPE_TEXTURE_USAGE_RENDER_TARGET | priv->flags;
-	    priv->tex_flags = priv->flags;
-	    texture = exa->scrn->texture_create(exa->scrn, &template);
-
-	    if (priv->tex) {
-		struct pipe_surface *dst_surf;
-
-		dst_surf = exa->scrn->get_tex_surface(exa->scrn, texture, 0, 0, 0,
-						      PIPE_BUFFER_USAGE_GPU_WRITE);
-		priv->src_surf = exa_gpu_surface(exa, priv);
-		exa->ctx->surface_copy(exa->ctx, dst_surf, 0, 0, priv->src_surf,
-				       0, 0, min(width, texture->width[0]),
-				       min(height, texture->height[0]));
-		exa->scrn->tex_surface_destroy(dst_surf);
-		exa->scrn->tex_surface_destroy(priv->src_surf);
-		priv->src_surf = NULL;
-	    } else if (pPixmap->devPrivate.ptr) {
-		struct pipe_transfer *transfer;
-
-		if (priv->map_count != 0)
-		     FatalError("doing ExaModifyPixmapHeader on mapped buffer\n");
-
-		transfer =
-		    exa->scrn->get_tex_transfer(exa->scrn, texture, 0, 0, 0,
-						PIPE_TRANSFER_WRITE,
-						0, 0, width, height);
-		util_copy_rect(exa->scrn->transfer_map(exa->scrn, transfer),
-			       &texture->block, transfer->stride, 0, 0,
-			       width, height, pPixmap->devPrivate.ptr,
-			       pPixmap->devKind, 0, 0);
-		exa->scrn->transfer_unmap(exa->scrn, transfer);
-		exa->scrn->tex_transfer_destroy(transfer);
-	    }
+	struct pipe_texture template;
+
+	memset(&template, 0, sizeof(template));
+	template.target = PIPE_TEXTURE_2D;
+	exa_get_pipe_format(depth, &template.format, &bitsPerPixel, &priv->picture_format);
+	pf_get_block(template.format, &template.block);
+        if (ROUND_UP_TEXTURES && priv->flags == 0) {
+           template.width[0] = util_next_power_of_two(width);
+           template.height[0] = util_next_power_of_two(height);
+        }
+        else {
+           template.width[0] = width;
+           template.height[0] = height;
+        }
+
+	template.depth[0] = 1;
+	template.last_level = 0;
+	template.tex_usage = PIPE_TEXTURE_USAGE_RENDER_TARGET | priv->flags;
+	priv->tex_flags = priv->flags;
+	texture = exa->scrn->texture_create(exa->scrn, &template);
+
+	if (priv->tex) {
+	    struct pipe_surface *dst_surf;
+	    struct pipe_surface *src_surf;
+
+	    dst_surf = exa->scrn->get_tex_surface(
+		exa->scrn, texture, 0, 0, 0, PIPE_BUFFER_USAGE_GPU_WRITE);
+	    src_surf = xorg_gpu_surface(exa->pipe->screen, priv);
+            if (exa->pipe->surface_copy) {
+               exa->pipe->surface_copy(exa->pipe, dst_surf, 0, 0, src_surf,
+                                       0, 0, min(width, texture->width[0]),
+                                       min(height, texture->height[0]));
+            } else {
+               util_surface_copy(exa->pipe, FALSE, dst_surf, 0, 0, src_surf,
+                                 0, 0, min(width, texture->width[0]),
+                                 min(height, texture->height[0]));
+            }
+	    exa->scrn->tex_surface_destroy(dst_surf);
+	    exa->scrn->tex_surface_destroy(src_surf);
 	}
-#ifdef DRM_MODE_FEATURE_DIRTYFB
-	else {
-	    xfree(pPixmap->devPrivate.ptr);
-	    pPixmap->devPrivate.ptr = xalloc(pPixmap->drawable.height *
-					     pPixmap->devKind);
-	}
-#endif
 
 	pipe_texture_reference(&priv->tex, texture);
+	/* the texture we create has one reference */
+	pipe_texture_reference(&texture, NULL);
     }
 
     return TRUE;
@@ -588,31 +952,61 @@ xorg_exa_get_texture(PixmapPtr pPixmap)
    return tex;
 }
 
+Bool
+xorg_exa_set_texture(PixmapPtr pPixmap, struct  pipe_texture *tex)
+{
+    struct exa_pixmap_priv *priv = exaGetPixmapDriverPrivate(pPixmap);
+
+    int mask = PIPE_TEXTURE_USAGE_PRIMARY | PIPE_TEXTURE_USAGE_DISPLAY_TARGET;
+
+    if (!priv)
+	return FALSE;
+
+    if (pPixmap->drawable.width != tex->width[0] ||
+	pPixmap->drawable.height != tex->height[0])
+	return FALSE;
+
+    pipe_texture_reference(&priv->tex, tex);
+    priv->tex_flags = tex->tex_usage & mask;
+
+    return TRUE;
+}
+
+struct pipe_texture *
+xorg_exa_create_root_texture(ScrnInfoPtr pScrn,
+			     int width, int height,
+			     int depth, int bitsPerPixel)
+{
+    modesettingPtr ms = modesettingPTR(pScrn);
+    struct exa_context *exa = ms->exa;
+    struct pipe_texture template;
+    int dummy;
+
+    memset(&template, 0, sizeof(template));
+    template.target = PIPE_TEXTURE_2D;
+    exa_get_pipe_format(depth, &template.format, &bitsPerPixel, &dummy);
+    pf_get_block(template.format, &template.block);
+    template.width[0] = width;
+    template.height[0] = height;
+    template.depth[0] = 1;
+    template.last_level = 0;
+    template.tex_usage |= PIPE_TEXTURE_USAGE_RENDER_TARGET;
+    template.tex_usage |= PIPE_TEXTURE_USAGE_PRIMARY;
+    template.tex_usage |= PIPE_TEXTURE_USAGE_DISPLAY_TARGET;
+
+    return exa->scrn->texture_create(exa->scrn, &template);
+}
+
 void
 xorg_exa_close(ScrnInfoPtr pScrn)
 {
    modesettingPtr ms = modesettingPTR(pScrn);
    struct exa_context *exa = ms->exa;
-   struct pipe_constant_buffer *vsbuf = &exa->vs_const_buffer;
-   struct pipe_constant_buffer *fsbuf = &exa->fs_const_buffer;
-
-   if (exa->shaders) {
-      xorg_shaders_destroy(exa->shaders);
-   }
-
-   if (vsbuf && vsbuf->buffer)
-      pipe_buffer_reference(&vsbuf->buffer, NULL);
 
-   if (fsbuf && fsbuf->buffer)
-      pipe_buffer_reference(&fsbuf->buffer, NULL);
+   renderer_destroy(exa->renderer);
 
-   if (exa->cso) {
-      cso_release_all(exa->cso);
-      cso_destroy_context(exa->cso);
-   }
-
-   if (exa->ctx)
-      exa->ctx->destroy(exa->ctx);
+   if (exa->pipe)
+      exa->pipe->destroy(exa->pipe);
 
    exaDriverFini(pScrn->pScreen);
    xfree(exa);
@@ -620,7 +1014,7 @@ xorg_exa_close(ScrnInfoPtr pScrn)
 }
 
 void *
-xorg_exa_init(ScrnInfoPtr pScrn)
+xorg_exa_init(ScrnInfoPtr pScrn, Bool accel)
 {
    modesettingPtr ms = modesettingPTR(pScrn);
    struct exa_context *exa;
@@ -645,6 +1039,12 @@ xorg_exa_init(ScrnInfoPtr pScrn)
    pExa->pixmapOffsetAlign = 0;
    pExa->pixmapPitchAlign  = 1;
    pExa->flags             = EXA_OFFSCREEN_PIXMAPS | EXA_HANDLES_PIXMAPS;
+#ifdef EXA_SUPPORTS_PREPARE_AUX
+   pExa->flags            |= EXA_SUPPORTS_PREPARE_AUX;
+#endif
+#ifdef EXA_MIXED_PIXMAPS
+   pExa->flags            |= EXA_MIXED_PIXMAPS;
+#endif
    pExa->maxX              = 8191; /* FIXME */
    pExa->maxY              = 8191; /* FIXME */
 
@@ -652,10 +1052,10 @@ xorg_exa_init(ScrnInfoPtr pScrn)
    pExa->MarkSync           = ExaMarkSync;
    pExa->PrepareSolid       = ExaPrepareSolid;
    pExa->Solid              = ExaSolid;
-   pExa->DoneSolid          = ExaDone;
+   pExa->DoneSolid          = ExaDoneSolid;
    pExa->PrepareCopy        = ExaPrepareCopy;
    pExa->Copy               = ExaCopy;
-   pExa->DoneCopy           = ExaDone;
+   pExa->DoneCopy           = ExaDoneCopy;
    pExa->CheckComposite     = ExaCheckComposite;
    pExa->PrepareComposite   = ExaPrepareComposite;
    pExa->Composite          = ExaComposite;
@@ -674,12 +1074,12 @@ xorg_exa_init(ScrnInfoPtr pScrn)
    }
 
    exa->scrn = ms->screen;
-   exa->ctx = ms->api->create_context(ms->api, exa->scrn);
+   exa->pipe = ms->api->create_context(ms->api, exa->scrn);
    /* Share context with DRI */
-   ms->ctx = exa->ctx;
+   ms->ctx = exa->pipe;
 
-   exa->cso = cso_create_context(exa->ctx);
-   exa->shaders = xorg_shaders_create(exa);
+   exa->renderer = renderer_create(exa->pipe);
+   exa->accel = accel;
 
    return (void *)exa;
 
@@ -690,11 +1090,27 @@ out_err:
 }
 
 struct pipe_surface *
-exa_gpu_surface(struct exa_context *exa, struct exa_pixmap_priv *priv)
+xorg_gpu_surface(struct pipe_screen *scrn, struct exa_pixmap_priv *priv)
 {
-   return exa->scrn->get_tex_surface(exa->scrn, priv->tex, 0, 0, 0,
-                                     PIPE_BUFFER_USAGE_GPU_READ |
-                                     PIPE_BUFFER_USAGE_GPU_WRITE);
+   return scrn->get_tex_surface(scrn, priv->tex, 0, 0, 0,
+                                PIPE_BUFFER_USAGE_GPU_READ |
+                                PIPE_BUFFER_USAGE_GPU_WRITE);
+
+}
+
+void xorg_exa_flush(struct exa_context *exa, uint pipeFlushFlags,
+                    struct pipe_fence_handle **fence)
+{
+   exa->pipe->flush(exa->pipe, pipeFlushFlags, fence);
+}
+
+void xorg_exa_finish(struct exa_context *exa)
+{
+   struct pipe_fence_handle *fence = NULL;
+
+   xorg_exa_flush(exa, PIPE_FLUSH_RENDER_CACHE, &fence);
 
+   exa->pipe->screen->fence_finish(exa->pipe->screen, fence, 0);
+   exa->pipe->screen->fence_reference(exa->pipe->screen, &fence, NULL);
 }
 
diff --git a/src/gallium/state_trackers/xorg/xorg_exa.h b/src/gallium/state_trackers/xorg/xorg_exa.h
index 5b515be1397..f2cefe23b99 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa.h
+++ b/src/gallium/state_trackers/xorg/xorg_exa.h
@@ -14,37 +14,69 @@ struct xorg_shaders;
 struct exa_context
 {
    ExaDriverPtr pExa;
-   struct pipe_context *ctx;
+   struct pipe_context *pipe;
    struct pipe_screen *scrn;
-   struct cso_context *cso;
-   struct xorg_shaders *shaders;
-
-   struct pipe_constant_buffer vs_const_buffer;
-   struct pipe_constant_buffer fs_const_buffer;
+   struct xorg_renderer *renderer;
 
    struct pipe_texture *bound_textures[MAX_EXA_SAMPLERS];
    int num_bound_samplers;
 
    float solid_color[4];
-};
+   boolean has_solid_color;
+
+   boolean accel;
+
+   /* float[9] projective matrix bound to pictures */
+   struct {
+      float    src[9];
+      float   mask[9];
+      boolean has_src;
+      boolean has_mask;
+   } transform;
+
+   struct {
+      boolean use_surface_copy;
+
+      struct exa_pixmap_priv *src;
+      struct exa_pixmap_priv *dst;
 
+      struct pipe_surface *src_surface;
+      struct pipe_surface *dst_surface;
+
+      struct pipe_texture *src_texture;
+   } copy;
+};
 
 struct exa_pixmap_priv
 {
+   int width, height;
+
    int flags;
    int tex_flags;
 
+   int picture_format;
+
    struct pipe_texture *tex;
    struct pipe_texture *depth_stencil_tex;
-   unsigned int color;
-   struct pipe_surface *src_surf; /* for copies */
 
    struct pipe_transfer *map_transfer;
    unsigned map_count;
 };
 
+#define XORG_FALLBACK(s, arg...)                              \
+do {                                                          \
+   if (ms->debug_fallback) {                                  \
+      xf86DrvMsg(pScrn->scrnIndex, X_INFO,                    \
+                 "%s fallback " s "\n", __FUNCTION__, ##arg); \
+   }                                                          \
+   return FALSE;                                              \
+} while(0)
+
 struct pipe_surface *
-exa_gpu_surface(struct exa_context *exa, struct exa_pixmap_priv *priv);
+xorg_gpu_surface(struct pipe_screen *scrn, struct exa_pixmap_priv *priv);
 
+void xorg_exa_flush(struct exa_context *exa, uint pipeFlushFlags,
+                    struct pipe_fence_handle **fence);
+void xorg_exa_finish(struct exa_context *exa);
 
 #endif
diff --git a/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c b/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c
index cfee10c3b30..13a9840bddd 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c
+++ b/src/gallium/state_trackers/xorg/xorg_exa_tgsi.c
@@ -43,34 +43,67 @@
  * OUT[0] = color
  */
 
+static void
+print_fs_traits(int fs_traits)
+{
+   const char *strings[] = {
+      "FS_COMPOSITE",       // = 1 << 0,
+      "FS_MASK",            // = 1 << 1,
+      "FS_SOLID_FILL",      // = 1 << 2,
+      "FS_LINGRAD_FILL",    // = 1 << 3,
+      "FS_RADGRAD_FILL",    // = 1 << 4,
+      "FS_CA_FULL",         // = 1 << 5, /* src.rgba * mask.rgba */
+      "FS_CA_SRCALPHA",     // = 1 << 6, /* src.aaaa * mask.rgba */
+      "FS_YUV",             // = 1 << 7,
+      "FS_SRC_REPEAT_NONE", // = 1 << 8,
+      "FS_MASK_REPEAT_NONE",// = 1 << 9,
+      "FS_SRC_SWIZZLE_RGB", // = 1 << 10,
+      "FS_MASK_SWIZZLE_RGB",// = 1 << 11,
+      "FS_SRC_SET_ALPHA",   // = 1 << 12,
+      "FS_MASK_SET_ALPHA",  // = 1 << 13,
+      "FS_SRC_LUMINANCE",   // = 1 << 14,
+      "FS_MASK_LUMINANCE",  // = 1 << 15,
+   };
+   int i, k;
+   debug_printf("%s: ", __func__);
+
+   for (i = 0, k = 1; k < (1 << 16); i++, k <<= 1) {
+      if (fs_traits & k)
+         debug_printf("%s, ", strings[i]);
+   }
+
+   debug_printf("\n");
+}
+
 struct xorg_shaders {
-   struct exa_context *exa;
+   struct xorg_renderer *r;
 
    struct cso_hash *vs_hash;
    struct cso_hash *fs_hash;
 };
 
-static const char over_op[] =
-   "SUB TEMP[3], CONST[0].wwww, TEMP[1].wwww\n"
-   "MUL TEMP[3], TEMP[0], TEMP[3]\n"
-   "ADD TEMP[0], TEMP[3], TEMP[0]\n";
-
-
-static INLINE void
-create_preamble(struct ureg_program *ureg)
-{
-}
-
-
 static INLINE void
 src_in_mask(struct ureg_program *ureg,
             struct ureg_dst dst,
             struct ureg_src src,
-            struct ureg_src mask)
+            struct ureg_src mask,
+            unsigned component_alpha,
+            unsigned mask_luminance)
 {
-   /* MUL dst, src, mask.wwww */
-   ureg_MUL(ureg, dst, src,
-            ureg_scalar(mask, TGSI_SWIZZLE_W));
+   if (component_alpha == FS_CA_FULL) {
+      ureg_MUL(ureg, dst, src, mask);
+   } else if (component_alpha == FS_CA_SRCALPHA) {
+      ureg_MUL(ureg, dst,
+               ureg_scalar(src, TGSI_SWIZZLE_W), mask);
+   }
+   else {
+      if (mask_luminance)
+         ureg_MUL(ureg, dst, src,
+                  ureg_scalar(mask, TGSI_SWIZZLE_X));
+      else
+         ureg_MUL(ureg, dst, src,
+                  ureg_scalar(mask, TGSI_SWIZZLE_W));
+   }
 }
 
 static struct ureg_src
@@ -79,8 +112,7 @@ vs_normalize_coords(struct ureg_program *ureg, struct ureg_src coords,
 {
    struct ureg_dst tmp = ureg_DECL_temporary(ureg);
    struct ureg_src ret;
-   ureg_MUL(ureg, tmp, coords, const0);
-   ureg_ADD(ureg, tmp, ureg_src(tmp), const1);
+   ureg_MAD(ureg, tmp, coords, const0, const1);
    ret = ureg_src(tmp);
    ureg_release_temporary(ureg, tmp);
    return ret;
@@ -238,45 +270,49 @@ create_vs(struct pipe_context *pipe,
    struct ureg_src src;
    struct ureg_dst dst;
    struct ureg_src const0, const1;
-   boolean is_fill = vs_traits & VS_FILL;
-   boolean is_composite = vs_traits & VS_COMPOSITE;
-   boolean has_mask = vs_traits & VS_MASK;
+   boolean is_fill = (vs_traits & VS_FILL) != 0;
+   boolean is_composite = (vs_traits & VS_COMPOSITE) != 0;
+   boolean has_mask = (vs_traits & VS_MASK) != 0;
+   boolean is_yuv = (vs_traits & VS_YUV) != 0;
+   unsigned input_slot = 0;
 
    ureg = ureg_create(TGSI_PROCESSOR_VERTEX);
    if (ureg == NULL)
       return 0;
 
-   const0 = ureg_DECL_constant(ureg);
-   const1 = ureg_DECL_constant(ureg);
+   const0 = ureg_DECL_constant(ureg, 0);
+   const1 = ureg_DECL_constant(ureg, 1);
 
    /* it has to be either a fill or a composite op */
-   debug_assert(is_fill ^ is_composite);
+   debug_assert(is_fill + is_composite + is_yuv == 1); /* exactly one shader kind */
 
-   src = ureg_DECL_vs_input(ureg,
-                            TGSI_SEMANTIC_POSITION, 0);
+   src = ureg_DECL_vs_input(ureg, input_slot++);
    dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
    src = vs_normalize_coords(ureg, src,
                              const0, const1);
    ureg_MOV(ureg, dst, src);
 
+   if (is_yuv) {
+      src = ureg_DECL_vs_input(ureg, input_slot++);
+      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0);
+      ureg_MOV(ureg, dst, src);
+   }
 
    if (is_composite) {
-      src = ureg_DECL_vs_input(ureg,
-                               TGSI_SEMANTIC_GENERIC, 1);
-      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 1);
+      src = ureg_DECL_vs_input(ureg, input_slot++);
+      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0);
       ureg_MOV(ureg, dst, src);
    }
+
    if (is_fill) {
-      src = ureg_DECL_vs_input(ureg,
-                               TGSI_SEMANTIC_COLOR, 1);
-      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1);
+      src = ureg_DECL_vs_input(ureg, input_slot++);
+      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
       ureg_MOV(ureg, dst, src);
    }
 
    if (has_mask) {
-      src = ureg_DECL_vs_input(ureg,
-                               TGSI_SEMANTIC_GENERIC, 2);
-      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 2);
+      src = ureg_DECL_vs_input(ureg, input_slot++);
+      dst = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 1);
       ureg_MOV(ureg, dst, src);
    }
 
@@ -286,6 +322,138 @@ create_vs(struct pipe_context *pipe,
 }
 
 static void *
+create_yuv_shader(struct pipe_context *pipe, struct ureg_program *ureg)
+{
+   struct ureg_src y_sampler, u_sampler, v_sampler;
+   struct ureg_src pos;
+   struct ureg_src matrow0, matrow1, matrow2;
+   struct ureg_dst y, u, v, rgb;
+   struct ureg_dst out = ureg_DECL_output(ureg,
+                                          TGSI_SEMANTIC_COLOR,
+                                          0);
+   /* Emits the yuv conversion fragment program into ureg, which is handed to ureg_create_shader_and_destroy() at the end. */
+   pos = ureg_DECL_fs_input(ureg,
+                            TGSI_SEMANTIC_GENERIC,
+                            0,
+                            TGSI_INTERPOLATE_PERSPECTIVE);
+
+   rgb = ureg_DECL_temporary(ureg);
+   y = ureg_DECL_temporary(ureg);
+   u = ureg_DECL_temporary(ureg);
+   v = ureg_DECL_temporary(ureg);
+
+   y_sampler = ureg_DECL_sampler(ureg, 0);
+   u_sampler = ureg_DECL_sampler(ureg, 1);
+   v_sampler = ureg_DECL_sampler(ureg, 2);
+
+   matrow0 = ureg_DECL_constant(ureg, 0);
+   matrow1 = ureg_DECL_constant(ureg, 1);
+   matrow2 = ureg_DECL_constant(ureg, 2);
+   /* fetch from all three samplers at the same coordinate */
+   ureg_TEX(ureg, y,
+            TGSI_TEXTURE_2D, pos, y_sampler);
+   ureg_TEX(ureg, u,
+            TGSI_TEXTURE_2D, pos, u_sampler);
+   ureg_TEX(ureg, v,
+            TGSI_TEXTURE_2D, pos, v_sampler);
+   /* u -= matrow0.w; v -= matrow0.w */
+   ureg_SUB(ureg, u, ureg_src(u),
+            ureg_scalar(matrow0, TGSI_SWIZZLE_W));
+   ureg_SUB(ureg, v, ureg_src(v),
+            ureg_scalar(matrow0, TGSI_SWIZZLE_W));
+   /* rgb = y.x * matrow0 + u.x * matrow1 + v.x * matrow2 */
+   ureg_MUL(ureg, rgb,
+            ureg_scalar(ureg_src(y), TGSI_SWIZZLE_X),
+            matrow0);
+   ureg_MAD(ureg, rgb,
+            ureg_scalar(ureg_src(u), TGSI_SWIZZLE_X),
+            matrow1,
+            ureg_src(rgb));
+   ureg_MAD(ureg, rgb,
+            ureg_scalar(ureg_src(v), TGSI_SWIZZLE_X),
+            matrow2,
+            ureg_src(rgb));
+
+   /* rgb.w = matrow0.x; NOTE(review): this is "rgb.a = 1" only if constant 0 holds 1 in x - confirm */
+   ureg_MOV(ureg, ureg_writemask(rgb, TGSI_WRITEMASK_W),
+            ureg_scalar(matrow0, TGSI_SWIZZLE_X));
+
+   ureg_MOV(ureg, out, ureg_src(rgb));
+
+   ureg_release_temporary(ureg, rgb);
+   ureg_release_temporary(ureg, y);
+   ureg_release_temporary(ureg, u);
+   ureg_release_temporary(ureg, v);
+
+   ureg_END(ureg);
+
+   return ureg_create_shader_and_destroy(ureg, pipe);
+}
+
+/* Emit a 2D fetch of sampler at coords into dst, with optional range masking, x/z swap and alpha override. */
+static INLINE void
+xrender_tex(struct ureg_program *ureg,
+            struct ureg_dst dst,
+            struct ureg_src coords,
+            struct ureg_src sampler,
+            struct ureg_src imm0,      /* create_fs passes the immediate (0, 0, 0, 1) */
+            boolean repeat_none,       /* scale the texel by the range test on coords below */
+            boolean swizzle,           /* swap the x and z components of the texel */
+            boolean set_alpha)         /* overwrite the texel's w with imm0.w */
+{
+   if (repeat_none) {
+      struct ureg_dst tmp0 = ureg_DECL_temporary(ureg);
+      struct ureg_dst tmp1 = ureg_DECL_temporary(ureg);
+      ureg_SGT(ureg, tmp1, ureg_swizzle(coords,   /* tmp1 = coords.xyxy > imm0.x */
+                                        TGSI_SWIZZLE_X,
+                                        TGSI_SWIZZLE_Y,
+                                        TGSI_SWIZZLE_X,
+                                        TGSI_SWIZZLE_Y),
+               ureg_scalar(imm0, TGSI_SWIZZLE_X));
+      ureg_SLT(ureg, tmp0, ureg_swizzle(coords,   /* tmp0 = coords.xyxy < imm0.w */
+                                        TGSI_SWIZZLE_X,
+                                        TGSI_SWIZZLE_Y,
+                                        TGSI_SWIZZLE_X,
+                                        TGSI_SWIZZLE_Y),
+               ureg_scalar(imm0, TGSI_SWIZZLE_W));
+      ureg_MIN(ureg, tmp0, ureg_src(tmp0), ureg_src(tmp1));   /* per component: min of both test results */
+      ureg_MIN(ureg, tmp0, ureg_scalar(ureg_src(tmp0), TGSI_SWIZZLE_X),   /* fold the x and y results into one factor */
+               ureg_scalar(ureg_src(tmp0), TGSI_SWIZZLE_Y));
+      ureg_TEX(ureg, tmp1, TGSI_TEXTURE_2D, coords, sampler);   /* tmp1 is reused for the texel */
+      if (swizzle)
+         ureg_MOV(ureg, tmp1, ureg_swizzle(ureg_src(tmp1),
+                                           TGSI_SWIZZLE_Z,
+                                           TGSI_SWIZZLE_Y,
+                                           TGSI_SWIZZLE_X,
+                                           TGSI_SWIZZLE_W));
+      if (set_alpha)
+         ureg_MOV(ureg,
+                  ureg_writemask(tmp1, TGSI_WRITEMASK_W),
+                  ureg_scalar(imm0, TGSI_SWIZZLE_W));
+      ureg_MUL(ureg, dst, ureg_src(tmp1), ureg_src(tmp0));   /* dst = texel * range factor (presumably 1.0 or 0.0 - confirm SGT/SLT results) */
+      ureg_release_temporary(ureg, tmp0);
+      ureg_release_temporary(ureg, tmp1);
+   } else {
+      if (swizzle) {
+         struct ureg_dst tmp = ureg_DECL_temporary(ureg);   /* fetch into a temporary, swizzle into dst */
+         ureg_TEX(ureg, tmp, TGSI_TEXTURE_2D, coords, sampler);
+         ureg_MOV(ureg, dst, ureg_swizzle(ureg_src(tmp),
+                                          TGSI_SWIZZLE_Z,
+                                          TGSI_SWIZZLE_Y,
+                                          TGSI_SWIZZLE_X,
+                                          TGSI_SWIZZLE_W));
+         ureg_release_temporary(ureg, tmp);
+      } else {
+         ureg_TEX(ureg, dst, TGSI_TEXTURE_2D, coords, sampler);   /* plain fetch straight into dst */
+      }
+      if (set_alpha)
+         ureg_MOV(ureg,
+                  ureg_writemask(dst, TGSI_WRITEMASK_W),
+                  ureg_scalar(imm0, TGSI_SWIZZLE_W));
+   }
+}
+
+static void *
 create_fs(struct pipe_context *pipe,
           unsigned fs_traits)
 {
@@ -294,32 +462,53 @@ create_fs(struct pipe_context *pipe,
    struct ureg_src /*dst_pos,*/ src_input, mask_pos;
    struct ureg_dst src, mask;
    struct ureg_dst out;
-   boolean has_mask = fs_traits & FS_MASK;
-   boolean is_fill = fs_traits & FS_FILL;
-   boolean is_composite = fs_traits & FS_COMPOSITE;
-   boolean is_solid   = fs_traits & FS_SOLID_FILL;
-   boolean is_lingrad = fs_traits & FS_LINGRAD_FILL;
-   boolean is_radgrad = fs_traits & FS_RADGRAD_FILL;
+   struct ureg_src imm0 = { 0 };
+   unsigned has_mask = (fs_traits & FS_MASK) != 0;
+   unsigned is_fill = (fs_traits & FS_FILL) != 0;
+   unsigned is_composite = (fs_traits & FS_COMPOSITE) != 0;
+   unsigned is_solid   = (fs_traits & FS_SOLID_FILL) != 0;
+   unsigned is_lingrad = (fs_traits & FS_LINGRAD_FILL) != 0;
+   unsigned is_radgrad = (fs_traits & FS_RADGRAD_FILL) != 0;
+   unsigned comp_alpha_mask = fs_traits & FS_COMPONENT_ALPHA;
+   unsigned is_yuv = (fs_traits & FS_YUV) != 0;
+   unsigned src_repeat_none = (fs_traits & FS_SRC_REPEAT_NONE) != 0;
+   unsigned mask_repeat_none = (fs_traits & FS_MASK_REPEAT_NONE) != 0;
+   unsigned src_swizzle = (fs_traits & FS_SRC_SWIZZLE_RGB) != 0;
+   unsigned mask_swizzle = (fs_traits & FS_MASK_SWIZZLE_RGB) != 0;
+   unsigned src_set_alpha = (fs_traits & FS_SRC_SET_ALPHA) != 0;
+   unsigned mask_set_alpha = (fs_traits & FS_MASK_SET_ALPHA) != 0;
+   unsigned src_luminance = (fs_traits & FS_SRC_LUMINANCE) != 0;
+   unsigned mask_luminance = (fs_traits & FS_MASK_LUMINANCE) != 0;
+
+#if 0
+   print_fs_traits(fs_traits);
+#else
+   (void)print_fs_traits;
+#endif
 
    ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
    if (ureg == NULL)
       return 0;
 
-   /* it has to be either a fill or a composite op */
-   debug_assert(is_fill ^ is_composite);
+   /* it has to be either a fill, a composite op or a yuv conversion */
+   debug_assert((is_fill ^ is_composite) ^ is_yuv);
 
    out = ureg_DECL_output(ureg,
                           TGSI_SEMANTIC_COLOR,
                           0);
 
+   if (src_repeat_none || mask_repeat_none ||
+       src_set_alpha || mask_set_alpha ||
+       src_luminance) {
+      imm0 = ureg_imm4f(ureg, 0, 0, 0, 1);
+   }
    if (is_composite) {
       src_sampler = ureg_DECL_sampler(ureg, 0);
       src_input = ureg_DECL_fs_input(ureg,
-                                     TGSI_SEMANTIC_POSITION,
+                                     TGSI_SEMANTIC_GENERIC,
                                      0,
                                      TGSI_INTERPOLATE_PERSPECTIVE);
-   }
-   if (is_fill) {
+   } else if (is_fill) {
       if (is_solid)
          src_input = ureg_DECL_fs_input(ureg,
                                         TGSI_SEMANTIC_COLOR,
@@ -330,12 +519,15 @@ create_fs(struct pipe_context *pipe,
                                         TGSI_SEMANTIC_POSITION,
                                         0,
                                         TGSI_INTERPOLATE_PERSPECTIVE);
+   } else {
+      debug_assert(is_yuv);
+      return create_yuv_shader(pipe, ureg);
    }
 
    if (has_mask) {
       mask_sampler = ureg_DECL_sampler(ureg, 1);
       mask_pos = ureg_DECL_fs_input(ureg,
-                                    TGSI_SEMANTIC_POSITION,
+                                    TGSI_SEMANTIC_GENERIC,
                                     1,
                                     TGSI_INTERPOLATE_PERSPECTIVE);
    }
@@ -348,16 +540,17 @@ create_fs(struct pipe_context *pipe,
                                 TGSI_INTERPOLATE_PERSPECTIVE);
 #endif
 
+
    if (is_composite) {
-      if (has_mask)
+      if (has_mask || src_luminance)
          src = ureg_DECL_temporary(ureg);
       else
          src = out;
-      ureg_TEX(ureg, src,
-               TGSI_TEXTURE_2D, src_input, src_sampler);
+      xrender_tex(ureg, src, src_input, src_sampler, imm0,
+                  src_repeat_none, src_swizzle, src_set_alpha);
    } else if (is_fill) {
       if (is_solid) {
-         if (has_mask)
+         if (has_mask || src_luminance)
             src = ureg_dst(src_input);
          else
             ureg_MOV(ureg, out, src_input);
@@ -365,16 +558,16 @@ create_fs(struct pipe_context *pipe,
          struct ureg_src coords, const0124,
             matrow0, matrow1, matrow2;
 
-         if (has_mask)
+         if (has_mask || src_luminance)
             src = ureg_DECL_temporary(ureg);
          else
             src = out;
 
-         coords = ureg_DECL_constant(ureg);
-         const0124 = ureg_DECL_constant(ureg);
-         matrow0 = ureg_DECL_constant(ureg);
-         matrow1 = ureg_DECL_constant(ureg);
-         matrow2 = ureg_DECL_constant(ureg);
+         coords = ureg_DECL_constant(ureg, 0);
+         const0124 = ureg_DECL_constant(ureg, 1);
+         matrow0 = ureg_DECL_constant(ureg, 2);
+         matrow1 = ureg_DECL_constant(ureg, 3);
+         matrow2 = ureg_DECL_constant(ureg, 4);
 
          if (is_lingrad) {
             linear_gradient(ureg, src,
@@ -390,13 +583,22 @@ create_fs(struct pipe_context *pipe,
       } else
          debug_assert(!"Unknown fill type!");
    }
+   if (src_luminance) {
+      ureg_MOV(ureg, src,
+               ureg_scalar(ureg_src(src), TGSI_SWIZZLE_X));
+      ureg_MOV(ureg, ureg_writemask(src, TGSI_WRITEMASK_XYZ),
+               ureg_scalar(imm0, TGSI_SWIZZLE_X));
+      if (!has_mask)
+         ureg_MOV(ureg, out, ureg_src(src));
+   }
 
    if (has_mask) {
       mask = ureg_DECL_temporary(ureg);
-      ureg_TEX(ureg, mask,
-               TGSI_TEXTURE_2D, mask_pos, mask_sampler);
+      xrender_tex(ureg, mask, mask_pos, mask_sampler, imm0,
+                  mask_repeat_none, mask_swizzle, mask_set_alpha);
       /* src IN mask */
-      src_in_mask(ureg, out, ureg_src(src), ureg_src(mask));
+      src_in_mask(ureg, out, ureg_src(src), ureg_src(mask),
+                  comp_alpha_mask, mask_luminance);
       ureg_release_temporary(ureg, mask);
    }
 
@@ -405,11 +607,11 @@ create_fs(struct pipe_context *pipe,
    return ureg_create_shader_and_destroy(ureg, pipe);
 }
 
-struct xorg_shaders * xorg_shaders_create(struct exa_context *exa)
+struct xorg_shaders * xorg_shaders_create(struct xorg_renderer *r)
 {
    struct xorg_shaders *sc = CALLOC_STRUCT(xorg_shaders);
 
-   sc->exa = exa;
+   sc->r = r;
    sc->vs_hash = cso_hash_create();
    sc->fs_hash = cso_hash_create();
 
@@ -436,9 +638,9 @@ cache_destroy(struct cso_context *cso,
 
 void xorg_shaders_destroy(struct xorg_shaders *sc)
 {
-   cache_destroy(sc->exa->cso, sc->vs_hash,
+   cache_destroy(sc->r->cso, sc->vs_hash,
                  PIPE_SHADER_VERTEX);
-   cache_destroy(sc->exa->cso, sc->fs_hash,
+   cache_destroy(sc->r->cso, sc->fs_hash,
                  PIPE_SHADER_FRAGMENT);
 
    free(sc);
@@ -470,12 +672,12 @@ struct xorg_shader xorg_shaders_get(struct xorg_shaders *sc,
                                     unsigned vs_traits,
                                     unsigned fs_traits)
 {
-   struct xorg_shader shader = {0};
+   struct xorg_shader shader = { NULL, NULL };
    void *vs, *fs;
 
-   vs = shader_from_cache(sc->exa->ctx, PIPE_SHADER_VERTEX,
+   vs = shader_from_cache(sc->r->pipe, PIPE_SHADER_VERTEX,
                           sc->vs_hash, vs_traits);
-   fs = shader_from_cache(sc->exa->ctx, PIPE_SHADER_FRAGMENT,
+   fs = shader_from_cache(sc->r->pipe, PIPE_SHADER_FRAGMENT,
                           sc->fs_hash, fs_traits);
 
    debug_assert(vs && fs);
diff --git a/src/gallium/state_trackers/xorg/xorg_exa_tgsi.h b/src/gallium/state_trackers/xorg/xorg_exa_tgsi.h
index 1535a0c8c30..6f2a361d030 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa_tgsi.h
+++ b/src/gallium/state_trackers/xorg/xorg_exa_tgsi.h
@@ -1,7 +1,7 @@
 #ifndef XORG_EXA_TGSI_H
 #define XORG_EXA_TGSI_H
 
-#include "xorg_exa.h"
+#include "xorg_renderer.h"
 
 enum xorg_vs_traits {
    VS_COMPOSITE        = 1 << 0,
@@ -9,10 +9,12 @@ enum xorg_vs_traits {
    VS_SOLID_FILL       = 1 << 2,
    VS_LINGRAD_FILL     = 1 << 3,
    VS_RADGRAD_FILL     = 1 << 4,
+   VS_YUV              = 1 << 5,
+
+
    VS_FILL             = (VS_SOLID_FILL |
                           VS_LINGRAD_FILL |
                           VS_RADGRAD_FILL)
-   /*VS_TRANSFORM      = 1 << 5*/
 };
 
 enum xorg_fs_traits {
@@ -21,9 +23,23 @@ enum xorg_fs_traits {
    FS_SOLID_FILL       = 1 << 2,
    FS_LINGRAD_FILL     = 1 << 3,
    FS_RADGRAD_FILL     = 1 << 4,
+   FS_CA_FULL          = 1 << 5, /* src.rgba * mask.rgba */
+   FS_CA_SRCALPHA      = 1 << 6, /* src.aaaa * mask.rgba */
+   FS_YUV              = 1 << 7,
+   FS_SRC_REPEAT_NONE  = 1 << 8,
+   FS_MASK_REPEAT_NONE = 1 << 9,
+   FS_SRC_SWIZZLE_RGB  = 1 << 10,
+   FS_MASK_SWIZZLE_RGB = 1 << 11,
+   FS_SRC_SET_ALPHA    = 1 << 12,
+   FS_MASK_SET_ALPHA   = 1 << 13,
+   FS_SRC_LUMINANCE    = 1 << 14,
+   FS_MASK_LUMINANCE   = 1 << 15,
+
    FS_FILL             = (FS_SOLID_FILL |
                           FS_LINGRAD_FILL |
-                          FS_RADGRAD_FILL)
+                          FS_RADGRAD_FILL),
+   FS_COMPONENT_ALPHA  = (FS_CA_FULL |
+                          FS_CA_SRCALPHA)
 };
 
 struct xorg_shader {
@@ -33,7 +49,7 @@ struct xorg_shader {
 
 struct xorg_shaders;
 
-struct xorg_shaders *xorg_shaders_create(struct exa_context *exa);
+struct xorg_shaders *xorg_shaders_create(struct xorg_renderer *renderer);
 void xorg_shaders_destroy(struct xorg_shaders *shaders);
 
 struct xorg_shader xorg_shaders_get(struct xorg_shaders *shaders,
diff --git a/src/gallium/state_trackers/xorg/xorg_output.c b/src/gallium/state_trackers/xorg/xorg_output.c
index 26f45f8d645..251f331ea7a 100644
--- a/src/gallium/state_trackers/xorg/xorg_output.c
+++ b/src/gallium/state_trackers/xorg/xorg_output.c
@@ -53,73 +53,36 @@
 
 #include "xorg_tracker.h"
 
-static char *connector_enum_list[] = {
+static char *output_enum_list[] = {
     "Unknown",
     "VGA",
-    "DVI-I",
-    "DVI-D",
-    "DVI-A",
+    "DVI",
+    "DVI",
+    "DVI",
     "Composite",
     "SVIDEO",
     "LVDS",
-    "Component",
-    "9-pin DIN",
-    "DisplayPort",
-    "HDMI Type A",
-    "HDMI Type B",
+    "CTV",
+    "DIN",
+    "DP",
+    "HDMI",
+    "HDMI",
 };
 
 static void
-dpms(xf86OutputPtr output, int mode)
-{
-}
-
-static void
-save(xf86OutputPtr output)
-{
-}
-
-static void
-restore(xf86OutputPtr output)
-{
-}
-
-static int
-mode_valid(xf86OutputPtr output, DisplayModePtr pMode)
-{
-    return MODE_OK;
-}
-
-static Bool
-mode_fixup(xf86OutputPtr output, DisplayModePtr mode,
-	   DisplayModePtr adjusted_mode)
-{
-    return TRUE;
-}
-
-static void
-prepare(xf86OutputPtr output)
-{
-    dpms(output, DPMSModeOff);
-}
-
-static void
-mode_set(xf86OutputPtr output, DisplayModePtr mode,
-	 DisplayModePtr adjusted_mode)
+output_create_resources(xf86OutputPtr output)
 {
+#ifdef RANDR_12_INTERFACE
+#endif /* RANDR_12_INTERFACE */
 }
 
 static void
-commit(xf86OutputPtr output)
+output_dpms(xf86OutputPtr output, int mode)
 {
-    dpms(output, DPMSModeOn);
-
-    if (output->scrn->pScreen != NULL)
-	xf86_reload_cursors(output->scrn->pScreen);
 }
 
 static xf86OutputStatus
-detect(xf86OutputPtr output)
+output_detect(xf86OutputPtr output)
 {
     drmModeConnectorPtr drm_connector = output->driver_private;
 
@@ -134,7 +97,7 @@ detect(xf86OutputPtr output)
 }
 
 static DisplayModePtr
-get_modes(xf86OutputPtr output)
+output_get_modes(xf86OutputPtr output)
 {
     drmModeConnectorPtr drm_connector = output->driver_private;
     drmModeModeInfoPtr drm_mode = NULL;
@@ -147,7 +110,6 @@ get_modes(xf86OutputPtr output)
 	    mode = xcalloc(1, sizeof(DisplayModeRec));
 	    if (!mode)
 		continue;
-	    mode->type = 0;
 	    mode->Clock = drm_mode->clock;
 	    mode->HDisplay = drm_mode->hdisplay;
 	    mode->HSyncStart = drm_mode->hsync_start;
@@ -162,6 +124,11 @@ get_modes(xf86OutputPtr output)
 	    mode->VScan = drm_mode->vscan;
 	    mode->VRefresh = xf86ModeVRefresh(mode);
 	    mode->Private = (void *)drm_mode;
+	    mode->type = 0;
+	    if (drm_mode->type & DRM_MODE_TYPE_PREFERRED)
+		mode->type |= M_T_PREFERRED;
+	    if (drm_mode->type & DRM_MODE_TYPE_DRIVER)
+		mode->type |= M_T_DRIVER;
 	    xf86SetModeDefaultName(mode);
 	    modes = xf86ModesAdd(modes, mode);
 	    xf86PrintModeline(0, mode);
@@ -171,22 +138,15 @@ get_modes(xf86OutputPtr output)
     return modes;
 }
 
-static void
-destroy(xf86OutputPtr output)
-{
-    drmModeFreeConnector(output->driver_private);
-}
-
-static void
-create_resources(xf86OutputPtr output)
+static int
+output_mode_valid(xf86OutputPtr output, DisplayModePtr pMode)
 {
-#ifdef RANDR_12_INTERFACE
-#endif /* RANDR_12_INTERFACE */
+    return MODE_OK;
 }
 
 #ifdef RANDR_12_INTERFACE
 static Bool
-set_property(xf86OutputPtr output, Atom property, RRPropertyValuePtr value)
+output_set_property(xf86OutputPtr output, Atom property, RRPropertyValuePtr value)
 {
     return TRUE;
 }
@@ -194,53 +154,43 @@ set_property(xf86OutputPtr output, Atom property, RRPropertyValuePtr value)
 
 #ifdef RANDR_13_INTERFACE
 static Bool
-get_property(xf86OutputPtr output, Atom property)
+output_get_property(xf86OutputPtr output, Atom property)
 {
     return TRUE;
 }
 #endif /* RANDR_13_INTERFACE */
 
-#ifdef RANDR_GET_CRTC_INTERFACE
-static xf86CrtcPtr
-get_crtc(xf86OutputPtr output)
+static void
+output_destroy(xf86OutputPtr output)
 {
-    return NULL;
+    drmModeFreeConnector(output->driver_private);
 }
-#endif
 
 static const xf86OutputFuncsRec output_funcs = {
-    .create_resources = create_resources,
-    .dpms = dpms,
-    .save = save,
-    .restore = restore,
-    .mode_valid = mode_valid,
-    .mode_fixup = mode_fixup,
-    .prepare = prepare,
-    .mode_set = mode_set,
-    .commit = commit,
-    .detect = detect,
-    .get_modes = get_modes,
+    .create_resources = output_create_resources,
 #ifdef RANDR_12_INTERFACE
-    .set_property = set_property,
+    .set_property = output_set_property,
 #endif
 #ifdef RANDR_13_INTERFACE
-    .get_property = get_property,
-#endif
-    .destroy = destroy,
-#ifdef RANDR_GET_CRTC_INTERFACE
-    .get_crtc = get_crtc,
+    .get_property = output_get_property,
 #endif
+    .dpms = output_dpms,
+    .detect = output_detect,
+
+    .get_modes = output_get_modes,
+    .mode_valid = output_mode_valid,
+    .destroy = output_destroy,
 };
 
 void
-output_init(ScrnInfoPtr pScrn)
+xorg_output_init(ScrnInfoPtr pScrn)
 {
     modesettingPtr ms = modesettingPTR(pScrn);
     xf86OutputPtr output;
     drmModeResPtr res;
     drmModeConnectorPtr drm_connector = NULL;
     drmModeEncoderPtr drm_encoder = NULL;
-    char *name;
+    char name[32];
     int c, v, p;
 
     res = drmModeGetResources(ms->fd);
@@ -273,7 +223,10 @@ output_init(ScrnInfoPtr pScrn)
 	(void)v;
 #endif
 
-	name = connector_enum_list[drm_connector->connector_type];
+	snprintf(name, 32, "%s%d",
+		 output_enum_list[drm_connector->connector_type],
+		 drm_connector->connector_type_id);
+
 
 	output = xf86OutputCreate(pScrn, &output_funcs, name);
 	if (!output)
diff --git a/src/gallium/state_trackers/xorg/xorg_renderer.c b/src/gallium/state_trackers/xorg/xorg_renderer.c
new file mode 100644
index 00000000000..cbb84a8c0da
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xorg_renderer.c
@@ -0,0 +1,699 @@
+#include "xorg_exa.h"
+#include "xorg_renderer.h"
+
+#include "xorg_exa_tgsi.h"
+
+#include "cso_cache/cso_context.h"
+#include "util/u_draw_quad.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_rect.h"
+
+#include "pipe/p_inlines.h"
+
+#include <math.h>
+/* Relative float compare (tolerance 1e-5 of the smaller magnitude); arguments are evaluated more than once. */
+#define floatsEqual(x, y) (fabs((x) - (y)) <= 0.00001f * MIN2(fabs(x), fabs(y)))
+#define floatIsZero(x) (floatsEqual((x) + 1, 1))
+
+#define NUM_COMPONENTS 4
+/* Nonzero when elements 2, 5 and 8 are (within tolerance) 0, 0 and 1, the terms map_point() uses for w. */
+static INLINE boolean is_affine(float *matrix)
+{
+   const float *m = matrix;
+   return floatIsZero(m[2]) && floatIsZero(m[5]) && floatsEqual(m[8], 1);
+}
+/* Transform (x, y) by the 3x3 matrix mat into (*out_x, *out_y); a NULL mat copies the point. */
+static INLINE void map_point(float *mat, float x, float y, float *out_x, float *out_y)
+{
+   if (mat) {
+      *out_x = mat[0]*x + mat[3]*y + mat[6];
+      *out_y = mat[1]*x + mat[4]*y + mat[7];
+      /* matrices that are not affine also need the divide by w */
+      if (!is_affine(mat)) {
+         float inv_w = 1/(mat[2]*x + mat[5]*y + mat[8]);
+         *out_x *= inv_w;
+         *out_y *= inv_w;
+      }
+   } else {
+      *out_x = x;
+      *out_y = y;
+   }
+}
+
+/* Hand the floats queued in r->buffer to pipe_user_buffer_create() and reset the queue length. */
+static INLINE struct pipe_buffer *
+renderer_buffer_create(struct xorg_renderer *r)
+{
+   struct pipe_screen *screen = r->pipe->screen;
+   struct pipe_buffer *vbuf;
+
+   vbuf = pipe_user_buffer_create(screen, r->buffer,
+                                  sizeof(float) * r->buffer_size);
+   r->buffer_size = 0;
+   return vbuf;
+}
+
+/* Draw the queued vertices as quads and empty the queue; does nothing when the queue is empty. */
+static INLINE void
+renderer_draw(struct xorg_renderer *r)
+{
+   struct pipe_context *pipe = r->pipe;
+   struct pipe_buffer *buf = 0;
+   int num_verts;
+
+   if (!r->buffer_size)
+      return;
+
+   /* Divide only once there is data (attrs_per_vertex may still be 0
+    * before that), and before renderer_buffer_create() zeroes buffer_size. */
+   num_verts = r->buffer_size/(r->attrs_per_vertex * NUM_COMPONENTS);
+   buf = renderer_buffer_create(r);
+   if (buf) {
+      util_draw_vertex_buffer(pipe, buf, 0, PIPE_PRIM_QUADS,
+                              num_verts,  /* verts */
+                              r->attrs_per_vertex); /* attribs/vert */
+      pipe_buffer_reference(&buf, NULL);
+   }
+}
+
+/* Draw when next_batch more entries would reach BUF_SIZE, or when next_batch is 0 and anything is queued. */
+static INLINE void
+renderer_draw_conditional(struct xorg_renderer *r, int next_batch)
+{
+   boolean would_fill = r->buffer_size + next_batch >= BUF_SIZE;
+   boolean flush_pending = next_batch == 0 && r->buffer_size;
+   if (would_fill || flush_pending)
+      renderer_draw(r);
+}
+
+/* Bind the initial depth/stencil/alpha and rasterizer state; called
+ * once from renderer_create(). */
+static void
+renderer_init_state(struct xorg_renderer *r)
+{
+   struct pipe_rasterizer_state rast_state;
+   struct pipe_depth_stencil_alpha_state dsa_state;
+
+   /* all-zero depth/stencil/alpha state */
+   memset(&dsa_state, 0, sizeof dsa_state);
+   cso_set_depth_stencil_alpha(r->cso, &dsa_state);
+
+   /* all-zero rasterizer state except for GL rasterization rules */
+   memset(&rast_state, 0, sizeof rast_state);
+   rast_state.gl_rasterization_rules = 1;
+   cso_set_rasterizer(r->cso, &rast_state);
+}
+
+
+/* Queue one vertex: position (x, y, 0, 1) followed by the four color components. */
+static INLINE void
+add_vertex_color(struct xorg_renderer *r,
+                 float x, float y,
+                 float color[4])
+{
+   float *out = &r->buffer[r->buffer_size];
+   int c;
+
+   /* attribute 0: position */
+   *out++ = x;
+   *out++ = y;
+   *out++ = 0.f; /*z*/
+   *out++ = 1.f; /*w*/
+   /* attribute 1: r, g, b, a */
+   for (c = 0; c < 4; c++)
+      *out++ = color[c];
+   r->buffer_size += 8;
+}
+
+/* Queue one vertex: position (x, y, 0, 1) followed by texcoord (s, t, 0, 1). */
+static INLINE void
+add_vertex_1tex(struct xorg_renderer *r,
+                float x, float y, float s, float t)
+{
+   float *pos = r->buffer + r->buffer_size;
+   float *tex = pos + 4;
+
+   pos[0] = x;
+   pos[1] = y;
+   pos[2] = 0.f; /*z*/
+   pos[3] = 1.f; /*w*/
+   tex[0] = s;   /*s*/
+   tex[1] = t;   /*t*/
+   tex[2] = 0.f; /*r*/
+   tex[3] = 1.f; /*q*/
+   r->buffer_size += 8;
+}
+
+/* Queue a quad (4 vertices, each a position plus one texcoord).
+ *
+ * r            renderer whose vertex queue is appended to
+ * srcX, srcY   corner of the rectangle read from src
+ * dstX, dstY   corner of the rectangle covered in the destination
+ * width/height size shared by both rectangles
+ * src          texture; its width[0]/height[0] divide the mapped
+ *              source corners to form the texcoords
+ * src_matrix   optional matrix applied to every source corner via
+ *              map_point(); may be NULL
+ */
+static void
+add_vertex_data1(struct xorg_renderer *r,
+                 float srcX, float srcY,  float dstX, float dstY,
+                 float width, float height,
+                 struct pipe_texture *src, float *src_matrix)
+{
+   /* corner order: (x0,y0), (x1,y0), (x1,y1), (x0,y1) */
+   float src_x[4], src_y[4], dst_x[4], dst_y[4];
+   int i;
+
+   src_x[0] = src_x[3] = srcX;
+   src_x[1] = src_x[2] = srcX + width;
+   src_y[0] = src_y[1] = srcY;
+   src_y[2] = src_y[3] = srcY + height;
+
+   dst_x[0] = dst_x[3] = dstX;
+   dst_x[1] = dst_x[2] = dstX + width;
+   dst_y[0] = dst_y[1] = dstY;
+   dst_y[2] = dst_y[3] = dstY + height;
+
+   for (i = 0; i < 4; i++) {
+      float px = src_x[i];
+      float py = src_y[i];
+
+      if (src_matrix)
+         map_point(src_matrix, src_x[i], src_y[i], &px, &py);
+
+      /* divide by the texture's width[0]/height[0] */
+      add_vertex_1tex(r, dst_x[i], dst_y[i],
+                      px / src->width[0], py / src->height[0]);
+   }
+}
+
+
+/* Queue one vertex with two texcoords.
+ *
+ * Twelve floats are appended to r->buffer: the position (x, y, 0, 1),
+ * then (s0, t0, 0, 1) and (s1, t1, 0, 1), and r->buffer_size grows
+ * by twelve.
+ */
+static INLINE void
+add_vertex_2tex(struct xorg_renderer *r,
+                float x, float y,
+                float s0, float t0, float s1, float t1)
+{
+   const float attribs[12] = {
+      x,  y,  0.f, 1.f,   /* position: x y z w */
+      s0, t0, 0.f, 1.f,   /* texcoord 0: s t r q */
+      s1, t1, 0.f, 1.f    /* texcoord 1: s t r q */
+   };
+   float *vertex = r->buffer + r->buffer_size;
+   int i;
+
+   for (i = 0; i < 12; i++)
+      vertex[i] = attribs[i];
+
+   r->buffer_size += 12;
+}
+
+/* Queue a quad (4 vertices: position, src texcoord, mask texcoord).
+ *
+ * The width x height rectangle at (dstX, dstY) is paired with rectangles
+ * of the same size at (srcX, srcY) in src and (maskX, maskY) in mask.
+ * Each of the four corners is passed through src_matrix / mask_matrix
+ * on its own, as add_vertex_data1() does: deriving all texcoords from
+ * just two opposite corners is only correct for transforms without
+ * rotation or shear.  map_point() copies the point for a NULL matrix.
+ * Texcoords are the mapped points divided by the texture's
+ * width[0]/height[0].
+ */
+static void
+add_vertex_data2(struct xorg_renderer *r,
+                 float srcX, float srcY, float maskX, float maskY,
+                 float dstX, float dstY, float width, float height,
+                 struct pipe_texture *src,
+                 struct pipe_texture *mask,
+                 float *src_matrix, float *mask_matrix)
+{
+   /* corner order: (x0,y0), (x1,y0), (x1,y1), (x0,y1) */
+   float src_x[4], src_y[4], mask_x[4], mask_y[4], dst_x[4], dst_y[4];
+   int i;
+
+   src_x[0] = src_x[3] = srcX;
+   src_x[1] = src_x[2] = srcX + width;
+   src_y[0] = src_y[1] = srcY;
+   src_y[2] = src_y[3] = srcY + height;
+
+   mask_x[0] = mask_x[3] = maskX;
+   mask_x[1] = mask_x[2] = maskX + width;
+   mask_y[0] = mask_y[1] = maskY;
+   mask_y[2] = mask_y[3] = maskY + height;
+
+   dst_x[0] = dst_x[3] = dstX;
+   dst_x[1] = dst_x[2] = dstX + width;
+   dst_y[0] = dst_y[1] = dstY;
+   dst_y[2] = dst_y[3] = dstY + height;
+
+   for (i = 0; i < 4; i++) {
+      float src_pt[2], mask_pt[2];
+      float src_s, src_t, mask_s, mask_t;
+
+      /* transform this corner for both textures */
+      map_point(src_matrix, src_x[i], src_y[i], &src_pt[0], &src_pt[1]);
+      map_point(mask_matrix, mask_x[i], mask_y[i], &mask_pt[0], &mask_pt[1]);
+
+      /* divide by each texture's width[0]/height[0] */
+      src_s = src_pt[0] / src->width[0];
+      src_t = src_pt[1] / src->height[0];
+      mask_s = mask_pt[0] / mask->width[0];
+      mask_t = mask_pt[1] / mask->height[0];
+
+      add_vertex_2tex(r, dst_x[i], dst_y[i],
+                      src_s, src_t, mask_s, mask_t);
+   }
+}
+
+/* Queue one quad that maps the srcW x srcH rectangle at (srcX, srcY)
+ * onto the dstW x dstH rectangle at (dstX, dstY), and return the
+ * queued vertices wrapped by renderer_buffer_create().
+ *
+ * Texcoords are relative to tex[0]'s width[0]/height[0]; no other
+ * entry of tex is read here.
+ */
+static struct pipe_buffer *
+setup_vertex_data_yuv(struct xorg_renderer *r,
+                      float srcX, float srcY, float srcW, float srcH,
+                      float dstX, float dstY, float dstW, float dstH,
+                      struct pipe_texture **tex)
+{
+   struct pipe_texture *tex0 = tex[0];
+   const float src_x1 = srcX + srcW;
+   const float src_y1 = srcY + srcH;
+   const float dst_x1 = dstX + dstW;
+   const float dst_y1 = dstY + dstH;
+
+   /* texcoords of the two opposite source corners */
+   const float u0 = srcX / tex0->width[0];
+   const float v0 = srcY / tex0->height[0];
+   const float u1 = src_x1 / tex0->width[0];
+   const float v1 = src_y1 / tex0->height[0];
+
+   /* corner order: (x0,y0), (x1,y0), (x1,y1), (x0,y1) */
+   add_vertex_1tex(r, dstX, dstY, u0, v0);     /* 1st vertex */
+   add_vertex_1tex(r, dst_x1, dstY, u1, v0);   /* 2nd vertex */
+   add_vertex_1tex(r, dst_x1, dst_y1, u1, v1); /* 3rd vertex */
+   add_vertex_1tex(r, dstX, dst_y1, u0, v1);   /* 4th vertex */
+
+   return renderer_buffer_create(r);
+}
+
+
+
+/* Set up framebuffer, viewport and vertex shader constant buffer
+ * state for a particular destination surface.  In all our rendering,
+ * these concepts are linked.
+ */
+void renderer_bind_destination(struct xorg_renderer *r,
+                               struct pipe_surface *surface,
+                               int width,
+                               int height )
+{
+   const float half_w = width / 2.f;
+   const float half_h = height / 2.f;
+   struct pipe_viewport_state vp;
+   struct pipe_framebuffer_state fb_state;
+
+   /* The framebuffer takes its size from the surface itself; the
+    * surface is its only color buffer and there is no zsbuf.
+    */
+   memset(&fb_state, 0, sizeof fb_state);
+   fb_state.nr_cbufs = 1;
+   fb_state.cbufs[0] = surface;
+   fb_state.zsbuf = 0;
+   fb_state.width  = surface->width;
+   fb_state.height = surface->height;
+
+   /* The viewport is sized by the width/height arguments instead, so
+    * it only touches the area the caller is interested in. */
+   vp.scale[0] = half_w;
+   vp.scale[1] = half_h;
+   vp.scale[2] = 1.0;
+   vp.scale[3] = 1.0;
+   vp.translate[0] = half_w;
+   vp.translate[1] = half_h;
+   vp.translate[2] = 0.0;
+   vp.translate[3] = 0.0;
+
+   /* Re-upload the vertex shader constants only when the size differs
+    * from the one recorded in r->fb_width / r->fb_height.
+    */
+   if (r->fb_width != width || r->fb_height != height) {
+      float vs_consts[8] = {
+         2.f/width, 2.f/height, 1, 1,
+         -1, -1, 0, 0
+      };
+
+      r->fb_width = width;
+      r->fb_height = height;
+      renderer_set_constants(r, PIPE_SHADER_VERTEX,
+                             vs_consts, sizeof vs_consts);
+   }
+
+   cso_set_framebuffer(r->cso, &fb_state);
+   cso_set_viewport(r->cso, &vp);
+}
+
+
+/* Allocate a renderer for pipe with its own cso context and shader cache; returns NULL if allocation fails. */
+struct xorg_renderer * renderer_create(struct pipe_context *pipe)
+{
+   struct xorg_renderer *renderer = CALLOC_STRUCT(xorg_renderer);
+   if (!renderer)
+      return NULL;
+   renderer->pipe = pipe;
+   renderer->cso = cso_create_context(pipe);
+   renderer->shaders = xorg_shaders_create(renderer);
+   renderer_init_state(renderer);
+   return renderer;
+}
+
+/* Drop the constant buffers, the shader cache and the cso context.
+ * NOTE(review): r itself is not freed here - confirm the caller owns it.
+ */
+void renderer_destroy(struct xorg_renderer *r)
+{
+   /* the constant buffer structs live inside r; only their buffers are released */
+   if (r->vs_const_buffer.buffer)
+      pipe_buffer_reference(&r->vs_const_buffer.buffer, NULL);
+   if (r->fs_const_buffer.buffer)
+      pipe_buffer_reference(&r->fs_const_buffer.buffer, NULL);
+
+   if (r->shaders != NULL) {
+      xorg_shaders_destroy(r->shaders);
+      r->shaders = NULL;
+   }
+
+   if (r->cso != NULL) {
+      cso_release_all(r->cso);
+      cso_destroy_context(r->cso);
+      r->cso = NULL;
+   }
+}
+
+
+
+
+/* Replace the vertex or fragment constant buffer with a new one holding param_bytes bytes of params, and bind it. */
+void renderer_set_constants(struct xorg_renderer *r,
+                            int shader_type,
+                            const float *params,
+                            int param_bytes)
+{
+   struct pipe_context *pipe = r->pipe;
+   struct pipe_constant_buffer *cbuf;
+
+   if (shader_type == PIPE_SHADER_VERTEX)
+      cbuf = &r->vs_const_buffer;
+   else   /* every other shader type uses the fragment slot */
+      cbuf = &r->fs_const_buffer;
+   /* drop the previous buffer before creating its replacement */
+   pipe_buffer_reference(&cbuf->buffer, NULL);
+   cbuf->buffer = pipe_buffer_create(pipe->screen, 16,
+                                     PIPE_BUFFER_USAGE_CONSTANT, param_bytes);
+   if (cbuf->buffer)
+      pipe_buffer_write(pipe->screen, cbuf->buffer, 0, param_bytes, params);
+   pipe->set_constant_buffer(pipe, shader_type, 0, cbuf);
+}
+
+/* Bind the blend, sampler, destination, texture and shader state used to copy src_texture onto dst_surface. */
+void renderer_copy_prepare(struct xorg_renderer *r,
+                           struct pipe_surface *dst_surface,
+                           struct pipe_texture *src_texture)
+{
+   struct pipe_context *pipe = r->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   struct xorg_shader shader;
+
+   assert(screen->is_format_supported(screen, dst_surface->format,
+                                      PIPE_TEXTURE_2D,
+                                      PIPE_TEXTURE_USAGE_RENDER_TARGET,
+                                      0));
+
+
+   /* blend: ONE/ZERO factors, so the source replaces the destination; colormask is RGBA */
+   {
+      struct pipe_blend_state blend;
+      memset(&blend, 0, sizeof(blend));
+      blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE;
+      blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE;
+      blend.rgb_dst_factor = PIPE_BLENDFACTOR_ZERO;
+      blend.alpha_dst_factor = PIPE_BLENDFACTOR_ZERO;
+      blend.colormask = PIPE_MASK_RGBA;
+      cso_set_blend(r->cso, &blend);
+   }
+
+   /* sampler 0: clamp-to-edge wrapping, nearest filtering, normalized coordinates */
+   {
+      struct pipe_sampler_state sampler;
+      memset(&sampler, 0, sizeof(sampler));
+      sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+      sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+      sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
+      sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+      sampler.normalized_coords = 1;
+      cso_single_sampler(r->cso, 0, &sampler);
+      cso_single_sampler_done(r->cso);
+   }
+
+   renderer_bind_destination(r, dst_surface, 
+                             dst_surface->width,
+                             dst_surface->height);
+
+   /* the source is the only sampler texture */
+   cso_set_sampler_textures(r->cso, 1, &src_texture);
+
+   /* VS_COMPOSITE / FS_COMPOSITE shader pair obtained through xorg_shaders_get() */
+   shader = xorg_shaders_get(r->shaders,
+                             VS_COMPOSITE,
+                             FS_COMPOSITE);
+   cso_set_vertex_shader_handle(r->cso, shader.vs);
+   cso_set_fragment_shader_handle(r->cso, shader.fs);
+
+   r->buffer_size = 0;        /* start with an empty vertex queue */
+   r->attrs_per_vertex = 2;   /* two attributes per vertex */
+}
+
+struct pipe_texture *
+renderer_clone_texture(struct xorg_renderer *r,
+                       struct pipe_texture *src)
+{
+   enum pipe_format format;
+   struct pipe_context *pipe = r->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_texture *pt;
+   struct pipe_texture templ;
+   /* Returns a new texture holding a copy of src's first level, or NULL. */
+   if (pipe->is_texture_referenced(pipe, src, 0, 0) &
+       PIPE_REFERENCED_FOR_WRITE)
+      pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, NULL);
+
+   /* the incoming texture should already satisfy this invariant */
+   debug_assert(screen->is_format_supported(screen, src->format,
+                                            PIPE_TEXTURE_2D,
+                                            PIPE_TEXTURE_USAGE_SAMPLER, 0));
+
+   format = src->format;
+
+   memset(&templ, 0, sizeof(templ));
+   templ.target = PIPE_TEXTURE_2D;
+   templ.format = format;
+   templ.last_level = 0;
+   templ.width[0] = src->width[0];
+   templ.height[0] = src->height[0];
+   templ.depth[0] = 1;
+   pf_get_block(format, &templ.block);
+   templ.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER;
+
+   pt = screen->texture_create(screen, &templ);
+
+   debug_assert(!pt || pipe_is_referenced(&pt->reference));
+
+   if (!pt)
+      return NULL;
+   /* NOTE(review): neither surface below is checked for NULL before use. */
+   {
+      /* copy source framebuffer surface into texture */
+      struct pipe_surface *ps_read = screen->get_tex_surface(
+         screen, src, 0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ);
+      struct pipe_surface *ps_tex = screen->get_tex_surface(
+         screen, pt, 0, 0, 0, PIPE_BUFFER_USAGE_GPU_WRITE );
+      if (pipe->surface_copy) {
+         pipe->surface_copy(pipe,
+                ps_tex, /* dest */
+                0, 0, /* destx/y */
+                ps_read,
+                0, 0, src->width[0], src->height[0]);
+      } else {
+          util_surface_copy(pipe, FALSE,
+                ps_tex, /* dest */
+                0, 0, /* destx/y */
+                ps_read,
+                0, 0, src->width[0], src->height[0]);
+      }
+      pipe_surface_reference(&ps_read, NULL);
+      pipe_surface_reference(&ps_tex, NULL);
+   }
+
+   return pt;
+}
+
+
+void renderer_copy_pixmap(struct xorg_renderer *r,
+                          int dx, int dy,
+                          int sx, int sy,
+                          int width, int height,
+                          float src_width,
+                          float src_height)
+{
+   /* Normalized texture coordinates of the source rectangle.
+    *
+    * XXX: could put the texcoord scaling calculation into the vertex
+    * shader.
+    */
+   const float s0 = sx            / src_width;
+   const float s1 = (sx + width)  / src_width;
+   const float t0 = sy            / src_height;
+   const float t1 = (sy + height) / src_height;
+
+   /* Destination rectangle corners. */
+   const float x0 = dx;
+   const float x1 = dx + width;
+   const float y0 = dy;
+   const float y1 = dy + height;
+
+   /* draw quad: 4*8 is presumably 4 vertices x 8 floats - confirm; the
+    * corners are added as (x0,y0), (x1,y0), (x1,y1), (x0,y1) */
+   renderer_draw_conditional(r, 4*8);
+   add_vertex_1tex(r, x0, y0, s0, t0);
+   add_vertex_1tex(r, x1, y0, s1, t0);
+   add_vertex_1tex(r, x1, y1, s1, t1);
+   add_vertex_1tex(r, x0, y1, s0, t1);
+}
+
+
+
+
+void renderer_draw_yuv(struct xorg_renderer *r,
+                       int src_x, int src_y, int src_w, int src_h,
+                       int dst_x, int dst_y, int dst_w, int dst_h,
+                       struct pipe_texture **textures)
+{
+   /* Draws one quad from the vertex data built for the given rectangles;
+    * nothing is drawn when no vertex buffer could be set up. */
+   const int num_attribs = 2; /*pos + tex coord*/
+   struct pipe_buffer *vbuf;
+
+   vbuf = setup_vertex_data_yuv(r,
+                                src_x, src_y, src_w, src_h,
+                                dst_x, dst_y, dst_w, dst_h,
+                                textures);
+   if (!vbuf)
+      return;
+
+   util_draw_vertex_buffer(r->pipe, vbuf, 0,
+                           PIPE_PRIM_QUADS,
+                           4,  /* verts */
+                           num_attribs); /* attribs/vert */
+   /* drop our reference to the vertex buffer */
+   pipe_buffer_reference(&vbuf, NULL);
+}
+
+void renderer_begin_solid(struct xorg_renderer *r)
+{
+   r->buffer_size = 0;       /* start with an empty vertex buffer */
+   r->attrs_per_vertex = 2;  /* presumably position + color - confirm */
+}
+
+void renderer_solid(struct xorg_renderer *r,
+                    int x0, int y0,
+                    int x1, int y1,
+                    float *color)
+{
+   /*
+   debug_printf("solid rect[(%d, %d), (%d, %d)], rgba[%f, %f, %f, %f]\n",
+   x0, y0, x1, y1, color[0], color[1], color[2], color[3]);*/
+   /* Add the rectangle (x0, y0)-(x1, y1) as four vertices sharing 'color'. */
+   renderer_draw_conditional(r, 4 * 8);
+
+   /* 1st vertex */
+   add_vertex_color(r, x0, y0, color);
+   /* 2nd vertex */
+   add_vertex_color(r, x1, y0, color);
+   /* 3rd vertex */
+   add_vertex_color(r, x1, y1, color);
+   /* 4th vertex */
+   add_vertex_color(r, x0, y1, color);
+}
+
+void renderer_draw_flush(struct xorg_renderer *r)
+{
+   renderer_draw_conditional(r, 0); /* presumably draws what is queued - confirm */
+}
+
+void renderer_begin_textures(struct xorg_renderer *r,
+                             struct pipe_texture **textures,
+                             int num_textures)
+{
+   r->attrs_per_vertex = 1 + num_textures; /* one attribute besides the textures' */
+   r->buffer_size = 0; /* start with an empty vertex buffer; 'textures' is unused */
+}
+
+void renderer_texture(struct xorg_renderer *r,
+                      int *pos,
+                      int width, int height,
+                      struct pipe_texture **textures,
+                      int num_textures,
+                      float *src_matrix,
+                      float *mask_matrix)
+{
+   /* Add one quad: source only (2 attrs/vertex) or source + mask (3). */
+#if 0
+   if (src_matrix) {
+      debug_printf("src_matrix = \n");
+      debug_printf("%f, %f, %f\n", src_matrix[0], src_matrix[1], src_matrix[2]);
+      debug_printf("%f, %f, %f\n", src_matrix[3], src_matrix[4], src_matrix[5]);
+      debug_printf("%f, %f, %f\n", src_matrix[6], src_matrix[7], src_matrix[8]);
+   }
+   if (mask_matrix) {
+      debug_printf("mask_matrix = \n");
+      debug_printf("%f, %f, %f\n", mask_matrix[0], mask_matrix[1], mask_matrix[2]);
+      debug_printf("%f, %f, %f\n", mask_matrix[3], mask_matrix[4], mask_matrix[5]);
+      debug_printf("%f, %f, %f\n", mask_matrix[6], mask_matrix[7], mask_matrix[8]);
+   }
+#endif
+
+   switch(r->attrs_per_vertex) { /* set by renderer_begin_textures() */
+   case 2:
+      renderer_draw_conditional(r, 4 * 8);
+      add_vertex_data1(r,
+                       pos[0], pos[1], /* src */
+                       pos[4], pos[5], /* dst */
+                       width, height,
+                       textures[0], src_matrix);
+      break;
+   case 3:
+      renderer_draw_conditional(r, 4 * 12);
+      add_vertex_data2(r,
+                       pos[0], pos[1], /* src */
+                       pos[2], pos[3], /* mask */
+                       pos[4], pos[5], /* dst */
+                       width, height,
+                       textures[0], textures[1],
+                       src_matrix, mask_matrix);
+      break;
+   default:
+      debug_assert(!"Unsupported number of textures");
+      break;
+   }
+}
diff --git a/src/gallium/state_trackers/xorg/xorg_renderer.h b/src/gallium/state_trackers/xorg/xorg_renderer.h
new file mode 100644
index 00000000000..5272cde2b3f
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xorg_renderer.h
@@ -0,0 +1,95 @@
+#ifndef XORG_RENDERER_H
+#define XORG_RENDERER_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+struct xorg_shaders;
+struct exa_pixmap_priv;
+
+/* max number of vertices *
+ * max number of attributes per vertex *
+ * max number of components per attribute
+ *
+ * currently the max is 100 quads
+ */
+#define BUF_SIZE (100 * 4 * 3 * 4)
+
+struct xorg_renderer {
+   struct pipe_context *pipe;     /* context given to renderer_create() */
+
+   struct cso_context *cso;       /* created from 'pipe' in renderer_create() */
+   struct xorg_shaders *shaders;  /* source of the vs/fs handles that get bound */
+
+   int fb_width;
+   int fb_height;
+   struct pipe_constant_buffer vs_const_buffer; /* see renderer_set_constants() */
+   struct pipe_constant_buffer fs_const_buffer; /* see renderer_set_constants() */
+
+   float buffer[BUF_SIZE];        /* vertex data storage */
+   int buffer_size;               /* reset to 0 when a draw operation begins */
+
+   /* number of attributes per vertex for the current
+    * draw operation */
+   int attrs_per_vertex;
+};
+
+struct xorg_renderer *renderer_create(struct pipe_context *pipe);
+void renderer_destroy(struct xorg_renderer *renderer);
+
+void renderer_bind_destination(struct xorg_renderer *r,
+                               struct pipe_surface *surface,
+                               int width,
+                               int height );
+
+void renderer_bind_framebuffer(struct xorg_renderer *r,
+                               struct exa_pixmap_priv *priv);
+void renderer_bind_viewport(struct xorg_renderer *r,
+                            struct exa_pixmap_priv *dst);
+void renderer_set_constants(struct xorg_renderer *r,
+                            int shader_type,
+                            const float *buffer,
+                            int size);
+
+
+void renderer_draw_yuv(struct xorg_renderer *r,
+                       int src_x, int src_y, int src_w, int src_h,
+                       int dst_x, int dst_y, int dst_w, int dst_h,
+                       struct pipe_texture **textures);
+
+void renderer_begin_solid(struct xorg_renderer *r);
+void renderer_solid(struct xorg_renderer *r,
+                    int x0, int y0,
+                    int x1, int y1,
+                    float *color);
+
+void renderer_begin_textures(struct xorg_renderer *r,
+                             struct pipe_texture **textures,
+                             int num_textures);
+void renderer_texture(struct xorg_renderer *r,
+                      int *pos,
+                      int width, int height,
+                      struct pipe_texture **textures,
+                      int num_textures,
+                      float *src_matrix,
+                      float *mask_matrix);
+
+void renderer_draw_flush(struct xorg_renderer *r);
+
+struct pipe_texture *
+renderer_clone_texture(struct xorg_renderer *r,
+                       struct pipe_texture *src);
+
+void renderer_copy_prepare(struct xorg_renderer *r,
+                           struct pipe_surface *dst_surface,
+                           struct pipe_texture *src_texture);
+
+void renderer_copy_pixmap(struct xorg_renderer *r,
+                          int dx, int dy,
+                          int sx, int sy,
+                          int width, int height,
+                          float src_width,
+                          float src_height);
+
+
+#endif
diff --git a/src/gallium/state_trackers/xorg/xorg_tracker.h b/src/gallium/state_trackers/xorg/xorg_tracker.h
index b1ab783a15a..d5fc18448ef 100644
--- a/src/gallium/state_trackers/xorg/xorg_tracker.h
+++ b/src/gallium/state_trackers/xorg/xorg_tracker.h
@@ -51,6 +51,10 @@
 
 #define DRV_ERROR(msg)	xf86DrvMsg(pScrn->scrnIndex, X_ERROR, msg);
 
+struct kms_bo;
+struct kms_driver;
+struct exa_context;
+
 typedef struct
 {
     int lastInstance;
@@ -59,6 +63,8 @@ typedef struct
     ScrnInfoPtr pScrn_2;
 } EntRec, *EntPtr;
 
+#define XORG_NR_FENCES 3
+
 typedef struct _modesettingRec
 {
     /* drm */
@@ -82,18 +88,36 @@ typedef struct _modesettingRec
     unsigned int SaveGeneration;
 
     void (*blockHandler)(int, pointer, pointer, pointer);
+    struct pipe_fence_handle *fence[XORG_NR_FENCES];
+
     CreateScreenResourcesProcPtr createScreenResources;
 
+    /* for frontbuffer backing store */
+    Bool (*destroy_front_buffer)(ScrnInfoPtr pScrn);
+    Bool (*create_front_buffer)(ScrnInfoPtr pScrn);
+    Bool (*bind_front_buffer)(ScrnInfoPtr pScrn);
+
+    /* kms */
+    struct kms_driver *kms;
+    struct kms_bo *root_bo;
+
     /* gallium */
     struct drm_api *api;
     struct pipe_screen *screen;
     struct pipe_context *ctx;
     boolean d_depth_bits_last;
     boolean ds_depth_bits_last;
+    struct pipe_texture *root_texture;
 
     /* exa */
-    void *exa;
+    struct exa_context *exa;
     Bool noEvict;
+    Bool debug_fallback;
+
+    /* winsys hooks */
+    Bool (*winsys_screen_init)(ScrnInfoPtr pScr);
+    Bool (*winsys_screen_close)(ScrnInfoPtr pScr);
+    void *winsys_priv;
 
 #ifdef DRM_MODE_FEATURE_DIRTYFB
     DamagePtr damage;
@@ -118,8 +142,16 @@ xorg_exa_set_displayed_usage(PixmapPtr pPixmap);
 int
 xorg_exa_set_shared_usage(PixmapPtr pPixmap);
 
+Bool
+xorg_exa_set_texture(PixmapPtr pPixmap, struct  pipe_texture *tex);
+
+struct pipe_texture *
+xorg_exa_create_root_texture(ScrnInfoPtr pScrn,
+			     int width, int height,
+			     int depth, int bpp);
+
 void *
-xorg_exa_init(ScrnInfoPtr pScrn);
+xorg_exa_init(ScrnInfoPtr pScrn, Bool accel);
 
 void
 xorg_exa_close(ScrnInfoPtr pScrn);
@@ -129,27 +161,34 @@ xorg_exa_close(ScrnInfoPtr pScrn);
  * xorg_dri2.c
  */
 Bool
-driScreenInit(ScreenPtr pScreen);
+xorg_dri2_init(ScreenPtr pScreen);
 
 void
-driCloseScreen(ScreenPtr pScreen);
+xorg_dri2_close(ScreenPtr pScreen);
 
 
 /***********************************************************************
  * xorg_crtc.c
  */
 void
-crtc_init(ScrnInfoPtr pScrn);
+xorg_crtc_init(ScrnInfoPtr pScrn);
 
 void
-crtc_cursor_destroy(xf86CrtcPtr crtc);
+xorg_crtc_cursor_destroy(xf86CrtcPtr crtc);
 
 
 /***********************************************************************
  * xorg_output.c
  */
 void
-output_init(ScrnInfoPtr pScrn);
+xorg_output_init(ScrnInfoPtr pScrn);
+
+
+/***********************************************************************
+ * xorg_xv.c
+ */
+void
+xorg_xv_init(ScreenPtr pScreen);
 
 
 #endif /* _XORG_TRACKER_H_ */
diff --git a/src/gallium/state_trackers/xorg/xorg_xv.c b/src/gallium/state_trackers/xorg/xorg_xv.c
new file mode 100644
index 00000000000..b3315dccad8
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xorg_xv.c
@@ -0,0 +1,714 @@
+#include "xorg_tracker.h"
+
+#include <xf86xv.h>
+#include <X11/extensions/Xv.h>
+#include <fourcc.h>
+
+#include "xorg_exa.h"
+#include "xorg_renderer.h"
+#include "xorg_exa_tgsi.h"
+
+#include "cso_cache/cso_context.h"
+
+#include "pipe/p_screen.h"
+#include "pipe/p_inlines.h"
+
+/*XXX get these from pipe's texture limits */
+#define IMAGE_MAX_WIDTH		2048
+#define IMAGE_MAX_HEIGHT	2048
+
+#define RES_720P_X 1280
+#define RES_720P_Y 720
+
+
+/* The ITU-R BT.601 conversion matrix for SDTV. */
+/* This is the original matrix; the table below stores it transposed
+ * to make the conversion in the shader easier
+static const float bt_601[] = {
+    1.0, 0.0, 1.4075, 0,
+    1.0, -0.3455, -0.7169, 0,
+    1.0, 1.7790, 0., 0,
+};*/
+static const float bt_601[] = {
+    1.0, 1.0, 1.0,        0.5,
+    0.0, -0.3455, 1.7790, 0,
+    1.4075, -0.7169, 0.,  0,
+};
+
+/* The ITU-R BT.709 conversion matrix for HDTV. */
+/* original, but we transpose to make the conversion
+ * in the shader easier
+static const float bt_709[] = {
+    1.0, 0.0, 1.581, 0,
+    1.0, -0.1881, -0.47, 0,
+    1.0, 1.8629, 0., 0,
+};*/
+static const float bt_709[] = {
+    1.0,   1.0,     1.0,     0.5,
+    0.0,  -0.1881,  1.8629,  0,
+    1.581,-0.47   , 0.0,     0,
+};
+
+#define MAKE_ATOM(a) MakeAtom(a, sizeof(a) - 1, TRUE)
+
+static Atom xvBrightness, xvContrast;
+
+#define NUM_TEXTURED_ATTRIBUTES 2
+static XF86AttributeRec TexturedAttributes[NUM_TEXTURED_ATTRIBUTES] = {
+   {XvSettable | XvGettable, -128, 127, "XV_BRIGHTNESS"},
+   {XvSettable | XvGettable, 0, 255, "XV_CONTRAST"}
+};
+
+#define NUM_FORMATS 3
+static XF86VideoFormatRec Formats[NUM_FORMATS] = {
+   {15, TrueColor}, {16, TrueColor}, {24, TrueColor}
+};
+
+static XF86VideoEncodingRec DummyEncoding[1] = {
+   {
+      0,
+      "XV_IMAGE",
+      IMAGE_MAX_WIDTH, IMAGE_MAX_HEIGHT,
+      {1, 1}
+   }
+};
+
+#define NUM_IMAGES 2
+static XF86ImageRec Images[NUM_IMAGES] = {
+   XVIMAGE_UYVY,
+   XVIMAGE_YUY2,
+};
+
+struct xorg_xv_port_priv {
+   struct xorg_renderer *r;   /* renderer passed to port_priv_create() */
+
+   RegionRec clip;            /* emptied by stop_video() */
+
+   int brightness;            /* XV_BRIGHTNESS, accepted range -128..127 */
+   int contrast;              /* XV_CONTRAST, accepted range 0..255 */
+
+   int current_set;           /* set in use (0 or 1); toggled by put_image() */
+   /* juggle two sets of separate Y, U and V
+    * textures */
+   struct pipe_texture *yuv[2][3];
+};
+
+
+static void
+stop_video(ScrnInfoPtr pScrn, pointer data, Bool shutdown)
+{
+   struct xorg_xv_port_priv *priv = (struct xorg_xv_port_priv *)data;
+   /* Only the clip region is reset; 'shutdown' is not used. */
+   REGION_EMPTY(pScrn->pScreen, &priv->clip);
+}
+
+static int
+set_port_attribute(ScrnInfoPtr pScrn,
+                   Atom attribute, INT32 value, pointer data)
+{
+   struct xorg_xv_port_priv *priv = (struct xorg_xv_port_priv *)data;
+   /* Range-check and store brightness/contrast; BadMatch for other atoms. */
+   if (attribute == xvBrightness) {
+      if ((value < -128) || (value > 127))
+         return BadValue;
+      priv->brightness = value;
+   } else if (attribute == xvContrast) {
+      if ((value < 0) || (value > 255))
+         return BadValue;
+      priv->contrast = value;
+   } else
+      return BadMatch;
+
+   return Success;
+}
+
+static int
+get_port_attribute(ScrnInfoPtr pScrn,
+                   Atom attribute, INT32 * value, pointer data)
+{
+   struct xorg_xv_port_priv *priv = (struct xorg_xv_port_priv *)data;
+   /* Report the stored brightness/contrast; BadMatch for other atoms. */
+   if (attribute == xvBrightness)
+      *value = priv->brightness;
+   else if (attribute == xvContrast)
+      *value = priv->contrast;
+   else
+      return BadMatch;
+
+   return Success;
+}
+
+static void
+query_best_size(ScrnInfoPtr pScrn,
+                Bool motion,
+                short vid_w, short vid_h,
+                short drw_w, short drw_h,
+                unsigned int *p_w, unsigned int *p_h, pointer data)
+{
+   if (vid_w > (drw_w << 1))   /* video more than twice as wide as requested: */
+      drw_w = vid_w >> 1;      /* use half the video width instead */
+   if (vid_h > (drw_h << 1))   /* same rule for the height */
+      drw_h = vid_h >> 1;
+   /* report the (possibly enlarged) drawable size */
+   *p_w = drw_w;
+   *p_h = drw_h;
+}
+
+static INLINE struct pipe_texture *
+create_component_texture(struct pipe_context *pipe,
+                         int width, int height)
+{
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_texture *tex = 0;
+   struct pipe_texture templ;
+   /* Create a width x height, single-level L8 2D texture for sampling. */
+   memset(&templ, 0, sizeof(templ));
+   templ.target = PIPE_TEXTURE_2D;
+   templ.format = PIPE_FORMAT_L8_UNORM;
+   templ.last_level = 0;
+   templ.width[0] = width;
+   templ.height[0] = height;
+   templ.depth[0] = 1;
+   pf_get_block(PIPE_FORMAT_L8_UNORM, &templ.block);
+   templ.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER;
+
+   tex = screen->texture_create(screen, &templ);
+   /* the result of texture_create is returned unchecked */
+   return tex;
+}
+
+static int
+check_yuv_textures(struct xorg_xv_port_priv *priv,  int width, int height)
+{
+   struct pipe_texture **planes = priv->yuv[priv->current_set];
+   int i;
+
+   /* Ensure the current set holds three width x height component
+    * textures.
+    * Returns Success, or BadAlloc if a texture cannot be created.
+    *
+    * Pass 1: drop every plane that is missing or has the wrong size.
+    */
+   for (i = 0; i < 3; ++i) {
+      if (!planes[i] ||
+          planes[i]->width[0] != width ||
+          planes[i]->height[0] != height) {
+         pipe_texture_reference(&planes[i], NULL);
+      }
+   }
+
+   /* Pass 2: create whatever is missing now. */
+   for (i = 0; i < 3; ++i) {
+      if (!planes[i])
+         planes[i] = create_component_texture(priv->r->pipe, width, height);
+   }
+
+   /* Pass 3: a failed creation leaves a NULL plane behind. */
+   for (i = 0; i < 3; ++i) {
+      if (!planes[i])
+         return BadAlloc;
+   }
+
+   return Success;
+}
+
+static void
+copy_packed_data(ScrnInfoPtr pScrn,
+                 struct xorg_xv_port_priv *port,
+                 int id,
+                 unsigned char *buf,
+                 int srcPitch,
+                 int left,
+                 int top,
+                 int w, int h)
+{
+   /* Unpack 'buf' into the current set's Y, U and V component textures. */
+   int i, j;
+   struct pipe_texture **dst = port->yuv[port->current_set];
+   struct pipe_transfer *ytrans, *utrans, *vtrans;
+   struct pipe_screen *screen = port->r->pipe->screen;
+   char *ymap, *vmap, *umap;
+   unsigned char y1, y2, u, v;
+   int yidx, uidx, vidx;
+   int y_array_size = w * h;
+
+   /* NOTE(review): srcPitch is unused; maps are written densely (stride == w?) */
+
+   ytrans = screen->get_tex_transfer(screen, dst[0],
+                                     0, 0, 0,
+                                     PIPE_TRANSFER_WRITE,
+                                     left, top, w, h);
+   utrans = screen->get_tex_transfer(screen, dst[1],
+                                     0, 0, 0,
+                                     PIPE_TRANSFER_WRITE,
+                                     left, top, w, h);
+   vtrans = screen->get_tex_transfer(screen, dst[2],
+                                     0, 0, 0,
+                                     PIPE_TRANSFER_WRITE,
+                                     left, top, w, h);
+
+   ymap = (char*)screen->transfer_map(screen, ytrans);
+   umap = (char*)screen->transfer_map(screen, utrans);
+   vmap = (char*)screen->transfer_map(screen, vtrans);
+
+   yidx = uidx = vidx = 0;
+
+   switch (id) {
+   case FOURCC_YV12: {
+      for (j = 0; j < h; ++j) {      /* rows outermost: the maps are filled */
+         for (i = 0; i < w; ++i) {   /* sequentially, so walk row by row    */
+            /* chroma is read at half resolution in both directions */
+            y1  = buf[j*w + i];
+            u   = buf[(j/2) * (w/2) + i/2 + y_array_size];
+            v   = buf[(j/2) * (w/2) + i/2 + y_array_size + y_array_size/4];
+            ymap[yidx++] = y1;
+            umap[uidx++] = u;
+            vmap[vidx++] = v;
+         }
+      }
+   }
+      break;
+   case FOURCC_UYVY:
+      for (i = 0; i < y_array_size; i +=2 ) {
+         /* extracting two pixels */
+         u  = buf[0];
+         y1 = buf[1];
+         v  = buf[2];
+         y2 = buf[3];
+         buf += 4;
+
+         ymap[yidx++] = y1;
+         ymap[yidx++] = y2;
+         umap[uidx++] = u;
+         umap[uidx++] = u;
+         vmap[vidx++] = v;
+         vmap[vidx++] = v;
+      }
+      break;
+   case FOURCC_YUY2:
+      for (i = 0; i < y_array_size; i +=2 ) {
+         /* extracting two pixels */
+         y1 = buf[0];
+         u  = buf[1];
+         y2 = buf[2];
+         v  = buf[3];
+
+         buf += 4;
+
+         ymap[yidx++] = y1;
+         ymap[yidx++] = y2;
+         umap[uidx++] = u;
+         umap[uidx++] = u;
+         vmap[vidx++] = v;
+         vmap[vidx++] = v;
+      }
+      break;
+   default:
+      debug_assert(!"Unsupported yuv format!");
+      break;
+   }
+
+   screen->transfer_unmap(screen, ytrans);
+   screen->transfer_unmap(screen, utrans);
+   screen->transfer_unmap(screen, vtrans);
+   screen->tex_transfer_destroy(ytrans);
+   screen->tex_transfer_destroy(utrans);
+   screen->tex_transfer_destroy(vtrans);
+}
+
+
+static void
+setup_fs_video_constants(struct xorg_renderer *r, boolean hdtv)
+{
+   const int param_bytes = 12 * sizeof(float);
+   const float *video_constants = (hdtv) ? bt_709 : bt_601;
+   /* upload the 12 floats of the chosen table as fragment constants */
+   renderer_set_constants(r, PIPE_SHADER_FRAGMENT,
+                          video_constants, param_bytes);
+}
+
+static void
+draw_yuv(struct xorg_xv_port_priv *port,
+         int src_x, int src_y, int src_w, int src_h,
+         int dst_x, int dst_y, int dst_w, int dst_h)
+{
+   struct pipe_texture **textures = port->yuv[port->current_set];
+   /* thin wrapper: draw with the port's current texture set */
+   renderer_draw_yuv(port->r,
+                     src_x, src_y, src_w, src_h,
+                     dst_x, dst_y, dst_w, dst_h,
+                     textures);
+}
+
+static void
+bind_blend_state(struct xorg_xv_port_priv *port)
+{
+   struct pipe_blend_state blend;
+   /* Bind a blend state with blending enabled and ONE/ZERO factors. */
+   memset(&blend, 0, sizeof(struct pipe_blend_state));
+   blend.blend_enable = 1;
+   blend.colormask |= PIPE_MASK_RGBA;
+
+   /* porter&duff src */
+   blend.rgb_src_factor   = PIPE_BLENDFACTOR_ONE;
+   blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE;
+   blend.rgb_dst_factor   = PIPE_BLENDFACTOR_ZERO;
+   blend.alpha_dst_factor = PIPE_BLENDFACTOR_ZERO;
+
+   cso_set_blend(port->r->cso, &blend);
+}
+
+
+static void
+bind_shaders(struct xorg_xv_port_priv *port)
+{
+   /* Select the YUV vertex/fragment shader pair and bind both handles. */
+   const unsigned vs_traits = VS_YUV;
+   const unsigned fs_traits = FS_YUV;
+   struct xorg_shader shader;
+
+   shader = xorg_shaders_get(port->r->shaders, vs_traits, fs_traits);
+
+   cso_set_vertex_shader_handle(port->r->cso, shader.vs);
+   cso_set_fragment_shader_handle(port->r->cso, shader.fs);
+}
+
+static INLINE void
+conditional_flush(struct pipe_context *pipe, struct pipe_texture **tex,
+                  int num)
+{
+   int i; /* the render cache is flushed at most once per call */
+   for (i = 0; i < num; ++i) {
+      if (tex[i] && pipe->is_texture_referenced(pipe, tex[i], 0, 0) &
+          PIPE_REFERENCED_FOR_WRITE) { /* '&' binds tighter than '&&' here */
+         pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, NULL);
+         return; /* one flush is enough; skip the remaining textures */
+      }
+   }
+}
+
+static void
+bind_samplers(struct xorg_xv_port_priv *port)
+{
+   struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS];
+   struct pipe_sampler_state sampler;
+   struct pipe_texture **dst = port->yuv[port->current_set];
+   /* Bind one shared sampler state and the three current-set textures. */
+   memset(&sampler, 0, sizeof(struct pipe_sampler_state));
+
+   conditional_flush(port->r->pipe, dst, 3); /* flush if a texture has pending writes */
+
+   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP;
+   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP;
+   sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST;
+   sampler.normalized_coords = 1;
+
+   samplers[0] = &sampler; /* all three slots point at the same state */
+   samplers[1] = &sampler;
+   samplers[2] = &sampler;
+
+
+   cso_set_samplers(port->r->cso, 3,
+                    (const struct pipe_sampler_state **)samplers);
+   cso_set_sampler_textures(port->r->cso, 3,
+                            dst);
+}
+
+static int
+display_video(ScrnInfoPtr pScrn, struct xorg_xv_port_priv *pPriv, int id,
+              RegionPtr dstRegion,
+              int src_x, int src_y, int src_w, int src_h,
+              int dstX, int dstY, int dst_w, int dst_h,
+              PixmapPtr pPixmap)
+{
+   /* Draws the current YUV texture set into pPixmap, one quad per box of
+    * dstRegion; returns TRUE once drawn (XORG_FALLBACK handles failures). */
+   modesettingPtr ms = modesettingPTR(pScrn);
+   BoxPtr pbox;
+   int nbox;
+   Bool hdtv;
+   int x, y, w, h;
+   struct exa_pixmap_priv *dst = exaGetPixmapDriverPrivate(pPixmap);
+   struct pipe_surface *dst_surf;
+
+   if (dst && !dst->tex) {
+      xorg_exa_set_shared_usage(pPixmap);
+      pScrn->pScreen->ModifyPixmapHeader(pPixmap, 0, 0, 0, 0, 0, NULL);
+   }
+
+   if (!dst || !dst->tex)
+      XORG_FALLBACK("Xv destination %s", !dst ? "!dst" : "!dst->tex");
+
+   /* look the surface up only after dst and dst->tex have been validated */
+   dst_surf = xorg_gpu_surface(pPriv->r->pipe->screen, dst);
+   if (!dst_surf)
+      XORG_FALLBACK("Xv destination %s", "!dst_surf");
+
+   hdtv = ((src_w >= RES_720P_X) && (src_h >= RES_720P_Y));
+   REGION_TRANSLATE(pScrn->pScreen, dstRegion, -pPixmap->screen_x,
+                    -pPixmap->screen_y);
+   pbox = REGION_RECTS(dstRegion);
+   nbox = REGION_NUM_RECTS(dstRegion);
+
+   renderer_bind_destination(pPriv->r, dst_surf,
+                             dst_surf->width, dst_surf->height);
+
+   bind_blend_state(pPriv);
+   bind_shaders(pPriv);
+   bind_samplers(pPriv);
+   setup_fs_video_constants(pPriv->r, hdtv);
+
+   exaMoveInPixmap(pPixmap);
+   DamageDamageRegion(&pPixmap->drawable, dstRegion);
+
+   while (nbox--) {
+      int box_x1 = pbox->x1;
+      int box_y1 = pbox->y1;
+      int box_x2 = pbox->x2;
+      int box_y2 = pbox->y2;
+      float diff_x = (float)src_w / (float)dst_w;
+      float diff_y = (float)src_h / (float)dst_h;
+      int offset_x = box_x1 - dstX + pPixmap->screen_x;
+      int offset_y = box_y1 - dstY + pPixmap->screen_y;
+      int offset_w;
+      int offset_h;
+
+      x = box_x1;
+      y = box_y1;
+      w = box_x2 - box_x1;
+      h = box_y2 - box_y1;
+      offset_w = dst_w - w;
+      offset_h = dst_h - h;
+
+      /* vertical terms scale by diff_y (the height used diff_x before) */
+      draw_yuv(pPriv, src_x + offset_x*diff_x, src_y + offset_y*diff_y,
+               src_w - offset_w*diff_x, src_h - offset_h*diff_y,
+               x, y, w, h);
+
+      pbox++;
+   }
+   DamageRegionProcessPending(&pPixmap->drawable);
+
+   pipe_surface_reference(&dst_surf, NULL);
+   return TRUE;
+}
+
+static int
+put_image(ScrnInfoPtr pScrn,
+          short src_x, short src_y,
+          short drw_x, short drw_y,
+          short src_w, short src_h,
+          short drw_w, short drw_h,
+          int id, unsigned char *buf,
+          short width, short height,
+          Bool sync, RegionPtr clipBoxes, pointer data,
+          DrawablePtr pDraw)
+{
+   struct xorg_xv_port_priv *pPriv = (struct xorg_xv_port_priv *) data;
+   ScreenPtr pScreen = screenInfo.screens[pScrn->scrnIndex];
+   PixmapPtr pPixmap;
+   INT32 x1, x2, y1, y2;
+   int srcPitch;
+   BoxRec dstBox;
+   int ret;
+   /* Clip, upload the frame to the current texture set, draw, swap sets. */
+   /* Clip */
+   x1 = src_x;
+   x2 = src_x + src_w;
+   y1 = src_y;
+   y2 = src_y + src_h;
+
+   dstBox.x1 = drw_x;
+   dstBox.x2 = drw_x + drw_w;
+   dstBox.y1 = drw_y;
+   dstBox.y2 = drw_y + drw_h;
+
+   if (!xf86XVClipVideoHelper(&dstBox, &x1, &x2, &y1, &y2, clipBoxes,
+			      width, height))
+      return Success;
+
+   switch (id) {
+   case FOURCC_UYVY:
+   case FOURCC_YUY2:
+   default:
+      srcPitch = width << 1;
+      break;
+   }
+
+   ret = check_yuv_textures(pPriv, width, height);
+
+   if (ret)
+      return ret;
+
+   copy_packed_data(pScrn, pPriv, id, buf, srcPitch,
+                    src_x, src_y, width, height);
+
+   if (pDraw->type == DRAWABLE_WINDOW) {
+      pPixmap = (*pScreen->GetWindowPixmap)((WindowPtr)pDraw);
+   } else {
+      pPixmap = (PixmapPtr)pDraw;
+   }
+   /* NOTE(review): display_video's return value is ignored here. */
+   display_video(pScrn, pPriv, id, clipBoxes,
+                 src_x, src_y, src_w, src_h,
+                 drw_x, drw_y,
+                 drw_w, drw_h, pPixmap);
+   /* switch to the other of the two texture sets for the next frame */
+   pPriv->current_set = (pPriv->current_set + 1) & 1;
+   return Success;
+}
+
+static int
+query_image_attributes(ScrnInfoPtr pScrn,
+                       int id,
+                       unsigned short *w, unsigned short *h,
+                       int *pitches, int *offsets)
+{
+   int size;
+   /* Clamp the size, fill pitch/offset if given, return the byte size. */
+   if (*w > IMAGE_MAX_WIDTH)
+      *w = IMAGE_MAX_WIDTH;
+   if (*h > IMAGE_MAX_HEIGHT)
+      *h = IMAGE_MAX_HEIGHT;
+
+   *w = (*w + 1) & ~1; /* round the width up to an even value */
+   if (offsets)
+      offsets[0] = 0;
+
+   switch (id) {
+   case FOURCC_UYVY:
+   case FOURCC_YUY2:
+   default:
+      size = *w << 1; /* two bytes per pixel */
+      if (pitches)
+	 pitches[0] = size;
+      size *= *h;
+      break;
+   }
+
+   return size;
+}
+
+static struct xorg_xv_port_priv *
+port_priv_create(struct xorg_renderer *r)
+{
+   struct xorg_xv_port_priv *priv = NULL;
+
+   priv = calloc(1, sizeof(struct xorg_xv_port_priv));
+   /* Returns zeroed private data bound to 'r', or NULL if calloc fails. */
+   if (!priv)
+      return NULL;
+
+   priv->r = r;
+
+   REGION_NULL(pScreen, &priv->clip);
+   /* NOTE(review): no pScreen is declared here; presumably the macro drops it. */
+   debug_assert(priv && priv->r);
+
+   return priv;
+}
+
+static XF86VideoAdaptorPtr
+xorg_setup_textured_adapter(ScreenPtr pScreen)
+{
+   /* Builds the textured-video adaptor; returns NULL if it cannot be set up. */
+   ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
+   modesettingPtr ms = modesettingPTR(pScrn);
+   XF86VideoAdaptorPtr adapt;
+   XF86AttributePtr attrs;
+   DevUnion *dev_unions;
+   int nports = 16, i;
+   int nattributes = NUM_TEXTURED_ATTRIBUTES;
+
+   debug_assert(ms->exa);
+   debug_assert(ms->exa->renderer);
+
+   adapt = calloc(1, sizeof(XF86VideoAdaptorRec));
+   dev_unions = calloc(nports, sizeof(DevUnion));
+   attrs = calloc(nattributes, sizeof(XF86AttributeRec));
+   if (adapt == NULL || dev_unions == NULL || attrs == NULL)
+      goto fail;
+
+   adapt->type = XvWindowMask | XvInputMask | XvImageMask;
+   adapt->flags = 0;
+   adapt->name = "Gallium3D Textured Video";
+   adapt->nEncodings = 1;
+   adapt->pEncodings = DummyEncoding;
+   adapt->nFormats = NUM_FORMATS;
+   adapt->pFormats = Formats;
+   adapt->nPorts = 0;
+   adapt->pPortPrivates = dev_unions;
+   adapt->nAttributes = nattributes;
+   adapt->pAttributes = attrs;
+   memcpy(attrs, TexturedAttributes, nattributes * sizeof(XF86AttributeRec));
+   adapt->nImages = NUM_IMAGES;
+   adapt->pImages = Images;
+   adapt->PutVideo = NULL;
+   adapt->PutStill = NULL;
+   adapt->GetVideo = NULL;
+   adapt->GetStill = NULL;
+   adapt->StopVideo = stop_video;
+   adapt->SetPortAttribute = set_port_attribute;
+   adapt->GetPortAttribute = get_port_attribute;
+   adapt->QueryBestSize = query_best_size;
+   adapt->PutImage = put_image;
+   adapt->QueryImageAttributes = query_image_attributes;
+   for (i = 0; i < nports; i++) {
+      struct xorg_xv_port_priv *priv =
+         port_priv_create(ms->exa->renderer);
+      if (!priv)
+         break; /* expose only ports that got their private data */
+      adapt->pPortPrivates[adapt->nPorts++].ptr = (pointer) (priv);
+   }
+   if (adapt->nPorts > 0)
+      return adapt;
+
+fail:
+   free(adapt);
+   free(dev_unions);
+   free(attrs);
+   return NULL;
+}
+
+void
+xorg_xv_init(ScreenPtr pScreen)
+{
+   ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
+   /* Registers the generic adaptors plus our textured adaptor with Xv. */
+   XF86VideoAdaptorPtr *adaptors, *new_adaptors = NULL;
+   XF86VideoAdaptorPtr textured_adapter;
+   int num_adaptors;
+
+   num_adaptors = xf86XVListGenericAdaptors(pScrn, &adaptors);
+   new_adaptors = malloc((num_adaptors + 1) * sizeof *new_adaptors);
+   if (new_adaptors == NULL)
+      return;
+
+   if (num_adaptors > 0) /* never hand memcpy a possibly-NULL empty list */
+      memcpy(new_adaptors, adaptors, num_adaptors * sizeof *new_adaptors);
+   adaptors = new_adaptors;
+
+   /* Add the adaptors supported by our hardware.  First, set up the atoms
+    * that will be used by both output adaptors.
+    */
+   xvBrightness = MAKE_ATOM("XV_BRIGHTNESS");
+   xvContrast = MAKE_ATOM("XV_CONTRAST");
+
+   textured_adapter = xorg_setup_textured_adapter(pScreen);
+   debug_assert(textured_adapter);
+
+   if (textured_adapter) {
+      adaptors[num_adaptors++] = textured_adapter;
+   }
+
+   if (num_adaptors) {
+      xf86XVScreenInit(pScreen, adaptors, num_adaptors);
+   } else {
+      xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
+                 "Disabling Xv because no adaptors could be initialized.\n");
+   }
+
+   free(adaptors);
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/Makefile b/src/gallium/state_trackers/xorg/xvmc/Makefile
new file mode 100644
index 00000000000..126dc6d58f1
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/Makefile
@@ -0,0 +1,16 @@
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+LIBNAME = xvmctracker
+
+LIBRARY_INCLUDES = \
+	$(shell pkg-config --cflags-only-I xvmc) \
+	-I$(TOP)/src/gallium/winsys/g3dvl
+
+C_SOURCES = block.c \
+            surface.c \
+            context.c \
+            subpicture.c \
+            attributes.c
+
+include ../../../Makefile.template
diff --git a/src/gallium/state_trackers/xorg/xvmc/SConscript b/src/gallium/state_trackers/xorg/xvmc/SConscript
new file mode 100644
index 00000000000..cb25d68bd80
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/SConscript
@@ -0,0 +1,27 @@
+#######################################################################
+# SConscript for xvmc state_tracker
+
+Import('*')
+
+if 'xorg/xvmc' in env['statetrackers']:
+
+    env = env.Clone()
+    
+    env.Append(CPPPATH = [
+	'#/src/gallium/include',
+	'#/src/gallium/auxiliary',
+	'#/src/gallium/winsys/g3dvl',
+    ])
+
+    env.ParseConfig('pkg-config --cflags --libs xvmc')
+
+    st_xvmc = env.ConvenienceLibrary(
+	target = 'st_xvmc',
+	source = [ 'block.c',
+		'surface.c',
+		'context.c',
+		'subpicture.c',
+		'attributes.c',
+		]
+    )
+    Export('st_xvmc')
diff --git a/src/gallium/include/pipe/p_error.h b/src/gallium/state_trackers/xorg/xvmc/attributes.c
index b865b226357..79a67838e6e 100644
--- a/src/gallium/include/pipe/p_error.h
+++ b/src/gallium/state_trackers/xorg/xvmc/attributes.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Younes Manton.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,41 +25,22 @@
  * 
  **************************************************************************/
 
-/**
- * @file
- * Gallium error codes.
- * 
- * @author José Fonseca <jrfonseca@tungstengraphics.com>
- */
-
-#ifndef P_ERROR_H_
-#define P_ERROR_H_
-
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-   
-/**
- * Gallium error codes.
- * 
- * - A zero value always means success.
- * - A negative value always means failure.
- * - The meaning of a positive value is function dependent. 
- */
-enum pipe_error {
-   PIPE_OK = 0,
-   PIPE_ERROR = -1,    /**< Generic error */
-   PIPE_ERROR_BAD_INPUT = -2, 
-   PIPE_ERROR_OUT_OF_MEMORY = -3,
-   PIPE_ERROR_RETRY = -4
-   /* TODO */
-};
+#include <assert.h>
+#include <X11/Xlib.h>
+#include <X11/extensions/Xvlib.h>
+#include <X11/extensions/XvMClib.h>
 
+XvAttribute* XvMCQueryAttributes(Display *dpy, XvMCContext *context, int *number)
+{
+   return NULL;
+}
 
-#ifdef	__cplusplus
+Status XvMCSetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int value)
+{
+   return BadImplementation;
 }
-#endif
 
-#endif /* P_ERROR_H_ */
+Status XvMCGetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int *value)
+{
+   return BadImplementation;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/block.c b/src/gallium/state_trackers/xorg/xvmc/block.c
new file mode 100644
index 00000000000..5102375fcf8
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/block.c
@@ -0,0 +1,88 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <assert.h>
+#include <X11/Xlib.h>
+#include <X11/extensions/XvMClib.h>
+#include <util/u_memory.h>
+#include "xvmc_private.h"
+
+Status XvMCCreateBlocks(Display *dpy, XvMCContext *context, unsigned int num_blocks, XvMCBlockArray *blocks)
+{
+   assert(dpy);
+
+   if (!context)
+      return XvMCBadContext;
+   if (num_blocks == 0)
+      return BadValue;
+
+   assert(blocks);
+   /* Allocates BLOCK_SIZE_BYTES per block; XvMCDestroyBlocks() frees it. */
+   blocks->context_id = context->context_id;
+   blocks->num_blocks = num_blocks;
+   blocks->blocks = MALLOC(BLOCK_SIZE_BYTES * num_blocks);
+   blocks->privData = NULL;
+
+   return blocks->blocks ? Success : BadAlloc;
+}
+
+Status XvMCDestroyBlocks(Display *dpy, XvMCBlockArray *blocks)
+{
+   assert(dpy);
+   assert(blocks);
+   FREE(blocks->blocks); /* frees the block storage only, not *blocks */
+
+   return Success;
+}
+
+Status XvMCCreateMacroBlocks(Display *dpy, XvMCContext *context, unsigned int num_blocks, XvMCMacroBlockArray *blocks)
+{
+   assert(dpy);
+
+   if (!context)
+      return XvMCBadContext;
+   if (num_blocks == 0)
+      return BadValue;
+
+   assert(blocks);
+   /* Allocates num_blocks entries; XvMCDestroyMacroBlocks() frees them. */
+   blocks->context_id = context->context_id;
+   blocks->num_blocks = num_blocks;
+   blocks->macro_blocks = MALLOC(sizeof(XvMCMacroBlock) * num_blocks);
+   blocks->privData = NULL;
+
+   return blocks->macro_blocks ? Success : BadAlloc;
+}
+
+Status XvMCDestroyMacroBlocks(Display *dpy, XvMCMacroBlockArray *blocks)
+{
+   assert(dpy);
+   assert(blocks);
+   FREE(blocks->macro_blocks); /* frees the entry storage only, not *blocks */
+
+   return Success;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/context.c b/src/gallium/state_trackers/xorg/xvmc/context.c
new file mode 100644
index 00000000000..c8a389385a8
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/context.c
@@ -0,0 +1,252 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <assert.h>
+#include <X11/Xlibint.h>
+#include <X11/extensions/XvMClib.h>
+#include <pipe/p_screen.h>
+#include <pipe/p_video_context.h>
+#include <pipe/p_video_state.h>
+#include <pipe/p_state.h>
+#include <vl_winsys.h>
+#include <util/u_memory.h>
+#include <util/u_debug.h>
+#include <vl/vl_csc.h>
+#include "xvmc_private.h"
+
+static Status Validate(Display *dpy, XvPortID port, int surface_type_id,
+                       unsigned int width, unsigned int height, int flags,
+                       bool *found_port, int *screen, int *chroma_format,
+                       int *mc_type, int *surface_flags)
+{
+   bool found_surface = false;
+   XvAdaptorInfo *adaptor_info;
+   unsigned int num_adaptors;
+   int num_types;
+   unsigned int max_width, max_height;
+   Status ret;
+
+   assert(dpy);
+   assert(found_port);
+   assert(screen);
+   assert(chroma_format);
+   assert(mc_type);
+   assert(surface_flags);
+
+   *found_port = false;
+
+   for (unsigned int i = 0; i < XScreenCount(dpy); ++i) {
+      ret = XvQueryAdaptors(dpy, XRootWindow(dpy, i), &num_adaptors, &adaptor_info);
+      if (ret != Success)
+         return ret;
+
+      for (unsigned int j = 0; j < num_adaptors && !*found_port; ++j) {
+         for (unsigned int k = 0; k < adaptor_info[j].num_ports && !*found_port; ++k) {
+            XvMCSurfaceInfo *surface_info;
+
+            if (adaptor_info[j].base_id + k != port)
+               continue;
+
+            *found_port = true;
+
+            surface_info = XvMCListSurfaceTypes(dpy, adaptor_info[j].base_id, &num_types);
+            if (!surface_info) {
+               XvFreeAdaptorInfo(adaptor_info);
+               return BadAlloc;
+            }
+
+            for (unsigned int l = 0; l < num_types && !found_surface; ++l) {
+               if (surface_info[l].surface_type_id != surface_type_id)
+                  continue;
+
+               found_surface = true;
+               max_width = surface_info[l].max_width;
+               max_height = surface_info[l].max_height;
+               *chroma_format = surface_info[l].chroma_format;
+               *mc_type = surface_info[l].mc_type;
+               *surface_flags = surface_info[l].flags;
+               *screen = i;
+            }
+
+            XFree(surface_info);
+         }
+      }
+
+      XvFreeAdaptorInfo(adaptor_info);
+   }
+
+   if (!*found_port)
+      return XvBadPort;
+   if (!found_surface)
+      return BadMatch;
+   if (width > max_width || height > max_height)
+      return BadValue;
+   if (flags != XVMC_DIRECT && flags != 0)
+      return BadValue;
+
+   return Success;
+}
+
+static enum pipe_video_profile ProfileToPipe(int xvmc_profile)
+{
+   if (xvmc_profile & XVMC_MPEG_1)
+      assert(0); /* not mapped to a pipe profile */
+   if (xvmc_profile & XVMC_MPEG_2)
+      return PIPE_VIDEO_PROFILE_MPEG2_MAIN;
+   if (xvmc_profile & XVMC_H263)
+      assert(0); /* not mapped to a pipe profile */
+   if (xvmc_profile & XVMC_MPEG_4)
+      assert(0); /* not mapped to a pipe profile */
+	
+   assert(0); /* no codec bit matched */
+
+   return -1; /* only reached when assert() is compiled out */
+}
+
+static enum pipe_video_chroma_format FormatToPipe(int xvmc_format)
+{
+   switch (xvmc_format) {
+      case XVMC_CHROMA_FORMAT_420:
+         return PIPE_VIDEO_CHROMA_FORMAT_420;
+      case XVMC_CHROMA_FORMAT_422:
+         return PIPE_VIDEO_CHROMA_FORMAT_422;
+      case XVMC_CHROMA_FORMAT_444:
+         return PIPE_VIDEO_CHROMA_FORMAT_444;
+      default:
+         assert(0);
+   }
+
+   return -1;
+}
+
+Status XvMCCreateContext(Display *dpy, XvPortID port, int surface_type_id,
+                         int width, int height, int flags, XvMCContext *context)
+{
+   bool found_port;
+   int scrn;
+   int chroma_format;
+   int mc_type;
+   int surface_flags;
+   Status ret;
+   struct pipe_screen *screen;
+   struct pipe_video_context *vpipe;
+   XvMCContextPrivate *context_priv;
+   float csc[16];
+
+   assert(dpy);
+
+   if (!context)
+      return XvMCBadContext;
+
+   ret = Validate(dpy, port, surface_type_id, width, height, flags,
+                  &found_port, &scrn, &chroma_format, &mc_type, &surface_flags);
+
+   /* Success and XvBadPort have the same value */
+   if (ret != Success || !found_port)
+      return ret;
+
+   /* XXX: Current limits */
+   if (chroma_format != XVMC_CHROMA_FORMAT_420) {
+      debug_printf("[XvMCg3dvl] Cannot decode requested surface type. Unsupported chroma format.\n");
+      return BadImplementation;
+   }
+   if (mc_type != (XVMC_MOCOMP | XVMC_MPEG_2)) {
+      debug_printf("[XvMCg3dvl] Cannot decode requested surface type. Non-MPEG2/Mocomp acceleration unsupported.\n");
+      return BadImplementation;
+   }
+   if (!(surface_flags & XVMC_INTRA_UNSIGNED)) {
+      debug_printf("[XvMCg3dvl] Cannot decode requested surface type. Signed intra unsupported.\n");
+      return BadImplementation;
+   }
+
+   context_priv = CALLOC(1, sizeof(XvMCContextPrivate));
+   if (!context_priv)
+      return BadAlloc;
+
+   /* TODO: Reuse screen if process creates another context */
+   screen = vl_screen_create(dpy, scrn);
+
+   if (!screen) {
+      FREE(context_priv);
+      return BadAlloc;
+   }
+
+   vpipe = vl_video_create(dpy, scrn, screen, ProfileToPipe(mc_type),
+                           FormatToPipe(chroma_format), width, height);
+
+   if (!vpipe) {
+      screen->destroy(screen);
+      FREE(context_priv);
+      return BadAlloc;
+   }
+
+   /* TODO: Define some Xv attribs to allow users to specify color standard, procamp */
+   vl_csc_get_matrix
+   (
+      debug_get_bool_option("G3DVL_NO_CSC", FALSE) ?
+      VL_CSC_COLOR_STANDARD_IDENTITY : VL_CSC_COLOR_STANDARD_BT_601,
+      NULL, true, csc
+   );
+   vpipe->set_csc_matrix(vpipe, csc);
+
+   context_priv->vpipe = vpipe;
+
+   context->context_id = XAllocID(dpy);
+   context->surface_type_id = surface_type_id;
+   context->width = width;
+   context->height = height;
+   context->flags = flags;
+   context->port = port;
+   context->privData = context_priv;
+	
+   SyncHandle();
+
+   return Success;
+}
+
+Status XvMCDestroyContext(Display *dpy, XvMCContext *context)
+{
+   struct pipe_screen *screen;
+   struct pipe_video_context *vpipe;
+   XvMCContextPrivate *context_priv;
+
+   assert(dpy);
+
+   if (!context || !context->privData)
+      return XvMCBadContext;
+
+   context_priv = context->privData;
+   vpipe = context_priv->vpipe;
+   pipe_surface_reference(&context_priv->backbuffer, NULL);
+   screen = vpipe->screen; /* fetched before vpipe->destroy() below */
+   vpipe->destroy(vpipe);
+   screen->destroy(screen);
+   FREE(context_priv);
+   context->privData = NULL; /* a repeated destroy now returns XvMCBadContext */
+
+   return Success;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/subpicture.c b/src/gallium/state_trackers/xorg/xvmc/subpicture.c
new file mode 100644
index 00000000000..69898d5fcd3
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/subpicture.c
@@ -0,0 +1,195 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <assert.h>
+#include <X11/Xlibint.h>
+#include <X11/extensions/XvMClib.h>
+
+Status XvMCCreateSubpicture(Display *dpy, XvMCContext *context, XvMCSubpicture *subpicture,
+                            unsigned short width, unsigned short height, int xvimage_id)
+{
+   assert(dpy);
+
+   if (!context)
+      return XvMCBadContext;
+
+   assert(subpicture);
+
+   /*if (width > || height > )
+      return BadValue;*/
+
+   /*if (xvimage_id != )
+      return BadMatch;*/
+
+   subpicture->subpicture_id = XAllocID(dpy);
+   subpicture->context_id = context->context_id;
+   subpicture->xvimage_id = xvimage_id;
+   subpicture->width = width;
+   subpicture->height = height;
+   subpicture->num_palette_entries = 0;
+   subpicture->entry_bytes = 0;
+   subpicture->component_order[0] = 0;
+   subpicture->component_order[1] = 0;
+   subpicture->component_order[2] = 0;
+   subpicture->component_order[3] = 0;
+   subpicture->privData = NULL; /* TODO: allocate private state; never leave it uninitialized */
+
+   SyncHandle();
+
+   return Success;
+}
+
+Status XvMCClearSubpicture(Display *dpy, XvMCSubpicture *subpicture, short x, short y,
+                           unsigned short width, unsigned short height, unsigned int color)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   /* TODO: Assert clear rect is within bounds? Or clip? */
+
+   return Success;
+}
+
+Status XvMCCompositeSubpicture(Display *dpy, XvMCSubpicture *subpicture, XvImage *image,
+                               short srcx, short srcy, unsigned short width, unsigned short height,
+                               short dstx, short dsty)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   assert(image);
+
+   if (subpicture->xvimage_id != image->id)
+      return BadMatch;
+
+   /* TODO: Assert rects are within bounds? Or clip? */
+
+   return Success;
+}
+
+Status XvMCDestroySubpicture(Display *dpy, XvMCSubpicture *subpicture)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   return BadImplementation;
+}
+
+Status XvMCSetSubpicturePalette(Display *dpy, XvMCSubpicture *subpicture, unsigned char *palette)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   assert(palette);
+
+   /* We don't support paletted subpictures */
+   return BadMatch;
+}
+
+Status XvMCBlendSubpicture(Display *dpy, XvMCSurface *target_surface, XvMCSubpicture *subpicture,
+                           short subx, short suby, unsigned short subw, unsigned short subh,
+                           short surfx, short surfy, unsigned short surfw, unsigned short surfh)
+{
+   assert(dpy);
+
+   if (!target_surface)
+      return XvMCBadSurface;
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   if (target_surface->context_id != subpicture->context_id)
+      return BadMatch;
+
+   /* TODO: Assert rects are within bounds? Or clip? */
+   return Success;
+}
+
+Status XvMCBlendSubpicture2(Display *dpy, XvMCSurface *source_surface, XvMCSurface *target_surface,
+                            XvMCSubpicture *subpicture, short subx, short suby, unsigned short subw, unsigned short subh,
+                            short surfx, short surfy, unsigned short surfw, unsigned short surfh)
+{
+   assert(dpy);
+
+   if (!source_surface || !target_surface)
+      return XvMCBadSurface;
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   /* Both surfaces must belong to the same context as the subpicture. */
+   if (source_surface->context_id != subpicture->context_id)
+      return BadMatch;
+   if (target_surface->context_id != subpicture->context_id)
+      return BadMatch;
+
+   /* TODO: Assert rects are within bounds? Or clip? */
+   return Success;
+}
+
+Status XvMCSyncSubpicture(Display *dpy, XvMCSubpicture *subpicture)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   return Success;
+}
+
+Status XvMCFlushSubpicture(Display *dpy, XvMCSubpicture *subpicture)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   return Success;
+}
+
+Status XvMCGetSubpictureStatus(Display *dpy, XvMCSubpicture *subpicture, int *status)
+{
+   assert(dpy);
+
+   if (!subpicture)
+      return XvMCBadSubpicture;
+
+   assert(status);
+
+   /* TODO */
+   *status = 0;
+
+   return Success;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/surface.c b/src/gallium/state_trackers/xorg/xvmc/surface.c
new file mode 100644
index 00000000000..bf9038f356e
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/surface.c
@@ -0,0 +1,409 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <assert.h>
+#include <X11/Xlibint.h>
+#include <pipe/p_video_context.h>
+#include <pipe/p_video_state.h>
+#include <pipe/p_state.h>
+#include <util/u_memory.h>
+#include "xvmc_private.h"
+
+static enum pipe_mpeg12_macroblock_type TypeToPipe(int xvmc_mb_type)
+{
+   if (xvmc_mb_type & XVMC_MB_TYPE_INTRA)
+      return PIPE_MPEG12_MACROBLOCK_TYPE_INTRA;
+   if ((xvmc_mb_type & (XVMC_MB_TYPE_MOTION_FORWARD | XVMC_MB_TYPE_MOTION_BACKWARD)) == XVMC_MB_TYPE_MOTION_FORWARD)
+      return PIPE_MPEG12_MACROBLOCK_TYPE_FWD;
+   if ((xvmc_mb_type & (XVMC_MB_TYPE_MOTION_FORWARD | XVMC_MB_TYPE_MOTION_BACKWARD)) == XVMC_MB_TYPE_MOTION_BACKWARD)
+      return PIPE_MPEG12_MACROBLOCK_TYPE_BKWD;
+   if ((xvmc_mb_type & (XVMC_MB_TYPE_MOTION_FORWARD | XVMC_MB_TYPE_MOTION_BACKWARD)) == (XVMC_MB_TYPE_MOTION_FORWARD | XVMC_MB_TYPE_MOTION_BACKWARD))
+      return PIPE_MPEG12_MACROBLOCK_TYPE_BI;
+
+   assert(0);
+
+   return -1;
+}
+
+static enum pipe_mpeg12_picture_type PictureToPipe(int xvmc_pic)
+{
+   switch (xvmc_pic) {
+      case XVMC_TOP_FIELD:
+         return PIPE_MPEG12_PICTURE_TYPE_FIELD_TOP;
+      case XVMC_BOTTOM_FIELD:
+         return PIPE_MPEG12_PICTURE_TYPE_FIELD_BOTTOM;
+      case XVMC_FRAME_PICTURE:
+         return PIPE_MPEG12_PICTURE_TYPE_FRAME;
+      default:
+         assert(0);
+   }
+
+   return -1;
+}
+
+static enum pipe_mpeg12_motion_type MotionToPipe(int xvmc_motion_type, int xvmc_dct_type)
+{
+   switch (xvmc_motion_type) {
+      case XVMC_PREDICTION_FRAME:
+         return xvmc_dct_type == XVMC_DCT_TYPE_FIELD ?
+            PIPE_MPEG12_MOTION_TYPE_16x8 : PIPE_MPEG12_MOTION_TYPE_FRAME;
+      case XVMC_PREDICTION_FIELD:
+         return PIPE_MPEG12_MOTION_TYPE_FIELD;
+      case XVMC_PREDICTION_DUAL_PRIME:
+         return PIPE_MPEG12_MOTION_TYPE_DUALPRIME;
+      default:
+         assert(0);
+   }
+
+   return -1;
+}
+
+static bool
+CreateOrResizeBackBuffer(struct pipe_video_context *vpipe, unsigned int width, unsigned int height,
+                         struct pipe_surface **backbuffer)
+{
+   struct pipe_texture template;
+   struct pipe_texture *tex;
+
+   assert(vpipe);
+
+   if (*backbuffer) {
+      if ((*backbuffer)->width != width || (*backbuffer)->height != height)
+         pipe_surface_reference(backbuffer, NULL);
+      else
+         return true;
+   }
+
+   memset(&template, 0, sizeof(struct pipe_texture));
+   template.target = PIPE_TEXTURE_2D;
+   /* XXX: Needs to match the drawable's format? */
+   template.format = PIPE_FORMAT_X8R8G8B8_UNORM;
+   template.last_level = 0;
+   template.width[0] = width;
+   template.height[0] = height;
+   template.depth[0] = 1;
+   pf_get_block(template.format, &template.block);
+   template.tex_usage = PIPE_TEXTURE_USAGE_DISPLAY_TARGET;
+
+   tex = vpipe->screen->texture_create(vpipe->screen, &template);
+   if (!tex)
+      return false;
+
+   *backbuffer = vpipe->screen->get_tex_surface(vpipe->screen, tex, 0, 0, 0,
+                                                PIPE_BUFFER_USAGE_GPU_READ |
+                                                PIPE_BUFFER_USAGE_GPU_WRITE);
+   pipe_texture_reference(&tex, NULL);
+
+   if (!*backbuffer)
+      return false;
+
+   /* Clear the backbuffer in case the video doesn't cover the whole window */
+   /* FIXME: Need to clear every time a frame moves and leaves dirty rects */
+   vpipe->clear_surface(vpipe, 0, 0, width, height, 0, *backbuffer);
+
+   return true;
+}
+
+static void
+MacroBlocksToPipe(const XvMCMacroBlockArray *xvmc_macroblocks,
+                  const XvMCBlockArray *xvmc_blocks,
+                  unsigned int first_macroblock,
+                  unsigned int num_macroblocks,
+                  struct pipe_mpeg12_macroblock *pipe_macroblocks)
+{
+   unsigned int i, j, k, l;
+   XvMCMacroBlock *xvmc_mb;
+
+   assert(xvmc_macroblocks);
+   assert(xvmc_blocks);
+   assert(pipe_macroblocks);
+   assert(num_macroblocks);
+
+   xvmc_mb = xvmc_macroblocks->macro_blocks + first_macroblock;
+
+   for (i = 0; i < num_macroblocks; ++i) {
+      pipe_macroblocks->base.codec = PIPE_VIDEO_CODEC_MPEG12;
+      pipe_macroblocks->mbx = xvmc_mb->x;
+      pipe_macroblocks->mby = xvmc_mb->y;
+      pipe_macroblocks->mb_type = TypeToPipe(xvmc_mb->macroblock_type);
+      if (pipe_macroblocks->mb_type != PIPE_MPEG12_MACROBLOCK_TYPE_INTRA)
+         pipe_macroblocks->mo_type = MotionToPipe(xvmc_mb->motion_type, xvmc_mb->dct_type);
+      /* Get rid of Valgrind 'undefined' warnings */
+      else
+         pipe_macroblocks->mo_type = -1;
+      pipe_macroblocks->dct_type = xvmc_mb->dct_type == XVMC_DCT_TYPE_FIELD ?
+         PIPE_MPEG12_DCT_TYPE_FIELD : PIPE_MPEG12_DCT_TYPE_FRAME;
+
+      for (j = 0; j < 2; ++j)
+         for (k = 0; k < 2; ++k)
+            for (l = 0; l < 2; ++l)
+               pipe_macroblocks->pmv[j][k][l] = xvmc_mb->PMV[j][k][l];
+
+      pipe_macroblocks->cbp = xvmc_mb->coded_block_pattern;
+      pipe_macroblocks->blocks = xvmc_blocks->blocks + xvmc_mb->index * BLOCK_SIZE_SAMPLES;
+
+      ++pipe_macroblocks;
+      ++xvmc_mb;
+   }
+}
+
+Status XvMCCreateSurface(Display *dpy, XvMCContext *context, XvMCSurface *surface)
+{
+   XvMCContextPrivate *context_priv;
+   struct pipe_video_context *vpipe;
+   XvMCSurfacePrivate *surface_priv;
+   struct pipe_video_surface *vsfc;
+
+   assert(dpy);
+   /* privData is dereferenced below, so reject contexts that lack it. */
+   if (!context || !context->privData)
+      return XvMCBadContext;
+   if (!surface)
+      return XvMCBadSurface;
+
+   context_priv = context->privData;
+   vpipe = context_priv->vpipe;
+
+   surface_priv = CALLOC(1, sizeof(XvMCSurfacePrivate));
+   if (!surface_priv)
+      return BadAlloc;
+
+   vsfc = vpipe->screen->video_surface_create(vpipe->screen, vpipe->chroma_format,
+                                              vpipe->width, vpipe->height);
+   if (!vsfc) {
+      FREE(surface_priv);
+      return BadAlloc;
+   }
+
+   surface_priv->pipe_vsfc = vsfc;
+   surface_priv->context = context;
+
+   surface->surface_id = XAllocID(dpy);
+   surface->context_id = context->context_id;
+   surface->surface_type_id = context->surface_type_id;
+   surface->width = context->width;
+   surface->height = context->height;
+   surface->privData = surface_priv;
+
+   SyncHandle();
+
+   return Success;
+}
+
+Status XvMCRenderSurface(Display *dpy, XvMCContext *context, unsigned int picture_structure,
+                         XvMCSurface *target_surface, XvMCSurface *past_surface, XvMCSurface *future_surface,
+                         unsigned int flags, unsigned int num_macroblocks, unsigned int first_macroblock,
+                         XvMCMacroBlockArray *macroblocks, XvMCBlockArray *blocks
+)
+{
+   struct pipe_video_context *vpipe;
+   struct pipe_surface *t_vsfc;
+   struct pipe_surface *p_vsfc;
+   struct pipe_surface *f_vsfc;
+   XvMCContextPrivate *context_priv;
+   XvMCSurfacePrivate *target_surface_priv;
+   XvMCSurfacePrivate *past_surface_priv;
+   XvMCSurfacePrivate *future_surface_priv;
+   struct pipe_mpeg12_macroblock pipe_macroblocks[num_macroblocks];
+
+   assert(dpy);
+
+   if (!context || !context->privData)
+      return XvMCBadContext;
+   if (!target_surface || !target_surface->privData)
+      return XvMCBadSurface;
+
+   if (picture_structure != XVMC_TOP_FIELD &&
+       picture_structure != XVMC_BOTTOM_FIELD &&
+       picture_structure != XVMC_FRAME_PICTURE)
+      return BadValue;
+   /* Bkwd pred equivalent to fwd (past && !future) */
+   if (future_surface && !past_surface)
+      return BadMatch;
+
+   assert(context->context_id == target_surface->context_id);
+   assert(!past_surface || context->context_id == past_surface->context_id);
+   assert(!future_surface || context->context_id == future_surface->context_id);
+
+   assert(macroblocks);
+   assert(blocks);
+
+   assert(macroblocks->context_id == context->context_id);
+   assert(blocks->context_id == context->context_id);
+
+   assert(flags == 0 || flags == XVMC_SECOND_FIELD);
+
+   target_surface_priv = target_surface->privData;
+   past_surface_priv = past_surface ? past_surface->privData : NULL;
+   future_surface_priv = future_surface ? future_surface->privData : NULL;
+
+   assert(target_surface_priv->context == context);
+   assert(!past_surface || past_surface_priv->context == context);
+   assert(!future_surface || future_surface_priv->context == context);
+
+   context_priv = context->privData;
+   vpipe = context_priv->vpipe;
+
+   t_vsfc = target_surface_priv->pipe_vsfc;
+   p_vsfc = past_surface ? past_surface_priv->pipe_vsfc : NULL;
+   f_vsfc = future_surface ? future_surface_priv->pipe_vsfc : NULL;
+
+   MacroBlocksToPipe(macroblocks, blocks, first_macroblock,
+                     num_macroblocks, pipe_macroblocks);
+
+   vpipe->set_decode_target(vpipe, t_vsfc);
+   vpipe->decode_macroblocks(vpipe, p_vsfc, f_vsfc, num_macroblocks,
+                             &pipe_macroblocks->base, target_surface_priv->render_fence);
+
+   return Success;
+}
+
+Status XvMCFlushSurface(Display *dpy, XvMCSurface *surface)
+{
+   assert(dpy);
+
+   if (!surface)
+      return XvMCBadSurface;
+
+   return Success;
+}
+
+Status XvMCSyncSurface(Display *dpy, XvMCSurface *surface)
+{
+   assert(dpy);
+
+   if (!surface)
+      return XvMCBadSurface;
+
+   return Success;
+}
+
+Status XvMCPutSurface(Display *dpy, XvMCSurface *surface, Drawable drawable,
+                      short srcx, short srcy, unsigned short srcw, unsigned short srch,
+                      short destx, short desty, unsigned short destw, unsigned short desth,
+                      int flags)
+{
+   Window root;
+   int x, y;
+   unsigned int width, height;
+   unsigned int border_width;
+   unsigned int depth;
+   struct pipe_video_context *vpipe;
+   XvMCSurfacePrivate *surface_priv;
+   XvMCContextPrivate *context_priv;
+   XvMCContext *context;
+   struct pipe_video_rect src_rect = {srcx, srcy, srcw, srch};
+   struct pipe_video_rect dst_rect = {destx, desty, destw, desth};
+
+   assert(dpy);
+
+   if (!surface || !surface->privData)
+      return XvMCBadSurface;
+   /* XGetGeometry() returns a zero Status on failure, not an X error code. */
+   if (!XGetGeometry(dpy, drawable, &root, &x, &y, &width, &height, &border_width, &depth))
+      return BadDrawable;
+
+   assert(flags == XVMC_TOP_FIELD || flags == XVMC_BOTTOM_FIELD || flags == XVMC_FRAME_PICTURE);
+   assert(srcx + srcw - 1 < surface->width);
+   assert(srcy + srch - 1 < surface->height);
+   /*
+    * Some apps (mplayer) hit these asserts because they call
+    * this function after the window has been resized by the WM
+    * but before they've handled the corresponding XEvent and
+    * know about the new dimensions. The output should be clipped
+    * until the app updates destw and desth.
+    */
+   /*
+   assert(destx + destw - 1 < width);
+   assert(desty + desth - 1 < height);
+    */
+
+   surface_priv = surface->privData;
+   context = surface_priv->context;
+   context_priv = context->privData;
+   vpipe = context_priv->vpipe;
+
+   if (!CreateOrResizeBackBuffer(vpipe, width, height, &context_priv->backbuffer))
+      return BadAlloc;
+   /* Render into the drawable-sized backbuffer, then flush it to the drawable. */
+   vpipe->render_picture(vpipe, surface_priv->pipe_vsfc, PictureToPipe(flags), &src_rect,
+                         context_priv->backbuffer, &dst_rect, surface_priv->disp_fence);
+
+   vl_video_bind_drawable(vpipe, drawable);
+
+   vpipe->screen->flush_frontbuffer
+   (
+      vpipe->screen,
+      context_priv->backbuffer,
+      vpipe->priv
+   );
+
+   return Success;
+}
+
+Status XvMCGetSurfaceStatus(Display *dpy, XvMCSurface *surface, int *status)
+{
+   assert(dpy);
+
+   if (!surface)
+      return XvMCBadSurface;
+
+   assert(status);
+
+   *status = 0;
+
+   return Success;
+}
+
+Status XvMCDestroySurface(Display *dpy, XvMCSurface *surface)
+{
+   XvMCSurfacePrivate *surface_priv;
+
+   assert(dpy);
+
+   if (!surface || !surface->privData)
+      return XvMCBadSurface;
+
+   surface_priv = surface->privData;
+   pipe_video_surface_reference(&surface_priv->pipe_vsfc, NULL);
+   FREE(surface_priv);
+   surface->privData = NULL; /* a repeated destroy now returns XvMCBadSurface */
+
+   return Success;
+}
+
+Status XvMCHideSurface(Display *dpy, XvMCSurface *surface)
+{
+   assert(dpy);
+
+   if (!surface || !surface->privData)
+      return XvMCBadSurface;
+
+   /* No op, only for overlaid rendering */
+
+   return Success;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/.gitignore b/src/gallium/state_trackers/xorg/xvmc/tests/.gitignore
new file mode 100644
index 00000000000..e1d2f9023df
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/.gitignore
@@ -0,0 +1,5 @@
+test_context
+test_surface
+test_blocks
+test_rendering
+xvmc_bench
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/Makefile b/src/gallium/state_trackers/xorg/xvmc/tests/Makefile
new file mode 100644
index 00000000000..c875dd76058
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/Makefile
@@ -0,0 +1,28 @@
+TOP = ../../../../../..
+include $(TOP)/configs/current
+# Libraries every program below is linked against.
+LIBS = -lXvMCW -lXvMC -lXv -lX11
+
+#############################################
+
+.PHONY: default clean
+# `default` builds all five programs.
+default: test_context test_surface test_blocks test_rendering xvmc_bench
+# Each program links its own object file together with the shared testlib.o.
+test_context: test_context.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+test_surface: test_surface.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+test_blocks: test_blocks.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+test_rendering: test_rendering.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+
+xvmc_bench: xvmc_bench.o testlib.o
+	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
+# Removes the object files and the five programs.
+clean:
+	$(RM) -rf *.o test_context test_surface test_blocks test_rendering xvmc_bench
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/test_blocks.c b/src/gallium/state_trackers/xorg/xvmc/tests/test_blocks.c
new file mode 100644
index 00000000000..994e3ca4d14
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/test_blocks.c
@@ -0,0 +1,111 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <assert.h>
+#include <error.h>
+#include "testlib.h"
+
+int main(int argc, char **argv)
+{
+	/* Exercises XvMCCreateBlocks and XvMCCreateMacroBlocks with bad and good arguments, then destroys both arrays. */
+	const unsigned int	width = 16, height = 16;
+	const unsigned int	min_required_blocks = 1, min_required_macroblocks = 1;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+
+	Display			*display;
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	int			colorkey;
+	XvMCContext		context;
+	XvMCSurface		surface;
+	XvMCBlockArray		blocks = {0};
+	XvMCMacroBlockArray	macroblocks = {0};
+
+	/*
+	 * XOpenDisplay() returns NULL when no X server can be reached; bail out
+	 * here rather than handing a NULL display to GetPort().
+	 */
+	display = XOpenDisplay(NULL);
+	if (!display)
+		error(1, 0, "Error, unable to open display.\n");
+
+	/* Grab a port that supports one of mc_types. */
+	if (!GetPort
+	(
+		display, width, height, XVMC_CHROMA_FORMAT_420, mc_types, 2,
+		&port_num, &surface_type_id, &is_overlay, &intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	/* NOTE: every XvMC call below sits inside assert(), so an NDEBUG build would compile them out. */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, XVMC_DIRECT, &context) == Success);
+	assert(XvMCCreateSurface(display, &context, &surface) == Success);
+
+	/* Test NULL context */
+	assert(XvMCCreateBlocks(display, NULL, 1, &blocks) == XvMCBadContext);
+	/* Test 0 blocks */
+	assert(XvMCCreateBlocks(display, &context, 0, &blocks) == BadValue);
+	/* Test valid params */
+	assert(XvMCCreateBlocks(display, &context, min_required_blocks, &blocks) == Success);
+	/* Test context id assigned and correct */
+	assert(blocks.context_id == context.context_id);
+	/* Test number of blocks assigned and correct */
+	assert(blocks.num_blocks == min_required_blocks);
+	/* Test block pointer valid */
+	assert(blocks.blocks != NULL);
+	/* Test NULL context */
+	assert(XvMCCreateMacroBlocks(display, NULL, 1, &macroblocks) == XvMCBadContext);
+	/* Test 0 macroblocks */
+	assert(XvMCCreateMacroBlocks(display, &context, 0, &macroblocks) == BadValue);
+	/* Test valid params */
+	assert(XvMCCreateMacroBlocks(display, &context, min_required_macroblocks, &macroblocks) == Success);
+	/* Test context id assigned and correct */
+	assert(macroblocks.context_id == context.context_id);
+	/* Test macroblock pointer valid */
+	assert(macroblocks.macro_blocks != NULL);
+	/* Test valid params */
+	assert(XvMCDestroyMacroBlocks(display, &macroblocks) == Success);
+	/* Test valid params */
+	assert(XvMCDestroyBlocks(display, &blocks) == Success);
+
+	assert(XvMCDestroySurface(display, &surface) == Success);
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XCloseDisplay(display);
+	return 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/test_context.c b/src/gallium/state_trackers/xorg/xvmc/tests/test_context.c
new file mode 100644
index 00000000000..3da957c9330
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/test_context.c
@@ -0,0 +1,119 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <assert.h>
+#include <error.h>
+#include "testlib.h"
+
+int main(int argc, char **argv)
+{
+	/* Exercises XvMCCreateContext and XvMCDestroyContext with bad and good arguments. */
+	const unsigned int	width = 16, height = 16;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+
+	Display			*display;
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	int			colorkey;
+	XvMCContext		context = {0};
+
+	/*
+	 * XOpenDisplay() returns NULL when no X server can be reached; bail out
+	 * here rather than handing a NULL display to GetPort().
+	 */
+	display = XOpenDisplay(NULL);
+	if (!display)
+		error(1, 0, "Error, unable to open display.\n");
+
+	/* Grab a port that supports one of mc_types. */
+	if (!GetPort
+	(
+		display, width, height, XVMC_CHROMA_FORMAT_420, mc_types, 2,
+		&port_num, &surface_type_id, &is_overlay, &intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	/* NOTE: every XvMC call below sits inside assert(), so an NDEBUG build would compile them out. */
+	/* Test NULL context */
+	/* XXX: XvMCBadContext not a valid return for XvMCCreateContext in the XvMC API, but openChrome driver returns it */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, XVMC_DIRECT, NULL) == XvMCBadContext);
+	/* Test invalid port */
+	/* XXX: Success and XvBadPort have the same value, if this call actually gets past the validation step as of now we'll crash later */
+	assert(XvMCCreateContext(display, -1, surface_type_id, width, height, XVMC_DIRECT, &context) == XvBadPort);
+	/* Test invalid surface */
+	assert(XvMCCreateContext(display, port_num, -1, width, height, XVMC_DIRECT, &context) == BadMatch);
+	/* Test invalid flags */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, -1, &context) == BadValue);
+	/* Test huge width */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, 16384, height, XVMC_DIRECT, &context) == BadValue);
+	/* Test huge height */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, 16384, XVMC_DIRECT, &context) == BadValue);
+	/* Test huge width & height */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, 16384, 16384, XVMC_DIRECT, &context) == BadValue);
+	/* Test valid params */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, XVMC_DIRECT, &context) == Success);
+	/* Test context id assigned */
+	assert(context.context_id != 0);
+	/* Test surface type id assigned and correct */
+	assert(context.surface_type_id == surface_type_id);
+	/* Test width & height assigned and correct */
+	assert(context.width == width && context.height == height);
+	/* Test port assigned and correct */
+	assert(context.port == port_num);
+	/* Test flags assigned and correct */
+	assert(context.flags == XVMC_DIRECT);
+	/* Test NULL context */
+	assert(XvMCDestroyContext(display, NULL) == XvMCBadContext);
+	/* Test valid params */
+	assert(XvMCDestroyContext(display, &context) == Success);
+	/* Test awkward but valid width */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width + 1, height, XVMC_DIRECT, &context) == Success);
+	assert(context.width >= width + 1);
+	assert(XvMCDestroyContext(display, &context) == Success);
+	/* Test awkward but valid height */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height + 1, XVMC_DIRECT, &context) == Success);
+	assert(context.height >= height + 1);
+	assert(XvMCDestroyContext(display, &context) == Success);
+	/* Test awkward but valid width & height */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width + 1, height + 1, XVMC_DIRECT, &context) == Success);
+	assert(context.width >= width + 1 && context.height >= height + 1);
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XCloseDisplay(display);
+	return 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/test_rendering.c b/src/gallium/state_trackers/xorg/xvmc/tests/test_rendering.c
new file mode 100644
index 00000000000..6058783a798
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/test_rendering.c
@@ -0,0 +1,317 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <error.h>
+#include "testlib.h"
+
+#define BLOCK_WIDTH			8
+#define BLOCK_HEIGHT			8
+#define BLOCK_SIZE			(BLOCK_WIDTH * BLOCK_HEIGHT)
+#define MACROBLOCK_WIDTH		16
+#define MACROBLOCK_HEIGHT		16
+#define MACROBLOCK_WIDTH_IN_BLOCKS	(MACROBLOCK_WIDTH / BLOCK_WIDTH)
+#define MACROBLOCK_HEIGHT_IN_BLOCKS	(MACROBLOCK_HEIGHT / BLOCK_HEIGHT)
+#define BLOCKS_PER_MACROBLOCK		6
+
+#define INPUT_WIDTH			16
+#define INPUT_HEIGHT			16
+#define INPUT_WIDTH_IN_MACROBLOCKS	(INPUT_WIDTH / MACROBLOCK_WIDTH)
+#define INPUT_HEIGHT_IN_MACROBLOCKS	(INPUT_HEIGHT / MACROBLOCK_HEIGHT)
+#define NUM_MACROBLOCKS			(INPUT_WIDTH_IN_MACROBLOCKS * INPUT_HEIGHT_IN_MACROBLOCKS)
+
+#define DEFAULT_OUTPUT_WIDTH		INPUT_WIDTH
+#define DEFAULT_OUTPUT_HEIGHT		INPUT_HEIGHT
+#define DEFAULT_ACCEPTABLE_ERR		0.01
+
+void ParseArgs(int argc, char **argv, unsigned int *output_width, unsigned int *output_height, double *acceptable_error, int *prompt);
+void Gradient(short *block, unsigned int start, unsigned int stop, int horizontal);
+
+/*
+ * Fill the outputs with their defaults, then apply -w, -h, -e and -n from
+ * argv. An unknown option, a missing or unparsable option value prints the
+ * usage text through error() and exits with status 1 (error() does not
+ * return when its status argument is nonzero).
+ */
+void ParseArgs(int argc, char **argv, unsigned int *output_width, unsigned int *output_height, double *acceptable_error, int *prompt)
+{
+	int fail = 0;
+	int i;
+
+	*output_width = DEFAULT_OUTPUT_WIDTH;
+	*output_height = DEFAULT_OUTPUT_HEIGHT;
+	*acceptable_error = DEFAULT_ACCEPTABLE_ERR;
+	*prompt = 1;
+
+	for (i = 1; i < argc && !fail; ++i)
+	{
+		/* Options taking a value need one more argument; argv[argc] is NULL. */
+		const int has_value = i + 1 < argc;
+
+		if (!strcmp(argv[i], "-w"))
+			fail = !has_value || sscanf(argv[++i], "%u", output_width) != 1;
+		else if (!strcmp(argv[i], "-h"))
+			fail = !has_value || sscanf(argv[++i], "%u", output_height) != 1;
+		else if (!strcmp(argv[i], "-e"))
+			fail = !has_value || sscanf(argv[++i], "%lf", acceptable_error) != 1;
+		else if (!strcmp(argv[i], "-n"))
+			*prompt = 0;
+		else
+			fail = 1;
+	}
+
+	if (fail)
+		error
+		(
+			1, 0,
+			"Bad argument.\n"
+			"\n"
+			"Usage: %s [options]\n"
+			"\t-w <width>\tOutput width\n"
+			"\t-h <height>\tOutput height\n"
+			"\t-e <error>\tAcceptable margin of error per pixel, from 0 to 1\n"
+			"\t-n\tDon't prompt for quit\n",
+			argv[0]
+		);
+}
+
+/* Fill one block with a ramp from start (first column/row) to stop (last);
+ * the ramp runs across columns when horizontal is nonzero, else down rows. */
+void Gradient(short *block, unsigned int start, unsigned int stop, int horizontal)
+{
+	const unsigned int span = stop - start;
+	const unsigned int last = horizontal ? BLOCK_WIDTH - 1 : BLOCK_HEIGHT - 1;
+	unsigned int row, col;
+
+	for (row = 0; row < BLOCK_HEIGHT; ++row)
+	{
+		for (col = 0; col < BLOCK_WIDTH; ++col)
+		{
+			const unsigned int pos = horizontal ? col : row;
+
+			block[row * BLOCK_WIDTH + col] = (short)(start + span * (pos / (float)last));
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	/* Renders one intra-coded frame of gradients through XvMC and shows it until a key is pressed. */
+	unsigned int		output_width;
+	unsigned int		output_height;
+	double			acceptable_error;
+	int			prompt;
+	Display			*display;
+	Window			root, window;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	/* Only overlay ports supply a colorkey; otherwise the window background pixel is 0. */
+	int			colorkey = 0;
+	XvMCContext		context;
+	XvMCSurface		surface;
+	XvMCBlockArray		block_array;
+	XvMCMacroBlockArray	mb_array;
+	int			mbx, mby, bx, by;
+	XvMCMacroBlock		*mb;
+	short			*blocks;
+	int			quit = 0;
+
+	ParseArgs(argc, argv, &output_width, &output_height, &acceptable_error, &prompt);
+
+	/*
+	 * XOpenDisplay() returns NULL when no X server can be reached; bail out
+	 * here rather than handing a NULL display to GetPort().
+	 */
+	display = XOpenDisplay(NULL);
+	if (!display)
+		error(1, 0, "Error, unable to open display.\n");
+
+	if (!GetPort
+	(
+		display, INPUT_WIDTH, INPUT_HEIGHT, XVMC_CHROMA_FORMAT_420, mc_types, 2,
+		&port_num, &surface_type_id, &is_overlay, &intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	root = XDefaultRootWindow(display);
+	window = XCreateSimpleWindow(display, root, 0, 0, output_width, output_height, 0, 0, colorkey);
+
+	/* NOTE: every XvMC call below sits inside assert(), so an NDEBUG build would compile them out. */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, INPUT_WIDTH, INPUT_HEIGHT, XVMC_DIRECT, &context) == Success);
+	assert(XvMCCreateSurface(display, &context, &surface) == Success);
+	assert(XvMCCreateBlocks(display, &context, NUM_MACROBLOCKS * BLOCKS_PER_MACROBLOCK, &block_array) == Success);
+	assert(XvMCCreateMacroBlocks(display, &context, NUM_MACROBLOCKS, &mb_array) == Success);
+
+	mb = mb_array.macro_blocks;
+	blocks = block_array.blocks;
+
+	/* Describe every macroblock as intra-coded and fill its BLOCKS_PER_MACROBLOCK blocks with gradients. */
+	for (mby = 0; mby < INPUT_HEIGHT_IN_MACROBLOCKS; ++mby)
+		for (mbx = 0; mbx < INPUT_WIDTH_IN_MACROBLOCKS; ++mbx)
+		{
+			mb->x = mbx;
+			mb->y = mby;
+			mb->macroblock_type = XVMC_MB_TYPE_INTRA;
+			/*mb->motion_type = ;*/
+			/*mb->motion_vertical_field_select = ;*/
+			mb->dct_type = XVMC_DCT_TYPE_FRAME;
+			/*mb->PMV[0][0][0] = ;
+			mb->PMV[0][0][1] = ;
+			mb->PMV[0][1][0] = ;
+			mb->PMV[0][1][1] = ;
+			mb->PMV[1][0][0] = ;
+			mb->PMV[1][0][1] = ;
+			mb->PMV[1][1][0] = ;
+			mb->PMV[1][1][1] = ;*/
+			mb->index = (mby * INPUT_WIDTH_IN_MACROBLOCKS + mbx) * BLOCKS_PER_MACROBLOCK;
+			mb->coded_block_pattern = 0x3F;
+
+			mb++;
+
+			for (by = 0; by < MACROBLOCK_HEIGHT_IN_BLOCKS; ++by)
+				for (bx = 0; bx < MACROBLOCK_WIDTH_IN_BLOCKS; ++bx)
+				{
+					const int start = 16, stop = 235, range = stop - start;
+
+					Gradient
+					(
+						blocks,
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH) / (float)(INPUT_WIDTH - 1))),
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH + BLOCK_WIDTH - 1) / (float)(INPUT_WIDTH - 1))),
+						1
+					);
+
+					blocks += BLOCK_SIZE;
+				}
+
+			for (by = 0; by < MACROBLOCK_HEIGHT_IN_BLOCKS / 2; ++by)
+				for (bx = 0; bx < MACROBLOCK_WIDTH_IN_BLOCKS / 2; ++bx)
+				{
+					const int start = 16, stop = 240, range = stop - start;
+
+					Gradient
+					(
+						blocks,
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH) / (float)(INPUT_WIDTH - 1))),
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH + BLOCK_WIDTH - 1) / (float)(INPUT_WIDTH - 1))),
+						1
+					);
+
+					blocks += BLOCK_SIZE;
+
+					Gradient
+					(
+						blocks,
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH) / (float)(INPUT_WIDTH - 1))),
+						(short)(start + range * ((mbx * MACROBLOCK_WIDTH + bx * BLOCK_WIDTH + BLOCK_WIDTH - 1) / (float)(INPUT_WIDTH - 1))),
+						1
+					);
+
+					blocks += BLOCK_SIZE;
+				}
+		}
+
+	XSelectInput(display, window, ExposureMask | KeyPressMask);
+	XMapWindow(display, window);
+	XSync(display, 0);
+
+	/* Test NULL context */
+	assert(XvMCRenderSurface(display, NULL, XVMC_FRAME_PICTURE, &surface, NULL, NULL, 0, NUM_MACROBLOCKS, 0, &mb_array, &block_array) == XvMCBadContext);
+	/* Test NULL surface */
+	assert(XvMCRenderSurface(display, &context, XVMC_FRAME_PICTURE, NULL, NULL, NULL, 0, NUM_MACROBLOCKS, 0, &mb_array, &block_array) == XvMCBadSurface);
+	/* Test bad picture structure */
+	assert(XvMCRenderSurface(display, &context, 0, &surface, NULL, NULL, 0, NUM_MACROBLOCKS, 0, &mb_array, &block_array) == BadValue);
+	/* Test valid params */
+	assert(XvMCRenderSurface(display, &context, XVMC_FRAME_PICTURE, &surface, NULL, NULL, 0, NUM_MACROBLOCKS, 0, &mb_array, &block_array) == Success);
+
+	/* Test NULL surface */
+	assert(XvMCPutSurface(display, NULL, window, 0, 0, INPUT_WIDTH, INPUT_HEIGHT, 0, 0, output_width, output_height, XVMC_FRAME_PICTURE) == XvMCBadSurface);
+	/* Test bad window */
+	/* XXX: X halts with a bad drawable for some reason, doesn't return BadDrawable as expected */
+	/*assert(XvMCPutSurface(display, &surface, 0, 0, 0, width, height, 0, 0, width, height, XVMC_FRAME_PICTURE) == BadDrawable);*/
+
+	if (prompt)
+	{
+		puts("Press any button to quit...");
+
+		/* XNextEvent() blocks until an event arrives, so there is no need to poll XPending(). */
+		while (!quit)
+		{
+			XEvent event;
+
+			XNextEvent(display, &event);
+
+			switch (event.type)
+			{
+				case Expose:
+				{
+					/* Test valid params */
+					assert
+					(
+						XvMCPutSurface
+						(
+							display, &surface, window,
+							0, 0, INPUT_WIDTH, INPUT_HEIGHT,
+							0, 0, output_width, output_height,
+							XVMC_FRAME_PICTURE
+						) == Success
+					);
+					break;
+				}
+				case KeyPress:
+				{
+					quit = 1;
+					break;
+				}
+			}
+		}
+	}
+
+	assert(XvMCDestroyBlocks(display, &block_array) == Success);
+	assert(XvMCDestroyMacroBlocks(display, &mb_array) == Success);
+	assert(XvMCDestroySurface(display, &surface) == Success);
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XDestroyWindow(display, window);
+	XCloseDisplay(display);
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/test_surface.c b/src/gallium/state_trackers/xorg/xvmc/tests/test_surface.c
new file mode 100644
index 00000000000..b65eb265c0a
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/test_surface.c
@@ -0,0 +1,98 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <assert.h>
+#include <error.h>
+#include "testlib.h"
+
+int main(int argc, char **argv)
+{
+	/* Exercises XvMCCreateSurface and XvMCDestroySurface with bad and good arguments. */
+	const unsigned int	width = 16, height = 16;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+
+	Display			*display;
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	int			colorkey;
+	XvMCContext		context;
+	XvMCSurface		surface = {0};
+
+	/*
+	 * XOpenDisplay() returns NULL when no X server can be reached; bail out
+	 * here rather than handing a NULL display to GetPort().
+	 */
+	display = XOpenDisplay(NULL);
+	if (!display)
+		error(1, 0, "Error, unable to open display.\n");
+
+	/* Grab a port that supports one of mc_types. */
+	if (!GetPort
+	(
+		display, width, height, XVMC_CHROMA_FORMAT_420, mc_types, 2,
+		&port_num, &surface_type_id, &is_overlay, &intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	/* NOTE: every XvMC call below sits inside assert(), so an NDEBUG build would compile them out. */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, width, height, XVMC_DIRECT, &context) == Success);
+
+	/* Test NULL context */
+	assert(XvMCCreateSurface(display, NULL, &surface) == XvMCBadContext);
+	/* Test NULL surface */
+	assert(XvMCCreateSurface(display, &context, NULL) == XvMCBadSurface);
+	/* Test valid params */
+	assert(XvMCCreateSurface(display, &context, &surface) == Success);
+	/* Test surface id assigned */
+	assert(surface.surface_id != 0);
+	/* Test context id assigned and correct */
+	assert(surface.context_id == context.context_id);
+	/* Test surface type id assigned and correct */
+	assert(surface.surface_type_id == surface_type_id);
+	/* Test width & height assigned and correct */
+	assert(surface.width == width && surface.height == height);
+	/* Test valid params */
+	assert(XvMCDestroySurface(display, &surface) == Success);
+	/* Test NULL surface */
+	assert(XvMCDestroySurface(display, NULL) == XvMCBadSurface);
+
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XCloseDisplay(display);
+	return 0;
+}
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/testlib.c b/src/gallium/state_trackers/xorg/xvmc/tests/testlib.c
new file mode 100644
index 00000000000..142c09bb590
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/testlib.c
@@ -0,0 +1,146 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "testlib.h"
+#include <stdio.h>
+
+/*
+void test(int pred, const char *pred_string, const char *doc_string, const char *file, unsigned int line)
+{
+	fputs(doc_string, stderr);
+	if (!pred)
+		fprintf(stderr, " FAIL!\n\t\"%s\" at %s:%u\n", pred_string, file, line);
+	else
+		fputs(" PASS!\n", stderr);
+}
+*/
+
+/* Grab the first port whose adaptor lists a surface type matching chroma_format, the size and
+ * one of mc_types; returns 1 and fills the OUT parameters on success, 0 otherwise. */
+int GetPort
+(
+	Display *display,
+	unsigned int width,
+	unsigned int height,
+	unsigned int chroma_format,
+	const unsigned int *mc_types,
+	unsigned int num_mc_types,
+	XvPortID *port_id,
+	int *surface_type_id,
+	unsigned int *is_overlay,
+	unsigned int *intra_unsigned
+)
+{
+	XvAdaptorInfo	*adaptors;
+	unsigned int	adaptor_count;
+	unsigned int	grabbed = 0;
+	int		type_count;
+	int		event_base, error_base;
+	unsigned int	a, t, m, p;
+
+	if (!XvMCQueryExtension(display, &event_base, &error_base))
+		return 0;
+	if (XvQueryAdaptors(display, XDefaultRootWindow(display), &adaptor_count, &adaptors) != Success)
+		return 0;
+
+	for (a = 0; a < adaptor_count && !grabbed; ++a)
+	{
+		XvMCSurfaceInfo *types;
+
+		if (!(adaptors[a].type & XvImageMask))
+			continue;
+
+		types = XvMCListSurfaceTypes(display, adaptors[a].base_id, &type_count);
+		if (!types)
+			continue;
+
+		for (t = 0; t < type_count && !grabbed; ++t)
+		{
+			const XvMCSurfaceInfo *info = &types[t];
+
+			if (info->chroma_format != chroma_format || info->max_width < width || info->max_height < height)
+				continue;
+
+			for (m = 0; m < num_mc_types && !grabbed; ++m)
+			{
+				if (info->mc_type != mc_types[m])
+					continue;
+
+				for (p = 0; p < adaptors[a].num_ports && !grabbed; ++p)
+				{
+					if (XvGrabPort(display, adaptors[a].base_id + p, CurrentTime) != Success)
+						continue;
+
+					*port_id = adaptors[a].base_id + p;
+					*surface_type_id = info->surface_type_id;
+					*is_overlay = info->flags & XVMC_OVERLAID_SURFACE;
+					*intra_unsigned = info->flags & XVMC_INTRA_UNSIGNED;
+					grabbed = 1;
+				}
+			}
+		}
+
+		XFree(types);
+	}
+
+	XvFreeAdaptorInfo(adaptors);
+
+	return grabbed;
+}
+
+unsigned int align(unsigned int value, unsigned int alignment)
+{
+	return (value + (alignment - 1)) & (0u - alignment);	/* rounds value up; exact only for power-of-two alignment */
+}
+
+/* From the glibc manual */
+int timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y)
+{
+	enum { USEC_PER_SEC = 1000000 };
+
+	/* Borrow whole seconds into y so that x->tv_usec - y->tv_usec is not negative. */
+	if (y->tv_usec > x->tv_usec)
+	{
+		int borrow = (y->tv_usec - x->tv_usec) / USEC_PER_SEC + 1;
+
+		y->tv_usec -= USEC_PER_SEC * borrow;
+		y->tv_sec += borrow;
+	}
+	/* Hand back whole seconds when the microsecond gap exceeds one second. */
+	if (x->tv_usec - y->tv_usec > USEC_PER_SEC)
+	{
+		int excess = (x->tv_usec - y->tv_usec) / USEC_PER_SEC;
+
+		y->tv_usec += USEC_PER_SEC * excess;
+		y->tv_sec -= excess;
+	}
+
+	/* y has been adjusted in place (the caller sees it modified); subtract field by field. */
+	result->tv_sec = x->tv_sec - y->tv_sec;
+	result->tv_usec = x->tv_usec - y->tv_usec;
+	return x->tv_sec < y->tv_sec;	/* 1 when the difference is negative */
+}
diff --git a/src/gallium/drivers/i965simple/brw_flush.c b/src/gallium/state_trackers/xorg/xvmc/tests/testlib.h
index e6001c30d94..0438e52928b 100644
--- a/src/gallium/drivers/i965simple/brw_flush.c
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/testlib.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Younes Manton.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,49 +25,45 @@
  * 
  **************************************************************************/
 
-/* Author:
- *    Keith Whitwell <keith@tungstengraphics.com>
- */
-
-
-#include "pipe/p_defines.h"
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_batch.h"
-
+#ifndef testlib_h
+#define testlib_h
 
-static void brw_flush( struct pipe_context *pipe,
-                       unsigned flags,
-                       struct pipe_fence_handle **fence )
-{
-   struct brw_context *brw = brw_context(pipe);
+/*
+#define TEST(pred, doc)	test(pred, #pred, doc, __FILE__, __LINE__)
 
-   /* Do we need to emit an MI_FLUSH command to flush the hardware
-    * caches?
-    */
-   if (flags & (PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_TEXTURE_CACHE)) {
-      struct brw_mi_flush flush;
+void test(int pred, const char *pred_string, const char *doc_string, const char *file, unsigned int line);
+*/
 
-      memset(&flush, 0, sizeof(flush));      
-      flush.opcode = CMD_MI_FLUSH;
+#include <sys/time.h>
+#include <X11/Xlib.h>
+#include <X11/extensions/XvMClib.h>
 
-      if (!(flags & PIPE_FLUSH_RENDER_CACHE))
-	 flush.flags |= BRW_INHIBIT_FLUSH_RENDER_CACHE;
-
-      if (flags & PIPE_FLUSH_TEXTURE_CACHE)
-	 flush.flags |= BRW_FLUSH_READ_CACHE;
-
-      BRW_BATCH_STRUCT(brw, &flush);
-   }
-
-   /* If there are no flags, just flush pending commands to hardware:
-    */
-   FLUSH_BATCH( fence );
-}
+/*
+ * display: IN			A valid X display
+ * width, height: IN		Surface size that the port must display
+ * chroma_format: IN		Chroma format that the port must display
+ * mc_types, num_mc_types: IN	List of acceptable MC types; the first port that can be grabbed on an adaptor offering a surface type with any one of them is returned
+ * port_id: OUT			Your port's ID
+ * surface_type_id: OUT		Your port's surface ID
+ * is_overlay: OUT		Nonzero if the port uses overlay surfaces, in which case you need to set a colorkey
+ * intra_unsigned: OUT		Nonzero if the port uses unsigned values for intra-coded blocks
+ */
+int GetPort
+(
+	Display *display,
+	unsigned int width,
+	unsigned int height,
+	unsigned int chroma_format,
+	const unsigned int *mc_types,
+	unsigned int num_mc_types,
+	XvPortID *port_id,
+	int *surface_type_id,
+	unsigned int *is_overlay,
+	unsigned int *intra_unsigned
+);
 
+unsigned int align(unsigned int value, unsigned int alignment);
 
+int timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y);
 
-void brw_init_flush_functions( struct brw_context *brw )
-{
-   brw->pipe.flush = brw_flush;
-}
+#endif
diff --git a/src/gallium/state_trackers/xorg/xvmc/tests/xvmc_bench.c b/src/gallium/state_trackers/xorg/xvmc/tests/xvmc_bench.c
new file mode 100644
index 00000000000..bf94d856234
--- /dev/null
+++ b/src/gallium/state_trackers/xorg/xvmc/tests/xvmc_bench.c
@@ -0,0 +1,300 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <error.h>
+#include <sys/time.h>
+#include "testlib.h"
+
+#define MACROBLOCK_WIDTH		16
+#define MACROBLOCK_HEIGHT		16
+#define BLOCKS_PER_MACROBLOCK		6
+
+#define DEFAULT_INPUT_WIDTH		720
+#define DEFAULT_INPUT_HEIGHT		480
+#define DEFAULT_REPS			100
+
+#define PIPELINE_STEP_MC		1
+#define PIPELINE_STEP_CSC		2
+#define PIPELINE_STEP_SWAP		4
+
+#define MB_TYPE_I			1
+#define MB_TYPE_P			2
+#define MB_TYPE_B			4
+
+struct Config
+{
+	unsigned int input_width;	/* source picture width; defaults to DEFAULT_INPUT_WIDTH */
+	unsigned int input_height;	/* source picture height; defaults to DEFAULT_INPUT_HEIGHT */
+	unsigned int output_width;	/* output/window width; 0 is replaced by input_width in ParseArgs */
+	unsigned int output_height;	/* output/window height; 0 is replaced by input_height in ParseArgs */
+	unsigned int pipeline;		/* bitmask of PIPELINE_STEP_* flags; 0 is replaced by all steps */
+	unsigned int mb_types;		/* bitmask of MB_TYPE_* flags; 0 is replaced by all types */
+	unsigned int reps;		/* number of iterations of the timed loop in main */
+};
+
+void ParseArgs(int argc, char **argv, struct Config *config);
+/* Fill *config from argv (defaults first); a bad or incomplete option prints the usage text and exits via error(). */
+void ParseArgs(int argc, char **argv, struct Config *config)
+{
+	int fail = 0;
+	int i;
+
+	config->input_width = DEFAULT_INPUT_WIDTH;
+	config->input_height = DEFAULT_INPUT_HEIGHT;
+	config->output_width = 0;	/* 0 = "same as input", resolved after parsing */
+	config->output_height = 0;
+	config->pipeline = 0;		/* 0 = "all steps", resolved after parsing */
+	config->mb_types = 0;		/* 0 = "all types", resolved after parsing */
+	config->reps = DEFAULT_REPS;
+
+	for (i = 1; i < argc && !fail; ++i)
+	{
+		if (!strcmp(argv[i], "-iw"))
+		{
+			if (++i >= argc || sscanf(argv[i], "%u", &config->input_width) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-ih"))
+		{
+			if (++i >= argc || sscanf(argv[i], "%u", &config->input_height) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-ow"))
+		{
+			if (++i >= argc || sscanf(argv[i], "%u", &config->output_width) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-oh"))
+		{
+			if (++i >= argc || sscanf(argv[i], "%u", &config->output_height) != 1)
+				fail = 1;
+		}
+		else if (!strcmp(argv[i], "-p"))
+		{
+			char *token = (++i < argc) ? strtok(argv[i], ",") : NULL;
+			fail = (i >= argc);	/* option given without a value */
+			while (token && !fail)
+			{
+				if (!strcmp(token, "mc"))
+					config->pipeline |= PIPELINE_STEP_MC;
+				else if (!strcmp(token, "csc"))
+					config->pipeline |= PIPELINE_STEP_CSC;
+				else if (!strcmp(token, "swp") || !strcmp(token, "swap"))	/* usage text says "swap" */
+					config->pipeline |= PIPELINE_STEP_SWAP;
+				else
+					fail = 1;
+
+				if (!fail)
+					token = strtok(NULL, ",");
+			}
+		}
+		else if (!strcmp(argv[i], "-mb"))
+		{
+			char *token = (++i < argc) ? strtok(argv[i], ",") : NULL;
+			fail = (i >= argc);	/* option given without a value */
+			while (token && !fail)
+			{
+				if (!strcmp(token, "i"))	/* strcmp() returns 0 on a match */
+					config->mb_types |= MB_TYPE_I;
+				else if (!strcmp(token, "p"))
+					config->mb_types |= MB_TYPE_P;
+				else if (!strcmp(token, "b"))
+					config->mb_types |= MB_TYPE_B;
+				else
+					fail = 1;
+
+				if (!fail)
+					token = strtok(NULL, ",");
+			}
+		}
+		else if (!strcmp(argv[i], "-r"))
+		{
+			if (++i >= argc || sscanf(argv[i], "%u", &config->reps) != 1)
+				fail = 1;
+		}
+		else
+			fail = 1;
+	}
+
+	if (fail)
+		error
+		(
+			1, 0,
+			"Bad argument.\n"
+			"\n"
+			"Usage: %s [options]\n"
+			"\t-iw <width>\tInput width\n"
+			"\t-ih <height>\tInput height\n"
+			"\t-ow <width>\tOutput width\n"
+			"\t-oh <height>\tOutput height\n"
+			"\t-p <pipeline>\tPipeline to test\n"
+			"\t-mb <mb type>\tMacroBlock types to use\n"
+			"\t-r <reps>\tRepetitions\n\n"
+			"\tPipeline steps: mc,csc,swap\n"
+			"\tMB types: i,p,b\n",
+			argv[0]
+		);
+
+	if (config->output_width == 0)
+		config->output_width = config->input_width;
+	if (config->output_height == 0)
+		config->output_height = config->input_height;
+	if (!config->pipeline)
+		config->pipeline = PIPELINE_STEP_MC | PIPELINE_STEP_CSC | PIPELINE_STEP_SWAP;
+	if (!config->mb_types)
+		config->mb_types = MB_TYPE_I | MB_TYPE_P | MB_TYPE_B;
+}
+
+int main(int argc, char **argv)
+{
+	struct Config		config;
+	Display			*display;
+	Window			root, window;
+	const unsigned int	mc_types[2] = {XVMC_MOCOMP | XVMC_MPEG_2, XVMC_IDCT | XVMC_MPEG_2};
+	XvPortID		port_num;
+	int			surface_type_id;
+	unsigned int		is_overlay, intra_unsigned;
+	int			colorkey = 0;	/* window background pixel; only overwritten for overlay ports */
+	XvMCContext		context;
+	XvMCSurface		surface;
+	XvMCBlockArray		block_array;
+	XvMCMacroBlockArray	mb_array;
+	unsigned int		mbw, mbh;
+	unsigned int		mbx, mby;
+	unsigned int		reps;
+	struct timeval		start, stop, diff;
+	double			diff_secs;
+	/* Benchmark: parse options, create XvMC objects, time config.reps passes of the selected steps, print a report. */
+	ParseArgs(argc, argv, &config);
+
+	mbw = align(config.input_width, MACROBLOCK_WIDTH) / MACROBLOCK_WIDTH;
+	mbh = align(config.input_height, MACROBLOCK_HEIGHT) / MACROBLOCK_HEIGHT;
+
+	if (!(display = XOpenDisplay(NULL)))
+		error(1, 0, "Error, unable to open display.\n");
+	if (!GetPort
+	(
+		display,
+		config.input_width,
+		config.input_height,
+		XVMC_CHROMA_FORMAT_420,
+		mc_types,
+		2,
+		&port_num,
+		&surface_type_id,
+		&is_overlay,
+		&intra_unsigned
+	))
+	{
+		XCloseDisplay(display);
+		error(1, 0, "Error, unable to find a good port.\n");
+	}
+
+	if (is_overlay)
+	{
+		Atom xv_colorkey = XInternAtom(display, "XV_COLORKEY", 0);
+		XvGetPortAttribute(display, port_num, xv_colorkey, &colorkey);
+	}
+
+	root = XDefaultRootWindow(display);
+	window = XCreateSimpleWindow(display, root, 0, 0, config.output_width, config.output_height, 0, 0, colorkey);
+	/* NOTE(review): the XvMC calls below live inside assert(); an NDEBUG build would not execute them at all. */
+	assert(XvMCCreateContext(display, port_num, surface_type_id, config.input_width, config.input_height, XVMC_DIRECT, &context) == Success);
+	assert(XvMCCreateSurface(display, &context, &surface) == Success);
+	assert(XvMCCreateBlocks(display, &context, mbw * mbh * BLOCKS_PER_MACROBLOCK, &block_array) == Success);
+	assert(XvMCCreateMacroBlocks(display, &context, mbw * mbh, &mb_array) == Success);
+
+	for (mby = 0; mby < mbh; ++mby)
+		for (mbx = 0; mbx < mbw; ++mbx)
+		{
+			mb_array.macro_blocks[mby * mbw + mbx].x = mbx;
+			mb_array.macro_blocks[mby * mbw + mbx].y = mby;
+			mb_array.macro_blocks[mby * mbw + mbx].macroblock_type = XVMC_MB_TYPE_INTRA;
+			/*mb->motion_type = ;*/
+			/*mb->motion_vertical_field_select = ;*/
+			mb_array.macro_blocks[mby * mbw + mbx].dct_type = XVMC_DCT_TYPE_FRAME;
+			/*mb->PMV[0][0][0] = ;
+			mb->PMV[0][0][1] = ;
+			mb->PMV[0][1][0] = ;
+			mb->PMV[0][1][1] = ;
+			mb->PMV[1][0][0] = ;
+			mb->PMV[1][0][1] = ;
+			mb->PMV[1][1][0] = ;
+			mb->PMV[1][1][1] = ;*/
+			mb_array.macro_blocks[mby * mbw + mbx].index = (mby * mbw + mbx) * BLOCKS_PER_MACROBLOCK;
+			mb_array.macro_blocks[mby * mbw + mbx].coded_block_pattern = 0x3F;
+		}
+
+	XSelectInput(display, window, ExposureMask | KeyPressMask);
+	XMapWindow(display, window);
+	XSync(display, 0);
+
+	gettimeofday(&start, NULL);
+
+	for (reps = 0; reps < config.reps; ++reps)
+	{
+		if (config.pipeline & PIPELINE_STEP_MC)
+		{
+			assert(XvMCRenderSurface(display, &context, XVMC_FRAME_PICTURE, &surface, NULL, NULL, 0, mbw * mbh, 0, &mb_array, &block_array) == Success);
+			assert(XvMCFlushSurface(display, &surface) == Success);
+		}
+		if (config.pipeline & PIPELINE_STEP_CSC)
+			assert(XvMCPutSurface(display, &surface, window, 0, 0, config.input_width, config.input_height, 0, 0, config.output_width, config.output_height, XVMC_FRAME_PICTURE) == Success);
+	}
+
+	gettimeofday(&stop, NULL);
+
+	timeval_subtract(&diff, &stop, &start);
+	diff_secs = (double)diff.tv_sec + (double)diff.tv_usec / 1000000.0;
+
+	printf("XvMC Benchmark\n");
+	printf("Input: %u,%u\nOutput: %u,%u\n", config.input_width, config.input_height, config.output_width, config.output_height);
+	printf("Pipeline: ");
+	if (config.pipeline & PIPELINE_STEP_MC)
+		printf("|mc|");
+	if (config.pipeline & PIPELINE_STEP_CSC)
+		printf("|csc|");
+	if (config.pipeline & PIPELINE_STEP_SWAP)
+		printf("|swap|");
+	printf("\n");
+	printf("Reps: %u\n", config.reps);
+	printf("Total time: %.2lf (%.2lf reps / sec)\n", diff_secs, config.reps / diff_secs);
+
+	assert(XvMCDestroyBlocks(display, &block_array) == Success);
+	assert(XvMCDestroyMacroBlocks(display, &mb_array) == Success);
+	assert(XvMCDestroySurface(display, &surface) == Success);
+	assert(XvMCDestroyContext(display, &context) == Success);
+
+	XvUngrabPort(display, port_num, CurrentTime);
+	XDestroyWindow(display, window);
+	XCloseDisplay(display);
+
+	return 0;
+}
diff --git a/src/gallium/state_trackers/wgl/shared/stw_winsys.h b/src/gallium/state_trackers/xorg/xvmc/xvmc_private.h
index c0bf82c9ed7..42337631ca1 100644
--- a/src/gallium/state_trackers/wgl/shared/stw_winsys.h
+++ b/src/gallium/state_trackers/xorg/xvmc/xvmc_private.h
@@ -1,8 +1,8 @@
 /**************************************************************************
- *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * 
+ * Copyright 2009 Younes Manton.
  * All Rights Reserved.
- *
+ * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- *
+ * 
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- *
+ * 
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,44 +22,37 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
+ * 
  **************************************************************************/
 
-#ifndef STW_WINSYS_H
-#define STW_WINSYS_H
+#ifndef xvmc_private_h
+#define xvmc_private_h
 
-#include <windows.h> /* for HDC */
+#include <X11/Xlib.h>
+#include <X11/extensions/XvMClib.h>
 
-#include "pipe/p_compiler.h"
+#define BLOCK_SIZE_SAMPLES 64
+#define BLOCK_SIZE_BYTES (BLOCK_SIZE_SAMPLES * 2)
 
-struct pipe_screen;
-struct pipe_context;
+struct pipe_video_context;
 struct pipe_surface;
+struct pipe_fence_handle;
 
-struct stw_winsys
+typedef struct
 {
-   struct pipe_screen *
-   (*create_screen)( void );
+	struct pipe_video_context *vpipe;
+	struct pipe_surface *backbuffer;
+} XvMCContextPrivate;
 
-   struct pipe_context *
-   (*create_context)( struct pipe_screen *screen );
-
-   void
-   (*flush_frontbuffer)( struct pipe_screen *screen,
-                         struct pipe_surface *surf,
-                         HDC hDC );
-};
-
-boolean
-stw_init(const struct stw_winsys *stw_winsys);
-
-boolean
-stw_init_thread(void);
-
-void
-stw_cleanup_thread(void);
-
-void
-stw_cleanup(void);
-
-#endif /* STW_WINSYS_H */
+typedef struct
+{
+	struct pipe_video_surface *pipe_vsfc;
+	struct pipe_fence_handle *render_fence;
+	struct pipe_fence_handle *disp_fence;
+	
+	/* Some XvMC functions take a surface but not a context,
+	   so we keep track of which context each surface belongs to. */
+	XvMCContext *context;
+} XvMCSurfacePrivate;
+
+#endif /* xvmc_private_h */
diff --git a/src/gallium/winsys/drm/SConscript b/src/gallium/winsys/drm/SConscript
index a9e9f2682a7..9f7b383d2d3 100644
--- a/src/gallium/winsys/drm/SConscript
+++ b/src/gallium/winsys/drm/SConscript
@@ -48,6 +48,11 @@ if env['dri']:
 	#	$(INSTALL) -d $(DRI_DRIVER_INSTALL_DIR)
 	#	$(INSTALL) -m 755 $(LIBNAME) $(DRI_DRIVER_INSTALL_DIR)
 
+	if 'vmware' in env['winsys']:
+		SConscript([
+			'vmware/SConscript',
+		])
+
 	if 'intel' in env['winsys']:
 		SConscript([
 			'intel/SConscript',
diff --git a/src/gallium/winsys/drm/intel/dri/Makefile b/src/gallium/winsys/drm/intel/dri/Makefile
index 5e212b62a46..c0ecd9680e2 100644
--- a/src/gallium/winsys/drm/intel/dri/Makefile
+++ b/src/gallium/winsys/drm/intel/dri/Makefile
@@ -9,7 +9,7 @@ PIPE_DRIVERS = \
 	$(TOP)/src/gallium/drivers/trace/libtrace.a \
 	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
 	$(TOP)/src/gallium/drivers/identity/libidentity.a \
-	$(TOP)/src/gallium/drivers/i915simple/libi915simple.a
+	$(TOP)/src/gallium/drivers/i915/libi915.a
 
 
 DRIVER_SOURCES =
diff --git a/src/gallium/winsys/drm/intel/dri/SConscript b/src/gallium/winsys/drm/intel/dri/SConscript
index 6c00861f517..b1b654d9f8b 100644
--- a/src/gallium/winsys/drm/intel/dri/SConscript
+++ b/src/gallium/winsys/drm/intel/dri/SConscript
@@ -8,12 +8,13 @@ drivers = [
     st_dri,
     inteldrm,
     softpipe,
-    i915simple,
+    i915,
     trace,
 ]
 
-env.SharedLibrary(
+env.LoadableModule(
     target ='i915_dri.so',
     source = COMMON_GALLIUM_SOURCES,
     LIBS = drivers + mesa + auxiliaries + env['LIBS'],
+    SHLIBPREFIX = '',
 )
diff --git a/src/gallium/winsys/drm/intel/egl/Makefile b/src/gallium/winsys/drm/intel/egl/Makefile
index 490baded66b..1397e9f7290 100644
--- a/src/gallium/winsys/drm/intel/egl/Makefile
+++ b/src/gallium/winsys/drm/intel/egl/Makefile
@@ -9,7 +9,7 @@ PIPE_DRIVERS = \
 	$(GALLIUMDIR)/winsys/drm/intel/gem/libinteldrm.a \
 	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
 	$(TOP)/src/gallium/drivers/trace/libtrace.a \
-	$(TOP)/src/gallium/drivers/i915simple/libi915simple.a
+	$(TOP)/src/gallium/drivers/i915/libi915.a
 
 DRIVER_SOURCES =
 
diff --git a/src/gallium/winsys/drm/intel/gem/intel_drm_api.c b/src/gallium/winsys/drm/intel/gem/intel_drm_api.c
index 4c5a1d2ea89..9ed570ff6e4 100644
--- a/src/gallium/winsys/drm/intel/gem/intel_drm_api.c
+++ b/src/gallium/winsys/drm/intel/gem/intel_drm_api.c
@@ -4,9 +4,10 @@
 #include "intel_drm_winsys.h"
 #include "util/u_memory.h"
 
-#include "i915simple/i915_context.h"
-#include "i915simple/i915_screen.h"
+#include "i915/i915_context.h"
+#include "i915/i915_screen.h"
 
+#include "trace/tr_drm.h"
 
 /*
  * Helper functions
@@ -40,6 +41,7 @@ intel_drm_buffer_from_handle(struct intel_drm_winsys *idws,
                              const char* name, unsigned handle)
 {
    struct intel_drm_buffer *buf = CALLOC_STRUCT(intel_drm_buffer);
+   uint32_t tile = 0, swizzle = 0;
 
    if (!buf)
       return NULL;
@@ -52,6 +54,10 @@ intel_drm_buffer_from_handle(struct intel_drm_winsys *idws,
    if (!buf->bo)
       goto err;
 
+   drm_intel_bo_get_tiling(buf->bo, &tile, &swizzle);
+   if (tile != INTEL_TILE_NONE)
+      buf->map_gtt = TRUE;
+
    return (struct intel_buffer *)buf;
 
 err:
@@ -166,6 +172,7 @@ intel_drm_create_screen(struct drm_api *api, int drmFD,
    idws->base.destroy = intel_drm_winsys_destroy;
 
    idws->pools.gem = drm_intel_bufmgr_gem_init(idws->fd, idws->max_batch_size);
+   drm_intel_bufmgr_gem_enable_reuse(idws->pools.gem);
 
    idws->softpipe = FALSE;
    idws->dump_cmd = debug_get_bool_option("INTEL_DUMP_CMD", FALSE);
@@ -198,5 +205,5 @@ struct drm_api intel_drm_api =
 struct drm_api *
 drm_api_create()
 {
-   return &intel_drm_api;
+   return trace_drm_create(&intel_drm_api);
 }
diff --git a/src/gallium/winsys/drm/intel/gem/intel_drm_batchbuffer.c b/src/gallium/winsys/drm/intel/gem/intel_drm_batchbuffer.c
index 5ca3ad9762d..5b4dafc8e41 100644
--- a/src/gallium/winsys/drm/intel/gem/intel_drm_batchbuffer.c
+++ b/src/gallium/winsys/drm/intel/gem/intel_drm_batchbuffer.c
@@ -13,6 +13,9 @@
 #define INTEL_BATCH_CLIPRECTS    0x2
 
 #undef INTEL_RUN_SYNC
+#undef INTEL_MAP_BATCHBUFFER
+#undef INTEL_MAP_GTT
+#define INTEL_ALWAYS_FLUSH
 
 struct intel_drm_batchbuffer
 {
@@ -33,6 +36,7 @@ static void
 intel_drm_batchbuffer_reset(struct intel_drm_batchbuffer *batch)
 {
    struct intel_drm_winsys *idws = intel_drm_winsys(batch->base.iws);
+   int ret;
 
    if (batch->bo)
       drm_intel_bo_unreference(batch->bo);
@@ -40,8 +44,18 @@ intel_drm_batchbuffer_reset(struct intel_drm_batchbuffer *batch)
                                   "gallium3d_batchbuffer",
                                   batch->actual_size,
                                   4096);
-   drm_intel_bo_map(batch->bo, TRUE);
+
+#ifdef INTEL_MAP_BATCHBUFFER
+#ifdef INTEL_MAP_GTT
+   ret = drm_intel_gem_bo_map_gtt(batch->bo);
+#else
+   ret = drm_intel_bo_map(batch->bo, TRUE);
+#endif
+   assert(ret == 0);
    batch->base.map = batch->bo->virtual;
+#else
+   (void)ret;
+#endif
 
    memset(batch->base.map, 0, batch->actual_size);
    batch->base.ptr = batch->base.map;
@@ -55,7 +69,13 @@ intel_drm_batchbuffer_create(struct intel_winsys *iws)
    struct intel_drm_winsys *idws = intel_drm_winsys(iws);
    struct intel_drm_batchbuffer *batch = CALLOC_STRUCT(intel_drm_batchbuffer);
 
+   batch->actual_size = idws->max_batch_size;
+
+#ifdef INTEL_MAP_BATCHBUFFER
    batch->base.map = NULL;
+#else
+   batch->base.map = MALLOC(batch->actual_size);
+#endif
    batch->base.ptr = NULL;
    batch->base.size = 0;
 
@@ -64,8 +84,6 @@ intel_drm_batchbuffer_create(struct intel_winsys *iws)
 
    batch->base.iws = iws;
 
-   batch->actual_size = idws->max_batch_size;
-
    intel_drm_batchbuffer_reset(batch);
 
    return &batch->base;
@@ -140,23 +158,32 @@ intel_drm_batchbuffer_flush(struct intel_batchbuffer *ibatch,
    used = batch->base.ptr - batch->base.map;
    assert((used & 3) == 0);
 
-   if (used & 4) {
-      // MI_FLUSH | FLUSH_MAP_CACHE;
-      intel_batchbuffer_dword(ibatch, (0x0<<29)|(0x4<<23)|(1<<0));
-      // MI_NOOP
-      intel_batchbuffer_dword(ibatch, (0x0<<29)|(0x0<<23));
-      // MI_BATCH_BUFFER_END;
-      intel_batchbuffer_dword(ibatch, (0x0<<29)|(0xA<<23));
-   } else {
-      //MI_FLUSH | FLUSH_MAP_CACHE;
-      intel_batchbuffer_dword(ibatch, (0x0<<29)|(0x4<<23)|(1<<0));
-      // MI_BATCH_BUFFER_END;
-      intel_batchbuffer_dword(ibatch, (0x0<<29)|(0xA<<23));
+
+#ifdef INTEL_ALWAYS_FLUSH
+   /* MI_FLUSH | FLUSH_MAP_CACHE */
+   intel_batchbuffer_dword(ibatch, (0x4<<23)|(1<<0));
+   used += 4;
+#endif
+
+   if ((used & 4) == 0) {
+      /* MI_NOOP */
+      intel_batchbuffer_dword(ibatch, 0);
    }
+   /* MI_BATCH_BUFFER_END */
+   intel_batchbuffer_dword(ibatch, (0xA<<23));
 
    used = batch->base.ptr - batch->base.map;
+   assert((used & 4) == 0);
 
+#ifdef INTEL_MAP_BATCHBUFFER
+#ifdef INTEL_MAP_GTT
+   drm_intel_gem_bo_unmap_gtt(batch->bo);
+#else
    drm_intel_bo_unmap(batch->bo);
+#endif
+#else
+   drm_intel_bo_subdata(batch->bo, 0, used, batch->base.map);
+#endif
 
    /* Do the sending to HW */
    ret = drm_intel_bo_exec(batch->bo, used, NULL, 0, 0);
@@ -202,7 +229,10 @@ intel_drm_batchbuffer_destroy(struct intel_batchbuffer *ibatch)
    if (batch->bo)
       drm_intel_bo_unreference(batch->bo);
 
-   free(batch);
+#ifndef INTEL_MAP_BATCHBUFFER
+   FREE(batch->base.map);
+#endif
+   FREE(batch);
 }
 
 void intel_drm_winsys_init_batchbuffer_functions(struct intel_drm_winsys *idws)
diff --git a/src/gallium/winsys/drm/intel/gem/intel_drm_buffer.c b/src/gallium/winsys/drm/intel/gem/intel_drm_buffer.c
index e017cd2e982..ac4dd6e00e9 100644
--- a/src/gallium/winsys/drm/intel/gem/intel_drm_buffer.c
+++ b/src/gallium/winsys/drm/intel/gem/intel_drm_buffer.c
@@ -28,6 +28,7 @@ intel_drm_buffer_create(struct intel_winsys *iws,
    } else if (type == INTEL_NEW_VERTEX) {
       name = "gallium3d_vertex";
       pool = idws->pools.gem;
+      buf->map_gtt = TRUE;
    } else if (type == INTEL_NEW_SCANOUT) {
       name = "gallium3d_scanout";
       pool = idws->pools.gem;
@@ -57,11 +58,17 @@ intel_drm_buffer_set_fence_reg(struct intel_winsys *iws,
                                unsigned stride,
                                enum intel_buffer_tile tile)
 {
+   struct intel_drm_buffer *buf = intel_drm_buffer(buffer);
    assert(I915_TILING_NONE == INTEL_TILE_NONE);
    assert(I915_TILING_X == INTEL_TILE_X);
    assert(I915_TILING_Y == INTEL_TILE_Y);
 
-   return drm_intel_bo_set_tiling(intel_bo(buffer), &tile, stride);
+   if (tile != INTEL_TILE_NONE) {
+      assert(buf->map_count == 0);
+      buf->map_gtt = TRUE;
+   }
+
+   return drm_intel_bo_set_tiling(buf->bo, &tile, stride);
 }
 
 static void *
@@ -109,6 +116,18 @@ intel_drm_buffer_unmap(struct intel_winsys *iws,
       drm_intel_bo_unmap(intel_bo(buffer));
 }
 
+static int
+intel_drm_buffer_write(struct intel_winsys *iws,
+                       struct intel_buffer *buffer,
+                       size_t offset,
+                       size_t size,
+                       const void *data)
+{
+   /* Hand the write straight to drm_intel_bo_subdata() on the buffer's bo and
+    * return its result; the cast only strips const from data for that call. */
+   return drm_intel_bo_subdata(intel_drm_buffer(buffer)->bo, offset, size, (void*)data);
+}
+
 static void
 intel_drm_buffer_destroy(struct intel_winsys *iws,
                          struct intel_buffer *buffer)
@@ -130,5 +149,6 @@ intel_drm_winsys_init_buffer_functions(struct intel_drm_winsys *idws)
    idws->base.buffer_set_fence_reg = intel_drm_buffer_set_fence_reg;
    idws->base.buffer_map = intel_drm_buffer_map;
    idws->base.buffer_unmap = intel_drm_buffer_unmap;
+   idws->base.buffer_write = intel_drm_buffer_write;
    idws->base.buffer_destroy = intel_drm_buffer_destroy;
 }
diff --git a/src/gallium/winsys/drm/intel/gem/intel_drm_winsys.h b/src/gallium/winsys/drm/intel/gem/intel_drm_winsys.h
index 415c45feea0..b4a60563ef4 100644
--- a/src/gallium/winsys/drm/intel/gem/intel_drm_winsys.h
+++ b/src/gallium/winsys/drm/intel/gem/intel_drm_winsys.h
@@ -2,7 +2,7 @@
 #ifndef INTEL_DRM_WINSYS_H
 #define INTEL_DRM_WINSYS_H
 
-#include "i915simple/intel_batchbuffer.h"
+#include "i915/intel_batchbuffer.h"
 
 #include "drm.h"
 #include "intel_bufmgr.h"
diff --git a/src/gallium/winsys/drm/intel/xorg/Makefile b/src/gallium/winsys/drm/intel/xorg/Makefile
index 9e56853b021..14c2462524b 100644
--- a/src/gallium/winsys/drm/intel/xorg/Makefile
+++ b/src/gallium/winsys/drm/intel/xorg/Makefile
@@ -18,7 +18,7 @@ INCLUDES = \
 LIBS = \
 	$(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a \
 	$(TOP)/src/gallium/winsys/drm/intel/gem/libinteldrm.a \
-	$(TOP)/src/gallium/drivers/i915simple/libi915simple.a \
+	$(TOP)/src/gallium/drivers/i915/libi915.a \
 	$(TOP)/src/gallium/drivers/trace/libtrace.a \
 	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
 	$(GALLIUM_AUXILIARIES)
diff --git a/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c b/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c
index 091cbbcfed1..317dc44d22f 100644
--- a/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c
+++ b/src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c
@@ -21,8 +21,7 @@ dri_surface_from_handle(struct drm_api *api, struct pipe_screen *pscreen,
 	struct pipe_texture tmpl;
 
 	memset(&tmpl, 0, sizeof(tmpl));
-	tmpl.tex_usage = PIPE_TEXTURE_USAGE_PRIMARY |
-			 NOUVEAU_TEXTURE_USAGE_LINEAR;
+	tmpl.tex_usage = PIPE_TEXTURE_USAGE_PRIMARY;
 	tmpl.target = PIPE_TEXTURE_2D;
 	tmpl.last_level = 0;
 	tmpl.depth[0] = 1;
@@ -112,7 +111,7 @@ nouveau_drm_create_screen(struct drm_api *api, int fd,
 		return NULL;
 	}
 
-	if (arg->mode == DRM_CREATE_DRI1) {
+	if (arg && arg->mode == DRM_CREATE_DRI1) {
 		struct nouveau_dri *nvdri = dri1->ddx_info;
 		enum pipe_format format;
 
@@ -197,6 +196,7 @@ nouveau_drm_pt_from_name(struct drm_api *api, struct pipe_screen *pscreen,
 			 unsigned stride, unsigned handle)
 {
 	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+	struct pipe_texture *pt;
 	struct pipe_buffer *pb;
 	int ret;
 
@@ -218,7 +218,9 @@ nouveau_drm_pt_from_name(struct drm_api *api, struct pipe_screen *pscreen,
 	pb->usage = PIPE_BUFFER_USAGE_GPU_READ_WRITE |
 		    PIPE_BUFFER_USAGE_CPU_READ_WRITE;
 	pb->size = nouveau_bo(pb)->size;
-	return pscreen->texture_blanket(pscreen, templ, &stride, pb);
+	pt = pscreen->texture_blanket(pscreen, templ, &stride, pb);
+	pipe_buffer_reference(&pb, NULL);
+	return pt;
 }
 
 static boolean
@@ -245,6 +247,7 @@ nouveau_drm_handle_from_pt(struct drm_api *api, struct pipe_screen *pscreen,
 		return false;
 
 	*handle = mt->bo->handle;
+	*stride = mt->base.nblocksx[0] * mt->base.block.size;
 	return true;
 }
 
diff --git a/src/gallium/winsys/drm/nouveau/xorg/Makefile b/src/gallium/winsys/drm/nouveau/xorg/Makefile
new file mode 100644
index 00000000000..f0d3b337e83
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/xorg/Makefile
@@ -0,0 +1,61 @@
+TARGET     = modesetting_drv.so
+CFILES     = $(wildcard ./*.c)
+OBJECTS    = $(patsubst ./%.c,./%.o,$(CFILES))
+TOP        = ../../../../../..
+
+include $(TOP)/configs/current
+
+INCLUDES = \
+	$(shell pkg-config --cflags-only-I pixman-1 xorg-server libdrm xproto) \
+	-I../gem \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/drivers \
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/include \
+	-I$(TOP)/src/egl/main
+
+LIBS = \
+	$(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a \
+	$(TOP)/src/gallium/winsys/drm/nouveau/drm/libnouveaudrm.a \
+	$(TOP)/src/gallium/drivers/nv04/libnv04.a \
+	$(TOP)/src/gallium/drivers/nv10/libnv10.a \
+	$(TOP)/src/gallium/drivers/nv20/libnv20.a \
+	$(TOP)/src/gallium/drivers/nv30/libnv30.a \
+	$(TOP)/src/gallium/drivers/nv40/libnv40.a \
+	$(TOP)/src/gallium/drivers/nv50/libnv50.a \
+	$(TOP)/src/gallium/drivers/nouveau/libnouveau.a \
+	$(GALLIUM_AUXILIARIES)
+
+DRIVER_DEFINES = \
+	-DHAVE_CONFIG_H
+
+
+#############################################
+
+
+
+all default: $(TARGET)
+
+$(TARGET): $(OBJECTS) Makefile $(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a $(LIBS)
+	$(TOP)/bin/mklib -noprefix -o $@ \
+	$(OBJECTS) $(LIBS) $(shell pkg-config --libs libdrm) -ldrm_nouveau
+
+clean:
+	rm -rf $(OBJECTS) $(TARGET)
+
+install:
+	$(INSTALL) -d $(DESTDIR)/$(XORG_DRIVER_INSTALL_DIR)
+	$(MINSTALL) -m 755 $(TARGET) $(DESTDIR)/$(XORG_DRIVER_INSTALL_DIR)
+
+
+##############################################
+
+
+.c.o:
+	$(CC) -c $(CFLAGS) $(INCLUDES) $(DRIVER_DEFINES) $< -o $@
+
+
+##############################################
+
+.PHONY: all clean install
diff --git a/src/gallium/winsys/drm/nouveau/xorg/nouveau_xorg.c b/src/gallium/winsys/drm/nouveau/xorg/nouveau_xorg.c
new file mode 100644
index 00000000000..a669b3080aa
--- /dev/null
+++ b/src/gallium/winsys/drm/nouveau/xorg/nouveau_xorg.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ *
+ * Author: Alan Hourihane <alanh@tungstengraphics.com>
+ * Author: Jakob Bornecrantz <wallbraker@gmail.com>
+ *
+ */
+
+#include "../../../../state_trackers/xorg/xorg_winsys.h"
+
+static void nouveau_xorg_identify(int flags);
+static Bool nouveau_xorg_pci_probe(DriverPtr driver, int entity_num,
+				   struct pci_device *device,
+				   intptr_t match_data);
+
+static const struct pci_id_match nouveau_xorg_device_match[] = {
+    { 0x10de, PCI_MATCH_ANY, PCI_MATCH_ANY, PCI_MATCH_ANY,
+      0x00030000, 0x00ffffff, 0 },
+    { 0x12d2, PCI_MATCH_ANY, PCI_MATCH_ANY, PCI_MATCH_ANY,
+      0x00030000, 0x00ffffff, 0 },
+    {0, 0, 0},
+};
+
+static SymTabRec nouveau_xorg_chipsets[] = {
+    {PCI_MATCH_ANY, "NVIDIA Graphics Device"},
+    {-1, NULL}
+};
+
+static PciChipsets nouveau_xorg_pci_devices[] = {
+    {PCI_MATCH_ANY, PCI_MATCH_ANY, NULL},
+    {-1, -1, NULL}
+};
+
+static XF86ModuleVersionInfo nouveau_xorg_version = {
+    "modesetting",
+    MODULEVENDORSTRING,
+    MODINFOSTRING1,
+    MODINFOSTRING2,
+    XORG_VERSION_CURRENT,
+    0, 1, 0, /* major, minor, patch */
+    ABI_CLASS_VIDEODRV,
+    ABI_VIDEODRV_VERSION,
+    MOD_CLASS_VIDEODRV,
+    {0, 0, 0, 0}
+};
+
+/*
+ * Xorg driver exported structures
+ */
+
+_X_EXPORT DriverRec modesetting = {
+    1,
+    "modesetting",
+    nouveau_xorg_identify,
+    NULL,
+    xorg_tracker_available_options,
+    NULL,
+    0,
+    NULL,
+    nouveau_xorg_device_match,
+    nouveau_xorg_pci_probe
+};
+
+static MODULESETUPPROTO(nouveau_xorg_setup);
+
+_X_EXPORT XF86ModuleData modesettingModuleData = {
+    &nouveau_xorg_version,
+    nouveau_xorg_setup,
+    NULL
+};
+
+/*
+ * Xorg driver functions
+ */
+
+static pointer
+nouveau_xorg_setup(pointer module, pointer opts, int *errmaj, int *errmin)
+{
+    /* Latched on the first call so the driver is only ever added once. */
+    static Bool already_added = 0;
+
+    /* A repeated load is refused; the major error code is filled in only
+     * when the caller passed a place to store it.
+     */
+    if (already_added) {
+	if (errmaj != NULL)
+	    *errmaj = LDR_ONCEONLY;
+	return NULL;
+    }
+
+    already_added = 1;
+    xf86AddDriver(&modesetting, module, HaveDriverFuncs);
+
+    /* Success must be reported with a non-NULL value even though this
+     * module has no TearDownProc. */
+    return (pointer) 1;
+}
+
+static void
+nouveau_xorg_identify(int flags)
+{
+    /* Print the driver banner together with the chipset table; flags is not consulted. */
+    xf86PrintChipsets("modesetting", "Driver for Modesetting Kernel Drivers", nouveau_xorg_chipsets);
+}
+
+static Bool
+nouveau_xorg_pci_probe(DriverPtr driver,
+	  int entity_num, struct pci_device *device, intptr_t match_data)
+{
+    ScrnInfoPtr scrn = NULL;
+    EntityInfoPtr entity;
+    /* Configure a screen for this PCI entity; the result is non-zero only when a ScrnInfo was obtained. */
+    scrn = xf86ConfigPciEntity(scrn, 0, entity_num, nouveau_xorg_pci_devices,
+			       NULL, NULL, NULL, NULL, NULL);
+    if (scrn != NULL) {
+	scrn->driverVersion = 1;
+	scrn->driverName = "nouveau";	/* was "i915", which names a different driver */
+	scrn->name = "modesetting";
+	scrn->Probe = NULL;
+
+	entity = xf86GetEntityInfo(entity_num);	/* NOTE(review): result is never used here; confirm whether it must be freed */
+
+	/* Use all the functions from the xorg tracker */
+	xorg_tracker_set_functions(scrn);
+    }
+    return scrn != NULL;
+}
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_buffer.c b/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
index 07551e7cd16..81cd9dc4fb1 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
+++ b/src/gallium/winsys/drm/radeon/core/radeon_buffer.c
@@ -32,6 +32,17 @@
 
 #include "radeon_buffer.h"
 
+#include "radeon_bo_gem.h"
+#include "softpipe/sp_texture.h"
+#include "r300_context.h"
+#include <X11/Xutil.h>
+struct radeon_vl_context
+{
+    Display *display;
+    int screen;
+    Drawable drawable;
+};
+
 static const char *radeon_get_name(struct pipe_winsys *ws)
 {
     return "Radeon/GEM+KMS";
@@ -99,6 +110,7 @@ static struct pipe_buffer *radeon_surface_buffer_create(struct pipe_winsys *ws,
                                                         unsigned height,
                                                         enum pipe_format format,
                                                         unsigned usage,
+                                                        unsigned tex_usage,
                                                         unsigned *stride)
 {
     struct pipe_format_block block;
@@ -134,8 +146,11 @@ static void *radeon_buffer_map(struct pipe_winsys *ws,
         (struct radeon_pipe_buffer*)buffer;
     int write = 0;
 
-    if (!(flags & PIPE_BUFFER_USAGE_DONTBLOCK)) {
-        radeon_bo_wait(radeon_buffer->bo);
+    if (flags & PIPE_BUFFER_USAGE_DONTBLOCK) {
+        uint32_t domain;
+
+        if (radeon_bo_is_busy(radeon_buffer->bo, &domain))
+            return NULL;
     }
     if (flags & PIPE_BUFFER_USAGE_CPU_WRITE) {
         write = 1;
@@ -177,17 +192,58 @@ static int radeon_fence_finish(struct pipe_winsys *ws,
     return 0;
 }
 
+static void radeon_display_surface(struct pipe_winsys *pws,
+                                   struct pipe_surface *psurf,
+                                   struct radeon_vl_context *rvl_ctx)
+{
+    struct r300_texture *r300tex = (struct r300_texture *)(psurf->texture);
+    XImage *ximage;
+    void *data;
+    /* Copy the surface to the X drawable through a temporary XImage. */
+    ximage = XCreateImage(rvl_ctx->display,
+                          XDefaultVisual(rvl_ctx->display, rvl_ctx->screen),
+                          XDefaultDepth(rvl_ctx->display, rvl_ctx->screen),
+                          ZPixmap, 0,   /* format, offset */
+                          NULL,         /* data */
+                          0, 0,         /* size */
+                          32,           /* bitmap_pad */
+                          0);           /* bytes_per_line */
+    if (!ximage)
+        return;   /* XCreateImage failed; nothing can be displayed */
+    assert(ximage->format);
+    assert(ximage->bitmap_unit);
+
+    data = pws->buffer_map(pws, r300tex->buffer, 0);
+    if (data) {
+        /* map succeeded: point the XImage at the texture memory */
+        ximage->data = data;
+        ximage->width = psurf->width;
+        ximage->height = psurf->height;
+        ximage->bytes_per_line = psurf->width * (ximage->bits_per_pixel >> 3);
+
+        XPutImage(rvl_ctx->display, rvl_ctx->drawable,
+                  XDefaultGC(rvl_ctx->display, rvl_ctx->screen),
+                  ximage, 0, 0, 0, 0, psurf->width, psurf->height);
+        XSync(rvl_ctx->display, 0);
+        pws->buffer_unmap(pws, r300tex->buffer);
+    }
+    /* the pixels belong to the buffer, so detach them before destroying */
+    ximage->data = NULL;
+    XDestroyImage(ximage);
+}
+
 static void radeon_flush_frontbuffer(struct pipe_winsys *pipe_winsys,
                                      struct pipe_surface *pipe_surface,
                                      void *context_private)
 {
-    /* XXX TODO: call dri2CopyRegion */
+    struct radeon_vl_context *rvl_ctx;
+    rvl_ctx = (struct radeon_vl_context *) context_private;
+    radeon_display_surface(pipe_winsys, pipe_surface, rvl_ctx);
 }
 
 struct radeon_winsys* radeon_pipe_winsys(int fd)
 {
     struct radeon_winsys* radeon_ws;
-    struct radeon_bo_manager* bom;
 
     radeon_ws = CALLOC_STRUCT(radeon_winsys);
     if (radeon_ws == NULL) {
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_drm.c b/src/gallium/winsys/drm/radeon/core/radeon_drm.c
index 47376a0f07b..69f14e54f26 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_drm.c
+++ b/src/gallium/winsys/drm/radeon/core/radeon_drm.c
@@ -38,7 +38,7 @@ struct pipe_screen* radeon_create_screen(struct drm_api* api,
 {
     struct radeon_winsys* winsys = radeon_pipe_winsys(drmFB);
 
-    if (getenv("RADEON_SOFTPIPE")) {
+    if (debug_get_bool_option("RADEON_SOFTPIPE", FALSE)) {
         return softpipe_create_screen((struct pipe_winsys*)winsys);
     } else {
         struct r300_winsys* r300 = radeon_create_r300_winsys(drmFB, winsys);
@@ -51,7 +51,7 @@ struct pipe_screen* radeon_create_screen(struct drm_api* api,
 struct pipe_context* radeon_create_context(struct drm_api* api,
                                            struct pipe_screen* screen)
 {
-    if (getenv("RADEON_SOFTPIPE")) {
+    if (debug_get_bool_option("RADEON_SOFTPIPE", FALSE)) {
         return radeon_create_softpipe(screen->winsys);
     } else {
         return r300_create_context(screen,
@@ -98,7 +98,7 @@ struct pipe_buffer* radeon_buffer_from_handle(struct drm_api* api,
     return &radeon_buffer->base;
 }
 
-struct pipe_texture*
+static struct pipe_texture*
 radeon_texture_from_shared_handle(struct drm_api *api,
                                   struct pipe_screen *screen,
                                   struct pipe_texture *templ,
@@ -107,29 +107,36 @@ radeon_texture_from_shared_handle(struct drm_api *api,
                                   unsigned handle)
 {
     struct pipe_buffer *buffer;
+    struct pipe_texture *blanket;
 
     buffer = radeon_buffer_from_handle(api, screen, name, handle);
     if (!buffer) {
         return NULL;
     }
 
-    return screen->texture_blanket(screen, templ, &stride, buffer);
+    blanket = screen->texture_blanket(screen, templ, &stride, buffer);
+
+    pipe_buffer_reference(&buffer, NULL);
+
+    return blanket;
 }
 
-boolean radeon_shared_handle_from_texture(struct drm_api *api,
-                                          struct pipe_screen *screen,
-                                          struct pipe_texture *texture,
-                                          unsigned *stride,
-                                          unsigned *handle)
+static boolean radeon_shared_handle_from_texture(struct drm_api *api,
+                                                 struct pipe_screen *screen,
+                                                 struct pipe_texture *texture,
+                                                 unsigned *stride,
+                                                 unsigned *handle)
 {
     int retval, fd;
     struct drm_gem_flink flink;
     struct radeon_pipe_buffer* radeon_buffer;
-    struct pipe_buffer* buffer = &radeon_buffer->base;
-    if (!radeon_buffer_from_texture(api, texture, buffer, stride)) {
+    struct pipe_buffer *buffer;
+
+    if (!radeon_buffer_from_texture(api, texture, &buffer, stride)) {
         return FALSE;
     }
 
+    radeon_buffer = (struct radeon_pipe_buffer*)buffer;
     if (!radeon_buffer->flinked) {
         fd = ((struct radeon_winsys*)screen->winsys)->priv->fd;
 
@@ -150,11 +157,11 @@ boolean radeon_shared_handle_from_texture(struct drm_api *api,
     return TRUE;
 }
 
-boolean radeon_local_handle_from_texture(struct drm_api *api,
-                                         struct pipe_screen *screen,
-                                         struct pipe_texture *texture,
-                                         unsigned *stride,
-                                         unsigned *handle)
+static boolean radeon_local_handle_from_texture(struct drm_api *api,
+                                                struct pipe_screen *screen,
+                                                struct pipe_texture *texture,
+                                                unsigned *stride,
+                                                unsigned *handle)
 {
     struct pipe_buffer *buffer;
     if (!radeon_buffer_from_texture(api, texture, &buffer, stride)) {
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_drm.h b/src/gallium/winsys/drm/radeon/core/radeon_drm.h
index 88a5c82b284..9a789ec1a45 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_drm.h
+++ b/src/gallium/winsys/drm/radeon/core/radeon_drm.h
@@ -37,6 +37,7 @@
 #include "pipe/p_screen.h"
 
 #include "trace/tr_drm.h"
+#include "util/u_debug.h"
 #include "util/u_memory.h"
 
 #include "state_tracker/drm_api.h"
diff --git a/src/gallium/winsys/drm/radeon/core/radeon_r300.c b/src/gallium/winsys/drm/radeon/core/radeon_r300.c
index d7238762219..7ea5d1fb4e7 100644
--- a/src/gallium/winsys/drm/radeon/core/radeon_r300.c
+++ b/src/gallium/winsys/drm/radeon/core/radeon_r300.c
@@ -22,6 +22,17 @@
 
 #include "radeon_r300.h"
 
+static void radeon_r300_set_flush_cb(struct r300_winsys *winsys,
+				     void (*flush_cb)(void *),
+				     void *data)
+{
+    struct radeon_winsys_priv* priv =
+        (struct radeon_winsys_priv*)winsys->radeon_winsys;
+
+    radeon_cs_space_set_flush(priv->cs, flush_cb,
+			      data);
+}
+
 static boolean radeon_r300_add_buffer(struct r300_winsys* winsys,
                                       struct pipe_buffer* pbuffer,
                                       uint32_t rd,
@@ -95,6 +106,13 @@ static void radeon_r300_write_cs_reloc(struct r300_winsys* winsys,
     }
 }
 
+static void radeon_r300_reset_bos(struct r300_winsys *winsys)
+{
+    struct radeon_winsys_priv* priv =
+        (struct radeon_winsys_priv*)winsys->radeon_winsys;
+    radeon_cs_space_reset_bos(priv->cs);
+}
+
 static void radeon_r300_end_cs(struct r300_winsys* winsys,
                                const char* file,
                                const char* function,
@@ -119,9 +137,6 @@ static void radeon_r300_flush_cs(struct r300_winsys* winsys)
         radeon_cs_print(priv->cs, stderr);
     }
 
-    /* Clean out BOs. */
-    radeon_cs_space_reset_bos(priv->cs);
-
     /* Reset CS.
      * Someday, when we care about performance, we should really find a way
      * to rotate between two or three CS objects so that the GPU can be
@@ -137,7 +152,7 @@ static void do_ioctls(struct r300_winsys* winsys, int fd)
     int target = 0;
     int retval;
 
-    info.value = &target;
+    info.value = (unsigned long)&target;
 
     /* First, get the number of pixel pipes */
     info.request = RADEON_INFO_NUM_GB_PIPES;
@@ -149,6 +164,16 @@ static void do_ioctls(struct r300_winsys* winsys, int fd)
     }
     winsys->gb_pipes = target;
 
+    /* Next, get the number of Z pipes (the result arrives in target) */
+    info.request = RADEON_INFO_NUM_Z_PIPES;
+    retval = drmCommandWriteRead(fd, DRM_RADEON_INFO, &info, sizeof(info));
+    if (retval) {
+        fprintf(stderr, "%s: Failed to get Z pipe count, "
+                "error number %d\n", __FUNCTION__, retval);
+        exit(1);
+    }
+    winsys->z_pipes = target;
+
     /* Then, get PCI ID */
     info.request = RADEON_INFO_DEVICE_ID;
     retval = drmCommandWriteRead(fd, DRM_RADEON_INFO, &info, sizeof(info));
@@ -203,6 +228,8 @@ radeon_create_r300_winsys(int fd, struct radeon_winsys* old_winsys)
     winsys->write_cs_reloc = radeon_r300_write_cs_reloc;
     winsys->end_cs = radeon_r300_end_cs;
     winsys->flush_cs = radeon_r300_flush_cs;
+    winsys->reset_bos = radeon_r300_reset_bos;
+    winsys->set_flush_cb = radeon_r300_set_flush_cb;
 
     memcpy(winsys, old_winsys, sizeof(struct radeon_winsys));
 
diff --git a/src/gallium/winsys/drm/radeon/xorg/Makefile b/src/gallium/winsys/drm/radeon/xorg/Makefile
index 0241625f69b..9fa16dab24c 100644
--- a/src/gallium/winsys/drm/radeon/xorg/Makefile
+++ b/src/gallium/winsys/drm/radeon/xorg/Makefile
@@ -20,6 +20,8 @@ LIBS = \
 	$(GALLIUMDIR)/state_trackers/xorg/libxorgtracker.a \
 	$(GALLIUMDIR)/winsys/drm/radeon/core/libradeonwinsys.a \
 	$(TOP)/src/gallium/drivers/r300/libr300.a \
+	$(TOP)/src/gallium/drivers/trace/libtrace.a \
+	$(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
 	$(GALLIUM_AUXILIARIES)
 
 #############################################
diff --git a/src/gallium/winsys/drm/vmware/Makefile b/src/gallium/winsys/drm/vmware/Makefile
new file mode 100644
index 00000000000..2ae6dead5c1
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/Makefile
@@ -0,0 +1,12 @@
+# src/gallium/winsys/drm/vmware/Makefile
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+SUBDIRS = core $(GALLIUM_STATE_TRACKERS_DIRS)
+
+default install clean:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE) $@) || exit 1; \
+		fi \
+	done
diff --git a/src/gallium/winsys/drm/vmware/SConscript b/src/gallium/winsys/drm/vmware/SConscript
new file mode 100644
index 00000000000..06e6d5be9ca
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/SConscript
@@ -0,0 +1,11 @@
+Import('*')
+
+SConscript(['core/SConscript',])
+
+if 'mesa' in env['statetrackers']:
+
+    SConscript(['dri/SConscript'])
+
+if 'xorg' in env['statetrackers']:
+
+    SConscript(['xorg/SConscript'])
diff --git a/src/gallium/winsys/drm/vmware/core/Makefile b/src/gallium/winsys/drm/vmware/core/Makefile
new file mode 100644
index 00000000000..a52957c1a5b
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/Makefile
@@ -0,0 +1,35 @@
+TOP = ../../../../../..
+include $(TOP)/configs/current
+
+LIBNAME = svgadrm
+
+C_SOURCES = \
+        vmw_buffer.c \
+        vmw_context.c  \
+        vmw_fence.c  \
+        vmw_screen.c  \
+        vmw_screen_dri.c  \
+        vmw_screen_ioctl.c  \
+        vmw_screen_pools.c  \
+        vmw_screen_svga.c  \
+        vmw_surface.c
+
+LIBRARY_INCLUDES = \
+       -I$(TOP)/src/gallium/drivers/svga \
+       -I$(TOP)/src/gallium/drivers/svga/include \
+       -I$(GALLIUM)/src/mesa/drivers/dri/common \
+       -I$(GALLIUM)/include \
+       -I$(GALLIUM)/include/GL/internal \
+       -I$(GALLIUM)/src/mesa \
+       -I$(GALLIUM)/src/mesa/main \
+       -I$(GALLIUM)/src/mesa/glapi \
+       -I$(GALLIUM)/src/egl/main \
+       -I$(GALLIUM)/src/egl/drivers/dri \
+       $(shell pkg-config libdrm --cflags-only-I)
+
+LIBRARY_DEFINES = \
+       -std=gnu99 -fvisibility=hidden \
+       -DHAVE_STDINT_H -D_FILE_OFFSET_BITS=64 \
+       $(shell pkg-config libdrm --cflags-only-other)
+
+include ../../../../Makefile.template
diff --git a/src/gallium/winsys/drm/vmware/core/SConscript b/src/gallium/winsys/drm/vmware/core/SConscript
new file mode 100644
index 00000000000..edaf9458bee
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/SConscript
@@ -0,0 +1,39 @@
+Import('*')
+
+env = env.Clone()
+
+if env['gcc']:
+	env.Append(CCFLAGS = ['-fvisibility=hidden'])
+	env.Append(CPPDEFINES = [
+		'HAVE_STDINT_H',
+		'HAVE_SYS_TYPES_H',
+		'_FILE_OFFSET_BITS=64',  # bare macro: SCons prepends the -D itself
+	])
+	
+env.Prepend(CPPPATH = [
+	'include',
+        '#/src/gallium/drivers/svga',
+        '#/src/gallium/drivers/svga/include',
+])
+
+env.Append(CPPDEFINES = [
+])
+
+sources = [
+        'vmw_buffer.c',
+        'vmw_context.c',
+        'vmw_fence.c',
+        'vmw_screen.c',
+        'vmw_screen_dri.c',
+        'vmw_screen_ioctl.c',
+        'vmw_screen_pools.c',
+        'vmw_screen_svga.c',
+        'vmw_surface.c',
+]
+
+svgadrm = env.ConvenienceLibrary(
+	target = 'svgadrm',
+	source = sources,
+)
+
+Export('svgadrm')
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_buffer.c b/src/gallium/winsys/drm/vmware/core/vmw_buffer.c
new file mode 100644
index 00000000000..b812fb59d39
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_buffer.c
@@ -0,0 +1,274 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * SVGA buffer manager for Guest Memory Regions (GMRs).
+ * 
+ * GMRs are used for pixel and vertex data upload/download to/from the virtual
+ * SVGA hardware. There is a limited number of GMRs available, and 
+ * creating/destroying them is also a slow operation so we must suballocate 
+ * them.
+ * 
+ * This file implements a pipebuffer library buffer manager, so that we can
+ * use pipebuffer's suballocation, fencing, and debugging facilities with GMRs.
+ * 
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include "svga_cmd.h"
+
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "pipebuffer/pb_buffer.h"
+#include "pipebuffer/pb_bufmgr.h"
+
+#include "svga_winsys.h"
+
+#include "vmw_screen.h"
+#include "vmw_buffer.h"
+
+
+struct vmw_gmr_bufmgr;
+
+
+struct vmw_gmr_buffer
+{
+   struct pb_buffer base;
+   
+   struct vmw_gmr_bufmgr *mgr;
+   
+   struct vmw_region *region;
+   void *map;
+   
+#ifdef DEBUG
+   struct pipe_fence_handle *last_fence;
+#endif
+};
+
+
+extern const struct pb_vtbl vmw_gmr_buffer_vtbl;
+
+
+static INLINE struct vmw_gmr_buffer *
+vmw_gmr_buffer(struct pb_buffer *buf)
+{
+   assert(buf);
+   assert(buf->vtbl == &vmw_gmr_buffer_vtbl);
+   return (struct vmw_gmr_buffer *)buf;
+}
+
+
+struct vmw_gmr_bufmgr
+{
+   struct pb_manager base;
+   
+   struct vmw_winsys_screen *vws;
+};
+
+
+static INLINE struct vmw_gmr_bufmgr *
+vmw_gmr_bufmgr(struct pb_manager *mgr)
+{
+   assert(mgr);
+   return (struct vmw_gmr_bufmgr *)mgr;
+}
+
+
+static void
+vmw_gmr_buffer_destroy(struct pb_buffer *_buf)
+{
+   struct vmw_gmr_buffer *buf = vmw_gmr_buffer(_buf);
+
+#ifdef DEBUG
+   if(buf->last_fence) {
+      struct svga_winsys_screen *sws = &buf->mgr->vws->base;
+      assert(sws->fence_signalled(sws, buf->last_fence, 0) == 0);
+   }
+#endif
+
+   vmw_ioctl_region_unmap(buf->region);
+   
+   vmw_ioctl_region_destroy(buf->region);
+
+   FREE(buf);
+}
+
+
+static void *
+vmw_gmr_buffer_map(struct pb_buffer *_buf,
+               unsigned flags)
+{
+   struct vmw_gmr_buffer *buf = vmw_gmr_buffer(_buf);
+   return buf->map;
+}
+
+
+static void
+vmw_gmr_buffer_unmap(struct pb_buffer *_buf)
+{
+   /* Do nothing */
+   (void)_buf;
+}
+
+
+static void
+vmw_gmr_buffer_get_base_buffer(struct pb_buffer *buf,
+                           struct pb_buffer **base_buf,
+                           unsigned *offset)
+{
+   *base_buf = buf;
+   *offset = 0;
+}
+
+
+static enum pipe_error
+vmw_gmr_buffer_validate( struct pb_buffer *_buf, 
+                         struct pb_validate *vl,
+                         unsigned flags )
+{
+   /* Always pinned */
+   return PIPE_OK;
+}
+
+
+static void
+vmw_gmr_buffer_fence( struct pb_buffer *_buf, 
+                      struct pipe_fence_handle *fence )
+{
+   /* We don't need to do anything, as the pipebuffer library
+    * will take care of delaying the destruction of fenced buffers */  
+#ifdef DEBUG
+   struct vmw_gmr_buffer *buf = vmw_gmr_buffer(_buf);
+   if(fence)
+      buf->last_fence = fence;
+#endif
+}
+
+
+const struct pb_vtbl vmw_gmr_buffer_vtbl = {
+   vmw_gmr_buffer_destroy,
+   vmw_gmr_buffer_map,
+   vmw_gmr_buffer_unmap,
+   vmw_gmr_buffer_validate,
+   vmw_gmr_buffer_fence,
+   vmw_gmr_buffer_get_base_buffer
+};
+
+
+static struct pb_buffer *
+vmw_gmr_bufmgr_create_buffer(struct pb_manager *_mgr,
+                         pb_size size,
+                         const struct pb_desc *desc) 
+{
+   struct vmw_gmr_bufmgr *mgr = vmw_gmr_bufmgr(_mgr);
+   struct vmw_winsys_screen *vws = mgr->vws;
+   struct vmw_gmr_buffer *buf;
+   
+   buf = CALLOC_STRUCT(vmw_gmr_buffer);
+   if(!buf)
+      goto error1;
+
+   pipe_reference_init(&buf->base.base.reference, 1);
+   buf->base.base.alignment = desc->alignment;
+   buf->base.base.usage = desc->usage;
+   buf->base.base.size = size;
+   buf->base.vtbl = &vmw_gmr_buffer_vtbl;
+   buf->mgr = mgr;
+
+   buf->region = vmw_ioctl_region_create(vws, size);
+   if(!buf->region)
+      goto error2;
+	 
+   buf->map = vmw_ioctl_region_map(buf->region);
+   if(!buf->map)
+      goto error3;
+
+   return &buf->base;
+
+error3:
+   vmw_ioctl_region_destroy(buf->region);
+error2:
+   FREE(buf);
+error1:
+   return NULL;
+}
+
+
+static void
+vmw_gmr_bufmgr_flush(struct pb_manager *mgr) 
+{
+   /* No-op */
+}
+
+
+static void
+vmw_gmr_bufmgr_destroy(struct pb_manager *_mgr) 
+{
+   struct vmw_gmr_bufmgr *mgr = vmw_gmr_bufmgr(_mgr);
+   FREE(mgr);
+}
+
+
+struct pb_manager *
+vmw_gmr_bufmgr_create(struct vmw_winsys_screen *vws) 
+{
+   struct vmw_gmr_bufmgr *mgr;
+   
+   mgr = CALLOC_STRUCT(vmw_gmr_bufmgr);
+   if(!mgr)
+      return NULL;
+
+   mgr->base.destroy = vmw_gmr_bufmgr_destroy;
+   mgr->base.create_buffer = vmw_gmr_bufmgr_create_buffer;
+   mgr->base.flush = vmw_gmr_bufmgr_flush;
+   
+   mgr->vws = vws;
+   
+   return &mgr->base;
+}
+
+
+boolean
+vmw_gmr_bufmgr_region_ptr(struct pb_buffer *buf,
+                          struct SVGAGuestPtr *ptr)
+{
+   struct vmw_gmr_buffer *gmr;
+   struct pb_buffer *base;
+   unsigned base_offset = 0;
+
+   /* Resolve buf to its base buffer and the offset of buf within it;
+    * the base is expected to be a GMR buffer. */
+   pb_get_base_buffer(buf, &base, &base_offset);
+
+   gmr = vmw_gmr_buffer(base);
+   if(gmr) {
+      /* Guest pointer of the region, advanced by the base offset. */
+      *ptr = vmw_ioctl_region_ptr(gmr->region);
+      ptr->offset += base_offset;
+   }
+   return gmr ? TRUE : FALSE;
+}
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_buffer.h b/src/gallium/winsys/drm/vmware/core/vmw_buffer.h
new file mode 100644
index 00000000000..634bdcabd26
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_buffer.h
@@ -0,0 +1,65 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#ifndef VMW_BUFFER_H_
+#define VMW_BUFFER_H_
+
+
+#include "pipe/p_compiler.h"
+
+struct SVGAGuestPtr;
+struct pb_buffer;
+struct pb_manager;
+struct svga_winsys_buffer;
+struct svga_winsys_surface;
+struct vmw_winsys_screen;
+
+
+static INLINE struct pb_buffer *
+vmw_pb_buffer(struct svga_winsys_buffer *buffer)
+{
+   assert(buffer);
+   return (struct pb_buffer *)buffer;
+}
+
+
+static INLINE struct svga_winsys_buffer *
+vmw_svga_winsys_buffer(struct pb_buffer *buffer)
+{
+   assert(buffer);
+   return (struct svga_winsys_buffer *)buffer;
+}
+
+
+struct pb_manager *
+vmw_gmr_bufmgr_create(struct vmw_winsys_screen *vws);
+
+boolean
+vmw_gmr_bufmgr_region_ptr(struct pb_buffer *buf, 
+                          struct SVGAGuestPtr *ptr);
+
+
+#endif /* VMW_BUFFER_H_ */
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_context.c b/src/gallium/winsys/drm/vmware/core/vmw_context.c
new file mode 100644
index 00000000000..b6997588de4
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_context.c
@@ -0,0 +1,297 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "svga_cmd.h"
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_debug_stack.h"
+#include "pipebuffer/pb_buffer.h"
+#include "pipebuffer/pb_validate.h"
+
+#include "svga_winsys.h"
+#include "vmw_context.h"
+#include "vmw_screen.h"
+#include "vmw_buffer.h"
+#include "vmw_surface.h"
+#include "vmw_fence.h"
+
+#define VMW_COMMAND_SIZE (64*1024)
+#define VMW_SURFACE_RELOCS (1024)
+
+#define VMW_MUST_FLUSH_STACK 8
+
+struct vmw_svga_winsys_context
+{
+   struct svga_winsys_context base;
+
+   struct vmw_winsys_screen *vws;
+
+#ifdef DEBUG
+   boolean must_flush;
+   struct debug_stack_frame must_flush_stack[VMW_MUST_FLUSH_STACK];
+#endif
+
+   struct {
+      uint8_t buffer[VMW_COMMAND_SIZE];
+      uint32_t size;
+      uint32_t used;
+      uint32_t reserved;
+   } command;
+
+   struct {
+      struct vmw_svga_winsys_surface *handles[VMW_SURFACE_RELOCS];
+      uint32_t size;
+      uint32_t used;
+      uint32_t staged;
+      uint32_t reserved;
+   } surface;
+
+   struct pb_validate *validate;
+
+   uint32_t last_fence;
+};
+
+
+static INLINE struct vmw_svga_winsys_context *
+vmw_svga_winsys_context(struct svga_winsys_context *swc)
+{
+   assert(swc);
+   return (struct vmw_svga_winsys_context *)swc;
+}
+
+
+static enum pipe_error
+vmw_swc_flush(struct svga_winsys_context *swc,
+              struct pipe_fence_handle **pfence)
+{
+   struct vmw_svga_winsys_context *vswc = vmw_svga_winsys_context(swc);
+   struct pipe_fence_handle *fence = NULL;
+   unsigned i;
+   enum pipe_error ret;
+
+   ret = pb_validate_validate(vswc->validate);
+   assert(ret == PIPE_OK);
+   if(ret == PIPE_OK) {
+
+      if (vswc->command.used)
+         vmw_ioctl_command(vswc->vws,
+                           vswc->command.buffer,
+                           vswc->command.used,
+                           &vswc->last_fence);
+
+      fence = vmw_pipe_fence(vswc->last_fence);
+
+      pb_validate_fence(vswc->validate, fence);
+   }
+
+   vswc->command.used = 0;
+   vswc->command.reserved = 0;
+
+   for(i = 0; i < vswc->surface.used + vswc->surface.staged; ++i) {
+      struct vmw_svga_winsys_surface *vsurf =
+	 vswc->surface.handles[i];
+      p_atomic_dec(&vsurf->validated);
+      vmw_svga_winsys_surface_reference(&vswc->surface.handles[i], NULL);
+   }
+
+   vswc->surface.used = 0;
+   vswc->surface.reserved = 0;
+
+#ifdef DEBUG
+   vswc->must_flush = FALSE;
+#endif
+
+   if(pfence)
+      *pfence = fence;
+
+   return ret;
+}
+
+
+static void *
+vmw_swc_reserve(struct svga_winsys_context *swc,
+                uint32_t nr_bytes, uint32_t nr_relocs )
+{
+   struct vmw_svga_winsys_context *vswc = vmw_svga_winsys_context(swc);
+
+#ifdef DEBUG
+   /* Check if somebody forgot to check the previous failure */
+   if(vswc->must_flush) {
+      debug_printf("Forgot to flush:\n");
+      debug_backtrace_dump(vswc->must_flush_stack, VMW_MUST_FLUSH_STACK);
+      assert(!vswc->must_flush);
+   }
+#endif
+
+   assert(nr_bytes <= vswc->command.size);
+   if(nr_bytes > vswc->command.size)
+      return NULL;
+
+   if(vswc->command.used + nr_bytes > vswc->command.size ||
+      vswc->surface.used + nr_relocs > vswc->surface.size) {
+#ifdef DEBUG
+      vswc->must_flush = TRUE;
+      debug_backtrace_capture(vswc->must_flush_stack, 1,
+                              VMW_MUST_FLUSH_STACK);
+#endif
+      return NULL;
+   }
+
+   assert(vswc->command.used + nr_bytes <= vswc->command.size);
+   assert(vswc->surface.used + nr_relocs <= vswc->surface.size);
+
+   vswc->command.reserved = nr_bytes;
+   vswc->surface.reserved = nr_relocs;
+   vswc->surface.staged = 0;
+
+   return vswc->command.buffer + vswc->command.used;
+}
+
+
+static void
+vmw_swc_surface_relocation(struct svga_winsys_context *swc,
+                           uint32 *where,
+                           struct svga_winsys_surface *surface,
+                           unsigned flags)
+{
+   struct vmw_svga_winsys_context *ctx = vmw_svga_winsys_context(swc);
+   struct vmw_svga_winsys_surface *vsurf;
+   struct vmw_svga_winsys_surface **slot;
+
+   if(!surface) {
+      *where = SVGA3D_INVALID_ID;
+      return;
+   }
+
+   assert(ctx->surface.staged < ctx->surface.reserved);
+   vsurf = vmw_svga_winsys_surface(surface);
+   *where = vsurf->sid;
+
+   /* Staged entries are stored right after the committed ones. */
+   slot = &ctx->surface.handles[ctx->surface.used + ctx->surface.staged++];
+   vmw_svga_winsys_surface_reference(slot, vsurf);
+   p_atomic_inc(&vsurf->validated);
+}
+
+
+static void
+vmw_swc_region_relocation(struct svga_winsys_context *swc,
+                          struct SVGAGuestPtr *where,
+                          struct svga_winsys_buffer *buffer,
+                          uint32 offset,
+                          unsigned flags)
+{
+   struct vmw_svga_winsys_context *vswc = vmw_svga_winsys_context(swc);
+   struct SVGAGuestPtr ptr;
+   struct pb_buffer *buf = vmw_pb_buffer(buffer);
+   enum pipe_error ret;
+
+   if(!vmw_gmr_bufmgr_region_ptr(buf, &ptr))
+      assert(0);
+
+   ptr.offset += offset;
+
+   *where = ptr;
+
+   ret = pb_validate_add_buffer(vswc->validate, buf, flags);
+   /* TODO: Update pipebuffer to reserve buffers and not fail here */
+   assert(ret == PIPE_OK);
+}
+
+
+static void
+vmw_swc_commit(struct svga_winsys_context *swc)
+{
+   struct vmw_svga_winsys_context *ctx = vmw_svga_winsys_context(swc);
+
+   /* Sanity-check the pending reservation before making it permanent. */
+   assert(ctx->command.reserved);
+   assert(ctx->command.used + ctx->command.reserved <= ctx->command.size);
+   assert(ctx->surface.staged <= ctx->surface.reserved);
+   assert(ctx->surface.used + ctx->surface.staged <= ctx->surface.size);
+
+   /* Reserved command bytes and staged surface relocations become used. */
+   ctx->command.used += ctx->command.reserved;
+   ctx->surface.used += ctx->surface.staged;
+   ctx->command.reserved = ctx->surface.reserved = ctx->surface.staged = 0;
+}
+
+
+static void
+vmw_swc_destroy(struct svga_winsys_context *swc)
+{
+   struct vmw_svga_winsys_context *vswc = vmw_svga_winsys_context(swc);
+   unsigned i;
+   for(i = 0; i < vswc->surface.used; ++i) {
+      p_atomic_dec(&vswc->surface.handles[i]->validated);
+      vmw_svga_winsys_surface_reference(&vswc->surface.handles[i], NULL);
+   }
+   pb_validate_destroy(vswc->validate);
+   vmw_ioctl_context_destroy(vswc->vws, swc->cid);
+   FREE(vswc);
+}
+
+
+struct svga_winsys_context *
+vmw_svga_winsys_context_create(struct svga_winsys_screen *sws)
+{
+   struct vmw_winsys_screen *vws = vmw_winsys_screen(sws);
+   struct vmw_svga_winsys_context *vswc;
+   /* Builds a winsys context for sws; returns NULL if allocation fails. */
+   vswc = CALLOC_STRUCT(vmw_svga_winsys_context);
+   if(!vswc)
+      return NULL;
+
+   /* Create the validation list first: this failure path only frees vswc,
+    * so it must run before vmw_ioctl_context_create() hands out an id. */
+   vswc->validate = pb_validate_create();
+   if(!vswc->validate) {
+      FREE(vswc);
+      return NULL;
+   }
+
+   vswc->base.destroy = vmw_swc_destroy;
+   vswc->base.reserve = vmw_swc_reserve;
+   vswc->base.surface_relocation = vmw_swc_surface_relocation;
+   vswc->base.region_relocation = vmw_swc_region_relocation;
+   vswc->base.commit = vmw_swc_commit;
+   vswc->base.flush = vmw_swc_flush;
+
+   vswc->base.cid = vmw_ioctl_context_create(vws);
+   vswc->vws = vws;
+   vswc->command.size = VMW_COMMAND_SIZE;
+   vswc->surface.size = VMW_SURFACE_RELOCS;
+
+   return &vswc->base;
+}
+
+
+struct pipe_context *
+vmw_svga_context_create(struct pipe_screen *screen)
+{
+   return svga_context_create(screen);
+}
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_context.h b/src/gallium/winsys/drm/vmware/core/vmw_context.h
new file mode 100644
index 00000000000..305ce9b5bec
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_context.h
@@ -0,0 +1,59 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef VMW_CONTEXT_H_
+#define VMW_CONTEXT_H_
+
+#include "pipe/p_compiler.h"
+
+struct svga_winsys_screen;
+struct svga_winsys_context;
+struct pipe_context;
+struct pipe_screen;
+
+#define VMW_DEBUG 0 /* set to non-zero to enable the debug helpers below */
+/* vmw_printf and VMW_FUNC forward to debug_printf only when VMW_DEBUG is set. */
+#if VMW_DEBUG
+#define vmw_printf debug_printf
+#define VMW_FUNC  debug_printf("%s\n", __FUNCTION__)
+#else
+#define VMW_FUNC /* expands to nothing */
+#define vmw_printf(...) /* arguments are discarded, never evaluated */
+#endif
+
+
+struct svga_winsys_context *
+vmw_svga_winsys_context_create(struct svga_winsys_screen *sws);
+
+struct pipe_context *
+vmw_svga_context_create(struct pipe_screen *screen);
+
+
+#endif /* VMW_CONTEXT_H_ */
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_fence.c b/src/gallium/winsys/drm/vmware/core/vmw_fence.c
new file mode 100644
index 00000000000..873dd51166c
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_fence.c
@@ -0,0 +1,108 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "util/u_memory.h"
+#include "pipebuffer/pb_buffer_fenced.h"
+
+#include "vmw_screen.h"
+#include "vmw_fence.h"
+
+
+
+struct vmw_fence_ops /* pb_fence_ops table that remembers its winsys screen */
+{
+   struct pb_fence_ops base; /* first member: vmw_fence_ops() casts the base pointer back */
+
+   struct vmw_winsys_screen *vws; /* screen passed to the vmw_ioctl_fence_* calls */
+};
+
+
+/* Downcast to the vmw_fence_ops whose first member is the given base. */
+static INLINE struct vmw_fence_ops *vmw_fence_ops(struct pb_fence_ops *ops)
+{
+   assert(ops);
+   return (struct vmw_fence_ops *)ops;
+}
+
+
+/* Store 'fence' in *ptr; the old value is overwritten, nothing is released. */
+static void vmw_fence_ops_fence_reference(struct pb_fence_ops *ops,
+                                          struct pipe_fence_handle **ptr,
+                                          struct pipe_fence_handle *fence)
+{
+   ptr[0] = fence;
+}
+
+
+/* Forward to vmw_ioctl_fence_signalled(); 'flag' is deliberately ignored. */
+static int vmw_fence_ops_fence_signalled(struct pb_fence_ops *ops,
+                                         struct pipe_fence_handle *fence,
+                                         unsigned flag)
+{
+   struct vmw_fence_ops *self = vmw_fence_ops(ops);
+   (void)flag;
+   return vmw_ioctl_fence_signalled(self->vws, vmw_fence(fence));
+}
+
+
+/* Forward to vmw_ioctl_fence_finish(); 'flag' is deliberately ignored. */
+static int vmw_fence_ops_fence_finish(struct pb_fence_ops *ops,
+                                      struct pipe_fence_handle *fence,
+                                      unsigned flag)
+{
+   struct vmw_fence_ops *self = vmw_fence_ops(ops);
+   (void)flag;
+   return vmw_ioctl_fence_finish(self->vws, vmw_fence(fence));
+}
+
+
+/* destroy hook installed by vmw_fence_ops_create(): frees the ops table. */
+static void vmw_fence_ops_destroy(struct pb_fence_ops *ops)
+{
+   FREE(ops);
+}
+
+
+/* Allocate a fence-ops table bound to vws; NULL if the allocation fails. */
+struct pb_fence_ops *vmw_fence_ops_create(struct vmw_winsys_screen *vws)
+{
+   struct vmw_fence_ops *self = CALLOC_STRUCT(vmw_fence_ops);
+   struct pb_fence_ops *base;
+
+   if (self == NULL)
+      return NULL;
+
+   base = &self->base;
+   self->vws = vws;
+   base->destroy = vmw_fence_ops_destroy;
+   base->fence_reference = vmw_fence_ops_fence_reference;
+   base->fence_signalled = vmw_fence_ops_fence_signalled;
+   base->fence_finish = vmw_fence_ops_fence_finish;
+
+   return base;
+}
+
+
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_fence.h b/src/gallium/winsys/drm/vmware/core/vmw_fence.h
new file mode 100644
index 00000000000..5357b4f61de
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_fence.h
@@ -0,0 +1,59 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#ifndef VMW_FENCE_H_
+#define VMW_FENCE_H_
+
+
+#include "pipe/p_compiler.h"
+
+
+struct pipe_fence_handle;
+struct pb_fence_ops;
+struct vmw_winsys_screen;
+
+
+/** Recover the SVGA fence number packed into a pipe_fence_handle pointer */
+static INLINE uint32_t vmw_fence(struct pipe_fence_handle *fence)
+{
+   const uintptr_t raw = (uintptr_t)fence;
+   return (uint32_t)raw;
+}
+
+
+/** Pack a SVGA fence number into a pipe_fence_handle pointer value */
+static INLINE struct pipe_fence_handle *vmw_pipe_fence(uint32_t fence)
+{
+   const uintptr_t raw = (uintptr_t)fence;
+   return (struct pipe_fence_handle *)raw;
+}
+
+
+struct pb_fence_ops *
+vmw_fence_ops_create(struct vmw_winsys_screen *vws); 
+
+
+#endif /* VMW_FENCE_H_ */
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_screen.c b/src/gallium/winsys/drm/vmware/core/vmw_screen.c
new file mode 100644
index 00000000000..911eec5e254
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_screen.c
@@ -0,0 +1,74 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "vmw_screen.h"
+
+#include "vmw_context.h"
+
+#include "util/u_memory.h"
+#include "pipe/p_compiler.h"
+
+
+/* Called from vmw_drm_create_screen(): allocates the vmw_winsys_screen,
+ * the main entity in this module, and initializes it in three stages
+ * (ioctl, pools, svga).
+ *
+ * When a stage fails, the stages that already succeeded are cleaned up
+ * in reverse order, the structure is freed and NULL is returned.
+ * Returns the new screen on success.
+ */
+struct vmw_winsys_screen *
+vmw_winsys_create( int fd )
+{
+   struct vmw_winsys_screen *screen = CALLOC_STRUCT(vmw_winsys_screen);
+
+   if (screen == NULL)
+      return NULL;
+
+   screen->ioctl.drm_fd = fd;
+
+   if (vmw_ioctl_init(screen)) {
+      if (vmw_pools_init(screen)) {
+         if (vmw_winsys_screen_init_svga(screen))
+            return screen;
+
+         vmw_pools_cleanup(screen);
+      }
+
+      vmw_ioctl_cleanup(screen);
+   }
+
+   FREE(screen);
+   return NULL;
+}
+
+/* Undo vmw_winsys_create(): pools first, then ioctl state, then the struct. */
+void vmw_winsys_destroy(struct vmw_winsys_screen *vws)
+{
+   vmw_pools_cleanup(vws);
+   vmw_ioctl_cleanup(vws);
+   FREE(vws);
+}
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_screen.h b/src/gallium/winsys/drm/vmware/core/vmw_screen.h
new file mode 100644
index 00000000000..a875107370c
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_screen.h
@@ -0,0 +1,134 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * Common definitions for the VMware SVGA winsys.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef VMW_SCREEN_H_
+#define VMW_SCREEN_H_
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "svga_winsys.h"
+
+struct pb_manager;
+struct vmw_region;
+
+
+struct vmw_winsys_screen
+{
+   struct svga_winsys_screen base; /* first member, so vmw_winsys_screen() can cast */
+
+   struct {
+      volatile uint32_t *fifo_map; /* read with SVGA_FIFO_* indices, e.g. SVGA_FIFO_FENCE */
+      uint64_t last_fence;         /* fence sequence stored by the latest vmw_ioctl_command() */
+      int drm_fd;                  /* fd given to vmw_winsys_create(); used by the ioctl wrappers */
+   } ioctl;
+
+   struct {
+      struct pb_manager *gmr;
+      struct pb_manager *gmr_mm;
+      struct pb_manager *gmr_fenced;
+   } pools; /* presumably filled in by vmw_pools_init() -- confirm there */
+};
+
+
+/* Downcast; valid because 'base' is the first member of vmw_winsys_screen. */
+static INLINE struct vmw_winsys_screen *vmw_winsys_screen(struct svga_winsys_screen *base)
+{
+   return (struct vmw_winsys_screen *)base;
+}
+
+/* Wrappers around the vmwgfx DRM ioctls (see vmw_screen_ioctl.c) */
+uint32
+vmw_ioctl_context_create(struct vmw_winsys_screen *vws);
+
+void
+vmw_ioctl_context_destroy(struct vmw_winsys_screen *vws,
+                          uint32 cid);
+
+uint32
+vmw_ioctl_surface_create(struct vmw_winsys_screen *vws,
+                              SVGA3dSurfaceFlags flags,
+                              SVGA3dSurfaceFormat format,
+                              SVGA3dSize size,
+                              uint32 numFaces,
+                              uint32 numMipLevels);
+
+void
+vmw_ioctl_surface_destroy(struct vmw_winsys_screen *vws,
+                          uint32 sid);
+
+void
+vmw_ioctl_command(struct vmw_winsys_screen *vws,
+                       void *commands,
+                       uint32_t size,
+                       uint32_t *fence);
+
+struct vmw_region *
+vmw_ioctl_region_create(struct vmw_winsys_screen *vws, uint32_t size);
+
+void
+vmw_ioctl_region_destroy(struct vmw_region *region);
+
+struct SVGAGuestPtr
+vmw_ioctl_region_ptr(struct vmw_region *region);
+
+void *
+vmw_ioctl_region_map(struct vmw_region *region);
+void
+vmw_ioctl_region_unmap(struct vmw_region *region);
+
+
+int
+vmw_ioctl_fence_finish(struct vmw_winsys_screen *vws,
+                       uint32_t fence);
+
+int
+vmw_ioctl_fence_signalled(struct vmw_winsys_screen *vws,
+                          uint32_t fence);
+
+
+/* Initialize parts of vmw_winsys_screen at startup:
+ */
+boolean vmw_ioctl_init(struct vmw_winsys_screen *vws);
+boolean vmw_pools_init(struct vmw_winsys_screen *vws);
+boolean vmw_winsys_screen_init_svga(struct vmw_winsys_screen *vws);
+
+void vmw_ioctl_cleanup(struct vmw_winsys_screen *vws);
+void vmw_pools_cleanup(struct vmw_winsys_screen *vws);
+
+struct vmw_winsys_screen *vmw_winsys_create(int fd);
+void vmw_winsys_destroy(struct vmw_winsys_screen *sws);
+
+
+#endif /* VMW_SCREEN_H_ */
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_screen_dri.c b/src/gallium/winsys/drm/vmware/core/vmw_screen_dri.c
new file mode 100644
index 00000000000..5995eee34ba
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_screen_dri.c
@@ -0,0 +1,371 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "vmw_screen.h"
+
+#include "trace/tr_drm.h"
+
+#include "vmw_screen.h"
+#include "vmw_surface.h"
+#include "vmw_fence.h"
+#include "vmw_context.h"
+
+#include <state_tracker/dri1_api.h>
+#include <state_tracker/drm_api.h>
+#include <vmwgfx_drm.h>
+#include <xf86drm.h>
+
+#include <stdio.h>
+
+static struct dri1_api dri1_api_hooks; /* tentative definition; the initializer is further down */
+static struct dri1_api_version ddx_required = { 0, 1, 0 }; /* presumably { major, minor, patch_level } -- confirm field order */
+static struct dri1_api_version ddx_compat = { 0, 0, 0 };
+static struct dri1_api_version dri_required = { 4, 0, 0 };
+static struct dri1_api_version dri_compat = { 4, 0, 0 };
+static struct dri1_api_version drm_required = { 0, 1, 0 };
+static struct dri1_api_version drm_compat = { 0, 0, 0 }; /* each required/compat pair feeds vmw_dri1_check_version() */
+
+/* TRUE if cur is acceptable; otherwise explains the mismatch on stderr. */
+static boolean vmw_dri1_check_version(const struct dri1_api_version *cur,
+                                      const struct dri1_api_version *required,
+                                      const struct dri1_api_version *compat,
+                                      const char component[])
+{
+   const boolean newer_major =
+      cur->major > required->major && cur->major <= compat->major;
+   const boolean same_major =
+      cur->major == required->major && cur->minor >= required->minor;
+
+   if (newer_major || same_major)
+      return TRUE;
+   fprintf(stderr, "%s version failure.\n", component);
+   fprintf(stderr, "%s version is %d.%d.%d and this driver can only work\n"
+           "with versions %d.%d.x through %d.x.x.\n", component, cur->major,
+           cur->minor, cur->patch_level, required->major, required->minor,
+           compat->major);
+   return FALSE;
+}
+
+/* This is actually the entrypoint to the entire driver, called by the
+ * libGL (or EGL, or ...) code via the drm_api_hooks table at the
+ * bottom of the file.
+ *
+ * A NULL arg is treated like DRM_CREATE_NORMAL.  For DRM_CREATE_DRI1 the
+ * ddx, dri and drm versions are checked in that order and the dri1 hooks
+ * are handed back through arg.  Any other mode is rejected.
+ *
+ * Returns the new pipe_screen, or NULL if a version check or either of
+ * the two creation steps fails.
+ */
+static struct pipe_screen *
+vmw_drm_create_screen(struct drm_api *drm_api,
+                      int fd,
+                      struct drm_create_screen_arg *arg)
+{
+   struct vmw_winsys_screen *vws;
+   struct pipe_screen *screen;
+
+   if (arg != NULL && arg->mode == DRM_CREATE_DRI1) {
+      struct dri1_create_screen_arg *dri1 =
+         (struct dri1_create_screen_arg *)arg;
+
+      /* Short-circuit: stop at the first incompatible component. */
+      if (!vmw_dri1_check_version(&dri1->ddx_version, &ddx_required,
+                                  &ddx_compat, "ddx - driver api") ||
+          !vmw_dri1_check_version(&dri1->dri_version, &dri_required,
+                                  &dri_compat, "dri info") ||
+          !vmw_dri1_check_version(&dri1->drm_version, &drm_required,
+                                  &drm_compat, "vmwgfx drm driver"))
+         return NULL;
+
+      /* Publish the DRI1 callbacks to the caller. */
+      dri1->api = &dri1_api_hooks;
+   } else if (arg != NULL && arg->mode != DRM_CREATE_NORMAL) {
+      /* Neither NORMAL nor DRI1. */
+      return NULL;
+   }
+
+   /* Winsys screen first, then the pipe screen on top of it. */
+   vws = vmw_winsys_create(fd);
+   if (vws == NULL)
+      return NULL;
+
+   screen = svga_screen_create(&vws->base);
+   if (screen == NULL) {
+      /* Undo vmw_winsys_create(). */
+      vmw_winsys_destroy(vws);
+      return NULL;
+   }
+
+   return screen;
+}
+
+/* dst = src intersected with bbox moved by (dst_x, dst_y).  FALSE when an
+ * axis is empty or starts below 0; dst's x fields may already be written. */
+static INLINE boolean vmw_dri1_intersect_src_bbox(struct drm_clip_rect *dst,
+                                                  int dst_x,
+                                                  int dst_y,
+                                                  const struct drm_clip_rect *src,
+                                                  const struct drm_clip_rect *bbox)
+{
+   int lo, hi;
+
+   /* X axis: larger of the two starts, smaller of the two ends. */
+   lo = (int)bbox->x1 + dst_x;
+   hi = (int)bbox->x2 + dst_x;
+   lo = ((int)src->x1 > lo) ? src->x1 : lo;
+   hi = ((int)src->x2 < hi) ? src->x2 : hi;
+   if (lo >= hi || lo < 0)
+      return FALSE;
+   dst->x1 = lo;
+   dst->x2 = hi;
+
+   /* Y axis: same computation. */
+   lo = (int)bbox->y1 + dst_y;
+   hi = (int)bbox->y2 + dst_y;
+   lo = ((int)src->y1 > lo) ? src->y1 : lo;
+   hi = ((int)src->y2 < hi) ? src->y2 : hi;
+   if (lo >= hi || lo < 0)
+      return FALSE;
+   dst->y1 = lo;
+   dst->y2 = hi;
+   return TRUE;
+}
+
+/**
+ * No fancy get-surface-from-sarea stuff here.
+ * Just use the present blit: one SVGA_3D_CMD_PRESENT per clip rect that
+ * survives vmw_dri1_intersect_src_bbox().  *p_fence receives the fence of
+ * the last command sent, or NULL when every clip rect was rejected.
+ */
+static void
+vmw_dri1_present_locked(struct pipe_context *locked_pipe,
+                        struct pipe_surface *surf,
+                        const struct drm_clip_rect *rect,
+                        unsigned int num_clip,
+                        int x_draw, int y_draw,
+                        const struct drm_clip_rect *bbox,
+                        struct pipe_fence_handle **p_fence)
+{
+   struct svga_winsys_surface *srf =
+      svga_screen_texture_get_winsys_surface(surf->texture);
+   struct vmw_svga_winsys_surface *vsrf = vmw_svga_winsys_surface(srf);
+   struct vmw_winsys_screen *vws =
+      vmw_winsys_screen(svga_winsys_screen(locked_pipe->screen));
+   struct {
+      SVGA3dCmdHeader header;
+      SVGA3dCmdPresent body;
+      SVGA3dCopyRect rect;
+   } cmd;
+   struct drm_clip_rect visible_rect;
+   boolean any_sent = FALSE;
+   uint32_t fence_seq = 0;
+   int n;
+
+   VMW_FUNC;
+   cmd.header.id = SVGA_3D_CMD_PRESENT;
+   cmd.header.size = sizeof cmd.body + sizeof cmd.rect;
+   cmd.body.sid = vsrf->sid;
+
+   for (n = 0; n < num_clip; ++n) {
+      if (vmw_dri1_intersect_src_bbox(&visible_rect, x_draw, y_draw,
+                                      &rect[n], bbox)) {
+         cmd.rect.x = visible_rect.x1;
+         cmd.rect.y = visible_rect.y1;
+         cmd.rect.w = visible_rect.x2 - visible_rect.x1;
+         cmd.rect.h = visible_rect.y2 - visible_rect.y1;
+         cmd.rect.srcx = (int)visible_rect.x1 - x_draw;
+         cmd.rect.srcy = (int)visible_rect.y1 - y_draw;
+
+         vmw_printf("%s: Clip %d x %d y %d w %d h %d srcx %d srcy %d\n",
+                    __FUNCTION__, n, cmd.rect.x, cmd.rect.y,
+                    cmd.rect.w, cmd.rect.h, cmd.rect.srcx, cmd.rect.srcy);
+
+         vmw_ioctl_command(vws, &cmd, sizeof cmd.header + cmd.header.size,
+                           &fence_seq);
+         any_sent = TRUE;
+      }
+   }
+
+   if (any_sent)
+      *p_fence = vmw_pipe_fence(fence_seq);
+   else
+      *p_fence = NULL;
+   vmw_svga_winsys_surface_reference(&vsrf, NULL);
+}
+
+/**
+ * FIXME: We'd probably want to cache these buffers in the
+ * screen, based on handle.
+ */
+
+static struct pipe_buffer *
+vmw_drm_buffer_from_handle(struct drm_api *drm_api,
+                           struct pipe_screen *screen,
+                           const char *name,
+                           unsigned handle)
+{
+    struct vmw_winsys_screen *vws =
+        vmw_winsys_screen(svga_winsys_screen(screen));
+    union drm_vmw_surface_reference_arg arg;
+    struct drm_vmw_surface_create_req *reply = &arg.rep;
+    struct vmw_svga_winsys_surface *vsrf;
+    struct pipe_buffer *buf;
+    int face;
+    int err;
+
+    /**
+     * The vmware device specific handle is the hardware SID.
+     * FIXME: We probably want to move this to the ioctl implementations.
+     */
+
+    memset(&arg, 0, sizeof(arg));
+    arg.req.sid = handle;
+
+    err = drmCommandWriteRead(vws->ioctl.drm_fd, DRM_VMW_REF_SURFACE,
+                              &arg, sizeof(arg));
+    if (err) {
+        fprintf(stderr, "Failed referencing shared surface. SID %d.\n"
+                "Error %d (%s).\n",
+                handle, err, strerror(-err));
+        return NULL;
+    }
+
+    /* Only surfaces with one mip level on face 0 and no other face pass. */
+    if (reply->mip_levels[0] != 1) {
+        fprintf(stderr, "Incorrect number of mipmap levels on shared surface."
+                " SID %d, levels %d\n",
+                handle, reply->mip_levels[0]);
+        goto drop_surface;
+    }
+
+    for (face = 1; face < DRM_VMW_MAX_SURFACE_FACES; ++face) {
+        if (reply->mip_levels[face] == 0)
+            continue;
+
+        fprintf(stderr, "Incorrect number of faces levels on shared surface."
+                " SID %d, face %d present.\n",
+                handle, face);
+        goto drop_surface;
+    }
+
+    vsrf = CALLOC_STRUCT(vmw_svga_winsys_surface);
+    if (vsrf == NULL)
+        goto drop_surface;
+
+    pipe_reference_init(&vsrf->refcnt, 1);
+    p_atomic_set(&vsrf->validated, 0);
+    vsrf->sid = handle;
+    buf = svga_screen_buffer_wrap_surface(screen, reply->format,
+                                          svga_winsys_surface(vsrf));
+    if (buf == NULL)
+        vmw_svga_winsys_surface_reference(&vsrf, NULL);
+
+    return buf;
+
+drop_surface:
+    vmw_ioctl_surface_destroy(vws, handle);
+    return NULL;
+}
+
+/* Wrap the shared surface in a pipe_buffer, then blanket a texture over it. */
+static struct pipe_texture *
+vmw_drm_texture_from_handle(struct drm_api *drm_api,
+                            struct pipe_screen *screen,
+                            struct pipe_texture *templat,
+                            const char *name,
+                            unsigned stride,
+                            unsigned handle)
+{
+    struct pipe_buffer *wrapped =
+        vmw_drm_buffer_from_handle(drm_api, screen, name, handle);
+
+    if (wrapped == NULL)
+        return NULL;
+    return screen->texture_blanket(screen, templat, &stride, wrapped);
+}
+
+/* Report the SID behind 'buffer' via *handle; FALSE if it has no surface. */
+static boolean
+vmw_drm_handle_from_buffer(struct drm_api *drm_api,
+                           struct pipe_screen *screen,
+                           struct pipe_buffer *buffer,
+                           unsigned *handle)
+{
+    struct vmw_svga_winsys_surface *vsrf;
+    struct svga_winsys_surface *surface;
+
+    surface = svga_screen_buffer_get_winsys_surface(buffer);
+    if (surface == NULL)
+        return FALSE;
+    vsrf = vmw_svga_winsys_surface(surface);
+    *handle = vsrf->sid;
+    vmw_svga_winsys_surface_reference(&vsrf, NULL);
+    return TRUE;
+}
+
+/* Look up the texture's buffer (and stride), then report that buffer's handle. */
+static boolean
+vmw_drm_handle_from_texture(struct drm_api *drm_api,
+                            struct pipe_screen *screen,
+                            struct pipe_texture *texture,
+                            unsigned *stride,
+                            unsigned *handle)
+{
+    struct pipe_buffer *backing;
+
+    if (svga_screen_buffer_from_texture(texture, &backing, stride))
+        return vmw_drm_handle_from_buffer(drm_api, screen, backing, handle);
+    return FALSE;
+}
+
+/* create_context hook of vmw_drm_api_hooks; defers to vmw_svga_context_create(). */
+static struct pipe_context *vmw_drm_create_context(struct drm_api *drm_api,
+                                                   struct pipe_screen *screen)
+{
+   return vmw_svga_context_create(screen);
+}
+
+static struct dri1_api dri1_api_hooks = { /* handed to DRI1 callers by vmw_drm_create_screen() */
+   .front_srf_locked = NULL, /* no implementation supplied */
+   .present_locked = vmw_dri1_present_locked
+};
+
+static struct drm_api vmw_drm_api_hooks = { /* returned, via the trace wrapper, by drm_api_create() */
+   .create_screen = vmw_drm_create_screen,
+   .create_context = vmw_drm_create_context,
+   .texture_from_shared_handle = vmw_drm_texture_from_handle,
+   .shared_handle_from_texture = vmw_drm_handle_from_texture,
+   .local_handle_from_texture = vmw_drm_handle_from_texture, /* same handler as the shared-handle hook */
+};
+
+struct drm_api *drm_api_create(void)
+{
+   return trace_drm_create(&vmw_drm_api_hooks); /* vmware hooks, passed through trace_drm_create() */
+}
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_screen_ioctl.c b/src/gallium/winsys/drm/vmware/core/vmw_screen_ioctl.c
new file mode 100644
index 00000000000..51e455f9254
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_screen_ioctl.c
@@ -0,0 +1,504 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ *
+ * Wrappers for DRM ioctl functionality used by the rest of the vmw
+ * drm winsys.
+ *
+ * Based on svgaicd_escape.c
+ */
+
+
+#include "svga_cmd.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "svgadump/svga_dump.h"
+#include "vmw_screen.h"
+#include "vmw_context.h"
+#include "xf86drm.h"
+#include "vmwgfx_drm.h"
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <unistd.h>
+
+struct vmw_region
+{
+   SVGAGuestPtr ptr;    /* gmrId/offset taken from the DRM_VMW_ALLOC_DMABUF reply */
+   uint32_t handle;     /* handle from the same reply */
+   uint64_t map_handle;
+   void *data;          /* NULL right after creation; presumably the CPU mapping -- confirm in vmw_ioctl_region_map() */
+   uint32_t map_count;
+   int drm_fd;
+   uint32_t size;
+};
+
+/* Debug aid, currently switched off by the unconditional return below: the
+ * remaining code would run DRM_VMW_FIFO_DEBUG and dump the returned buffer. */
+static void vmw_check_last_cmd(struct vmw_winsys_screen *vws)
+{
+   static uint32_t dump[16384];
+   struct drm_vmw_fifo_debug_arg arg;
+   int err;
+
+   return;
+
+   memset(&arg, 0, sizeof(arg));
+   arg.debug_buffer = (unsigned long)dump;
+   arg.debug_buffer_size = sizeof(dump);
+
+   err = drmCommandWriteRead(vws->ioctl.drm_fd, DRM_VMW_FIFO_DEBUG,
+                             &arg, sizeof(arg));
+   if (err) {
+      debug_printf("%s Ioctl error: \"%s\".\n", __FUNCTION__, strerror(-err));
+      return;
+   }
+
+   if (arg.did_not_fit)
+      debug_printf("%s Command did not fit completely.\n", __FUNCTION__);
+
+   svga_dump_commands(dump, arg.used_size);
+}
+
+/* Unmap one page at 'mapping'; the munmap() result is ignored. */
+static void vmw_ioctl_fifo_unmap(struct vmw_winsys_screen *vws, void *mapping)
+{
+   VMW_FUNC;
+   (void)munmap(mapping, getpagesize());
+}
+
+
+/* Map one read-only, shared page of the DRM fd at fifo_offset.
+ * Returns NULL (after logging errno) if mmap() fails. */
+static void *vmw_ioctl_fifo_map(struct vmw_winsys_screen *vws,
+                                uint32_t fifo_offset )
+{
+   void *page;
+
+   VMW_FUNC;
+
+   page = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED,
+               vws->ioctl.drm_fd, fifo_offset);
+
+   if (page != MAP_FAILED) {
+      vmw_printf("Fifo (min) is 0x%08x\n", ((uint32_t *) page)[SVGA_FIFO_MIN]);
+      return page;
+   }
+
+   debug_printf("Map failed %s\n", strerror(errno));
+   return NULL;
+}
+
+/* Ask the kernel for a new context id.  Failure is reported as -1, which
+ * the uint32 return type turns into the all-ones value. */
+uint32
+vmw_ioctl_context_create(struct vmw_winsys_screen *vws)
+{
+   struct drm_vmw_context_arg reply;
+   int err;
+
+   VMW_FUNC;
+
+   err = drmCommandRead(vws->ioctl.drm_fd, DRM_VMW_CREATE_CONTEXT,
+                        &reply, sizeof(reply));
+   if (err != 0)
+      return -1;
+
+   vmw_check_last_cmd(vws);
+   vmw_printf("Context id is %d\n", reply.cid);
+   return reply.cid;
+}
+
+/* Issue DRM_VMW_UNREF_CONTEXT for 'cid'; the ioctl result is ignored. */
+void vmw_ioctl_context_destroy(struct vmw_winsys_screen *vws, uint32 cid)
+{
+   struct drm_vmw_context_arg request;
+
+   VMW_FUNC;
+
+   memset(&request, 0, sizeof(request));
+   request.cid = cid;
+
+   (void)drmCommandWrite(vws->ioctl.drm_fd, DRM_VMW_UNREF_CONTEXT,
+                         &request, sizeof(request));
+
+   vmw_check_last_cmd(vws);
+}
+
+/* Create a shareable surface through DRM_VMW_CREATE_SURFACE.  Every face
+ * gets numMipLevels levels, each half the previous size (clamped to 1).
+ * Returns the new surface id, or -1 (as uint32) if the ioctl fails. */
+uint32
+vmw_ioctl_surface_create(struct vmw_winsys_screen *vws,
+                         SVGA3dSurfaceFlags flags,
+                         SVGA3dSurfaceFormat format,
+                         SVGA3dSize size,
+                         uint32_t numFaces, uint32_t numMipLevels)
+{
+   union drm_vmw_surface_create_arg s_arg;
+   struct drm_vmw_surface_create_req *req = &s_arg.req;
+   struct drm_vmw_surface_arg *rep = &s_arg.rep;
+   struct drm_vmw_size sizes[DRM_VMW_MAX_SURFACE_FACES*
+                             DRM_VMW_MAX_MIP_LEVELS];
+   struct drm_vmw_size *cur_size;
+   uint32_t iFace;
+   uint32_t iMipLevel;
+   int ret;
+
+   vmw_printf("%s flags %d format %d\n", __FUNCTION__, flags, format);
+
+   /* Zeroes the whole request, including mip_levels[] of unused faces. */
+   memset(&s_arg, 0, sizeof(s_arg));
+   req->flags = (uint32_t) flags;
+   req->format = (uint32_t) format;
+   req->shareable = 1;
+
+   /* numFaces indexes req->mip_levels[]; sizes[] holds exactly
+    * FACES * MIP_LEVELS entries, so a completely full array is valid. */
+   assert(numFaces <= DRM_VMW_MAX_SURFACE_FACES);
+   assert(numFaces * numMipLevels <= DRM_VMW_MAX_SURFACE_FACES*
+          DRM_VMW_MAX_MIP_LEVELS);
+   cur_size = sizes;
+   for (iFace = 0; iFace < numFaces; ++iFace) {
+      SVGA3dSize mipSize = size;
+
+      req->mip_levels[iFace] = numMipLevels;
+      for (iMipLevel = 0; iMipLevel < numMipLevels; ++iMipLevel) {
+         cur_size->width = mipSize.width;
+         cur_size->height = mipSize.height;
+         cur_size->depth = mipSize.depth;
+         mipSize.width = MAX2(mipSize.width >> 1, 1);
+         mipSize.height = MAX2(mipSize.height >> 1, 1);
+         mipSize.depth = MAX2(mipSize.depth >> 1, 1);
+         cur_size++;
+      }
+   }
+
+   req->size_addr = (unsigned long)&sizes;
+   ret = drmCommandWriteRead(vws->ioctl.drm_fd, DRM_VMW_CREATE_SURFACE,
+                             &s_arg, sizeof(s_arg));
+   if (ret)
+      return -1;
+   vmw_printf("Surface id is %d\n", rep->sid);
+   vmw_check_last_cmd(vws);
+   return rep->sid;
+}
+
+/* Issue DRM_VMW_UNREF_SURFACE for 'sid'; the ioctl result is ignored. */
+void vmw_ioctl_surface_destroy(struct vmw_winsys_screen *vws, uint32 sid)
+{
+   struct drm_vmw_surface_arg request;
+
+   VMW_FUNC;
+
+   memset(&request, 0, sizeof(request));
+   request.sid = sid;
+
+   (void)drmCommandWrite(vws->ioctl.drm_fd, DRM_VMW_UNREF_SURFACE,
+                         &request, sizeof(request));
+
+   vmw_check_last_cmd(vws);
+}
+
+/* Submit 'size' bytes of commands via DRM_VMW_EXECBUF (retried on -ERESTART).
+ * The fence sequence lands in vws->ioctl.last_fence and, if set, *pfence. */
+void
+vmw_ioctl_command(struct vmw_winsys_screen *vws, void *commands, uint32_t size,
+                  uint32_t * pfence)
+{
+   struct drm_vmw_execbuf_arg arg;
+   struct drm_vmw_fence_rep rep;
+   int ret;
+
+#ifdef DEBUG
+   {
+      static boolean firsttime = TRUE;
+      static boolean debug = FALSE;
+      static boolean skip = FALSE;
+      if (firsttime) {
+         debug = debug_get_bool_option("SVGA_DUMP_CMD", FALSE);
+         skip = debug_get_bool_option("SVGA_SKIP_CMD", FALSE);
+         firsttime = FALSE;
+      }
+      if (debug) {
+         VMW_FUNC;
+         svga_dump_commands(commands, size);
+      }
+      if (skip)
+         size = 0;
+   }
+#endif
+
+   memset(&arg, 0, sizeof(arg));
+   memset(&rep, 0, sizeof(rep));
+
+   /* Pre-seed the error; the -EFAULT check below relies on this value. */
+   rep.error = -EFAULT;
+   arg.fence_rep = (unsigned long)&rep;
+   arg.commands = (unsigned long)commands;
+   arg.command_size = size;
+
+   do {
+      ret = drmCommandWrite(vws->ioctl.drm_fd, DRM_VMW_EXECBUF,
+                            &arg, sizeof(arg));
+   } while (ret == -ERESTART);
+
+   if (ret != 0)
+      debug_printf("%s error %s.\n", __FUNCTION__, strerror(-ret));
+
+   if (rep.error != 0) {
+      /*
+       * Kernel has synced and put the last fence sequence in the FIFO
+       * register.
+       */
+      if (rep.error == -EFAULT)
+         rep.fence_seq = vws->ioctl.fifo_map[SVGA_FIFO_FENCE];
+
+      debug_printf("%s Fence error %s.\n", __FUNCTION__,
+                   strerror(-rep.error));
+   }
+
+   vws->ioctl.last_fence = rep.fence_seq;
+   if (pfence != NULL)
+      *pfence = rep.fence_seq;
+   vmw_check_last_cmd(vws);
+}
+
+
+/**
+ * Allocate a DMA buffer in the kernel and wrap it in a vmw_region.
+ *
+ * The region is returned unmapped (data == NULL, map_count == 0); use
+ * vmw_ioctl_region_map() to obtain a CPU pointer.
+ *
+ * @param vws   winsys screen providing the DRM file descriptor.
+ * @param size  requested buffer size, passed to DRM_VMW_ALLOC_DMABUF.
+ * @return the new region, or NULL if allocation or the ioctl failed.
+ */
+struct vmw_region *
+vmw_ioctl_region_create(struct vmw_winsys_screen *vws, uint32_t size)
+{
+   union drm_vmw_alloc_dmabuf_arg arg;
+   struct vmw_region *region;
+   int ret;
+
+   vmw_printf("%s: size = %u\n", __FUNCTION__, size);
+
+   region = CALLOC_STRUCT(vmw_region);
+   if (region == NULL)
+      return NULL;
+
+   memset(&arg, 0, sizeof arg);
+   arg.req.size = size;
+
+   /* Retry for as long as the ioctl reports -ERESTART. */
+   do {
+      ret = drmCommandWriteRead(vws->ioctl.drm_fd, DRM_VMW_ALLOC_DMABUF,
+                                &arg, sizeof arg);
+   } while (ret == -ERESTART);
+
+   if (ret != 0) {
+      debug_printf("IOCTL failed %d: %s\n", ret, strerror(-ret));
+      FREE(region);
+      return NULL;
+   }
+
+   /* Copy the kernel's reply (arg.rep overlays arg.req) into the region. */
+   region->ptr.gmrId = arg.rep.cur_gmr_id;
+   region->ptr.offset = arg.rep.cur_gmr_offset;
+   region->handle = arg.rep.handle;
+   region->map_handle = arg.rep.map_handle;
+   region->data = NULL;
+   region->map_count = 0;
+   region->size = size;
+   region->drm_fd = vws->ioctl.drm_fd;
+
+   vmw_printf("   gmrId = %u, offset = %u\n",
+              region->ptr.gmrId, region->ptr.offset);
+
+   return region;
+}
+
+/**
+ * Release a region created by vmw_ioctl_region_create().
+ *
+ * Removes the CPU mapping if one exists, unreferences the kernel buffer
+ * with DRM_VMW_UNREF_DMABUF and frees the region structure.
+ *
+ * @param region  region to destroy; must not be used afterwards.
+ */
+void
+vmw_ioctl_region_destroy(struct vmw_region *region)
+{
+   struct drm_vmw_unref_dmabuf_arg unref;
+
+   vmw_printf("%s: gmrId = %u, offset = %u\n", __FUNCTION__,
+              region->ptr.gmrId, region->ptr.offset);
+
+   if (region->data != NULL) {
+      munmap(region->data, region->size);
+      region->data = NULL;
+   }
+
+   memset(&unref, 0, sizeof unref);
+   unref.handle = region->handle;
+   drmCommandWrite(region->drm_fd, DRM_VMW_UNREF_DMABUF,
+                   &unref, sizeof unref);
+
+   FREE(region);
+}
+
+/**
+ * Return, by value, the guest pointer (gmrId/offset pair) of a region.
+ */
+SVGAGuestPtr
+vmw_ioctl_region_ptr(struct vmw_region *region)
+{
+   SVGAGuestPtr guest_ptr = region->ptr;
+
+   return guest_ptr;
+}
+
+/**
+ * Map a region into the CPU address space.
+ *
+ * The mmap() is performed on first use only and then cached in
+ * region->data; every successful call increments region->map_count.
+ *
+ * @param region  region to map.
+ * @return the mapped address, or NULL if mmap() failed (map_count is
+ *         left untouched in that case).
+ */
+void *
+vmw_ioctl_region_map(struct vmw_region *region)
+{
+   vmw_printf("%s: gmrId = %u, offset = %u\n", __FUNCTION__,
+              region->ptr.gmrId, region->ptr.offset);
+
+   if (region->data == NULL) {
+      void *addr = mmap(NULL, region->size, PROT_READ | PROT_WRITE,
+                        MAP_SHARED, region->drm_fd, region->map_handle);
+
+      if (addr == MAP_FAILED) {
+         debug_printf("%s: Map failed.\n", __FUNCTION__);
+         return NULL;
+      }
+      region->data = addr;
+   }
+
+   region->map_count++;
+   return region->data;
+}
+
+/**
+ * Undo one vmw_ioctl_region_map() call.
+ *
+ * Only the map counter is decremented; the mapping itself stays in place
+ * until vmw_ioctl_region_destroy() unmaps it.
+ */
+void
+vmw_ioctl_region_unmap(struct vmw_region *region)
+{
+   vmw_printf("%s: gmrId = %u, offset = %u\n", __FUNCTION__,
+              region->ptr.gmrId, region->ptr.offset);
+
+   region->map_count--;
+}
+
+
+/**
+ * Check whether a fence has been passed, without blocking.
+ *
+ * The fence is compared against the SVGA_FIFO_FENCE register of the
+ * mapped FIFO using a signed 32-bit difference.
+ *
+ * @param vws    winsys screen providing the FIFO mapping.
+ * @param fence  fence sequence to test; 0 is rejected by an assert and
+ *               otherwise reported as passed.
+ * @return 0 if the fence has passed, -1 if it has not.
+ */
+int
+vmw_ioctl_fence_signalled(struct vmw_winsys_screen *vws,
+                          uint32_t fence)
+{
+   uint32_t fifo_fence;
+
+   assert(fence);
+   if (fence == 0)
+      return 0;
+
+   fifo_fence = vws->ioctl.fifo_map[SVGA_FIFO_FENCE];
+
+   /* A non-negative signed difference means the fence was passed. */
+   return ((int32)(fifo_fence - fence) >= 0) ? 0 : -1;
+}
+
+
+/**
+ * Block in the kernel until a fence sequence has been reached.
+ *
+ * Returns immediately, without an ioctl, when the unsigned distance from
+ * the fence to the current FIFO fence register is below 1 << 24.
+ * Otherwise DRM_VMW_FENCE_WAIT is issued and retried on -ERESTART; any
+ * other result of the ioctl is ignored.
+ *
+ * @param vws    winsys screen providing the DRM fd and the FIFO mapping.
+ * @param fence  fence sequence to wait for.
+ */
+static void
+vmw_ioctl_sync(struct vmw_winsys_screen *vws, 
+               uint32_t fence)
+{
+   struct drm_vmw_fence_wait_arg wait_arg;
+   uint32_t fifo_fence;
+   int err;
+
+   vmw_printf("%s: fence = %lu\n", __FUNCTION__,
+              (unsigned long)fence);
+
+   fifo_fence = vws->ioctl.fifo_map[SVGA_FIFO_FENCE];
+   vmw_printf("%s: Fence id read is 0x%08x\n", __FUNCTION__,
+              (unsigned int)fifo_fence);
+
+   if ((fifo_fence - fence) < (1 << 24))
+      return;
+
+   memset(&wait_arg, 0, sizeof wait_arg);
+   wait_arg.sequence = fence;
+
+   do {
+      err = drmCommandWriteRead(vws->ioctl.drm_fd, DRM_VMW_FENCE_WAIT,
+                                &wait_arg, sizeof wait_arg);
+   } while (err == -ERESTART);
+}
+
+
+/**
+ * Wait for a fence to be passed.
+ *
+ * Fences already reported as passed by vmw_ioctl_fence_signalled() are
+ * not waited on.
+ *
+ * @param vws    winsys screen.
+ * @param fence  fence sequence to wait for; 0 is rejected by an assert
+ *               and otherwise ignored.
+ * @return always 0.
+ */
+int
+vmw_ioctl_fence_finish(struct vmw_winsys_screen *vws,
+                       uint32_t fence)
+{
+   assert(fence);
+
+   if (fence && vmw_ioctl_fence_signalled(vws, fence) != 0)
+      vmw_ioctl_sync(vws, fence);
+
+   return 0;
+}
+
+
+/**
+ * Initialise the ioctl layer of a winsys screen.
+ *
+ * Queries DRM_VMW_PARAM_FIFO_OFFSET from the kernel and maps the FIFO at
+ * that offset into vws->ioctl.fifo_map.
+ *
+ * @param vws  winsys screen whose ioctl.drm_fd is already set.
+ * @return TRUE on success, FALSE if the query or the mapping failed.
+ */
+boolean
+vmw_ioctl_init(struct vmw_winsys_screen *vws)
+{
+   struct drm_vmw_getparam_arg param;
+   int ret;
+
+   VMW_FUNC;
+
+   memset(&param, 0, sizeof param);
+   param.param = DRM_VMW_PARAM_FIFO_OFFSET;
+   ret = drmCommandWriteRead(vws->ioctl.drm_fd, DRM_VMW_GET_PARAM,
+                             &param, sizeof param);
+
+   if (ret) {
+      debug_printf("GET_PARAM on %d returned %d: %s\n",
+                   vws->ioctl.drm_fd, ret, strerror(-ret));
+   }
+   else {
+      vmw_printf("Offset to map is 0x%08llx\n",
+                 (unsigned long long)param.value);
+
+      vws->ioctl.fifo_map = vmw_ioctl_fifo_map(vws, param.value);
+      if (vws->ioctl.fifo_map != NULL) {
+         vmw_printf("%s OK\n", __FUNCTION__);
+         return TRUE;
+      }
+   }
+
+   /* Common failure exit for both the query and the mapping. */
+   debug_printf("%s Failed\n", __FUNCTION__);
+   return FALSE;
+}
+
+
+
+/**
+ * Undo vmw_ioctl_init(): unmap the FIFO mapping of the screen.
+ */
+void
+vmw_ioctl_cleanup(struct vmw_winsys_screen *vws)
+{
+   void *fifo;
+
+   VMW_FUNC;
+
+   fifo = (void *)vws->ioctl.fifo_map;
+   vmw_ioctl_fifo_unmap(vws, fifo);
+}
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_screen_pools.c b/src/gallium/winsys/drm/vmware/core/vmw_screen_pools.c
new file mode 100644
index 00000000000..b1c24b0cb6a
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_screen_pools.c
@@ -0,0 +1,79 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "vmw_screen.h"
+
+#include "vmw_buffer.h"
+#include "vmw_fence.h"
+
+#include "pipebuffer/pb_buffer.h"
+#include "pipebuffer/pb_bufmgr.h"
+
+/**
+ * Destroy the buffer manager stack of a winsys screen.
+ *
+ * Also serves as the error path of vmw_pools_init(), so it copes with a
+ * partially built stack.  Every pool pointer is reset to NULL once its
+ * manager is gone, which makes a repeated call harmless.
+ *
+ * @param vws  winsys screen whose pools are to be destroyed.
+ */
+void
+vmw_pools_cleanup(struct vmw_winsys_screen *vws)
+{
+   if(vws->pools.gmr_fenced) {
+      /* Destroying the fenced manager takes the gmr_mm pool down with it. */
+      vws->pools.gmr_fenced->destroy(vws->pools.gmr_fenced);
+   }
+   else if(vws->pools.gmr_mm) {
+      /*
+       * Initialisation stopped before a fenced manager took over gmr_mm,
+       * so it has to be destroyed explicitly or it would leak.
+       */
+      vws->pools.gmr_mm->destroy(vws->pools.gmr_mm);
+   }
+   vws->pools.gmr_fenced = NULL;
+   vws->pools.gmr_mm = NULL;
+
+   if(vws->pools.gmr)
+      vws->pools.gmr->destroy(vws->pools.gmr);
+   vws->pools.gmr = NULL;
+}
+
+
+/**
+ * Build the buffer manager stack of a winsys screen.
+ *
+ * The stack is, bottom to top: the GMR manager, a 16 MiB sub-allocating
+ * manager with 4096-byte alignment on top of it, and a fenced manager
+ * (wrapped in a debug manager in DEBUG builds).
+ *
+ * @param vws  winsys screen to initialise.
+ * @return TRUE on success; on failure whatever was created is torn down
+ *         through vmw_pools_cleanup() and FALSE is returned.
+ */
+boolean
+vmw_pools_init(struct vmw_winsys_screen *vws)
+{
+   vws->pools.gmr = vmw_gmr_bufmgr_create(vws);
+   if(!vws->pools.gmr)
+      goto error;
+
+   vws->pools.gmr_mm = mm_bufmgr_create(vws->pools.gmr,
+                                        16*1024*1024,
+                                        12 /* 4096 alignment */);
+   if(!vws->pools.gmr_mm)
+      goto error;
+
+   vws->pools.gmr_fenced = fenced_bufmgr_create(
+      vws->pools.gmr_mm,
+      vmw_fence_ops_create(vws));
+   if(!vws->pools.gmr_fenced)
+      goto error;
+
+#ifdef DEBUG
+   {
+      /*
+       * Keep the fenced manager reachable until the wrapper exists:
+       * overwriting gmr_fenced with a NULL result would leak it.
+       */
+      struct pb_manager *debug_mgr;
+
+      debug_mgr = pb_debug_manager_create(vws->pools.gmr_fenced,
+                                          4096,
+                                          4096);
+      if(!debug_mgr)
+         goto error;
+
+      vws->pools.gmr_fenced = debug_mgr;
+   }
+#endif
+
+   return TRUE;
+
+error:
+   vmw_pools_cleanup(vws);
+   return FALSE;
+}
+
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_screen_svga.c b/src/gallium/winsys/drm/vmware/core/vmw_screen_svga.c
new file mode 100644
index 00000000000..d7d008859b3
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_screen_svga.c
@@ -0,0 +1,295 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * This file implements the SVGA interface into this winsys, defined
+ * in drivers/svga/svga_winsys.h.
+ *
+ * @author Keith Whitwell
+ * @author Jose Fonseca
+ */
+
+
+#include "svga_cmd.h"
+#include "svga3d_caps.h"
+
+#include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipebuffer/pb_buffer.h"
+#include "pipebuffer/pb_bufmgr.h"
+#include "svga_winsys.h"
+#include "vmw_context.h"
+#include "vmw_screen.h"
+#include "vmw_surface.h"
+#include "vmw_buffer.h"
+#include "vmw_fence.h"
+
+
+/**
+ * buffer_create hook of the SVGA winsys screen.
+ *
+ * Allocates a buffer from the screen's fenced GMR pool.
+ *
+ * @param sws        SVGA winsys screen.
+ * @param alignment  alignment stored in the pb_desc.
+ * @param usage      usage flags stored in the pb_desc.
+ * @param size       requested buffer size.
+ * @return the new buffer, or NULL if the pool could not provide one.
+ */
+static struct svga_winsys_buffer *
+vmw_svga_winsys_buffer_create(struct svga_winsys_screen *sws,
+                              unsigned alignment,
+                              unsigned usage,
+                              unsigned size)
+{
+   struct vmw_winsys_screen *vws = vmw_winsys_screen(sws);
+   struct pb_manager *pool = vws->pools.gmr_fenced;
+   struct pb_buffer *pb_buf;
+   struct pb_desc desc;
+
+   assert(pool);
+
+   memset(&desc, 0, sizeof desc);
+   desc.alignment = alignment;
+   desc.usage = usage;
+
+   pb_buf = pool->create_buffer(pool, size, &desc);
+   if (pb_buf == NULL)
+      return NULL;
+
+   return vmw_svga_winsys_buffer(pb_buf);
+}
+
+
+/**
+ * buffer_map hook: map the underlying pb_buffer with the given flags.
+ * The screen argument is unused.
+ */
+static void *
+vmw_svga_winsys_buffer_map(struct svga_winsys_screen *sws,
+                           struct svga_winsys_buffer *buf,
+                           unsigned flags)
+{
+   struct pb_buffer *pb_buf = vmw_pb_buffer(buf);
+
+   (void)sws;
+   return pb_map(pb_buf, flags);
+}
+
+
+/**
+ * buffer_unmap hook: unmap the underlying pb_buffer.
+ * The screen argument is unused.
+ */
+static void
+vmw_svga_winsys_buffer_unmap(struct svga_winsys_screen *sws,
+                             struct svga_winsys_buffer *buf)
+{
+   struct pb_buffer *pb_buf = vmw_pb_buffer(buf);
+
+   (void)sws;
+   pb_unmap(pb_buf);
+}
+
+
+/**
+ * buffer_destroy hook: drop the caller's reference on the underlying
+ * pb_buffer.  The screen argument is unused.
+ */
+static void
+vmw_svga_winsys_buffer_destroy(struct svga_winsys_screen *sws,
+                               struct svga_winsys_buffer *buf)
+{
+   struct pb_buffer *released = vmw_pb_buffer(buf);
+
+   (void)sws;
+
+   /* Re-pointing the local at NULL releases the reference it held. */
+   pb_reference(&released, NULL);
+}
+
+
+/**
+ * fence_reference hook: plain handle assignment, no reference counting
+ * is performed.  The screen argument is unused.
+ */
+static void
+vmw_svga_winsys_fence_reference(struct svga_winsys_screen *sws,
+                                struct pipe_fence_handle **pdst,
+                                struct pipe_fence_handle *src)
+{
+   (void)sws;
+
+   pdst[0] = src;
+}
+
+
+/**
+ * fence_signalled hook: forwards to vmw_ioctl_fence_signalled() and
+ * returns its result.  The flag argument is unused.
+ */
+static int
+vmw_svga_winsys_fence_signalled(struct svga_winsys_screen *sws,
+                                struct pipe_fence_handle *fence,
+                                unsigned flag)
+{
+   (void)flag;
+
+   return vmw_ioctl_fence_signalled(vmw_winsys_screen(sws),
+                                    vmw_fence(fence));
+}
+
+
+/**
+ * fence_finish hook: forwards to vmw_ioctl_fence_finish() and returns
+ * its result.  The flag argument is unused.
+ */
+static int
+vmw_svga_winsys_fence_finish(struct svga_winsys_screen *sws,
+                             struct pipe_fence_handle *fence,
+                             unsigned flag)
+{
+   (void)flag;
+
+   return vmw_ioctl_fence_finish(vmw_winsys_screen(sws),
+                                 vmw_fence(fence));
+}
+
+
+
+/**
+ * surface_create hook of the SVGA winsys screen.
+ *
+ * Allocates the winsys surface object with one reference and a cleared
+ * validated counter, then creates the kernel surface through
+ * vmw_ioctl_surface_create().
+ *
+ * @return the new surface, or NULL if the allocation failed or the
+ *         kernel returned SVGA3D_INVALID_ID.
+ */
+static struct svga_winsys_surface *
+vmw_svga_winsys_surface_create(struct svga_winsys_screen *sws,
+                               SVGA3dSurfaceFlags flags,
+                               SVGA3dSurfaceFormat format,
+                               SVGA3dSize size,
+                               uint32 numFaces,
+                               uint32 numMipLevels)
+{
+   struct vmw_winsys_screen *vws = vmw_winsys_screen(sws);
+   struct vmw_svga_winsys_surface *vsurf;
+
+   vsurf = CALLOC_STRUCT(vmw_svga_winsys_surface);
+   if (vsurf == NULL)
+      return NULL;
+
+   pipe_reference_init(&vsurf->refcnt, 1);
+   p_atomic_set(&vsurf->validated, 0);
+   vsurf->screen = vws;
+   vsurf->sid = vmw_ioctl_surface_create(vws,
+                                         flags, format, size,
+                                         numFaces, numMipLevels);
+   if (vsurf->sid == SVGA3D_INVALID_ID) {
+      FREE(vsurf);
+      return NULL;
+   }
+
+   return svga_winsys_surface(vsurf);
+}
+
+
+/**
+ * surface_is_flushed hook: a surface counts as flushed when its
+ * validated counter reads zero.
+ */
+static boolean
+vmw_svga_winsys_surface_is_flushed(struct svga_winsys_screen *sws,
+                                   struct svga_winsys_surface *surface)
+{
+   struct vmw_svga_winsys_surface *vsurf;
+
+   vsurf = vmw_svga_winsys_surface(surface);
+   return (p_atomic_read(&vsurf->validated) == 0);
+}
+
+
+/**
+ * surface_reference hook: converts between the opaque and the winsys
+ * surface types around vmw_svga_winsys_surface_reference().
+ */
+static void
+vmw_svga_winsys_surface_ref(struct svga_winsys_screen *sws,
+                            struct svga_winsys_surface **pDst,
+                            struct svga_winsys_surface *src)
+{
+   struct vmw_svga_winsys_surface *dst = vmw_svga_winsys_surface(*pDst);
+
+   vmw_svga_winsys_surface_reference(&dst, vmw_svga_winsys_surface(src));
+
+   /* Hand the (possibly updated) pointer back in its opaque form. */
+   *pDst = svga_winsys_surface(dst);
+}
+
+
+/**
+ * destroy hook: forwards to vmw_winsys_destroy().
+ */
+static void
+vmw_svga_winsys_destroy(struct svga_winsys_screen *sws)
+{
+   vmw_winsys_destroy(vmw_winsys_screen(sws));
+}
+
+
+/**
+ * get_cap hook: look up a 3D device capability in the FIFO caps block.
+ *
+ * The caps block is a sequence of records, each starting with its own
+ * length in 32-bit words and terminated by a zero length.  Among the
+ * records whose type lies in the DEVCAPS range the one with the highest
+ * type is used; its (index, value) pairs are binary-searched.
+ *
+ * Record lengths come straight from the FIFO mapping, so the walk is
+ * bounded by SVGA_FIFO_3D_CAPS_SIZE rather than relying on an assert,
+ * and a record too short to hold its header is rejected.
+ *
+ * @param sws     SVGA winsys screen.
+ * @param index   capability to look up.
+ * @param result  receives the capability value on success.
+ * @return TRUE if the capability was found, FALSE otherwise (no FIFO
+ *         mapping, hardware version too old, no usable record, or index
+ *         not present).
+ */
+static boolean
+vmw_svga_winsys_get_cap(struct svga_winsys_screen *sws,
+                        SVGA3dDevCapIndex index,
+                        SVGA3dDevCapResult *result)
+{
+   struct vmw_winsys_screen *vws = vmw_winsys_screen(sws);
+   const uint32 *capsBlock;
+   const SVGA3dCapsRecord *capsRecord = NULL;
+   uint32 offset;
+   const SVGA3dCapPair *capArray;
+   int numCaps, first, last;
+
+   if(!vws->ioctl.fifo_map)
+      return FALSE;
+
+   if(vws->ioctl.fifo_map[SVGA_FIFO_3D_HWVERSION] < SVGA3D_HWVERSION_WS6_B1)
+      return FALSE;
+
+   /*
+    * Search linearly through the caps block records for the specified type.
+    */
+   capsBlock = (const uint32 *)&vws->ioctl.fifo_map[SVGA_FIFO_3D_CAPS];
+   offset = 0;
+   while (offset < SVGA_FIFO_3D_CAPS_SIZE && capsBlock[offset] != 0) {
+      const uint32 length = capsBlock[offset];
+      const SVGA3dCapsRecord *record;
+
+      /* Stop at a record that claims to extend past the caps area. */
+      if (length > SVGA_FIFO_3D_CAPS_SIZE - offset)
+         break;
+
+      record = (const SVGA3dCapsRecord *) (capsBlock + offset);
+      if ((record->header.type >= SVGA3DCAPS_RECORD_DEVCAPS_MIN) &&
+          (record->header.type <= SVGA3DCAPS_RECORD_DEVCAPS_MAX) &&
+          (!capsRecord || (record->header.type > capsRecord->header.type))) {
+         capsRecord = record;
+      }
+
+      offset += length;
+   }
+
+   if(!capsRecord)
+      return FALSE;
+
+   /*
+    * A record shorter than its own header would make the subtraction
+    * below wrap around.
+    */
+   if (capsRecord->header.length * sizeof(uint32) < sizeof capsRecord->header)
+      return FALSE;
+
+   /*
+    * Calculate the number of caps from the size of the record.
+    */
+   capArray = (const SVGA3dCapPair *) capsRecord->data;
+   numCaps = (int) ((capsRecord->header.length * sizeof(uint32) -
+                     sizeof capsRecord->header) / (2 * sizeof(uint32)));
+
+   /*
+    * Binary-search for the cap with the specified index.
+    */
+   for (first = 0, last = numCaps - 1; first <= last; ) {
+      int mid = first + (last - first) / 2;
+
+      if ((SVGA3dDevCapIndex) capArray[mid][0] == index) {
+         /*
+          * Found it.
+          */
+         result->u = capArray[mid][1];
+         return TRUE;
+      }
+
+      /*
+       * Divide and conquer.
+       */
+      if ((SVGA3dDevCapIndex) capArray[mid][0] > index) {
+         last = mid - 1;
+      } else {
+         first = mid + 1;
+      }
+   }
+
+   return FALSE;
+}
+
+
+/**
+ * Install this file's implementations of the SVGA winsys screen hooks
+ * into vws->base.
+ *
+ * @return always TRUE.
+ */
+boolean
+vmw_winsys_screen_init_svga(struct vmw_winsys_screen *vws)
+{
+   /* Screen-level hooks. */
+   vws->base.destroy = vmw_svga_winsys_destroy;
+   vws->base.get_cap = vmw_svga_winsys_get_cap;
+   vws->base.context_create = vmw_svga_winsys_context_create;
+
+   /* Surfaces. */
+   vws->base.surface_create = vmw_svga_winsys_surface_create;
+   vws->base.surface_reference = vmw_svga_winsys_surface_ref;
+   vws->base.surface_is_flushed = vmw_svga_winsys_surface_is_flushed;
+
+   /* Buffers. */
+   vws->base.buffer_create = vmw_svga_winsys_buffer_create;
+   vws->base.buffer_destroy = vmw_svga_winsys_buffer_destroy;
+   vws->base.buffer_map = vmw_svga_winsys_buffer_map;
+   vws->base.buffer_unmap = vmw_svga_winsys_buffer_unmap;
+
+   /* Fences. */
+   vws->base.fence_reference = vmw_svga_winsys_fence_reference;
+   vws->base.fence_signalled = vmw_svga_winsys_fence_signalled;
+   vws->base.fence_finish = vmw_svga_winsys_fence_finish;
+
+   return TRUE;
+}
+
+
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_surface.c b/src/gallium/winsys/drm/vmware/core/vmw_surface.c
new file mode 100644
index 00000000000..64eb32f8b94
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_surface.c
@@ -0,0 +1,61 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+
+#include "svga_cmd.h"
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+
+#include "vmw_surface.h"
+#include "vmw_screen.h"
+
+/**
+ * Make *pdst refer to src, adjusting reference counts.
+ *
+ * Does nothing when pdst is NULL or already points at src.  When
+ * pipe_reference() reports that the previously referenced surface should
+ * be destroyed, that surface is unreferenced in the kernel and freed.
+ *
+ * @param pdst  location of the reference to update.
+ * @param src   surface to reference, or NULL to only drop *pdst.
+ */
+void
+vmw_svga_winsys_surface_reference(struct vmw_svga_winsys_surface **pdst,
+                                  struct vmw_svga_winsys_surface *src)
+{
+   struct pipe_reference *src_ref;
+   struct pipe_reference *dst_ref;
+   struct vmw_svga_winsys_surface *dst;
+
+   if(pdst == NULL || *pdst == src)
+      return;
+
+   /* Keep the old surface at hand; it may have to be destroyed below. */
+   dst = *pdst;
+
+   /* Either side may be NULL, so only take the refcnt address if set. */
+   src_ref = src ? &src->refcnt : NULL;
+   dst_ref = dst ? &dst->refcnt : NULL;
+
+   if (pipe_reference(&dst_ref, src_ref)) {
+      vmw_ioctl_surface_destroy(dst->screen, dst->sid);
+#ifdef DEBUG
+      /* to detect dangling pointers */
+      assert(p_atomic_read(&dst->validated) == 0);
+      dst->sid = SVGA3D_INVALID_ID;
+#endif
+      FREE(dst);
+   }
+
+   *pdst = src;
+}
diff --git a/src/gallium/winsys/drm/vmware/core/vmw_surface.h b/src/gallium/winsys/drm/vmware/core/vmw_surface.h
new file mode 100644
index 00000000000..340cc1532e0
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmw_surface.h
@@ -0,0 +1,79 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * Surfaces for VMware SVGA winsys.
+ * 
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef VMW_SURFACE_H_
+#define VMW_SURFACE_H_
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_atomic.h"
+#include "pipe/p_refcnt.h"
+
+#define VMW_MAX_PRESENTS 3
+
+
+
+/**
+ * Winsys-side surface object.  It is handed to the driver as an opaque
+ * struct svga_winsys_surface pointer through the cast helpers in this
+ * header.
+ */
+struct vmw_svga_winsys_surface
+{
+   /*
+    * Zero is read by the winsys as "surface is flushed".
+    * NOTE(review): presumably counts pending validations in command
+    * buffers -- confirm against the context code.
+    */
+   struct pipe_atomic validated;
+   /* Reference count; starts at 1 when the surface is created. */
+   struct pipe_reference refcnt;
+
+   /* Screen the surface was created on; used for the destroy ioctl. */
+   struct vmw_winsys_screen *screen;
+   /* Surface id returned by the kernel. */
+   uint32_t sid;
+
+   /* FIXME: make this thread safe */
+   /* NOTE(review): looks like a ring of fences of recent presents, with
+    * next_present_no as the next slot -- confirm against the users. */
+   unsigned next_present_no;
+   uint32_t present_fences[VMW_MAX_PRESENTS];
+};
+
+
+/**
+ * Cast a winsys surface to the opaque type seen by the driver.
+ * A non-NULL surface is asserted to carry a valid surface id.
+ */
+static INLINE struct svga_winsys_surface *
+svga_winsys_surface(struct vmw_svga_winsys_surface *surf)
+{
+   struct svga_winsys_surface *opaque;
+
+   assert(!surf || surf->sid != SVGA3D_INVALID_ID);
+
+   opaque = (struct svga_winsys_surface *)surf;
+   return opaque;
+}
+
+
+/**
+ * Cast the opaque driver-side surface pointer back to the winsys type.
+ */
+static INLINE struct vmw_svga_winsys_surface *
+vmw_svga_winsys_surface(struct svga_winsys_surface *surf)
+{
+   struct vmw_svga_winsys_surface *vsurf;
+
+   vsurf = (struct vmw_svga_winsys_surface *)surf;
+   return vsurf;
+}
+
+
+/**
+ * Make *pdst refer to src (which may be NULL), adjusting the reference
+ * counts of both surfaces.
+ */
+void
+vmw_svga_winsys_surface_reference(struct vmw_svga_winsys_surface **pdst,
+                                  struct vmw_svga_winsys_surface *src);
+
+#endif /* VMW_SURFACE_H_ */
diff --git a/src/gallium/winsys/drm/vmware/core/vmwgfx_drm.h b/src/gallium/winsys/drm/vmware/core/vmwgfx_drm.h
new file mode 100644
index 00000000000..56070a1ba10
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/core/vmwgfx_drm.h
@@ -0,0 +1,538 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 VMware, Inc., Palo Alto, CA., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef _VMWGFX_DRM_H_
+#define _VMWGFX_DRM_H_
+
+#define DRM_VMW_MAX_SURFACE_FACES 6
+#define DRM_VMW_MAX_MIP_LEVELS 24
+
+#define DRM_VMW_EXT_NAME_LEN 128
+
+#define DRM_VMW_GET_PARAM            1
+#define DRM_VMW_EXTENSION            2
+#define DRM_VMW_CREATE_CONTEXT       3
+#define DRM_VMW_UNREF_CONTEXT        4
+#define DRM_VMW_CREATE_SURFACE       5
+#define DRM_VMW_UNREF_SURFACE        6
+#define DRM_VMW_REF_SURFACE          7
+#define DRM_VMW_EXECBUF              8
+#define DRM_VMW_ALLOC_DMABUF         9
+#define DRM_VMW_UNREF_DMABUF         10
+#define DRM_VMW_FIFO_DEBUG           11
+#define DRM_VMW_FENCE_WAIT           12
+#define DRM_VMW_OVERLAY              13
+#define DRM_VMW_CURSOR_BYPASS        14
+
+/*************************************************************************/
+/**
+ * DRM_VMW_GET_PARAM - get device information.
+ *
+ * Currently we support only one parameter:
+ *
+ * DRM_VMW_PARAM_FIFO_OFFSET:
+ * Offset to use to map the first page of the FIFO read-only.
+ * The fifo is mapped using the mmap() system call on the drm device.
+ */
+
+#define DRM_VMW_PARAM_FIFO_OFFSET    0
+
+/**
+ * struct drm_vmw_getparam_arg
+ *
+ * @value: Returned value. //Out
+ * @param: Parameter to query. //In.
+ *
+ * Argument to the DRM_VMW_GET_PARAM Ioctl.
+ */
+
+struct drm_vmw_getparam_arg {
+	uint64_t value;
+	uint32_t param;
+	uint32_t pad64;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_EXTENSION - Query device extensions.
+ */
+
+/**
+ * struct drm_vmw_extension_rep
+ *
+ * @exists: The queried extension exists.
+ * @driver_ioctl_offset: Ioctl number of the first ioctl in the extension.
+ * @driver_sarea_offset: Offset to any space in the DRI SAREA
+ * used by the extension.
+ * @major: Major version number of the extension.
+ * @minor: Minor version number of the extension.
+ * @pl: Patch level version number of the extension.
+ *
+ * Output argument to the DRM_VMW_EXTENSION Ioctl.
+ */
+
+struct drm_vmw_extension_rep {
+	int32_t exists;
+	uint32_t driver_ioctl_offset;
+	uint32_t driver_sarea_offset;
+	uint32_t major;
+	uint32_t minor;
+	uint32_t pl;
+	uint32_t pad64;
+};
+
+/**
+ * union drm_vmw_extension_arg
+ *
+ * @extension - Ascii name of the extension to be queried. //In
+ * @rep - Reply as defined above. //Out
+ *
+ * Argument to the DRM_VMW_EXTENSION Ioctl.
+ */
+
+union drm_vmw_extension_arg {
+	char extension[DRM_VMW_EXT_NAME_LEN];
+	struct drm_vmw_extension_rep rep;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_CREATE_CONTEXT - Create a host context.
+ *
+ * Allocates a device unique context id, and queues a create context command
+ * for the host. Does not wait for host completion.
+ */
+
+/**
+ * struct drm_vmw_context_arg
+ *
+ * @cid: Device unique context ID.
+ *
+ * Output argument to the DRM_VMW_CREATE_CONTEXT Ioctl.
+ * Input argument to the DRM_VMW_UNREF_CONTEXT Ioctl.
+ */
+
+struct drm_vmw_context_arg {
+	int32_t cid;
+	uint32_t pad64;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_UNREF_CONTEXT - Free a host context.
+ *
+ * Frees a global context id, and queues a destroy host command for the host.
+ * Does not wait for host completion. The context ID can be used directly
+ * in the command stream and shows up as the same context ID on the host.
+ */
+
+/*************************************************************************/
+/**
+ * DRM_VMW_CREATE_SURFACE - Create a host surface.
+ *
+ * Allocates a device unique surface id, and queues a create surface command
+ * for the host. Does not wait for host completion. The surface ID can be
+ * used directly in the command stream and shows up as the same surface
+ * ID on the host.
+ */
+
+/**
+ * struct drm_vmw_surface_create_req
+ *
+ * @flags: Surface flags as understood by the host.
+ * @format: Surface format as understood by the host.
+ * @mip_levels: Number of mip levels for each face.
+ * An unused face should have 0 encoded.
+ * @size_addr: Address of a user-space array of struct drm_vmw_size
+ * cast to an uint64_t for 32-64 bit compatibility.
+ * The size of the array should equal the total number of mipmap levels.
+ * @shareable: Boolean whether other clients (as identified by file descriptors)
+ * may reference this surface.
+ *
+ * Input data to the DRM_VMW_CREATE_SURFACE Ioctl.
+ * Output data from the DRM_VMW_REF_SURFACE Ioctl.
+ */
+
+struct drm_vmw_surface_create_req {
+	uint32_t flags;
+	uint32_t format;
+	uint32_t mip_levels[DRM_VMW_MAX_SURFACE_FACES];
+	uint64_t size_addr;
+	int32_t shareable;
+	uint32_t pad64;
+};
+
+/**
+ * struct drm_vmw_surface_arg
+ *
+ * @sid: Surface id of created surface or surface to destroy or reference.
+ *
+ * Output data from the DRM_VMW_CREATE_SURFACE Ioctl.
+ * Input argument to the DRM_VMW_UNREF_SURFACE Ioctl.
+ * Input argument to the DRM_VMW_REF_SURFACE Ioctl.
+ */
+
+struct drm_vmw_surface_arg {
+	int32_t sid;
+	uint32_t pad64;
+};
+
+/**
+ * struct drm_vmw_size ioctl.
+ *
+ * @width - mip level width
+ * @height - mip level height
+ * @depth - mip level depth
+ *
+ * Description of a mip level.
+ * Input data to the DRM_VMW_CREATE_SURFACE Ioctl.
+ */
+
+struct drm_vmw_size {
+	uint32_t width;
+	uint32_t height;
+	uint32_t depth;
+	uint32_t pad64;
+};
+
+/**
+ * union drm_vmw_surface_create_arg
+ *
+ * @rep: Output data as described above.
+ * @req: Input data as described above.
+ *
+ * Argument to the DRM_VMW_CREATE_SURFACE Ioctl.
+ */
+
+union drm_vmw_surface_create_arg {
+	struct drm_vmw_surface_arg rep;
+	struct drm_vmw_surface_create_req req;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_REF_SURFACE - Reference a host surface.
+ *
+ * Puts a reference on a host surface with a given sid, as previously
+ * returned by the DRM_VMW_CREATE_SURFACE ioctl.
+ * A reference will make sure the surface isn't destroyed while we hold
+ * it and will allow the calling client to use the surface ID in the command
+ * stream.
+ *
+ * On successful return, the Ioctl returns the surface information given
+ * in the DRM_VMW_CREATE_SURFACE ioctl.
+ */
+
+/**
+ * union drm_vmw_surface_reference_arg
+ *
+ * @rep: Output data as described above.
+ * @req: Input data as described above.
+ *
+ * Argument to the DRM_VMW_REF_SURFACE Ioctl.
+ */
+
+union drm_vmw_surface_reference_arg {
+	struct drm_vmw_surface_create_req rep;
+	struct drm_vmw_surface_arg req;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_UNREF_SURFACE - Unreference a host surface.
+ *
+ * Clear a reference previously put on a host surface.
+ * When all references are gone, including the one implicitly placed
+ * on creation,
+ * a destroy surface command will be queued for the host.
+ * Does not wait for completion.
+ */
+
+/*************************************************************************/
+/**
+ * DRM_VMW_EXECBUF
+ *
+ * Submit a command buffer for execution on the host, and return a
+ * fence sequence that when signaled, indicates that the command buffer has
+ * executed.
+ */
+
+/**
+ * struct drm_vmw_execbuf_arg
+ *
+ * @commands: User-space address of a command buffer cast to an uint64_t.
+ * @command_size: Size in bytes of the command buffer.
+ * @fence_rep: User-space address of a struct drm_vmw_fence_rep cast to an
+ * uint64_t.
+ *
+ * Argument to the DRM_VMW_EXECBUF Ioctl.
+ */
+
+struct drm_vmw_execbuf_arg {
+	uint64_t commands;
+	uint32_t command_size;
+	uint32_t pad64;
+	uint64_t fence_rep;
+};
+
+/**
+ * struct drm_vmw_fence_rep
+ *
+ * @fence_seq: Fence sequence associated with a command submission.
+ * @error: This member should've been set to -EFAULT on submission.
+ * The following actions should be taken on completion:
+ * error == -EFAULT: Fence communication failed. The host is synchronized.
+ * Use the last fence id read from the FIFO fence register.
+ * error != 0 && error != -EFAULT:
+ * Fence submission failed. The host is synchronized. Use the fence_seq member.
+ * error == 0: All is OK. The host may not be synchronized.
+ * Use the fence_seq member.
+ *
+ * Input / Output data to the DRM_VMW_EXECBUF Ioctl.
+ */
+
+struct drm_vmw_fence_rep {
+	uint64_t fence_seq;
+	int32_t error;
+	uint32_t pad64;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_ALLOC_DMABUF
+ *
+ * Allocate a DMA buffer that is visible also to the host.
+ * NOTE: The buffer is
+ * identified by a handle and an offset, which are private to the guest, but
+ * useable in the command stream. The guest kernel may translate these
+ * and patch up the command stream accordingly. In the future, the offset may
+ * be zero at all times, or it may disappear from the interface before it is
+ * fixed.
+ *
+ * The DMA buffer may stay user-space mapped in the guest at all times,
+ * and is thus suitable for sub-allocation.
+ *
+ * DMA buffers are mapped using the mmap() syscall on the drm device.
+ */
+
+/**
+ * struct drm_vmw_alloc_dmabuf_req
+ *
+ * @size: Required minimum size of the buffer.
+ *
+ * Input data to the DRM_VMW_ALLOC_DMABUF Ioctl.
+ */
+
+struct drm_vmw_alloc_dmabuf_req {
+	uint32_t size;
+	uint32_t pad64;
+};
+
+/**
+ * struct drm_vmw_dmabuf_rep
+ *
+ * @map_handle: Offset to use in the mmap() call used to map the buffer.
+ * @handle: Handle unique to this buffer. Used for unreferencing.
+ * @cur_gmr_id: GMR id to use in the command stream when this buffer is
+ * referenced. See note above.
+ * @cur_gmr_offset: Offset to use in the command stream when this buffer is
+ * referenced. See note above.
+ *
+ * Output data from the DRM_VMW_ALLOC_DMABUF Ioctl.
+ */
+
+struct drm_vmw_dmabuf_rep {
+	uint64_t map_handle;
+	uint32_t handle;
+	uint32_t cur_gmr_id;
+	uint32_t cur_gmr_offset;
+	uint32_t pad64;
+};
+
+/**
+ * union drm_vmw_alloc_dmabuf_arg
+ *
+ * @req: Input data as described above.
+ * @rep: Output data as described above.
+ *
+ * Argument to the DRM_VMW_ALLOC_DMABUF Ioctl.
+ */
+
+union drm_vmw_alloc_dmabuf_arg {
+	struct drm_vmw_alloc_dmabuf_req req;
+	struct drm_vmw_dmabuf_rep rep;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_UNREF_DMABUF - Free a DMA buffer.
+ *
+ */
+
+/**
+ * struct drm_vmw_unref_dmabuf_arg
+ *
+ * @handle: Handle indicating what buffer to free. Obtained from the
+ * DRM_VMW_ALLOC_DMABUF Ioctl.
+ *
+ * Argument to the DRM_VMW_UNREF_DMABUF Ioctl.
+ */
+
+struct drm_vmw_unref_dmabuf_arg {
+	uint32_t handle;
+	uint32_t pad64;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_FIFO_DEBUG - Get last FIFO submission.
+ *
+ * This IOCTL copies the last FIFO submission directly out of the FIFO buffer.
+ */
+
+/**
+ * struct drm_vmw_fifo_debug_arg
+ *
+ * @debug_buffer: User space address of a debug_buffer cast to an uint64_t //In
+ * @debug_buffer_size: Size in bytes of debug buffer //In
+ * @used_size: Number of bytes copied to the buffer // Out
+ * @did_not_fit: Boolean indicating that the fifo contents did not fit. //Out
+ *
+ * Argument to the DRM_VMW_FIFO_DEBUG Ioctl.
+ */
+
+struct drm_vmw_fifo_debug_arg {
+	uint64_t debug_buffer;
+	uint32_t debug_buffer_size;
+	uint32_t used_size;
+	int32_t did_not_fit;
+	uint32_t pad64;
+};
+
+struct drm_vmw_fence_wait_arg {
+	uint64_t sequence;
+	uint64_t kernel_cookie;
+	int32_t cookie_valid;
+	int32_t pad64;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_OVERLAY - Control overlays.
+ *
+ * This IOCTL controls the overlay units of the svga device.
+ * The SVGA overlay units do not work like regular hardware units in
+ * that they do not automatically read back the contents of the given dma
+ * buffer. But instead only read back for each call to this ioctl, and
+ * at any point between this call being made and a following call that
+ * either changes the buffer or disables the stream.
+ */
+
+/**
+ * struct drm_vmw_rect
+ *
+ * Defines a rectangle. Used in the overlay ioctl to define
+ * source and destination rectangle.
+ */
+
+struct drm_vmw_rect {
+	int32_t x;
+	int32_t y;
+	uint32_t w;
+	uint32_t h;
+};
+
+/**
+ * struct drm_vmw_overlay_arg
+ *
+ * @stream_id: Stream to control
+ * @enabled: If false all following arguments are ignored.
+ * @handle: Handle to buffer for getting data from.
+ * @format: Format of the overlay as understood by the host.
+ * @width: Width of the overlay.
+ * @height: Height of the overlay.
+ * @size: Size of the overlay in bytes.
+ * @pitch: Array of pitches, the two last are only used for YUV12 formats.
+ * @offset: Offset from start of dma buffer to overlay.
+ * @src: Source rect, must be within the defined area above.
+ * @dst: Destination rect, x and y may be negative.
+ *
+ * Argument to the DRM_VMW_OVERLAY Ioctl.
+ */
+
+struct drm_vmw_overlay_arg {
+	uint32_t stream_id;
+	uint32_t enabled;
+
+	uint32_t flags; /* NOTE(review): not described in the comment above */
+	uint32_t color_key; /* NOTE(review): not described in the comment above */
+
+	uint32_t handle;
+	uint32_t offset;
+	int32_t format;
+	uint32_t size;
+	uint32_t width;
+	uint32_t height;
+	uint32_t pitch[3];
+
+	uint32_t pad64;
+	struct drm_vmw_rect src;
+	struct drm_vmw_rect dst;
+};
+
+/*************************************************************************/
+/**
+ * DRM_VMW_CURSOR_BYPASS - Give extra information about cursor bypass.
+ *
+ */
+
+#define DRM_VMW_CURSOR_BYPASS_ALL    (1 << 0)
+#define DRM_VMW_CURSOR_BYPASS_FLAGS       (1)
+
+/**
+ * struct drm_vmw_cursor_bypass_arg
+ *
+ * @flags: Flags.
+ * @crtc_id: Crtc id, only used if DRM_VMW_CURSOR_BYPASS_ALL isn't passed.
+ * @xpos: X position of cursor.
+ * @ypos: Y position of cursor.
+ * @xhot: X hotspot.
+ * @yhot: Y hotspot.
+ *
+ * Argument to the DRM_VMW_CURSOR_BYPASS Ioctl.
+ */
+
+struct drm_vmw_cursor_bypass_arg {
+	uint32_t flags;
+	uint32_t crtc_id;
+	int32_t xpos;
+	int32_t ypos;
+	int32_t xhot;
+	int32_t yhot;
+};
+
+#endif
diff --git a/src/gallium/winsys/drm/vmware/dri/Makefile b/src/gallium/winsys/drm/vmware/dri/Makefile
new file mode 100644
index 00000000000..8a39e23da6d
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/dri/Makefile
@@ -0,0 +1,18 @@
+
+TOP = ../../../../../..
+include $(TOP)/configs/current
+
+LIBNAME = vmwgfx_dri.so
+
+PIPE_DRIVERS = \
+	$(TOP)/src/gallium/state_trackers/dri/libdridrm.a \
+	$(TOP)/src/gallium/winsys/drm/vmware/core/libsvgadrm.a \
+	$(TOP)/src/gallium/drivers/trace/libtrace.a \
+	$(TOP)/src/gallium/drivers/svga/libsvga.a
+
+C_SOURCES = \
+	$(COMMON_GALLIUM_SOURCES)
+
+include ../../Makefile.template
+
+symlinks:
diff --git a/src/gallium/winsys/drm/vmware/dri/SConscript b/src/gallium/winsys/drm/vmware/dri/SConscript
new file mode 100644
index 00000000000..1019f577a5f
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/dri/SConscript
@@ -0,0 +1,62 @@
+import os
+import os.path
+
+Import('*')
+
+if env['platform'] == 'linux':
+
+   if env['dri']:
+      env = env.Clone()
+
+      sources = [
+        '#/src/mesa/drivers/dri/common/utils.c',
+        '#/src/mesa/drivers/dri/common/vblank.c',
+        '#/src/mesa/drivers/dri/common/dri_util.c',
+        '#/src/mesa/drivers/dri/common/xmlconfig.c',
+         ]
+   
+      
+      env.ParseConfig('pkg-config --cflags --libs libdrm')
+      
+      env.Prepend(CPPPATH = [
+            '#/src/mesa/state_tracker',
+            '#/src/mesa/drivers/dri/common',
+            '#/src/mesa/main',
+            '#/src/mesa/glapi',
+            '#/src/mesa',
+            '#/include',
+            '#/src/gallium/drivers/svga',
+            '#/src/gallium/drivers/svga/include',
+            ])
+      
+      env.Append(CPPDEFINES = [
+            'HAVE_STDINT_H', 
+            'HAVE_SYS_TYPES_H',
+            ])
+
+      env.Append(CFLAGS = [
+            '-std=gnu99',
+            '-D_FILE_OFFSET_BITS=64',
+            ])
+      
+      env.Prepend(LIBPATH = [
+            ])
+      
+      env.Prepend(LIBS = [
+            trace,
+            st_dri,
+            svgadrm,
+            svga,
+            mesa,
+            auxiliaries,
+            ])
+      
+      # TODO: write a wrapper function http://www.scons.org/wiki/WrapperFunctions
+      env.LoadableModule(
+         target ='vmwgfx_dri.so',
+         source = sources,
+         LIBS = env['LIBS'],
+         SHLIBPREFIX = '',
+         )
+      
+
diff --git a/src/gallium/winsys/drm/vmware/egl/Makefile b/src/gallium/winsys/drm/vmware/egl/Makefile
new file mode 100644
index 00000000000..8e2980c318c
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/egl/Makefile
@@ -0,0 +1,18 @@
+
+TOP = ../../../../../..
+include $(TOP)/configs/current
+
+LIBNAME = EGL_svga.so
+
+PIPE_DRIVERS = \
+	$(TOP)/src/gallium/state_trackers/egl/libegldrm.a \
+	$(TOP)/src/gallium/winsys/drm/vmware/core/libsvgadrm.a \
+	$(TOP)/src/gallium/drivers/trace/libtrace.a \
+	$(TOP)/src/gallium/drivers/svga/libsvga.a
+
+C_SOURCES = \
+	$(COMMON_GALLIUM_SOURCES)
+
+include ../../Makefile.template
+
+symlinks:
diff --git a/src/gallium/winsys/drm/vmware/xorg/Makefile b/src/gallium/winsys/drm/vmware/xorg/Makefile
new file mode 100644
index 00000000000..49e28ae17f5
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/xorg/Makefile
@@ -0,0 +1,71 @@
+TOP        = ../../../../../..
+
+include $(TOP)/configs/current
+
+TARGET = vmwgfx_drv.so
+
+CFILES = \
+	vmw_xorg.c \
+	vmw_video.c \
+	vmw_ioctl.c \
+	vmw_screen.c
+
+OBJECTS = $(patsubst %.c,%.o,$(CFILES))
+
+INCLUDES = \
+	$(shell pkg-config --cflags-only-I pixman-1 xorg-server libdrm xproto) \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/drivers \
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/gallium
+
+LIBS = \
+	$(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a \
+	$(TOP)/src/gallium/winsys/drm/vmware/core/libsvgadrm.a \
+	$(TOP)/src/gallium/drivers/trace/libtrace.a \
+	$(TOP)/src/gallium/drivers/svga/libsvga.a \
+	$(GALLIUM_AUXILIARIES)
+
+LINKS = \
+	$(shell pkg-config --libs --silence-errors libkms) \
+	$(shell pkg-config --libs libdrm)
+
+DRIVER_DEFINES = \
+	-std=gnu99 \
+	-DHAVE_CONFIG_H
+
+TARGET_STAGING = $(TOP)/$(LIB_DIR)/gallium/$(TARGET)
+
+#############################################
+
+
+
+all default: $(TARGET) $(TARGET_STAGING)
+
+$(TARGET): $(OBJECTS) Makefile $(LIBS)
+	$(MKLIB) -noprefix -o $@ $(OBJECTS) $(LIBS) $(LINKS)
+
+$(TOP)/$(LIB_DIR)/gallium:
+	mkdir -p $@
+
+$(TARGET_STAGING): $(TARGET) $(TOP)/$(LIB_DIR)/gallium
+	$(INSTALL) $(TARGET) $(TOP)/$(LIB_DIR)/gallium
+
+clean:
+	rm -rf $(OBJECTS) $(TARGET)
+
+install:
+	$(INSTALL) -d $(DESTDIR)/$(XORG_DRIVER_INSTALL_DIR)
+	$(MINSTALL) -m 755 $(TARGET) $(DESTDIR)/$(XORG_DRIVER_INSTALL_DIR)
+
+
+##############################################
+
+
+.c.o:
+	$(CC) -c $(CFLAGS) $(INCLUDES) $(DRIVER_DEFINES) $< -o $@
+
+
+##############################################
+
+.PHONY: all default clean install
diff --git a/src/gallium/winsys/drm/vmware/xorg/SConscript b/src/gallium/winsys/drm/vmware/xorg/SConscript
new file mode 100644
index 00000000000..ff7b2ed34ed
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/xorg/SConscript
@@ -0,0 +1,54 @@
+import os.path
+
+Import('*')
+
+if env['platform'] == 'linux':
+
+	env = env.Clone()
+
+	env.ParseConfig('pkg-config --cflags --libs libdrm xorg-server')
+
+	env.Prepend(CPPPATH = [
+		'#/include',
+		'#/src/gallium',
+		'#/src/mesa',
+		'#/src/gallium/drivers/svga',
+		'#/src/gallium/drivers/svga/include',
+	])
+
+	env.Append(CPPDEFINES = [
+	])
+
+	if env['gcc']:
+		env.Append(CPPDEFINES = [
+			'HAVE_STDINT_H',
+			'HAVE_SYS_TYPES_H',
+		])
+
+	env.Append(CFLAGS = [
+		'-std=gnu99',
+		'-D_FILE_OFFSET_BITS=64',
+	])
+
+	env.Prepend(LIBPATH = [
+	])
+
+	env.Prepend(LIBS = [
+		trace,
+		st_xorg,
+		svgadrm,
+		svga,
+                auxiliaries,
+	])
+
+	sources = [
+		'vmw_xorg.c', 'vmw_video.c', 'vmw_ioctl.c', 'vmw_screen.c',  # same files as CFILES in the Makefile
+	]
+
+	# TODO: write a wrapper function http://www.scons.org/wiki/WrapperFunctions
+	env.LoadableModule(
+		target ='vmwgfx_drv.so',
+		source = sources,
+		LIBS = env['LIBS'],
+		SHLIBPREFIX = '',
+	)
diff --git a/src/gallium/winsys/drm/vmware/xorg/vmw_driver.h b/src/gallium/winsys/drm/vmware/xorg/vmw_driver.h
new file mode 100644
index 00000000000..db6b89b8bcd
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/xorg/vmw_driver.h
@@ -0,0 +1,90 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * Contains the shared resources for the VMware Xorg driver
+ * that sits on top of the Xorg State Tracker.
+ *
+ * It is initialized in vmw_screen.c.
+ *
+ * @author Jakob Bornecrantz <jakob@vmware.com>
+ */
+
+#ifndef VMW_DRIVER_H_
+#define VMW_DRIVER_H_
+
+#include "state_trackers/xorg/xorg_tracker.h"
+
+struct vmw_dma_buffer;
+
+struct vmw_driver
+{
+    int fd; /* drm fd, copied from the modesetting struct by vmw_screen_init() */
+
+    void *cursor_priv; /* crtc funcs table saved by vmw_screen_cursor_init() */
+
+    /* vmw_video.c */
+    void *video_priv;
+};
+
+static INLINE struct vmw_driver *
+vmw_driver(ScrnInfoPtr pScrn)
+{
+    /* winsys_priv is where vmw_screen_init() stores the vmw_driver. */
+    modesettingPtr modeset = modesettingPTR(pScrn);
+    return modeset ? (struct vmw_driver *)modeset->winsys_priv : NULL;
+}
+
+/***********************************************************************
+ * vmw_video.c
+ */
+
+Bool vmw_video_init(ScrnInfoPtr pScrn, struct vmw_driver *vmw);
+
+Bool vmw_video_close(ScrnInfoPtr pScrn, struct vmw_driver *vmw);
+
+
+/***********************************************************************
+ * vmw_ioctl.c
+ */
+
+int vmw_ioctl_cursor_bypass(struct vmw_driver *vmw, int xhot, int yhot);
+
+struct vmw_dma_buffer * vmw_ioctl_buffer_create(struct vmw_driver *vmw,
+						uint32_t size,
+						unsigned *handle);
+
+void * vmw_ioctl_buffer_map(struct vmw_driver *vmw,
+			    struct vmw_dma_buffer *buf);
+
+void vmw_ioctl_buffer_unmap(struct vmw_driver *vmw,
+			    struct vmw_dma_buffer *buf);
+
+void vmw_ioctl_buffer_destroy(struct vmw_driver *vmw,
+			      struct vmw_dma_buffer *buf);
+
+
+#endif
diff --git a/src/gallium/winsys/drm/vmware/xorg/vmw_hook.h b/src/gallium/winsys/drm/vmware/xorg/vmw_hook.h
new file mode 100644
index 00000000000..224a2d92996
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/xorg/vmw_hook.h
@@ -0,0 +1,39 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#ifndef VMW_HOOK_H_
+#define VMW_HOOK_H_
+
+#include "state_trackers/xorg/xorg_winsys.h"
+
+
+/***********************************************************************
+ * vmw_screen.c
+ */
+
+void vmw_screen_set_functions(ScrnInfoPtr pScrn);
+
+
+#endif
diff --git a/src/gallium/winsys/drm/vmware/xorg/vmw_ioctl.c b/src/gallium/winsys/drm/vmware/xorg/vmw_ioctl.c
new file mode 100644
index 00000000000..ad6993840d2
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/xorg/vmw_ioctl.c
@@ -0,0 +1,157 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * Contains the functions for creating dma buffers by calling
+ * the kernel via driver specific ioctls.
+ *
+ * @author Jakob Bornecrantz <jakob@vmware.com>
+ */
+
+#define HAVE_STDINT_H
+#define _FILE_OFFSET_BITS 64
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/mman.h>
+#include "xf86drm.h"
+#include "../core/vmwgfx_drm.h"
+
+#include "vmw_driver.h"
+#include "util/u_debug.h"
+
+struct vmw_dma_buffer
+{
+    void *data;          /* mmap() address; NULL until the first buffer_map call */
+    unsigned handle;     /* kernel handle, passed to DRM_VMW_UNREF_DMABUF */
+    uint64_t map_handle; /* offset handed to mmap() on the drm fd */
+    unsigned map_count;  /* bumped by buffer_map, dropped by buffer_unmap */
+    uint32_t size;       /* size requested at creation; also the mmap length */
+};
+
+/* Send the cursor hotspot to the kernel with the BYPASS_ALL flag set. */
+int
+vmw_ioctl_cursor_bypass(struct vmw_driver *vmw, int xhot, int yhot)
+{
+    struct drm_vmw_cursor_bypass_arg bypass;
+
+    /* Zero everything first so crtc_id, xpos and ypos go out as 0. */
+    memset(&bypass, 0, sizeof(bypass));
+    bypass.flags = DRM_VMW_CURSOR_BYPASS_ALL;
+    bypass.xhot = xhot;
+    bypass.yhot = yhot;
+
+    /* The ioctl result is handed straight back to the caller. */
+    return drmCommandWrite(vmw->fd, DRM_VMW_CURSOR_BYPASS,
+			   &bypass, sizeof(bypass));
+}
+
+/* Allocate a kernel DMA buffer of at least `size` bytes; NULL on failure. */
+struct vmw_dma_buffer *
+vmw_ioctl_buffer_create(struct vmw_driver *vmw, uint32_t size, unsigned *handle)
+{
+    union drm_vmw_alloc_dmabuf_arg arg;
+    struct vmw_dma_buffer *dmabuf;
+    int err;
+
+    dmabuf = xcalloc(1, sizeof(*dmabuf));
+    if (dmabuf == NULL)
+	return NULL;
+
+    /* req and rep share storage: fill in the request, read back the reply. */
+    memset(&arg, 0, sizeof(arg));
+    arg.req.size = size;
+
+    /* Reissue the ioctl for as long as it reports -ERESTART. */
+    for (;;) {
+	err = drmCommandWriteRead(vmw->fd, DRM_VMW_ALLOC_DMABUF, &arg, sizeof(arg));
+	if (err != -ERESTART)
+	    break;
+    }
+
+    if (err != 0) {
+	debug_printf("IOCTL failed %d: %s\n", err, strerror(-err));
+	xfree(dmabuf);
+	return NULL;
+    }
+
+    /* The buffer starts out unmapped; vmw_ioctl_buffer_map() maps it lazily. */
+    dmabuf->data = NULL;
+    dmabuf->map_count = 0;
+    dmabuf->size = size;
+    dmabuf->handle = arg.rep.handle;
+    dmabuf->map_handle = arg.rep.map_handle;
+
+    /* Also hand the kernel handle back through the out-parameter. */
+    *handle = arg.rep.handle;
+    return dmabuf;
+}
+
+/* Unmap (if mapped), drop the kernel reference and free the wrapper. */
+void
+vmw_ioctl_buffer_destroy(struct vmw_driver *vmw, struct vmw_dma_buffer *buf)
+{
+    struct drm_vmw_unref_dmabuf_arg unref;
+
+    if (buf->data != NULL) {
+	munmap(buf->data, buf->size);
+	buf->data = NULL;
+    }
+
+    memset(&unref, 0, sizeof(unref));
+    unref.handle = buf->handle;
+    drmCommandWrite(vmw->fd, DRM_VMW_UNREF_DMABUF, &unref, sizeof(unref));
+    xfree(buf);
+}
+
+/* Map the buffer on first use and return its CPU address, NULL on failure. */
+void *
+vmw_ioctl_buffer_map(struct vmw_driver *vmw, struct vmw_dma_buffer *buf)
+{
+    if (!buf->data) {
+	void *addr = mmap(NULL, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+			  vmw->fd, buf->map_handle);
+	/* A failed mmap leaves data and map_count untouched. */
+	if (addr == MAP_FAILED) {
+	    debug_printf("%s: Map failed.\n", __FUNCTION__);
+	    return NULL;
+	}
+
+	buf->data = addr;
+    }
+
+    /* Each successful call takes one map reference. */
+    buf->map_count += 1;
+    return buf->data;
+}
+
+void
+vmw_ioctl_buffer_unmap(struct vmw_driver *vmw, struct vmw_dma_buffer *buf)
+{
+    buf->map_count -= 1; /* the mmap itself stays until vmw_ioctl_buffer_destroy() */
+}
diff --git a/src/gallium/winsys/drm/vmware/xorg/vmw_screen.c b/src/gallium/winsys/drm/vmware/xorg/vmw_screen.c
new file mode 100644
index 00000000000..421906da996
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/xorg/vmw_screen.c
@@ -0,0 +1,154 @@
+/**********************************************************
+ * Copyright 2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * Contains the init code for the VMware Xorg driver.
+ *
+ * @author Jakob Bornecrantz <jakob@vmware.com>
+ */
+
+#include "vmw_hook.h"
+#include "vmw_driver.h"
+
+/* modified version of crtc functions */
+xf86CrtcFuncsRec vmw_screen_crtc_funcs;
+
+/* load_cursor_argb hook: send the hotspot, then chain to the saved function. */
+static void
+vmw_screen_cursor_load_argb(xf86CrtcPtr crtc, CARD32 *image)
+{
+    struct vmw_driver *vmw = modesettingPTR(crtc->scrn)->winsys_priv;
+    xf86CrtcFuncsPtr saved_funcs = vmw->cursor_priv;
+    CursorPtr cursor = XF86_CRTC_CONFIG_PTR(crtc->scrn)->cursor;
+
+    /* The hotspot ioctl has to be issued before the image is uploaded. */
+    vmw_ioctl_cursor_bypass(vmw, cursor->bits->xhot, cursor->bits->yhot);
+
+    saved_funcs->load_cursor_argb(crtc, image);
+}
+
+/* Install a copy of the crtc function table whose cursor upload is hooked. */
+static void
+vmw_screen_cursor_init(ScrnInfoPtr pScrn, struct vmw_driver *vmw)
+{
+    xf86CrtcConfigPtr crtc_config = XF86_CRTC_CONFIG_PTR(pScrn);
+    int n;
+
+    /* XXX assumes every crtc shares crtc[0]'s function table. */
+    /* Keep the original table: the hook and the close path still need it. */
+    vmw->cursor_priv = (void*)(crtc_config->crtc[0]->funcs);
+    memcpy(&vmw_screen_crtc_funcs, vmw->cursor_priv, sizeof(xf86CrtcFuncsRec));
+    vmw_screen_crtc_funcs.load_cursor_argb = vmw_screen_cursor_load_argb;
+
+    for (n = 0; n < crtc_config->num_crtc; ++n)
+	crtc_config->crtc[n]->funcs = &vmw_screen_crtc_funcs;
+}
+
+/* Reset the hotspot to (0, 0) and put the saved crtc function table back. */
+static void
+vmw_screen_cursor_close(ScrnInfoPtr pScrn, struct vmw_driver *vmw)
+{
+    xf86CrtcConfigPtr crtc_config = XF86_CRTC_CONFIG_PTR(pScrn);
+    int n;
+
+    vmw_ioctl_cursor_bypass(vmw, 0, 0);
+    for (n = 0; n < crtc_config->num_crtc; ++n)
+	crtc_config->crtc[n]->funcs = vmw->cursor_priv;
+}
+
+/* winsys_screen_init hook: allocate the driver state and install our hooks. */
+static Bool
+vmw_screen_init(ScrnInfoPtr pScrn)
+{
+    modesettingPtr ms = modesettingPTR(pScrn);
+    struct vmw_driver *vmw = xnfcalloc(sizeof(*vmw), 1);
+
+    if (vmw == NULL)
+	return FALSE;
+
+    /* Reuse the modesetting struct's fd and publish ourselves through it. */
+    ms->winsys_priv = vmw;
+    vmw->fd = ms->fd;
+
+    vmw_screen_cursor_init(pScrn, vmw);
+
+    /* Video (vmw_video.c) is only set up when gallium is not in use. */
+    if (!ms->screen)
+	vmw_video_init(pScrn, vmw);
+
+    /* NOTE: the result of vmw_video_init() is not checked here. */
+    return TRUE;
+}
+
+/* winsys_screen_close hook: undo vmw_screen_init(); always reports TRUE. */
+static Bool
+vmw_screen_close(ScrnInfoPtr pScrn)
+{
+    struct vmw_driver *vmw = vmw_driver(pScrn);
+    modesettingPtr ms = modesettingPTR(pScrn);
+
+    /* Only tear down when there is driver state to tear down. */
+    if (vmw != NULL) {
+	vmw_screen_cursor_close(pScrn, vmw);
+	vmw_video_close(pScrn, vmw);
+
+	ms->winsys_priv = NULL;
+	xfree(vmw);
+    }
+
+    return TRUE;
+}
+
+/*
+ * Functions for setting up hooks into the xorg state tracker
+ */
+
+static Bool (*vmw_screen_pre_init_saved)(ScrnInfoPtr pScrn, int flags) = NULL;
+
+/* PreInit trampoline: run the saved PreInit, then register our winsys hooks. */
+static Bool
+vmw_screen_pre_init(ScrnInfoPtr pScrn, int flags)
+{
+    modesettingPtr ms;
+
+    pScrn->PreInit = vmw_screen_pre_init_saved;
+    if (pScrn->PreInit(pScrn, flags)) {
+	ms = modesettingPTR(pScrn); /* looked up only after PreInit ran */
+	ms->winsys_screen_init = vmw_screen_init;
+	ms->winsys_screen_close = vmw_screen_close;
+	return TRUE;
+    }
+    return FALSE;
+}
+
+/* Interpose vmw_screen_pre_init() on pScrn->PreInit, saving the original. */
+void
+vmw_screen_set_functions(ScrnInfoPtr pScrn)
+{
+    assert(!vmw_screen_pre_init_saved);
+    vmw_screen_pre_init_saved = pScrn->PreInit;
+    pScrn->PreInit = vmw_screen_pre_init;
+}
diff --git a/src/gallium/winsys/drm/vmware/xorg/vmw_video.c b/src/gallium/winsys/drm/vmware/xorg/vmw_video.c
new file mode 100644
index 00000000000..d62c3b7296f
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/xorg/vmw_video.c
@@ -0,0 +1,1023 @@
+/*
+ * Copyright 2007 by VMware, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name of the copyright holder(s)
+ * and author(s) shall not be used in advertising or otherwise to promote
+ * the sale, use or other dealings in this Software without prior written
+ * authorization from the copyright holder(s) and author(s).
+ */
+
+/*
+ * vmw_video.c --
+ *
+ *      Xv extension support.
+ *      See http://www.xfree86.org/current/DESIGN16.html
+ *
+ */
+
+
+#include "xf86xv.h"
+#include "fourcc.h"
+
+#include "pipe/p_compiler.h"
+/*
+ * We can't include svga_types.h due to conflicting types for Bool.
+ */
+typedef int64_t int64;
+typedef uint64_t uint64;
+
+typedef int32_t int32;
+typedef uint32_t uint32;
+
+typedef int16_t int16;
+typedef uint16_t uint16;
+
+typedef int8_t int8;
+typedef uint8_t uint8;
+
+#include "svga/include/svga_reg.h"
+#include "svga/include/svga_escape.h"
+#include "svga/include/svga_overlay.h"
+
+#include "vmw_driver.h"
+
+#include <X11/extensions/Xv.h>
+
+#include "xf86drm.h"
+#include "../core/vmwgfx_drm.h"
+
+#define MAKE_ATOM(a) MakeAtom(a, sizeof(a) - 1, TRUE)
+
+/*
+ * Number of videos that can be played simultaneously
+ */
+#define VMWARE_VID_NUM_PORTS 1
+
+/*
+ * Using a dark shade as the default colorKey
+ */
+#define VMWARE_VIDEO_COLORKEY 0x100701
+
+/*
+ * Maximum dimensions
+ */
+#define VMWARE_VID_MAX_WIDTH    2048
+#define VMWARE_VID_MAX_HEIGHT   2048
+
+#define VMWARE_VID_NUM_ENCODINGS 1
+static XF86VideoEncodingRec vmwareVideoEncodings[] =
+{
+    {
+       0,
+       "XV_IMAGE",
+       VMWARE_VID_MAX_WIDTH, VMWARE_VID_MAX_HEIGHT,
+       {1, 1}
+    }
+};
+
+#define VMWARE_VID_NUM_FORMATS 2
+static XF86VideoFormatRec vmwareVideoFormats[] =
+{
+    { 16, TrueColor},
+    { 24, TrueColor}
+};
+
+#define VMWARE_VID_NUM_IMAGES 3
+static XF86ImageRec vmwareVideoImages[] =
+{
+    XVIMAGE_YV12,
+    XVIMAGE_YUY2,
+    XVIMAGE_UYVY
+};
+
+#define VMWARE_VID_NUM_ATTRIBUTES 2
+static XF86AttributeRec vmwareVideoAttributes[] =
+{
+    {
+        XvGettable | XvSettable,
+        0x000000,
+        0xffffff,
+        "XV_COLORKEY"
+    },
+    {
+        XvGettable | XvSettable,
+        0,
+        1,
+        "XV_AUTOPAINT_COLORKEY"
+    }
+};
+
+/*
+ * Video frames are stored in a circular list of buffers.
+ * Must be a power of two, see vmw_video_port_play.
+ */
+#define VMWARE_VID_NUM_BUFFERS 1
+
+/*
+ * Defines the structure used to hold and pass video data to the host
+ */
+struct vmw_video_buffer
+{
+    unsigned handle;   /* filled in by vmw_ioctl_buffer_create */
+    int size;          /* allocation size; 0 is treated as "not allocated" */
+    void *data;        /* mapping returned by vmw_ioctl_buffer_map */
+    void *extra_data;  /* xcalloc'ed block of the same size */
+    struct vmw_dma_buffer *buf; /* object from vmw_ioctl_buffer_create */
+};
+
+
+/**
+ * Structure representing a single video stream, aka port.
+ *
+ * Ports maps one to one to a SVGA stream. Port is just
+ * what Xv calls a SVGA stream.
+ */
+struct vmw_video_port
+{
+    /*
+     * Per-frame entry point, called from vmw_xv_put_image.
+     *
+     * This is either set to vmw_video_port_init or vmw_video_port_play.
+     * At init this function is set to port_init. In port_init we set it
+     * to port_play and call it, after initializing the struct.
+     */
+    int (*play)(ScrnInfoPtr, struct vmw_video_port *,
+                short, short, short, short, short,
+                short, short, short, int, unsigned char*,
+                short, short, RegionPtr);
+
+    /* values passed to the kernel in struct drm_vmw_overlay_arg */
+    uint32 streamId;
+    uint32 colorKey;
+    uint32 flags;
+
+    /* round robin of buffers; currBuf indexes the one used next */
+    unsigned currBuf;
+    struct vmw_video_buffer bufs[VMWARE_VID_NUM_BUFFERS];
+
+    /* properties that apply to all buffers */
+    int size;
+    int pitches[3];
+    int offsets[3];
+
+    /* things for X */
+    RegionRec clipBoxes;
+    Bool isAutoPaintColorkey;
+};
+
+
+/**
+ * Structure holding all the information for video.
+ */
+struct vmw_video_private
+{
+    int fd;
+
+    /** ports */
+    struct vmw_video_port port[VMWARE_VID_NUM_PORTS];
+
+    /** Per-port pointers handed to Xv through adaptor->pPortPrivates */
+    DevUnion port_ptr[VMWARE_VID_NUM_PORTS];
+};
+
+
+/*
+ * Callback functions exported to Xv, prefixed with vmw_xv_*.
+ */
+static int vmw_xv_put_image(ScrnInfoPtr pScrn, short src_x, short src_y,
+                            short drw_x, short drw_y, short src_w, short src_h,
+                            short drw_w, short drw_h, int image,
+                            unsigned char *buf, short width, short height,
+                            Bool sync, RegionPtr clipBoxes, pointer data,
+                            DrawablePtr dst);
+static void vmw_xv_stop_video(ScrnInfoPtr pScrn, pointer data, Bool Cleanup);
+static int vmw_xv_query_image_attributes(ScrnInfoPtr pScrn, int format,
+                                         unsigned short *width,
+                                         unsigned short *height, int *pitches,
+                                         int *offsets);
+static int vmw_xv_set_port_attribute(ScrnInfoPtr pScrn, Atom attribute,
+                                     INT32 value, pointer data);
+static int vmw_xv_get_port_attribute(ScrnInfoPtr pScrn, Atom attribute,
+                                     INT32 *value, pointer data);
+static void vmw_xv_query_best_size(ScrnInfoPtr pScrn, Bool motion,
+                                short vid_w, short vid_h, short drw_w,
+                                short drw_h, unsigned int *p_w,
+                                unsigned int *p_h, pointer data);
+
+
+/*
+ * Local functions.
+ */
+static XF86VideoAdaptorPtr vmw_video_init_adaptor(ScrnInfoPtr pScrn, struct vmw_driver *vmw);
+
+static int vmw_video_port_init(ScrnInfoPtr pScrn,
+                               struct vmw_video_port *port,
+                               short src_x, short src_y, short drw_x,
+                               short drw_y, short src_w, short src_h,
+                               short drw_w, short drw_h, int format,
+                               unsigned char *buf, short width,
+                               short height, RegionPtr clipBoxes);
+static int vmw_video_port_play(ScrnInfoPtr pScrn, struct vmw_video_port *port,
+                               short src_x, short src_y, short drw_x,
+                               short drw_y, short src_w, short src_h,
+                               short drw_w, short drw_h, int format,
+                               unsigned char *buf, short width,
+                               short height, RegionPtr clipBoxes);
+static void vmw_video_port_cleanup(ScrnInfoPtr pScrn, struct vmw_video_port *port);
+
+static int vmw_video_buffer_alloc(struct vmw_driver *vmw, int size,
+                                  struct vmw_video_buffer *out);
+static int vmw_video_buffer_free(struct vmw_driver *vmw,
+                                 struct vmw_video_buffer *out);
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_video_init --
+ *
+ *    Initializes Xv support.
+ *
+ * Results:
+ *    TRUE on success, FALSE on error.
+ *
+ * Side effects:
+ *    Xv support is initialized. Memory is allocated for all supported
+ *    video streams.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+Bool
+vmw_video_init(ScrnInfoPtr pScrn, struct vmw_driver *vmw)
+{
+    ScreenPtr pScreen = pScrn->pScreen;
+    XF86VideoAdaptorPtr *overlayAdaptors, *newAdaptors = NULL;
+    XF86VideoAdaptorPtr newAdaptor = NULL;
+    int numAdaptors;
+    Bool ok;
+
+    debug_printf("%s: enter\n", __func__);
+
+    numAdaptors = xf86XVListGenericAdaptors(pScrn, &overlayAdaptors);
+
+    newAdaptor = vmw_video_init_adaptor(pScrn, vmw);
+    if (!newAdaptor) {
+        debug_printf("Failed to initialize Xv extension\n");
+        return FALSE;
+    }
+
+    if (!numAdaptors) {
+        numAdaptors = 1;
+        overlayAdaptors = &newAdaptor;
+    } else {
+        newAdaptors = xalloc((numAdaptors + 1) * sizeof(*newAdaptors));
+        if (!newAdaptors) {
+            xf86XVFreeVideoAdaptorRec(newAdaptor);
+            return FALSE;
+        }
+
+        memcpy(newAdaptors, overlayAdaptors,
+               numAdaptors * sizeof(*newAdaptors));
+        newAdaptors[numAdaptors++] = newAdaptor;
+        overlayAdaptors = newAdaptors;
+    }
+
+    /* The merged list is only needed for this call: free it on both paths. */
+    ok = xf86XVScreenInit(pScreen, overlayAdaptors, numAdaptors);
+    if (newAdaptors)
+        xfree(newAdaptors);
+    if (!ok) {
+        debug_printf("Failed to initialize Xv extension\n");
+        xf86XVFreeVideoAdaptorRec(newAdaptor);
+        return FALSE;
+    }
+
+    debug_printf("Initialized VMware Xv extension successfully\n");
+
+    return TRUE;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_video_close --
+ *
+ *    Uninitializes video: cleans up every port and frees the video private.
+ *
+ * Results:
+ *    TRUE.
+ *
+ * Side effects:
+ *    vmw->video_priv = NULL
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+Bool
+vmw_video_close(ScrnInfoPtr pScrn, struct vmw_driver *vmw)
+{
+    struct vmw_video_private *video;
+    int i;
+
+    debug_printf("%s: enter\n", __func__);
+
+    video = vmw->video_priv;
+    if (!video)
+        return TRUE;
+
+    for (i = 0; i < VMWARE_VID_NUM_PORTS; ++i) {
+        vmw_video_port_cleanup(pScrn, &video->port[i]);
+    }
+
+    /* XXX: I'm sure this function is missing code for turning off Xv */
+    /* Allocated with xcalloc in vmw_video_init_adaptor: release with xfree. */
+    xfree(video);
+    vmw->video_priv = NULL;
+
+    return TRUE;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_video_init_adaptor --
+ *
+ *    Initializes a XF86VideoAdaptor structure with the capabilities and
+ *    functions supported by this video driver.
+ *
+ * Results:
+ *    On success initialized XF86VideoAdaptor struct or NULL on error
+ *
+ * Side effects:
+ *    None.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static XF86VideoAdaptorPtr
+vmw_video_init_adaptor(ScrnInfoPtr pScrn, struct vmw_driver *vmw)
+{
+    XF86VideoAdaptorPtr adapt;
+    struct vmw_video_private *priv;
+    int n;
+
+    debug_printf("%s: enter \n", __func__);
+
+    adapt = xf86XVAllocateVideoAdaptorRec(pScrn);
+    if (adapt == NULL) {
+        debug_printf("Not enough memory\n");
+        return NULL;
+    }
+
+    priv = xcalloc(1, sizeof(*priv));
+    if (priv == NULL) {
+        debug_printf("Not enough memory.\n");
+        xf86XVFreeVideoAdaptorRec(adapt);
+        return NULL;
+    }
+
+    vmw->video_priv = priv;
+    /* Every port starts in the "init" state with the default colorkey. */
+    for (n = 0; n < VMWARE_VID_NUM_PORTS; ++n) {
+        struct vmw_video_port *port = &priv->port[n];
+        port->streamId = n;
+        port->play = vmw_video_port_init;
+        port->flags = SVGA_VIDEO_FLAG_COLORKEY;
+        port->colorKey = VMWARE_VIDEO_COLORKEY;
+        port->isAutoPaintColorkey = TRUE;
+        priv->port_ptr[n].ptr = port;
+    }
+
+    adapt->type = XvInputMask | XvImageMask | XvWindowMask;
+    adapt->flags = VIDEO_OVERLAID_IMAGES | VIDEO_CLIP_TO_VIEWPORT;
+    adapt->name = "VMware Video Engine";
+    adapt->nEncodings = VMWARE_VID_NUM_ENCODINGS;
+    adapt->pEncodings = vmwareVideoEncodings;
+    adapt->nFormats = VMWARE_VID_NUM_FORMATS;
+    adapt->pFormats = vmwareVideoFormats;
+    adapt->nAttributes = VMWARE_VID_NUM_ATTRIBUTES;
+    adapt->pAttributes = vmwareVideoAttributes;
+    adapt->nImages = VMWARE_VID_NUM_IMAGES;
+    adapt->pImages = vmwareVideoImages;
+    adapt->nPorts = VMWARE_VID_NUM_PORTS;
+    adapt->pPortPrivates = priv->port_ptr;
+
+    /* Only the image path is implemented; video/still hooks stay unset. */
+    adapt->PutVideo = NULL;
+    adapt->PutStill = NULL;
+    adapt->GetVideo = NULL;
+    adapt->GetStill = NULL;
+    adapt->StopVideo = vmw_xv_stop_video;
+    adapt->SetPortAttribute = vmw_xv_set_port_attribute;
+    adapt->GetPortAttribute = vmw_xv_get_port_attribute;
+    adapt->QueryBestSize = vmw_xv_query_best_size;
+    adapt->PutImage = vmw_xv_put_image;
+    adapt->QueryImageAttributes = vmw_xv_query_image_attributes;
+
+    debug_printf("%s: done %p\n", __func__, adapt);
+
+    return adapt;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_video_port_init --
+ *
+ *    Initializes a video stream in response to the first PutImage() on a
+ *    video stream. The process goes as follows:
+ *    - Figure out characteristics according to format
+ *    - Allocate offscreen memory
+ *    - Pass on video to Play() functions
+ *
+ * Results:
+ *    Success or XvBadAlloc on failure.
+ *
+ * Side effects:
+ *    Video stream is initialized and its first frame sent to the host
+ *    (done by VideoPlay() function called at the end)
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static int
+vmw_video_port_init(ScrnInfoPtr pScrn, struct vmw_video_port *port,
+                    short src_x, short src_y, short drw_x,
+                    short drw_y, short src_w, short src_h,
+                    short drw_w, short drw_h, int format,
+                    unsigned char *buf, short width,
+                    short height, RegionPtr clipBoxes)
+{
+    struct vmw_driver *vmw = vmw_driver(pScrn);
+    unsigned short w, h;
+    int i, ret = Success;
+
+    debug_printf("\t%s: id %d, format %d\n", __func__, port->streamId, format);
+
+    w = width;
+    h = height;
+    /* init all the format attributes, used for buffers */
+    port->size = vmw_xv_query_image_attributes(pScrn, format, &w, &h,
+                                               port->pitches, port->offsets);
+
+    if (port->size == -1)
+        return XvBadAlloc;
+
+    for (i = 0; i < VMWARE_VID_NUM_BUFFERS; ++i) {
+        ret = vmw_video_buffer_alloc(vmw, port->size, &port->bufs[i]);
+        if (ret != Success)
+            break;
+    }
+
+    /* Free all allocated buffers on failure. port->play is not changed on
+     * this path, so a later PutImage never plays from missing buffers. */
+    if (ret != Success) {
+        while (--i >= 0)
+            vmw_video_buffer_free(vmw, &port->bufs[i]);
+        return ret;
+    }
+
+    /* Only switch to the play path once every buffer exists. */
+    port->play = vmw_video_port_play;
+    port->currBuf = 0;
+
+    REGION_COPY(pScrn->pScreen, &port->clipBoxes, clipBoxes);
+
+    if (port->isAutoPaintColorkey)
+        xf86XVFillKeyHelper(pScrn->pScreen, port->colorKey, clipBoxes);
+
+    return port->play(pScrn, port, src_x, src_y, drw_x, drw_y, src_w, src_h,
+                      drw_w, drw_h, format, buf, width, height, clipBoxes);
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_video_port_play --
+ *
+ *    Copies the frame into the current buffer and sends its attributes to
+ *    the kernel through the DRM_VMW_OVERLAY command.
+ *
+ * Results:
+ *    Success, or XvBadAlloc on an invalid format or a failed overlay command.
+ *
+ * Side effects:
+ *    May restart the stream, repaint the colorkey and advance currBuf.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static int
+vmw_video_port_play(ScrnInfoPtr pScrn, struct vmw_video_port *port,
+                    short src_x, short src_y, short drw_x,
+                    short drw_y, short src_w, short src_h,
+                    short drw_w, short drw_h, int format,
+                    unsigned char *buf, short width,
+                    short height, RegionPtr clipBoxes)
+{
+    struct vmw_driver *vmw = vmw_driver(pScrn);
+    struct drm_vmw_overlay_arg arg;
+    unsigned short w, h;
+    int size, ret;
+
+    debug_printf("\t%s: enter\n", __func__);
+
+    w = width;
+    h = height;
+
+    /* we don't update the ports size */
+    size = vmw_xv_query_image_attributes(pScrn, format, &w, &h,
+                                         port->pitches, port->offsets);
+    if (size < 0)
+        return XvBadAlloc;
+
+    if (size > port->size) {
+        debug_printf("\t%s: Increase in size of Xv video frame streamId:%d.\n",
+                     __func__, port->streamId);
+        vmw_xv_stop_video(pScrn, port, TRUE);
+        return port->play(pScrn, port, src_x, src_y, drw_x, drw_y, src_w,
+                          src_h, drw_w, drw_h, format, buf, width, height,
+                          clipBoxes);
+    }
+
+    /* Copy this frame only: it may be smaller than the port->size buffer. */
+    memcpy(port->bufs[port->currBuf].data, buf, size);
+    memset(&arg, 0, sizeof(arg));
+
+    arg.stream_id = port->streamId;
+    arg.enabled = TRUE;
+    arg.flags = port->flags;
+    arg.color_key = port->colorKey;
+    arg.handle = port->bufs[port->currBuf].handle;
+    arg.format = format;
+    arg.size = port->size;
+    arg.width = w;
+    arg.height = h;
+    arg.src.x = src_x;
+    arg.src.y = src_y;
+    arg.src.w = src_w;
+    arg.src.h = src_h;
+    arg.dst.x = drw_x;
+    arg.dst.y = drw_y;
+    arg.dst.w = drw_w;
+    arg.dst.h = drw_h;
+    arg.pitch[0] = port->pitches[0];
+    arg.pitch[1] = port->pitches[1];
+    arg.pitch[2] = port->pitches[2];
+    arg.offset = 0;
+
+    /*
+     *  Update the clipList and paint the colorkey, if required.
+     */
+    if (!REGION_EQUAL(pScrn->pScreen, &port->clipBoxes, clipBoxes)) {
+        REGION_COPY(pScrn->pScreen, &port->clipBoxes, clipBoxes);
+        if (port->isAutoPaintColorkey)
+            xf86XVFillKeyHelper(pScrn->pScreen, port->colorKey, clipBoxes);
+    }
+
+    ret = drmCommandWrite(vmw->fd, DRM_VMW_OVERLAY, &arg, sizeof(arg));
+    if (ret) {
+        vmw_video_port_cleanup(pScrn, port);
+        return XvBadAlloc;
+    }
+
+    port->currBuf = (port->currBuf + 1) & (VMWARE_VID_NUM_BUFFERS - 1);
+
+    return Success;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_video_port_cleanup --
+ *
+ *    Frees up all resources (if any) taken by a video stream.
+ *
+ * Results:
+ *    None.
+ *
+ * Side effects:
+ *    Same as above.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static void
+vmw_video_port_cleanup(ScrnInfoPtr pScrn, struct vmw_video_port *port)
+{
+    struct vmw_driver *vmw = vmw_driver(pScrn);
+    uint32 saved_id, saved_key, saved_flags;
+    Bool saved_autopaint;
+    int n = 0;
+
+    debug_printf("\t%s: enter\n", __func__);
+
+    /* Release every buffer; unallocated ones are skipped by the callee. */
+    while (n < VMWARE_VID_NUM_BUFFERS)
+        vmw_video_buffer_free(vmw, &port->bufs[n++]);
+
+    /*
+     * Wipe the port but keep the settings that outlive a single stream.
+     */
+    saved_autopaint = port->isAutoPaintColorkey;
+    saved_flags = port->flags;
+    saved_key = port->colorKey;
+    saved_id = port->streamId;
+
+    memset(port, 0, sizeof(*port));
+
+    port->play = vmw_video_port_init;
+    port->streamId = saved_id;
+    port->colorKey = saved_key;
+    port->flags = saved_flags;
+    port->isAutoPaintColorkey = saved_autopaint;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_video_buffer_alloc --
+ *
+ *    Allocates and map a kernel buffer to be used as data storage.
+ *
+ * Results:
+ *    XvBadAlloc on failure, otherwise Success.
+ *
+ * Side effects:
+ *    Calls into the kernel, sets members of out.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static int
+vmw_video_buffer_alloc(struct vmw_driver *vmw, int size,
+                       struct vmw_video_buffer *out)
+{
+    /* Step 1: create the kernel buffer; the handle is filled in for us. */
+    out->buf = vmw_ioctl_buffer_create(vmw, size, &out->handle);
+    if (out->buf == NULL)
+        return XvBadAlloc;
+
+    /* Step 2: map it; on failure undo step 1 and clear the references. */
+    out->data = vmw_ioctl_buffer_map(vmw, out->buf);
+    if (out->data != NULL) {
+        out->size = size;
+        out->extra_data = xcalloc(1, size);
+
+        debug_printf("\t\t%s: allocated buffer %p of size %i\n",
+                     __func__, out, size);
+        return Success;
+    }
+
+    vmw_ioctl_buffer_destroy(vmw, out->buf);
+    out->handle = 0;
+    out->buf = NULL;
+    return XvBadAlloc;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_video_buffer_free --
+ *
+ *    Frees and unmaps an allocated kernel buffer.
+ *
+ * Results:
+ *    Success.
+ *
+ * Side effects:
+ *    Calls into the kernel, sets members of out to 0.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static int
+vmw_video_buffer_free(struct vmw_driver *vmw,
+                      struct vmw_video_buffer *out)
+{
+    if (out->size == 0) /* never allocated, or already freed */
+        return Success;
+
+    xfree(out->extra_data);
+    vmw_ioctl_buffer_unmap(vmw, out->buf);
+    vmw_ioctl_buffer_destroy(vmw, out->buf);
+
+    out->buf = NULL;
+    out->data = NULL;
+    out->extra_data = NULL; /* do not leave a dangling pointer behind */
+    out->handle = 0;
+    out->size = 0;
+
+    debug_printf("\t\t%s: freed buffer %p\n", __func__, out);
+    return Success;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_xv_put_image --
+ *
+ *    Main video playback function. It copies the passed data which is in
+ *    the specified format (e.g. FOURCC_YV12) into the overlay.
+ *
+ *    If sync is TRUE the driver should not return from this
+ *    function until it is through reading the data from buf.
+ *
+ * Results:
+ *    Success or XvBadAlloc on failure
+ *
+ * Side effects:
+ *    Video port will be played(initialized if 1st frame) on success
+ *    or will fail on error.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static int
+vmw_xv_put_image(ScrnInfoPtr pScrn, short src_x, short src_y,
+                 short drw_x, short drw_y, short src_w, short src_h,
+                 short drw_w, short drw_h, int format,
+                 unsigned char *buf, short width, short height,
+                 Bool sync, RegionPtr clipBoxes, pointer data,
+                 DrawablePtr dst)
+{
+    struct vmw_driver *vmw = vmw_driver(pScrn);
+    struct vmw_video_port *port = data;
+    /* The shorts are promoted to int, hence %d; they may be negative. */
+    debug_printf("%s: enter (%d, %d) (%dx%d) (%d, %d) (%dx%d) (%dx%d)\n", __func__,
+                 src_x, src_y, src_w, src_h,
+                 drw_x, drw_y, drw_w, drw_h,
+                 width, height);
+
+    if (!vmw->video_priv)
+        return XvBadAlloc;
+    /* port->play does the work; sync and dst are not used here. */
+    return port->play(pScrn, port, src_x, src_y, drw_x, drw_y, src_w, src_h,
+                      drw_w, drw_h, format, buf, width, height, clipBoxes);
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_xv_stop_video --
+ *
+ *    Called when we should stop playing video for a particular stream. If
+ *    Cleanup is FALSE, the "stop" operation is only temporary, and thus we
+ *    don't do anything. If Cleanup is TRUE we kill the video port by
+ *    sending a message to the host and freeing up the stream.
+ *
+ * Results:
+ *    None.
+ *
+ * Side effects:
+ *    See above.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static void
+vmw_xv_stop_video(ScrnInfoPtr pScrn, pointer data, Bool cleanup)
+{
+    struct vmw_driver *vmw = vmw_driver(pScrn);
+    struct vmw_video_port *port = data;
+    struct drm_vmw_overlay_arg stop;
+    int ret;
+
+    debug_printf("%s: cleanup is %s\n", __func__, cleanup ? "TRUE" : "FALSE");
+
+    /*
+     * Nothing to do without video state or when the stop is temporary.
+     */
+    if (!vmw->video_priv || !cleanup)
+        return;
+
+    /* Ask the kernel to disable the overlay for this stream. */
+    memset(&stop, 0, sizeof(stop));
+    stop.stream_id = port->streamId;
+    stop.enabled = FALSE;
+
+    ret = drmCommandWrite(vmw->fd, DRM_VMW_OVERLAY, &stop, sizeof(stop));
+    assert(ret == 0);
+
+    vmw_video_port_cleanup(pScrn, port);
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_xv_query_image_attributes --
+ *
+ *    From the spec: This function is called to let the driver specify how data
+ *    for a particular image of size width by height should be stored.
+ *    Sometimes only the size and corrected width and height are needed. In
+ *    that case pitches and offsets are NULL.
+ *
+ * Results:
+ *    The size of the memory required for the image, or -1 on error.
+ *
+ * Side effects:
+ *    None.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static int
+vmw_xv_query_image_attributes(ScrnInfoPtr pScrn, int format,
+                              unsigned short *width, unsigned short *height,
+                              int *pitches, int *offsets)
+{
+    INT32 pitch0, pitch12, size0, size12;
+
+    /*
+     * Clamp the request to the adaptor limits and round the width up to
+     * an even value; this happens even for formats rejected below.
+     */
+    if (*width > VMWARE_VID_MAX_WIDTH)
+        *width = VMWARE_VID_MAX_WIDTH;
+    if (*height > VMWARE_VID_MAX_HEIGHT)
+        *height = VMWARE_VID_MAX_HEIGHT;
+    *width = (*width + 1) & ~1;
+
+    if (offsets)
+        offsets[0] = 0;
+
+    if (format == FOURCC_YV12) {
+        /*
+         * Three planes: plane 0 at full size, planes 1 and 2 at half
+         * width and half height. Pitches are rounded up to a multiple of 4.
+         */
+        *height = (*height + 1) & ~1;
+
+        pitch0 = (*width + 3) & ~3;
+        pitch12 = ((*width >> 1) + 3) & ~3;
+        size0 = pitch0 * *height;
+        size12 = pitch12 * (*height >> 1);
+
+        if (pitches) {
+            pitches[0] = pitch0;
+            pitches[1] = pitches[2] = pitch12;
+        }
+        if (offsets) {
+            offsets[1] = size0;
+            offsets[2] = size0 + size12;
+        }
+
+        return size0 + 2 * size12;
+    }
+
+    if (format == FOURCC_UYVY || format == FOURCC_YUY2) {
+        /* Single plane whose pitch is twice the width. */
+        pitch0 = *width * 2;
+        if (pitches)
+            pitches[0] = pitch0;
+        return pitch0 * *height;
+    }
+
+    debug_printf("Query for invalid video format %d\n", format);
+    return -1;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_xv_set_port_attribute --
+ *
+ *    From the spec: A port may have particular attributes such as colorKey, hue,
+ *    saturation, brightness or contrast. Xv clients set these
+ *    attribute values by sending attribute strings (Atoms) to the server.
+ *
+ * Results:
+ *    Success if the attribute exists and XvBadAlloc otherwise.
+ *
+ * Side effects:
+ *    The respective attribute gets the new value.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static int
+vmw_xv_set_port_attribute(ScrnInfoPtr pScrn, Atom attribute,
+                          INT32 value, pointer data)
+{
+    struct vmw_video_port *port = data;
+    const Atom colorkey_atom = MAKE_ATOM("XV_COLORKEY");
+    const Atom autopaint_atom = MAKE_ATOM("XV_AUTOPAINT_COLORKEY");
+
+    if (attribute == colorkey_atom) {
+        debug_printf("%s: Set colorkey:0x%x\n", __func__, (unsigned)value);
+        port->colorKey = value;
+        return Success;
+    }
+    if (attribute == autopaint_atom) {
+        debug_printf("%s: Set autoPaint: %s\n", __func__, value? "TRUE": "FALSE");
+        port->isAutoPaintColorkey = value;
+        return Success;
+    }
+    return XvBadAlloc; /* any other attribute is rejected */
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_xv_get_port_attribute --
+ *
+ *    From the spec: A port may have particular attributes such as hue,
+ *    saturation, brightness or contrast. Xv clients get these
+ *    attribute values by sending attribute strings (Atoms) to the server
+ *
+ * Results:
+ *    Success if the attribute exists and XvBadAlloc otherwise.
+ *
+ * Side effects:
+ *    "value" contains the requested attribute on success.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static int
+vmw_xv_get_port_attribute(ScrnInfoPtr pScrn, Atom attribute,
+                          INT32 *value, pointer data)
+{
+    const struct vmw_video_port *port = data;
+    const Atom colorkey_atom = MAKE_ATOM("XV_COLORKEY");
+    const Atom autopaint_atom = MAKE_ATOM("XV_AUTOPAINT_COLORKEY");
+
+    if (attribute == colorkey_atom) {
+        *value = port->colorKey;
+        return Success;
+    }
+    if (attribute == autopaint_atom) {
+        *value = port->isAutoPaintColorkey;
+        return Success;
+    }
+    return XvBadAlloc; /* any other attribute: *value is left untouched */
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ *
+ * vmw_xv_query_best_size --
+ *
+ *    From the spec: QueryBestSize provides the client with a way to query what
+ *    the destination dimensions would end up being if they were to request
+ *    that an area vid_w by vid_h from the video stream be scaled to rectangle
+ *    of drw_w by drw_h on the screen. Since it is not expected that all
+ *    hardware will be able to get the target dimensions exactly, it is
+ *    important that the driver provide this function.
+ *
+ *    This function seems to never be called, but to be on the safe side
+ *    we apply the same logic that QueryImageAttributes has for width
+ *    and height.
+ *
+ * Results:
+ *    None.
+ *
+ * Side effects:
+ *    None.
+ *
+ *-----------------------------------------------------------------------------
+ */
+
+static void
+vmw_xv_query_best_size(ScrnInfoPtr pScrn, Bool motion,
+                       short vid_w, short vid_h, short drw_w,
+                       short drw_h, unsigned int *p_w,
+                       unsigned int *p_h, pointer data)
+{
+    /* Width is rounded up to an even value; height is passed through. */
+    const int even_w = (drw_w + 1) & ~1;
+    *p_h = drw_h;
+    *p_w = even_w;
+}
diff --git a/src/gallium/winsys/drm/vmware/xorg/vmw_xorg.c b/src/gallium/winsys/drm/vmware/xorg/vmw_xorg.c
new file mode 100644
index 00000000000..4b208719ca3
--- /dev/null
+++ b/src/gallium/winsys/drm/vmware/xorg/vmw_xorg.c
@@ -0,0 +1,152 @@
+/**********************************************************
+ * Copyright 2008-2009 VMware, Inc.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+/**
+ * @file
+ * Glue file for Xorg State Tracker.
+ *
+ * @author Alan Hourihane <alanh@tungstengraphics.com>
+ * @author Jakob Bornecrantz <wallbraker@gmail.com>
+ */
+
+#include "vmw_hook.h"
+
+static void vmw_xorg_identify(int flags);
+static Bool vmw_xorg_pci_probe(DriverPtr driver,
+			       int entity_num,
+			       struct pci_device *device,
+			       intptr_t match_data);
+
+static const struct pci_id_match vmw_xorg_device_match[] = {
+    {0x15ad, PCI_MATCH_ANY, PCI_MATCH_ANY, PCI_MATCH_ANY, 0, 0, 0}, /* NOTE(review): 0x15ad is presumably the PCI vendor ID, rest wildcarded -- confirm field order */
+    {0, 0, 0, 0, 0, 0, 0}, /* all-zero entry, presumably the list terminator -- confirm */
+};
+
+static SymTabRec vmw_xorg_chipsets[] = { /* printed by vmw_xorg_identify() */
+    {PCI_MATCH_ANY, "VMware SVGA Device"},
+    {-1, NULL}
+};
+
+static PciChipsets vmw_xorg_pci_devices[] = { /* passed to xf86ConfigPciEntity() by vmw_xorg_pci_probe() */
+    {PCI_MATCH_ANY, PCI_MATCH_ANY, NULL},
+    {-1, -1, NULL}
+};
+
+static XF86ModuleVersionInfo vmw_xorg_version = {
+    "vmwgfx",
+    MODULEVENDORSTRING,
+    MODINFOSTRING1,
+    MODINFOSTRING2,
+    XORG_VERSION_CURRENT,
+    0, 1, 0, /* major, minor, patch */
+    ABI_CLASS_VIDEODRV,
+    ABI_VIDEODRV_VERSION,
+    MOD_CLASS_VIDEODRV,
+    {0, 0, 0, 0}
+};
+
+/*
+ * Xorg driver exported structures
+ */
+
+_X_EXPORT DriverRec vmwgfx = {
+    1, /* same value vmw_xorg_pci_probe() stores in scrn->driverVersion */
+    "vmwgfx", /* same name the probe assigns to scrn->driverName */
+    vmw_xorg_identify, /* prints the chipset table */
+    NULL,
+    xorg_tracker_available_options,
+    NULL,
+    0,
+    NULL,
+    vmw_xorg_device_match, /* PCI match table declared earlier in this file */
+    vmw_xorg_pci_probe /* PCI probe callback, defined later in this file */
+};
+
+static MODULESETUPPROTO(vmw_xorg_setup);
+
+_X_EXPORT XF86ModuleData vmwgfxModuleData = {
+    &vmw_xorg_version, /* version record defined earlier in this file */
+    vmw_xorg_setup, /* setup hook; refuses a second load */
+    NULL /* no teardown hook (see the TearDownProc remark in vmw_xorg_setup) */
+};
+
+/*
+ * Xorg driver functions
+ */
+
+static pointer
+vmw_xorg_setup(pointer module, pointer opts, int *errmaj, int *errmin)
+{
+    static Bool already_setup = 0;
+
+    /*
+     * A repeated load attempt is refused: LDR_ONCEONLY is reported through
+     * errmaj (when the caller supplied it) and NULL is returned.
+     */
+    if (already_setup) {
+	if (errmaj)
+	    *errmaj = LDR_ONCEONLY;
+	return NULL;
+    }
+
+    /* First load: remember it and hand the DriverRec to xf86AddDriver(). */
+    already_setup = 1;
+    xf86AddDriver(&vmwgfx, module, HaveDriverFuncs);
+
+    /* Success must be a non-NULL value even though there is no TearDownProc. */
+    return (pointer) 1;
+}
+
+static void
+vmw_xorg_identify(int flags)
+{
+    /* flags is not consulted; the chipset table is printed unconditionally. */
+    xf86PrintChipsets("vmwgfx", "Driver for VMware SVGA device", vmw_xorg_chipsets);
+}
+
+static Bool
+vmw_xorg_pci_probe(DriverPtr driver,
+	  int entity_num, struct pci_device *device, intptr_t match_data)
+{
+    ScrnInfoPtr scrn = NULL;
+
+    /* Returns TRUE only when xf86ConfigPciEntity() yields a ScrnInfo. */
+    scrn = xf86ConfigPciEntity(scrn, 0, entity_num, vmw_xorg_pci_devices,
+			       NULL, NULL, NULL, NULL, NULL);
+    if (scrn != NULL) {
+	scrn->driverVersion = 1;
+	scrn->driverName = "vmwgfx";
+	scrn->name = "vmwgfx";
+	scrn->Probe = NULL;
+
+	/* No xf86GetEntityInfo() lookup: nothing here reads the entity
+	 * info, and fetching it only to discard it leaked the result. */
+	/* Use all the functions from the xorg tracker */
+	xorg_tracker_set_functions(scrn);
+
+	vmw_screen_set_functions(scrn);
+    }
+    return scrn != NULL;
+}
diff --git a/src/gallium/winsys/g3dvl/Makefile b/src/gallium/winsys/g3dvl/Makefile
new file mode 100644
index 00000000000..424ddea87ad
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/Makefile
@@ -0,0 +1,11 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+SUBDIRS = $(GALLIUM_WINSYS_DIRS)
+
+default install clean:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE) $@) || exit 1; \
+		fi \
+	done
diff --git a/src/gallium/winsys/g3dvl/vl_winsys.h b/src/gallium/winsys/g3dvl/vl_winsys.h
index c83db28dd98..b4fa0d67a1b 100644
--- a/src/gallium/winsys/g3dvl/vl_winsys.h
+++ b/src/gallium/winsys/g3dvl/vl_winsys.h
@@ -1,14 +1,51 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
 #ifndef vl_winsys_h
 #define vl_winsys_h
 
 #include <X11/Xlib.h>
+#include <pipe/p_defines.h>
+#include <pipe/p_format.h>
 
-struct pipe_context;
+struct pipe_screen;
+struct pipe_video_context;
 
-struct pipe_context* create_pipe_context(Display *display, int screen);
-int destroy_pipe_context(struct pipe_context *pipe);
-int bind_pipe_drawable(struct pipe_context *pipe, Drawable drawable);
-int unbind_pipe_drawable(struct pipe_context *pipe);
+struct pipe_screen*
+vl_screen_create(Display *display, int screen);
 
-#endif
+struct pipe_video_context*
+vl_video_create(Display *display, int screen,
+                struct pipe_screen *p_screen,
+                enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height);
 
+Drawable
+vl_video_bind_drawable(struct pipe_video_context *vpipe, Drawable drawable);
+
+#endif
diff --git a/src/gallium/winsys/g3dvl/xlib/Makefile b/src/gallium/winsys/g3dvl/xlib/Makefile
new file mode 100644
index 00000000000..cf765ef51a5
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/xlib/Makefile
@@ -0,0 +1,74 @@
+# This makefile produces a "stand-alone" libXvMCg3dvl.so which is
+# based on Xlib (no DRI HW acceleration)
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+XVMC_MAJOR = 1
+XVMC_MINOR = 0
+XVMC_LIB = XvMCg3dvl
+XVMC_LIB_NAME = lib$(XVMC_LIB).so
+XVMC_LIB_DEPS = $(EXTRA_LIB_PATH) -lXvMC -lXv -lX11 -lm
+
+INCLUDES = -I$(TOP)/src/gallium/include \
+           -I$(TOP)/src/gallium/auxiliary \
+           -I$(TOP)/src/gallium/drivers \
+           -I$(TOP)/src/gallium/winsys/g3dvl
+
+DEFINES += -DGALLIUM_SOFTPIPE \
+	   -DGALLIUM_TRACE
+
+SOURCES = xsp_winsys.c
+
+# XXX: Hack, if we include libxvmctracker.a in LIBS none of the symbols are
+# pulled in by the linker because xsp_winsys.c doesn't refer to them
+OBJECTS = $(SOURCES:.c=.o) $(TOP)/src/gallium/state_trackers/xorg/xvmc/*.o
+
+LIBS = $(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \
+       $(TOP)/src/gallium/auxiliary/vl/libvl.a \
+       $(TOP)/src/gallium/auxiliary/tgsi/libtgsi.a \
+       $(TOP)/src/gallium/auxiliary/draw/libdraw.a \
+       $(TOP)/src/gallium/auxiliary/translate/libtranslate.a \
+       $(TOP)/src/gallium/auxiliary/cso_cache/libcso_cache.a \
+       $(TOP)/src/gallium/auxiliary/rtasm/librtasm.a \
+       $(TOP)/src/gallium/auxiliary/util/libutil.a
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(DEFINES) $(CFLAGS) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDES) $(DEFINES) $(CFLAGS) $< -o $@
+
+.PHONY: default $(TOP)/$(LIB_DIR)/gallium clean
+
+default: depend $(TOP)/$(LIB_DIR)/gallium $(TOP)/$(LIB_DIR)/gallium/$(XVMC_LIB_NAME)
+
+$(TOP)/$(LIB_DIR)/gallium:
+	@mkdir -p $(TOP)/$(LIB_DIR)/gallium
+
+# Make the libXvMCg3dvl.so library
+$(TOP)/$(LIB_DIR)/gallium/$(XVMC_LIB_NAME): $(OBJECTS) $(LIBS) Makefile
+	$(MKLIB) -o $(XVMC_LIB) -linker '$(CC)' -ldflags '$(LDFLAGS)' \
+		-major $(XVMC_MAJOR) -minor $(XVMC_MINOR) $(MKLIB_OPTIONS) \
+		-install $(TOP)/$(LIB_DIR)/gallium -id $(INSTALL_LIB_DIR)/lib$(XVMC_LIB).1.dylib \
+		$(XVMC_LIB_DEPS) $(OBJECTS) $(LIBS)
+
+depend: $(SOURCES) Makefile
+	$(RM) depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(DEFINES) $(INCLUDES) $(SOURCES)
+
+#install: default
+#	$(INSTALL) -d $(INSTALL_DIR)/include/GL
+#	$(INSTALL) -d $(INSTALL_DIR)/$(LIB_DIR)
+#	$(INSTALL) -m 644 $(TOP)/include/GL/*.h $(INSTALL_DIR)/include/GL
+#	@if [ -e $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME) ]; then \
+#		$(INSTALL) $(TOP)/$(LIB_DIR)/libGL* $(INSTALL_DIR)/$(LIB_DIR); \
+#	fi
+
+clean: Makefile
+	$(RM) $(TOP)/$(LIB_DIR)/gallium/$(XVMC_LIB_NAME)
+	$(RM) *.o *~
+	$(RM) depend depend.bak
+
+-include depend
diff --git a/src/gallium/winsys/g3dvl/xlib/xsp_winsys.c b/src/gallium/winsys/g3dvl/xlib/xsp_winsys.c
new file mode 100644
index 00000000000..08067aad64c
--- /dev/null
+++ b/src/gallium/winsys/g3dvl/xlib/xsp_winsys.c
@@ -0,0 +1,332 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <vl_winsys.h>
+#include <X11/Xutil.h>
+#include <pipe/internal/p_winsys_screen.h>
+#include <pipe/p_state.h>
+#include <pipe/p_inlines.h>
+#include <util/u_memory.h>
+#include <util/u_math.h>
+#include <softpipe/sp_winsys.h>
+#include <softpipe/sp_video_context.h>
+#include <softpipe/sp_texture.h>
+
+/* pipe_winsys implementation */
+
+struct xsp_pipe_winsys
+{
+   struct pipe_winsys base; /* first member: callbacks cast pipe_winsys* to xsp_pipe_winsys* */
+   Display *display; /* X connection used by xsp_flush_frontbuffer() */
+   int screen; /* X screen whose default GC is used when flushing */
+   XImage *fbimage; /* created in vl_screen_create(), re-pointed at each flushed surface */
+};
+
+struct xsp_context
+{
+   Drawable drawable; /* XPutImage target; replaced by vl_video_bind_drawable() */
+
+   void (*pipe_destroy)(struct pipe_video_context *vpipe); /* original destroy hook saved by vl_video_create() */
+};
+
+struct xsp_buffer
+{
+   struct pipe_buffer base; /* first member: callbacks cast pipe_buffer* to xsp_buffer* */
+   boolean is_user_buffer; /* TRUE when data was supplied by the caller and must not be freed */
+   void *data; /* backing storage (align_malloc'ed unless is_user_buffer) */
+   void *mapped_data; /* equals data while mapped, NULL otherwise */
+};
+
+static struct pipe_buffer* xsp_buffer_create(struct pipe_winsys *pws, unsigned alignment, unsigned usage, unsigned size)
+{
+   struct xsp_buffer *buffer = calloc(1, sizeof(struct xsp_buffer));
+
+   assert(pws);
+   /* calloc() can fail; report that to the caller instead of dereferencing NULL. */
+   if (!buffer)
+      return NULL;
+   pipe_reference_init(&buffer->base.reference, 1);
+   buffer->base.alignment = alignment;
+   buffer->base.usage = usage;
+   buffer->base.size = size;
+   buffer->data = align_malloc(size, alignment); /* NOTE(review): result is still not checked here */
+   return (struct pipe_buffer*)buffer;
+}
+
+static struct pipe_buffer* xsp_user_buffer_create(struct pipe_winsys *pws, void *data, unsigned size)
+{
+   struct xsp_buffer *buffer = calloc(1, sizeof(struct xsp_buffer));
+
+   assert(pws);
+   if (!buffer)
+      return NULL; /* wrapper allocation failed: report it instead of dereferencing NULL */
+   pipe_reference_init(&buffer->base.reference, 1);
+   buffer->base.size = size;
+   buffer->is_user_buffer = TRUE; /* makes xsp_buffer_destroy() leave data alone */
+   buffer->data = data; /* the caller's memory is wrapped, not copied */
+
+   return (struct pipe_buffer*)buffer;
+}
+
+static void* xsp_buffer_map(struct pipe_winsys *pws, struct pipe_buffer *buffer, unsigned flags)
+{
+   struct xsp_buffer *buf;
+
+   assert(pws);
+   assert(buffer);
+
+   /* Mapping simply exposes the buffer's own storage; flags is not consulted. */
+   buf = (struct xsp_buffer*)buffer;
+   return buf->mapped_data = buf->data;
+}
+
+static void xsp_buffer_unmap(struct pipe_winsys *pws, struct pipe_buffer *buffer)
+{
+   assert(pws);
+   assert(buffer);
+
+   /* Only the mapping bookkeeping is reset here; the storage itself is
+    * left untouched. */
+   ((struct xsp_buffer*)buffer)->mapped_data = NULL;
+}
+
+static void xsp_buffer_destroy(struct pipe_buffer *buffer)
+{
+   struct xsp_buffer *victim;
+
+   assert(buffer);
+   victim = (struct xsp_buffer*)buffer;
+   /* User buffers wrap caller-supplied memory, so only owned storage is freed. */
+   if (!victim->is_user_buffer)
+      align_free(victim->data);
+   free(victim);
+}
+
+/*
+ * Create the buffer backing a width x height surface of the given format
+ * and report the row pitch through *stride. tex_usage is not consulted.
+ */
+static struct pipe_buffer*
+xsp_surface_buffer_create(struct pipe_winsys *pws,
+                          unsigned width, unsigned height,
+                          enum pipe_format format,
+                          unsigned usage, unsigned tex_usage,
+                          unsigned *stride)
+{
+   const unsigned int ALIGNMENT = 1;
+   struct pipe_format_block block;
+   unsigned blocks_wide, blocks_high, pitch;
+
+   pf_get_block(format, &block);
+   blocks_wide = pf_get_nblocksx(&block, width);
+   blocks_high = pf_get_nblocksy(&block, height);
+   /* One row of blocks, passed through align() with ALIGNMENT. */
+   pitch = align(blocks_wide * block.size, ALIGNMENT);
+   *stride = pitch;
+   return pws->buffer_create(pws, ALIGNMENT, usage, pitch * blocks_high);
+}
+
+static void xsp_fence_reference(struct pipe_winsys *pws, struct pipe_fence_handle **ptr, struct pipe_fence_handle *fence)
+{
+   assert(pws); /* stub: the arguments are only sanity-checked, nothing is stored */
+   assert(ptr);
+   assert(fence); /* NOTE(review): rejects a NULL fence in debug builds -- confirm callers never pass NULL */
+}
+
+static int xsp_fence_signalled(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag)
+{
+   assert(pws);
+   assert(fence);
+   /* Stub: always returns 0 without inspecting the fence or flag. */
+   return 0;
+}
+
+static int xsp_fence_finish(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag)
+{
+   assert(pws);
+   assert(fence);
+   /* Stub: returns 0 immediately; no waiting is performed here. */
+   return 0;
+}
+
+static void xsp_flush_frontbuffer(struct pipe_winsys *pws, struct pipe_surface *surface, void *context_private)
+{
+   struct xsp_pipe_winsys *winsys = (struct xsp_pipe_winsys*)pws;
+   struct xsp_context *ctx = (struct xsp_context*)context_private;
+   struct xsp_buffer *backing;
+   XImage *image;
+
+   assert(pws);
+   assert(surface);
+   assert(context_private);
+
+   /* Re-point the shared XImage at this surface's pixels and dimensions. */
+   image = winsys->fbimage;
+   backing = (struct xsp_buffer *)softpipe_texture(surface->texture)->buffer;
+   image->width = surface->width;
+   image->height = surface->height;
+   image->bytes_per_line = surface->width * (image->bits_per_pixel >> 3);
+   image->data = (char*)backing->data + surface->offset;
+
+   /* Copy the whole surface to the bound drawable, then flush the request. */
+   XPutImage(winsys->display, ctx->drawable,
+             XDefaultGC(winsys->display, winsys->screen),
+             image, 0, 0, 0, 0, surface->width, surface->height);
+   XFlush(winsys->display);
+}
+
+static const char* xsp_get_name(struct pipe_winsys *pws)
+{
+   assert(pws); /* pws is only sanity-checked */
+   return "X11 SoftPipe"; /* fixed string literal; nothing for the caller to free */
+}
+
+static void xsp_destroy(struct pipe_winsys *pws)
+{
+   struct xsp_pipe_winsys *winsys;
+
+   assert(pws);
+   winsys = (struct xsp_pipe_winsys*)pws;
+
+   /* Detach the pixel pointer first: XDestroyImage() would free it as well. */
+   winsys->fbimage->data = NULL;
+   XDestroyImage(winsys->fbimage);
+   FREE(winsys);
+}
+
+/* Installed as pipe_video_context::destroy by vl_video_create() */
+static void xsp_pipe_destroy(struct pipe_video_context *vpipe)
+{
+   struct xsp_context *ctx;
+
+   assert(vpipe);
+
+   /* Fetch the private data before the context itself is torn down. */
+   ctx = vpipe->priv;
+
+   /* Chain to the destroy hook that was saved when this one was installed. */
+   ctx->pipe_destroy(vpipe);
+   FREE(ctx);
+}
+
+/* Show starts here */
+
+Drawable
+vl_video_bind_drawable(struct pipe_video_context *vpipe, Drawable drawable)
+{
+   struct xsp_context *ctx;
+   Drawable previous;
+
+   assert(vpipe);
+
+   /* Swap in the new drawable and hand the old one back to the caller. */
+   ctx = vpipe->priv;
+   previous = ctx->drawable;
+   ctx->drawable = drawable;
+   return previous;
+}
+
+/*
+ * Build the Xlib winsys and wrap it in a softpipe screen. Returns NULL if the
+ * winsys, its XImage or the screen cannot be created; the winsys is released.
+ */
+struct pipe_screen*
+vl_screen_create(Display *display, int screen)
+{
+   struct xsp_pipe_winsys *xsp_winsys;
+   struct pipe_screen *pscreen;
+
+   assert(display);
+
+   xsp_winsys = CALLOC_STRUCT(xsp_pipe_winsys);
+   if (!xsp_winsys)
+      return NULL;
+
+   xsp_winsys->base.buffer_create = xsp_buffer_create;
+   xsp_winsys->base.user_buffer_create = xsp_user_buffer_create;
+   xsp_winsys->base.buffer_map = xsp_buffer_map;
+   xsp_winsys->base.buffer_unmap = xsp_buffer_unmap;
+   xsp_winsys->base.buffer_destroy = xsp_buffer_destroy;
+   xsp_winsys->base.surface_buffer_create = xsp_surface_buffer_create;
+   xsp_winsys->base.fence_reference = xsp_fence_reference;
+   xsp_winsys->base.fence_signalled = xsp_fence_signalled;
+   xsp_winsys->base.fence_finish = xsp_fence_finish;
+   xsp_winsys->base.flush_frontbuffer = xsp_flush_frontbuffer;
+   xsp_winsys->base.get_name = xsp_get_name;
+   xsp_winsys->base.destroy = xsp_destroy;
+   xsp_winsys->display = display;
+   xsp_winsys->screen = screen;
+   /* Width and height stay 0 here; they are filled in by flush_frontbuffer. */
+   xsp_winsys->fbimage = XCreateImage(display, XDefaultVisual(display, screen),
+                                      XDefaultDepth(display, screen), ZPixmap,
+                                      0, NULL, 0, 0, 32, 0);
+   if (!xsp_winsys->fbimage) {
+      FREE(xsp_winsys);
+      return NULL;
+   }
+
+   XInitImage(xsp_winsys->fbimage);
+
+   /* Without a screen nothing would ever call the winsys destroy hook. */
+   pscreen = softpipe_create_screen(&xsp_winsys->base);
+   if (!pscreen)
+      xsp_destroy(&xsp_winsys->base);
+
+   return pscreen;
+}
+
+struct pipe_video_context*
+vl_video_create(Display *display, int screen,
+                struct pipe_screen *p_screen,
+                enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height)
+{
+   struct pipe_video_context *video;
+   struct xsp_context *xsp;
+
+   assert(p_screen);
+   assert(width && height);
+
+   video = sp_video_create(p_screen, profile, chroma_format, width, height);
+   if (video == NULL)
+      return NULL;
+
+   xsp = CALLOC_STRUCT(xsp_context);
+   if (xsp == NULL) {
+      video->destroy(video);
+      return NULL;
+   }
+
+   /* Hook destroy so the private data is freed together with the context;
+    * the original hook is saved so xsp_pipe_destroy() can still call it. */
+   xsp->pipe_destroy = video->destroy;
+   video->destroy = xsp_pipe_destroy;
+   video->priv = xsp;
+
+   return video;
+}
diff --git a/src/gallium/winsys/g3dvl/xsp_winsys.c b/src/gallium/winsys/g3dvl/xsp_winsys.c
deleted file mode 100644
index 698c2856a4f..00000000000
--- a/src/gallium/winsys/g3dvl/xsp_winsys.c
+++ /dev/null
@@ -1,290 +0,0 @@
-#include "vl_winsys.h"
-#include <X11/Xutil.h>
-#include <pipe/internal/p_winsys_screen.h>
-#include <pipe/p_state.h>
-#include <pipe/p_inlines.h>
-#include <util/u_memory.h>
-#include <util/u_math.h>
-#include <softpipe/sp_winsys.h>
-#include <softpipe/sp_texture.h>
-
-/* pipe_winsys implementation */
-
-struct xsp_pipe_winsys
-{
-	struct pipe_winsys	base;
-	XImage			fbimage;
-};
-
-struct xsp_context
-{
-	Display			*display;
-	int			screen;
-	Drawable		drawable;
-	int			drawable_bound;
-};
-
-struct xsp_buffer
-{
-	struct pipe_buffer	base;
-	boolean			is_user_buffer;
-	void			*data;
-	void			*mapped_data;
-};
-
-static struct pipe_buffer* xsp_buffer_create(struct pipe_winsys *pws, unsigned alignment, unsigned usage, unsigned size)
-{
-	struct xsp_buffer *buffer;
-
-	assert(pws);
-
-	buffer = calloc(1, sizeof(struct xsp_buffer));
-	pipe_reference_init(&buffer->base.reference, 1);
-	buffer->base.alignment = alignment;
-	buffer->base.usage = usage;
-	buffer->base.size = size;
-	buffer->data = align_malloc(size, alignment);
-
-	return (struct pipe_buffer*)buffer;
-}
-
-static struct pipe_buffer* xsp_user_buffer_create(struct pipe_winsys *pws, void *data, unsigned size)
-{
-	struct xsp_buffer *buffer;
-
-	assert(pws);
-
-	buffer = calloc(1, sizeof(struct xsp_buffer));
-	pipe_reference_init(&buffer->base.reference, 1);
-	buffer->base.size = size;
-	buffer->is_user_buffer = TRUE;
-	buffer->data = data;
-
-	return (struct pipe_buffer*)buffer;
-}
-
-static void* xsp_buffer_map(struct pipe_winsys *pws, struct pipe_buffer *buffer, unsigned flags)
-{
-	struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer;
-
-	assert(pws);
-	assert(buffer);
-
-	xsp_buf->mapped_data = xsp_buf->data;
-
-	return xsp_buf->mapped_data;
-}
-
-static void xsp_buffer_unmap(struct pipe_winsys *pws, struct pipe_buffer *buffer)
-{
-	struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer;
-
-	assert(pws);
-	assert(buffer);
-
-	xsp_buf->mapped_data = NULL;
-}
-
-static void xsp_buffer_destroy(struct pipe_winsys *pws, struct pipe_buffer *buffer)
-{
-	struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer;
-
-	assert(pws);
-	assert(buffer);
-
-	if (!xsp_buf->is_user_buffer)
-		align_free(xsp_buf->data);
-
-	free(xsp_buf);
-}
-
-static struct pipe_buffer* xsp_surface_buffer_create
-(
-	struct pipe_winsys *pws,
-	unsigned width,
-	unsigned height,
-	enum pipe_format format,
-	unsigned usage,
-	unsigned *stride
-)
-{
-	const unsigned int ALIGNMENT = 1;
-	struct pipe_format_block block;
-	unsigned nblocksx, nblocksy;
-
-	pf_get_block(format, &block);
-	nblocksx = pf_get_nblocksx(&block, width);
-	nblocksy = pf_get_nblocksy(&block, height);
-	*stride = align(nblocksx * block.size, ALIGNMENT);
-
-	return pws->buffer_create(pws, ALIGNMENT,
-				  usage,
-				  *stride * nblocksy);
-}
-
-static void xsp_fence_reference(struct pipe_winsys *pws, struct pipe_fence_handle **ptr, struct pipe_fence_handle *fence)
-{
-	assert(pws);
-	assert(ptr);
-	assert(fence);
-}
-
-static int xsp_fence_signalled(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag)
-{
-	assert(pws);
-	assert(fence);
-
-	return 0;
-}
-
-static int xsp_fence_finish(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag)
-{
-	assert(pws);
-	assert(fence);
-
-	return 0;
-}
-
-static void xsp_flush_frontbuffer(struct pipe_winsys *pws, struct pipe_surface *surface, void *context_private)
-{
-	struct xsp_pipe_winsys	*xsp_winsys;
-	struct xsp_context	*xsp_context;
-
-	assert(pws);
-	assert(surface);
-	assert(context_private);
-
-	xsp_winsys = (struct xsp_pipe_winsys*)pws;
-	xsp_context = (struct xsp_context*)context_private;
-
-	if (!xsp_context->drawable_bound)
-		return;
-
-	xsp_winsys->fbimage.width = surface->width;
-	xsp_winsys->fbimage.height = surface->height;
-	xsp_winsys->fbimage.bytes_per_line = surface->width * (xsp_winsys->fbimage.bits_per_pixel >> 3);
-	xsp_winsys->fbimage.data = ((struct xsp_buffer *)softpipe_texture(surface->texture)->buffer)->data + surface->offset;
-
-	XPutImage
-	(
-		xsp_context->display,
-		xsp_context->drawable,
-		XDefaultGC(xsp_context->display, xsp_context->screen),
-		&xsp_winsys->fbimage,
-		0,
-		0,
-		0,
-		0,
-		surface->width,
-		surface->height
-	);
-	XFlush(xsp_context->display);
-}
-
-static const char* xsp_get_name(struct pipe_winsys *pws)
-{
-	assert(pws);
-	return "X11 SoftPipe";
-}
-
-/* Show starts here */
-
-int bind_pipe_drawable(struct pipe_context *pipe, Drawable drawable)
-{
-	struct xsp_context *xsp_context;
-
-	assert(pipe);
-
-	xsp_context = pipe->priv;
-	xsp_context->drawable = drawable;
-	xsp_context->drawable_bound = 1;
-
-	return 0;
-}
-
-int unbind_pipe_drawable(struct pipe_context *pipe)
-{
-	struct xsp_context *xsp_context;
-
-	assert(pipe);
-
-	xsp_context = pipe->priv;
-	xsp_context->drawable_bound = 0;
-
-	return 0;
-}
-
-struct pipe_context* create_pipe_context(Display *display, int screen)
-{
-	struct xsp_pipe_winsys	*xsp_winsys;
-	struct xsp_context	*xsp_context;
-	struct pipe_screen	*sp_screen;
-	struct pipe_context	*sp_pipe;
-
-	assert(display);
-
-	xsp_winsys = calloc(1, sizeof(struct xsp_pipe_winsys));
-	xsp_winsys->base.buffer_create = xsp_buffer_create;
-	xsp_winsys->base.user_buffer_create = xsp_user_buffer_create;
-	xsp_winsys->base.buffer_map = xsp_buffer_map;
-	xsp_winsys->base.buffer_unmap = xsp_buffer_unmap;
-	xsp_winsys->base.buffer_destroy = xsp_buffer_destroy;
-	xsp_winsys->base.surface_buffer_create = xsp_surface_buffer_create;
-	xsp_winsys->base.fence_reference = xsp_fence_reference;
-	xsp_winsys->base.fence_signalled = xsp_fence_signalled;
-	xsp_winsys->base.fence_finish = xsp_fence_finish;
-	xsp_winsys->base.flush_frontbuffer = xsp_flush_frontbuffer;
-	xsp_winsys->base.get_name = xsp_get_name;
-
-	{
-		/* XXX: Can't use the returned XImage* directly,
-		since we don't have control over winsys destruction
-		and we wouldn't be able to free it */
-		XImage *template = XCreateImage
-		(
-			display,
-			XDefaultVisual(display, XDefaultScreen(display)),
-			XDefaultDepth(display, XDefaultScreen(display)),
-			ZPixmap,
-			0,
-			NULL,
-			0,	/* Don't know the width and height until flush_frontbuffer */
-			0,
-			32,
-			0
-		);
-
-		memcpy(&xsp_winsys->fbimage, template, sizeof(XImage));
-		XInitImage(&xsp_winsys->fbimage);
-
-		XDestroyImage(template);
-	}
-
-	sp_screen = softpipe_create_screen((struct pipe_winsys*)xsp_winsys);
-	sp_pipe = softpipe_create(sp_screen);
-
-	xsp_context = calloc(1, sizeof(struct xsp_context));
-	xsp_context->display = display;
-	xsp_context->screen = screen;
-
-	sp_pipe->priv = xsp_context;
-
-	return sp_pipe;
-}
-
-int destroy_pipe_context(struct pipe_context *pipe)
-{
-	struct pipe_screen *screen;
-	struct pipe_winsys *winsys;
-
-	assert(pipe);
-
-	screen = pipe->screen;
-	winsys = pipe->winsys;
-	free(pipe->priv);
-	pipe->destroy(pipe);
-	screen->destroy(screen);
-	free(winsys);
-
-	return 0;
-}
diff --git a/src/gallium/winsys/gdi/SConscript b/src/gallium/winsys/gdi/SConscript
index 86eb9ef55ed..8f556daf04a 100644
--- a/src/gallium/winsys/gdi/SConscript
+++ b/src/gallium/winsys/gdi/SConscript
@@ -5,35 +5,45 @@ Import('*')
 
 if env['platform'] == 'windows':
 
-	env = env.Clone()
-
-	env.Append(CPPPATH = [
-		'#src/gallium/state_trackers/wgl',
-	])
-
-	env.Append(LIBS = [
-		'gdi32',
-		'user32',
-		'kernel32',
-		'ws2_32',
-	])
-
-	sources = [
-		'gdi_softpipe_winsys.c',
-	]
-	
-	if env['gcc']:
-		sources += ['#src/gallium/state_trackers/wgl/opengl32.mingw.def']
-	else:
-		sources += ['#src/gallium/state_trackers/wgl/opengl32.def']
-		
-	drivers = [
-		trace,
-		softpipe,
-	]
-
-	env.SharedLibrary(
-		target ='opengl32',
-		source = sources,
-		LIBS = wgl + glapi + mesa + drivers + auxiliaries + env['LIBS'],
-	)
+    env = env.Clone()
+
+    env.Append(CPPPATH = [
+        '#src/gallium/state_trackers/wgl',
+    ])
+
+    env.Append(LIBS = [
+        'gdi32',
+        'user32',
+        'kernel32',
+        'ws2_32',
+    ])
+
+    sources = []
+    drivers = []
+
+    if 'softpipe' in env['drivers']:
+        sources = ['gdi_softpipe_winsys.c']
+        drivers = [softpipe]
+
+    if 'llvmpipe' in env['drivers']:
+        env.Tool('llvm')
+        if 'LLVM_VERSION' in env:
+            sources = ['gdi_llvmpipe_winsys.c']
+            drivers = [llvmpipe]
+
+    if not sources or not drivers:
+        print 'warning: softpipe or llvmpipe not selected, gdi winsys disabled'
+        Return()
+    
+    if env['gcc']:
+        sources += ['#src/gallium/state_trackers/wgl/opengl32.mingw.def']
+    else:
+        sources += ['#src/gallium/state_trackers/wgl/opengl32.def']
+        
+    drivers += [trace]
+
+    env.SharedLibrary(
+        target ='opengl32',
+        source = sources,
+        LIBS = wgl + glapi + mesa + drivers + auxiliaries + env['LIBS'],
+    )
diff --git a/src/gallium/winsys/gdi/gdi_llvmpipe_winsys.c b/src/gallium/winsys/gdi/gdi_llvmpipe_winsys.c
new file mode 100644
index 00000000000..e8bc0f55ac4
--- /dev/null
+++ b/src/gallium/winsys/gdi/gdi_llvmpipe_winsys.c
@@ -0,0 +1,288 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * GDI winsys backend for the llvmpipe driver.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#include <windows.h>
+
+#include "pipe/p_format.h"
+#include "pipe/p_context.h"
+#include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "llvmpipe/lp_winsys.h"
+#include "llvmpipe/lp_texture.h"
+#include "stw_winsys.h"
+
+
+struct gdi_llvmpipe_displaytarget
+{
+   enum pipe_format format;
+   struct pipe_format_block block;
+   unsigned width;
+   unsigned height;
+   unsigned stride;
+
+   unsigned size;
+
+   void *data;
+
+   BITMAPINFO bmi;
+};
+
+
+/** Cast a generic llvmpipe_displaytarget pointer to the GDI-specific struct. */
+static INLINE struct gdi_llvmpipe_displaytarget *
+gdi_llvmpipe_displaytarget( struct llvmpipe_displaytarget *buf )
+{
+   return (struct gdi_llvmpipe_displaytarget *)buf;
+}
+
+
+static boolean
+gdi_llvmpipe_is_displaytarget_format_supported( struct llvmpipe_winsys *ws,
+                                                enum pipe_format format )
+{
+   switch(format) {
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return TRUE;
+
+   /* TODO: Support other formats possible with BMPs, as described in 
+    * http://msdn.microsoft.com/en-us/library/dd183376(VS.85).aspx */
+      
+   default:
+      return FALSE;
+   }
+}
+
+
+static void *
+gdi_llvmpipe_displaytarget_map(struct llvmpipe_winsys *ws,
+                               struct llvmpipe_displaytarget *dt,
+                               unsigned flags )
+{
+   struct gdi_llvmpipe_displaytarget *gdt = gdi_llvmpipe_displaytarget(dt);
+
+   return gdt->data;
+}
+
+
+static void
+gdi_llvmpipe_displaytarget_unmap(struct llvmpipe_winsys *ws,
+                                 struct llvmpipe_displaytarget *dt )
+{
+
+}
+
+
+static void
+gdi_llvmpipe_displaytarget_destroy(struct llvmpipe_winsys *winsys,
+                                   struct llvmpipe_displaytarget *dt)
+{
+   struct gdi_llvmpipe_displaytarget *gdt = gdi_llvmpipe_displaytarget(dt);
+
+   align_free(gdt->data);
+   FREE(gdt);
+}
+
+
+/**
+ * Round n up to the next multiple; multiple must be a power of two.
+ */
+static INLINE unsigned
+round_up(unsigned n, unsigned multiple)
+{
+   return (n + multiple - 1) & ~(multiple - 1);
+}
+
+
+static struct llvmpipe_displaytarget *
+gdi_llvmpipe_displaytarget_create(struct llvmpipe_winsys *winsys,
+                                  enum pipe_format format,
+                                  unsigned width, unsigned height,
+                                  unsigned alignment,
+                                  unsigned *stride)
+{
+   struct gdi_llvmpipe_displaytarget *gdt;
+   unsigned cpp;
+   unsigned bpp;
+   
+   gdt = CALLOC_STRUCT(gdi_llvmpipe_displaytarget);
+   if(!gdt)
+      goto no_gdt;
+
+   gdt->format = format;
+   gdt->width = width;
+   gdt->height = height;
+
+   bpp = pf_get_bits(format);
+   cpp = pf_get_size(format);
+   
+   gdt->stride = round_up(width * cpp, alignment);
+   gdt->size = gdt->stride * height;
+   
+   gdt->data = align_malloc(gdt->size, alignment);
+   if(!gdt->data)
+      goto no_data;
+
+   gdt->bmi.bmiHeader.biSize = sizeof(BITMAPINFOHEADER);
+   gdt->bmi.bmiHeader.biWidth = gdt->stride / cpp;
+   gdt->bmi.bmiHeader.biHeight= -(long)height;
+   gdt->bmi.bmiHeader.biPlanes = 1;
+   gdt->bmi.bmiHeader.biBitCount = bpp;
+   gdt->bmi.bmiHeader.biCompression = BI_RGB;
+   gdt->bmi.bmiHeader.biSizeImage = 0;
+   gdt->bmi.bmiHeader.biXPelsPerMeter = 0;
+   gdt->bmi.bmiHeader.biYPelsPerMeter = 0;
+   gdt->bmi.bmiHeader.biClrUsed = 0;
+   gdt->bmi.bmiHeader.biClrImportant = 0;
+
+   *stride = gdt->stride;
+   return (struct llvmpipe_displaytarget *)gdt;
+
+no_data:
+   FREE(gdt);
+no_gdt:
+   return NULL;
+}
+
+
+static void
+gdi_llvmpipe_displaytarget_display(struct llvmpipe_winsys *winsys, 
+                                   struct llvmpipe_displaytarget *dt,
+                                   void *context_private)
+{
+   assert(0);
+}
+
+
+static void
+gdi_llvmpipe_destroy(struct llvmpipe_winsys *winsys)
+{
+   FREE(winsys);
+}
+
+
+static struct pipe_screen *
+gdi_llvmpipe_screen_create(void)
+{
+   struct llvmpipe_winsys *winsys; /* plain local: the pointer is only used within this call */
+   struct pipe_screen *screen;
+
+   winsys = CALLOC_STRUCT(llvmpipe_winsys);
+   if(!winsys)
+      goto no_winsys;
+
+   winsys->destroy = gdi_llvmpipe_destroy;
+   winsys->is_displaytarget_format_supported = gdi_llvmpipe_is_displaytarget_format_supported;
+   winsys->displaytarget_create = gdi_llvmpipe_displaytarget_create;
+   winsys->displaytarget_map = gdi_llvmpipe_displaytarget_map;
+   winsys->displaytarget_unmap = gdi_llvmpipe_displaytarget_unmap;
+   winsys->displaytarget_display = gdi_llvmpipe_displaytarget_display;
+   winsys->displaytarget_destroy = gdi_llvmpipe_displaytarget_destroy;
+
+   screen = llvmpipe_create_screen(winsys);
+   if(!screen)
+      goto no_screen;
+
+   return screen;
+   
+no_screen:
+   FREE(winsys);
+no_winsys:
+   return NULL;
+}
+
+
+static struct pipe_context *
+gdi_llvmpipe_context_create(struct pipe_screen *screen)
+{
+   return llvmpipe_create(screen);
+}
+
+
+static void
+gdi_llvmpipe_present(struct pipe_screen *screen,
+                     struct pipe_surface *surface,
+                     HDC hDC)
+{
+    struct llvmpipe_texture *texture;
+    struct gdi_llvmpipe_displaytarget *gdt;
+
+    texture = llvmpipe_texture(surface->texture);
+    gdt = gdi_llvmpipe_displaytarget(texture->dt);
+
+    StretchDIBits(hDC,
+                  0, 0, gdt->width, gdt->height,
+                  0, 0, gdt->width, gdt->height,
+                  gdt->data, &gdt->bmi, 0, SRCCOPY);
+}
+
+
+static const struct stw_winsys stw_winsys = {
+   &gdi_llvmpipe_screen_create,
+   &gdi_llvmpipe_context_create,
+   &gdi_llvmpipe_present,
+   NULL, /* get_adapter_luid */
+   NULL, /* shared_surface_open */
+   NULL, /* shared_surface_close */
+   NULL  /* compose */
+};
+
+
+BOOL WINAPI
+DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved)
+{
+   switch (fdwReason) {
+   case DLL_PROCESS_ATTACH:
+      if (!stw_init(&stw_winsys)) {
+         return FALSE;
+      }
+      return stw_init_thread();
+
+   case DLL_THREAD_ATTACH:
+      return stw_init_thread();
+
+   case DLL_THREAD_DETACH:
+      stw_cleanup_thread();
+      break;
+
+   case DLL_PROCESS_DETACH:
+      stw_cleanup_thread();
+      stw_cleanup();
+      break;
+   }
+   return TRUE;
+}
diff --git a/src/gallium/winsys/gdi/gdi_softpipe_winsys.c b/src/gallium/winsys/gdi/gdi_softpipe_winsys.c
index 33826524d7a..5e0ccf32f48 100644
--- a/src/gallium/winsys/gdi/gdi_softpipe_winsys.c
+++ b/src/gallium/winsys/gdi/gdi_softpipe_winsys.c
@@ -46,7 +46,7 @@
 #include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 #include "softpipe/sp_texture.h"
-#include "shared/stw_winsys.h"
+#include "stw_winsys.h"
 
 
 struct gdi_softpipe_buffer
@@ -166,6 +166,7 @@ gdi_softpipe_surface_buffer_create(struct pipe_winsys *winsys,
                                    unsigned width, unsigned height,
                                    enum pipe_format format,
                                    unsigned usage,
+                                   unsigned tex_usage,
                                    unsigned *stride)
 {
    const unsigned alignment = 64;
@@ -268,9 +269,9 @@ gdi_softpipe_context_create(struct pipe_screen *screen)
 
 
 static void
-gdi_softpipe_flush_frontbuffer(struct pipe_screen *screen,
-                               struct pipe_surface *surface,
-                               HDC hDC)
+gdi_softpipe_present(struct pipe_screen *screen,
+                     struct pipe_surface *surface,
+                     HDC hDC)
 {
     struct softpipe_texture *texture;
     struct gdi_softpipe_buffer *buffer;
@@ -303,7 +304,11 @@ gdi_softpipe_flush_frontbuffer(struct pipe_screen *screen,
 static const struct stw_winsys stw_winsys = {
    &gdi_softpipe_screen_create,
    &gdi_softpipe_context_create,
-   &gdi_softpipe_flush_frontbuffer
+   &gdi_softpipe_present,
+   NULL, /* get_adapter_luid */
+   NULL, /* shared_surface_open */
+   NULL, /* shared_surface_close */
+   NULL  /* compose */
 };
 
 
diff --git a/src/gallium/winsys/xlib/Makefile b/src/gallium/winsys/xlib/Makefile
index 3a1945d92c5..3dc38a78e45 100644
--- a/src/gallium/winsys/xlib/Makefile
+++ b/src/gallium/winsys/xlib/Makefile
@@ -31,9 +31,6 @@ DEFINES += \
 XLIB_WINSYS_SOURCES = \
 	xlib.c \
 	xlib_cell.c \
-	xlib_brw_aub.c \
-	xlib_brw_context.c \
-	xlib_brw_screen.c \
 	xlib_llvmpipe.c \
 	xlib_softpipe.c \
 	xlib_trace.c 
diff --git a/src/gallium/winsys/xlib/SConscript b/src/gallium/winsys/xlib/SConscript
index 467d595d33b..dfe550f733b 100644
--- a/src/gallium/winsys/xlib/SConscript
+++ b/src/gallium/winsys/xlib/SConscript
@@ -5,7 +5,7 @@ Import('*')
 
 if env['platform'] == 'linux' \
         and 'mesa' in env['statetrackers'] \
-        and set(('softpipe', 'llvmpipe', 'i915simple', 'trace')).intersection(env['drivers']) \
+        and set(('softpipe', 'llvmpipe', 'i915', 'trace')).intersection(env['drivers']) \
         and not env['dri']:
 
     env = env.Clone()
@@ -36,15 +36,6 @@ if env['platform'] == 'linux' \
             env.Tool('udis86')
             sources += ['xlib_llvmpipe.c']
             drivers += [llvmpipe]
-
-    if 'i965simple' in env['drivers']:
-        env.Append(CPPDEFINES = 'GALLIUM_I965SIMPLE')
-        sources += [
-            'xlib_brw_aub.c',
-            'xlib_brw_context.c',
-            'xlib_brw_screen.c',
-        ]
-        drivers += [i965simple]
         
     if 'cell' in env['drivers']:
         env.Append(CPPDEFINES = 'GALLIUM_CELL')
diff --git a/src/gallium/winsys/xlib/xlib.c b/src/gallium/winsys/xlib/xlib.c
index 4b71cf7ec38..163cc8863cb 100644
--- a/src/gallium/winsys/xlib/xlib.c
+++ b/src/gallium/winsys/xlib/xlib.c
@@ -43,7 +43,6 @@
 
 enum mode {
    MODE_TRACE,
-   MODE_BRW,
    MODE_CELL,
    MODE_LLVMPIPE,
    MODE_SOFTPIPE
@@ -55,9 +54,6 @@ static enum mode get_mode()
    if (getenv("XMESA_TRACE"))
       return MODE_TRACE;
 
-   if (getenv("XMESA_BRW"))
-      return MODE_BRW;
-
 #ifdef GALLIUM_CELL
    if (!getenv("GALLIUM_NOCELL")) 
       return MODE_CELL;
@@ -82,11 +78,6 @@ static void _init( void )
       xmesa_set_driver( &xlib_trace_driver );
 #endif
       break;
-   case MODE_BRW:
-#if defined(GALLIUM_BRW)
-      xmesa_set_driver( &xlib_brw_driver );
-#endif
-      break;
    case MODE_CELL:
 #if defined(GALLIUM_CELL)
       xmesa_set_driver( &xlib_cell_driver );
diff --git a/src/gallium/winsys/xlib/xlib.h b/src/gallium/winsys/xlib/xlib.h
index 347d45f4d66..f0855035f77 100644
--- a/src/gallium/winsys/xlib/xlib.h
+++ b/src/gallium/winsys/xlib/xlib.h
@@ -9,7 +9,6 @@ extern struct xm_driver xlib_trace_driver;
 extern struct xm_driver xlib_softpipe_driver;
 extern struct xm_driver xlib_llvmpipe_driver;
 extern struct xm_driver xlib_cell_driver;
-extern struct xm_driver xlib_brw_driver;
 
 
 #endif
diff --git a/src/gallium/winsys/xlib/xlib_brw.h b/src/gallium/winsys/xlib/xlib_brw.h
deleted file mode 100644
index be2dd147dbe..00000000000
--- a/src/gallium/winsys/xlib/xlib_brw.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef XLIB_BRW_H
-#define XLIB_BRW_H
-
-struct pipe_winsys;
-struct pipe_buffer;
-struct pipe_surface;
-struct xmesa_buffer;
-
-unsigned xlib_brw_get_buffer_offset( struct pipe_winsys *pws,
-                                     struct pipe_buffer *buf,
-                                     unsigned access_flags );
-
-void xlib_brw_buffer_subdata_typed( struct pipe_winsys *pws,
-                                    struct pipe_buffer *buf,
-                                    unsigned long offset, 
-                                    unsigned long size, 
-                                    const void *data,
-                                    unsigned data_type );
-
-
-
-void xlib_brw_commands_aub(struct pipe_winsys *winsys,
-                           unsigned *cmds,
-                           unsigned nr_dwords);
-
-struct pipe_context *
-xlib_create_brw_context( struct pipe_screen *screen,
-                         void *unused );
-
-#endif
diff --git a/src/gallium/winsys/xlib/xlib_brw_aub.c b/src/gallium/winsys/xlib/xlib_brw_aub.c
deleted file mode 100644
index b6bd849ef21..00000000000
--- a/src/gallium/winsys/xlib/xlib_brw_aub.c
+++ /dev/null
@@ -1,399 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "xlib_brw_aub.h"
-#include "pipe/p_context.h"
-#include "pipe/p_state.h"
-#include "util/u_debug.h"
-#include "util/u_memory.h"
-#include "softpipe/sp_texture.h"
-
-
-struct brw_aubfile {
-   FILE *file;
-   unsigned next_free_page;
-};
-
-
-extern char *__progname;
-
-
-struct aub_file_header {
-   unsigned int instruction_type;
-   unsigned int pad0:16;
-   unsigned int minor:8;
-   unsigned int major:8;
-   unsigned char application[8*4];
-   unsigned int day:8;
-   unsigned int month:8;
-   unsigned int year:16;
-   unsigned int timezone:8;
-   unsigned int second:8;
-   unsigned int minute:8;
-   unsigned int hour:8;
-   unsigned int comment_length:16;   
-   unsigned int pad1:16;
-};
-
-struct aub_block_header {
-   unsigned int instruction_type;
-   unsigned int operation:8;
-   unsigned int type:8;
-   unsigned int address_space:8;
-   unsigned int pad0:8;
-   unsigned int general_state_type:8;
-   unsigned int surface_state_type:8;
-   unsigned int pad1:16;
-   unsigned int address;
-   unsigned int length;
-};
-
-struct aub_dump_bmp {
-   unsigned int instruction_type;
-   unsigned int xmin:16;
-   unsigned int ymin:16;
-   unsigned int pitch:16;
-   unsigned int bpp:8;
-   unsigned int format:8;
-   unsigned int xsize:16;
-   unsigned int ysize:16;
-   unsigned int addr;
-   unsigned int unknown;
-};
-
-enum bh_operation {
-   BH_COMMENT,
-   BH_DATA_WRITE,
-   BH_COMMAND_WRITE,
-   BH_MMI0_WRITE32,
-   BH_END_SCENE,
-   BH_CONFIG_MEMORY_MAP,
-   BH_MAX_OPERATION
-};
-
-enum command_write_type {
-   CW_HWB_RING = 1,
-   CW_PRIMARY_RING_A,
-   CW_PRIMARY_RING_B,		/* XXX - disagreement with listaub! */
-   CW_PRIMARY_RING_C,
-   CW_MAX_TYPE
-};
-
-enum memory_map_type {
-   MM_DEFAULT,
-   MM_DYNAMIC,
-   MM_MAX_TYPE
-};
-
-enum address_space {
-   ADDR_GTT,
-   ADDR_LOCAL,
-   ADDR_MAIN,
-   ADDR_MAX
-};
-
-
-#define AUB_FILE_HEADER 0xe085000b
-#define AUB_BLOCK_HEADER 0xe0c10003
-#define AUB_DUMP_BMP 0xe09e0004
-
-/* Registers to control page table
- */
-#define PGETBL_CTL       0x2020
-#define PGETBL_ENABLED   0x1
-
-#define NR_GTT_ENTRIES  65536	/* 256 mb */
-
-#define FAIL										\
-do {											\
-   fprintf(stderr, "failed to write aub data at %s/%d\n", __FUNCTION__, __LINE__);	\
-   exit(1);										\
-} while (0)
-
-
-/* Emit the headers at the top of each aubfile.  Initialize the GTT.
- */
-static void init_aubfile( FILE *aub_file )
-{   
-   struct aub_file_header fh;
-   struct aub_block_header bh;
-   unsigned int data;
-
-   static int nr;
-   
-   nr++;
-
-   /* Emit the aub header:
-    */
-   memset(&fh, 0, sizeof(fh));
-
-   fh.instruction_type = AUB_FILE_HEADER;
-   fh.minor = 0x0;
-   fh.major = 0x7;
-   memcpy(fh.application, __progname, sizeof(fh.application));
-   fh.day = (nr>>24) & 0xff;
-   fh.month = 0x0;
-   fh.year = 0x0;
-   fh.timezone = 0x0;
-   fh.second = nr & 0xff;
-   fh.minute = (nr>>8) & 0xff;
-   fh.hour = (nr>>16) & 0xff;
-   fh.comment_length = 0x0;   
-
-   if (fwrite(&fh, sizeof(fh), 1, aub_file) < 0) 
-      FAIL;
-         
-   /* Setup the GTT starting at main memory address zero (!):
-    */
-   memset(&bh, 0, sizeof(bh));
-   
-   bh.instruction_type = AUB_BLOCK_HEADER;
-   bh.operation = BH_MMI0_WRITE32;
-   bh.type = 0x0;
-   bh.address_space = ADDR_GTT;	/* ??? */
-   bh.general_state_type = 0x0;
-   bh.surface_state_type = 0x0;
-   bh.address = PGETBL_CTL;
-   bh.length = 0x4;
-
-   if (fwrite(&bh, sizeof(bh), 1, aub_file) < 0) 
-      FAIL;
-
-   data = 0x0 | PGETBL_ENABLED;
-
-   if (fwrite(&data, sizeof(data), 1, aub_file) < 0) 
-      FAIL;
-}
-
-
-static void init_aub_gtt( struct brw_aubfile *aubfile,
-			  unsigned start_offset, 
-			  unsigned size )
-{
-   FILE *aub_file = aubfile->file;
-   struct aub_block_header bh;
-   unsigned int i;
-
-   assert(start_offset + size < NR_GTT_ENTRIES * 4096);
-
-
-   memset(&bh, 0, sizeof(bh));
-   
-   bh.instruction_type = AUB_BLOCK_HEADER;
-   bh.operation = BH_DATA_WRITE;
-   bh.type = 0x0;
-   bh.address_space = ADDR_MAIN;
-   bh.general_state_type = 0x0;
-   bh.surface_state_type = 0x0;
-   bh.address =  start_offset / 4096 * 4;
-   bh.length = size / 4096 * 4;
-
-   if (fwrite(&bh, sizeof(bh), 1, aub_file) < 0) 
-      FAIL;
-
-   for (i = 0; i < size / 4096; i++) {
-      unsigned data = aubfile->next_free_page | 1;
-
-      aubfile->next_free_page += 4096;
-
-      if (fwrite(&data, sizeof(data), 1, aub_file) < 0) 
-	 FAIL;
-   }
-
-}
-
-static void write_block_header( FILE *aub_file,
-				struct aub_block_header *bh,
-				const unsigned *data,
-				unsigned sz )
-{
-   sz = (sz + 3) & ~3;
-
-   if (fwrite(bh, sizeof(*bh), 1, aub_file) < 0) 
-      FAIL;
-
-   if (fwrite(data, sz, 1, aub_file) < 0) 
-      FAIL;
-
-   fflush(aub_file);
-}
-
-
-static void write_dump_bmp( FILE *aub_file,
-			    struct aub_dump_bmp *db )
-{
-   if (fwrite(db, sizeof(*db), 1, aub_file) < 0) 
-      FAIL;
-
-   fflush(aub_file);
-}
-
-
-
-void brw_aub_gtt_data( struct brw_aubfile *aubfile,
-		       unsigned offset,
-		       const void *data,
-		       unsigned sz,
-		       unsigned type,
-		       unsigned state_type )
-{
-   struct aub_block_header bh;
-
-   bh.instruction_type = AUB_BLOCK_HEADER;
-   bh.operation = BH_DATA_WRITE;
-   bh.type = type;
-   bh.address_space = ADDR_GTT;
-   bh.pad0 = 0;
-
-   if (type == DW_GENERAL_STATE) {
-      bh.general_state_type = state_type;
-      bh.surface_state_type = 0;
-   }
-   else {
-      bh.general_state_type = 0;
-      bh.surface_state_type = state_type;
-   }
-
-   bh.pad1 = 0;
-   bh.address = offset;
-   bh.length = sz;
-
-   write_block_header(aubfile->file, &bh, data, sz);
-}
-
-
-
-void brw_aub_gtt_cmds( struct brw_aubfile *aubfile,
-		       unsigned offset,
-		       const void *data,
-		       unsigned sz )
-{
-   struct aub_block_header bh;   
-   unsigned type = CW_PRIMARY_RING_A;
-   
-
-   bh.instruction_type = AUB_BLOCK_HEADER;
-   bh.operation = BH_COMMAND_WRITE;
-   bh.type = type;
-   bh.address_space = ADDR_GTT;
-   bh.pad0 = 0;
-   bh.general_state_type = 0;
-   bh.surface_state_type = 0;
-   bh.pad1 = 0;
-   bh.address = offset;
-   bh.length = sz;
-
-   write_block_header(aubfile->file, &bh, data, sz);
-}
-
-void brw_aub_dump_bmp( struct brw_aubfile *aubfile,
-		       struct pipe_surface *surface,
-		       unsigned gtt_offset )
-{
-   struct aub_dump_bmp db;
-   unsigned format;
-
-   assert(surface->texture->block.width == 1);
-   assert(surface->texture->block.height == 1);
-   
-   if (surface->texture->block.size == 4)
-      format = 0x7;
-   else
-      format = 0x3;
-
-   db.instruction_type = AUB_DUMP_BMP;
-   db.xmin = 0;
-   db.ymin = 0;
-   db.format = format;
-   db.bpp = surface->texture->block.size * 8;
-   db.pitch = softpipe_texture(surface->texture)->stride[surface->level] /
-      surface->texture->block.size;
-   db.xsize = surface->width;
-   db.ysize = surface->height;
-   db.addr = gtt_offset;
-   db.unknown = /* surface->tiled ? 0x4 : */ 0x0;
-
-   write_dump_bmp(aubfile->file, &db);
-}
-
-
-
-struct brw_aubfile *brw_aubfile_create( void )
-{
-   struct brw_aubfile *aubfile = CALLOC_STRUCT(brw_aubfile);
-   char filename[80];
-   int val;
-   static int i = 0;
-
-   i++;
-
-   if (getenv("INTEL_AUBFILE")) {
-      val = snprintf(filename, sizeof(filename), "%s%d.aub", getenv("INTEL_AUBFILE"), i%4);
-      debug_printf("--> Aub file: %s\n", filename);
-      aubfile->file = fopen(filename, "w");
-   }
-   else {
-      val = snprintf(filename, sizeof(filename), "%s.aub", __progname);
-      if (val < 0 || val > sizeof(filename)) 
-	 strcpy(filename, "default.aub");   
-   
-      debug_printf("--> Aub file: %s\n", filename);
-      aubfile->file = fopen(filename, "w");
-   }
-
-   if (!aubfile->file) {
-      debug_printf("couldn't open aubfile\n");
-      exit(1);
-   }
-
-   init_aubfile(aubfile->file);
-
-   /* The GTT is located starting address zero in main memory.  Pages
-    * to populate the gtt start after this point.
-    */
-   aubfile->next_free_page = (NR_GTT_ENTRIES * 4 + 4095) & ~4095;
-
-   /* More or less correspond with all the agp regions mapped by the
-    * driver:
-    */
-   init_aub_gtt(aubfile, 0, 4096*4);
-   init_aub_gtt(aubfile, AUB_BUF_START, AUB_BUF_SIZE);
-
-   return aubfile;
-}
-
-void brw_aub_destroy( struct brw_aubfile *aubfile )
-{
-   fclose(aubfile->file);
-   FREE(aubfile);
-}
diff --git a/src/gallium/winsys/xlib/xlib_brw_aub.h b/src/gallium/winsys/xlib/xlib_brw_aub.h
deleted file mode 100644
index f5c60c7be28..00000000000
--- a/src/gallium/winsys/xlib/xlib_brw_aub.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#ifndef BRW_AUB_H
-#define BRW_AUB_H
-
-/* We set up this region, buffers may be allocated here:
- */
-#define AUB_BUF_START (4096*4)
-#define AUB_BUF_SIZE  (8*1024*1024)
-
-struct intel_context;
-struct pipe_surface;
-
-struct brw_aubfile *brw_aubfile_create( void );
-
-void brw_aub_destroy( struct brw_aubfile *aubfile );
-
-void brw_aub_gtt_data( struct brw_aubfile *aubfile,
-		       unsigned offset,
-		       const void *data,
-		       unsigned sz,
-		       unsigned type,
-		       unsigned state_type );
-
-void brw_aub_gtt_cmds( struct brw_aubfile *aubfile,
-		       unsigned offset,
-		       const void *data,
-		       unsigned sz );
-
-void brw_aub_dump_bmp( struct brw_aubfile *aubfile,
-		       struct pipe_surface *surface,
-		       unsigned gtt_offset );
-
-
-enum data_write_type {
-   DW_NOTYPE,
-   DW_BATCH_BUFFER,
-   DW_BIN_BUFFER,
-   DW_BIN_POINTER_LIST,
-   DW_SLOW_STATE_BUFFER,
-   DW_VERTEX_BUFFER,
-   DW_2D_MAP,
-   DW_CUBE_MAP,
-   DW_INDIRECT_STATE_BUFFER,
-   DW_VOLUME_MAP,
-   DW_1D_MAP,
-   DW_CONSTANT_BUFFER,
-   DW_CONSTANT_URB_ENTRY,
-   DW_INDEX_BUFFER,
-   DW_GENERAL_STATE,
-   DW_SURFACE_STATE,
-   DW_MEDIA_OBJECT_INDIRECT_DATA,
-   DW_MAX_TYPE
-};
-
-enum data_write_general_state_type {
-   DWGS_NOTYPE,
-   DWGS_VERTEX_SHADER_STATE,
-   DWGS_GEOMETRY_SHADER_STATE ,
-   DWGS_CLIPPER_STATE,
-   DWGS_STRIPS_FANS_STATE,
-   DWGS_WINDOWER_IZ_STATE,
-   DWGS_COLOR_CALC_STATE,
-   DWGS_CLIPPER_VIEWPORT_STATE,	/* was 0x7 */
-   DWGS_STRIPS_FANS_VIEWPORT_STATE,
-   DWGS_COLOR_CALC_VIEWPORT_STATE, /* was 0x9 */
-   DWGS_SAMPLER_STATE,
-   DWGS_KERNEL_INSTRUCTIONS,
-   DWGS_SCRATCH_SPACE,
-   DWGS_SAMPLER_DEFAULT_COLOR,
-   DWGS_INTERFACE_DESCRIPTOR,
-   DWGS_VLD_STATE,
-   DWGS_VFE_STATE,
-   DWGS_MAX_TYPE
-};
-
-enum data_write_surface_state_type {
-   DWSS_NOTYPE,
-   DWSS_BINDING_TABLE_STATE,
-   DWSS_SURFACE_STATE,
-   DWSS_MAX_TYPE
-};
-
-
-#endif
diff --git a/src/gallium/winsys/xlib/xlib_brw_context.c b/src/gallium/winsys/xlib/xlib_brw_context.c
deleted file mode 100644
index 09599507f44..00000000000
--- a/src/gallium/winsys/xlib/xlib_brw_context.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Bismarck, ND., USA
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * 
- **************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell
- *   Brian Paul
- */
-
-
-//#include "glxheader.h"
-//#include "xmesaP.h"
-
-#include "pipe/internal/p_winsys_screen.h"
-#include "pipe/p_inlines.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "i965simple/brw_winsys.h"
-#include "xlib_brw_aub.h"
-#include "xlib_brw.h"
-
-
-
-
-#define XBCWS_BATCHBUFFER_SIZE 1024
-
-
-/* The backend to the brw driver (ie struct brw_winsys) is actually a
- * per-context entity.
- */
-struct xlib_brw_context_winsys {
-   struct brw_winsys brw_context_winsys;   /**< batch buffer funcs */
-   struct aub_context *aub;
-                         
-   struct pipe_winsys *pipe_winsys;
-
-   unsigned batch_data[XBCWS_BATCHBUFFER_SIZE];
-   unsigned batch_nr;
-   unsigned batch_size;
-   unsigned batch_alloc;
-};
-
-
-/* Turn a brw_winsys into an xlib_brw_context_winsys:
- */
-static inline struct xlib_brw_context_winsys *
-xlib_brw_context_winsys( struct brw_winsys *sws )
-{
-   return (struct xlib_brw_context_winsys *)sws;
-}
-
-
-/* Simple batchbuffer interface:
- */
-
-static unsigned *xbcws_batch_start( struct brw_winsys *sws,
-					 unsigned dwords,
-					 unsigned relocs )
-{
-   struct xlib_brw_context_winsys *xbcws = xlib_brw_context_winsys(sws);
-
-   if (xbcws->batch_size < xbcws->batch_nr + dwords)
-      return NULL;
-
-   xbcws->batch_alloc = xbcws->batch_nr + dwords;
-   return (void *)1;			/* not a valid pointer! */
-}
-
-static void xbcws_batch_dword( struct brw_winsys *sws,
-				    unsigned dword )
-{
-   struct xlib_brw_context_winsys *xbcws = xlib_brw_context_winsys(sws);
-
-   assert(xbcws->batch_nr < xbcws->batch_alloc);
-   xbcws->batch_data[xbcws->batch_nr++] = dword;
-}
-
-static void xbcws_batch_reloc( struct brw_winsys *sws,
-			     struct pipe_buffer *buf,
-			     unsigned access_flags,
-			     unsigned delta )
-{
-   struct xlib_brw_context_winsys *xbcws = xlib_brw_context_winsys(sws);
-
-   assert(xbcws->batch_nr < xbcws->batch_alloc);
-   xbcws->batch_data[xbcws->batch_nr++] = 
-      ( xlib_brw_get_buffer_offset( NULL, buf, access_flags ) +
-        delta );
-}
-
-static void xbcws_batch_end( struct brw_winsys *sws )
-{
-   struct xlib_brw_context_winsys *xbcws = xlib_brw_context_winsys(sws);
-
-   assert(xbcws->batch_nr <= xbcws->batch_alloc);
-   xbcws->batch_alloc = 0;
-}
-
-static void xbcws_batch_flush( struct brw_winsys *sws,
-				    struct pipe_fence_handle **fence )
-{
-   struct xlib_brw_context_winsys *xbcws = xlib_brw_context_winsys(sws);
-   assert(xbcws->batch_nr <= xbcws->batch_size);
-
-   if (xbcws->batch_nr) {
-      xlib_brw_commands_aub( xbcws->pipe_winsys,
-                             xbcws->batch_data,
-                             xbcws->batch_nr );
-   }
-
-   xbcws->batch_nr = 0;
-}
-
-  
-
-/* Really a per-device function, just pass through:
- */
-static unsigned xbcws_get_buffer_offset( struct brw_winsys *sws,
-                                         struct pipe_buffer *buf,
-                                         unsigned access_flags )
-{
-   struct xlib_brw_context_winsys *xbcws = xlib_brw_context_winsys(sws);
-
-   return xlib_brw_get_buffer_offset( xbcws->pipe_winsys,
-                                      buf,
-                                      access_flags );
-}
-
-
-/* Really a per-device function, just pass through:
- */
-static void xbcws_buffer_subdata_typed( struct brw_winsys *sws,
-                                       struct pipe_buffer *buf,
-                                       unsigned long offset, 
-                                       unsigned long size, 
-                                       const void *data,
-                                       unsigned data_type )
-{
-   struct xlib_brw_context_winsys *xbcws = xlib_brw_context_winsys(sws);
-
-   xlib_brw_buffer_subdata_typed( xbcws->pipe_winsys,
-                                  buf,
-                                  offset,
-                                  size,
-                                  data,
-                                  data_type );
-}
-
-
-/**
- * Create i965 hardware rendering context, but plugged into a
- * dump-to-aubfile backend.
- */
-struct pipe_context *
-xlib_create_brw_context( struct pipe_screen *screen,
-                         void *unused )
-{
-   struct xlib_brw_context_winsys *xbcws = CALLOC_STRUCT( xlib_brw_context_winsys );
-   
-   /* Fill in this struct with callbacks that i965simple will need to
-    * communicate with the window system, buffer manager, etc. 
-    */
-   xbcws->brw_context_winsys.batch_start = xbcws_batch_start;
-   xbcws->brw_context_winsys.batch_dword = xbcws_batch_dword;
-   xbcws->brw_context_winsys.batch_reloc = xbcws_batch_reloc;
-   xbcws->brw_context_winsys.batch_end = xbcws_batch_end;
-   xbcws->brw_context_winsys.batch_flush = xbcws_batch_flush;
-   xbcws->brw_context_winsys.buffer_subdata_typed = xbcws_buffer_subdata_typed;
-   xbcws->brw_context_winsys.get_buffer_offset = xbcws_get_buffer_offset;
-
-   xbcws->pipe_winsys = screen->winsys; /* redundant */
-
-   xbcws->batch_size = XBCWS_BATCHBUFFER_SIZE;
-
-   /* Create the i965simple context:
-    */
-#ifdef GALLIUM_CELL
-   return NULL;
-#else
-   return brw_create( screen,
-		      &xbcws->brw_context_winsys,
-		      0 );
-#endif
-}
diff --git a/src/gallium/winsys/xlib/xlib_brw_screen.c b/src/gallium/winsys/xlib/xlib_brw_screen.c
deleted file mode 100644
index ef545796f3c..00000000000
--- a/src/gallium/winsys/xlib/xlib_brw_screen.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Bismarck, ND., USA
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * 
- **************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell
- *   Brian Paul
- */
-
-
-//#include "state_trackers/xlib/glxheader.h"
-//#include "state_trackers/xlib/xmesaP.h"
-
-#include "pipe/internal/p_winsys_screen.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "i965simple/brw_winsys.h"
-#include "i965simple/brw_screen.h"
-#include "i965simple/brw_context.h"
-
-
-#include "xlib_brw_aub.h"
-#include "xlib_brw.h"
-#include "xlib.h"
-
-static struct pipe_buffer *
-buffer_from_surface(struct pipe_surface *surface)
-{
-   struct brw_texture *texture = (struct brw_texture *)surface;
-   return texture->buffer;
-}
-
-struct aub_buffer {
-   struct pipe_reference reference;
-   char *data;
-   unsigned offset;
-   unsigned size;
-   unsigned map_count;
-   boolean dump_on_unmap;
-};
-
-
-
-struct aub_pipe_winsys {
-   struct pipe_winsys winsys;
-
-   struct brw_aubfile *aubfile;
-
-   /* This is simple, isn't it:
-    */
-   char *pool;
-   unsigned size;
-   unsigned used;
-};
-
-
-/* Turn a pipe winsys into an aub/pipe winsys:
- */
-static inline struct aub_pipe_winsys *
-aub_pipe_winsys( struct pipe_winsys *winsys )
-{
-   return (struct aub_pipe_winsys *)winsys;
-}
-
-
-
-static INLINE struct aub_buffer *
-aub_bo( struct pipe_buffer *bo )
-{
-   return (struct aub_buffer *)bo;
-}
-
-static INLINE struct pipe_buffer *
-pipe_bo( struct aub_buffer *bo )
-{
-   return (struct pipe_buffer *)bo;
-}
-
-
-
-
-static void *aub_buffer_map(struct pipe_winsys *winsys, 
-			      struct pipe_buffer *buf,
-			      unsigned flags )
-{
-   struct aub_buffer *sbo = aub_bo(buf);
-
-   assert(sbo->data);
-
-   if (flags & PIPE_BUFFER_USAGE_CPU_WRITE)
-      sbo->dump_on_unmap = 1;
-
-   sbo->map_count++;
-   return sbo->data;
-}
-
-static void aub_buffer_unmap(struct pipe_winsys *winsys, 
-			       struct pipe_buffer *buf)
-{
-   struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys);
-   struct aub_buffer *sbo = aub_bo(buf);
-
-   sbo->map_count--;
-
-   if (sbo->map_count == 0 &&
-       sbo->dump_on_unmap) {
-
-      sbo->dump_on_unmap = 0;
-
-      brw_aub_gtt_data( iws->aubfile, 
-			sbo->offset,
-			sbo->data,
-			sbo->size,
-			0,
-			0);
-   }
-}
-
-
-static void
-aub_buffer_destroy(struct pipe_buffer *buf)
-{
-   free(buf);
-}
-
-
-
-void xlib_brw_commands_aub(struct pipe_winsys *winsys,
-			unsigned *cmds,
-			unsigned nr_dwords)
-{
-   struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys);
-   unsigned size = nr_dwords * 4;
-
-   assert(iws->used + size < iws->size);
-
-   brw_aub_gtt_cmds( iws->aubfile, 
-		     AUB_BUF_START + iws->used,
-		     cmds,
-		     nr_dwords * sizeof(int) );
-
-   iws->used += align(size, 4096);
-}
-
-
-/* XXX: fix me:
- */
-static struct aub_pipe_winsys *global_winsys = NULL;
-
-
-
-
-/* Pipe has no concept of pools.  We choose the tex/region pool
- * for all buffers.
- */
-static struct pipe_buffer *
-aub_buffer_create(struct pipe_winsys *winsys,
-                  unsigned alignment,
-                  unsigned usage,
-                  unsigned size)
-{
-   struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys);
-   struct aub_buffer *sbo = CALLOC_STRUCT(aub_buffer);
-
-   pipe_reference_init(&sbo->reference, 1);
-
-   /* Could reuse buffers that are not referenced in current
-    * batchbuffer.  Can't do that atm, so always reallocate:
-    */
-   assert(iws->used + size < iws->size);
-   sbo->data = iws->pool + iws->used;
-   sbo->offset = AUB_BUF_START + iws->used;
-   iws->used += align(size, 4096);
-
-   sbo->size = size;
-
-   return pipe_bo(sbo);
-}
-
-
-static struct pipe_buffer *
-aub_user_buffer_create(struct pipe_winsys *winsys, void *ptr, unsigned bytes)
-{
-   struct aub_buffer *sbo;
-
-   /* Lets hope this is meant for upload, not as a result!  
-    */
-   sbo = aub_bo(aub_buffer_create( winsys, 0, 0, 0 ));
-
-   sbo->data = ptr;
-   sbo->size = bytes;
-
-   return pipe_bo(sbo);
-}
-
-
-/* The state tracker (should!) keep track of whether the fake
- * frontbuffer has been touched by any rendering since the last time
- * we copied its contents to the real frontbuffer.  Our task is easy:
- */
-static void
-aub_flush_frontbuffer( struct pipe_winsys *winsys,
-                       struct pipe_surface *surface,
-                       void *context_private)
-{
-//   struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys);
-   brw_aub_dump_bmp( global_winsys->aubfile, 
-		     surface,
-		     aub_bo(buffer_from_surface(surface))->offset );
-}
-
-
-/**
- * Round n up to next multiple.
- */
-static INLINE unsigned
-round_up(unsigned n, unsigned multiple)
-{
-   return (n + multiple - 1) & ~(multiple - 1);
-}
-
-static struct pipe_buffer *
-aub_i915_surface_buffer_create(struct pipe_winsys *winsys,
-                               unsigned width, unsigned height,
-                               enum pipe_format format,
-                               unsigned usage,
-                               unsigned tex_usage,
-                               unsigned *stride)
-{
-   const unsigned alignment = 64;
-   struct pipe_format_block block;
-   unsigned nblocksx, nblocksy;
-
-   pf_get_block(format, &block);
-   nblocksx = pf_get_nblocksx(&block, width);
-   nblocksy = pf_get_nblocksy(&block, height);
-   *stride = round_up(nblocksx * block.size, alignment);
-
-   return winsys->buffer_create(winsys, alignment,
-                                usage,
-                                *stride * nblocksy);
-}
-
-
-static const char *
-aub_get_name( struct pipe_winsys *winsys )
-{
-   return "Aub/xlib";
-}
-
-static void
-xlib_brw_destroy_pipe_winsys_aub( struct pipe_winsys *winsys )
-
-{
-   struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys);
-   brw_aub_destroy(iws->aubfile);
-   free(iws->pool);
-   free(iws);
-}
-
-
-
-static struct pipe_winsys *
-xlib_create_brw_winsys( void )
-{
-   struct aub_pipe_winsys *iws = CALLOC_STRUCT( aub_pipe_winsys );
-   
-   /* Fill in this struct with callbacks that pipe will need to
-    * communicate with the window system, buffer manager, etc. 
-    *
-    * Pipe would be happy with a malloc based memory manager, but
-    * the SwapBuffers implementation in this winsys driver requires
-    * that rendering be done to an appropriate _DriBufferObject.  
-    */
-   iws->winsys.buffer_create = aub_buffer_create;
-   iws->winsys.user_buffer_create = aub_user_buffer_create;
-   iws->winsys.buffer_map = aub_buffer_map;
-   iws->winsys.buffer_unmap = aub_buffer_unmap;
-   iws->winsys.buffer_destroy = aub_buffer_destroy;
-   iws->winsys.flush_frontbuffer = aub_flush_frontbuffer;
-   iws->winsys.get_name = aub_get_name;
-   iws->winsys.destroy = xlib_brw_destroy_pipe_winsys_aub;
-
-   iws->winsys.surface_buffer_create = aub_i915_surface_buffer_create;
-
-   iws->aubfile = brw_aubfile_create();
-   iws->size = AUB_BUF_SIZE;
-   iws->pool = malloc(AUB_BUF_SIZE);
-
-   /* HACK: static copy of this pointer:
-    */
-   assert(global_winsys == NULL);
-   global_winsys = iws;
-
-   return &iws->winsys;
-}
-
-
-static struct pipe_screen *
-xlib_create_brw_screen( void )
-{
-#ifndef GALLIUM_CELL
-   struct pipe_winsys *winsys;
-   struct pipe_screen *screen;
-
-   winsys = xlib_create_brw_winsys();
-   if (winsys == NULL)
-      return NULL;
-
-   screen = brw_create_screen(winsys, 0/* XXX pci_id */);
-   if (screen == NULL)
-      goto fail;
-
-   return screen;
-
-fail:
-   if (winsys)
-      winsys->destroy( winsys );
-
-#endif
-   return NULL;
-}
-
-
-/* These per-screen functions are acually made available to the driver
- * through the brw_winsys (per-context) entity.
- */
-unsigned xlib_brw_get_buffer_offset( struct pipe_winsys *pws,
-                                     struct pipe_buffer *buf,
-                                     unsigned access_flags )
-{
-   return aub_bo(buf)->offset;
-}
-
-void xlib_brw_buffer_subdata_typed( struct pipe_winsys *pws,
-                                    struct pipe_buffer *buf,
-                                    unsigned long offset, 
-                                    unsigned long size, 
-                                    const void *data,
-                                    unsigned data_type )
-{
-   unsigned aub_type = DW_GENERAL_STATE;
-   unsigned aub_sub_type = 0;
-
-   switch (data_type) {
-   case BRW_CC_VP:
-      aub_sub_type = DWGS_COLOR_CALC_VIEWPORT_STATE;
-      break;
-   case BRW_CC_UNIT:
-      aub_sub_type = DWGS_COLOR_CALC_STATE;
-      break;
-   case BRW_WM_PROG:
-      aub_sub_type = DWGS_KERNEL_INSTRUCTIONS;
-      break;
-   case BRW_SAMPLER_DEFAULT_COLOR:
-      aub_sub_type = DWGS_SAMPLER_DEFAULT_COLOR;
-      break;
-   case BRW_SAMPLER:
-      aub_sub_type = DWGS_SAMPLER_STATE;
-      break;
-   case BRW_WM_UNIT:
-      aub_sub_type = DWGS_WINDOWER_IZ_STATE;
-      break;
-   case BRW_SF_PROG:
-      aub_sub_type = DWGS_KERNEL_INSTRUCTIONS;
-      break;
-   case BRW_SF_VP:
-      aub_sub_type = DWGS_STRIPS_FANS_VIEWPORT_STATE;
-      break;
-   case BRW_SF_UNIT:
-      aub_sub_type = DWGS_STRIPS_FANS_STATE;
-      break;
-   case BRW_VS_UNIT:
-      aub_sub_type = DWGS_VERTEX_SHADER_STATE;
-      break;
-   case BRW_VS_PROG:
-      aub_sub_type = DWGS_KERNEL_INSTRUCTIONS;
-      break;
-   case BRW_GS_UNIT:
-      aub_sub_type = DWGS_GEOMETRY_SHADER_STATE;
-      break;
-   case BRW_GS_PROG:
-      aub_sub_type = DWGS_KERNEL_INSTRUCTIONS;
-      break;
-   case BRW_CLIP_VP:
-      aub_sub_type = DWGS_CLIPPER_VIEWPORT_STATE;
-      break;
-   case BRW_CLIP_UNIT:
-      aub_sub_type = DWGS_CLIPPER_STATE;
-      break;
-   case BRW_CLIP_PROG:
-      aub_sub_type = DWGS_KERNEL_INSTRUCTIONS;
-      break;
-   case BRW_SS_SURFACE:
-      aub_type = DW_SURFACE_STATE;
-      aub_sub_type = DWSS_SURFACE_STATE; 
-      break;
-   case BRW_SS_SURF_BIND:
-      aub_type = DW_SURFACE_STATE;
-      aub_sub_type = DWSS_BINDING_TABLE_STATE; 
-      break;
-   case BRW_CONSTANT_BUFFER:
-      aub_type = DW_CONSTANT_URB_ENTRY;
-      aub_sub_type = 0; 
-      break;
-
-   default:
-      assert(0);
-      break;
-   }
-
-   {
-      struct aub_pipe_winsys *iws = aub_pipe_winsys(pws);
-      struct aub_buffer *sbo = aub_bo(buf);
-
-      assert(sbo->size > offset + size);
-      memcpy(sbo->data + offset, data, size);
-
-      brw_aub_gtt_data( iws->aubfile, 
-                        sbo->offset + offset,
-                        sbo->data + offset,
-                        size,
-                        aub_type,
-                        aub_sub_type );
-   }
-}
- 
-
-static void
-xlib_brw_display_surface(struct xmesa_buffer *b, 
-                         struct pipe_surface *surf)
-{
-   brw_aub_dump_bmp( global_winsys->aubfile, 
-		     surf,
-		     aub_bo(buffer_from_surface(surf))->offset );
-}
-
-
-struct xm_driver xlib_brw_driver = 
-{
-   .create_pipe_screen = xlib_create_brw_screen,
-   .create_pipe_context = xlib_create_brw_context,
-   .display_surface = xlib_brw_display_surface,
-};
diff --git a/src/gallium/winsys/xlib/xlib_softpipe.c b/src/gallium/winsys/xlib/xlib_softpipe.c
index 67fea023a3b..260b39e2a0f 100644
--- a/src/gallium/winsys/xlib/xlib_softpipe.c
+++ b/src/gallium/winsys/xlib/xlib_softpipe.c
@@ -75,9 +75,6 @@ struct xmesa_pipe_winsys
 {
    struct pipe_winsys base;
 /*   struct xmesa_visual *xm_visual; */
-#ifdef USE_XSHM
-   int shm;
-#endif
 };
 
 
@@ -93,11 +90,6 @@ xm_buffer( struct pipe_buffer *buf )
 /**
  * X Shared Memory Image extension code
  */
-#ifdef USE_XSHM
-#define XSHM_ENABLED(b) ((b)->shm)
-#else
-#define XSHM_ENABLED(b) 0
-#endif
 
 #ifdef USE_XSHM
 
@@ -116,23 +108,23 @@ mesaHandleXError(Display *dpy, XErrorEvent *event)
 }
 
 
-static GLboolean alloc_shm(struct xm_buffer *buf, unsigned size)
+static char *alloc_shm(struct xm_buffer *buf, unsigned size)
 {
    XShmSegmentInfo *const shminfo = & buf->shminfo;
 
    shminfo->shmid = shmget(IPC_PRIVATE, size, IPC_CREAT|0777);
    if (shminfo->shmid < 0) {
-      return GL_FALSE;
+      return NULL;
    }
 
    shminfo->shmaddr = (char *) shmat(shminfo->shmid, 0, 0);
    if (shminfo->shmaddr == (char *) -1) {
       shmctl(shminfo->shmid, IPC_RMID, 0);
-      return GL_FALSE;
+      return NULL;
    }
 
    shminfo->readOnly = False;
-   return GL_TRUE;
+   return shminfo->shmaddr;
 }
 
 
@@ -258,25 +250,30 @@ xlib_softpipe_display_surface(struct xmesa_buffer *b,
       return;
 
 #ifdef USE_XSHM
-   if (XSHM_ENABLED(xm_buf) && (xm_buf->tempImage == NULL)) {
-      assert(surf->texture->block.width == 1);
-      assert(surf->texture->block.height == 1);
-      alloc_shm_ximage(xm_buf, b, spt->stride[surf->level] /
-                       surf->texture->block.size, surf->height);
-   }
-#endif
+   if (xm_buf->shm)
+   {
+      if (xm_buf->tempImage == NULL) 
+      {
+         assert(surf->texture->block.width == 1);
+         assert(surf->texture->block.height == 1);
+         alloc_shm_ximage(xm_buf, b, spt->stride[surf->level] /
+                          surf->texture->block.size, surf->height);
+      }
 
-   ximage = (XSHM_ENABLED(xm_buf)) ? xm_buf->tempImage : b->tempImage;
-   ximage->data = xm_buf->data;
+      ximage = xm_buf->tempImage;
+      ximage->data = xm_buf->data;
 
-   /* display image in Window */
-#ifdef USE_XSHM
-   if (XSHM_ENABLED(xm_buf)) {
+      /* _debug_printf("XSHM\n"); */
       XShmPutImage(b->xm_visual->display, b->drawable, b->gc,
                    ximage, 0, 0, 0, 0, surf->width, surf->height, False);
-   } else
+   }
+   else
 #endif
    {
+      /* display image in Window */
+      ximage = b->tempImage;
+      ximage->data = xm_buf->data;
+
       /* check that the XImage has been previously initialized */
       assert(ximage->format);
       assert(ximage->bitmap_unit);
@@ -286,6 +283,7 @@ xlib_softpipe_display_surface(struct xmesa_buffer *b,
       ximage->height = surf->height;
       ximage->bytes_per_line = spt->stride[surf->level];
 
+      /* _debug_printf("XPUT\n"); */
       XPutImage(b->xm_visual->display, b->drawable, b->gc,
                 ximage, 0, 0, 0, 0, surf->width, surf->height);
    }
@@ -322,21 +320,6 @@ xm_buffer_create(struct pipe_winsys *pws,
                  unsigned size)
 {
    struct xm_buffer *buffer = CALLOC_STRUCT(xm_buffer);
-#ifdef USE_XSHM
-   struct xmesa_pipe_winsys *xpws = (struct xmesa_pipe_winsys *) pws;
-
-   buffer->shminfo.shmid = -1;
-   buffer->shminfo.shmaddr = (char *) -1;
-
-   if (xpws->shm && (usage & PIPE_BUFFER_USAGE_PIXEL) != 0) {
-      buffer->shm = xpws->shm;
-
-      if (alloc_shm(buffer, size)) {
-         buffer->data = buffer->shminfo.shmaddr;
-         buffer->shm = 1;
-      }
-   }
-#endif
 
    pipe_reference_init(&buffer->base.reference, 1);
    buffer->base.alignment = alignment;
@@ -363,9 +346,6 @@ xm_user_buffer_create(struct pipe_winsys *pws, void *ptr, unsigned bytes)
    buffer->base.size = bytes;
    buffer->userBuffer = TRUE;
    buffer->data = ptr;
-#ifdef USE_XSHM
-   buffer->shm = 0;
-#endif
 
    return &buffer->base;
 }
@@ -381,16 +361,44 @@ xm_surface_buffer_create(struct pipe_winsys *winsys,
 {
    const unsigned alignment = 64;
    struct pipe_format_block block;
-   unsigned nblocksx, nblocksy;
+   unsigned nblocksx, nblocksy, size;
 
    pf_get_block(format, &block);
    nblocksx = pf_get_nblocksx(&block, width);
    nblocksy = pf_get_nblocksy(&block, height);
    *stride = align(nblocksx * block.size, alignment);
+   size = *stride * nblocksy;
+
+#ifdef USE_XSHM
+   if (!debug_get_bool_option("XLIB_NO_SHM", FALSE))
+   {
+      struct xm_buffer *buffer = CALLOC_STRUCT(xm_buffer);
+
+      pipe_reference_init(&buffer->base.reference, 1);
+      buffer->base.alignment = alignment;
+      buffer->base.usage = usage;
+      buffer->base.size = size;
+      buffer->userBuffer = FALSE;
+      buffer->shminfo.shmid = -1;
+      buffer->shminfo.shmaddr = (char *) -1;
+      buffer->shm = TRUE;
+         
+      buffer->data = alloc_shm(buffer, size);
+      if (!buffer->data)
+         goto out;
+
+      return &buffer->base;
+         
+   out:
+      if (buffer)
+         FREE(buffer);
+   }
+#endif
+   
 
    return winsys->buffer_create(winsys, alignment,
                                 usage,
-                                *stride * nblocksy);
+                                size);
 }